{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9477357571822758, "eval_steps": 500, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.702702702702703e-09, "logits/chosen": -0.4574483633041382, "logits/rejected": -0.4685072898864746, "logps/chosen": -89.16776275634766, "logps/rejected": -47.404972076416016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.405405405405406e-09, "logits/chosen": -0.32161709666252136, "logits/rejected": -0.3056718111038208, "logps/chosen": -100.6582260131836, "logps/rejected": -53.24464416503906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 8.108108108108109e-09, "logits/chosen": -0.42628276348114014, "logits/rejected": -0.41666004061698914, "logps/chosen": -114.20658874511719, "logps/rejected": -65.70346069335938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.0, "learning_rate": 1.0810810810810811e-08, "logits/chosen": -0.41333869099617004, "logits/rejected": -0.396832674741745, "logps/chosen": -117.12739562988281, "logps/rejected": -48.928802490234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.0, "learning_rate": 1.3513513513513514e-08, "logits/chosen": -0.4063996970653534, "logits/rejected": -0.3712907135486603, "logps/chosen": -61.68594741821289, "logps/rejected": -113.11006927490234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5 }, { "epoch": 0.0, "learning_rate": 1.6216216216216218e-08, "logits/chosen": -0.014998831786215305, "logits/rejected": -0.028041517361998558, "logps/chosen": -83.23030090332031, "logps/rejected": -140.50054931640625, "loss": 0.6905, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": -0.013751220889389515, "rewards/rejected": 0.013751220889389515, "step": 6 }, { "epoch": 0.0, "learning_rate": 1.891891891891892e-08, "logits/chosen": -0.5215149521827698, "logits/rejected": -0.536442756652832, "logps/chosen": -65.89936828613281, "logps/rejected": -94.0841064453125, "loss": 0.7257, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 7 }, { "epoch": 0.0, "learning_rate": 2.1621621621621623e-08, "logits/chosen": -0.4198572635650635, "logits/rejected": -0.37813600897789, "logps/chosen": -207.9281768798828, "logps/rejected": -272.84490966796875, "loss": 0.7186, "rewards/accuracies": 0.0, "rewards/chosen": -0.009831237606704235, "rewards/margins": -0.02941131591796875, "rewards/rejected": 0.01958007924258709, "step": 8 }, { "epoch": 0.0, "learning_rate": 2.4324324324324324e-08, "logits/chosen": -0.3930993676185608, "logits/rejected": -0.37075257301330566, "logps/chosen": -105.44384765625, "logps/rejected": -70.6500244140625, "loss": 0.6348, "rewards/accuracies": 1.0, "rewards/chosen": 0.013652038760483265, "rewards/margins": 0.007357788272202015, "rewards/rejected": 0.00629425048828125, "step": 9 }, { "epoch": 0.0, "learning_rate": 2.7027027027027028e-08, "logits/chosen": -0.4363981783390045, "logits/rejected": -0.4500162601470947, "logps/chosen": -10.444131851196289, "logps/rejected": -6.491818904876709, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": -0.0032381059136241674, "rewards/margins": 0.0020339011680334806, "rewards/rejected": -0.005272007081657648, "step": 10 }, { "epoch": 0.0, "learning_rate": 2.972972972972973e-08, "logits/chosen": -0.7961447834968567, "logits/rejected": -0.7598959803581238, "logps/chosen": -79.2223892211914, "logps/rejected": -64.91506958007812, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.00886383093893528, "rewards/margins": 0.02132263220846653, "rewards/rejected": -0.01245880126953125, "step": 11 }, { "epoch": 0.0, "learning_rate": 3.2432432432432436e-08, "logits/chosen": -0.6158071160316467, "logits/rejected": -0.5526894927024841, "logps/chosen": -74.07313537597656, "logps/rejected": -127.72220611572266, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": -0.04720306396484375, "rewards/margins": -0.038784027099609375, "rewards/rejected": -0.008419036865234375, "step": 12 }, { "epoch": 0.0, "learning_rate": 3.513513513513514e-08, "logits/chosen": -0.3028618097305298, "logits/rejected": -0.26373910903930664, "logps/chosen": -57.48249053955078, "logps/rejected": -93.26950073242188, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": -0.012020492926239967, "rewards/margins": -0.04482994228601456, "rewards/rejected": 0.03280944749712944, "step": 13 }, { "epoch": 0.0, "learning_rate": 3.783783783783784e-08, "logits/chosen": -0.6135565042495728, "logits/rejected": -0.6135565042495728, "logps/chosen": -104.983154296875, "logps/rejected": -104.983154296875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": -0.025572968646883965, "rewards/margins": 0.0, "rewards/rejected": -0.025572968646883965, "step": 14 }, { "epoch": 0.0, "learning_rate": 4.054054054054054e-08, "logits/chosen": -0.3213292956352234, "logits/rejected": -0.2872928977012634, "logps/chosen": -136.76844787597656, "logps/rejected": -223.98184204101562, "loss": 0.7046, "rewards/accuracies": 0.0, "rewards/chosen": 0.00630950927734375, "rewards/margins": -0.008685302920639515, "rewards/rejected": 0.014994812197983265, "step": 15 }, { "epoch": 0.0, "learning_rate": 4.3243243243243246e-08, "logits/chosen": -0.12903794646263123, "logits/rejected": -0.12139445543289185, "logps/chosen": -93.21926879882812, "logps/rejected": -121.59129333496094, "loss": 0.7199, "rewards/accuracies": 0.0, "rewards/chosen": -0.04407348856329918, "rewards/margins": -0.056475069373846054, "rewards/rejected": 0.012401580810546875, "step": 16 }, { "epoch": 0.0, "learning_rate": 4.5945945945945947e-08, "logits/chosen": -0.3550924062728882, "logits/rejected": -0.3157065510749817, "logps/chosen": -90.94242095947266, "logps/rejected": -201.2190399169922, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.06440506130456924, "rewards/margins": 0.07587051391601562, "rewards/rejected": -0.01146545447409153, "step": 17 }, { "epoch": 0.0, "learning_rate": 4.864864864864865e-08, "logits/chosen": -0.2801969051361084, "logits/rejected": -0.6365466713905334, "logps/chosen": -104.32650756835938, "logps/rejected": -60.384395599365234, "loss": 0.6903, "rewards/accuracies": 0.0, "rewards/chosen": -0.014154816046357155, "rewards/margins": -0.0023181913420557976, "rewards/rejected": -0.011836624704301357, "step": 18 }, { "epoch": 0.0, "learning_rate": 5.1351351351351355e-08, "logits/chosen": -0.38283270597457886, "logits/rejected": -0.2837836742401123, "logps/chosen": -263.32525634765625, "logps/rejected": -30.108945846557617, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.0092315673828125, "rewards/margins": 0.012894058600068092, "rewards/rejected": -0.0036624909844249487, "step": 19 }, { "epoch": 0.0, "learning_rate": 5.4054054054054056e-08, "logits/chosen": -0.22906503081321716, "logits/rejected": -0.2514221966266632, "logps/chosen": -149.56744384765625, "logps/rejected": -144.52554321289062, "loss": 0.7051, "rewards/accuracies": 0.0, "rewards/chosen": -0.06680603325366974, "rewards/margins": -0.03898468241095543, "rewards/rejected": -0.02782135084271431, "step": 20 }, { "epoch": 0.0, "learning_rate": 5.6756756756756756e-08, "logits/chosen": -0.4769887924194336, "logits/rejected": -0.9768126606941223, "logps/chosen": -237.3230743408203, "logps/rejected": -38.842315673828125, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.07454528659582138, "rewards/margins": 0.07287139445543289, "rewards/rejected": 0.001673889230005443, "step": 21 }, { "epoch": 0.0, "learning_rate": 5.945945945945946e-08, "logits/chosen": -0.4802466034889221, "logits/rejected": -0.3694188892841339, "logps/chosen": -70.7939453125, "logps/rejected": -167.08042907714844, "loss": 0.6973, "rewards/accuracies": 1.0, "rewards/chosen": -0.020050048828125, "rewards/margins": 0.07873993366956711, "rewards/rejected": -0.09878998249769211, "step": 22 }, { "epoch": 0.0, "learning_rate": 6.216216216216216e-08, "logits/chosen": -0.39191746711730957, "logits/rejected": -0.3637760877609253, "logps/chosen": -99.01658630371094, "logps/rejected": -50.174842834472656, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": -0.007884216494858265, "rewards/margins": 0.00835952814668417, "rewards/rejected": -0.016243744641542435, "step": 23 }, { "epoch": 0.0, "learning_rate": 6.486486486486487e-08, "logits/chosen": -0.7008110284805298, "logits/rejected": -0.7157371044158936, "logps/chosen": -67.90617370605469, "logps/rejected": -16.2481689453125, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": -0.0012054443359375, "rewards/margins": 0.028290748596191406, "rewards/rejected": -0.029496192932128906, "step": 24 }, { "epoch": 0.0, "learning_rate": 6.756756756756757e-08, "logits/chosen": -0.6836633682250977, "logits/rejected": -0.7150352597236633, "logps/chosen": -183.3684844970703, "logps/rejected": -161.2510986328125, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": -0.01700439490377903, "rewards/margins": 0.01286010816693306, "rewards/rejected": -0.02986450307071209, "step": 25 }, { "epoch": 0.0, "learning_rate": 7.027027027027027e-08, "logits/chosen": -0.25069156289100647, "logits/rejected": -0.2550268769264221, "logps/chosen": -79.00028991699219, "logps/rejected": -121.57803344726562, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.01075515802949667, "rewards/margins": 0.01012954767793417, "rewards/rejected": 0.0006256103515625, "step": 26 }, { "epoch": 0.0, "learning_rate": 7.297297297297297e-08, "logits/chosen": -0.18104097247123718, "logits/rejected": -0.18104097247123718, "logps/chosen": -39.98165512084961, "logps/rejected": -39.98165512084961, "loss": 0.6861, "rewards/accuracies": 0.0, "rewards/chosen": 0.036931611597537994, "rewards/margins": 0.0, "rewards/rejected": 0.036931611597537994, "step": 27 }, { "epoch": 0.0, "learning_rate": 7.567567567567568e-08, "logits/chosen": -0.15514032542705536, "logits/rejected": -0.15514032542705536, "logps/chosen": -72.5407943725586, "logps/rejected": -72.5407943725586, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": -0.016268158331513405, "rewards/margins": 0.0, "rewards/rejected": -0.016268158331513405, "step": 28 }, { "epoch": 0.0, "learning_rate": 7.837837837837838e-08, "logits/chosen": -0.4431764483451843, "logits/rejected": -0.4217994511127472, "logps/chosen": -171.45108032226562, "logps/rejected": -67.66255950927734, "loss": 0.6828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0016159057850018144, "rewards/margins": -0.01575775071978569, "rewards/rejected": 0.01737365685403347, "step": 29 }, { "epoch": 0.0, "learning_rate": 8.108108108108108e-08, "logits/chosen": -0.4948759973049164, "logits/rejected": -0.46931925415992737, "logps/chosen": -89.25163269042969, "logps/rejected": -88.7713394165039, "loss": 0.7155, "rewards/accuracies": 0.0, "rewards/chosen": 0.0018096923595294356, "rewards/margins": -0.03197326511144638, "rewards/rejected": 0.033782958984375, "step": 30 }, { "epoch": 0.01, "learning_rate": 8.378378378378379e-08, "logits/chosen": -0.21655869483947754, "logits/rejected": -0.2348528355360031, "logps/chosen": -132.27581787109375, "logps/rejected": -199.43511962890625, "loss": 0.7309, "rewards/accuracies": 0.0, "rewards/chosen": 0.02433166466653347, "rewards/margins": -0.04292907565832138, "rewards/rejected": 0.0672607421875, "step": 31 }, { "epoch": 0.01, "learning_rate": 8.648648648648649e-08, "logits/chosen": -0.2564854323863983, "logits/rejected": -0.3237861394882202, "logps/chosen": -143.43984985351562, "logps/rejected": -143.3053741455078, "loss": 0.7172, "rewards/accuracies": 0.0, "rewards/chosen": -0.05624847486615181, "rewards/margins": -0.09093628078699112, "rewards/rejected": 0.03468780592083931, "step": 32 }, { "epoch": 0.01, "learning_rate": 8.918918918918919e-08, "logits/chosen": -0.32255616784095764, "logits/rejected": -0.3002498149871826, "logps/chosen": -113.14665222167969, "logps/rejected": -107.28828430175781, "loss": 0.7021, "rewards/accuracies": 1.0, "rewards/chosen": 0.025031281635165215, "rewards/margins": 0.004402924329042435, "rewards/rejected": 0.02062835730612278, "step": 33 }, { "epoch": 0.01, "learning_rate": 9.189189189189189e-08, "logits/chosen": -0.4384247958660126, "logits/rejected": -0.49135464429855347, "logps/chosen": -129.80722045898438, "logps/rejected": -75.83414459228516, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.012667846865952015, "rewards/margins": 0.03738861158490181, "rewards/rejected": -0.02472076378762722, "step": 34 }, { "epoch": 0.01, "learning_rate": 9.45945945945946e-08, "logits/chosen": -1.3956577777862549, "logits/rejected": -1.4270826578140259, "logps/chosen": -41.77008819580078, "logps/rejected": -31.03044891357422, "loss": 0.7185, "rewards/accuracies": 0.0, "rewards/chosen": 0.010449218563735485, "rewards/margins": -0.011626244522631168, "rewards/rejected": 0.022075463086366653, "step": 35 }, { "epoch": 0.01, "learning_rate": 9.72972972972973e-08, "logits/chosen": -0.5214447975158691, "logits/rejected": -0.5153151154518127, "logps/chosen": -67.65122985839844, "logps/rejected": -29.398672103881836, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012344360584393144, "rewards/margins": 0.024415969848632812, "rewards/rejected": -0.023181533440947533, "step": 36 }, { "epoch": 0.01, "learning_rate": 1e-07, "logits/chosen": -0.5391016006469727, "logits/rejected": -0.4588797986507416, "logps/chosen": -112.70020294189453, "logps/rejected": -145.36062622070312, "loss": 0.6496, "rewards/accuracies": 1.0, "rewards/chosen": 0.05281677469611168, "rewards/margins": 0.09342804551124573, "rewards/rejected": -0.04061126708984375, "step": 37 }, { "epoch": 0.01, "learning_rate": 1.0270270270270271e-07, "logits/chosen": -0.6623775362968445, "logits/rejected": -1.1053534746170044, "logps/chosen": -150.60397338867188, "logps/rejected": -39.296478271484375, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.0314483642578125, "rewards/margins": 0.04236755520105362, "rewards/rejected": -0.010919190011918545, "step": 38 }, { "epoch": 0.01, "learning_rate": 1.0540540540540541e-07, "logits/chosen": -0.27772006392478943, "logits/rejected": -0.2688523530960083, "logps/chosen": -125.54698181152344, "logps/rejected": -157.04031372070312, "loss": 0.7251, "rewards/accuracies": 0.0, "rewards/chosen": -0.042189788073301315, "rewards/margins": -0.017174528911709785, "rewards/rejected": -0.02501525916159153, "step": 39 }, { "epoch": 0.01, "learning_rate": 1.0810810810810811e-07, "logits/chosen": -0.10453150421380997, "logits/rejected": -0.048462238162755966, "logps/chosen": -144.72750854492188, "logps/rejected": -48.102760314941406, "loss": 0.6729, "rewards/accuracies": 1.0, "rewards/chosen": 0.02886657789349556, "rewards/margins": 0.01479034498333931, "rewards/rejected": 0.01407623291015625, "step": 40 }, { "epoch": 0.01, "learning_rate": 1.1081081081081081e-07, "logits/chosen": -0.6031096577644348, "logits/rejected": -0.5896523594856262, "logps/chosen": -201.462890625, "logps/rejected": -190.50701904296875, "loss": 0.6989, "rewards/accuracies": 1.0, "rewards/chosen": -0.02431945875287056, "rewards/margins": 0.003692626953125, "rewards/rejected": -0.02801208570599556, "step": 41 }, { "epoch": 0.01, "learning_rate": 1.1351351351351351e-07, "logits/chosen": -0.18167558312416077, "logits/rejected": -0.17382089793682098, "logps/chosen": -47.91633605957031, "logps/rejected": -46.87039566040039, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": -0.018035506829619408, "rewards/margins": -0.02879943698644638, "rewards/rejected": 0.010763931088149548, "step": 42 }, { "epoch": 0.01, "learning_rate": 1.1621621621621621e-07, "logits/chosen": -0.635549008846283, "logits/rejected": -0.5975205898284912, "logps/chosen": -78.81935119628906, "logps/rejected": -102.70780944824219, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": -0.01701965369284153, "rewards/margins": 0.01006317138671875, "rewards/rejected": -0.02708282507956028, "step": 43 }, { "epoch": 0.01, "learning_rate": 1.1891891891891891e-07, "logits/chosen": -0.2405467927455902, "logits/rejected": -0.25062593817710876, "logps/chosen": -216.34109497070312, "logps/rejected": -139.3271026611328, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.023162841796875, "rewards/margins": 0.007537841796875, "rewards/rejected": 0.015625, "step": 44 }, { "epoch": 0.01, "learning_rate": 1.2162162162162163e-07, "logits/chosen": -0.3288933336734772, "logits/rejected": -0.28540632128715515, "logps/chosen": -130.8501739501953, "logps/rejected": -113.89783477783203, "loss": 0.712, "rewards/accuracies": 1.0, "rewards/chosen": 0.02927093580365181, "rewards/margins": 0.02456054836511612, "rewards/rejected": 0.004710388369858265, "step": 45 }, { "epoch": 0.01, "learning_rate": 1.2432432432432432e-07, "logits/chosen": -0.504202663898468, "logits/rejected": -0.44949233531951904, "logps/chosen": -114.13026428222656, "logps/rejected": -120.87135314941406, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.002044677734375, "rewards/margins": -0.02487487904727459, "rewards/rejected": 0.02691955678164959, "step": 46 }, { "epoch": 0.01, "learning_rate": 1.2702702702702703e-07, "logits/chosen": -0.2336266040802002, "logits/rejected": -0.2336266040802002, "logps/chosen": -99.86741638183594, "logps/rejected": -99.86741638183594, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.0084686279296875, "rewards/margins": 0.0, "rewards/rejected": 0.0084686279296875, "step": 47 }, { "epoch": 0.01, "learning_rate": 1.2972972972972974e-07, "logits/chosen": -0.22775404155254364, "logits/rejected": -0.1965351104736328, "logps/chosen": -94.30618286132812, "logps/rejected": -113.19313049316406, "loss": 0.6942, "rewards/accuracies": 1.0, "rewards/chosen": 0.044403076171875, "rewards/margins": 0.05803527683019638, "rewards/rejected": -0.01363220252096653, "step": 48 }, { "epoch": 0.01, "learning_rate": 1.3243243243243243e-07, "logits/chosen": -0.24666862189769745, "logits/rejected": -0.24075134098529816, "logps/chosen": -92.30989074707031, "logps/rejected": -77.63444519042969, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.01954498328268528, "rewards/margins": 0.02264862135052681, "rewards/rejected": -0.003103637835010886, "step": 49 }, { "epoch": 0.01, "learning_rate": 1.3513513513513515e-07, "logits/chosen": -0.1458999067544937, "logits/rejected": -0.1458999067544937, "logps/chosen": -98.3622817993164, "logps/rejected": -98.3622817993164, "loss": 0.6921, "rewards/accuracies": 0.0, "rewards/chosen": 0.01353988703340292, "rewards/margins": 0.0, "rewards/rejected": 0.01353988703340292, "step": 50 }, { "epoch": 0.01, "learning_rate": 1.3783783783783783e-07, "logits/chosen": -0.26953455805778503, "logits/rejected": -0.2573530375957489, "logps/chosen": -21.0843448638916, "logps/rejected": -5.748525619506836, "loss": 0.6852, "rewards/accuracies": 0.0, "rewards/chosen": -0.0020292282570153475, "rewards/margins": -0.009203624911606312, "rewards/rejected": 0.007174396421760321, "step": 51 }, { "epoch": 0.01, "learning_rate": 1.4054054054054055e-07, "logits/chosen": -0.2980785667896271, "logits/rejected": -0.2980785667896271, "logps/chosen": -39.09075164794922, "logps/rejected": -39.09075164794922, "loss": 0.7168, "rewards/accuracies": 0.0, "rewards/chosen": -0.0060066222213208675, "rewards/margins": 0.0, "rewards/rejected": -0.0060066222213208675, "step": 52 }, { "epoch": 0.01, "learning_rate": 1.4324324324324323e-07, "logits/chosen": -0.30244091153144836, "logits/rejected": -0.2704448401927948, "logps/chosen": -86.57439422607422, "logps/rejected": -147.02542114257812, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": -0.02774353139102459, "rewards/margins": -0.02389831654727459, "rewards/rejected": -0.00384521484375, "step": 53 }, { "epoch": 0.01, "learning_rate": 1.4594594594594595e-07, "logits/chosen": -0.35547441244125366, "logits/rejected": -0.3560970425605774, "logps/chosen": -68.78904724121094, "logps/rejected": -91.10145568847656, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 0.00505828857421875, "rewards/margins": -0.03649444505572319, "rewards/rejected": 0.04155273362994194, "step": 54 }, { "epoch": 0.01, "learning_rate": 1.4864864864864866e-07, "logits/chosen": -0.6874668598175049, "logits/rejected": -0.8552953600883484, "logps/chosen": -231.1009521484375, "logps/rejected": -25.190282821655273, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.09915771335363388, "rewards/margins": 0.0969722718000412, "rewards/rejected": 0.00218544015660882, "step": 55 }, { "epoch": 0.01, "learning_rate": 1.5135135135135135e-07, "logits/chosen": -0.5312785506248474, "logits/rejected": -0.5661916136741638, "logps/chosen": -96.31318664550781, "logps/rejected": -105.13238525390625, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": -0.056462861597537994, "rewards/margins": -0.041022494435310364, "rewards/rejected": -0.01544036902487278, "step": 56 }, { "epoch": 0.01, "learning_rate": 1.5405405405405406e-07, "logits/chosen": -0.14247272908687592, "logits/rejected": -0.14208896458148956, "logps/chosen": -83.37798309326172, "logps/rejected": -98.6143798828125, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": -0.036306001245975494, "rewards/margins": -0.027629852294921875, "rewards/rejected": -0.008676148019731045, "step": 57 }, { "epoch": 0.01, "learning_rate": 1.5675675675675675e-07, "logits/chosen": -0.22910155355930328, "logits/rejected": -0.19177718460559845, "logps/chosen": -81.40386962890625, "logps/rejected": -70.64883422851562, "loss": 0.6997, "rewards/accuracies": 1.0, "rewards/chosen": 0.038475800305604935, "rewards/margins": 0.053919222205877304, "rewards/rejected": -0.015443420968949795, "step": 58 }, { "epoch": 0.01, "learning_rate": 1.5945945945945947e-07, "logits/chosen": -0.4123285412788391, "logits/rejected": -0.4099842309951782, "logps/chosen": -47.291786193847656, "logps/rejected": -29.595088958740234, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.035034943372011185, "rewards/margins": 0.03463859483599663, "rewards/rejected": 0.0003963470517192036, "step": 59 }, { "epoch": 0.01, "learning_rate": 1.6216216216216215e-07, "logits/chosen": -0.17085811495780945, "logits/rejected": -0.19071833789348602, "logps/chosen": -246.43589782714844, "logps/rejected": -185.94427490234375, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.08955230563879013, "rewards/margins": 0.10589599609375, "rewards/rejected": -0.01634368859231472, "step": 60 }, { "epoch": 0.01, "learning_rate": 1.6486486486486487e-07, "logits/chosen": -0.4638718068599701, "logits/rejected": -0.4477804899215698, "logps/chosen": -110.71261596679688, "logps/rejected": -27.375043869018555, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.007327270694077015, "rewards/margins": 0.010113716125488281, "rewards/rejected": -0.00278644566424191, "step": 61 }, { "epoch": 0.01, "learning_rate": 1.6756756756756758e-07, "logits/chosen": -0.43762993812561035, "logits/rejected": -0.4521141052246094, "logps/chosen": -37.28195571899414, "logps/rejected": -28.56700897216797, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.016688156872987747, "rewards/margins": 0.02699432522058487, "rewards/rejected": -0.010306167416274548, "step": 62 }, { "epoch": 0.01, "learning_rate": 1.7027027027027027e-07, "logits/chosen": -0.7247428894042969, "logits/rejected": -0.7251338958740234, "logps/chosen": -50.31498336791992, "logps/rejected": -74.01026916503906, "loss": 0.682, "rewards/accuracies": 0.0, "rewards/chosen": 0.016135025769472122, "rewards/margins": -0.017121504992246628, "rewards/rejected": 0.03325653076171875, "step": 63 }, { "epoch": 0.01, "learning_rate": 1.7297297297297298e-07, "logits/chosen": -0.8719493746757507, "logits/rejected": -0.9076974987983704, "logps/chosen": -96.20845031738281, "logps/rejected": -63.511505126953125, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.02361755445599556, "rewards/margins": 0.03260993957519531, "rewards/rejected": -0.008992386050522327, "step": 64 }, { "epoch": 0.01, "learning_rate": 1.7567567567567567e-07, "logits/chosen": -0.6289616227149963, "logits/rejected": -0.6355901956558228, "logps/chosen": -140.11801147460938, "logps/rejected": -194.10174560546875, "loss": 0.6825, "rewards/accuracies": 0.0, "rewards/chosen": -0.07399597018957138, "rewards/margins": -0.05186919867992401, "rewards/rejected": -0.02212676964700222, "step": 65 }, { "epoch": 0.01, "learning_rate": 1.7837837837837838e-07, "logits/chosen": -0.5856955647468567, "logits/rejected": -0.5172184705734253, "logps/chosen": -110.26551055908203, "logps/rejected": -87.49639892578125, "loss": 0.7171, "rewards/accuracies": 0.0, "rewards/chosen": -0.09449996799230576, "rewards/margins": -0.07109451293945312, "rewards/rejected": -0.02340545691549778, "step": 66 }, { "epoch": 0.01, "learning_rate": 1.8108108108108107e-07, "logits/chosen": -0.3695804476737976, "logits/rejected": -0.380123496055603, "logps/chosen": -85.62812805175781, "logps/rejected": -134.5016326904297, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.03887329250574112, "rewards/margins": 0.004949953407049179, "rewards/rejected": 0.03392333909869194, "step": 67 }, { "epoch": 0.01, "learning_rate": 1.8378378378378379e-07, "logits/chosen": -0.24298271536827087, "logits/rejected": -0.24298271536827087, "logps/chosen": -57.67979049682617, "logps/rejected": -57.67979049682617, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 0.005508041474968195, "rewards/margins": 0.0, "rewards/rejected": 0.005508041474968195, "step": 68 }, { "epoch": 0.01, "learning_rate": 1.864864864864865e-07, "logits/chosen": -0.7270678877830505, "logits/rejected": -0.7270678877830505, "logps/chosen": -73.20380401611328, "logps/rejected": -73.20380401611328, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.006679534912109375, "rewards/margins": 0.0, "rewards/rejected": 0.006679534912109375, "step": 69 }, { "epoch": 0.01, "learning_rate": 1.891891891891892e-07, "logits/chosen": -0.5320895910263062, "logits/rejected": -0.49750086665153503, "logps/chosen": -70.35761260986328, "logps/rejected": -49.29041290283203, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": -0.03357086330652237, "rewards/margins": -0.037935640662908554, "rewards/rejected": 0.004364776890724897, "step": 70 }, { "epoch": 0.01, "learning_rate": 1.918918918918919e-07, "logits/chosen": -0.24125409126281738, "logits/rejected": -0.24125409126281738, "logps/chosen": -38.312713623046875, "logps/rejected": -38.312713623046875, "loss": 0.72, "rewards/accuracies": 0.0, "rewards/chosen": -0.019681548699736595, "rewards/margins": 0.0, "rewards/rejected": -0.019681548699736595, "step": 71 }, { "epoch": 0.01, "learning_rate": 1.945945945945946e-07, "logits/chosen": -0.3383989632129669, "logits/rejected": -0.28850236535072327, "logps/chosen": -107.56510925292969, "logps/rejected": -92.4212875366211, "loss": 0.6623, "rewards/accuracies": 1.0, "rewards/chosen": -0.05733642727136612, "rewards/margins": 0.011103056371212006, "rewards/rejected": -0.06843948364257812, "step": 72 }, { "epoch": 0.01, "learning_rate": 1.972972972972973e-07, "logits/chosen": -0.10578275471925735, "logits/rejected": -0.09253601729869843, "logps/chosen": -55.61961364746094, "logps/rejected": -14.928435325622559, "loss": 0.7096, "rewards/accuracies": 0.0, "rewards/chosen": -0.020219041034579277, "rewards/margins": -0.010937786661088467, "rewards/rejected": -0.00928125437349081, "step": 73 }, { "epoch": 0.01, "learning_rate": 2e-07, "logits/chosen": -0.34549859166145325, "logits/rejected": -0.34549859166145325, "logps/chosen": -9.4378662109375, "logps/rejected": -9.4378662109375, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.009312152862548828, "rewards/margins": 0.0, "rewards/rejected": 0.009312152862548828, "step": 74 }, { "epoch": 0.01, "learning_rate": 2.027027027027027e-07, "logits/chosen": -0.21705082058906555, "logits/rejected": -0.21705082058906555, "logps/chosen": -106.22474670410156, "logps/rejected": -106.22474670410156, "loss": 0.6774, "rewards/accuracies": 0.0, "rewards/chosen": -0.013169861398637295, "rewards/margins": 0.0, "rewards/rejected": -0.013169861398637295, "step": 75 }, { "epoch": 0.01, "learning_rate": 2.0540540540540542e-07, "logits/chosen": 0.20349524915218353, "logits/rejected": 0.20093855261802673, "logps/chosen": -15.982999801635742, "logps/rejected": -10.335448265075684, "loss": 0.6791, "rewards/accuracies": 0.0, "rewards/chosen": 0.008326912298798561, "rewards/margins": -0.014885615557432175, "rewards/rejected": 0.023212527856230736, "step": 76 }, { "epoch": 0.01, "learning_rate": 2.081081081081081e-07, "logits/chosen": -0.07979954034090042, "logits/rejected": -0.08253735303878784, "logps/chosen": -26.185331344604492, "logps/rejected": -24.045766830444336, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.0005216598510742188, "rewards/margins": -0.00347137451171875, "rewards/rejected": 0.003993034362792969, "step": 77 }, { "epoch": 0.01, "learning_rate": 2.1081081081081082e-07, "logits/chosen": -0.46265318989753723, "logits/rejected": -0.5309581756591797, "logps/chosen": -233.11062622070312, "logps/rejected": -105.21795654296875, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.03152770921587944, "rewards/margins": 0.03536071628332138, "rewards/rejected": -0.0038330077659338713, "step": 78 }, { "epoch": 0.01, "learning_rate": 2.135135135135135e-07, "logits/chosen": -0.23779058456420898, "logits/rejected": -0.24285298585891724, "logps/chosen": -111.81716918945312, "logps/rejected": -69.55924987792969, "loss": 0.7392, "rewards/accuracies": 0.0, "rewards/chosen": -0.003504181047901511, "rewards/margins": -0.0748802199959755, "rewards/rejected": 0.07137604057788849, "step": 79 }, { "epoch": 0.01, "learning_rate": 2.1621621621621622e-07, "logits/chosen": -0.19272474944591522, "logits/rejected": -0.22290301322937012, "logps/chosen": -11.308154106140137, "logps/rejected": -55.87501525878906, "loss": 0.6566, "rewards/accuracies": 1.0, "rewards/chosen": -0.0002766609250102192, "rewards/margins": 0.02400236204266548, "rewards/rejected": -0.024279022589325905, "step": 80 }, { "epoch": 0.01, "learning_rate": 2.189189189189189e-07, "logits/chosen": -0.5148763060569763, "logits/rejected": -0.5019383430480957, "logps/chosen": -94.63790893554688, "logps/rejected": -71.1953125, "loss": 0.6884, "rewards/accuracies": 0.0, "rewards/chosen": -0.032578278332948685, "rewards/margins": -0.007135773077607155, "rewards/rejected": -0.02544250525534153, "step": 81 }, { "epoch": 0.01, "learning_rate": 2.2162162162162162e-07, "logits/chosen": -0.36406734585762024, "logits/rejected": -0.3333701193332672, "logps/chosen": -133.52593994140625, "logps/rejected": -179.96914672851562, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.03157501295208931, "rewards/margins": -0.00834197923541069, "rewards/rejected": 0.0399169921875, "step": 82 }, { "epoch": 0.01, "learning_rate": 2.2432432432432434e-07, "logits/chosen": -0.45190754532814026, "logits/rejected": -0.4415975511074066, "logps/chosen": -150.2541046142578, "logps/rejected": -114.25859069824219, "loss": 0.6902, "rewards/accuracies": 0.0, "rewards/chosen": -0.01854553259909153, "rewards/margins": -0.007732391357421875, "rewards/rejected": -0.010813141241669655, "step": 83 }, { "epoch": 0.01, "learning_rate": 2.2702702702702703e-07, "logits/chosen": -0.29479897022247314, "logits/rejected": -0.29610952734947205, "logps/chosen": -29.168535232543945, "logps/rejected": -6.011186599731445, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.014700508676469326, "rewards/margins": 0.013163662515580654, "rewards/rejected": 0.0015368461608886719, "step": 84 }, { "epoch": 0.01, "learning_rate": 2.2972972972972974e-07, "logits/chosen": -0.7457717061042786, "logits/rejected": -0.7140187621116638, "logps/chosen": -180.11395263671875, "logps/rejected": -41.07224655151367, "loss": 0.7128, "rewards/accuracies": 1.0, "rewards/chosen": 0.07253418117761612, "rewards/margins": 0.05944061279296875, "rewards/rejected": 0.013093567453324795, "step": 85 }, { "epoch": 0.01, "learning_rate": 2.3243243243243243e-07, "logits/chosen": -0.48659276962280273, "logits/rejected": -0.47701093554496765, "logps/chosen": -246.07666015625, "logps/rejected": -96.64920806884766, "loss": 0.6939, "rewards/accuracies": 1.0, "rewards/chosen": 0.06417236477136612, "rewards/margins": 0.036443330347537994, "rewards/rejected": 0.027729034423828125, "step": 86 }, { "epoch": 0.01, "learning_rate": 2.3513513513513514e-07, "logits/chosen": -0.30337461829185486, "logits/rejected": -0.29526323080062866, "logps/chosen": -129.53268432617188, "logps/rejected": -64.26728820800781, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": -0.02077789418399334, "rewards/margins": 0.015106962993741035, "rewards/rejected": -0.035884857177734375, "step": 87 }, { "epoch": 0.01, "learning_rate": 2.3783783783783783e-07, "logits/chosen": -0.3413613438606262, "logits/rejected": -0.31273478269577026, "logps/chosen": -95.98593139648438, "logps/rejected": -137.15240478515625, "loss": 0.6688, "rewards/accuracies": 1.0, "rewards/chosen": -0.0005508423200808465, "rewards/margins": 0.08510742336511612, "rewards/rejected": -0.08565826714038849, "step": 88 }, { "epoch": 0.01, "learning_rate": 2.4054054054054054e-07, "logits/chosen": -0.4221193194389343, "logits/rejected": -0.38248753547668457, "logps/chosen": -59.77336883544922, "logps/rejected": -20.738597869873047, "loss": 0.7286, "rewards/accuracies": 0.0, "rewards/chosen": -0.01537857111543417, "rewards/margins": -0.011651039123535156, "rewards/rejected": -0.003727531526237726, "step": 89 }, { "epoch": 0.01, "learning_rate": 2.4324324324324326e-07, "logits/chosen": -0.5572382211685181, "logits/rejected": -0.5483370423316956, "logps/chosen": -141.571533203125, "logps/rejected": -185.57432556152344, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.02987213246524334, "rewards/margins": 0.0545654296875, "rewards/rejected": -0.02469329908490181, "step": 90 }, { "epoch": 0.01, "learning_rate": 2.4594594594594597e-07, "logits/chosen": -0.5232946276664734, "logits/rejected": -0.493459552526474, "logps/chosen": -128.22964477539062, "logps/rejected": -24.319225311279297, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 0.05040283128619194, "rewards/margins": 0.04625682905316353, "rewards/rejected": 0.004146003630012274, "step": 91 }, { "epoch": 0.01, "learning_rate": 2.4864864864864863e-07, "logits/chosen": -0.502502977848053, "logits/rejected": -0.45696043968200684, "logps/chosen": -209.74441528320312, "logps/rejected": -21.400175094604492, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.11849518120288849, "rewards/margins": 0.11514854431152344, "rewards/rejected": 0.0033466338645666838, "step": 92 }, { "epoch": 0.02, "learning_rate": 2.5135135135135135e-07, "logits/chosen": -0.23714762926101685, "logits/rejected": -0.2349853813648224, "logps/chosen": -58.09187698364258, "logps/rejected": -40.152244567871094, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.03674736246466637, "rewards/margins": 0.04014625772833824, "rewards/rejected": -0.003398895263671875, "step": 93 }, { "epoch": 0.02, "learning_rate": 2.5405405405405406e-07, "logits/chosen": -0.34188154339790344, "logits/rejected": -0.3044472336769104, "logps/chosen": -88.22383880615234, "logps/rejected": -92.65487670898438, "loss": 0.7105, "rewards/accuracies": 0.0, "rewards/chosen": -0.033394623547792435, "rewards/margins": -0.043245699256658554, "rewards/rejected": 0.009851074777543545, "step": 94 }, { "epoch": 0.02, "learning_rate": 2.567567567567567e-07, "logits/chosen": -0.858312726020813, "logits/rejected": -0.765798032283783, "logps/chosen": -133.55941772460938, "logps/rejected": -120.48493194580078, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": -0.013078308664262295, "rewards/margins": 0.005043028853833675, "rewards/rejected": -0.01812133751809597, "step": 95 }, { "epoch": 0.02, "learning_rate": 2.594594594594595e-07, "logits/chosen": -0.12803782522678375, "logits/rejected": -0.12803782522678375, "logps/chosen": -47.74540710449219, "logps/rejected": -47.74540710449219, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": -0.022724533453583717, "rewards/margins": 0.0, "rewards/rejected": -0.022724533453583717, "step": 96 }, { "epoch": 0.02, "learning_rate": 2.6216216216216215e-07, "logits/chosen": -0.2836954891681671, "logits/rejected": -0.29024195671081543, "logps/chosen": -77.07351684570312, "logps/rejected": -99.91641235351562, "loss": 0.7347, "rewards/accuracies": 0.0, "rewards/chosen": -0.040248870849609375, "rewards/margins": -0.0967002883553505, "rewards/rejected": 0.05645141750574112, "step": 97 }, { "epoch": 0.02, "learning_rate": 2.6486486486486486e-07, "logits/chosen": -0.2599557042121887, "logits/rejected": -0.2754264175891876, "logps/chosen": -168.5241241455078, "logps/rejected": -34.569427490234375, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": -0.04150543361902237, "rewards/margins": -0.03328094631433487, "rewards/rejected": -0.0082244873046875, "step": 98 }, { "epoch": 0.02, "learning_rate": 2.675675675675675e-07, "logits/chosen": -0.4904974400997162, "logits/rejected": -0.45753082633018494, "logps/chosen": -89.15298461914062, "logps/rejected": -158.56509399414062, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": -0.036348726600408554, "rewards/margins": 0.010861966758966446, "rewards/rejected": -0.047210693359375, "step": 99 }, { "epoch": 0.02, "learning_rate": 2.702702702702703e-07, "logits/chosen": -0.6604823470115662, "logits/rejected": -0.6498090028762817, "logps/chosen": -140.8720703125, "logps/rejected": -78.39651489257812, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": -0.008837890811264515, "rewards/margins": 0.033107757568359375, "rewards/rejected": -0.041945647448301315, "step": 100 }, { "epoch": 0.02, "learning_rate": 2.7297297297297295e-07, "logits/chosen": -0.4302668869495392, "logits/rejected": -0.4059874415397644, "logps/chosen": -85.78804016113281, "logps/rejected": -70.36668395996094, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.041133880615234375, "rewards/margins": 0.013024138286709785, "rewards/rejected": 0.02810974232852459, "step": 101 }, { "epoch": 0.02, "learning_rate": 2.7567567567567567e-07, "logits/chosen": -0.6101118922233582, "logits/rejected": -0.5136879086494446, "logps/chosen": -90.1182861328125, "logps/rejected": -187.500244140625, "loss": 0.7129, "rewards/accuracies": 0.0, "rewards/chosen": -0.014741516672074795, "rewards/margins": -0.06167145073413849, "rewards/rejected": 0.04692993313074112, "step": 102 }, { "epoch": 0.02, "learning_rate": 2.7837837837837833e-07, "logits/chosen": -0.48063626885414124, "logits/rejected": -0.4669080972671509, "logps/chosen": -127.52775573730469, "logps/rejected": -89.54747009277344, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.00308990478515625, "rewards/margins": 0.037506867200136185, "rewards/rejected": -0.034416962414979935, "step": 103 }, { "epoch": 0.02, "learning_rate": 2.810810810810811e-07, "logits/chosen": -0.45558997988700867, "logits/rejected": -0.4819584786891937, "logps/chosen": -97.61251831054688, "logps/rejected": -180.79013061523438, "loss": 0.682, "rewards/accuracies": 0.0, "rewards/chosen": 0.012089538387954235, "rewards/margins": -0.03429107740521431, "rewards/rejected": 0.04638061672449112, "step": 104 }, { "epoch": 0.02, "learning_rate": 2.8378378378378376e-07, "logits/chosen": -0.43667200207710266, "logits/rejected": -0.41790643334388733, "logps/chosen": -74.58078002929688, "logps/rejected": -65.32780456542969, "loss": 0.6379, "rewards/accuracies": 1.0, "rewards/chosen": 0.07682953029870987, "rewards/margins": 0.06102752685546875, "rewards/rejected": 0.01580200158059597, "step": 105 }, { "epoch": 0.02, "learning_rate": 2.8648648648648647e-07, "logits/chosen": -0.6027013659477234, "logits/rejected": -0.2610365152359009, "logps/chosen": -79.4964599609375, "logps/rejected": -284.44775390625, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.008251190185546875, "rewards/margins": 0.10515060275793076, "rewards/rejected": -0.09689941257238388, "step": 106 }, { "epoch": 0.02, "learning_rate": 2.891891891891892e-07, "logits/chosen": -0.2675948441028595, "logits/rejected": -0.2622217833995819, "logps/chosen": -69.43916320800781, "logps/rejected": -51.569984436035156, "loss": 0.6534, "rewards/accuracies": 1.0, "rewards/chosen": 0.00020599365234375, "rewards/margins": 0.055890657007694244, "rewards/rejected": -0.055684663355350494, "step": 107 }, { "epoch": 0.02, "learning_rate": 2.918918918918919e-07, "logits/chosen": -0.43873438239097595, "logits/rejected": -0.43503403663635254, "logps/chosen": -100.69526672363281, "logps/rejected": -136.63363647460938, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.01964569091796875, "rewards/margins": 0.03274231031537056, "rewards/rejected": -0.013096618466079235, "step": 108 }, { "epoch": 0.02, "learning_rate": 2.9459459459459456e-07, "logits/chosen": -0.48678824305534363, "logits/rejected": -0.5054294466972351, "logps/chosen": -86.35154724121094, "logps/rejected": -127.58827209472656, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": -0.014477538876235485, "rewards/margins": 0.05716552957892418, "rewards/rejected": -0.07164306938648224, "step": 109 }, { "epoch": 0.02, "learning_rate": 2.972972972972973e-07, "logits/chosen": -0.5430227518081665, "logits/rejected": -0.5578933954238892, "logps/chosen": -143.0216064453125, "logps/rejected": -88.64386749267578, "loss": 0.6824, "rewards/accuracies": 0.0, "rewards/chosen": -0.008114623837172985, "rewards/margins": -0.018129730597138405, "rewards/rejected": 0.01001510675996542, "step": 110 }, { "epoch": 0.02, "learning_rate": 3e-07, "logits/chosen": -0.6262672543525696, "logits/rejected": -0.6254271268844604, "logps/chosen": -85.8165283203125, "logps/rejected": -75.49491119384766, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": -0.019277190789580345, "rewards/margins": -0.006497192196547985, "rewards/rejected": -0.01277999859303236, "step": 111 }, { "epoch": 0.02, "learning_rate": 3.027027027027027e-07, "logits/chosen": -0.457510769367218, "logits/rejected": -0.5378742814064026, "logps/chosen": -80.43922424316406, "logps/rejected": -114.78785705566406, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": -0.05627289041876793, "rewards/margins": 0.03155669942498207, "rewards/rejected": -0.08782958984375, "step": 112 }, { "epoch": 0.02, "learning_rate": 3.0540540540540536e-07, "logits/chosen": -0.33606821298599243, "logits/rejected": -0.3064122796058655, "logps/chosen": -74.66036987304688, "logps/rejected": -90.19623565673828, "loss": 0.6976, "rewards/accuracies": 0.0, "rewards/chosen": -0.02735443226993084, "rewards/margins": -0.017919160425662994, "rewards/rejected": -0.00943527277559042, "step": 113 }, { "epoch": 0.02, "learning_rate": 3.0810810810810813e-07, "logits/chosen": -0.6705072522163391, "logits/rejected": -0.6338600516319275, "logps/chosen": -146.64114379882812, "logps/rejected": -40.2343635559082, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.03147430345416069, "rewards/margins": 0.05284462124109268, "rewards/rejected": -0.021370315924286842, "step": 114 }, { "epoch": 0.02, "learning_rate": 3.108108108108108e-07, "logits/chosen": -0.531464695930481, "logits/rejected": -0.4789521098136902, "logps/chosen": -182.2274932861328, "logps/rejected": -200.99205017089844, "loss": 0.7053, "rewards/accuracies": 0.0, "rewards/chosen": -0.03684387356042862, "rewards/margins": -0.06227264553308487, "rewards/rejected": 0.02542877197265625, "step": 115 }, { "epoch": 0.02, "learning_rate": 3.135135135135135e-07, "logits/chosen": 0.1123599261045456, "logits/rejected": 0.1123599261045456, "logps/chosen": -23.25205421447754, "logps/rejected": -23.25205421447754, "loss": 0.6884, "rewards/accuracies": 0.0, "rewards/chosen": 0.005395126529037952, "rewards/margins": 0.0, "rewards/rejected": 0.005395126529037952, "step": 116 }, { "epoch": 0.02, "learning_rate": 3.162162162162162e-07, "logits/chosen": -0.5354961156845093, "logits/rejected": -0.47919073700904846, "logps/chosen": -83.12342834472656, "logps/rejected": -189.90115356445312, "loss": 0.7228, "rewards/accuracies": 0.0, "rewards/chosen": -0.08088379353284836, "rewards/margins": -0.07893677055835724, "rewards/rejected": -0.0019470214610919356, "step": 117 }, { "epoch": 0.02, "learning_rate": 3.1891891891891893e-07, "logits/chosen": -0.19211018085479736, "logits/rejected": -0.14636245369911194, "logps/chosen": -55.638885498046875, "logps/rejected": -57.65934753417969, "loss": 0.6836, "rewards/accuracies": 0.0, "rewards/chosen": 0.00506591796875, "rewards/margins": -0.04758148267865181, "rewards/rejected": 0.05264740064740181, "step": 118 }, { "epoch": 0.02, "learning_rate": 3.216216216216216e-07, "logits/chosen": -0.42548295855522156, "logits/rejected": -0.4367714524269104, "logps/chosen": -9.379322052001953, "logps/rejected": -20.594425201416016, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": -0.004791259765625, "rewards/margins": 0.0011009215377271175, "rewards/rejected": -0.0058921813033521175, "step": 119 }, { "epoch": 0.02, "learning_rate": 3.243243243243243e-07, "logits/chosen": -0.21091599762439728, "logits/rejected": -0.25646913051605225, "logps/chosen": -269.9447326660156, "logps/rejected": -95.23390197753906, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": -0.03367004543542862, "rewards/margins": -0.027596283704042435, "rewards/rejected": -0.006073761265724897, "step": 120 }, { "epoch": 0.02, "learning_rate": 3.27027027027027e-07, "logits/chosen": -0.4443598687648773, "logits/rejected": -0.4658859074115753, "logps/chosen": -165.03977966308594, "logps/rejected": -187.27285766601562, "loss": 0.7215, "rewards/accuracies": 0.0, "rewards/chosen": -0.03979644924402237, "rewards/margins": -0.08998413383960724, "rewards/rejected": 0.05018768459558487, "step": 121 }, { "epoch": 0.02, "learning_rate": 3.2972972972972973e-07, "logits/chosen": -0.4343929886817932, "logits/rejected": -0.4155595004558563, "logps/chosen": -59.10508728027344, "logps/rejected": -57.539154052734375, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": -0.023688508197665215, "rewards/margins": -0.01194458082318306, "rewards/rejected": -0.011743927374482155, "step": 122 }, { "epoch": 0.02, "learning_rate": 3.324324324324324e-07, "logits/chosen": -0.5170040726661682, "logits/rejected": -0.48114633560180664, "logps/chosen": -92.64785766601562, "logps/rejected": -42.570655822753906, "loss": 0.695, "rewards/accuracies": 1.0, "rewards/chosen": 0.01424407958984375, "rewards/margins": 0.0009178156033158302, "rewards/rejected": 0.01332626398652792, "step": 123 }, { "epoch": 0.02, "learning_rate": 3.3513513513513516e-07, "logits/chosen": 0.06962600350379944, "logits/rejected": -0.5058327913284302, "logps/chosen": -21.966323852539062, "logps/rejected": -89.90515899658203, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.0327606201171875, "rewards/margins": 0.053237155079841614, "rewards/rejected": -0.020476533100008965, "step": 124 }, { "epoch": 0.02, "learning_rate": 3.378378378378378e-07, "logits/chosen": -0.4174959361553192, "logits/rejected": -0.3943346440792084, "logps/chosen": -40.712589263916016, "logps/rejected": -89.76652526855469, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": -0.002962494036182761, "rewards/margins": 0.05866623297333717, "rewards/rejected": -0.06162872537970543, "step": 125 }, { "epoch": 0.02, "learning_rate": 3.4054054054054054e-07, "logits/chosen": -0.4860614538192749, "logits/rejected": -0.7521403431892395, "logps/chosen": -206.7527618408203, "logps/rejected": -51.07719421386719, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 0.01658782921731472, "rewards/margins": 0.047543715685606, "rewards/rejected": -0.030955886468291283, "step": 126 }, { "epoch": 0.02, "learning_rate": 3.432432432432432e-07, "logits/chosen": -0.04114343971014023, "logits/rejected": -0.058321356773376465, "logps/chosen": -163.70484924316406, "logps/rejected": -118.59695434570312, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": -0.01861267164349556, "rewards/margins": -0.03238372877240181, "rewards/rejected": 0.01377105712890625, "step": 127 }, { "epoch": 0.02, "learning_rate": 3.4594594594594597e-07, "logits/chosen": -0.09121374785900116, "logits/rejected": -0.09582386165857315, "logps/chosen": -45.795127868652344, "logps/rejected": -67.62117004394531, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.047681428492069244, "rewards/margins": 0.044429779052734375, "rewards/rejected": 0.0032516480423510075, "step": 128 }, { "epoch": 0.02, "learning_rate": 3.4864864864864863e-07, "logits/chosen": -0.3792528510093689, "logits/rejected": -0.314059853553772, "logps/chosen": -172.4921417236328, "logps/rejected": -269.31787109375, "loss": 0.6598, "rewards/accuracies": 1.0, "rewards/chosen": 0.009318542666733265, "rewards/margins": 0.06190032884478569, "rewards/rejected": -0.052581787109375, "step": 129 }, { "epoch": 0.02, "learning_rate": 3.5135135135135134e-07, "logits/chosen": -0.4061877131462097, "logits/rejected": -0.4033457934856415, "logps/chosen": -64.20816040039062, "logps/rejected": -78.70085906982422, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.01371841412037611, "rewards/margins": 0.03266143798828125, "rewards/rejected": -0.018943024799227715, "step": 130 }, { "epoch": 0.02, "learning_rate": 3.5405405405405406e-07, "logits/chosen": -0.25841549038887024, "logits/rejected": -0.25841549038887024, "logps/chosen": -74.90705108642578, "logps/rejected": -74.90705108642578, "loss": 0.7044, "rewards/accuracies": 0.0, "rewards/chosen": 0.01528091449290514, "rewards/margins": 0.0, "rewards/rejected": 0.01528091449290514, "step": 131 }, { "epoch": 0.02, "learning_rate": 3.5675675675675677e-07, "logits/chosen": -0.3377506136894226, "logits/rejected": -0.32851657271385193, "logps/chosen": -77.65176391601562, "logps/rejected": -103.34415435791016, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.007632446475327015, "rewards/margins": 0.026911165565252304, "rewards/rejected": -0.019278718158602715, "step": 132 }, { "epoch": 0.02, "learning_rate": 3.5945945945945943e-07, "logits/chosen": -0.35729414224624634, "logits/rejected": -0.32657530903816223, "logps/chosen": -111.97097778320312, "logps/rejected": -100.93412017822266, "loss": 0.7164, "rewards/accuracies": 0.0, "rewards/chosen": -0.045919038355350494, "rewards/margins": -0.07423248887062073, "rewards/rejected": 0.028313446789979935, "step": 133 }, { "epoch": 0.02, "learning_rate": 3.6216216216216214e-07, "logits/chosen": -0.41516420245170593, "logits/rejected": -0.48751771450042725, "logps/chosen": -144.47756958007812, "logps/rejected": -82.90008544921875, "loss": 0.6783, "rewards/accuracies": 0.0, "rewards/chosen": 0.005831909365952015, "rewards/margins": -0.014634705148637295, "rewards/rejected": 0.02046661451458931, "step": 134 }, { "epoch": 0.02, "learning_rate": 3.6486486486486486e-07, "logits/chosen": -0.42283007502555847, "logits/rejected": -0.4187219440937042, "logps/chosen": -106.69719696044922, "logps/rejected": -74.30683898925781, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 0.016191864386200905, "rewards/margins": -0.031943514943122864, "rewards/rejected": 0.04813537746667862, "step": 135 }, { "epoch": 0.02, "learning_rate": 3.6756756756756757e-07, "logits/chosen": -0.5284646153450012, "logits/rejected": -0.35243669152259827, "logps/chosen": -108.50602722167969, "logps/rejected": -76.23590087890625, "loss": 0.6845, "rewards/accuracies": 0.0, "rewards/chosen": 0.008160400204360485, "rewards/margins": -0.036035917699337006, "rewards/rejected": 0.044196318835020065, "step": 136 }, { "epoch": 0.02, "learning_rate": 3.7027027027027023e-07, "logits/chosen": -0.04741958901286125, "logits/rejected": -0.03845483809709549, "logps/chosen": -53.21049880981445, "logps/rejected": -45.128814697265625, "loss": 0.6924, "rewards/accuracies": 0.0, "rewards/chosen": -0.028887558728456497, "rewards/margins": -0.037743762135505676, "rewards/rejected": 0.00885620154440403, "step": 137 }, { "epoch": 0.02, "learning_rate": 3.72972972972973e-07, "logits/chosen": -0.27041199803352356, "logits/rejected": -0.24802780151367188, "logps/chosen": -91.22891235351562, "logps/rejected": -99.35017395019531, "loss": 0.6749, "rewards/accuracies": 0.0, "rewards/chosen": 0.009652710519731045, "rewards/margins": -0.042067717760801315, "rewards/rejected": 0.051720429211854935, "step": 138 }, { "epoch": 0.02, "learning_rate": 3.7567567567567566e-07, "logits/chosen": 0.023483330383896828, "logits/rejected": 0.022165104746818542, "logps/chosen": -105.61767578125, "logps/rejected": -133.98867797851562, "loss": 0.6957, "rewards/accuracies": 1.0, "rewards/chosen": -0.01991424523293972, "rewards/margins": 0.01962432824075222, "rewards/rejected": -0.03953857347369194, "step": 139 }, { "epoch": 0.02, "learning_rate": 3.783783783783784e-07, "logits/chosen": -0.26159045100212097, "logits/rejected": -0.24941739439964294, "logps/chosen": -60.812530517578125, "logps/rejected": -14.849217414855957, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": -0.019080353900790215, "rewards/margins": -0.031057167798280716, "rewards/rejected": 0.011976814828813076, "step": 140 }, { "epoch": 0.02, "learning_rate": 3.8108108108108104e-07, "logits/chosen": -0.17335355281829834, "logits/rejected": -0.17335355281829834, "logps/chosen": -43.975730895996094, "logps/rejected": -43.975730895996094, "loss": 0.7015, "rewards/accuracies": 0.0, "rewards/chosen": 0.00568389892578125, "rewards/margins": 0.0, "rewards/rejected": 0.00568389892578125, "step": 141 }, { "epoch": 0.02, "learning_rate": 3.837837837837838e-07, "logits/chosen": -0.07741741836071014, "logits/rejected": -0.056744158267974854, "logps/chosen": -35.772621154785156, "logps/rejected": -23.05094337463379, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.025529099628329277, "rewards/margins": 0.038028910756111145, "rewards/rejected": -0.012499809265136719, "step": 142 }, { "epoch": 0.02, "learning_rate": 3.8648648648648646e-07, "logits/chosen": 0.09492703527212143, "logits/rejected": 0.08789538592100143, "logps/chosen": -98.54158020019531, "logps/rejected": -69.60623168945312, "loss": 0.7095, "rewards/accuracies": 0.0, "rewards/chosen": 0.023535920307040215, "rewards/margins": -0.010346220806241035, "rewards/rejected": 0.03388214111328125, "step": 143 }, { "epoch": 0.02, "learning_rate": 3.891891891891892e-07, "logits/chosen": -0.34437382221221924, "logits/rejected": -0.32363173365592957, "logps/chosen": -73.7204818725586, "logps/rejected": -73.9080810546875, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": -0.023824309930205345, "rewards/margins": -0.024394989013671875, "rewards/rejected": 0.0005706787342205644, "step": 144 }, { "epoch": 0.02, "learning_rate": 3.918918918918919e-07, "logits/chosen": -0.4500068128108978, "logits/rejected": -0.3890002965927124, "logps/chosen": -172.98178100585938, "logps/rejected": -149.42935180664062, "loss": 0.661, "rewards/accuracies": 1.0, "rewards/chosen": 0.10786133259534836, "rewards/margins": 0.13588868081569672, "rewards/rejected": -0.02802734449505806, "step": 145 }, { "epoch": 0.02, "learning_rate": 3.945945945945946e-07, "logits/chosen": -0.37733393907546997, "logits/rejected": -0.37943169474601746, "logps/chosen": -4.967727184295654, "logps/rejected": -3.8278138637542725, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": -0.006818199064582586, "rewards/margins": -0.008564924821257591, "rewards/rejected": 0.0017467261059209704, "step": 146 }, { "epoch": 0.02, "learning_rate": 3.9729729729729727e-07, "logits/chosen": -0.31117406487464905, "logits/rejected": -0.29989197850227356, "logps/chosen": -136.37112426757812, "logps/rejected": -114.39299011230469, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 0.038360595703125, "rewards/margins": -0.02016906812787056, "rewards/rejected": 0.05852966383099556, "step": 147 }, { "epoch": 0.02, "learning_rate": 4e-07, "logits/chosen": -0.10729793459177017, "logits/rejected": -0.10755807906389236, "logps/chosen": -72.13920593261719, "logps/rejected": -100.378173828125, "loss": 0.6888, "rewards/accuracies": 0.0, "rewards/chosen": -0.01849823072552681, "rewards/margins": -0.02035217359662056, "rewards/rejected": 0.00185394287109375, "step": 148 }, { "epoch": 0.02, "learning_rate": 4.027027027027027e-07, "logits/chosen": -0.2847537696361542, "logits/rejected": -0.2885819971561432, "logps/chosen": -70.42921447753906, "logps/rejected": -79.92598724365234, "loss": 0.6839, "rewards/accuracies": 0.0, "rewards/chosen": -0.02775726281106472, "rewards/margins": -0.02559814415872097, "rewards/rejected": -0.00215911865234375, "step": 149 }, { "epoch": 0.02, "learning_rate": 4.054054054054054e-07, "logits/chosen": -0.43016016483306885, "logits/rejected": -0.3115110993385315, "logps/chosen": -189.0746307373047, "logps/rejected": -90.49302673339844, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.05774078518152237, "rewards/margins": 0.04943695291876793, "rewards/rejected": 0.008303833194077015, "step": 150 }, { "epoch": 0.02, "learning_rate": 4.0810810810810807e-07, "logits/chosen": -0.4740147590637207, "logits/rejected": -0.4840453863143921, "logps/chosen": -17.468555450439453, "logps/rejected": -10.110576629638672, "loss": 0.7239, "rewards/accuracies": 0.0, "rewards/chosen": -0.00791778601706028, "rewards/margins": -0.0028995517641305923, "rewards/rejected": -0.0050182342529296875, "step": 151 }, { "epoch": 0.02, "learning_rate": 4.1081081081081084e-07, "logits/chosen": -0.7416688203811646, "logits/rejected": -0.7157127261161804, "logps/chosen": -106.77338409423828, "logps/rejected": -95.122314453125, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.00980148371309042, "rewards/margins": 0.028955843299627304, "rewards/rejected": -0.01915435865521431, "step": 152 }, { "epoch": 0.02, "learning_rate": 4.135135135135135e-07, "logits/chosen": -0.44858577847480774, "logits/rejected": -0.429670125246048, "logps/chosen": -83.85598754882812, "logps/rejected": -63.96984100341797, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": -0.026396943256258965, "rewards/margins": -0.011467362754046917, "rewards/rejected": -0.014929580502212048, "step": 153 }, { "epoch": 0.02, "learning_rate": 4.162162162162162e-07, "logits/chosen": -0.2924996018409729, "logits/rejected": -0.2583092749118805, "logps/chosen": -67.37045288085938, "logps/rejected": -121.9477767944336, "loss": 0.6858, "rewards/accuracies": 0.0, "rewards/chosen": -0.01719818077981472, "rewards/margins": -0.023476410657167435, "rewards/rejected": 0.00627822894603014, "step": 154 }, { "epoch": 0.03, "learning_rate": 4.189189189189189e-07, "logits/chosen": 0.19871847331523895, "logits/rejected": 0.23179113864898682, "logps/chosen": -83.71090698242188, "logps/rejected": -53.49070739746094, "loss": 0.6888, "rewards/accuracies": 0.0, "rewards/chosen": -0.05315246805548668, "rewards/margins": -0.011826325207948685, "rewards/rejected": -0.041326142847537994, "step": 155 }, { "epoch": 0.03, "learning_rate": 4.2162162162162164e-07, "logits/chosen": -0.1704348921775818, "logits/rejected": -0.1750057190656662, "logps/chosen": -200.64659118652344, "logps/rejected": -179.60118103027344, "loss": 0.6962, "rewards/accuracies": 1.0, "rewards/chosen": 0.05722350999712944, "rewards/margins": 0.04662017524242401, "rewards/rejected": 0.01060333289206028, "step": 156 }, { "epoch": 0.03, "learning_rate": 4.243243243243243e-07, "logits/chosen": -0.2732248306274414, "logits/rejected": -0.2732248306274414, "logps/chosen": -86.15199279785156, "logps/rejected": -86.15199279785156, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": -0.01701507531106472, "rewards/margins": 0.0, "rewards/rejected": -0.01701507531106472, "step": 157 }, { "epoch": 0.03, "learning_rate": 4.27027027027027e-07, "logits/chosen": -0.24047128856182098, "logits/rejected": -0.18847733736038208, "logps/chosen": -297.6524963378906, "logps/rejected": -189.7757568359375, "loss": 0.706, "rewards/accuracies": 1.0, "rewards/chosen": 0.0295257568359375, "rewards/margins": 0.01003723032772541, "rewards/rejected": 0.01948852650821209, "step": 158 }, { "epoch": 0.03, "learning_rate": 4.2972972972972973e-07, "logits/chosen": -0.46448588371276855, "logits/rejected": -0.4505486488342285, "logps/chosen": -74.614501953125, "logps/rejected": -92.679443359375, "loss": 0.7064, "rewards/accuracies": 1.0, "rewards/chosen": -0.01561660785228014, "rewards/margins": 0.02295990288257599, "rewards/rejected": -0.038576509803533554, "step": 159 }, { "epoch": 0.03, "learning_rate": 4.3243243243243244e-07, "logits/chosen": -0.2529315650463104, "logits/rejected": -0.2633204758167267, "logps/chosen": -117.00637817382812, "logps/rejected": -71.7982177734375, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.011688232421875, "rewards/margins": 0.008084106259047985, "rewards/rejected": 0.0036041259299963713, "step": 160 }, { "epoch": 0.03, "learning_rate": 4.351351351351351e-07, "logits/chosen": -1.9372954368591309, "logits/rejected": -1.9324028491973877, "logps/chosen": -91.79108428955078, "logps/rejected": -91.09053039550781, "loss": 0.683, "rewards/accuracies": 0.0, "rewards/chosen": -0.042096711695194244, "rewards/margins": -0.051648713648319244, "rewards/rejected": 0.009552001953125, "step": 161 }, { "epoch": 0.03, "learning_rate": 4.378378378378378e-07, "logits/chosen": -0.6492835879325867, "logits/rejected": -0.6210626363754272, "logps/chosen": -66.37069702148438, "logps/rejected": -67.86139678955078, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.040265657007694244, "rewards/margins": 0.013550568372011185, "rewards/rejected": 0.02671508863568306, "step": 162 }, { "epoch": 0.03, "learning_rate": 4.4054054054054053e-07, "logits/chosen": -0.18090297281742096, "logits/rejected": -0.19653116166591644, "logps/chosen": -5.007916450500488, "logps/rejected": -43.968589782714844, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": -0.0029819488991051912, "rewards/margins": 0.0011768343392759562, "rewards/rejected": -0.004158783238381147, "step": 163 }, { "epoch": 0.03, "learning_rate": 4.4324324324324325e-07, "logits/chosen": -0.47376176714897156, "logits/rejected": -0.49974706768989563, "logps/chosen": -187.84408569335938, "logps/rejected": -77.05567932128906, "loss": 0.7152, "rewards/accuracies": 1.0, "rewards/chosen": 0.012347412295639515, "rewards/margins": 0.022954560816287994, "rewards/rejected": -0.010607147589325905, "step": 164 }, { "epoch": 0.03, "learning_rate": 4.459459459459459e-07, "logits/chosen": -0.6174046993255615, "logits/rejected": -0.6241037845611572, "logps/chosen": -143.48190307617188, "logps/rejected": -158.86245727539062, "loss": 0.7112, "rewards/accuracies": 0.0, "rewards/chosen": -0.05041198804974556, "rewards/margins": -0.06338043510913849, "rewards/rejected": 0.01296844519674778, "step": 165 }, { "epoch": 0.03, "learning_rate": 4.486486486486487e-07, "logits/chosen": -0.5158352255821228, "logits/rejected": -0.5158352255821228, "logps/chosen": -55.80262756347656, "logps/rejected": -55.80262756347656, "loss": 0.7238, "rewards/accuracies": 0.0, "rewards/chosen": 0.0074138641357421875, "rewards/margins": 0.0, "rewards/rejected": 0.0074138641357421875, "step": 166 }, { "epoch": 0.03, "learning_rate": 4.5135135135135134e-07, "logits/chosen": -0.5692415237426758, "logits/rejected": -0.5677110552787781, "logps/chosen": -65.58023071289062, "logps/rejected": -92.3155517578125, "loss": 0.7191, "rewards/accuracies": 0.0, "rewards/chosen": -0.01234207209199667, "rewards/margins": -0.010782623663544655, "rewards/rejected": -0.001559448312036693, "step": 167 }, { "epoch": 0.03, "learning_rate": 4.5405405405405405e-07, "logits/chosen": -0.2670995593070984, "logits/rejected": -0.2950684428215027, "logps/chosen": -93.536376953125, "logps/rejected": -101.2466049194336, "loss": 0.7015, "rewards/accuracies": 0.0, "rewards/chosen": -0.0242156982421875, "rewards/margins": -0.022510528564453125, "rewards/rejected": -0.001705169677734375, "step": 168 }, { "epoch": 0.03, "learning_rate": 4.567567567567567e-07, "logits/chosen": -0.8001706004142761, "logits/rejected": -0.8012626767158508, "logps/chosen": -58.32379913330078, "logps/rejected": -114.00485229492188, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.01790466345846653, "rewards/margins": -0.01104431226849556, "rewards/rejected": 0.02894897572696209, "step": 169 }, { "epoch": 0.03, "learning_rate": 4.594594594594595e-07, "logits/chosen": -0.4744794964790344, "logits/rejected": -0.4907238185405731, "logps/chosen": -70.9477767944336, "logps/rejected": -65.30652618408203, "loss": 0.7113, "rewards/accuracies": 0.0, "rewards/chosen": 0.01118392962962389, "rewards/margins": -0.02711792290210724, "rewards/rejected": 0.038301851600408554, "step": 170 }, { "epoch": 0.03, "learning_rate": 4.6216216216216214e-07, "logits/chosen": -0.6769611239433289, "logits/rejected": -1.1167420148849487, "logps/chosen": -104.00798034667969, "logps/rejected": -38.98392868041992, "loss": 0.6843, "rewards/accuracies": 0.0, "rewards/chosen": -0.0016387939685955644, "rewards/margins": -0.019131088629364967, "rewards/rejected": 0.017492294311523438, "step": 171 }, { "epoch": 0.03, "learning_rate": 4.6486486486486485e-07, "logits/chosen": -0.008607422932982445, "logits/rejected": -0.0037072408013045788, "logps/chosen": -86.29592895507812, "logps/rejected": -93.88296508789062, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.015559387393295765, "rewards/margins": 0.00923614576458931, "rewards/rejected": 0.0063232420943677425, "step": 172 }, { "epoch": 0.03, "learning_rate": 4.6756756756756757e-07, "logits/chosen": -0.34862020611763, "logits/rejected": -0.37752896547317505, "logps/chosen": -144.71778869628906, "logps/rejected": -69.67393493652344, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.08448334038257599, "rewards/margins": 0.06905670464038849, "rewards/rejected": 0.0154266357421875, "step": 173 }, { "epoch": 0.03, "learning_rate": 4.702702702702703e-07, "logits/chosen": -0.5185635089874268, "logits/rejected": -0.4410436749458313, "logps/chosen": -240.47708129882812, "logps/rejected": -142.1592254638672, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": -0.03545990213751793, "rewards/margins": -0.06340179592370987, "rewards/rejected": 0.02794189564883709, "step": 174 }, { "epoch": 0.03, "learning_rate": 4.7297297297297294e-07, "logits/chosen": -0.6060545444488525, "logits/rejected": -0.595509946346283, "logps/chosen": -163.13311767578125, "logps/rejected": -75.22720336914062, "loss": 0.7053, "rewards/accuracies": 0.0, "rewards/chosen": -0.00801696814596653, "rewards/margins": -0.006512451451271772, "rewards/rejected": -0.0015045165782794356, "step": 175 }, { "epoch": 0.03, "learning_rate": 4.7567567567567566e-07, "logits/chosen": -0.30027565360069275, "logits/rejected": -0.2900085747241974, "logps/chosen": -72.82412719726562, "logps/rejected": -98.73751831054688, "loss": 0.6635, "rewards/accuracies": 1.0, "rewards/chosen": 0.060999300330877304, "rewards/margins": 0.08173828572034836, "rewards/rejected": -0.020738983526825905, "step": 176 }, { "epoch": 0.03, "learning_rate": 4.783783783783784e-07, "logits/chosen": -0.6171017289161682, "logits/rejected": -0.6090446710586548, "logps/chosen": -133.97557067871094, "logps/rejected": -111.06114959716797, "loss": 0.7116, "rewards/accuracies": 0.0, "rewards/chosen": -0.0011260986793786287, "rewards/margins": -0.017122650519013405, "rewards/rejected": 0.015996551141142845, "step": 177 }, { "epoch": 0.03, "learning_rate": 4.810810810810811e-07, "logits/chosen": 0.0017333823489025235, "logits/rejected": 0.023158086463809013, "logps/chosen": -64.97401428222656, "logps/rejected": -56.513328552246094, "loss": 0.6964, "rewards/accuracies": 1.0, "rewards/chosen": -0.01993866078555584, "rewards/margins": 0.006496809422969818, "rewards/rejected": -0.026435470208525658, "step": 178 }, { "epoch": 0.03, "learning_rate": 4.837837837837838e-07, "logits/chosen": -0.5212526917457581, "logits/rejected": -0.4828587472438812, "logps/chosen": -90.57811737060547, "logps/rejected": -190.023681640625, "loss": 0.7441, "rewards/accuracies": 0.0, "rewards/chosen": -0.045903779566287994, "rewards/margins": -0.1952964961528778, "rewards/rejected": 0.14939270913600922, "step": 179 }, { "epoch": 0.03, "learning_rate": 4.864864864864865e-07, "logits/chosen": -0.26429498195648193, "logits/rejected": -0.19159291684627533, "logps/chosen": -100.38447570800781, "logps/rejected": -62.971412658691406, "loss": 0.7222, "rewards/accuracies": 0.0, "rewards/chosen": -0.056723784655332565, "rewards/margins": -0.03339347988367081, "rewards/rejected": -0.023330306634306908, "step": 180 }, { "epoch": 0.03, "learning_rate": 4.891891891891891e-07, "logits/chosen": -0.4529516398906708, "logits/rejected": -0.3683266341686249, "logps/chosen": -163.21792602539062, "logps/rejected": -102.83082580566406, "loss": 0.7018, "rewards/accuracies": 1.0, "rewards/chosen": 0.03654327616095543, "rewards/margins": 0.02384643629193306, "rewards/rejected": 0.012696838937699795, "step": 181 }, { "epoch": 0.03, "learning_rate": 4.918918918918919e-07, "logits/chosen": -0.4925695061683655, "logits/rejected": -0.5782694816589355, "logps/chosen": -243.38841247558594, "logps/rejected": -65.74244689941406, "loss": 0.6875, "rewards/accuracies": 0.0, "rewards/chosen": -0.03884124755859375, "rewards/margins": -0.012230681255459785, "rewards/rejected": -0.026610566303133965, "step": 182 }, { "epoch": 0.03, "learning_rate": 4.945945945945945e-07, "logits/chosen": -0.2093626856803894, "logits/rejected": -0.21673426032066345, "logps/chosen": -26.15707015991211, "logps/rejected": -7.023551940917969, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.0027301788795739412, "rewards/margins": 0.004504299256950617, "rewards/rejected": -0.0017741203773766756, "step": 183 }, { "epoch": 0.03, "learning_rate": 4.972972972972973e-07, "logits/chosen": -0.49334225058555603, "logits/rejected": -0.941505491733551, "logps/chosen": -86.2278823852539, "logps/rejected": -125.7615966796875, "loss": 0.709, "rewards/accuracies": 1.0, "rewards/chosen": 0.05732269212603569, "rewards/margins": 0.027870940044522285, "rewards/rejected": 0.029451752081513405, "step": 184 }, { "epoch": 0.03, "learning_rate": 5e-07, "logits/chosen": -0.3643590807914734, "logits/rejected": -0.37441256642341614, "logps/chosen": -63.79286193847656, "logps/rejected": -52.649837493896484, "loss": 0.7339, "rewards/accuracies": 0.0, "rewards/chosen": -0.019195938482880592, "rewards/margins": -0.05263214558362961, "rewards/rejected": 0.03343620523810387, "step": 185 }, { "epoch": 0.03, "learning_rate": 5.027027027027027e-07, "logits/chosen": -0.19701798260211945, "logits/rejected": -0.18522240221500397, "logps/chosen": -101.10002136230469, "logps/rejected": -65.15164184570312, "loss": 0.7197, "rewards/accuracies": 0.0, "rewards/chosen": -0.008668518625199795, "rewards/margins": -0.0417633093893528, "rewards/rejected": 0.03309478983283043, "step": 186 }, { "epoch": 0.03, "learning_rate": 5.054054054054053e-07, "logits/chosen": -0.36221781373023987, "logits/rejected": -0.4045085906982422, "logps/chosen": -155.25347900390625, "logps/rejected": -24.371259689331055, "loss": 0.6662, "rewards/accuracies": 0.0, "rewards/chosen": -0.05643463134765625, "rewards/margins": -0.060204505920410156, "rewards/rejected": 0.0037698745727539062, "step": 187 }, { "epoch": 0.03, "learning_rate": 5.081081081081081e-07, "logits/chosen": -0.48246055841445923, "logits/rejected": -0.46139100193977356, "logps/chosen": -58.489967346191406, "logps/rejected": -64.35302734375, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": -0.05034599453210831, "rewards/margins": -0.0819576233625412, "rewards/rejected": 0.03161163255572319, "step": 188 }, { "epoch": 0.03, "learning_rate": 5.108108108108108e-07, "logits/chosen": -0.3401764929294586, "logits/rejected": -0.944966733455658, "logps/chosen": -95.12159729003906, "logps/rejected": -39.26844024658203, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": 0.0076660155318677425, "rewards/margins": 0.02399444580078125, "rewards/rejected": -0.01632842980325222, "step": 189 }, { "epoch": 0.03, "learning_rate": 5.135135135135134e-07, "logits/chosen": -0.44840818643569946, "logits/rejected": -0.46560385823249817, "logps/chosen": -72.79779052734375, "logps/rejected": -74.87864685058594, "loss": 0.6559, "rewards/accuracies": 1.0, "rewards/chosen": 0.02132721059024334, "rewards/margins": 0.043123628944158554, "rewards/rejected": -0.021796418353915215, "step": 190 }, { "epoch": 0.03, "learning_rate": 5.162162162162162e-07, "logits/chosen": -0.5364332795143127, "logits/rejected": -0.550709068775177, "logps/chosen": -164.20361328125, "logps/rejected": -113.58912658691406, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.08963318169116974, "rewards/margins": 0.06670303642749786, "rewards/rejected": 0.022930145263671875, "step": 191 }, { "epoch": 0.03, "learning_rate": 5.18918918918919e-07, "logits/chosen": -0.4915311932563782, "logits/rejected": -0.4844262897968292, "logps/chosen": -255.0994873046875, "logps/rejected": -244.48243713378906, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.014880371280014515, "rewards/margins": 0.06144256517291069, "rewards/rejected": -0.04656219482421875, "step": 192 }, { "epoch": 0.03, "learning_rate": 5.216216216216216e-07, "logits/chosen": -0.20907878875732422, "logits/rejected": -0.2128347009420395, "logps/chosen": -93.14086151123047, "logps/rejected": -63.231170654296875, "loss": 0.6919, "rewards/accuracies": 0.0, "rewards/chosen": -0.037171173840761185, "rewards/margins": -0.020476913079619408, "rewards/rejected": -0.016694260761141777, "step": 193 }, { "epoch": 0.03, "learning_rate": 5.243243243243243e-07, "logits/chosen": -0.39043909311294556, "logits/rejected": -0.39223259687423706, "logps/chosen": -73.60832977294922, "logps/rejected": -66.17975616455078, "loss": 0.709, "rewards/accuracies": 0.0, "rewards/chosen": -0.032946016639471054, "rewards/margins": -0.07857971638441086, "rewards/rejected": 0.045633699744939804, "step": 194 }, { "epoch": 0.03, "learning_rate": 5.270270270270269e-07, "logits/chosen": -0.6086284518241882, "logits/rejected": -0.569827139377594, "logps/chosen": -99.35194396972656, "logps/rejected": -45.306640625, "loss": 0.7026, "rewards/accuracies": 1.0, "rewards/chosen": 0.038299560546875, "rewards/margins": 0.03615608066320419, "rewards/rejected": 0.002143478486686945, "step": 195 }, { "epoch": 0.03, "learning_rate": 5.297297297297297e-07, "logits/chosen": -0.23012498021125793, "logits/rejected": -0.23361486196517944, "logps/chosen": -192.9204559326172, "logps/rejected": -103.91221618652344, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.04486541822552681, "rewards/margins": 0.033893585205078125, "rewards/rejected": 0.01097183208912611, "step": 196 }, { "epoch": 0.03, "learning_rate": 5.324324324324324e-07, "logits/chosen": -0.08081789314746857, "logits/rejected": -0.054920926690101624, "logps/chosen": -93.31140899658203, "logps/rejected": -83.15040588378906, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.00936813373118639, "rewards/margins": -0.030182648450136185, "rewards/rejected": 0.03955078125, "step": 197 }, { "epoch": 0.03, "learning_rate": 5.35135135135135e-07, "logits/chosen": -0.785611629486084, "logits/rejected": -0.7614572048187256, "logps/chosen": -211.4677734375, "logps/rejected": -226.98402404785156, "loss": 0.728, "rewards/accuracies": 1.0, "rewards/chosen": -0.0022781372535973787, "rewards/margins": 0.03945770114660263, "rewards/rejected": -0.04173583909869194, "step": 198 }, { "epoch": 0.03, "learning_rate": 5.378378378378378e-07, "logits/chosen": -0.0013047961983829737, "logits/rejected": -0.0013047961983829737, "logps/chosen": -108.76164245605469, "logps/rejected": -108.76164245605469, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 0.007015991490334272, "rewards/margins": 0.0, "rewards/rejected": 0.007015991490334272, "step": 199 }, { "epoch": 0.03, "learning_rate": 5.405405405405406e-07, "logits/chosen": -0.129562109708786, "logits/rejected": -0.11524704843759537, "logps/chosen": -13.36000919342041, "logps/rejected": -68.17887878417969, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": -0.010632038116455078, "rewards/margins": -0.05441408231854439, "rewards/rejected": 0.04378204420208931, "step": 200 }, { "epoch": 0.03, "learning_rate": 5.432432432432432e-07, "logits/chosen": -0.6021972894668579, "logits/rejected": -0.5256741046905518, "logps/chosen": -121.74339294433594, "logps/rejected": -59.956565856933594, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.012027740478515625, "rewards/margins": 0.01632842980325222, "rewards/rejected": -0.0043006897903978825, "step": 201 }, { "epoch": 0.03, "learning_rate": 5.459459459459459e-07, "logits/chosen": -0.47110119462013245, "logits/rejected": -0.5272548794746399, "logps/chosen": -206.34317016601562, "logps/rejected": -139.122802734375, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": -0.05689697340130806, "rewards/margins": -0.03887176513671875, "rewards/rejected": -0.01802520826458931, "step": 202 }, { "epoch": 0.03, "learning_rate": 5.486486486486486e-07, "logits/chosen": -0.7173296213150024, "logits/rejected": -0.695926308631897, "logps/chosen": -163.9253387451172, "logps/rejected": -229.80783081054688, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.02720336988568306, "rewards/margins": 0.0529022216796875, "rewards/rejected": -0.02569885365664959, "step": 203 }, { "epoch": 0.03, "learning_rate": 5.513513513513513e-07, "logits/chosen": -0.3340732157230377, "logits/rejected": -0.3314175605773926, "logps/chosen": -90.16183471679688, "logps/rejected": -125.69871520996094, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.038806915283203125, "rewards/margins": 0.06759719550609589, "rewards/rejected": -0.02879028394818306, "step": 204 }, { "epoch": 0.03, "learning_rate": 5.54054054054054e-07, "logits/chosen": -0.241455078125, "logits/rejected": -0.21831464767456055, "logps/chosen": -90.89179992675781, "logps/rejected": -83.26554870605469, "loss": 0.6975, "rewards/accuracies": 1.0, "rewards/chosen": 0.05341186746954918, "rewards/margins": 0.005883790552616119, "rewards/rejected": 0.04752807691693306, "step": 205 }, { "epoch": 0.03, "learning_rate": 5.567567567567567e-07, "logits/chosen": -0.3660608232021332, "logits/rejected": -0.37601572275161743, "logps/chosen": -29.07537078857422, "logps/rejected": -39.855770111083984, "loss": 0.6974, "rewards/accuracies": 1.0, "rewards/chosen": 0.05385017395019531, "rewards/margins": 0.09870147705078125, "rewards/rejected": -0.04485130310058594, "step": 206 }, { "epoch": 0.03, "learning_rate": 5.594594594594594e-07, "logits/chosen": -0.3279568552970886, "logits/rejected": -0.27947574853897095, "logps/chosen": -53.15504455566406, "logps/rejected": -195.9281463623047, "loss": 0.6746, "rewards/accuracies": 0.0, "rewards/chosen": 0.017242049798369408, "rewards/margins": -0.023984147235751152, "rewards/rejected": 0.04122619703412056, "step": 207 }, { "epoch": 0.03, "learning_rate": 5.621621621621622e-07, "logits/chosen": -0.2202240377664566, "logits/rejected": -0.19246090948581696, "logps/chosen": -177.26473999023438, "logps/rejected": -161.04989624023438, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.01788940466940403, "rewards/margins": 0.007138061337172985, "rewards/rejected": 0.010751343332231045, "step": 208 }, { "epoch": 0.03, "learning_rate": 5.648648648648648e-07, "logits/chosen": -0.44128191471099854, "logits/rejected": -0.4193665087223053, "logps/chosen": -63.25440216064453, "logps/rejected": -97.239501953125, "loss": 0.7073, "rewards/accuracies": 1.0, "rewards/chosen": 0.0035408020485192537, "rewards/margins": 0.027480317279696465, "rewards/rejected": -0.02393951453268528, "step": 209 }, { "epoch": 0.03, "learning_rate": 5.675675675675675e-07, "logits/chosen": -0.6388649344444275, "logits/rejected": -0.5760959386825562, "logps/chosen": -99.96739196777344, "logps/rejected": -237.70953369140625, "loss": 0.7052, "rewards/accuracies": 1.0, "rewards/chosen": -0.02067718468606472, "rewards/margins": 0.0298538226634264, "rewards/rejected": -0.05053100734949112, "step": 210 }, { "epoch": 0.03, "learning_rate": 5.702702702702702e-07, "logits/chosen": -0.17148266732692719, "logits/rejected": -0.16975213587284088, "logps/chosen": -52.781028747558594, "logps/rejected": -81.3660888671875, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.032416533678770065, "rewards/margins": 0.07612381130456924, "rewards/rejected": -0.04370727762579918, "step": 211 }, { "epoch": 0.03, "learning_rate": 5.729729729729729e-07, "logits/chosen": -0.2822418808937073, "logits/rejected": -0.2960236072540283, "logps/chosen": -74.47755432128906, "logps/rejected": -87.04132843017578, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.011308289133012295, "rewards/margins": 0.033356476575136185, "rewards/rejected": -0.022048188373446465, "step": 212 }, { "epoch": 0.03, "learning_rate": 5.756756756756757e-07, "logits/chosen": -0.425685316324234, "logits/rejected": -0.4012870192527771, "logps/chosen": -127.59596252441406, "logps/rejected": -142.23548889160156, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.015001678839325905, "rewards/margins": 0.036016084253787994, "rewards/rejected": -0.02101440541446209, "step": 213 }, { "epoch": 0.03, "learning_rate": 5.783783783783784e-07, "logits/chosen": -0.26042765378952026, "logits/rejected": -0.284633994102478, "logps/chosen": -213.6543426513672, "logps/rejected": -184.49127197265625, "loss": 0.7183, "rewards/accuracies": 0.0, "rewards/chosen": -0.014674377627670765, "rewards/margins": -0.08786468952894211, "rewards/rejected": 0.07319030910730362, "step": 214 }, { "epoch": 0.03, "learning_rate": 5.81081081081081e-07, "logits/chosen": -0.3571830987930298, "logits/rejected": -0.3556253910064697, "logps/chosen": -89.38218688964844, "logps/rejected": -86.68556213378906, "loss": 0.6801, "rewards/accuracies": 0.0, "rewards/chosen": 0.01679840125143528, "rewards/margins": -0.00567932054400444, "rewards/rejected": 0.02247772179543972, "step": 215 }, { "epoch": 0.04, "learning_rate": 5.837837837837838e-07, "logits/chosen": -0.33598601818084717, "logits/rejected": -0.35534486174583435, "logps/chosen": -18.52389907836914, "logps/rejected": -22.616931915283203, "loss": 0.6988, "rewards/accuracies": 1.0, "rewards/chosen": 0.0029485702980309725, "rewards/margins": 0.008405494503676891, "rewards/rejected": -0.0054569244384765625, "step": 216 }, { "epoch": 0.04, "learning_rate": 5.864864864864865e-07, "logits/chosen": -0.6587826609611511, "logits/rejected": -0.6438984870910645, "logps/chosen": -99.18016052246094, "logps/rejected": -54.39301300048828, "loss": 0.6818, "rewards/accuracies": 0.0, "rewards/chosen": -0.01617584191262722, "rewards/margins": -0.03876190260052681, "rewards/rejected": 0.02258606068789959, "step": 217 }, { "epoch": 0.04, "learning_rate": 5.891891891891891e-07, "logits/chosen": -0.4065646827220917, "logits/rejected": -0.35917365550994873, "logps/chosen": -125.89253234863281, "logps/rejected": -135.01055908203125, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.03346100077033043, "rewards/margins": 0.04822540655732155, "rewards/rejected": -0.014764404855668545, "step": 218 }, { "epoch": 0.04, "learning_rate": 5.918918918918918e-07, "logits/chosen": -0.445956826210022, "logits/rejected": -0.42787981033325195, "logps/chosen": -80.64057159423828, "logps/rejected": -80.96443939208984, "loss": 0.6788, "rewards/accuracies": 0.0, "rewards/chosen": 0.01975097693502903, "rewards/margins": -0.009899139404296875, "rewards/rejected": 0.029650116339325905, "step": 219 }, { "epoch": 0.04, "learning_rate": 5.945945945945947e-07, "logits/chosen": -0.4720660150051117, "logits/rejected": -0.46073609590530396, "logps/chosen": -56.11831283569336, "logps/rejected": -15.630918502807617, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": -0.03264045715332031, "rewards/margins": -0.03416604921221733, "rewards/rejected": 0.0015255928738042712, "step": 220 }, { "epoch": 0.04, "learning_rate": 5.972972972972973e-07, "logits/chosen": -0.5394752025604248, "logits/rejected": -0.5034905672073364, "logps/chosen": -85.73247528076172, "logps/rejected": -130.95449829101562, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.02739563025534153, "rewards/margins": 0.05995483696460724, "rewards/rejected": -0.03255920484662056, "step": 221 }, { "epoch": 0.04, "learning_rate": 6e-07, "logits/chosen": -0.2833939492702484, "logits/rejected": -0.2934708893299103, "logps/chosen": -29.84561538696289, "logps/rejected": -6.565009117126465, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": -0.004375457763671875, "rewards/margins": -0.006939745042473078, "rewards/rejected": 0.0025642872788012028, "step": 222 }, { "epoch": 0.04, "learning_rate": 6.027027027027026e-07, "logits/chosen": -0.3417595624923706, "logits/rejected": -0.31400012969970703, "logps/chosen": -78.68473052978516, "logps/rejected": -112.43597412109375, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.003679657122120261, "rewards/margins": -0.024346161633729935, "rewards/rejected": 0.02802581898868084, "step": 223 }, { "epoch": 0.04, "learning_rate": 6.054054054054054e-07, "logits/chosen": -0.4203702211380005, "logits/rejected": -0.4203702211380005, "logps/chosen": -89.51409912109375, "logps/rejected": -89.51409912109375, "loss": 0.7174, "rewards/accuracies": 0.0, "rewards/chosen": 0.02226562611758709, "rewards/margins": 0.0, "rewards/rejected": 0.02226562611758709, "step": 224 }, { "epoch": 0.04, "learning_rate": 6.081081081081081e-07, "logits/chosen": -0.13014069199562073, "logits/rejected": -0.13014069199562073, "logps/chosen": -90.28758239746094, "logps/rejected": -90.28758239746094, "loss": 0.7234, "rewards/accuracies": 0.0, "rewards/chosen": -0.0005935669178143144, "rewards/margins": 0.0, "rewards/rejected": -0.0005935669178143144, "step": 225 }, { "epoch": 0.04, "learning_rate": 6.108108108108107e-07, "logits/chosen": -0.37279728055000305, "logits/rejected": -0.34254857897758484, "logps/chosen": -81.11062622070312, "logps/rejected": -148.68687438964844, "loss": 0.6797, "rewards/accuracies": 0.0, "rewards/chosen": 0.01329803466796875, "rewards/margins": -0.06174468994140625, "rewards/rejected": 0.075042724609375, "step": 226 }, { "epoch": 0.04, "learning_rate": 6.135135135135134e-07, "logits/chosen": -0.25029119849205017, "logits/rejected": -0.2613299489021301, "logps/chosen": -9.812155723571777, "logps/rejected": -8.345915794372559, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": -0.0015774726634845138, "rewards/margins": -0.011697006411850452, "rewards/rejected": 0.010119534097611904, "step": 227 }, { "epoch": 0.04, "learning_rate": 6.162162162162163e-07, "logits/chosen": -0.4305778741836548, "logits/rejected": -0.4658035337924957, "logps/chosen": -194.57115173339844, "logps/rejected": -152.090576171875, "loss": 0.7306, "rewards/accuracies": 0.0, "rewards/chosen": -0.01772003248333931, "rewards/margins": -0.07214812934398651, "rewards/rejected": 0.0544281005859375, "step": 228 }, { "epoch": 0.04, "learning_rate": 6.189189189189189e-07, "logits/chosen": -0.21824759244918823, "logits/rejected": -0.21274280548095703, "logps/chosen": -77.19744110107422, "logps/rejected": -88.6993408203125, "loss": 0.6822, "rewards/accuracies": 0.0, "rewards/chosen": -0.048116303980350494, "rewards/margins": -0.04875946044921875, "rewards/rejected": 0.0006431579822674394, "step": 229 }, { "epoch": 0.04, "learning_rate": 6.216216216216216e-07, "logits/chosen": -0.43516477942466736, "logits/rejected": -0.4510044455528259, "logps/chosen": -11.049665451049805, "logps/rejected": -23.373416900634766, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": -0.006404304411262274, "rewards/margins": 0.007246589753776789, "rewards/rejected": -0.013650894165039062, "step": 230 }, { "epoch": 0.04, "learning_rate": 6.243243243243243e-07, "logits/chosen": -0.45157939195632935, "logits/rejected": -0.34638121724128723, "logps/chosen": -175.5052032470703, "logps/rejected": -36.18605422973633, "loss": 0.7008, "rewards/accuracies": 0.0, "rewards/chosen": -0.03753509745001793, "rewards/margins": -0.03737793117761612, "rewards/rejected": -0.00015716553025413305, "step": 231 }, { "epoch": 0.04, "learning_rate": 6.27027027027027e-07, "logits/chosen": -0.3939274847507477, "logits/rejected": -0.3939274847507477, "logps/chosen": -106.82140350341797, "logps/rejected": -106.82140350341797, "loss": 0.7091, "rewards/accuracies": 0.0, "rewards/chosen": -0.032199859619140625, "rewards/margins": 0.0, "rewards/rejected": -0.032199859619140625, "step": 232 }, { "epoch": 0.04, "learning_rate": 6.297297297297297e-07, "logits/chosen": -1.0073930025100708, "logits/rejected": -1.074399471282959, "logps/chosen": -86.71502685546875, "logps/rejected": -49.65925598144531, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": -0.004341888707131147, "rewards/margins": -0.0010719301644712687, "rewards/rejected": -0.0032699585426598787, "step": 233 }, { "epoch": 0.04, "learning_rate": 6.324324324324324e-07, "logits/chosen": -0.561054527759552, "logits/rejected": -0.46231305599212646, "logps/chosen": -158.2218017578125, "logps/rejected": -221.37579345703125, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.02565307728946209, "rewards/margins": 0.06707610934972763, "rewards/rejected": -0.04142303392291069, "step": 234 }, { "epoch": 0.04, "learning_rate": 6.35135135135135e-07, "logits/chosen": -0.44775187969207764, "logits/rejected": -0.4137595295906067, "logps/chosen": -93.68125915527344, "logps/rejected": -112.10260009765625, "loss": 0.6969, "rewards/accuracies": 1.0, "rewards/chosen": -0.025852203369140625, "rewards/margins": 0.009162139147520065, "rewards/rejected": -0.03501434251666069, "step": 235 }, { "epoch": 0.04, "learning_rate": 6.378378378378379e-07, "logits/chosen": -0.4385606348514557, "logits/rejected": -0.4535049796104431, "logps/chosen": -166.29481506347656, "logps/rejected": -133.12106323242188, "loss": 0.6745, "rewards/accuracies": 0.0, "rewards/chosen": -0.02919769287109375, "rewards/margins": -0.03551178053021431, "rewards/rejected": 0.006314087193459272, "step": 236 }, { "epoch": 0.04, "learning_rate": 6.405405405405405e-07, "logits/chosen": -0.1994488686323166, "logits/rejected": -0.026805859059095383, "logps/chosen": -261.6794128417969, "logps/rejected": -34.97176742553711, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": -0.03641662746667862, "rewards/margins": -0.05025634914636612, "rewards/rejected": 0.0138397216796875, "step": 237 }, { "epoch": 0.04, "learning_rate": 6.432432432432432e-07, "logits/chosen": -0.49614644050598145, "logits/rejected": -0.5233829617500305, "logps/chosen": -30.21469497680664, "logps/rejected": -9.447880744934082, "loss": 0.6722, "rewards/accuracies": 1.0, "rewards/chosen": 0.03280944749712944, "rewards/margins": 0.04362764209508896, "rewards/rejected": -0.010818195529282093, "step": 238 }, { "epoch": 0.04, "learning_rate": 6.459459459459459e-07, "logits/chosen": -0.5037032961845398, "logits/rejected": -0.4302486181259155, "logps/chosen": -94.60592651367188, "logps/rejected": -94.9873275756836, "loss": 0.664, "rewards/accuracies": 1.0, "rewards/chosen": 0.02930145338177681, "rewards/margins": 0.06311646103858948, "rewards/rejected": -0.03381500393152237, "step": 239 }, { "epoch": 0.04, "learning_rate": 6.486486486486486e-07, "logits/chosen": -0.22167417407035828, "logits/rejected": -0.1884053498506546, "logps/chosen": -66.865234375, "logps/rejected": -85.220703125, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": -0.00293731689453125, "rewards/margins": 0.018036652356386185, "rewards/rejected": -0.020973969250917435, "step": 240 }, { "epoch": 0.04, "learning_rate": 6.513513513513513e-07, "logits/chosen": -0.4948139488697052, "logits/rejected": -0.44142383337020874, "logps/chosen": -207.1372833251953, "logps/rejected": -214.3231964111328, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": -0.02859039418399334, "rewards/margins": -0.09594116359949112, "rewards/rejected": 0.06735076755285263, "step": 241 }, { "epoch": 0.04, "learning_rate": 6.54054054054054e-07, "logits/chosen": -0.16547484695911407, "logits/rejected": -0.16547484695911407, "logps/chosen": -3.7300684452056885, "logps/rejected": -3.7300684452056885, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": -0.001807570457458496, "rewards/margins": 0.0, "rewards/rejected": -0.001807570457458496, "step": 242 }, { "epoch": 0.04, "learning_rate": 6.567567567567566e-07, "logits/chosen": -0.77171790599823, "logits/rejected": -0.7803393602371216, "logps/chosen": -110.85273742675781, "logps/rejected": -79.45901489257812, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.05333862453699112, "rewards/margins": 0.01600799709558487, "rewards/rejected": 0.03733062744140625, "step": 243 }, { "epoch": 0.04, "learning_rate": 6.594594594594595e-07, "logits/chosen": -0.42948538064956665, "logits/rejected": -0.4139667749404907, "logps/chosen": -101.26619720458984, "logps/rejected": -103.11192321777344, "loss": 0.7207, "rewards/accuracies": 0.0, "rewards/chosen": -0.026189422234892845, "rewards/margins": -0.06819228827953339, "rewards/rejected": 0.04200286790728569, "step": 244 }, { "epoch": 0.04, "learning_rate": 6.621621621621622e-07, "logits/chosen": -0.6693140268325806, "logits/rejected": -0.9721900224685669, "logps/chosen": -122.94084167480469, "logps/rejected": -39.591102600097656, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": -0.0060791014693677425, "rewards/margins": 0.02226715162396431, "rewards/rejected": -0.02834625355899334, "step": 245 }, { "epoch": 0.04, "learning_rate": 6.648648648648648e-07, "logits/chosen": -0.33384084701538086, "logits/rejected": -0.240623340010643, "logps/chosen": -149.04698181152344, "logps/rejected": -139.54238891601562, "loss": 0.6834, "rewards/accuracies": 0.0, "rewards/chosen": 0.02001953125, "rewards/margins": -0.0071594249457120895, "rewards/rejected": 0.02717895619571209, "step": 246 }, { "epoch": 0.04, "learning_rate": 6.675675675675675e-07, "logits/chosen": -0.27768197655677795, "logits/rejected": -0.27768197655677795, "logps/chosen": -51.7473030090332, "logps/rejected": -51.7473030090332, "loss": 0.6725, "rewards/accuracies": 0.0, "rewards/chosen": -0.020504379644989967, "rewards/margins": 0.0, "rewards/rejected": -0.020504379644989967, "step": 247 }, { "epoch": 0.04, "learning_rate": 6.702702702702703e-07, "logits/chosen": -0.6999572515487671, "logits/rejected": -0.6420531272888184, "logps/chosen": -140.1139373779297, "logps/rejected": -97.4136962890625, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.006520080845803022, "rewards/margins": -0.057541657239198685, "rewards/rejected": 0.064061738550663, "step": 248 }, { "epoch": 0.04, "learning_rate": 6.729729729729729e-07, "logits/chosen": -0.5038520097732544, "logits/rejected": -0.48174554109573364, "logps/chosen": -45.04695510864258, "logps/rejected": -63.89573287963867, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": -0.014804840087890625, "rewards/margins": 0.0017570499330759048, "rewards/rejected": -0.01656189002096653, "step": 249 }, { "epoch": 0.04, "learning_rate": 6.756756756756756e-07, "logits/chosen": -0.6232605576515198, "logits/rejected": -0.6070888042449951, "logps/chosen": -94.93798828125, "logps/rejected": -90.6438217163086, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.02532196044921875, "rewards/margins": 0.056858062744140625, "rewards/rejected": -0.031536102294921875, "step": 250 }, { "epoch": 0.04, "learning_rate": 6.783783783783783e-07, "logits/chosen": -0.45837539434432983, "logits/rejected": -0.42151355743408203, "logps/chosen": -82.55921173095703, "logps/rejected": -29.46199607849121, "loss": 0.6674, "rewards/accuracies": 1.0, "rewards/chosen": 0.016350556164979935, "rewards/margins": 0.02233715169131756, "rewards/rejected": -0.005986595060676336, "step": 251 }, { "epoch": 0.04, "learning_rate": 6.810810810810811e-07, "logits/chosen": -0.5073668956756592, "logits/rejected": -0.5199209451675415, "logps/chosen": -104.58456420898438, "logps/rejected": -194.01553344726562, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": -0.01952514611184597, "rewards/margins": 0.03218688815832138, "rewards/rejected": -0.0517120361328125, "step": 252 }, { "epoch": 0.04, "learning_rate": 6.837837837837838e-07, "logits/chosen": -0.2649472951889038, "logits/rejected": -0.26771417260169983, "logps/chosen": -63.7597770690918, "logps/rejected": -130.7951202392578, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.06033134460449219, "rewards/margins": 0.0897396057844162, "rewards/rejected": -0.02940826490521431, "step": 253 }, { "epoch": 0.04, "learning_rate": 6.864864864864864e-07, "logits/chosen": -0.6303379535675049, "logits/rejected": -0.6148648262023926, "logps/chosen": -130.77667236328125, "logps/rejected": -90.79554748535156, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.01809387281537056, "rewards/margins": 0.009239197708666325, "rewards/rejected": 0.008854675106704235, "step": 254 }, { "epoch": 0.04, "learning_rate": 6.891891891891891e-07, "logits/chosen": -0.4541706442832947, "logits/rejected": -0.48525547981262207, "logps/chosen": -84.65930938720703, "logps/rejected": -102.06793212890625, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": -0.006966400425881147, "rewards/margins": -0.006510925479233265, "rewards/rejected": -0.0004554748593363911, "step": 255 }, { "epoch": 0.04, "learning_rate": 6.918918918918919e-07, "logits/chosen": -0.2679090201854706, "logits/rejected": -0.27536800503730774, "logps/chosen": -77.8499755859375, "logps/rejected": -57.29789733886719, "loss": 0.7173, "rewards/accuracies": 0.0, "rewards/chosen": -0.01720886304974556, "rewards/margins": -0.05727806314826012, "rewards/rejected": 0.04006920009851456, "step": 256 }, { "epoch": 0.04, "learning_rate": 6.945945945945945e-07, "logits/chosen": -0.16961568593978882, "logits/rejected": -0.18173067271709442, "logps/chosen": -94.90941619873047, "logps/rejected": -64.95528411865234, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 0.000983429024927318, "rewards/margins": -0.002849578857421875, "rewards/rejected": 0.0038330077659338713, "step": 257 }, { "epoch": 0.04, "learning_rate": 6.972972972972973e-07, "logits/chosen": -0.17157819867134094, "logits/rejected": -0.16041086614131927, "logps/chosen": -45.46064758300781, "logps/rejected": -7.8642988204956055, "loss": 0.684, "rewards/accuracies": 0.0, "rewards/chosen": -0.0025180818047374487, "rewards/margins": -0.002364683197811246, "rewards/rejected": -0.0001533985196147114, "step": 258 }, { "epoch": 0.04, "learning_rate": 7e-07, "logits/chosen": -0.48598793148994446, "logits/rejected": -0.42568495869636536, "logps/chosen": -102.76895904541016, "logps/rejected": -177.5306396484375, "loss": 0.6833, "rewards/accuracies": 0.0, "rewards/chosen": -0.024352265521883965, "rewards/margins": -0.041243746876716614, "rewards/rejected": 0.0168914794921875, "step": 259 }, { "epoch": 0.04, "learning_rate": 7.027027027027027e-07, "logits/chosen": -0.33346906304359436, "logits/rejected": -0.3542168438434601, "logps/chosen": -145.94412231445312, "logps/rejected": -104.02857208251953, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.03770294412970543, "rewards/margins": -0.036852266639471054, "rewards/rejected": 0.07455521076917648, "step": 260 }, { "epoch": 0.04, "learning_rate": 7.054054054054054e-07, "logits/chosen": -0.1139557734131813, "logits/rejected": -0.1139557734131813, "logps/chosen": -30.508420944213867, "logps/rejected": -30.508420944213867, "loss": 0.6813, "rewards/accuracies": 0.0, "rewards/chosen": 0.018706321716308594, "rewards/margins": 0.0, "rewards/rejected": 0.018706321716308594, "step": 261 }, { "epoch": 0.04, "learning_rate": 7.081081081081081e-07, "logits/chosen": -0.4168432354927063, "logits/rejected": -0.4168432354927063, "logps/chosen": -3.4278104305267334, "logps/rejected": -3.4278104305267334, "loss": 0.7149, "rewards/accuracies": 0.0, "rewards/chosen": 0.0036205053329467773, "rewards/margins": 0.0, "rewards/rejected": 0.0036205053329467773, "step": 262 }, { "epoch": 0.04, "learning_rate": 7.108108108108107e-07, "logits/chosen": -0.34351110458374023, "logits/rejected": -0.2881513237953186, "logps/chosen": -130.06344604492188, "logps/rejected": -83.08836364746094, "loss": 0.6996, "rewards/accuracies": 1.0, "rewards/chosen": 0.011578368954360485, "rewards/margins": 0.00668411236256361, "rewards/rejected": 0.004894256591796875, "step": 263 }, { "epoch": 0.04, "learning_rate": 7.135135135135135e-07, "logits/chosen": -0.16990581154823303, "logits/rejected": -0.17038729786872864, "logps/chosen": -68.46524810791016, "logps/rejected": -90.34840393066406, "loss": 0.71, "rewards/accuracies": 0.0, "rewards/chosen": -0.023273468017578125, "rewards/margins": -0.027779389172792435, "rewards/rejected": 0.004505920689553022, "step": 264 }, { "epoch": 0.04, "learning_rate": 7.162162162162161e-07, "logits/chosen": -0.6283978819847107, "logits/rejected": -0.6398565769195557, "logps/chosen": -93.19718933105469, "logps/rejected": -124.4268798828125, "loss": 0.712, "rewards/accuracies": 0.0, "rewards/chosen": -0.01484527625143528, "rewards/margins": -0.034590914845466614, "rewards/rejected": 0.019745636731386185, "step": 265 }, { "epoch": 0.04, "learning_rate": 7.189189189189189e-07, "logits/chosen": -0.21319420635700226, "logits/rejected": -0.1945137232542038, "logps/chosen": -48.31044006347656, "logps/rejected": -51.869384765625, "loss": 0.6649, "rewards/accuracies": 0.0, "rewards/chosen": -0.01446456927806139, "rewards/margins": -0.030923079699277878, "rewards/rejected": 0.016458511352539062, "step": 266 }, { "epoch": 0.04, "learning_rate": 7.216216216216216e-07, "logits/chosen": -0.34576892852783203, "logits/rejected": -0.316360741853714, "logps/chosen": -91.73020935058594, "logps/rejected": -76.59224700927734, "loss": 0.7036, "rewards/accuracies": 1.0, "rewards/chosen": 0.038782503455877304, "rewards/margins": 0.02391662821173668, "rewards/rejected": 0.014865875244140625, "step": 267 }, { "epoch": 0.04, "learning_rate": 7.243243243243243e-07, "logits/chosen": -0.9471164345741272, "logits/rejected": -0.9449308514595032, "logps/chosen": -126.88983917236328, "logps/rejected": -107.86639404296875, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.03363800048828125, "rewards/margins": 0.0066612232476472855, "rewards/rejected": 0.026976777240633965, "step": 268 }, { "epoch": 0.04, "learning_rate": 7.27027027027027e-07, "logits/chosen": -0.29122644662857056, "logits/rejected": -0.2580624520778656, "logps/chosen": -85.7589111328125, "logps/rejected": -23.444839477539062, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.052091218531131744, "rewards/margins": 0.04447002708911896, "rewards/rejected": 0.007621192838996649, "step": 269 }, { "epoch": 0.04, "learning_rate": 7.297297297297297e-07, "logits/chosen": -0.8326250910758972, "logits/rejected": -0.9100193977355957, "logps/chosen": -286.0299072265625, "logps/rejected": -32.567962646484375, "loss": 0.7077, "rewards/accuracies": 1.0, "rewards/chosen": 0.07532043755054474, "rewards/margins": 0.07178612053394318, "rewards/rejected": 0.0035343170166015625, "step": 270 }, { "epoch": 0.04, "learning_rate": 7.324324324324323e-07, "logits/chosen": -0.581637978553772, "logits/rejected": -0.581637978553772, "logps/chosen": -8.195045471191406, "logps/rejected": -8.195045471191406, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": 0.03478879854083061, "rewards/margins": 0.0, "rewards/rejected": 0.03478879854083061, "step": 271 }, { "epoch": 0.04, "learning_rate": 7.351351351351351e-07, "logits/chosen": 0.053717803210020065, "logits/rejected": 0.053717803210020065, "logps/chosen": -78.0581283569336, "logps/rejected": -78.0581283569336, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.03225402906537056, "rewards/margins": 0.0, "rewards/rejected": 0.03225402906537056, "step": 272 }, { "epoch": 0.04, "learning_rate": 7.378378378378379e-07, "logits/chosen": -0.5462653040885925, "logits/rejected": -0.501912534236908, "logps/chosen": -108.71451568603516, "logps/rejected": -169.38490295410156, "loss": 0.7389, "rewards/accuracies": 0.0, "rewards/chosen": 0.007755279541015625, "rewards/margins": -0.0731300339102745, "rewards/rejected": 0.08088531345129013, "step": 273 }, { "epoch": 0.04, "learning_rate": 7.405405405405405e-07, "logits/chosen": -0.5175575017929077, "logits/rejected": -0.5107718706130981, "logps/chosen": -228.3626251220703, "logps/rejected": -111.40919494628906, "loss": 0.686, "rewards/accuracies": 0.0, "rewards/chosen": -0.012368774972856045, "rewards/margins": -0.01322631910443306, "rewards/rejected": 0.0008575439569540322, "step": 274 }, { "epoch": 0.04, "learning_rate": 7.432432432432432e-07, "logits/chosen": -0.4925699532032013, "logits/rejected": -0.5021453499794006, "logps/chosen": -76.84034729003906, "logps/rejected": -65.8656997680664, "loss": 0.6903, "rewards/accuracies": 0.0, "rewards/chosen": -0.011295318603515625, "rewards/margins": -0.05182342603802681, "rewards/rejected": 0.040528107434511185, "step": 275 }, { "epoch": 0.04, "learning_rate": 7.45945945945946e-07, "logits/chosen": -0.6591641902923584, "logits/rejected": -0.6676951050758362, "logps/chosen": -249.1314239501953, "logps/rejected": -24.305997848510742, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.006871032994240522, "rewards/margins": 0.005375099368393421, "rewards/rejected": 0.0014959335094317794, "step": 276 }, { "epoch": 0.04, "learning_rate": 7.486486486486486e-07, "logits/chosen": -0.4917125403881073, "logits/rejected": -0.5260026454925537, "logps/chosen": -112.41964721679688, "logps/rejected": -141.2938690185547, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.0047393799759447575, "rewards/margins": 0.01755828969180584, "rewards/rejected": -0.012818909250199795, "step": 277 }, { "epoch": 0.05, "learning_rate": 7.513513513513513e-07, "logits/chosen": -0.6115276217460632, "logits/rejected": -0.6115276217460632, "logps/chosen": -63.94850158691406, "logps/rejected": -63.94850158691406, "loss": 0.6765, "rewards/accuracies": 0.0, "rewards/chosen": 0.02819366566836834, "rewards/margins": 0.0, "rewards/rejected": 0.02819366566836834, "step": 278 }, { "epoch": 0.05, "learning_rate": 7.540540540540539e-07, "logits/chosen": -0.4088778495788574, "logits/rejected": -0.4218658208847046, "logps/chosen": -69.97645568847656, "logps/rejected": -86.32235717773438, "loss": 0.7471, "rewards/accuracies": 0.0, "rewards/chosen": -0.023642731830477715, "rewards/margins": -0.060948945581912994, "rewards/rejected": 0.03730621561408043, "step": 279 }, { "epoch": 0.05, "learning_rate": 7.567567567567568e-07, "logits/chosen": -0.27607494592666626, "logits/rejected": -0.2560552656650543, "logps/chosen": -108.0770034790039, "logps/rejected": -92.43142700195312, "loss": 0.6981, "rewards/accuracies": 1.0, "rewards/chosen": 0.024384308606386185, "rewards/margins": 0.01643524318933487, "rewards/rejected": 0.00794906634837389, "step": 280 }, { "epoch": 0.05, "learning_rate": 7.594594594594595e-07, "logits/chosen": -0.40547052025794983, "logits/rejected": -0.4031304717063904, "logps/chosen": -94.25276184082031, "logps/rejected": -100.1546859741211, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": -0.02718353271484375, "rewards/margins": 0.010414887219667435, "rewards/rejected": -0.037598419934511185, "step": 281 }, { "epoch": 0.05, "learning_rate": 7.621621621621621e-07, "logits/chosen": -0.10161691159009933, "logits/rejected": -0.10161691159009933, "logps/chosen": -22.866010665893555, "logps/rejected": -22.866010665893555, "loss": 0.6923, "rewards/accuracies": 0.0, "rewards/chosen": -0.0017698288429528475, "rewards/margins": 0.0, "rewards/rejected": -0.0017698288429528475, "step": 282 }, { "epoch": 0.05, "learning_rate": 7.648648648648648e-07, "logits/chosen": -0.16066543757915497, "logits/rejected": -0.1537611037492752, "logps/chosen": -63.396427154541016, "logps/rejected": -89.58740234375, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.020494461059570312, "rewards/margins": 0.03603630140423775, "rewards/rejected": -0.01554183941334486, "step": 283 }, { "epoch": 0.05, "learning_rate": 7.675675675675676e-07, "logits/chosen": -0.0898287445306778, "logits/rejected": -0.08354562520980835, "logps/chosen": -60.325340270996094, "logps/rejected": -46.755470275878906, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.0015968322986736894, "rewards/margins": 0.027779771015048027, "rewards/rejected": -0.026182938367128372, "step": 284 }, { "epoch": 0.05, "learning_rate": 7.702702702702702e-07, "logits/chosen": -0.5653208494186401, "logits/rejected": -0.5085735321044922, "logps/chosen": -99.89974975585938, "logps/rejected": -89.02179718017578, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.04496307298541069, "rewards/margins": 0.02736968919634819, "rewards/rejected": 0.0175933837890625, "step": 285 }, { "epoch": 0.05, "learning_rate": 7.729729729729729e-07, "logits/chosen": -0.2837832272052765, "logits/rejected": -0.24622344970703125, "logps/chosen": -83.137939453125, "logps/rejected": -75.85929870605469, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 0.03794250637292862, "rewards/margins": 0.10157165676355362, "rewards/rejected": -0.063629150390625, "step": 286 }, { "epoch": 0.05, "learning_rate": 7.756756756756756e-07, "logits/chosen": -0.32201650738716125, "logits/rejected": -0.36186483502388, "logps/chosen": -136.7382354736328, "logps/rejected": -76.36276245117188, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.00794219970703125, "rewards/margins": -0.04180755838751793, "rewards/rejected": 0.04974975809454918, "step": 287 }, { "epoch": 0.05, "learning_rate": 7.783783783783784e-07, "logits/chosen": -0.1565636247396469, "logits/rejected": -0.1402961164712906, "logps/chosen": -77.84473419189453, "logps/rejected": -114.89642333984375, "loss": 0.7034, "rewards/accuracies": 0.0, "rewards/chosen": 0.038945771753787994, "rewards/margins": -0.05694427341222763, "rewards/rejected": 0.09589004516601562, "step": 288 }, { "epoch": 0.05, "learning_rate": 7.810810810810811e-07, "logits/chosen": -0.1755819171667099, "logits/rejected": -0.10587473958730698, "logps/chosen": -58.79627227783203, "logps/rejected": -107.64315795898438, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.020520782098174095, "rewards/margins": 0.027497101575136185, "rewards/rejected": -0.006976318545639515, "step": 289 }, { "epoch": 0.05, "learning_rate": 7.837837837837838e-07, "logits/chosen": -0.46392470598220825, "logits/rejected": -0.4508579671382904, "logps/chosen": -81.06454467773438, "logps/rejected": -52.47039031982422, "loss": 0.7239, "rewards/accuracies": 0.0, "rewards/chosen": 0.0164947509765625, "rewards/margins": -0.018993761390447617, "rewards/rejected": 0.03548851236701012, "step": 290 }, { "epoch": 0.05, "learning_rate": 7.864864864864864e-07, "logits/chosen": -0.4314313232898712, "logits/rejected": -0.42866188287734985, "logps/chosen": -133.37445068359375, "logps/rejected": -96.01898193359375, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": -0.010118103586137295, "rewards/margins": -0.0025329594500362873, "rewards/rejected": -0.0075851441361010075, "step": 291 }, { "epoch": 0.05, "learning_rate": 7.891891891891892e-07, "logits/chosen": -0.43567630648612976, "logits/rejected": -0.4367689788341522, "logps/chosen": -89.0173568725586, "logps/rejected": -110.29659271240234, "loss": 0.6842, "rewards/accuracies": 0.0, "rewards/chosen": 0.033724214881658554, "rewards/margins": -0.010453794151544571, "rewards/rejected": 0.044178009033203125, "step": 292 }, { "epoch": 0.05, "learning_rate": 7.918918918918918e-07, "logits/chosen": -0.5743986368179321, "logits/rejected": -0.5472896099090576, "logps/chosen": -191.8577880859375, "logps/rejected": -226.91354370117188, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 0.04781799390912056, "rewards/margins": 0.0670623779296875, "rewards/rejected": -0.01924438588321209, "step": 293 }, { "epoch": 0.05, "learning_rate": 7.945945945945945e-07, "logits/chosen": -0.3871232569217682, "logits/rejected": -0.3734929859638214, "logps/chosen": -68.12725830078125, "logps/rejected": -10.591185569763184, "loss": 0.7019, "rewards/accuracies": 1.0, "rewards/chosen": 0.013240051455795765, "rewards/margins": 0.01644735410809517, "rewards/rejected": -0.003207302186638117, "step": 294 }, { "epoch": 0.05, "learning_rate": 7.972972972972972e-07, "logits/chosen": -0.31515684723854065, "logits/rejected": -0.3287794589996338, "logps/chosen": -41.691837310791016, "logps/rejected": -13.799124717712402, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.011811829172074795, "rewards/margins": 0.021708393469452858, "rewards/rejected": -0.009896564297378063, "step": 295 }, { "epoch": 0.05, "learning_rate": 8e-07, "logits/chosen": -0.2376575469970703, "logits/rejected": -0.2376575469970703, "logps/chosen": -67.7696533203125, "logps/rejected": -67.7696533203125, "loss": 0.678, "rewards/accuracies": 0.0, "rewards/chosen": 0.036829378455877304, "rewards/margins": 0.0, "rewards/rejected": 0.036829378455877304, "step": 296 }, { "epoch": 0.05, "learning_rate": 8.027027027027027e-07, "logits/chosen": -0.24191127717494965, "logits/rejected": -0.18976977467536926, "logps/chosen": -157.98666381835938, "logps/rejected": -97.6490707397461, "loss": 0.6896, "rewards/accuracies": 0.0, "rewards/chosen": 0.03161773830652237, "rewards/margins": -0.008750151842832565, "rewards/rejected": 0.040367890149354935, "step": 297 }, { "epoch": 0.05, "learning_rate": 8.054054054054054e-07, "logits/chosen": -0.4153968095779419, "logits/rejected": -0.4258129894733429, "logps/chosen": -57.64097595214844, "logps/rejected": -117.24942779541016, "loss": 0.7028, "rewards/accuracies": 0.0, "rewards/chosen": -0.012463378719985485, "rewards/margins": -0.030207060277462006, "rewards/rejected": 0.017743682488799095, "step": 298 }, { "epoch": 0.05, "learning_rate": 8.08108108108108e-07, "logits/chosen": -0.9008798599243164, "logits/rejected": -0.9145456552505493, "logps/chosen": -82.61708068847656, "logps/rejected": -109.30799865722656, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.006060791201889515, "rewards/margins": 0.05852355808019638, "rewards/rejected": -0.05246276780962944, "step": 299 }, { "epoch": 0.05, "learning_rate": 8.108108108108108e-07, "logits/chosen": -0.14267686009407043, "logits/rejected": -0.1133304238319397, "logps/chosen": -74.96328735351562, "logps/rejected": -92.35094451904297, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.042704012244939804, "rewards/margins": 0.003406524658203125, "rewards/rejected": 0.03929748758673668, "step": 300 }, { "epoch": 0.05, "learning_rate": 8.135135135135135e-07, "logits/chosen": -0.18098582327365875, "logits/rejected": -0.18098582327365875, "logps/chosen": -74.21546173095703, "logps/rejected": -74.21546173095703, "loss": 0.6792, "rewards/accuracies": 0.0, "rewards/chosen": -0.026297761127352715, "rewards/margins": 0.0, "rewards/rejected": -0.026297761127352715, "step": 301 }, { "epoch": 0.05, "learning_rate": 8.162162162162161e-07, "logits/chosen": -0.3694453537464142, "logits/rejected": -0.3808564245700836, "logps/chosen": -126.30577087402344, "logps/rejected": -99.38382720947266, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.02087249793112278, "rewards/margins": 0.02921295166015625, "rewards/rejected": -0.008340454660356045, "step": 302 }, { "epoch": 0.05, "learning_rate": 8.189189189189189e-07, "logits/chosen": -0.4955960214138031, "logits/rejected": -0.48369207978248596, "logps/chosen": -43.00499725341797, "logps/rejected": -99.40017700195312, "loss": 0.7122, "rewards/accuracies": 0.0, "rewards/chosen": -0.013941573910415173, "rewards/margins": -0.04316673427820206, "rewards/rejected": 0.02922515943646431, "step": 303 }, { "epoch": 0.05, "learning_rate": 8.216216216216217e-07, "logits/chosen": -0.28972142934799194, "logits/rejected": -0.2723579406738281, "logps/chosen": -85.0233383178711, "logps/rejected": -88.48387145996094, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": -0.0007904052617959678, "rewards/margins": 0.00762176513671875, "rewards/rejected": -0.008412170223891735, "step": 304 }, { "epoch": 0.05, "learning_rate": 8.243243243243243e-07, "logits/chosen": -0.31397587060928345, "logits/rejected": -0.33982953429222107, "logps/chosen": -49.87921142578125, "logps/rejected": -134.2371826171875, "loss": 0.6662, "rewards/accuracies": 1.0, "rewards/chosen": -0.012074279598891735, "rewards/margins": 0.03902588039636612, "rewards/rejected": -0.05110016092658043, "step": 305 }, { "epoch": 0.05, "learning_rate": 8.27027027027027e-07, "logits/chosen": -0.7139043807983398, "logits/rejected": -0.668512225151062, "logps/chosen": -130.62603759765625, "logps/rejected": -80.37443542480469, "loss": 0.7263, "rewards/accuracies": 0.0, "rewards/chosen": -0.03348236158490181, "rewards/margins": -0.014440154656767845, "rewards/rejected": -0.019042206928133965, "step": 306 }, { "epoch": 0.05, "learning_rate": 8.297297297297296e-07, "logits/chosen": -0.44448122382164, "logits/rejected": -0.4248316287994385, "logps/chosen": -89.12185668945312, "logps/rejected": -68.3450927734375, "loss": 0.6908, "rewards/accuracies": 0.0, "rewards/chosen": -0.02060699462890625, "rewards/margins": -0.02913665771484375, "rewards/rejected": 0.0085296630859375, "step": 307 }, { "epoch": 0.05, "learning_rate": 8.324324324324324e-07, "logits/chosen": -0.3502560257911682, "logits/rejected": -0.36018145084381104, "logps/chosen": -103.72532653808594, "logps/rejected": -113.87272644042969, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.03646545484662056, "rewards/margins": -0.0035469047725200653, "rewards/rejected": 0.040012359619140625, "step": 308 }, { "epoch": 0.05, "learning_rate": 8.351351351351351e-07, "logits/chosen": -0.16980798542499542, "logits/rejected": -0.13041651248931885, "logps/chosen": -158.14117431640625, "logps/rejected": -103.6114501953125, "loss": 0.6823, "rewards/accuracies": 0.0, "rewards/chosen": 0.01886901818215847, "rewards/margins": -0.0021850597113370895, "rewards/rejected": 0.02105407789349556, "step": 309 }, { "epoch": 0.05, "learning_rate": 8.378378378378377e-07, "logits/chosen": -0.5031245350837708, "logits/rejected": -0.4590701460838318, "logps/chosen": -96.58393859863281, "logps/rejected": -182.747314453125, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.014556884765625, "rewards/margins": -0.0043289195746183395, "rewards/rejected": 0.01888580434024334, "step": 310 }, { "epoch": 0.05, "learning_rate": 8.405405405405405e-07, "logits/chosen": -0.4729987382888794, "logits/rejected": -0.5078628659248352, "logps/chosen": -131.16949462890625, "logps/rejected": -43.932220458984375, "loss": 0.6552, "rewards/accuracies": 1.0, "rewards/chosen": 0.08995971828699112, "rewards/margins": 0.08648490905761719, "rewards/rejected": 0.00347480783239007, "step": 311 }, { "epoch": 0.05, "learning_rate": 8.432432432432433e-07, "logits/chosen": -0.49183422327041626, "logits/rejected": -0.44914141297340393, "logps/chosen": -79.34181213378906, "logps/rejected": -103.85028076171875, "loss": 0.7165, "rewards/accuracies": 0.0, "rewards/chosen": -0.017386628314852715, "rewards/margins": -0.07249297946691513, "rewards/rejected": 0.055106353014707565, "step": 312 }, { "epoch": 0.05, "learning_rate": 8.459459459459459e-07, "logits/chosen": 0.03482908010482788, "logits/rejected": 0.03482908010482788, "logps/chosen": -69.79248046875, "logps/rejected": -69.79248046875, "loss": 0.683, "rewards/accuracies": 0.0, "rewards/chosen": -0.041778564453125, "rewards/margins": 0.0, "rewards/rejected": -0.041778564453125, "step": 313 }, { "epoch": 0.05, "learning_rate": 8.486486486486486e-07, "logits/chosen": -0.07682986557483673, "logits/rejected": -0.05983927845954895, "logps/chosen": -62.72063446044922, "logps/rejected": -22.391048431396484, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.011114883236587048, "rewards/margins": 0.016224097460508347, "rewards/rejected": -0.005109214689582586, "step": 314 }, { "epoch": 0.05, "learning_rate": 8.513513513513513e-07, "logits/chosen": -0.23869453370571136, "logits/rejected": -0.21436481177806854, "logps/chosen": -80.88421630859375, "logps/rejected": -161.00975036621094, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.010731506161391735, "rewards/margins": 0.02365722693502903, "rewards/rejected": -0.012925720773637295, "step": 315 }, { "epoch": 0.05, "learning_rate": 8.54054054054054e-07, "logits/chosen": -0.7771386504173279, "logits/rejected": -0.7138268351554871, "logps/chosen": -94.72341918945312, "logps/rejected": -116.08442687988281, "loss": 0.7097, "rewards/accuracies": 0.0, "rewards/chosen": 0.0083770751953125, "rewards/margins": -0.06791000813245773, "rewards/rejected": 0.07628708332777023, "step": 316 }, { "epoch": 0.05, "learning_rate": 8.567567567567567e-07, "logits/chosen": -0.2680341601371765, "logits/rejected": -0.2721710801124573, "logps/chosen": -88.35153198242188, "logps/rejected": -105.4986343383789, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.02320404164493084, "rewards/margins": -0.02065887488424778, "rewards/rejected": 0.04386291652917862, "step": 317 }, { "epoch": 0.05, "learning_rate": 8.594594594594595e-07, "logits/chosen": -0.13815467059612274, "logits/rejected": -0.07973288744688034, "logps/chosen": -210.9352264404297, "logps/rejected": -101.07438659667969, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.02292938344180584, "rewards/margins": 0.0273208636790514, "rewards/rejected": -0.004391479771584272, "step": 318 }, { "epoch": 0.05, "learning_rate": 8.621621621621621e-07, "logits/chosen": -0.27456676959991455, "logits/rejected": -0.26549792289733887, "logps/chosen": -99.86965942382812, "logps/rejected": -182.59732055664062, "loss": 0.7174, "rewards/accuracies": 0.0, "rewards/chosen": 0.03241882473230362, "rewards/margins": -0.03167877346277237, "rewards/rejected": 0.06409759819507599, "step": 319 }, { "epoch": 0.05, "learning_rate": 8.648648648648649e-07, "logits/chosen": -1.1698318719863892, "logits/rejected": -0.8177074193954468, "logps/chosen": -310.7459716796875, "logps/rejected": -82.5965805053711, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.07966309040784836, "rewards/margins": 0.08347702771425247, "rewards/rejected": -0.0038139342796057463, "step": 320 }, { "epoch": 0.05, "learning_rate": 8.675675675675675e-07, "logits/chosen": -0.608300507068634, "logits/rejected": -0.5060200691223145, "logps/chosen": -129.9212188720703, "logps/rejected": -76.34538269042969, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.08044280856847763, "rewards/margins": 0.110204316675663, "rewards/rejected": -0.029761506244540215, "step": 321 }, { "epoch": 0.05, "learning_rate": 8.702702702702702e-07, "logits/chosen": -0.7447606325149536, "logits/rejected": -0.6950793862342834, "logps/chosen": -294.0896911621094, "logps/rejected": -260.32061767578125, "loss": 0.6761, "rewards/accuracies": 0.0, "rewards/chosen": 0.0350494384765625, "rewards/margins": -0.03240051120519638, "rewards/rejected": 0.06744994968175888, "step": 322 }, { "epoch": 0.05, "learning_rate": 8.729729729729729e-07, "logits/chosen": -0.1308836042881012, "logits/rejected": -0.13820135593414307, "logps/chosen": -9.386747360229492, "logps/rejected": -10.166028022766113, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.006267357151955366, "rewards/margins": 0.010010052472352982, "rewards/rejected": -0.003742694854736328, "step": 323 }, { "epoch": 0.05, "learning_rate": 8.756756756756756e-07, "logits/chosen": 0.040678974241018295, "logits/rejected": 0.03448816388845444, "logps/chosen": -15.129755973815918, "logps/rejected": -70.11109924316406, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": -0.005198574159294367, "rewards/margins": -0.026453305035829544, "rewards/rejected": 0.021254731342196465, "step": 324 }, { "epoch": 0.05, "learning_rate": 8.783783783783784e-07, "logits/chosen": -0.3882763981819153, "logits/rejected": -0.41277599334716797, "logps/chosen": -114.85098266601562, "logps/rejected": -56.464969635009766, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.03826294094324112, "rewards/margins": 0.042488861829042435, "rewards/rejected": -0.00422592181712389, "step": 325 }, { "epoch": 0.05, "learning_rate": 8.810810810810811e-07, "logits/chosen": -0.37186017632484436, "logits/rejected": -0.40151095390319824, "logps/chosen": -185.59791564941406, "logps/rejected": -84.33418273925781, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": 0.06361236423254013, "rewards/margins": 0.014588925987482071, "rewards/rejected": 0.04902343824505806, "step": 326 }, { "epoch": 0.05, "learning_rate": 8.837837837837837e-07, "logits/chosen": -0.5298293828964233, "logits/rejected": -0.5181041955947876, "logps/chosen": -98.101318359375, "logps/rejected": -124.74580383300781, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.02893371693789959, "rewards/margins": 0.043193817138671875, "rewards/rejected": -0.01426010113209486, "step": 327 }, { "epoch": 0.05, "learning_rate": 8.864864864864865e-07, "logits/chosen": -0.7664762139320374, "logits/rejected": -0.7235049605369568, "logps/chosen": -94.0849609375, "logps/rejected": -98.53656768798828, "loss": 0.6853, "rewards/accuracies": 0.0, "rewards/chosen": -0.05650177225470543, "rewards/margins": -0.06883011013269424, "rewards/rejected": 0.01232833880931139, "step": 328 }, { "epoch": 0.05, "learning_rate": 8.891891891891892e-07, "logits/chosen": -0.46557873487472534, "logits/rejected": -0.4654148817062378, "logps/chosen": -113.11990356445312, "logps/rejected": -119.06544494628906, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": -0.027574921026825905, "rewards/margins": -0.015293884091079235, "rewards/rejected": -0.01228103693574667, "step": 329 }, { "epoch": 0.05, "learning_rate": 8.918918918918918e-07, "logits/chosen": -0.7565405964851379, "logits/rejected": -0.7421918511390686, "logps/chosen": -125.98054504394531, "logps/rejected": -39.58802795410156, "loss": 0.687, "rewards/accuracies": 0.0, "rewards/chosen": -0.01710357703268528, "rewards/margins": -0.015143967233598232, "rewards/rejected": -0.0019596100319176912, "step": 330 }, { "epoch": 0.05, "learning_rate": 8.945945945945945e-07, "logits/chosen": 0.11074335873126984, "logits/rejected": 0.10500230640172958, "logps/chosen": -4.433313846588135, "logps/rejected": -3.4017348289489746, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.007984734140336514, "rewards/margins": -0.005373167805373669, "rewards/rejected": 0.013357901945710182, "step": 331 }, { "epoch": 0.05, "learning_rate": 8.972972972972974e-07, "logits/chosen": -0.2829308807849884, "logits/rejected": -0.29356881976127625, "logps/chosen": -86.78286743164062, "logps/rejected": -151.90359497070312, "loss": 0.7191, "rewards/accuracies": 0.0, "rewards/chosen": 0.05296936258673668, "rewards/margins": -0.02117462083697319, "rewards/rejected": 0.07414398342370987, "step": 332 }, { "epoch": 0.05, "learning_rate": 9e-07, "logits/chosen": -0.1085514947772026, "logits/rejected": -0.10982663929462433, "logps/chosen": -4.999255657196045, "logps/rejected": -2.1878628730773926, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.007209205534309149, "rewards/margins": 0.003383970120921731, "rewards/rejected": 0.003825235413387418, "step": 333 }, { "epoch": 0.05, "learning_rate": 9.027027027027027e-07, "logits/chosen": -0.6815992593765259, "logits/rejected": -0.6318789720535278, "logps/chosen": -109.98084259033203, "logps/rejected": -76.60066986083984, "loss": 0.6768, "rewards/accuracies": 0.0, "rewards/chosen": 0.020211029797792435, "rewards/margins": -0.02009124681353569, "rewards/rejected": 0.040302276611328125, "step": 334 }, { "epoch": 0.05, "learning_rate": 9.054054054054053e-07, "logits/chosen": -0.5480333566665649, "logits/rejected": -0.5029409527778625, "logps/chosen": -171.03933715820312, "logps/rejected": -102.09197998046875, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.03612671047449112, "rewards/margins": 0.01886444166302681, "rewards/rejected": 0.01726226881146431, "step": 335 }, { "epoch": 0.05, "learning_rate": 9.081081081081081e-07, "logits/chosen": -0.218074232339859, "logits/rejected": -0.16932913661003113, "logps/chosen": -141.57281494140625, "logps/rejected": -180.095947265625, "loss": 0.6802, "rewards/accuracies": 0.0, "rewards/chosen": 0.07378540188074112, "rewards/margins": -0.011608883738517761, "rewards/rejected": 0.08539428561925888, "step": 336 }, { "epoch": 0.05, "learning_rate": 9.108108108108108e-07, "logits/chosen": -0.5237767696380615, "logits/rejected": -0.5161314010620117, "logps/chosen": -67.05658721923828, "logps/rejected": -54.76858139038086, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": -0.0068191527388989925, "rewards/margins": 0.0036598206497728825, "rewards/rejected": -0.010478973388671875, "step": 337 }, { "epoch": 0.05, "learning_rate": 9.135135135135134e-07, "logits/chosen": -0.29052096605300903, "logits/rejected": -0.20254601538181305, "logps/chosen": -96.47233581542969, "logps/rejected": -14.500325202941895, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.0022445679642260075, "rewards/margins": 0.002213096711784601, "rewards/rejected": 3.147125244140625e-05, "step": 338 }, { "epoch": 0.06, "learning_rate": 9.162162162162161e-07, "logits/chosen": -0.5112229585647583, "logits/rejected": -0.529826283454895, "logps/chosen": -98.7402114868164, "logps/rejected": -115.48056030273438, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.058716583997011185, "rewards/margins": -0.03489379957318306, "rewards/rejected": 0.09361038357019424, "step": 339 }, { "epoch": 0.06, "learning_rate": 9.18918918918919e-07, "logits/chosen": -0.341605007648468, "logits/rejected": -0.34981754422187805, "logps/chosen": -107.05131530761719, "logps/rejected": -115.7171401977539, "loss": 0.6603, "rewards/accuracies": 1.0, "rewards/chosen": 0.07178039848804474, "rewards/margins": 0.04798584431409836, "rewards/rejected": 0.02379455603659153, "step": 340 }, { "epoch": 0.06, "learning_rate": 9.216216216216216e-07, "logits/chosen": -0.6143463253974915, "logits/rejected": -0.6143463253974915, "logps/chosen": -98.4697494506836, "logps/rejected": -98.4697494506836, "loss": 0.6676, "rewards/accuracies": 0.0, "rewards/chosen": -0.017092132940888405, "rewards/margins": 0.0, "rewards/rejected": -0.017092132940888405, "step": 341 }, { "epoch": 0.06, "learning_rate": 9.243243243243243e-07, "logits/chosen": -0.10425739735364914, "logits/rejected": -0.10425739735364914, "logps/chosen": -71.74494171142578, "logps/rejected": -71.74494171142578, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.034806061536073685, "rewards/margins": 0.0, "rewards/rejected": 0.034806061536073685, "step": 342 }, { "epoch": 0.06, "learning_rate": 9.27027027027027e-07, "logits/chosen": -0.6913438439369202, "logits/rejected": -0.7387556433677673, "logps/chosen": -110.76118469238281, "logps/rejected": -41.728599548339844, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.07950973510742188, "rewards/margins": 0.07346954196691513, "rewards/rejected": 0.0060401917435228825, "step": 343 }, { "epoch": 0.06, "learning_rate": 9.297297297297297e-07, "logits/chosen": -0.3355300724506378, "logits/rejected": -0.32703882455825806, "logps/chosen": -118.29271697998047, "logps/rejected": -59.319541931152344, "loss": 0.7128, "rewards/accuracies": 0.0, "rewards/chosen": 0.0018203735817223787, "rewards/margins": -0.06938935071229935, "rewards/rejected": 0.07120972126722336, "step": 344 }, { "epoch": 0.06, "learning_rate": 9.324324324324324e-07, "logits/chosen": 0.128102108836174, "logits/rejected": 0.1299338936805725, "logps/chosen": -11.857009887695312, "logps/rejected": -6.252457141876221, "loss": 0.6844, "rewards/accuracies": 0.0, "rewards/chosen": -0.0027757645584642887, "rewards/margins": -0.01188211515545845, "rewards/rejected": 0.009106350131332874, "step": 345 }, { "epoch": 0.06, "learning_rate": 9.351351351351351e-07, "logits/chosen": -0.49098503589630127, "logits/rejected": -0.3811093866825104, "logps/chosen": -100.12268829345703, "logps/rejected": -155.27830505371094, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.07893753051757812, "rewards/margins": -0.004004672169685364, "rewards/rejected": 0.08294220268726349, "step": 346 }, { "epoch": 0.06, "learning_rate": 9.378378378378377e-07, "logits/chosen": -0.44689539074897766, "logits/rejected": -0.4250628352165222, "logps/chosen": -84.4073486328125, "logps/rejected": -135.8135223388672, "loss": 0.6804, "rewards/accuracies": 0.0, "rewards/chosen": 0.0002685546933207661, "rewards/margins": -0.02073516882956028, "rewards/rejected": 0.02100372314453125, "step": 347 }, { "epoch": 0.06, "learning_rate": 9.405405405405406e-07, "logits/chosen": -0.627684473991394, "logits/rejected": -0.6065748333930969, "logps/chosen": -62.66859817504883, "logps/rejected": -109.74117279052734, "loss": 0.6948, "rewards/accuracies": 1.0, "rewards/chosen": 0.04082069545984268, "rewards/margins": 0.04975395277142525, "rewards/rejected": -0.00893325824290514, "step": 348 }, { "epoch": 0.06, "learning_rate": 9.432432432432433e-07, "logits/chosen": -0.3008269667625427, "logits/rejected": -0.3082747161388397, "logps/chosen": -81.85430908203125, "logps/rejected": -184.37481689453125, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.02289276197552681, "rewards/margins": -0.02297363430261612, "rewards/rejected": 0.04586639627814293, "step": 349 }, { "epoch": 0.06, "learning_rate": 9.459459459459459e-07, "logits/chosen": -0.1604209840297699, "logits/rejected": -0.12150335311889648, "logps/chosen": -86.95320129394531, "logps/rejected": -82.2430419921875, "loss": 0.7037, "rewards/accuracies": 0.0, "rewards/chosen": 0.011463928036391735, "rewards/margins": -0.03505706787109375, "rewards/rejected": 0.04652099683880806, "step": 350 }, { "epoch": 0.06, "learning_rate": 9.486486486486486e-07, "logits/chosen": -0.10099282115697861, "logits/rejected": -0.09336333721876144, "logps/chosen": -125.43267059326172, "logps/rejected": -61.864627838134766, "loss": 0.675, "rewards/accuracies": 0.0, "rewards/chosen": 0.058493804186582565, "rewards/margins": -0.019537735730409622, "rewards/rejected": 0.07803153991699219, "step": 351 }, { "epoch": 0.06, "learning_rate": 9.513513513513513e-07, "logits/chosen": -0.1804327815771103, "logits/rejected": -0.18720528483390808, "logps/chosen": -53.58554458618164, "logps/rejected": -60.85645294189453, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.006764602847397327, "rewards/margins": 0.007092666812241077, "rewards/rejected": -0.00032806396484375, "step": 352 }, { "epoch": 0.06, "learning_rate": 9.540540540540541e-07, "logits/chosen": -0.44671374559402466, "logits/rejected": -0.44130128622055054, "logps/chosen": -74.46231079101562, "logps/rejected": -100.8460693359375, "loss": 0.7114, "rewards/accuracies": 0.0, "rewards/chosen": 0.0034568787086755037, "rewards/margins": -0.07166977226734161, "rewards/rejected": 0.07512664794921875, "step": 353 }, { "epoch": 0.06, "learning_rate": 9.567567567567567e-07, "logits/chosen": -0.23295053839683533, "logits/rejected": -0.20678897202014923, "logps/chosen": -80.74369812011719, "logps/rejected": -74.7661361694336, "loss": 0.7083, "rewards/accuracies": 0.0, "rewards/chosen": -0.0022933960426598787, "rewards/margins": -0.058618929237127304, "rewards/rejected": 0.056325532495975494, "step": 354 }, { "epoch": 0.06, "learning_rate": 9.594594594594594e-07, "logits/chosen": -0.21176190674304962, "logits/rejected": -0.18859165906906128, "logps/chosen": -105.6881332397461, "logps/rejected": -88.67036437988281, "loss": 0.6789, "rewards/accuracies": 0.0, "rewards/chosen": 0.037798311561346054, "rewards/margins": -0.032140348106622696, "rewards/rejected": 0.06993865966796875, "step": 355 }, { "epoch": 0.06, "learning_rate": 9.621621621621622e-07, "logits/chosen": -0.44207605719566345, "logits/rejected": -0.4404984712600708, "logps/chosen": -113.5150146484375, "logps/rejected": -61.58863830566406, "loss": 0.6826, "rewards/accuracies": 0.0, "rewards/chosen": 0.010951233096420765, "rewards/margins": -0.00610122736543417, "rewards/rejected": 0.017052460461854935, "step": 356 }, { "epoch": 0.06, "learning_rate": 9.648648648648648e-07, "logits/chosen": -0.2356811761856079, "logits/rejected": -0.25332456827163696, "logps/chosen": -89.03773498535156, "logps/rejected": -83.67353820800781, "loss": 0.7254, "rewards/accuracies": 0.0, "rewards/chosen": -0.006741333287209272, "rewards/margins": -0.04880676418542862, "rewards/rejected": 0.04206543043255806, "step": 357 }, { "epoch": 0.06, "learning_rate": 9.675675675675676e-07, "logits/chosen": -0.41000601649284363, "logits/rejected": -0.4412704408168793, "logps/chosen": -115.93838500976562, "logps/rejected": -25.78310775756836, "loss": 0.7089, "rewards/accuracies": 1.0, "rewards/chosen": 0.02016601525247097, "rewards/margins": 0.021120261400938034, "rewards/rejected": -0.0009542465559206903, "step": 358 }, { "epoch": 0.06, "learning_rate": 9.702702702702702e-07, "logits/chosen": -0.7073876857757568, "logits/rejected": -0.7039561867713928, "logps/chosen": -94.04855346679688, "logps/rejected": -117.68563842773438, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.022336577996611595, "rewards/margins": -0.04870452731847763, "rewards/rejected": 0.07104110717773438, "step": 359 }, { "epoch": 0.06, "learning_rate": 9.72972972972973e-07, "logits/chosen": -0.5672470331192017, "logits/rejected": -0.47898048162460327, "logps/chosen": -139.5933380126953, "logps/rejected": -89.7555160522461, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 0.03546905517578125, "rewards/margins": 0.02663116529583931, "rewards/rejected": 0.008837890811264515, "step": 360 }, { "epoch": 0.06, "learning_rate": 9.756756756756756e-07, "logits/chosen": -0.10434169322252274, "logits/rejected": -0.10624919086694717, "logps/chosen": -80.87747192382812, "logps/rejected": -98.54951477050781, "loss": 0.648, "rewards/accuracies": 1.0, "rewards/chosen": 0.000994873116724193, "rewards/margins": 0.0054870606400072575, "rewards/rejected": -0.0044921874068677425, "step": 361 }, { "epoch": 0.06, "learning_rate": 9.783783783783782e-07, "logits/chosen": -0.3114411234855652, "logits/rejected": -0.3114411234855652, "logps/chosen": -64.81149291992188, "logps/rejected": -64.81149291992188, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.036403656005859375, "rewards/margins": 0.0, "rewards/rejected": 0.036403656005859375, "step": 362 }, { "epoch": 0.06, "learning_rate": 9.81081081081081e-07, "logits/chosen": -0.3882998824119568, "logits/rejected": -0.3882998824119568, "logps/chosen": -97.013671875, "logps/rejected": -97.013671875, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": 0.02761077880859375, "rewards/margins": 0.0, "rewards/rejected": 0.02761077880859375, "step": 363 }, { "epoch": 0.06, "learning_rate": 9.837837837837839e-07, "logits/chosen": -0.20375950634479523, "logits/rejected": -0.1830262541770935, "logps/chosen": -64.17417907714844, "logps/rejected": -112.25331115722656, "loss": 0.671, "rewards/accuracies": 0.0, "rewards/chosen": 0.05600280687212944, "rewards/margins": -0.019739534705877304, "rewards/rejected": 0.07574234157800674, "step": 364 }, { "epoch": 0.06, "learning_rate": 9.864864864864865e-07, "logits/chosen": -0.4067549705505371, "logits/rejected": -0.3803490996360779, "logps/chosen": -62.10698699951172, "logps/rejected": -96.20384216308594, "loss": 0.6553, "rewards/accuracies": 0.0, "rewards/chosen": -0.022872162982821465, "rewards/margins": -0.030701447278261185, "rewards/rejected": 0.007829285226762295, "step": 365 }, { "epoch": 0.06, "learning_rate": 9.89189189189189e-07, "logits/chosen": -0.5416258573532104, "logits/rejected": -0.10101370513439178, "logps/chosen": -255.708740234375, "logps/rejected": -243.59133911132812, "loss": 0.7192, "rewards/accuracies": 0.0, "rewards/chosen": 0.02257843129336834, "rewards/margins": -0.09534301608800888, "rewards/rejected": 0.11792144924402237, "step": 366 }, { "epoch": 0.06, "learning_rate": 9.91891891891892e-07, "logits/chosen": -0.3935460150241852, "logits/rejected": -0.4075452983379364, "logps/chosen": -108.59591674804688, "logps/rejected": -110.55990600585938, "loss": 0.7003, "rewards/accuracies": 1.0, "rewards/chosen": 0.06169433519244194, "rewards/margins": 0.00742645189166069, "rewards/rejected": 0.05426788330078125, "step": 367 }, { "epoch": 0.06, "learning_rate": 9.945945945945945e-07, "logits/chosen": -0.812423825263977, "logits/rejected": -0.8684738278388977, "logps/chosen": -87.89889526367188, "logps/rejected": -25.776247024536133, "loss": 0.6733, "rewards/accuracies": 1.0, "rewards/chosen": 0.08570938557386398, "rewards/margins": 0.06707878410816193, "rewards/rejected": 0.018630599603056908, "step": 368 }, { "epoch": 0.06, "learning_rate": 9.972972972972973e-07, "logits/chosen": -0.3549596667289734, "logits/rejected": -0.3536057472229004, "logps/chosen": -123.93199157714844, "logps/rejected": -21.539287567138672, "loss": 0.6717, "rewards/accuracies": 1.0, "rewards/chosen": 0.040811922401189804, "rewards/margins": 0.03700294718146324, "rewards/rejected": 0.0038089752197265625, "step": 369 }, { "epoch": 0.06, "learning_rate": 1e-06, "logits/chosen": -0.12387119978666306, "logits/rejected": -0.13428431749343872, "logps/chosen": -45.722496032714844, "logps/rejected": -37.792789459228516, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.021237565204501152, "rewards/margins": 0.001969529315829277, "rewards/rejected": 0.019268035888671875, "step": 370 }, { "epoch": 0.06, "learning_rate": 9.999999827273655e-07, "logits/chosen": -0.5891165733337402, "logits/rejected": -0.5726391673088074, "logps/chosen": -103.53028869628906, "logps/rejected": -48.24945831298828, "loss": 0.6872, "rewards/accuracies": 0.0, "rewards/chosen": -0.01857452467083931, "rewards/margins": -0.015496826730668545, "rewards/rejected": -0.0030776977073401213, "step": 371 }, { "epoch": 0.06, "learning_rate": 9.999999309094632e-07, "logits/chosen": -0.29549914598464966, "logits/rejected": -0.29549914598464966, "logps/chosen": -23.73455810546875, "logps/rejected": -23.73455810546875, "loss": 0.6838, "rewards/accuracies": 0.0, "rewards/chosen": 0.00275936140678823, "rewards/margins": 0.0, "rewards/rejected": 0.00275936140678823, "step": 372 }, { "epoch": 0.06, "learning_rate": 9.999998445462969e-07, "logits/chosen": -0.24159851670265198, "logits/rejected": -0.18436022102832794, "logps/chosen": -79.37989807128906, "logps/rejected": -140.3987579345703, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.07922440022230148, "rewards/margins": -0.029325097799301147, "rewards/rejected": 0.10854949802160263, "step": 373 }, { "epoch": 0.06, "learning_rate": 9.99999723637872e-07, "logits/chosen": -0.3887360095977783, "logits/rejected": -0.395585298538208, "logps/chosen": -2.5222036838531494, "logps/rejected": -16.88914680480957, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.009483146481215954, "rewards/margins": -0.0023963935673236847, "rewards/rejected": 0.011879540048539639, "step": 374 }, { "epoch": 0.06, "learning_rate": 9.999995681841978e-07, "logits/chosen": -0.5645462870597839, "logits/rejected": -1.0671247243881226, "logps/chosen": -113.23602294921875, "logps/rejected": -35.972686767578125, "loss": 0.6707, "rewards/accuracies": 1.0, "rewards/chosen": 0.07692413777112961, "rewards/margins": 0.06791801750659943, "rewards/rejected": 0.009006119333207607, "step": 375 }, { "epoch": 0.06, "learning_rate": 9.99999378185284e-07, "logits/chosen": -0.2717573642730713, "logits/rejected": -0.16578057408332825, "logps/chosen": -46.63580322265625, "logps/rejected": -24.684232711791992, "loss": 0.6585, "rewards/accuracies": 1.0, "rewards/chosen": 0.03720550611615181, "rewards/margins": 0.032857704907655716, "rewards/rejected": 0.004347801208496094, "step": 376 }, { "epoch": 0.06, "learning_rate": 9.999991536411446e-07, "logits/chosen": -0.45269739627838135, "logits/rejected": -0.40852996706962585, "logps/chosen": -150.66708374023438, "logps/rejected": -85.23141479492188, "loss": 0.6749, "rewards/accuracies": 0.0, "rewards/chosen": 0.01933593861758709, "rewards/margins": -0.030313873663544655, "rewards/rejected": 0.049649812281131744, "step": 377 }, { "epoch": 0.06, "learning_rate": 9.999988945517943e-07, "logits/chosen": -0.6119242906570435, "logits/rejected": -0.7058473229408264, "logps/chosen": -264.839599609375, "logps/rejected": -310.1995849609375, "loss": 0.6488, "rewards/accuracies": 1.0, "rewards/chosen": 0.29349976778030396, "rewards/margins": 0.16533814370632172, "rewards/rejected": 0.12816162407398224, "step": 378 }, { "epoch": 0.06, "learning_rate": 9.999986009172517e-07, "logits/chosen": -0.580294668674469, "logits/rejected": -0.5698191523551941, "logps/chosen": -145.39541625976562, "logps/rejected": -218.00706481933594, "loss": 0.7367, "rewards/accuracies": 0.0, "rewards/chosen": 0.06436767429113388, "rewards/margins": -0.16581878066062927, "rewards/rejected": 0.23018646240234375, "step": 379 }, { "epoch": 0.06, "learning_rate": 9.999982727375366e-07, "logits/chosen": -0.5154333114624023, "logits/rejected": -0.5978396534919739, "logps/chosen": -151.79039001464844, "logps/rejected": -62.684749603271484, "loss": 0.6725, "rewards/accuracies": 1.0, "rewards/chosen": 0.02009429968893528, "rewards/margins": 0.019781876355409622, "rewards/rejected": 0.0003124237118754536, "step": 380 }, { "epoch": 0.06, "learning_rate": 9.99997910012672e-07, "logits/chosen": -0.3944045901298523, "logits/rejected": -0.40281713008880615, "logps/chosen": -77.78738403320312, "logps/rejected": -61.83123779296875, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 0.028018951416015625, "rewards/margins": -0.06707534939050674, "rewards/rejected": 0.09509430080652237, "step": 381 }, { "epoch": 0.06, "learning_rate": 9.99997512742683e-07, "logits/chosen": -0.39863061904907227, "logits/rejected": -0.4117414653301239, "logps/chosen": -106.54573059082031, "logps/rejected": -80.36248779296875, "loss": 0.7226, "rewards/accuracies": 1.0, "rewards/chosen": 0.09822387993335724, "rewards/margins": 0.03556670993566513, "rewards/rejected": 0.06265716999769211, "step": 382 }, { "epoch": 0.06, "learning_rate": 9.999970809275967e-07, "logits/chosen": -0.5449909567832947, "logits/rejected": -0.557068943977356, "logps/chosen": -113.61973571777344, "logps/rejected": -68.05712127685547, "loss": 0.6215, "rewards/accuracies": 1.0, "rewards/chosen": 0.203685000538826, "rewards/margins": 0.10456466674804688, "rewards/rejected": 0.09912033379077911, "step": 383 }, { "epoch": 0.06, "learning_rate": 9.999966145674432e-07, "logits/chosen": -0.21414268016815186, "logits/rejected": -0.19549721479415894, "logps/chosen": -54.072853088378906, "logps/rejected": -101.20774841308594, "loss": 0.7304, "rewards/accuracies": 0.0, "rewards/chosen": 0.014613724313676357, "rewards/margins": -0.0351078025996685, "rewards/rejected": 0.049721527844667435, "step": 384 }, { "epoch": 0.06, "learning_rate": 9.999961136622546e-07, "logits/chosen": -0.41743847727775574, "logits/rejected": -0.3875029981136322, "logps/chosen": -62.781700134277344, "logps/rejected": -27.793636322021484, "loss": 0.715, "rewards/accuracies": 0.0, "rewards/chosen": 0.009372711181640625, "rewards/margins": -0.009732818230986595, "rewards/rejected": 0.01910552941262722, "step": 385 }, { "epoch": 0.06, "learning_rate": 9.999955782120655e-07, "logits/chosen": -0.36205825209617615, "logits/rejected": -0.33297210931777954, "logps/chosen": -142.02870178222656, "logps/rejected": -72.96728515625, "loss": 0.7059, "rewards/accuracies": 1.0, "rewards/chosen": 0.07023773342370987, "rewards/margins": 0.01671295240521431, "rewards/rejected": 0.05352478101849556, "step": 386 }, { "epoch": 0.06, "learning_rate": 9.999950082169131e-07, "logits/chosen": -0.4899241626262665, "logits/rejected": -0.47255486249923706, "logps/chosen": -52.1991081237793, "logps/rejected": -22.880229949951172, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": -0.018036270514130592, "rewards/margins": -0.024555206298828125, "rewards/rejected": 0.00651893625035882, "step": 387 }, { "epoch": 0.06, "learning_rate": 9.999944036768366e-07, "logits/chosen": -0.15746107697486877, "logits/rejected": -0.09881272912025452, "logps/chosen": -87.44729614257812, "logps/rejected": -47.19255065917969, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.01593017578125, "rewards/margins": -0.02868194505572319, "rewards/rejected": 0.04461212083697319, "step": 388 }, { "epoch": 0.06, "learning_rate": 9.999937645918776e-07, "logits/chosen": -0.42399927973747253, "logits/rejected": -0.4562405049800873, "logps/chosen": -201.20303344726562, "logps/rejected": -94.11534881591797, "loss": 0.6698, "rewards/accuracies": 1.0, "rewards/chosen": 0.17054139077663422, "rewards/margins": 0.08200302720069885, "rewards/rejected": 0.08853836357593536, "step": 389 }, { "epoch": 0.06, "learning_rate": 9.999930909620807e-07, "logits/chosen": -0.6556652784347534, "logits/rejected": -0.6611606478691101, "logps/chosen": -226.9092254638672, "logps/rejected": -108.88285827636719, "loss": 0.6923, "rewards/accuracies": 0.0, "rewards/chosen": 0.005244445987045765, "rewards/margins": -0.03426818922162056, "rewards/rejected": 0.03951263427734375, "step": 390 }, { "epoch": 0.06, "learning_rate": 9.99992382787492e-07, "logits/chosen": -0.5366853475570679, "logits/rejected": -0.5458175539970398, "logps/chosen": -151.0570831298828, "logps/rejected": -123.42682647705078, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.15699005126953125, "rewards/margins": 0.11298294365406036, "rewards/rejected": 0.044007111340761185, "step": 391 }, { "epoch": 0.06, "learning_rate": 9.999916400681607e-07, "logits/chosen": -0.6523032784461975, "logits/rejected": -0.6812893748283386, "logps/chosen": -160.70016479492188, "logps/rejected": -31.80668067932129, "loss": 0.7208, "rewards/accuracies": 0.0, "rewards/chosen": -0.01651916466653347, "rewards/margins": -0.020888328552246094, "rewards/rejected": 0.004369163420051336, "step": 392 }, { "epoch": 0.06, "learning_rate": 9.99990862804138e-07, "logits/chosen": -0.2295282930135727, "logits/rejected": -0.15869875252246857, "logps/chosen": -203.82931518554688, "logps/rejected": -77.52662658691406, "loss": 0.6456, "rewards/accuracies": 1.0, "rewards/chosen": 0.08121033012866974, "rewards/margins": 0.06740265339612961, "rewards/rejected": 0.01380767859518528, "step": 393 }, { "epoch": 0.06, "learning_rate": 9.999900509954777e-07, "logits/chosen": -0.2550954818725586, "logits/rejected": -0.2656903564929962, "logps/chosen": -155.84994506835938, "logps/rejected": -143.69805908203125, "loss": 0.6412, "rewards/accuracies": 1.0, "rewards/chosen": 0.164631649851799, "rewards/margins": 0.011479184031486511, "rewards/rejected": 0.1531524658203125, "step": 394 }, { "epoch": 0.06, "learning_rate": 9.999892046422358e-07, "logits/chosen": -0.56670743227005, "logits/rejected": -0.6322862505912781, "logps/chosen": -187.57305908203125, "logps/rejected": -58.98921203613281, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 0.073272705078125, "rewards/margins": 0.06339187920093536, "rewards/rejected": 0.00988082867115736, "step": 395 }, { "epoch": 0.06, "learning_rate": 9.999883237444709e-07, "logits/chosen": -0.16541419923305511, "logits/rejected": -0.1508130431175232, "logps/chosen": -74.06045532226562, "logps/rejected": -148.81430053710938, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.08304901421070099, "rewards/margins": -0.08267669379711151, "rewards/rejected": 0.1657257080078125, "step": 396 }, { "epoch": 0.06, "learning_rate": 9.999874083022436e-07, "logits/chosen": -0.6695342659950256, "logits/rejected": -0.6171880960464478, "logps/chosen": -94.3324966430664, "logps/rejected": -198.58847045898438, "loss": 0.6895, "rewards/accuracies": 0.0, "rewards/chosen": 0.08361587673425674, "rewards/margins": -0.10474777966737747, "rewards/rejected": 0.18836365640163422, "step": 397 }, { "epoch": 0.06, "learning_rate": 9.999864583156174e-07, "logits/chosen": -0.15741460025310516, "logits/rejected": -0.15741460025310516, "logps/chosen": -103.67754364013672, "logps/rejected": -103.67754364013672, "loss": 0.6565, "rewards/accuracies": 0.0, "rewards/chosen": -0.006931304931640625, "rewards/margins": 0.0, "rewards/rejected": -0.006931304931640625, "step": 398 }, { "epoch": 0.06, "learning_rate": 9.999854737846577e-07, "logits/chosen": -0.42824625968933105, "logits/rejected": -0.42756763100624084, "logps/chosen": -157.56236267089844, "logps/rejected": -188.7030029296875, "loss": 0.7276, "rewards/accuracies": 0.0, "rewards/chosen": 0.05188293382525444, "rewards/margins": -0.1998291164636612, "rewards/rejected": 0.25171205401420593, "step": 399 }, { "epoch": 0.06, "learning_rate": 9.99984454709433e-07, "logits/chosen": -0.6430537104606628, "logits/rejected": -0.6740773320198059, "logps/chosen": -125.7249755859375, "logps/rejected": -94.91410827636719, "loss": 0.6439, "rewards/accuracies": 1.0, "rewards/chosen": 0.2583419978618622, "rewards/margins": 0.0979919582605362, "rewards/rejected": 0.160350039601326, "step": 400 }, { "epoch": 0.07, "learning_rate": 9.999834010900131e-07, "logits/chosen": -0.6101714968681335, "logits/rejected": -0.44327330589294434, "logps/chosen": -158.1851806640625, "logps/rejected": -57.89967727661133, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.13975678384304047, "rewards/margins": 0.08235932141542435, "rewards/rejected": 0.05739746242761612, "step": 401 }, { "epoch": 0.07, "learning_rate": 9.99982312926471e-07, "logits/chosen": -0.42833077907562256, "logits/rejected": -0.42015403509140015, "logps/chosen": -103.23133087158203, "logps/rejected": -87.1596908569336, "loss": 0.7132, "rewards/accuracies": 0.0, "rewards/chosen": 0.009617614559829235, "rewards/margins": -0.059288788586854935, "rewards/rejected": 0.06890640407800674, "step": 402 }, { "epoch": 0.07, "learning_rate": 9.999811902188821e-07, "logits/chosen": -0.37701699137687683, "logits/rejected": -0.40070053935050964, "logps/chosen": -57.33810043334961, "logps/rejected": -54.959205627441406, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.10058784484863281, "rewards/margins": 0.10569877922534943, "rewards/rejected": -0.00511093158274889, "step": 403 }, { "epoch": 0.07, "learning_rate": 9.99980032967324e-07, "logits/chosen": -0.4918137788772583, "logits/rejected": -0.48826393485069275, "logps/chosen": -107.77630615234375, "logps/rejected": -105.44876098632812, "loss": 0.7133, "rewards/accuracies": 0.0, "rewards/chosen": 0.03453826904296875, "rewards/margins": -0.10374604165554047, "rewards/rejected": 0.13828431069850922, "step": 404 }, { "epoch": 0.07, "learning_rate": 9.999788411718763e-07, "logits/chosen": -0.33089542388916016, "logits/rejected": -0.32332679629325867, "logps/chosen": -57.734039306640625, "logps/rejected": -60.37776184082031, "loss": 0.7158, "rewards/accuracies": 0.0, "rewards/chosen": 0.0043045044876635075, "rewards/margins": -0.07081375271081924, "rewards/rejected": 0.07511825859546661, "step": 405 }, { "epoch": 0.07, "learning_rate": 9.999776148326214e-07, "logits/chosen": -0.3349669575691223, "logits/rejected": -0.22930103540420532, "logps/chosen": -67.7504653930664, "logps/rejected": -63.943843841552734, "loss": 0.7034, "rewards/accuracies": 1.0, "rewards/chosen": 0.11836929619312286, "rewards/margins": 0.0450107604265213, "rewards/rejected": 0.07335853576660156, "step": 406 }, { "epoch": 0.07, "learning_rate": 9.999763539496443e-07, "logits/chosen": -0.2417858988046646, "logits/rejected": -0.2560396194458008, "logps/chosen": -87.63804626464844, "logps/rejected": -118.47180938720703, "loss": 0.685, "rewards/accuracies": 0.0, "rewards/chosen": 0.030654145404696465, "rewards/margins": -0.11981353163719177, "rewards/rejected": 0.1504676789045334, "step": 407 }, { "epoch": 0.07, "learning_rate": 9.99975058523032e-07, "logits/chosen": -0.40841272473335266, "logits/rejected": -0.3766116797924042, "logps/chosen": -79.86907958984375, "logps/rejected": -95.3398666381836, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 0.11169586330652237, "rewards/margins": -0.05278854817152023, "rewards/rejected": 0.1644844114780426, "step": 408 }, { "epoch": 0.07, "learning_rate": 9.999737285528738e-07, "logits/chosen": -0.5360091924667358, "logits/rejected": -0.5342167019844055, "logps/chosen": -137.0596160888672, "logps/rejected": -99.256591796875, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.07323455810546875, "rewards/margins": 0.0013053864240646362, "rewards/rejected": 0.07192917168140411, "step": 409 }, { "epoch": 0.07, "learning_rate": 9.99972364039262e-07, "logits/chosen": -0.279249370098114, "logits/rejected": -0.2801867127418518, "logps/chosen": -226.50433349609375, "logps/rejected": -137.465576171875, "loss": 0.6869, "rewards/accuracies": 0.0, "rewards/chosen": 0.14621277153491974, "rewards/margins": -0.022363275289535522, "rewards/rejected": 0.16857604682445526, "step": 410 }, { "epoch": 0.07, "learning_rate": 9.999709649822902e-07, "logits/chosen": -0.3483566343784332, "logits/rejected": -0.3482142984867096, "logps/chosen": -73.2462158203125, "logps/rejected": -57.60054016113281, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 0.004795837681740522, "rewards/margins": -0.034471891820430756, "rewards/rejected": 0.039267729967832565, "step": 411 }, { "epoch": 0.07, "learning_rate": 9.999695313820558e-07, "logits/chosen": 0.16734004020690918, "logits/rejected": 0.1480332314968109, "logps/chosen": -75.41600036621094, "logps/rejected": -77.61137390136719, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 0.002199554583057761, "rewards/margins": -0.10050278156995773, "rewards/rejected": 0.10270233452320099, "step": 412 }, { "epoch": 0.07, "learning_rate": 9.999680632386575e-07, "logits/chosen": -0.5108360052108765, "logits/rejected": -0.4651206135749817, "logps/chosen": -112.19233703613281, "logps/rejected": -81.66230773925781, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 0.04593658447265625, "rewards/margins": -0.14934693276882172, "rewards/rejected": 0.19528351724147797, "step": 413 }, { "epoch": 0.07, "learning_rate": 9.999665605521969e-07, "logits/chosen": -0.3477672338485718, "logits/rejected": -0.2952291965484619, "logps/chosen": -107.59217834472656, "logps/rejected": -201.3736572265625, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 0.13901595771312714, "rewards/margins": -0.14398576319217682, "rewards/rejected": 0.28300172090530396, "step": 414 }, { "epoch": 0.07, "learning_rate": 9.999650233227774e-07, "logits/chosen": -0.4064844250679016, "logits/rejected": -0.3811263144016266, "logps/chosen": -96.99520874023438, "logps/rejected": -86.89559936523438, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.033663179725408554, "rewards/margins": -0.032196808606386185, "rewards/rejected": 0.06585998833179474, "step": 415 }, { "epoch": 0.07, "learning_rate": 9.999634515505058e-07, "logits/chosen": -0.5747688412666321, "logits/rejected": -0.4661404490470886, "logps/chosen": -204.3419647216797, "logps/rejected": -142.81336975097656, "loss": 0.7122, "rewards/accuracies": 0.0, "rewards/chosen": 0.03426361083984375, "rewards/margins": -0.03003082424402237, "rewards/rejected": 0.06429443508386612, "step": 416 }, { "epoch": 0.07, "learning_rate": 9.999618452354901e-07, "logits/chosen": -0.4086279571056366, "logits/rejected": -0.40147125720977783, "logps/chosen": -63.796897888183594, "logps/rejected": -80.80718231201172, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 0.050551604479551315, "rewards/margins": -0.02534027025103569, "rewards/rejected": 0.075891874730587, "step": 417 }, { "epoch": 0.07, "learning_rate": 9.99960204377842e-07, "logits/chosen": -0.30606910586357117, "logits/rejected": -0.32941046357154846, "logps/chosen": -215.28746032714844, "logps/rejected": -144.98403930664062, "loss": 0.6452, "rewards/accuracies": 1.0, "rewards/chosen": 0.06821136921644211, "rewards/margins": 0.0027679502964019775, "rewards/rejected": 0.06544341892004013, "step": 418 }, { "epoch": 0.07, "learning_rate": 9.999585289776739e-07, "logits/chosen": -0.6519638299942017, "logits/rejected": -0.6508263349533081, "logps/chosen": -118.14949035644531, "logps/rejected": -74.46650695800781, "loss": 0.713, "rewards/accuracies": 0.0, "rewards/chosen": 0.03592529520392418, "rewards/margins": -0.0007225014269351959, "rewards/rejected": 0.036647796630859375, "step": 419 }, { "epoch": 0.07, "learning_rate": 9.999568190351024e-07, "logits/chosen": -0.26979246735572815, "logits/rejected": -0.24376678466796875, "logps/chosen": -75.72088623046875, "logps/rejected": -66.21484375, "loss": 0.6576, "rewards/accuracies": 1.0, "rewards/chosen": 0.11062393337488174, "rewards/margins": 0.08736038208007812, "rewards/rejected": 0.02326354943215847, "step": 420 }, { "epoch": 0.07, "learning_rate": 9.999550745502453e-07, "logits/chosen": -0.4137313961982727, "logits/rejected": -0.4247390627861023, "logps/chosen": -125.58146667480469, "logps/rejected": -97.9300537109375, "loss": 0.7392, "rewards/accuracies": 0.0, "rewards/chosen": -0.04576263576745987, "rewards/margins": -0.10606078803539276, "rewards/rejected": 0.06029815599322319, "step": 421 }, { "epoch": 0.07, "learning_rate": 9.999532955232233e-07, "logits/chosen": -0.5380750894546509, "logits/rejected": -0.5174559354782104, "logps/chosen": -133.67388916015625, "logps/rejected": -65.720703125, "loss": 0.739, "rewards/accuracies": 0.0, "rewards/chosen": 0.04869995266199112, "rewards/margins": -0.05897674709558487, "rewards/rejected": 0.10767669975757599, "step": 422 }, { "epoch": 0.07, "learning_rate": 9.99951481954159e-07, "logits/chosen": -0.39796847105026245, "logits/rejected": -0.39186012744903564, "logps/chosen": -67.74238586425781, "logps/rejected": -74.15849304199219, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 0.13591919839382172, "rewards/margins": 0.046868905425071716, "rewards/rejected": 0.08905029296875, "step": 423 }, { "epoch": 0.07, "learning_rate": 9.999496338431782e-07, "logits/chosen": -0.18024396896362305, "logits/rejected": -0.13461069762706757, "logps/chosen": -266.04205322265625, "logps/rejected": -185.76919555664062, "loss": 0.684, "rewards/accuracies": 0.0, "rewards/chosen": 0.19721375405788422, "rewards/margins": -0.12257538735866547, "rewards/rejected": 0.3197891414165497, "step": 424 }, { "epoch": 0.07, "learning_rate": 9.999477511904078e-07, "logits/chosen": -0.21046000719070435, "logits/rejected": -0.19809351861476898, "logps/chosen": -121.38398742675781, "logps/rejected": -83.04643249511719, "loss": 0.7619, "rewards/accuracies": 0.0, "rewards/chosen": 0.015293121337890625, "rewards/margins": -0.12384568154811859, "rewards/rejected": 0.13913880288600922, "step": 425 }, { "epoch": 0.07, "learning_rate": 9.999458339959785e-07, "logits/chosen": -0.5736907124519348, "logits/rejected": -0.47164270281791687, "logps/chosen": -244.02088928222656, "logps/rejected": -152.5070343017578, "loss": 0.6523, "rewards/accuracies": 1.0, "rewards/chosen": 0.09635772556066513, "rewards/margins": 0.0042053163051605225, "rewards/rejected": 0.09215240925550461, "step": 426 }, { "epoch": 0.07, "learning_rate": 9.999438822600227e-07, "logits/chosen": -0.42662855982780457, "logits/rejected": -0.3866802752017975, "logps/chosen": -315.13653564453125, "logps/rejected": -73.82395935058594, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.09086304157972336, "rewards/margins": 0.030023198574781418, "rewards/rejected": 0.06083984300494194, "step": 427 }, { "epoch": 0.07, "learning_rate": 9.99941895982675e-07, "logits/chosen": -0.3421221375465393, "logits/rejected": -0.31793850660324097, "logps/chosen": -79.94613647460938, "logps/rejected": -103.40752410888672, "loss": 0.6882, "rewards/accuracies": 0.0, "rewards/chosen": 0.1407524198293686, "rewards/margins": -0.019746392965316772, "rewards/rejected": 0.16049881279468536, "step": 428 }, { "epoch": 0.07, "learning_rate": 9.99939875164073e-07, "logits/chosen": -0.43557289242744446, "logits/rejected": -0.3836863934993744, "logps/chosen": -219.8617401123047, "logps/rejected": -143.44943237304688, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 0.15556488931179047, "rewards/margins": 0.019621282815933228, "rewards/rejected": 0.13594360649585724, "step": 429 }, { "epoch": 0.07, "learning_rate": 9.99937819804356e-07, "logits/chosen": -0.2436915934085846, "logits/rejected": -0.2242281585931778, "logps/chosen": -66.93733978271484, "logps/rejected": -59.827789306640625, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.13321304321289062, "rewards/margins": 0.08273620903491974, "rewards/rejected": 0.050476837903261185, "step": 430 }, { "epoch": 0.07, "learning_rate": 9.99935729903666e-07, "logits/chosen": 0.006044622976332903, "logits/rejected": 0.034576572477817535, "logps/chosen": -83.02842712402344, "logps/rejected": -79.97151184082031, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.10590057820081711, "rewards/margins": 0.07954178005456924, "rewards/rejected": 0.026358796283602715, "step": 431 }, { "epoch": 0.07, "learning_rate": 9.999336054621477e-07, "logits/chosen": -0.5106647610664368, "logits/rejected": -0.5402275919914246, "logps/chosen": -192.53350830078125, "logps/rejected": -103.51844024658203, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.1038360595703125, "rewards/margins": 0.029921717941761017, "rewards/rejected": 0.07391434162855148, "step": 432 }, { "epoch": 0.07, "learning_rate": 9.999314464799476e-07, "logits/chosen": -0.39959830045700073, "logits/rejected": -0.21641090512275696, "logps/chosen": -239.36102294921875, "logps/rejected": -179.86102294921875, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.16649018228054047, "rewards/margins": -0.022096246480941772, "rewards/rejected": 0.18858642876148224, "step": 433 }, { "epoch": 0.07, "learning_rate": 9.99929252957215e-07, "logits/chosen": -0.24147234857082367, "logits/rejected": -0.22348792850971222, "logps/chosen": -153.25265502929688, "logps/rejected": -102.99939727783203, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.23149414360523224, "rewards/margins": 0.08894500136375427, "rewards/rejected": 0.14254914224147797, "step": 434 }, { "epoch": 0.07, "learning_rate": 9.999270248941012e-07, "logits/chosen": -0.24986691772937775, "logits/rejected": -0.21904124319553375, "logps/chosen": -97.23841094970703, "logps/rejected": -117.53643035888672, "loss": 0.72, "rewards/accuracies": 0.0, "rewards/chosen": 0.06687011569738388, "rewards/margins": -0.055843353271484375, "rewards/rejected": 0.12271346896886826, "step": 435 }, { "epoch": 0.07, "learning_rate": 9.999247622907606e-07, "logits/chosen": -0.41479524970054626, "logits/rejected": -0.4260745346546173, "logps/chosen": -158.14112854003906, "logps/rejected": -121.34944152832031, "loss": 0.6429, "rewards/accuracies": 1.0, "rewards/chosen": 0.24267731606960297, "rewards/margins": 0.05809326469898224, "rewards/rejected": 0.18458405137062073, "step": 436 }, { "epoch": 0.07, "learning_rate": 9.99922465147349e-07, "logits/chosen": 0.10443585366010666, "logits/rejected": 0.08683042228221893, "logps/chosen": -10.9591064453125, "logps/rejected": -34.67107391357422, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.014448833651840687, "rewards/margins": 0.011786174960434437, "rewards/rejected": 0.00266265869140625, "step": 437 }, { "epoch": 0.07, "learning_rate": 9.999201334640254e-07, "logits/chosen": -0.24465030431747437, "logits/rejected": -0.24365675449371338, "logps/chosen": -28.824169158935547, "logps/rejected": -14.35012435913086, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.03590049967169762, "rewards/margins": 0.008347323164343834, "rewards/rejected": 0.027553176507353783, "step": 438 }, { "epoch": 0.07, "learning_rate": 9.999177672409511e-07, "logits/chosen": -0.120562344789505, "logits/rejected": -0.09931214153766632, "logps/chosen": -10.701976776123047, "logps/rejected": -20.891427993774414, "loss": 0.7238, "rewards/accuracies": 0.0, "rewards/chosen": -0.009842204861342907, "rewards/margins": -0.023749638348817825, "rewards/rejected": 0.013907432556152344, "step": 439 }, { "epoch": 0.07, "learning_rate": 9.999153664782892e-07, "logits/chosen": -0.346163272857666, "logits/rejected": -0.346163272857666, "logps/chosen": -17.937744140625, "logps/rejected": -17.937744140625, "loss": 0.6447, "rewards/accuracies": 0.0, "rewards/chosen": 0.008923721499741077, "rewards/margins": 0.0, "rewards/rejected": 0.008923721499741077, "step": 440 }, { "epoch": 0.07, "learning_rate": 9.999129311762055e-07, "logits/chosen": -0.6739262342453003, "logits/rejected": -0.6807886958122253, "logps/chosen": -165.68698120117188, "logps/rejected": -125.73355102539062, "loss": 0.6945, "rewards/accuracies": 1.0, "rewards/chosen": 0.16613464057445526, "rewards/margins": 0.030513763427734375, "rewards/rejected": 0.1356208771467209, "step": 441 }, { "epoch": 0.07, "learning_rate": 9.999104613348689e-07, "logits/chosen": -0.7025949954986572, "logits/rejected": -0.5849623680114746, "logps/chosen": -158.46560668945312, "logps/rejected": -156.57322692871094, "loss": 0.7674, "rewards/accuracies": 0.0, "rewards/chosen": 0.00961914099752903, "rewards/margins": -0.20211029052734375, "rewards/rejected": 0.21172943711280823, "step": 442 }, { "epoch": 0.07, "learning_rate": 9.999079569544492e-07, "logits/chosen": -1.0445334911346436, "logits/rejected": -1.0779991149902344, "logps/chosen": -87.38723754882812, "logps/rejected": -37.6074333190918, "loss": 0.6005, "rewards/accuracies": 1.0, "rewards/chosen": 0.10769500583410263, "rewards/margins": 0.09924125671386719, "rewards/rejected": 0.008453750982880592, "step": 443 }, { "epoch": 0.07, "learning_rate": 9.999054180351202e-07, "logits/chosen": -0.4317642152309418, "logits/rejected": -0.39012575149536133, "logps/chosen": -56.70927047729492, "logps/rejected": -59.39767074584961, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.03200493007898331, "rewards/margins": -0.04039764404296875, "rewards/rejected": 0.07240257412195206, "step": 444 }, { "epoch": 0.07, "learning_rate": 9.999028445770568e-07, "logits/chosen": -0.21947075426578522, "logits/rejected": -0.22864097356796265, "logps/chosen": -31.421489715576172, "logps/rejected": -29.475297927856445, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.026130294427275658, "rewards/margins": 0.007534407079219818, "rewards/rejected": 0.01859588734805584, "step": 445 }, { "epoch": 0.07, "learning_rate": 9.99900236580437e-07, "logits/chosen": -0.1335817575454712, "logits/rejected": -0.1382875144481659, "logps/chosen": -5.732008934020996, "logps/rejected": -5.3857421875, "loss": 0.7254, "rewards/accuracies": 0.0, "rewards/chosen": 0.011242724023759365, "rewards/margins": -0.027208421379327774, "rewards/rejected": 0.038451146334409714, "step": 446 }, { "epoch": 0.07, "learning_rate": 9.998975940454409e-07, "logits/chosen": -0.3957219421863556, "logits/rejected": -0.3183833658695221, "logps/chosen": -88.499267578125, "logps/rejected": -72.41578674316406, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 0.05973052978515625, "rewards/margins": 0.04140777513384819, "rewards/rejected": 0.01832275465130806, "step": 447 }, { "epoch": 0.07, "learning_rate": 9.998949169722512e-07, "logits/chosen": -0.4980175495147705, "logits/rejected": -1.0097789764404297, "logps/chosen": -172.53765869140625, "logps/rejected": -37.5943489074707, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.18950043618679047, "rewards/margins": 0.18293572962284088, "rewards/rejected": 0.00656471261754632, "step": 448 }, { "epoch": 0.07, "learning_rate": 9.998922053610528e-07, "logits/chosen": 0.0013240156695246696, "logits/rejected": -0.002861183136701584, "logps/chosen": -145.77557373046875, "logps/rejected": -86.78624725341797, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 0.10516967624425888, "rewards/margins": -0.08381577581167221, "rewards/rejected": 0.1889854520559311, "step": 449 }, { "epoch": 0.07, "learning_rate": 9.998894592120328e-07, "logits/chosen": -0.7539251446723938, "logits/rejected": -0.771041989326477, "logps/chosen": -70.45652770996094, "logps/rejected": -71.65037536621094, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.10622558742761612, "rewards/margins": 0.0992637649178505, "rewards/rejected": 0.006961822509765625, "step": 450 }, { "epoch": 0.07, "learning_rate": 9.998866785253814e-07, "logits/chosen": -0.6634476780891418, "logits/rejected": -0.6251946687698364, "logps/chosen": -183.09271240234375, "logps/rejected": -218.57916259765625, "loss": 0.6878, "rewards/accuracies": 0.0, "rewards/chosen": 0.21978759765625, "rewards/margins": -0.11229249835014343, "rewards/rejected": 0.33208009600639343, "step": 451 }, { "epoch": 0.07, "learning_rate": 9.998838633012904e-07, "logits/chosen": 0.129043310880661, "logits/rejected": 0.09112590551376343, "logps/chosen": -97.99311828613281, "logps/rejected": -84.77686309814453, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 0.054631806910037994, "rewards/margins": -0.096257783472538, "rewards/rejected": 0.150889590382576, "step": 452 }, { "epoch": 0.07, "learning_rate": 9.998810135399545e-07, "logits/chosen": 0.07046102732419968, "logits/rejected": 0.07122933864593506, "logps/chosen": -3.641920566558838, "logps/rejected": -4.036410331726074, "loss": 0.636, "rewards/accuracies": 0.0, "rewards/chosen": 0.022990990430116653, "rewards/margins": -0.018304824829101562, "rewards/rejected": 0.041295815259218216, "step": 453 }, { "epoch": 0.07, "learning_rate": 9.998781292415703e-07, "logits/chosen": -1.0311038494110107, "logits/rejected": -1.0402452945709229, "logps/chosen": -76.15064239501953, "logps/rejected": -81.80973815917969, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.037784576416015625, "rewards/margins": -0.04769134521484375, "rewards/rejected": 0.08547592163085938, "step": 454 }, { "epoch": 0.07, "learning_rate": 9.998752104063374e-07, "logits/chosen": -0.3314584493637085, "logits/rejected": -0.3314584493637085, "logps/chosen": -67.98968505859375, "logps/rejected": -67.98968505859375, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.1221923828125, "rewards/margins": 0.0, "rewards/rejected": 0.1221923828125, "step": 455 }, { "epoch": 0.07, "learning_rate": 9.998722570344573e-07, "logits/chosen": -0.21538320183753967, "logits/rejected": -0.22356510162353516, "logps/chosen": -60.3460807800293, "logps/rejected": -81.10765075683594, "loss": 0.7139, "rewards/accuracies": 0.0, "rewards/chosen": 0.09549331665039062, "rewards/margins": -0.007760621607303619, "rewards/rejected": 0.10325393825769424, "step": 456 }, { "epoch": 0.07, "learning_rate": 9.998692691261342e-07, "logits/chosen": -0.8453508615493774, "logits/rejected": -0.832522451877594, "logps/chosen": -79.52375793457031, "logps/rejected": -103.17071533203125, "loss": 0.6365, "rewards/accuracies": 1.0, "rewards/chosen": 0.07931365817785263, "rewards/margins": 0.0017974823713302612, "rewards/rejected": 0.07751617580652237, "step": 457 }, { "epoch": 0.07, "learning_rate": 9.99866246681574e-07, "logits/chosen": -0.3683987259864807, "logits/rejected": -0.33789485692977905, "logps/chosen": -112.86846160888672, "logps/rejected": -255.062744140625, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.09901046752929688, "rewards/margins": -0.20074540376663208, "rewards/rejected": 0.29975587129592896, "step": 458 }, { "epoch": 0.07, "learning_rate": 9.998631897009864e-07, "logits/chosen": -0.4012013375759125, "logits/rejected": -0.3503439724445343, "logps/chosen": -132.29501342773438, "logps/rejected": -219.62667846679688, "loss": 0.9135, "rewards/accuracies": 0.0, "rewards/chosen": 0.26076507568359375, "rewards/margins": -0.30835723876953125, "rewards/rejected": 0.569122314453125, "step": 459 }, { "epoch": 0.07, "learning_rate": 9.998600981845819e-07, "logits/chosen": -0.4350295066833496, "logits/rejected": -0.37979552149772644, "logps/chosen": -92.86959838867188, "logps/rejected": -103.55986785888672, "loss": 0.7186, "rewards/accuracies": 0.0, "rewards/chosen": 0.11139144748449326, "rewards/margins": -0.04189225286245346, "rewards/rejected": 0.15328370034694672, "step": 460 }, { "epoch": 0.07, "learning_rate": 9.998569721325746e-07, "logits/chosen": -0.7639936804771423, "logits/rejected": -0.726574182510376, "logps/chosen": -161.9620361328125, "logps/rejected": -165.4063262939453, "loss": 0.6888, "rewards/accuracies": 0.0, "rewards/chosen": 0.23405762016773224, "rewards/margins": -0.016532912850379944, "rewards/rejected": 0.2505905330181122, "step": 461 }, { "epoch": 0.07, "learning_rate": 9.998538115451798e-07, "logits/chosen": -0.05299515649676323, "logits/rejected": -0.05299515649676323, "logps/chosen": -3.9482316970825195, "logps/rejected": -3.9482316970825195, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 0.013947772793471813, "rewards/margins": 0.0, "rewards/rejected": 0.013947772793471813, "step": 462 }, { "epoch": 0.08, "learning_rate": 9.998506164226167e-07, "logits/chosen": -0.1762915700674057, "logits/rejected": -0.16825172305107117, "logps/chosen": -173.93826293945312, "logps/rejected": -131.7659912109375, "loss": 0.7915, "rewards/accuracies": 0.0, "rewards/chosen": 0.264974981546402, "rewards/margins": -0.14456480741500854, "rewards/rejected": 0.4095397889614105, "step": 463 }, { "epoch": 0.08, "learning_rate": 9.998473867651053e-07, "logits/chosen": -0.6382539868354797, "logits/rejected": -0.6997461318969727, "logps/chosen": -154.78753662109375, "logps/rejected": -180.42503356933594, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.12565003335475922, "rewards/margins": 0.08675232529640198, "rewards/rejected": 0.03889770433306694, "step": 464 }, { "epoch": 0.08, "learning_rate": 9.99844122572869e-07, "logits/chosen": -0.5150778293609619, "logits/rejected": -0.5009642243385315, "logps/chosen": -69.11157989501953, "logps/rejected": -55.002201080322266, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.1292167752981186, "rewards/margins": 0.027732856571674347, "rewards/rejected": 0.10148391872644424, "step": 465 }, { "epoch": 0.08, "learning_rate": 9.998408238461337e-07, "logits/chosen": -0.5436778664588928, "logits/rejected": -0.5197164416313171, "logps/chosen": -115.26678466796875, "logps/rejected": -127.90682983398438, "loss": 0.6455, "rewards/accuracies": 1.0, "rewards/chosen": 0.24856872856616974, "rewards/margins": 0.089324951171875, "rewards/rejected": 0.15924377739429474, "step": 466 }, { "epoch": 0.08, "learning_rate": 9.99837490585127e-07, "logits/chosen": -0.6514274477958679, "logits/rejected": -0.5867668390274048, "logps/chosen": -96.27217102050781, "logps/rejected": -24.168075561523438, "loss": 0.6824, "rewards/accuracies": 0.0, "rewards/chosen": -0.033409882336854935, "rewards/margins": -0.012895012274384499, "rewards/rejected": -0.020514870062470436, "step": 467 }, { "epoch": 0.08, "learning_rate": 9.998341227900791e-07, "logits/chosen": -0.35868391394615173, "logits/rejected": -0.3777523338794708, "logps/chosen": -10.733156204223633, "logps/rejected": -32.26748275756836, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.04397296905517578, "rewards/margins": 0.006586264818906784, "rewards/rejected": 0.037386704236269, "step": 468 }, { "epoch": 0.08, "learning_rate": 9.998307204612228e-07, "logits/chosen": -0.49731579422950745, "logits/rejected": -0.4536868631839752, "logps/chosen": -65.29032897949219, "logps/rejected": -84.59844970703125, "loss": 0.6887, "rewards/accuracies": 0.0, "rewards/chosen": 0.12985916435718536, "rewards/margins": -0.052051544189453125, "rewards/rejected": 0.1819107085466385, "step": 469 }, { "epoch": 0.08, "learning_rate": 9.998272835987931e-07, "logits/chosen": -0.23483526706695557, "logits/rejected": -0.27970564365386963, "logps/chosen": -139.54135131835938, "logps/rejected": -103.22409057617188, "loss": 0.6508, "rewards/accuracies": 1.0, "rewards/chosen": 0.10497283935546875, "rewards/margins": 0.11379776149988174, "rewards/rejected": -0.00882492121309042, "step": 470 }, { "epoch": 0.08, "learning_rate": 9.998238122030276e-07, "logits/chosen": -0.22806410491466522, "logits/rejected": -0.2553757131099701, "logps/chosen": -186.08160400390625, "logps/rejected": -169.29763793945312, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.151092529296875, "rewards/margins": -0.16774597764015198, "rewards/rejected": 0.318838506937027, "step": 471 }, { "epoch": 0.08, "learning_rate": 9.998203062741658e-07, "logits/chosen": -0.6710246801376343, "logits/rejected": -0.6076139807701111, "logps/chosen": -74.16877746582031, "logps/rejected": -75.15518188476562, "loss": 0.6781, "rewards/accuracies": 0.0, "rewards/chosen": 0.10639648884534836, "rewards/margins": -0.07148437947034836, "rewards/rejected": 0.17788086831569672, "step": 472 }, { "epoch": 0.08, "learning_rate": 9.998167658124506e-07, "logits/chosen": -0.3847707509994507, "logits/rejected": -0.3751414716243744, "logps/chosen": -117.02056884765625, "logps/rejected": -71.45468139648438, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.23629073798656464, "rewards/margins": 0.052333831787109375, "rewards/rejected": 0.18395690619945526, "step": 473 }, { "epoch": 0.08, "learning_rate": 9.99813190818126e-07, "logits/chosen": -0.39513012766838074, "logits/rejected": -0.36030659079551697, "logps/chosen": -87.70088958740234, "logps/rejected": -140.44049072265625, "loss": 0.7259, "rewards/accuracies": 0.0, "rewards/chosen": 0.057486724108457565, "rewards/margins": -0.25626450777053833, "rewards/rejected": 0.313751220703125, "step": 474 }, { "epoch": 0.08, "learning_rate": 9.998095812914391e-07, "logits/chosen": -0.6880674362182617, "logits/rejected": -0.39826181530952454, "logps/chosen": -163.62388610839844, "logps/rejected": -87.1515884399414, "loss": 0.5625, "rewards/accuracies": 1.0, "rewards/chosen": 0.354318231344223, "rewards/margins": 0.1993873566389084, "rewards/rejected": 0.15493087470531464, "step": 475 }, { "epoch": 0.08, "learning_rate": 9.998059372326395e-07, "logits/chosen": -0.6105181574821472, "logits/rejected": -0.5988742709159851, "logps/chosen": -66.38056945800781, "logps/rejected": -110.199951171875, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.24013137817382812, "rewards/margins": 0.049881741404533386, "rewards/rejected": 0.19024963676929474, "step": 476 }, { "epoch": 0.08, "learning_rate": 9.998022586419788e-07, "logits/chosen": -0.09219197183847427, "logits/rejected": -0.09219197183847427, "logps/chosen": -59.631717681884766, "logps/rejected": -59.631717681884766, "loss": 0.6819, "rewards/accuracies": 0.0, "rewards/chosen": -0.03457794338464737, "rewards/margins": 0.0, "rewards/rejected": -0.03457794338464737, "step": 477 }, { "epoch": 0.08, "learning_rate": 9.997985455197113e-07, "logits/chosen": 0.017610058188438416, "logits/rejected": -0.009611408226191998, "logps/chosen": -56.35251998901367, "logps/rejected": -26.612506866455078, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.13082771003246307, "rewards/margins": 0.03897552192211151, "rewards/rejected": 0.09185218811035156, "step": 478 }, { "epoch": 0.08, "learning_rate": 9.997947978660933e-07, "logits/chosen": -0.6690641641616821, "logits/rejected": -0.6375259160995483, "logps/chosen": -148.0516815185547, "logps/rejected": -189.61175537109375, "loss": 0.7782, "rewards/accuracies": 0.0, "rewards/chosen": 0.03292083740234375, "rewards/margins": -0.0587615966796875, "rewards/rejected": 0.09168243408203125, "step": 479 }, { "epoch": 0.08, "learning_rate": 9.997910156813839e-07, "logits/chosen": -0.14329113066196442, "logits/rejected": -0.17370419204235077, "logps/chosen": -93.15702819824219, "logps/rejected": -99.27584838867188, "loss": 0.7589, "rewards/accuracies": 0.0, "rewards/chosen": 0.05964508280158043, "rewards/margins": -0.16746748983860016, "rewards/rejected": 0.2271125763654709, "step": 480 }, { "epoch": 0.08, "learning_rate": 9.997871989658444e-07, "logits/chosen": -0.6691842675209045, "logits/rejected": -0.6575924754142761, "logps/chosen": -88.8980712890625, "logps/rejected": -65.38198852539062, "loss": 0.6894, "rewards/accuracies": 0.0, "rewards/chosen": 0.14057083427906036, "rewards/margins": -0.150828555226326, "rewards/rejected": 0.29139938950538635, "step": 481 }, { "epoch": 0.08, "learning_rate": 9.997833477197385e-07, "logits/chosen": -0.6192788481712341, "logits/rejected": -0.632963240146637, "logps/chosen": -127.042236328125, "logps/rejected": -43.74454116821289, "loss": 0.7111, "rewards/accuracies": 1.0, "rewards/chosen": 0.10296630859375, "rewards/margins": 0.0863163024187088, "rewards/rejected": 0.016650009900331497, "step": 482 }, { "epoch": 0.08, "learning_rate": 9.997794619433322e-07, "logits/chosen": -0.6787851452827454, "logits/rejected": -0.314082533121109, "logps/chosen": -161.38809204101562, "logps/rejected": -117.1109390258789, "loss": 0.6535, "rewards/accuracies": 0.0, "rewards/chosen": -0.00871887244284153, "rewards/margins": -0.22730790078639984, "rewards/rejected": 0.21858902275562286, "step": 483 }, { "epoch": 0.08, "learning_rate": 9.99775541636894e-07, "logits/chosen": -0.33714559674263, "logits/rejected": -0.3231692314147949, "logps/chosen": -102.8837890625, "logps/rejected": -102.1104507446289, "loss": 0.6079, "rewards/accuracies": 1.0, "rewards/chosen": 0.15528564155101776, "rewards/margins": 0.04025420546531677, "rewards/rejected": 0.11503143608570099, "step": 484 }, { "epoch": 0.08, "learning_rate": 9.997715868006952e-07, "logits/chosen": -0.48443707823753357, "logits/rejected": -0.506281852722168, "logps/chosen": -181.04293823242188, "logps/rejected": -145.175048828125, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.48141786456108093, "rewards/margins": 0.07371065020561218, "rewards/rejected": 0.40770721435546875, "step": 485 }, { "epoch": 0.08, "learning_rate": 9.997675974350082e-07, "logits/chosen": -0.08361175656318665, "logits/rejected": -0.431325763463974, "logps/chosen": -126.93327331542969, "logps/rejected": -96.41871643066406, "loss": 0.7363, "rewards/accuracies": 0.0, "rewards/chosen": 0.1880142241716385, "rewards/margins": -0.10126723349094391, "rewards/rejected": 0.2892814576625824, "step": 486 }, { "epoch": 0.08, "learning_rate": 9.997635735401091e-07, "logits/chosen": -0.31374090909957886, "logits/rejected": -0.2814875543117523, "logps/chosen": -110.09774780273438, "logps/rejected": -59.285255432128906, "loss": 0.715, "rewards/accuracies": 0.0, "rewards/chosen": 0.05995941162109375, "rewards/margins": -0.03531227260828018, "rewards/rejected": 0.09527168422937393, "step": 487 }, { "epoch": 0.08, "learning_rate": 9.99759515116276e-07, "logits/chosen": -0.45418283343315125, "logits/rejected": -0.4458563029766083, "logps/chosen": -93.31747436523438, "logps/rejected": -133.28961181640625, "loss": 0.7445, "rewards/accuracies": 0.0, "rewards/chosen": 0.14829102158546448, "rewards/margins": -0.19588318467140198, "rewards/rejected": 0.34417420625686646, "step": 488 }, { "epoch": 0.08, "learning_rate": 9.99755422163789e-07, "logits/chosen": -0.6599708199501038, "logits/rejected": -0.6352304220199585, "logps/chosen": -100.06990051269531, "logps/rejected": -147.0265350341797, "loss": 0.7272, "rewards/accuracies": 0.0, "rewards/chosen": 0.006382751744240522, "rewards/margins": -0.2705139219760895, "rewards/rejected": 0.2768966853618622, "step": 489 }, { "epoch": 0.08, "learning_rate": 9.997512946829312e-07, "logits/chosen": -0.17181387543678284, "logits/rejected": -0.17181387543678284, "logps/chosen": -28.886140823364258, "logps/rejected": -28.886140823364258, "loss": 0.6878, "rewards/accuracies": 0.0, "rewards/chosen": 0.02827167510986328, "rewards/margins": 0.0, "rewards/rejected": 0.02827167510986328, "step": 490 }, { "epoch": 0.08, "learning_rate": 9.997471326739877e-07, "logits/chosen": -0.01657232828438282, "logits/rejected": -0.01934371143579483, "logps/chosen": -5.40560245513916, "logps/rejected": -2.5555505752563477, "loss": 0.6912, "rewards/accuracies": 0.0, "rewards/chosen": 0.03038344345986843, "rewards/margins": -0.0019071120768785477, "rewards/rejected": 0.03229055553674698, "step": 491 }, { "epoch": 0.08, "learning_rate": 9.997429361372458e-07, "logits/chosen": -0.518852174282074, "logits/rejected": -0.5629228949546814, "logps/chosen": -82.83326721191406, "logps/rejected": -110.20355224609375, "loss": 0.7873, "rewards/accuracies": 0.0, "rewards/chosen": 0.10366668552160263, "rewards/margins": -0.33345261216163635, "rewards/rejected": 0.4371193051338196, "step": 492 }, { "epoch": 0.08, "learning_rate": 9.997387050729956e-07, "logits/chosen": -0.5352405309677124, "logits/rejected": -0.57368004322052, "logps/chosen": -124.03460693359375, "logps/rejected": -80.51252746582031, "loss": 0.6845, "rewards/accuracies": 0.0, "rewards/chosen": 0.2750656306743622, "rewards/margins": -0.10302427411079407, "rewards/rejected": 0.37808990478515625, "step": 493 }, { "epoch": 0.08, "learning_rate": 9.997344394815298e-07, "logits/chosen": -0.40793824195861816, "logits/rejected": -0.3671092092990875, "logps/chosen": -98.95084381103516, "logps/rejected": -151.63230895996094, "loss": 0.6841, "rewards/accuracies": 0.0, "rewards/chosen": -0.03844146803021431, "rewards/margins": -0.20050659775733948, "rewards/rejected": 0.16206513345241547, "step": 494 }, { "epoch": 0.08, "learning_rate": 9.997301393631426e-07, "logits/chosen": -0.3158003091812134, "logits/rejected": -0.31499311327934265, "logps/chosen": -65.93904876708984, "logps/rejected": -83.25444030761719, "loss": 0.7222, "rewards/accuracies": 0.0, "rewards/chosen": 0.09571915119886398, "rewards/margins": -0.08803253620862961, "rewards/rejected": 0.1837516874074936, "step": 495 }, { "epoch": 0.08, "learning_rate": 9.997258047181312e-07, "logits/chosen": -0.4005250036716461, "logits/rejected": -0.3620366156101227, "logps/chosen": -95.36074829101562, "logps/rejected": -122.85305786132812, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 0.349334716796875, "rewards/margins": 0.14520569145679474, "rewards/rejected": 0.20412902534008026, "step": 496 }, { "epoch": 0.08, "learning_rate": 9.997214355467952e-07, "logits/chosen": -0.6988350749015808, "logits/rejected": -0.7624680399894714, "logps/chosen": -246.07752990722656, "logps/rejected": -80.31900024414062, "loss": 0.6961, "rewards/accuracies": 1.0, "rewards/chosen": 0.4248703122138977, "rewards/margins": 0.1660354733467102, "rewards/rejected": 0.2588348388671875, "step": 497 }, { "epoch": 0.08, "learning_rate": 9.997170318494362e-07, "logits/chosen": -0.38667139410972595, "logits/rejected": -0.3799760639667511, "logps/chosen": -75.36064147949219, "logps/rejected": -75.90936279296875, "loss": 0.6512, "rewards/accuracies": 0.0, "rewards/chosen": 0.06419296562671661, "rewards/margins": -0.08580322563648224, "rewards/rejected": 0.14999619126319885, "step": 498 }, { "epoch": 0.08, "learning_rate": 9.997125936263588e-07, "logits/chosen": -0.17970699071884155, "logits/rejected": -0.16381745040416718, "logps/chosen": -52.7287483215332, "logps/rejected": -77.03904724121094, "loss": 0.798, "rewards/accuracies": 0.0, "rewards/chosen": 0.09171829372644424, "rewards/margins": -0.16650390625, "rewards/rejected": 0.25822219252586365, "step": 499 }, { "epoch": 0.08, "learning_rate": 9.997081208778696e-07, "logits/chosen": -0.032031215727329254, "logits/rejected": -0.032031215727329254, "logps/chosen": -12.573806762695312, "logps/rejected": -12.573806762695312, "loss": 0.6822, "rewards/accuracies": 0.0, "rewards/chosen": 0.08714809268712997, "rewards/margins": 0.0, "rewards/rejected": 0.08714809268712997, "step": 500 }, { "epoch": 0.08, "learning_rate": 9.997036136042773e-07, "logits/chosen": -0.5442038178443909, "logits/rejected": -0.5328113436698914, "logps/chosen": -70.77381134033203, "logps/rejected": -74.23667907714844, "loss": 0.5827, "rewards/accuracies": 1.0, "rewards/chosen": 0.19939498603343964, "rewards/margins": 0.06350097060203552, "rewards/rejected": 0.1358940154314041, "step": 501 }, { "epoch": 0.08, "learning_rate": 9.996990718058937e-07, "logits/chosen": -0.549376368522644, "logits/rejected": -0.5370655655860901, "logps/chosen": -71.49890899658203, "logps/rejected": -90.18115997314453, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 0.2925712764263153, "rewards/margins": 0.19054871797561646, "rewards/rejected": 0.10202255100011826, "step": 502 }, { "epoch": 0.08, "learning_rate": 9.996944954830324e-07, "logits/chosen": -0.3732921779155731, "logits/rejected": -1.1505093574523926, "logps/chosen": -112.56815338134766, "logps/rejected": -67.02336120605469, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.012933350168168545, "rewards/margins": -0.05670776218175888, "rewards/rejected": 0.06964111328125, "step": 503 }, { "epoch": 0.08, "learning_rate": 9.996898846360098e-07, "logits/chosen": -0.5335409045219421, "logits/rejected": -0.47167080640792847, "logps/chosen": -159.0290069580078, "logps/rejected": -214.3314666748047, "loss": 0.7857, "rewards/accuracies": 0.0, "rewards/chosen": 0.29754334688186646, "rewards/margins": -0.2425125241279602, "rewards/rejected": 0.5400558710098267, "step": 504 }, { "epoch": 0.08, "learning_rate": 9.99685239265144e-07, "logits/chosen": -0.9269115924835205, "logits/rejected": -0.8023152351379395, "logps/chosen": -207.4789581298828, "logps/rejected": -140.0423583984375, "loss": 0.6695, "rewards/accuracies": 0.0, "rewards/chosen": 0.3299560546875, "rewards/margins": -0.029231280088424683, "rewards/rejected": 0.3591873347759247, "step": 505 }, { "epoch": 0.08, "learning_rate": 9.996805593707565e-07, "logits/chosen": -0.13791508972644806, "logits/rejected": -0.13650783896446228, "logps/chosen": -56.311256408691406, "logps/rejected": -24.351619720458984, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.12283172458410263, "rewards/margins": 0.004641912877559662, "rewards/rejected": 0.11818981170654297, "step": 506 }, { "epoch": 0.08, "learning_rate": 9.996758449531702e-07, "logits/chosen": -0.5143824219703674, "logits/rejected": -0.5159533023834229, "logps/chosen": -66.92019653320312, "logps/rejected": -95.26266479492188, "loss": 0.6827, "rewards/accuracies": 0.0, "rewards/chosen": 0.19695435464382172, "rewards/margins": -0.02585066854953766, "rewards/rejected": 0.22280502319335938, "step": 507 }, { "epoch": 0.08, "learning_rate": 9.99671096012711e-07, "logits/chosen": -0.6793432235717773, "logits/rejected": -0.6272879838943481, "logps/chosen": -114.13043212890625, "logps/rejected": -142.4876708984375, "loss": 0.7302, "rewards/accuracies": 1.0, "rewards/chosen": 0.3582557737827301, "rewards/margins": 0.09701156616210938, "rewards/rejected": 0.2612442076206207, "step": 508 }, { "epoch": 0.08, "learning_rate": 9.99666312549707e-07, "logits/chosen": -0.6319185495376587, "logits/rejected": -0.6086170077323914, "logps/chosen": -95.35896301269531, "logps/rejected": -36.358062744140625, "loss": 0.8231, "rewards/accuracies": 0.0, "rewards/chosen": 0.005489349365234375, "rewards/margins": -0.007129669189453125, "rewards/rejected": 0.0126190185546875, "step": 509 }, { "epoch": 0.08, "learning_rate": 9.996614945644886e-07, "logits/chosen": -0.4431668221950531, "logits/rejected": -0.4248926043510437, "logps/chosen": -177.51214599609375, "logps/rejected": -232.20623779296875, "loss": 0.7695, "rewards/accuracies": 0.0, "rewards/chosen": 0.485769659280777, "rewards/margins": -0.26956483721733093, "rewards/rejected": 0.7553344964981079, "step": 510 }, { "epoch": 0.08, "learning_rate": 9.996566420573888e-07, "logits/chosen": -0.048057861626148224, "logits/rejected": -0.053171854466199875, "logps/chosen": -66.78604125976562, "logps/rejected": -91.02644348144531, "loss": 0.6992, "rewards/accuracies": 1.0, "rewards/chosen": 0.11626892536878586, "rewards/margins": 0.06712722778320312, "rewards/rejected": 0.049141693860292435, "step": 511 }, { "epoch": 0.08, "learning_rate": 9.996517550287432e-07, "logits/chosen": -0.3670233190059662, "logits/rejected": -0.3670233190059662, "logps/chosen": -35.398101806640625, "logps/rejected": -35.398101806640625, "loss": 0.781, "rewards/accuracies": 0.0, "rewards/chosen": 0.07534027099609375, "rewards/margins": 0.0, "rewards/rejected": 0.07534027099609375, "step": 512 }, { "epoch": 0.08, "learning_rate": 9.996468334788886e-07, "logits/chosen": -0.5161520838737488, "logits/rejected": -0.4185478389263153, "logps/chosen": -118.42587280273438, "logps/rejected": -76.86222839355469, "loss": 0.5653, "rewards/accuracies": 1.0, "rewards/chosen": 0.5220398306846619, "rewards/margins": 0.3259521722793579, "rewards/rejected": 0.19608764350414276, "step": 513 }, { "epoch": 0.08, "learning_rate": 9.996418774081656e-07, "logits/chosen": -0.19179171323776245, "logits/rejected": -0.24708659946918488, "logps/chosen": -245.03872680664062, "logps/rejected": -105.9139175415039, "loss": 0.8248, "rewards/accuracies": 0.0, "rewards/chosen": 0.15469665825366974, "rewards/margins": -0.19620589911937714, "rewards/rejected": 0.3509025573730469, "step": 514 }, { "epoch": 0.08, "learning_rate": 9.996368868169168e-07, "logits/chosen": -0.9768289923667908, "logits/rejected": -0.9310335516929626, "logps/chosen": -122.79861450195312, "logps/rejected": -72.06340789794922, "loss": 0.8012, "rewards/accuracies": 0.0, "rewards/chosen": -0.0022323608864098787, "rewards/margins": -0.1057228073477745, "rewards/rejected": 0.103490449488163, "step": 515 }, { "epoch": 0.08, "learning_rate": 9.996318617054863e-07, "logits/chosen": -0.4266386926174164, "logits/rejected": -0.3818626403808594, "logps/chosen": -87.4832763671875, "logps/rejected": -60.69643020629883, "loss": 0.759, "rewards/accuracies": 1.0, "rewards/chosen": 0.2811935544013977, "rewards/margins": 0.12188568711280823, "rewards/rejected": 0.15930786728858948, "step": 516 }, { "epoch": 0.08, "learning_rate": 9.99626802074222e-07, "logits/chosen": -0.4528619647026062, "logits/rejected": -0.4654445946216583, "logps/chosen": -43.12525939941406, "logps/rejected": -71.75186157226562, "loss": 0.6244, "rewards/accuracies": 1.0, "rewards/chosen": 0.2628238797187805, "rewards/margins": 0.12860070168972015, "rewards/rejected": 0.13422317802906036, "step": 517 }, { "epoch": 0.08, "learning_rate": 9.996217079234732e-07, "logits/chosen": -0.23125076293945312, "logits/rejected": -0.2266612946987152, "logps/chosen": -55.94890594482422, "logps/rejected": -63.99674987792969, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": 0.23330001533031464, "rewards/margins": 0.13387298583984375, "rewards/rejected": 0.09942703694105148, "step": 518 }, { "epoch": 0.08, "learning_rate": 9.996165792535917e-07, "logits/chosen": -0.16698379814624786, "logits/rejected": -0.1534995287656784, "logps/chosen": -80.31352233886719, "logps/rejected": -179.9515380859375, "loss": 0.7227, "rewards/accuracies": 1.0, "rewards/chosen": 0.3192764222621918, "rewards/margins": 0.22496795654296875, "rewards/rejected": 0.09430847316980362, "step": 519 }, { "epoch": 0.08, "learning_rate": 9.996114160649323e-07, "logits/chosen": -0.6363029479980469, "logits/rejected": -0.5538884401321411, "logps/chosen": -138.24380493164062, "logps/rejected": -181.26763916015625, "loss": 0.9659, "rewards/accuracies": 0.0, "rewards/chosen": 0.09957275539636612, "rewards/margins": -0.45325472950935364, "rewards/rejected": 0.5528274774551392, "step": 520 }, { "epoch": 0.08, "learning_rate": 9.996062183578511e-07, "logits/chosen": -0.6961076259613037, "logits/rejected": -0.6480351686477661, "logps/chosen": -140.69186401367188, "logps/rejected": -144.61900329589844, "loss": 0.8008, "rewards/accuracies": 1.0, "rewards/chosen": 0.4813064634799957, "rewards/margins": 0.032139599323272705, "rewards/rejected": 0.449166864156723, "step": 521 }, { "epoch": 0.08, "learning_rate": 9.996009861327075e-07, "logits/chosen": -0.15412919223308563, "logits/rejected": -0.14591887593269348, "logps/chosen": -75.79817962646484, "logps/rejected": -159.25668334960938, "loss": 0.7221, "rewards/accuracies": 1.0, "rewards/chosen": 0.25412750244140625, "rewards/margins": 0.20518799126148224, "rewards/rejected": 0.04893951490521431, "step": 522 }, { "epoch": 0.08, "learning_rate": 9.995957193898632e-07, "logits/chosen": -0.34809496998786926, "logits/rejected": -0.34998831152915955, "logps/chosen": -163.33815002441406, "logps/rejected": -179.76455688476562, "loss": 0.5504, "rewards/accuracies": 1.0, "rewards/chosen": 0.6369583010673523, "rewards/margins": 0.26061245799064636, "rewards/rejected": 0.37634584307670593, "step": 523 }, { "epoch": 0.09, "learning_rate": 9.99590418129682e-07, "logits/chosen": -0.15949222445487976, "logits/rejected": -0.15860646963119507, "logps/chosen": -6.360712051391602, "logps/rejected": -4.930509567260742, "loss": 0.8745, "rewards/accuracies": 1.0, "rewards/chosen": 0.04776415973901749, "rewards/margins": 0.013815261423587799, "rewards/rejected": 0.03394889831542969, "step": 524 }, { "epoch": 0.09, "learning_rate": 9.995850823525298e-07, "logits/chosen": -0.3410502076148987, "logits/rejected": -0.2390539050102234, "logps/chosen": -184.8939208984375, "logps/rejected": -60.194236755371094, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 0.5507522821426392, "rewards/margins": 0.2954048216342926, "rewards/rejected": 0.25534746050834656, "step": 525 }, { "epoch": 0.09, "learning_rate": 9.995797120587756e-07, "logits/chosen": -0.4848942458629608, "logits/rejected": -0.4905734062194824, "logps/chosen": -254.37283325195312, "logps/rejected": -25.483638763427734, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 0.22578124701976776, "rewards/margins": 0.14846095442771912, "rewards/rejected": 0.07732029259204865, "step": 526 }, { "epoch": 0.09, "learning_rate": 9.995743072487905e-07, "logits/chosen": -0.4489092230796814, "logits/rejected": -0.47572311758995056, "logps/chosen": -190.9016571044922, "logps/rejected": -40.35283660888672, "loss": 0.514, "rewards/accuracies": 1.0, "rewards/chosen": 0.5572418570518494, "rewards/margins": 0.4826507866382599, "rewards/rejected": 0.07459106296300888, "step": 527 }, { "epoch": 0.09, "learning_rate": 9.995688679229476e-07, "logits/chosen": -0.6455541849136353, "logits/rejected": -0.6003097295761108, "logps/chosen": -110.6303482055664, "logps/rejected": -124.17668914794922, "loss": 0.7614, "rewards/accuracies": 0.0, "rewards/chosen": 0.15223312377929688, "rewards/margins": -0.3051498532295227, "rewards/rejected": 0.4573829770088196, "step": 528 }, { "epoch": 0.09, "learning_rate": 9.995633940816232e-07, "logits/chosen": -0.5639429688453674, "logits/rejected": -0.5190355777740479, "logps/chosen": -184.72186279296875, "logps/rejected": -188.11842346191406, "loss": 0.7468, "rewards/accuracies": 0.0, "rewards/chosen": 0.06091613695025444, "rewards/margins": -0.5627334713935852, "rewards/rejected": 0.6236495971679688, "step": 529 }, { "epoch": 0.09, "learning_rate": 9.99557885725195e-07, "logits/chosen": -0.6117937564849854, "logits/rejected": -0.604965329170227, "logps/chosen": -132.26351928710938, "logps/rejected": -118.3027114868164, "loss": 0.6802, "rewards/accuracies": 0.0, "rewards/chosen": -0.03495330736041069, "rewards/margins": -0.09324951469898224, "rewards/rejected": 0.05829620361328125, "step": 530 }, { "epoch": 0.09, "learning_rate": 9.995523428540437e-07, "logits/chosen": -0.0734168216586113, "logits/rejected": -0.0734168216586113, "logps/chosen": -54.6221923828125, "logps/rejected": -54.6221923828125, "loss": 0.642, "rewards/accuracies": 0.0, "rewards/chosen": 0.2748832702636719, "rewards/margins": 0.0, "rewards/rejected": 0.2748832702636719, "step": 531 }, { "epoch": 0.09, "learning_rate": 9.995467654685524e-07, "logits/chosen": -0.4245575964450836, "logits/rejected": -0.40795421600341797, "logps/chosen": -74.89496612548828, "logps/rejected": -97.69242858886719, "loss": 0.6119, "rewards/accuracies": 1.0, "rewards/chosen": 0.4140067994594574, "rewards/margins": 0.09106063842773438, "rewards/rejected": 0.322946161031723, "step": 532 }, { "epoch": 0.09, "learning_rate": 9.995411535691062e-07, "logits/chosen": -0.46086516976356506, "logits/rejected": -0.46086516976356506, "logps/chosen": -44.117469787597656, "logps/rejected": -44.117469787597656, "loss": 0.6417, "rewards/accuracies": 0.0, "rewards/chosen": 0.1743263304233551, "rewards/margins": 0.0, "rewards/rejected": 0.1743263304233551, "step": 533 }, { "epoch": 0.09, "learning_rate": 9.995355071560932e-07, "logits/chosen": -0.22826315462589264, "logits/rejected": -0.2547145485877991, "logps/chosen": -131.0828399658203, "logps/rejected": -145.16000366210938, "loss": 0.9313, "rewards/accuracies": 0.0, "rewards/chosen": 0.214070126414299, "rewards/margins": -0.32852476835250854, "rewards/rejected": 0.5425949096679688, "step": 534 }, { "epoch": 0.09, "learning_rate": 9.995298262299033e-07, "logits/chosen": -0.24070657789707184, "logits/rejected": -0.23016346991062164, "logps/chosen": -64.5634536743164, "logps/rejected": -69.85154724121094, "loss": 0.7484, "rewards/accuracies": 0.0, "rewards/chosen": 0.08845444023609161, "rewards/margins": -0.0506339967250824, "rewards/rejected": 0.139088436961174, "step": 535 }, { "epoch": 0.09, "learning_rate": 9.99524110790929e-07, "logits/chosen": -0.31264573335647583, "logits/rejected": -0.3190898299217224, "logps/chosen": -66.94940185546875, "logps/rejected": -55.955650329589844, "loss": 0.5816, "rewards/accuracies": 1.0, "rewards/chosen": 0.3592590391635895, "rewards/margins": 0.0866645872592926, "rewards/rejected": 0.2725944519042969, "step": 536 }, { "epoch": 0.09, "learning_rate": 9.99518360839565e-07, "logits/chosen": -0.6500090956687927, "logits/rejected": -0.6547026634216309, "logps/chosen": -68.3613510131836, "logps/rejected": -86.74486541748047, "loss": 0.5823, "rewards/accuracies": 1.0, "rewards/chosen": 0.22784729301929474, "rewards/margins": 0.039249420166015625, "rewards/rejected": 0.1885978728532791, "step": 537 }, { "epoch": 0.09, "learning_rate": 9.995125763762088e-07, "logits/chosen": -0.4452447295188904, "logits/rejected": -0.45880722999572754, "logps/chosen": -112.88807678222656, "logps/rejected": -103.89400482177734, "loss": 0.7536, "rewards/accuracies": 0.0, "rewards/chosen": 0.052312470972537994, "rewards/margins": -0.30946046113967896, "rewards/rejected": 0.36177292466163635, "step": 538 }, { "epoch": 0.09, "learning_rate": 9.9950675740126e-07, "logits/chosen": -0.4244796335697174, "logits/rejected": -0.43569475412368774, "logps/chosen": -91.65670776367188, "logps/rejected": -39.47470474243164, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.26535797119140625, "rewards/margins": -0.06117859482765198, "rewards/rejected": 0.3265365660190582, "step": 539 }, { "epoch": 0.09, "learning_rate": 9.995009039151208e-07, "logits/chosen": -0.600155770778656, "logits/rejected": -0.5449559688568115, "logps/chosen": -152.4233856201172, "logps/rejected": -107.86453247070312, "loss": 0.6963, "rewards/accuracies": 1.0, "rewards/chosen": 0.6655640006065369, "rewards/margins": 0.16958469152450562, "rewards/rejected": 0.49597930908203125, "step": 540 }, { "epoch": 0.09, "learning_rate": 9.994950159181953e-07, "logits/chosen": -0.2002328485250473, "logits/rejected": -0.19155707955360413, "logps/chosen": -88.74955749511719, "logps/rejected": -71.55838012695312, "loss": 0.7812, "rewards/accuracies": 0.0, "rewards/chosen": 0.22565995156764984, "rewards/margins": -0.11481092870235443, "rewards/rejected": 0.3404708802700043, "step": 541 }, { "epoch": 0.09, "learning_rate": 9.994890934108907e-07, "logits/chosen": -0.1224045529961586, "logits/rejected": -0.0743660256266594, "logps/chosen": -62.34539031982422, "logps/rejected": -47.89684295654297, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.4280845820903778, "rewards/margins": 0.17394715547561646, "rewards/rejected": 0.25413742661476135, "step": 542 }, { "epoch": 0.09, "learning_rate": 9.994831363936155e-07, "logits/chosen": -0.1239490807056427, "logits/rejected": -0.11070673912763596, "logps/chosen": -32.05152130126953, "logps/rejected": -74.71443176269531, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.09046363830566406, "rewards/margins": -0.13264428079128265, "rewards/rejected": 0.22310791909694672, "step": 543 }, { "epoch": 0.09, "learning_rate": 9.994771448667822e-07, "logits/chosen": -0.5027511119842529, "logits/rejected": -0.42837971448898315, "logps/chosen": -84.11154174804688, "logps/rejected": -80.69224548339844, "loss": 0.5378, "rewards/accuracies": 1.0, "rewards/chosen": 0.5446991324424744, "rewards/margins": 0.2593033015727997, "rewards/rejected": 0.2853958308696747, "step": 544 }, { "epoch": 0.09, "learning_rate": 9.99471118830804e-07, "logits/chosen": -0.12094072252511978, "logits/rejected": -0.1420312076807022, "logps/chosen": -98.52981567382812, "logps/rejected": -58.46683120727539, "loss": 0.7741, "rewards/accuracies": 0.0, "rewards/chosen": -0.017656708136200905, "rewards/margins": -0.456869512796402, "rewards/rejected": 0.4392127990722656, "step": 545 }, { "epoch": 0.09, "learning_rate": 9.994650582860977e-07, "logits/chosen": -0.19163693487644196, "logits/rejected": -0.18436376750469208, "logps/chosen": -67.34837341308594, "logps/rejected": -51.81394958496094, "loss": 0.5981, "rewards/accuracies": 1.0, "rewards/chosen": 0.5224151611328125, "rewards/margins": 0.17873230576515198, "rewards/rejected": 0.3436828553676605, "step": 546 }, { "epoch": 0.09, "learning_rate": 9.994589632330817e-07, "logits/chosen": -1.983293056488037, "logits/rejected": -1.0773383378982544, "logps/chosen": -114.09536743164062, "logps/rejected": -148.1244659423828, "loss": 0.6548, "rewards/accuracies": 0.0, "rewards/chosen": 0.07338409870862961, "rewards/margins": -0.3571228086948395, "rewards/rejected": 0.4305069148540497, "step": 547 }, { "epoch": 0.09, "learning_rate": 9.994528336721774e-07, "logits/chosen": -0.5958647131919861, "logits/rejected": -0.35821887850761414, "logps/chosen": -172.7799835205078, "logps/rejected": -170.29185485839844, "loss": 0.6559, "rewards/accuracies": 0.0, "rewards/chosen": 0.5596511960029602, "rewards/margins": -0.1381744146347046, "rewards/rejected": 0.6978256106376648, "step": 548 }, { "epoch": 0.09, "learning_rate": 9.994466696038082e-07, "logits/chosen": -0.5360320806503296, "logits/rejected": -0.4819166660308838, "logps/chosen": -87.21072387695312, "logps/rejected": -78.80302429199219, "loss": 0.7101, "rewards/accuracies": 0.0, "rewards/chosen": 0.41552734375, "rewards/margins": -0.04354172945022583, "rewards/rejected": 0.45906907320022583, "step": 549 }, { "epoch": 0.09, "learning_rate": 9.994404710283998e-07, "logits/chosen": -0.8861310482025146, "logits/rejected": -0.8652746677398682, "logps/chosen": -151.5071563720703, "logps/rejected": -133.03758239746094, "loss": 0.6745, "rewards/accuracies": 0.0, "rewards/chosen": 0.07930755615234375, "rewards/margins": -0.11928711831569672, "rewards/rejected": 0.19859467446804047, "step": 550 }, { "epoch": 0.09, "learning_rate": 9.994342379463805e-07, "logits/chosen": -0.05981095880270004, "logits/rejected": -0.05981095880270004, "logps/chosen": -80.49606323242188, "logps/rejected": -80.49606323242188, "loss": 0.6662, "rewards/accuracies": 0.0, "rewards/chosen": 0.15529786050319672, "rewards/margins": 0.0, "rewards/rejected": 0.15529786050319672, "step": 551 }, { "epoch": 0.09, "learning_rate": 9.994279703581814e-07, "logits/chosen": -0.29688215255737305, "logits/rejected": -0.3097083270549774, "logps/chosen": -60.12037658691406, "logps/rejected": -82.41217803955078, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.5581749081611633, "rewards/margins": 0.2573745846748352, "rewards/rejected": 0.3008003234863281, "step": 552 }, { "epoch": 0.09, "learning_rate": 9.994216682642348e-07, "logits/chosen": -0.21848614513874054, "logits/rejected": -0.23523716628551483, "logps/chosen": -89.69328308105469, "logps/rejected": -131.9182586669922, "loss": 0.818, "rewards/accuracies": 0.0, "rewards/chosen": 0.13224487006664276, "rewards/margins": -0.2589218020439148, "rewards/rejected": 0.39116668701171875, "step": 553 }, { "epoch": 0.09, "learning_rate": 9.994153316649767e-07, "logits/chosen": -0.2838658392429352, "logits/rejected": -0.30093082785606384, "logps/chosen": -89.41966247558594, "logps/rejected": -145.1793212890625, "loss": 0.8703, "rewards/accuracies": 0.0, "rewards/chosen": 0.41444626450538635, "rewards/margins": -0.46232375502586365, "rewards/rejected": 0.87677001953125, "step": 554 }, { "epoch": 0.09, "learning_rate": 9.994089605608447e-07, "logits/chosen": -0.06035793945193291, "logits/rejected": 0.012369128875434399, "logps/chosen": -58.7030029296875, "logps/rejected": -151.70654296875, "loss": 0.7384, "rewards/accuracies": 0.0, "rewards/chosen": 0.40595170855522156, "rewards/margins": -0.19340893626213074, "rewards/rejected": 0.5993606448173523, "step": 555 }, { "epoch": 0.09, "learning_rate": 9.99402554952279e-07, "logits/chosen": -0.6877167224884033, "logits/rejected": -0.6726930737495422, "logps/chosen": -26.853633880615234, "logps/rejected": -82.61300659179688, "loss": 0.655, "rewards/accuracies": 0.0, "rewards/chosen": 0.19115658104419708, "rewards/margins": -0.055794715881347656, "rewards/rejected": 0.24695129692554474, "step": 556 }, { "epoch": 0.09, "learning_rate": 9.99396114839722e-07, "logits/chosen": -0.4306987226009369, "logits/rejected": -0.3922390043735504, "logps/chosen": -175.23947143554688, "logps/rejected": -80.98526000976562, "loss": 0.6559, "rewards/accuracies": 1.0, "rewards/chosen": 0.7967315912246704, "rewards/margins": 0.2279006838798523, "rewards/rejected": 0.5688309073448181, "step": 557 }, { "epoch": 0.09, "learning_rate": 9.993896402236188e-07, "logits/chosen": -0.19645538926124573, "logits/rejected": -0.15441767871379852, "logps/chosen": -135.00205993652344, "logps/rejected": -80.70088195800781, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 0.2776931822299957, "rewards/margins": -0.21567076444625854, "rewards/rejected": 0.4933639466762543, "step": 558 }, { "epoch": 0.09, "learning_rate": 9.99383131104417e-07, "logits/chosen": -0.4853827655315399, "logits/rejected": -0.4473375678062439, "logps/chosen": -55.107421875, "logps/rejected": -49.92839050292969, "loss": 0.6032, "rewards/accuracies": 1.0, "rewards/chosen": 0.3023933470249176, "rewards/margins": 0.15884093940258026, "rewards/rejected": 0.14355240762233734, "step": 559 }, { "epoch": 0.09, "learning_rate": 9.993765874825659e-07, "logits/chosen": -0.26005396246910095, "logits/rejected": -0.1800066977739334, "logps/chosen": -79.24726867675781, "logps/rejected": -17.741212844848633, "loss": 0.5088, "rewards/accuracies": 1.0, "rewards/chosen": 0.5170280337333679, "rewards/margins": 0.5004890561103821, "rewards/rejected": 0.01653900183737278, "step": 560 }, { "epoch": 0.09, "learning_rate": 9.993700093585176e-07, "logits/chosen": -0.35124069452285767, "logits/rejected": -0.3570329546928406, "logps/chosen": -168.78472900390625, "logps/rejected": -125.04426574707031, "loss": 0.6497, "rewards/accuracies": 1.0, "rewards/chosen": 0.601702868938446, "rewards/margins": 0.49063795804977417, "rewards/rejected": 0.11106491088867188, "step": 561 }, { "epoch": 0.09, "learning_rate": 9.993633967327268e-07, "logits/chosen": -0.27746203541755676, "logits/rejected": -0.32373514771461487, "logps/chosen": -79.04095458984375, "logps/rejected": -95.70184326171875, "loss": 0.7248, "rewards/accuracies": 0.0, "rewards/chosen": 0.16301880776882172, "rewards/margins": -0.08989410102367401, "rewards/rejected": 0.2529129087924957, "step": 562 }, { "epoch": 0.09, "learning_rate": 9.993567496056504e-07, "logits/chosen": -0.2799704372882843, "logits/rejected": -0.2270732820034027, "logps/chosen": -102.6758041381836, "logps/rejected": -153.45431518554688, "loss": 0.6765, "rewards/accuracies": 0.0, "rewards/chosen": 0.2858108580112457, "rewards/margins": -0.4177749454975128, "rewards/rejected": 0.7035858035087585, "step": 563 }, { "epoch": 0.09, "learning_rate": 9.993500679777476e-07, "logits/chosen": -0.5137310028076172, "logits/rejected": -0.4988330900669098, "logps/chosen": -88.45338439941406, "logps/rejected": -72.70054626464844, "loss": 0.6274, "rewards/accuracies": 0.0, "rewards/chosen": 0.11913452297449112, "rewards/margins": -0.1305030882358551, "rewards/rejected": 0.24963760375976562, "step": 564 }, { "epoch": 0.09, "learning_rate": 9.993433518494797e-07, "logits/chosen": -0.4011997878551483, "logits/rejected": -0.4232230484485626, "logps/chosen": -65.84834289550781, "logps/rejected": -150.36293029785156, "loss": 0.7302, "rewards/accuracies": 0.0, "rewards/chosen": 0.5802947878837585, "rewards/margins": -0.36906129121780396, "rewards/rejected": 0.9493560791015625, "step": 565 }, { "epoch": 0.09, "learning_rate": 9.993366012213113e-07, "logits/chosen": -0.29247522354125977, "logits/rejected": -0.29247522354125977, "logps/chosen": -73.43783569335938, "logps/rejected": -73.43783569335938, "loss": 0.6475, "rewards/accuracies": 0.0, "rewards/chosen": 0.02815856970846653, "rewards/margins": 0.0, "rewards/rejected": 0.02815856970846653, "step": 566 }, { "epoch": 0.09, "learning_rate": 9.993298160937084e-07, "logits/chosen": -0.4182246923446655, "logits/rejected": -0.4545069634914398, "logps/chosen": -89.07850646972656, "logps/rejected": -115.5596923828125, "loss": 1.0396, "rewards/accuracies": 0.0, "rewards/chosen": 0.3360244929790497, "rewards/margins": -0.5437721014022827, "rewards/rejected": 0.8797966241836548, "step": 567 }, { "epoch": 0.09, "learning_rate": 9.9932299646714e-07, "logits/chosen": -0.22186477482318878, "logits/rejected": -0.20611266791820526, "logps/chosen": -163.4054412841797, "logps/rejected": -63.660770416259766, "loss": 0.9045, "rewards/accuracies": 1.0, "rewards/chosen": 0.6570267081260681, "rewards/margins": 0.4777408838272095, "rewards/rejected": 0.17928580939769745, "step": 568 }, { "epoch": 0.09, "learning_rate": 9.993161423420772e-07, "logits/chosen": -0.3559265434741974, "logits/rejected": -0.42028316855430603, "logps/chosen": -108.73538208007812, "logps/rejected": -121.06903839111328, "loss": 0.4561, "rewards/accuracies": 1.0, "rewards/chosen": 0.4856064021587372, "rewards/margins": 0.22617721557617188, "rewards/rejected": 0.2594291865825653, "step": 569 }, { "epoch": 0.09, "learning_rate": 9.993092537189934e-07, "logits/chosen": -0.39488327503204346, "logits/rejected": -0.3755762279033661, "logps/chosen": -59.71554183959961, "logps/rejected": -60.89064025878906, "loss": 0.4607, "rewards/accuracies": 1.0, "rewards/chosen": 0.41572076082229614, "rewards/margins": 0.1973598599433899, "rewards/rejected": 0.21836090087890625, "step": 570 }, { "epoch": 0.09, "learning_rate": 9.993023305983647e-07, "logits/chosen": -0.34681177139282227, "logits/rejected": -0.3625129461288452, "logps/chosen": -96.30587768554688, "logps/rejected": -189.48805236816406, "loss": 0.8335, "rewards/accuracies": 0.0, "rewards/chosen": 0.0005462646367959678, "rewards/margins": -0.05076294019818306, "rewards/rejected": 0.05130920559167862, "step": 571 }, { "epoch": 0.09, "learning_rate": 9.992953729806694e-07, "logits/chosen": -0.3764987289905548, "logits/rejected": -0.3742693364620209, "logps/chosen": -245.09451293945312, "logps/rejected": -121.7729263305664, "loss": 0.5466, "rewards/accuracies": 1.0, "rewards/chosen": 0.8451629877090454, "rewards/margins": 0.4076835811138153, "rewards/rejected": 0.4374794065952301, "step": 572 }, { "epoch": 0.09, "learning_rate": 9.992883808663884e-07, "logits/chosen": -0.7121617794036865, "logits/rejected": -0.7306926846504211, "logps/chosen": -10.43566608428955, "logps/rejected": -73.46865844726562, "loss": 0.5292, "rewards/accuracies": 1.0, "rewards/chosen": 0.16526193916797638, "rewards/margins": 0.009491831064224243, "rewards/rejected": 0.15577010810375214, "step": 573 }, { "epoch": 0.09, "learning_rate": 9.992813542560045e-07, "logits/chosen": -0.11987943947315216, "logits/rejected": -0.11720668524503708, "logps/chosen": -120.67794036865234, "logps/rejected": -48.534244537353516, "loss": 0.7184, "rewards/accuracies": 0.0, "rewards/chosen": -0.05067748948931694, "rewards/margins": -0.3170204162597656, "rewards/rejected": 0.2663429379463196, "step": 574 }, { "epoch": 0.09, "learning_rate": 9.992742931500031e-07, "logits/chosen": -0.646571159362793, "logits/rejected": -0.6730866432189941, "logps/chosen": -187.65109252929688, "logps/rejected": -26.609725952148438, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 0.847210705280304, "rewards/margins": 0.8251447677612305, "rewards/rejected": 0.02206592634320259, "step": 575 }, { "epoch": 0.09, "learning_rate": 9.992671975488725e-07, "logits/chosen": -0.47167399525642395, "logits/rejected": -0.4764876663684845, "logps/chosen": -74.56547546386719, "logps/rejected": -62.14323425292969, "loss": 0.6125, "rewards/accuracies": 0.0, "rewards/chosen": 0.3285583555698395, "rewards/margins": -0.054766833782196045, "rewards/rejected": 0.3833251893520355, "step": 576 }, { "epoch": 0.09, "learning_rate": 9.992600674531025e-07, "logits/chosen": -0.47938936948776245, "logits/rejected": -0.4800698459148407, "logps/chosen": -167.45639038085938, "logps/rejected": -66.79580688476562, "loss": 0.7235, "rewards/accuracies": 0.0, "rewards/chosen": 0.36382752656936646, "rewards/margins": -0.11796721816062927, "rewards/rejected": 0.4817947447299957, "step": 577 }, { "epoch": 0.09, "learning_rate": 9.992529028631858e-07, "logits/chosen": -0.28341758251190186, "logits/rejected": -0.2888900339603424, "logps/chosen": -123.4639892578125, "logps/rejected": -146.33819580078125, "loss": 0.6101, "rewards/accuracies": 1.0, "rewards/chosen": 0.2917419373989105, "rewards/margins": 0.07388915121555328, "rewards/rejected": 0.21785278618335724, "step": 578 }, { "epoch": 0.09, "learning_rate": 9.992457037796176e-07, "logits/chosen": -0.2948223352432251, "logits/rejected": -0.2948223352432251, "logps/chosen": -0.8231717944145203, "logps/rejected": -0.8231717944145203, "loss": 0.726, "rewards/accuracies": 0.0, "rewards/chosen": 0.02318057417869568, "rewards/margins": 0.0, "rewards/rejected": 0.02318057417869568, "step": 579 }, { "epoch": 0.09, "learning_rate": 9.99238470202895e-07, "logits/chosen": -0.42517492175102234, "logits/rejected": -0.4700120985507965, "logps/chosen": -180.13116455078125, "logps/rejected": -176.19134521484375, "loss": 0.8083, "rewards/accuracies": 0.0, "rewards/chosen": 0.3738723695278168, "rewards/margins": -0.4671035706996918, "rewards/rejected": 0.8409759402275085, "step": 580 }, { "epoch": 0.09, "learning_rate": 9.99231202133518e-07, "logits/chosen": -0.4250527024269104, "logits/rejected": -0.38423049449920654, "logps/chosen": -92.48635864257812, "logps/rejected": -14.115716934204102, "loss": 0.566, "rewards/accuracies": 1.0, "rewards/chosen": 0.5715034604072571, "rewards/margins": 0.41248780488967896, "rewards/rejected": 0.15901565551757812, "step": 581 }, { "epoch": 0.09, "learning_rate": 9.99223899571989e-07, "logits/chosen": -0.7995214462280273, "logits/rejected": -0.6622880101203918, "logps/chosen": -91.90774536132812, "logps/rejected": -80.52989196777344, "loss": 0.6215, "rewards/accuracies": 0.0, "rewards/chosen": -0.0016868591774255037, "rewards/margins": -0.07781372219324112, "rewards/rejected": 0.07612686604261398, "step": 582 }, { "epoch": 0.09, "learning_rate": 9.99216562518812e-07, "logits/chosen": -0.418034166097641, "logits/rejected": -0.40620580315589905, "logps/chosen": -61.87919235229492, "logps/rejected": -68.13811492919922, "loss": 0.7074, "rewards/accuracies": 1.0, "rewards/chosen": 0.48461267352104187, "rewards/margins": 0.4027363061904907, "rewards/rejected": 0.08187637478113174, "step": 583 }, { "epoch": 0.09, "learning_rate": 9.992091909744942e-07, "logits/chosen": -0.8028103113174438, "logits/rejected": -0.7999012470245361, "logps/chosen": -73.01380157470703, "logps/rejected": -111.23858642578125, "loss": 0.6871, "rewards/accuracies": 0.0, "rewards/chosen": 0.3056587278842926, "rewards/margins": -0.4746688902378082, "rewards/rejected": 0.7803276181221008, "step": 584 }, { "epoch": 0.09, "learning_rate": 9.992017849395448e-07, "logits/chosen": -0.5897679328918457, "logits/rejected": -0.5897679328918457, "logps/chosen": -81.6549072265625, "logps/rejected": -81.6549072265625, "loss": 0.6757, "rewards/accuracies": 0.0, "rewards/chosen": 0.24718932807445526, "rewards/margins": 0.0, "rewards/rejected": 0.24718932807445526, "step": 585 }, { "epoch": 0.1, "learning_rate": 9.991943444144756e-07, "logits/chosen": -0.897697389125824, "logits/rejected": -0.8855649828910828, "logps/chosen": -88.84510803222656, "logps/rejected": -40.48822021484375, "loss": 0.797, "rewards/accuracies": 0.0, "rewards/chosen": -0.01985931396484375, "rewards/margins": -0.126780703663826, "rewards/rejected": 0.10692138969898224, "step": 586 }, { "epoch": 0.1, "learning_rate": 9.991868693998006e-07, "logits/chosen": -0.36987119913101196, "logits/rejected": -0.3052482604980469, "logps/chosen": -141.39877319335938, "logps/rejected": -91.98651123046875, "loss": 0.5627, "rewards/accuracies": 1.0, "rewards/chosen": 0.7092911005020142, "rewards/margins": 0.2784011960029602, "rewards/rejected": 0.43088990449905396, "step": 587 }, { "epoch": 0.1, "learning_rate": 9.991793598960362e-07, "logits/chosen": -0.40671688318252563, "logits/rejected": -0.3349732756614685, "logps/chosen": -261.7919006347656, "logps/rejected": -80.53121948242188, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 1.277258276939392, "rewards/margins": 0.7287688851356506, "rewards/rejected": 0.5484893918037415, "step": 588 }, { "epoch": 0.1, "learning_rate": 9.991718159037014e-07, "logits/chosen": -0.48404064774513245, "logits/rejected": -0.43313491344451904, "logps/chosen": -61.57494354248047, "logps/rejected": -21.457456588745117, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": 0.40715789794921875, "rewards/margins": 0.32636794447898865, "rewards/rejected": 0.0807899460196495, "step": 589 }, { "epoch": 0.1, "learning_rate": 9.991642374233175e-07, "logits/chosen": -0.3998531103134155, "logits/rejected": -0.3745899200439453, "logps/chosen": -115.82569885253906, "logps/rejected": -162.4021759033203, "loss": 0.5024, "rewards/accuracies": 1.0, "rewards/chosen": 0.23845215141773224, "rewards/margins": 0.27841949462890625, "rewards/rejected": -0.03996734693646431, "step": 590 }, { "epoch": 0.1, "learning_rate": 9.991566244554078e-07, "logits/chosen": -0.1236952394247055, "logits/rejected": -0.1236952394247055, "logps/chosen": -5.05660343170166, "logps/rejected": -5.05660343170166, "loss": 0.761, "rewards/accuracies": 0.0, "rewards/chosen": 0.0900263786315918, "rewards/margins": 0.0, "rewards/rejected": 0.0900263786315918, "step": 591 }, { "epoch": 0.1, "learning_rate": 9.991489770004985e-07, "logits/chosen": -0.425677627325058, "logits/rejected": -0.4660191535949707, "logps/chosen": -100.50413513183594, "logps/rejected": -146.33204650878906, "loss": 0.8133, "rewards/accuracies": 0.0, "rewards/chosen": 0.22337952256202698, "rewards/margins": -0.03462982177734375, "rewards/rejected": 0.2580093443393707, "step": 592 }, { "epoch": 0.1, "learning_rate": 9.991412950591177e-07, "logits/chosen": -0.5187820196151733, "logits/rejected": -0.5082182288169861, "logps/chosen": -85.80799865722656, "logps/rejected": -113.7113037109375, "loss": 0.6268, "rewards/accuracies": 1.0, "rewards/chosen": 0.31183701753616333, "rewards/margins": 0.18774491548538208, "rewards/rejected": 0.12409210205078125, "step": 593 }, { "epoch": 0.1, "learning_rate": 9.991335786317963e-07, "logits/chosen": -0.15996795892715454, "logits/rejected": -0.14383935928344727, "logps/chosen": -107.47305297851562, "logps/rejected": -53.845516204833984, "loss": 0.8045, "rewards/accuracies": 0.0, "rewards/chosen": 0.352630615234375, "rewards/margins": -0.1734638214111328, "rewards/rejected": 0.5260944366455078, "step": 594 }, { "epoch": 0.1, "learning_rate": 9.991258277190675e-07, "logits/chosen": -0.4222468435764313, "logits/rejected": -0.3399660289287567, "logps/chosen": -83.55875396728516, "logps/rejected": -32.41790771484375, "loss": 0.5012, "rewards/accuracies": 1.0, "rewards/chosen": 0.6188454031944275, "rewards/margins": 0.5391151905059814, "rewards/rejected": 0.07973022758960724, "step": 595 }, { "epoch": 0.1, "learning_rate": 9.99118042321467e-07, "logits/chosen": -0.23951494693756104, "logits/rejected": -0.2518576383590698, "logps/chosen": -3.797632932662964, "logps/rejected": -3.6982951164245605, "loss": 0.7202, "rewards/accuracies": 1.0, "rewards/chosen": 0.06300800293684006, "rewards/margins": 0.002930622547864914, "rewards/rejected": 0.06007738038897514, "step": 596 }, { "epoch": 0.1, "learning_rate": 9.99110222439532e-07, "logits/chosen": -0.42434006929397583, "logits/rejected": -0.38639310002326965, "logps/chosen": -77.68751525878906, "logps/rejected": -12.119918823242188, "loss": 0.7387, "rewards/accuracies": 1.0, "rewards/chosen": 0.21581421792507172, "rewards/margins": 0.07418528199195862, "rewards/rejected": 0.1416289359331131, "step": 597 }, { "epoch": 0.1, "learning_rate": 9.991023680738037e-07, "logits/chosen": -0.5585519671440125, "logits/rejected": -0.5497717261314392, "logps/chosen": -93.76718139648438, "logps/rejected": -141.8040771484375, "loss": 0.7747, "rewards/accuracies": 0.0, "rewards/chosen": 0.6437026858329773, "rewards/margins": -0.2813400626182556, "rewards/rejected": 0.9250427484512329, "step": 598 }, { "epoch": 0.1, "learning_rate": 9.990944792248242e-07, "logits/chosen": -1.0902807712554932, "logits/rejected": -1.118429183959961, "logps/chosen": -84.86784362792969, "logps/rejected": -38.00682830810547, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": 0.2645019590854645, "rewards/margins": 0.11756478250026703, "rewards/rejected": 0.14693717658519745, "step": 599 }, { "epoch": 0.1, "learning_rate": 9.990865558931386e-07, "logits/chosen": -0.2164459526538849, "logits/rejected": -0.35903993248939514, "logps/chosen": -209.8515625, "logps/rejected": -208.64840698242188, "loss": 1.0027, "rewards/accuracies": 0.0, "rewards/chosen": 0.2848358154296875, "rewards/margins": -0.3662063479423523, "rewards/rejected": 0.6510421633720398, "step": 600 }, { "epoch": 0.1, "learning_rate": 9.990785980792943e-07, "logits/chosen": -0.24447697401046753, "logits/rejected": -0.23464833199977875, "logps/chosen": -110.06934356689453, "logps/rejected": -52.94468688964844, "loss": 0.8129, "rewards/accuracies": 0.0, "rewards/chosen": 0.20304031670093536, "rewards/margins": -0.21676407754421234, "rewards/rejected": 0.4198043942451477, "step": 601 }, { "epoch": 0.1, "learning_rate": 9.990706057838414e-07, "logits/chosen": -0.43440356850624084, "logits/rejected": -0.40641772747039795, "logps/chosen": -248.9324493408203, "logps/rejected": -18.210458755493164, "loss": 0.7452, "rewards/accuracies": 1.0, "rewards/chosen": 0.3676773011684418, "rewards/margins": 0.30605486035346985, "rewards/rejected": 0.06162242963910103, "step": 602 }, { "epoch": 0.1, "learning_rate": 9.99062579007332e-07, "logits/chosen": -0.3534901738166809, "logits/rejected": -0.36727720499038696, "logps/chosen": -49.82996368408203, "logps/rejected": -97.11636352539062, "loss": 0.5335, "rewards/accuracies": 1.0, "rewards/chosen": 0.1914817839860916, "rewards/margins": 0.2709159851074219, "rewards/rejected": -0.07943420857191086, "step": 603 }, { "epoch": 0.1, "learning_rate": 9.990545177503202e-07, "logits/chosen": -0.4611683487892151, "logits/rejected": -0.5054824352264404, "logps/chosen": -110.75302124023438, "logps/rejected": -95.51815032958984, "loss": 0.7161, "rewards/accuracies": 0.0, "rewards/chosen": 0.08416900783777237, "rewards/margins": -0.10495681315660477, "rewards/rejected": 0.18912582099437714, "step": 604 }, { "epoch": 0.1, "learning_rate": 9.990464220133636e-07, "logits/chosen": -0.39509764313697815, "logits/rejected": -0.39509764313697815, "logps/chosen": -74.01332092285156, "logps/rejected": -74.01332092285156, "loss": 0.7301, "rewards/accuracies": 0.0, "rewards/chosen": 0.199198916554451, "rewards/margins": 0.0, "rewards/rejected": 0.199198916554451, "step": 605 }, { "epoch": 0.1, "learning_rate": 9.990382917970211e-07, "logits/chosen": -0.48622483015060425, "logits/rejected": -0.46750661730766296, "logps/chosen": -105.04229736328125, "logps/rejected": -102.29971313476562, "loss": 0.796, "rewards/accuracies": 0.0, "rewards/chosen": 0.19814454019069672, "rewards/margins": -0.27278900146484375, "rewards/rejected": 0.4709335267543793, "step": 606 }, { "epoch": 0.1, "learning_rate": 9.990301271018547e-07, "logits/chosen": -0.4944091737270355, "logits/rejected": -0.5089975595474243, "logps/chosen": -69.0010757446289, "logps/rejected": -19.328834533691406, "loss": 0.7633, "rewards/accuracies": 0.0, "rewards/chosen": 0.1070960983633995, "rewards/margins": -0.09173489362001419, "rewards/rejected": 0.1988309919834137, "step": 607 }, { "epoch": 0.1, "learning_rate": 9.990219279284283e-07, "logits/chosen": -0.5747641921043396, "logits/rejected": -0.23343487083911896, "logps/chosen": -227.36277770996094, "logps/rejected": -92.45463562011719, "loss": 0.564, "rewards/accuracies": 1.0, "rewards/chosen": 0.9632278680801392, "rewards/margins": 0.4353591799736023, "rewards/rejected": 0.5278686881065369, "step": 608 }, { "epoch": 0.1, "learning_rate": 9.990136942773084e-07, "logits/chosen": -0.147644504904747, "logits/rejected": 0.07531754672527313, "logps/chosen": -64.34678649902344, "logps/rejected": -185.12326049804688, "loss": 0.8789, "rewards/accuracies": 0.0, "rewards/chosen": 0.5054230093955994, "rewards/margins": -0.811881959438324, "rewards/rejected": 1.3173049688339233, "step": 609 }, { "epoch": 0.1, "learning_rate": 9.990054261490641e-07, "logits/chosen": -0.2679519057273865, "logits/rejected": -0.14392846822738647, "logps/chosen": -158.05984497070312, "logps/rejected": -49.68632507324219, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": 0.9148284792900085, "rewards/margins": 0.6931346654891968, "rewards/rejected": 0.22169379889965057, "step": 610 }, { "epoch": 0.1, "learning_rate": 9.989971235442665e-07, "logits/chosen": -0.3514177203178406, "logits/rejected": -0.3270071744918823, "logps/chosen": -161.8199005126953, "logps/rejected": -60.65516662597656, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": 0.5946823358535767, "rewards/margins": 0.3534725308418274, "rewards/rejected": 0.24120979011058807, "step": 611 }, { "epoch": 0.1, "learning_rate": 9.98988786463489e-07, "logits/chosen": -1.024774193763733, "logits/rejected": -0.9086479544639587, "logps/chosen": -166.07131958007812, "logps/rejected": -187.08277893066406, "loss": 0.7741, "rewards/accuracies": 0.0, "rewards/chosen": 0.8466049432754517, "rewards/margins": -0.08491820096969604, "rewards/rejected": 0.9315231442451477, "step": 612 }, { "epoch": 0.1, "learning_rate": 9.98980414907308e-07, "logits/chosen": -0.015758389607071877, "logits/rejected": -0.00043640192598104477, "logps/chosen": -15.735649108886719, "logps/rejected": -42.64983367919922, "loss": 0.755, "rewards/accuracies": 1.0, "rewards/chosen": 0.33377382159233093, "rewards/margins": 0.0323585569858551, "rewards/rejected": 0.30141526460647583, "step": 613 }, { "epoch": 0.1, "learning_rate": 9.98972008876302e-07, "logits/chosen": -0.3036727011203766, "logits/rejected": -0.008545254357159138, "logps/chosen": -64.91049194335938, "logps/rejected": -57.04804229736328, "loss": 0.7316, "rewards/accuracies": 1.0, "rewards/chosen": 0.4957122802734375, "rewards/margins": 0.24356231093406677, "rewards/rejected": 0.2521499693393707, "step": 614 }, { "epoch": 0.1, "learning_rate": 9.98963568371051e-07, "logits/chosen": 0.11476422101259232, "logits/rejected": 0.12512849271297455, "logps/chosen": -25.110607147216797, "logps/rejected": -20.593975067138672, "loss": 0.7352, "rewards/accuracies": 0.0, "rewards/chosen": 0.028495026752352715, "rewards/margins": -0.18680915236473083, "rewards/rejected": 0.2153041809797287, "step": 615 }, { "epoch": 0.1, "learning_rate": 9.98955093392139e-07, "logits/chosen": -0.1787639856338501, "logits/rejected": -0.17994903028011322, "logps/chosen": -9.26318645477295, "logps/rejected": -6.902174949645996, "loss": 0.5587, "rewards/accuracies": 0.0, "rewards/chosen": 0.09319658577442169, "rewards/margins": -0.048999398946762085, "rewards/rejected": 0.14219598472118378, "step": 616 }, { "epoch": 0.1, "learning_rate": 9.98946583940151e-07, "logits/chosen": -0.23513086140155792, "logits/rejected": -0.2339230626821518, "logps/chosen": -322.5084228515625, "logps/rejected": -51.824134826660156, "loss": 0.6881, "rewards/accuracies": 0.0, "rewards/chosen": 0.23221436142921448, "rewards/margins": -0.18466836214065552, "rewards/rejected": 0.41688272356987, "step": 617 }, { "epoch": 0.1, "learning_rate": 9.98938040015675e-07, "logits/chosen": -0.41856592893600464, "logits/rejected": -0.19642025232315063, "logps/chosen": -59.905784606933594, "logps/rejected": -82.84916687011719, "loss": 0.5312, "rewards/accuracies": 0.0, "rewards/chosen": 0.1579517424106598, "rewards/margins": -0.1408969759941101, "rewards/rejected": 0.2988487184047699, "step": 618 }, { "epoch": 0.1, "learning_rate": 9.989294616193017e-07, "logits/chosen": -0.4182632267475128, "logits/rejected": -0.40732723474502563, "logps/chosen": -90.90794372558594, "logps/rejected": -123.87251281738281, "loss": 0.77, "rewards/accuracies": 0.0, "rewards/chosen": 0.13494110107421875, "rewards/margins": -0.8907333612442017, "rewards/rejected": 1.0256744623184204, "step": 619 }, { "epoch": 0.1, "learning_rate": 9.989208487516235e-07, "logits/chosen": -0.21390719711780548, "logits/rejected": -0.21390719711780548, "logps/chosen": -1.1310266256332397, "logps/rejected": -1.1310266256332397, "loss": 0.9158, "rewards/accuracies": 0.0, "rewards/chosen": 0.03515845537185669, "rewards/margins": 0.0, "rewards/rejected": 0.03515845537185669, "step": 620 }, { "epoch": 0.1, "learning_rate": 9.989122014132354e-07, "logits/chosen": -0.45627862215042114, "logits/rejected": -0.41977110505104065, "logps/chosen": -211.128173828125, "logps/rejected": -102.63151550292969, "loss": 1.1057, "rewards/accuracies": 0.0, "rewards/chosen": 0.5495025515556335, "rewards/margins": -0.6123886704444885, "rewards/rejected": 1.161891222000122, "step": 621 }, { "epoch": 0.1, "learning_rate": 9.989035196047348e-07, "logits/chosen": -0.009820085018873215, "logits/rejected": 0.011466537602245808, "logps/chosen": -75.49549865722656, "logps/rejected": -231.53309631347656, "loss": 0.7908, "rewards/accuracies": 0.0, "rewards/chosen": 0.45329055190086365, "rewards/margins": -0.3534736931324005, "rewards/rejected": 0.8067642450332642, "step": 622 }, { "epoch": 0.1, "learning_rate": 9.98894803326722e-07, "logits/chosen": -0.2937721014022827, "logits/rejected": -0.20976178348064423, "logps/chosen": -32.94926452636719, "logps/rejected": -38.595699310302734, "loss": 0.6459, "rewards/accuracies": 1.0, "rewards/chosen": 0.36208876967430115, "rewards/margins": 0.2108619660139084, "rewards/rejected": 0.15122680366039276, "step": 623 }, { "epoch": 0.1, "learning_rate": 9.988860525797986e-07, "logits/chosen": -0.9626320004463196, "logits/rejected": -0.9324545860290527, "logps/chosen": -98.12504577636719, "logps/rejected": -137.9957733154297, "loss": 0.8155, "rewards/accuracies": 0.0, "rewards/chosen": 0.14345398545265198, "rewards/margins": -0.8041976690292358, "rewards/rejected": 0.9476516842842102, "step": 624 }, { "epoch": 0.1, "learning_rate": 9.988772673645696e-07, "logits/chosen": 0.002810761332511902, "logits/rejected": 0.002810761332511902, "logps/chosen": -104.63275146484375, "logps/rejected": -104.63275146484375, "loss": 0.7206, "rewards/accuracies": 0.0, "rewards/chosen": 0.1582687348127365, "rewards/margins": 0.0, "rewards/rejected": 0.1582687348127365, "step": 625 }, { "epoch": 0.1, "learning_rate": 9.988684476816418e-07, "logits/chosen": -0.3766342103481293, "logits/rejected": -0.324828565120697, "logps/chosen": -58.59852981567383, "logps/rejected": -65.7032470703125, "loss": 0.523, "rewards/accuracies": 1.0, "rewards/chosen": 0.37601280212402344, "rewards/margins": 0.06289482116699219, "rewards/rejected": 0.31311798095703125, "step": 626 }, { "epoch": 0.1, "learning_rate": 9.988595935316247e-07, "logits/chosen": -0.22786502540111542, "logits/rejected": -0.24520981311798096, "logps/chosen": -62.32355880737305, "logps/rejected": -68.76205444335938, "loss": 0.8404, "rewards/accuracies": 0.0, "rewards/chosen": 0.18167077004909515, "rewards/margins": -0.1778339296579361, "rewards/rejected": 0.35950469970703125, "step": 627 }, { "epoch": 0.1, "learning_rate": 9.988507049151297e-07, "logits/chosen": -0.28633368015289307, "logits/rejected": -0.21350647509098053, "logps/chosen": -65.4868392944336, "logps/rejected": -58.44578552246094, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 0.5279685854911804, "rewards/margins": 0.40552443265914917, "rewards/rejected": 0.12244415283203125, "step": 628 }, { "epoch": 0.1, "learning_rate": 9.988417818327714e-07, "logits/chosen": -0.18715187907218933, "logits/rejected": -0.18706999719142914, "logps/chosen": -62.68156051635742, "logps/rejected": -94.00740051269531, "loss": 0.747, "rewards/accuracies": 0.0, "rewards/chosen": -0.03669700771570206, "rewards/margins": -0.20502281188964844, "rewards/rejected": 0.16832581162452698, "step": 629 }, { "epoch": 0.1, "learning_rate": 9.98832824285166e-07, "logits/chosen": -0.4846569895744324, "logits/rejected": -0.4372335970401764, "logps/chosen": -214.1779327392578, "logps/rejected": -136.3043975830078, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": 0.3918747007846832, "rewards/margins": 0.05975341796875, "rewards/rejected": 0.3321212828159332, "step": 630 }, { "epoch": 0.1, "learning_rate": 9.988238322729324e-07, "logits/chosen": -0.5699892044067383, "logits/rejected": -0.5808294415473938, "logps/chosen": -69.29537963867188, "logps/rejected": -108.62230682373047, "loss": 0.668, "rewards/accuracies": 1.0, "rewards/chosen": 0.393423467874527, "rewards/margins": 0.4713325500488281, "rewards/rejected": -0.07790908962488174, "step": 631 }, { "epoch": 0.1, "learning_rate": 9.988148057966918e-07, "logits/chosen": -0.6430377960205078, "logits/rejected": -0.5990051031112671, "logps/chosen": -57.66731262207031, "logps/rejected": -113.08746337890625, "loss": 0.6147, "rewards/accuracies": 0.0, "rewards/chosen": 0.6958534121513367, "rewards/margins": -0.3394508957862854, "rewards/rejected": 1.035304307937622, "step": 632 }, { "epoch": 0.1, "learning_rate": 9.988057448570681e-07, "logits/chosen": -0.2921486496925354, "logits/rejected": -0.29189521074295044, "logps/chosen": -4.436276912689209, "logps/rejected": -6.463804244995117, "loss": 0.7057, "rewards/accuracies": 0.0, "rewards/chosen": 0.10635662078857422, "rewards/margins": -0.010168075561523438, "rewards/rejected": 0.11652469635009766, "step": 633 }, { "epoch": 0.1, "learning_rate": 9.987966494546872e-07, "logits/chosen": -0.5416035652160645, "logits/rejected": -0.555608868598938, "logps/chosen": -118.67558288574219, "logps/rejected": -66.80728149414062, "loss": 1.0957, "rewards/accuracies": 0.0, "rewards/chosen": 0.06467133015394211, "rewards/margins": -0.41749876737594604, "rewards/rejected": 0.48217010498046875, "step": 634 }, { "epoch": 0.1, "learning_rate": 9.987875195901774e-07, "logits/chosen": -0.3220074772834778, "logits/rejected": -0.34316855669021606, "logps/chosen": -104.41316223144531, "logps/rejected": -69.6659164428711, "loss": 0.7661, "rewards/accuracies": 0.0, "rewards/chosen": 0.10169219970703125, "rewards/margins": -0.5285324454307556, "rewards/rejected": 0.6302246451377869, "step": 635 }, { "epoch": 0.1, "learning_rate": 9.987783552641697e-07, "logits/chosen": -0.07997772097587585, "logits/rejected": -0.1008942574262619, "logps/chosen": -70.01264953613281, "logps/rejected": -51.695831298828125, "loss": 0.62, "rewards/accuracies": 0.0, "rewards/chosen": 0.39592742919921875, "rewards/margins": -0.10022088885307312, "rewards/rejected": 0.49614831805229187, "step": 636 }, { "epoch": 0.1, "learning_rate": 9.98769156477297e-07, "logits/chosen": -0.2352910190820694, "logits/rejected": -0.2384796440601349, "logps/chosen": -11.76171875, "logps/rejected": -8.72703742980957, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 0.0974574089050293, "rewards/margins": 0.02373991161584854, "rewards/rejected": 0.07371749728918076, "step": 637 }, { "epoch": 0.1, "learning_rate": 9.98759923230195e-07, "logits/chosen": -0.3862921893596649, "logits/rejected": -0.3913685083389282, "logps/chosen": -89.50569152832031, "logps/rejected": -129.51783752441406, "loss": 0.6368, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473435163497925, "rewards/margins": 0.5360687971115112, "rewards/rejected": 0.6112747192382812, "step": 638 }, { "epoch": 0.1, "learning_rate": 9.987506555235016e-07, "logits/chosen": -0.6076280474662781, "logits/rejected": -0.5942874550819397, "logps/chosen": -117.26824951171875, "logps/rejected": -75.77256774902344, "loss": 0.761, "rewards/accuracies": 0.0, "rewards/chosen": 0.5620666742324829, "rewards/margins": -0.30039823055267334, "rewards/rejected": 0.8624649047851562, "step": 639 }, { "epoch": 0.1, "learning_rate": 9.987413533578573e-07, "logits/chosen": -0.3932079076766968, "logits/rejected": -0.38146016001701355, "logps/chosen": -63.58042907714844, "logps/rejected": -22.967893600463867, "loss": 0.5951, "rewards/accuracies": 0.0, "rewards/chosen": 0.1642448455095291, "rewards/margins": -0.062094300985336304, "rewards/rejected": 0.22633914649486542, "step": 640 }, { "epoch": 0.1, "learning_rate": 9.987320167339044e-07, "logits/chosen": -0.09366496652364731, "logits/rejected": -0.08674134314060211, "logps/chosen": -66.22315979003906, "logps/rejected": -280.18621826171875, "loss": 0.6307, "rewards/accuracies": 0.0, "rewards/chosen": 0.479928582906723, "rewards/margins": -0.1364898979663849, "rewards/rejected": 0.6164184808731079, "step": 641 }, { "epoch": 0.1, "learning_rate": 9.987226456522882e-07, "logits/chosen": -0.5517073273658752, "logits/rejected": -0.5838797092437744, "logps/chosen": -133.69200134277344, "logps/rejected": -97.70892333984375, "loss": 1.0667, "rewards/accuracies": 0.0, "rewards/chosen": 0.2274017333984375, "rewards/margins": -0.5560577511787415, "rewards/rejected": 0.783459484577179, "step": 642 }, { "epoch": 0.1, "learning_rate": 9.987132401136562e-07, "logits/chosen": -0.6843501329421997, "logits/rejected": -0.7379023432731628, "logps/chosen": -177.7210693359375, "logps/rejected": -29.408830642700195, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": 0.68878173828125, "rewards/margins": 0.6475772857666016, "rewards/rejected": 0.04120445251464844, "step": 643 }, { "epoch": 0.1, "learning_rate": 9.987038001186584e-07, "logits/chosen": -0.41165652871131897, "logits/rejected": -0.42900121212005615, "logps/chosen": -264.3006896972656, "logps/rejected": -134.00196838378906, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": 0.42682191729545593, "rewards/margins": 0.07286834716796875, "rewards/rejected": 0.3539535701274872, "step": 644 }, { "epoch": 0.1, "learning_rate": 9.986943256679464e-07, "logits/chosen": -0.4364831745624542, "logits/rejected": -0.3925724923610687, "logps/chosen": -113.33329010009766, "logps/rejected": -91.32582092285156, "loss": 0.9217, "rewards/accuracies": 0.0, "rewards/chosen": 0.3337303102016449, "rewards/margins": -0.3213180601596832, "rewards/rejected": 0.6550483703613281, "step": 645 }, { "epoch": 0.1, "learning_rate": 9.986848167621753e-07, "logits/chosen": -0.3983420133590698, "logits/rejected": -0.38892310857772827, "logps/chosen": -126.53619384765625, "logps/rejected": -121.6314697265625, "loss": 0.5344, "rewards/accuracies": 1.0, "rewards/chosen": 0.948504626750946, "rewards/margins": 0.11499631404876709, "rewards/rejected": 0.833508312702179, "step": 646 }, { "epoch": 0.11, "learning_rate": 9.98675273402002e-07, "logits/chosen": -0.3220415413379669, "logits/rejected": -0.3195519745349884, "logps/chosen": -80.76234436035156, "logps/rejected": -67.58721160888672, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.5036888122558594, "rewards/margins": 0.22552490234375, "rewards/rejected": 0.2781639099121094, "step": 647 }, { "epoch": 0.11, "learning_rate": 9.98665695588086e-07, "logits/chosen": -0.6859211325645447, "logits/rejected": -0.6388833522796631, "logps/chosen": -97.30088806152344, "logps/rejected": -173.0428466796875, "loss": 0.8622, "rewards/accuracies": 0.0, "rewards/chosen": 0.3136138916015625, "rewards/margins": -0.9474655389785767, "rewards/rejected": 1.2610794305801392, "step": 648 }, { "epoch": 0.11, "learning_rate": 9.986560833210887e-07, "logits/chosen": -0.11308769881725311, "logits/rejected": -0.1298927515745163, "logps/chosen": -54.650577545166016, "logps/rejected": -118.58696746826172, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": 0.1763816922903061, "rewards/margins": 0.03976288437843323, "rewards/rejected": 0.13661880791187286, "step": 649 }, { "epoch": 0.11, "learning_rate": 9.986464366016743e-07, "logits/chosen": -0.2527942359447479, "logits/rejected": -0.2527942359447479, "logps/chosen": -19.60879898071289, "logps/rejected": -19.60879898071289, "loss": 0.5818, "rewards/accuracies": 0.0, "rewards/chosen": 0.11629333347082138, "rewards/margins": 0.0, "rewards/rejected": 0.11629333347082138, "step": 650 }, { "epoch": 0.11, "learning_rate": 9.986367554305095e-07, "logits/chosen": -0.28275740146636963, "logits/rejected": -0.238677516579628, "logps/chosen": -99.34156799316406, "logps/rejected": -59.472572326660156, "loss": 0.5898, "rewards/accuracies": 0.0, "rewards/chosen": 0.3603775203227997, "rewards/margins": -0.06051024794578552, "rewards/rejected": 0.4208877682685852, "step": 651 }, { "epoch": 0.11, "learning_rate": 9.986270398082628e-07, "logits/chosen": -0.46631309390068054, "logits/rejected": -0.4692531228065491, "logps/chosen": -84.71591186523438, "logps/rejected": -98.46678161621094, "loss": 0.838, "rewards/accuracies": 0.0, "rewards/chosen": 0.30743715167045593, "rewards/margins": -0.5006523132324219, "rewards/rejected": 0.8080894351005554, "step": 652 }, { "epoch": 0.11, "learning_rate": 9.98617289735606e-07, "logits/chosen": -0.7766220569610596, "logits/rejected": -0.8127133250236511, "logps/chosen": -166.42445373535156, "logps/rejected": -61.459354400634766, "loss": 0.57, "rewards/accuracies": 1.0, "rewards/chosen": 1.1500351428985596, "rewards/margins": 0.6402164697647095, "rewards/rejected": 0.5098186731338501, "step": 653 }, { "epoch": 0.11, "learning_rate": 9.986075052132122e-07, "logits/chosen": -0.41248413920402527, "logits/rejected": -0.37732404470443726, "logps/chosen": -81.64117431640625, "logps/rejected": -61.95344924926758, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 0.8749374747276306, "rewards/margins": 0.7174625396728516, "rewards/rejected": 0.15747490525245667, "step": 654 }, { "epoch": 0.11, "learning_rate": 9.985976862417577e-07, "logits/chosen": -0.22362005710601807, "logits/rejected": -0.19478145241737366, "logps/chosen": -145.57748413085938, "logps/rejected": -181.05099487304688, "loss": 0.948, "rewards/accuracies": 0.0, "rewards/chosen": 0.1673126220703125, "rewards/margins": -0.37233275175094604, "rewards/rejected": 0.5396453738212585, "step": 655 }, { "epoch": 0.11, "learning_rate": 9.985878328219211e-07, "logits/chosen": -0.7143442034721375, "logits/rejected": -0.7133447527885437, "logps/chosen": -141.34103393554688, "logps/rejected": -132.2146759033203, "loss": 0.8025, "rewards/accuracies": 0.0, "rewards/chosen": 0.23291015625, "rewards/margins": -0.34714359045028687, "rewards/rejected": 0.5800537467002869, "step": 656 }, { "epoch": 0.11, "learning_rate": 9.985779449543828e-07, "logits/chosen": -0.4722433388233185, "logits/rejected": -0.4887073040008545, "logps/chosen": -108.38612365722656, "logps/rejected": -62.757850646972656, "loss": 0.6759, "rewards/accuracies": 0.0, "rewards/chosen": 0.22656936943531036, "rewards/margins": -0.2470756620168686, "rewards/rejected": 0.47364503145217896, "step": 657 }, { "epoch": 0.11, "learning_rate": 9.98568022639826e-07, "logits/chosen": -0.03739653155207634, "logits/rejected": -0.05279424041509628, "logps/chosen": -15.827077865600586, "logps/rejected": -24.14719581604004, "loss": 0.6357, "rewards/accuracies": 0.0, "rewards/chosen": 0.12074146419763565, "rewards/margins": -0.04382381588220596, "rewards/rejected": 0.1645652800798416, "step": 658 }, { "epoch": 0.11, "learning_rate": 9.985580658789363e-07, "logits/chosen": -0.55863356590271, "logits/rejected": -0.4841821789741516, "logps/chosen": -115.69686889648438, "logps/rejected": -82.19363403320312, "loss": 0.7744, "rewards/accuracies": 0.0, "rewards/chosen": 0.002611541887745261, "rewards/margins": -0.13728637993335724, "rewards/rejected": 0.1398979276418686, "step": 659 }, { "epoch": 0.11, "learning_rate": 9.985480746724018e-07, "logits/chosen": -0.5546291470527649, "logits/rejected": -0.5168664455413818, "logps/chosen": -261.5708923339844, "logps/rejected": -321.29541015625, "loss": 1.0275, "rewards/accuracies": 0.0, "rewards/chosen": 0.5053680539131165, "rewards/margins": -1.6035919189453125, "rewards/rejected": 2.108959913253784, "step": 660 }, { "epoch": 0.11, "learning_rate": 9.985380490209125e-07, "logits/chosen": -0.28081703186035156, "logits/rejected": -0.28495678305625916, "logps/chosen": -3.2986388206481934, "logps/rejected": -2.5419809818267822, "loss": 0.7061, "rewards/accuracies": 0.0, "rewards/chosen": 0.047831084579229355, "rewards/margins": -0.026783417910337448, "rewards/rejected": 0.0746145024895668, "step": 661 }, { "epoch": 0.11, "learning_rate": 9.985279889251615e-07, "logits/chosen": -0.6198343634605408, "logits/rejected": -0.5769600868225098, "logps/chosen": -114.7254638671875, "logps/rejected": -105.62415313720703, "loss": 0.7784, "rewards/accuracies": 0.0, "rewards/chosen": 0.19352416694164276, "rewards/margins": -0.09968186914920807, "rewards/rejected": 0.29320603609085083, "step": 662 }, { "epoch": 0.11, "learning_rate": 9.985178943858432e-07, "logits/chosen": -0.5187552571296692, "logits/rejected": -0.5154271125793457, "logps/chosen": -140.5366668701172, "logps/rejected": -79.9945068359375, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 0.9643036127090454, "rewards/margins": 0.6723769903182983, "rewards/rejected": 0.2919265925884247, "step": 663 }, { "epoch": 0.11, "learning_rate": 9.985077654036556e-07, "logits/chosen": -0.2101927250623703, "logits/rejected": -0.2675246000289917, "logps/chosen": -130.619140625, "logps/rejected": -165.04623413085938, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 0.20882569253444672, "rewards/margins": -0.45509183406829834, "rewards/rejected": 0.6639175415039062, "step": 664 }, { "epoch": 0.11, "learning_rate": 9.984976019792983e-07, "logits/chosen": -0.36399540305137634, "logits/rejected": -0.35914626717567444, "logps/chosen": -60.117610931396484, "logps/rejected": -116.58653259277344, "loss": 0.808, "rewards/accuracies": 1.0, "rewards/chosen": 0.203094482421875, "rewards/margins": 0.17173156142234802, "rewards/rejected": 0.03136291727423668, "step": 665 }, { "epoch": 0.11, "learning_rate": 9.984874041134737e-07, "logits/chosen": -0.5648496150970459, "logits/rejected": -0.40702658891677856, "logps/chosen": -80.50988006591797, "logps/rejected": -176.0299072265625, "loss": 0.9368, "rewards/accuracies": 0.0, "rewards/chosen": 0.824518620967865, "rewards/margins": -1.1082069873809814, "rewards/rejected": 1.9327255487442017, "step": 666 }, { "epoch": 0.11, "learning_rate": 9.98477171806886e-07, "logits/chosen": -0.37777847051620483, "logits/rejected": -0.2851089835166931, "logps/chosen": -244.35581970214844, "logps/rejected": -103.90859985351562, "loss": 0.6376, "rewards/accuracies": 0.0, "rewards/chosen": 0.512713611125946, "rewards/margins": -0.14262926578521729, "rewards/rejected": 0.6553428769111633, "step": 667 }, { "epoch": 0.11, "learning_rate": 9.984669050602424e-07, "logits/chosen": -0.26342543959617615, "logits/rejected": -0.2639322280883789, "logps/chosen": -179.97256469726562, "logps/rejected": -162.55169677734375, "loss": 0.6746, "rewards/accuracies": 0.0, "rewards/chosen": 0.4534973204135895, "rewards/margins": -0.027116388082504272, "rewards/rejected": 0.48061370849609375, "step": 668 }, { "epoch": 0.11, "learning_rate": 9.984566038742524e-07, "logits/chosen": -0.32040104269981384, "logits/rejected": -0.2995995879173279, "logps/chosen": -204.04544067382812, "logps/rejected": -127.3862075805664, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": 0.720721423625946, "rewards/margins": 0.11684340238571167, "rewards/rejected": 0.6038780212402344, "step": 669 }, { "epoch": 0.11, "learning_rate": 9.984462682496273e-07, "logits/chosen": -0.7623414397239685, "logits/rejected": -0.7198289632797241, "logps/chosen": -150.99990844726562, "logps/rejected": -56.73026657104492, "loss": 0.7117, "rewards/accuracies": 1.0, "rewards/chosen": 0.13613586127758026, "rewards/margins": 0.0648525208234787, "rewards/rejected": 0.07128334045410156, "step": 670 }, { "epoch": 0.11, "learning_rate": 9.984358981870814e-07, "logits/chosen": -0.371764600276947, "logits/rejected": -0.3804396688938141, "logps/chosen": -108.28428649902344, "logps/rejected": -84.76790618896484, "loss": 0.4923, "rewards/accuracies": 1.0, "rewards/chosen": 0.3751052916049957, "rewards/margins": 0.05889052152633667, "rewards/rejected": 0.31621477007865906, "step": 671 }, { "epoch": 0.11, "learning_rate": 9.984254936873313e-07, "logits/chosen": -0.42776963114738464, "logits/rejected": -0.3545176386833191, "logps/chosen": -156.83065795898438, "logps/rejected": -106.54085540771484, "loss": 0.445, "rewards/accuracies": 1.0, "rewards/chosen": 1.2899277210235596, "rewards/margins": 0.5839897394180298, "rewards/rejected": 0.7059379816055298, "step": 672 }, { "epoch": 0.11, "learning_rate": 9.984150547510957e-07, "logits/chosen": -0.2851742208003998, "logits/rejected": -0.27590301632881165, "logps/chosen": -106.6006088256836, "logps/rejected": -78.69166564941406, "loss": 0.6809, "rewards/accuracies": 0.0, "rewards/chosen": 0.16678695380687714, "rewards/margins": -0.42371290922164917, "rewards/rejected": 0.5904998779296875, "step": 673 }, { "epoch": 0.11, "learning_rate": 9.984045813790958e-07, "logits/chosen": -0.6616382002830505, "logits/rejected": -0.6033777594566345, "logps/chosen": -137.037109375, "logps/rejected": -79.52883911132812, "loss": 0.4836, "rewards/accuracies": 1.0, "rewards/chosen": 0.42039185762405396, "rewards/margins": 0.13094329833984375, "rewards/rejected": 0.2894485592842102, "step": 674 }, { "epoch": 0.11, "learning_rate": 9.983940735720554e-07, "logits/chosen": -0.20934170484542847, "logits/rejected": -0.20845358073711395, "logps/chosen": -4.191601753234863, "logps/rejected": -10.152949333190918, "loss": 0.6434, "rewards/accuracies": 1.0, "rewards/chosen": 0.13662110269069672, "rewards/margins": 0.09308548271656036, "rewards/rejected": 0.043535616248846054, "step": 675 }, { "epoch": 0.11, "learning_rate": 9.983835313307e-07, "logits/chosen": -0.5788770318031311, "logits/rejected": -0.5595589876174927, "logps/chosen": -245.09353637695312, "logps/rejected": -210.555419921875, "loss": 0.4409, "rewards/accuracies": 1.0, "rewards/chosen": 1.8686126470565796, "rewards/margins": 0.36684560775756836, "rewards/rejected": 1.5017670392990112, "step": 676 }, { "epoch": 0.11, "learning_rate": 9.983729546557587e-07, "logits/chosen": -0.2705236077308655, "logits/rejected": -0.3225630819797516, "logps/chosen": -135.5359649658203, "logps/rejected": -53.92133331298828, "loss": 0.9147, "rewards/accuracies": 1.0, "rewards/chosen": 0.8289245963096619, "rewards/margins": 0.1342620849609375, "rewards/rejected": 0.6946625113487244, "step": 677 }, { "epoch": 0.11, "learning_rate": 9.983623435479618e-07, "logits/chosen": -0.3344486951828003, "logits/rejected": -0.29768186807632446, "logps/chosen": -64.10688018798828, "logps/rejected": -79.28683471679688, "loss": 0.744, "rewards/accuracies": 1.0, "rewards/chosen": 0.7987831234931946, "rewards/margins": 0.2280639410018921, "rewards/rejected": 0.5707191824913025, "step": 678 }, { "epoch": 0.11, "learning_rate": 9.983516980080425e-07, "logits/chosen": -0.3435980975627899, "logits/rejected": -0.3578556478023529, "logps/chosen": -153.80711364746094, "logps/rejected": -111.3577880859375, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 1.0439194440841675, "rewards/margins": 0.6046799421310425, "rewards/rejected": 0.439239501953125, "step": 679 }, { "epoch": 0.11, "learning_rate": 9.983410180367364e-07, "logits/chosen": -0.059843551367521286, "logits/rejected": -0.09139138460159302, "logps/chosen": -86.11964416503906, "logps/rejected": -148.3851318359375, "loss": 0.8292, "rewards/accuracies": 0.0, "rewards/chosen": 0.05259094387292862, "rewards/margins": -0.592419445514679, "rewards/rejected": 0.6450104117393494, "step": 680 }, { "epoch": 0.11, "learning_rate": 9.98330303634781e-07, "logits/chosen": -0.2184309959411621, "logits/rejected": -0.22569286823272705, "logps/chosen": -90.73319244384766, "logps/rejected": -96.06724548339844, "loss": 0.5983, "rewards/accuracies": 1.0, "rewards/chosen": 0.9661735892295837, "rewards/margins": 0.47688981890678406, "rewards/rejected": 0.4892837703227997, "step": 681 }, { "epoch": 0.11, "learning_rate": 9.983195548029172e-07, "logits/chosen": -0.3651958703994751, "logits/rejected": -0.39435720443725586, "logps/chosen": -133.2665252685547, "logps/rejected": -91.45257568359375, "loss": 0.9169, "rewards/accuracies": 0.0, "rewards/chosen": 0.10254974663257599, "rewards/margins": -0.7417404651641846, "rewards/rejected": 0.8442901968955994, "step": 682 }, { "epoch": 0.11, "learning_rate": 9.983087715418872e-07, "logits/chosen": -0.2441750019788742, "logits/rejected": -0.20366209745407104, "logps/chosen": -88.6839370727539, "logps/rejected": -52.115692138671875, "loss": 0.5387, "rewards/accuracies": 0.0, "rewards/chosen": 0.26052170991897583, "rewards/margins": -0.10658493638038635, "rewards/rejected": 0.3671066462993622, "step": 683 }, { "epoch": 0.11, "learning_rate": 9.98297953852436e-07, "logits/chosen": -0.7155749797821045, "logits/rejected": -0.6821300387382507, "logps/chosen": -112.25934600830078, "logps/rejected": -116.1638412475586, "loss": 0.6768, "rewards/accuracies": 0.0, "rewards/chosen": 0.28675612807273865, "rewards/margins": -0.36006930470466614, "rewards/rejected": 0.6468254327774048, "step": 684 }, { "epoch": 0.11, "learning_rate": 9.982871017353114e-07, "logits/chosen": -0.5273882150650024, "logits/rejected": -0.5222888588905334, "logps/chosen": -64.2834243774414, "logps/rejected": -97.42479705810547, "loss": 0.5053, "rewards/accuracies": 0.0, "rewards/chosen": 0.8294700980186462, "rewards/margins": -0.07471007108688354, "rewards/rejected": 0.9041801691055298, "step": 685 }, { "epoch": 0.11, "learning_rate": 9.982762151912626e-07, "logits/chosen": -0.6836243867874146, "logits/rejected": -0.6416645050048828, "logps/chosen": -38.753021240234375, "logps/rejected": -88.51564025878906, "loss": 0.412, "rewards/accuracies": 1.0, "rewards/chosen": 0.330496221780777, "rewards/margins": 0.6405242681503296, "rewards/rejected": -0.310028076171875, "step": 686 }, { "epoch": 0.11, "learning_rate": 9.982652942210423e-07, "logits/chosen": -0.1424139142036438, "logits/rejected": -0.09445613622665405, "logps/chosen": -60.9833984375, "logps/rejected": -51.60106658935547, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 0.865509033203125, "rewards/margins": 0.5197197198867798, "rewards/rejected": 0.3457893431186676, "step": 687 }, { "epoch": 0.11, "learning_rate": 9.982543388254046e-07, "logits/chosen": -0.2737206220626831, "logits/rejected": -0.18130061030387878, "logps/chosen": -102.37792205810547, "logps/rejected": -43.55689239501953, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 0.6591682434082031, "rewards/margins": 0.38590124249458313, "rewards/rejected": 0.27326700091362, "step": 688 }, { "epoch": 0.11, "learning_rate": 9.982433490051068e-07, "logits/chosen": -0.23917387425899506, "logits/rejected": -0.25678759813308716, "logps/chosen": -125.87113189697266, "logps/rejected": -158.92056274414062, "loss": 0.8331, "rewards/accuracies": 0.0, "rewards/chosen": 0.6275596618652344, "rewards/margins": -0.4642571210861206, "rewards/rejected": 1.091816782951355, "step": 689 }, { "epoch": 0.11, "learning_rate": 9.982323247609079e-07, "logits/chosen": -0.7234982252120972, "logits/rejected": -0.7135689854621887, "logps/chosen": -60.0470085144043, "logps/rejected": -63.021934509277344, "loss": 0.8801, "rewards/accuracies": 1.0, "rewards/chosen": 0.19617728888988495, "rewards/margins": 0.12777632474899292, "rewards/rejected": 0.06840095669031143, "step": 690 }, { "epoch": 0.11, "learning_rate": 9.982212660935697e-07, "logits/chosen": -0.48356127738952637, "logits/rejected": -0.5164543390274048, "logps/chosen": -164.3890380859375, "logps/rejected": -93.74959564208984, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.3965637683868408, "rewards/margins": 0.17914044857025146, "rewards/rejected": 1.2174233198165894, "step": 691 }, { "epoch": 0.11, "learning_rate": 9.982101730038563e-07, "logits/chosen": -0.6814239621162415, "logits/rejected": -0.6653635501861572, "logps/chosen": -51.487525939941406, "logps/rejected": -24.80990982055664, "loss": 0.3875, "rewards/accuracies": 1.0, "rewards/chosen": 0.6108120083808899, "rewards/margins": 0.46238231658935547, "rewards/rejected": 0.14842967689037323, "step": 692 }, { "epoch": 0.11, "learning_rate": 9.98199045492534e-07, "logits/chosen": -0.3520853817462921, "logits/rejected": -0.3438680171966553, "logps/chosen": -101.67562866210938, "logps/rejected": -14.372688293457031, "loss": 0.4487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9714226126670837, "rewards/margins": 0.8218014240264893, "rewards/rejected": 0.14962120354175568, "step": 693 }, { "epoch": 0.11, "learning_rate": 9.981878835603716e-07, "logits/chosen": -0.31101280450820923, "logits/rejected": -0.27007874846458435, "logps/chosen": -34.264671325683594, "logps/rejected": -8.971744537353516, "loss": 0.9354, "rewards/accuracies": 1.0, "rewards/chosen": 0.3704887330532074, "rewards/margins": 0.21697424352169037, "rewards/rejected": 0.15351448953151703, "step": 694 }, { "epoch": 0.11, "learning_rate": 9.981766872081402e-07, "logits/chosen": -0.5012122988700867, "logits/rejected": -0.4701113700866699, "logps/chosen": -41.35021209716797, "logps/rejected": -38.060340881347656, "loss": 0.6349, "rewards/accuracies": 1.0, "rewards/chosen": 0.5891372561454773, "rewards/margins": 0.04327046871185303, "rewards/rejected": 0.5458667874336243, "step": 695 }, { "epoch": 0.11, "learning_rate": 9.981654564366138e-07, "logits/chosen": -0.32884347438812256, "logits/rejected": -0.29189759492874146, "logps/chosen": -58.276676177978516, "logps/rejected": -146.23976135253906, "loss": 0.7465, "rewards/accuracies": 1.0, "rewards/chosen": 0.3348316252231598, "rewards/margins": 0.48338812589645386, "rewards/rejected": -0.14855651557445526, "step": 696 }, { "epoch": 0.11, "learning_rate": 9.981541912465679e-07, "logits/chosen": -0.1540413200855255, "logits/rejected": -0.13936088979244232, "logps/chosen": -74.8268051147461, "logps/rejected": -80.78663635253906, "loss": 0.8014, "rewards/accuracies": 0.0, "rewards/chosen": 0.1923423856496811, "rewards/margins": -0.3353568911552429, "rewards/rejected": 0.5276992917060852, "step": 697 }, { "epoch": 0.11, "learning_rate": 9.98142891638781e-07, "logits/chosen": -0.7494341135025024, "logits/rejected": -0.7159179449081421, "logps/chosen": -48.744468688964844, "logps/rejected": -58.36897659301758, "loss": 0.8253, "rewards/accuracies": 0.0, "rewards/chosen": 0.6036766171455383, "rewards/margins": -0.36691319942474365, "rewards/rejected": 0.970589816570282, "step": 698 }, { "epoch": 0.11, "learning_rate": 9.98131557614034e-07, "logits/chosen": -0.40194106101989746, "logits/rejected": -0.13559924066066742, "logps/chosen": -90.10868835449219, "logps/rejected": -187.68362426757812, "loss": 1.1206, "rewards/accuracies": 0.0, "rewards/chosen": 0.26669007539749146, "rewards/margins": -1.2614715099334717, "rewards/rejected": 1.528161644935608, "step": 699 }, { "epoch": 0.11, "learning_rate": 9.981201891731093e-07, "logits/chosen": -0.33622488379478455, "logits/rejected": -0.3150920867919922, "logps/chosen": -66.39161682128906, "logps/rejected": -82.99845886230469, "loss": 0.76, "rewards/accuracies": 1.0, "rewards/chosen": 0.24097824096679688, "rewards/margins": 0.007244870066642761, "rewards/rejected": 0.2337333709001541, "step": 700 }, { "epoch": 0.11, "learning_rate": 9.98108786316793e-07, "logits/chosen": -0.5835373997688293, "logits/rejected": -0.6516551971435547, "logps/chosen": -133.445068359375, "logps/rejected": -109.0105209350586, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 1.3644531965255737, "rewards/margins": 1.217464566230774, "rewards/rejected": 0.1469886749982834, "step": 701 }, { "epoch": 0.11, "learning_rate": 9.980973490458728e-07, "logits/chosen": -0.5277565717697144, "logits/rejected": -0.5277565717697144, "logps/chosen": -62.808250427246094, "logps/rejected": -62.808250427246094, "loss": 0.5694, "rewards/accuracies": 0.0, "rewards/chosen": 0.33833542466163635, "rewards/margins": 0.0, "rewards/rejected": 0.33833542466163635, "step": 702 }, { "epoch": 0.11, "learning_rate": 9.980858773611388e-07, "logits/chosen": -0.27687379717826843, "logits/rejected": -0.25653696060180664, "logps/chosen": -81.59989166259766, "logps/rejected": -89.65660095214844, "loss": 0.7814, "rewards/accuracies": 0.0, "rewards/chosen": 0.4850784242153168, "rewards/margins": -0.4786483943462372, "rewards/rejected": 0.963726818561554, "step": 703 }, { "epoch": 0.11, "learning_rate": 9.980743712633833e-07, "logits/chosen": -0.7098524570465088, "logits/rejected": -0.7712257504463196, "logps/chosen": -160.25860595703125, "logps/rejected": -152.57907104492188, "loss": 0.6493, "rewards/accuracies": 0.0, "rewards/chosen": 0.7260376214981079, "rewards/margins": -0.2771393060684204, "rewards/rejected": 1.0031769275665283, "step": 704 }, { "epoch": 0.11, "learning_rate": 9.980628307534018e-07, "logits/chosen": -0.47614461183547974, "logits/rejected": -0.45325687527656555, "logps/chosen": -165.340087890625, "logps/rejected": -83.02822875976562, "loss": 0.4597, "rewards/accuracies": 1.0, "rewards/chosen": 0.9024032950401306, "rewards/margins": 0.2819107174873352, "rewards/rejected": 0.6204925775527954, "step": 705 }, { "epoch": 0.11, "learning_rate": 9.980512558319915e-07, "logits/chosen": -0.28809693455696106, "logits/rejected": -0.28809693455696106, "logps/chosen": -49.61867904663086, "logps/rejected": -49.61867904663086, "loss": 0.6548, "rewards/accuracies": 0.0, "rewards/chosen": 0.299642950296402, "rewards/margins": 0.0, "rewards/rejected": 0.299642950296402, "step": 706 }, { "epoch": 0.11, "learning_rate": 9.98039646499952e-07, "logits/chosen": -0.6952453851699829, "logits/rejected": -0.6822656393051147, "logps/chosen": -126.50914001464844, "logps/rejected": -87.18505859375, "loss": 0.7303, "rewards/accuracies": 0.0, "rewards/chosen": 0.3629318177700043, "rewards/margins": -0.35059359669685364, "rewards/rejected": 0.7135254144668579, "step": 707 }, { "epoch": 0.11, "learning_rate": 9.980280027580851e-07, "logits/chosen": -0.27174147963523865, "logits/rejected": -0.31754070520401, "logps/chosen": -84.47334289550781, "logps/rejected": -66.27742004394531, "loss": 0.9497, "rewards/accuracies": 0.0, "rewards/chosen": 0.21586303412914276, "rewards/margins": -0.8946502804756165, "rewards/rejected": 1.1105133295059204, "step": 708 }, { "epoch": 0.12, "learning_rate": 9.98016324607196e-07, "logits/chosen": -0.2230827659368515, "logits/rejected": -0.17165179550647736, "logps/chosen": -84.31852722167969, "logps/rejected": -100.86268615722656, "loss": 0.8868, "rewards/accuracies": 0.0, "rewards/chosen": 0.13129806518554688, "rewards/margins": -0.2752784788608551, "rewards/rejected": 0.406576544046402, "step": 709 }, { "epoch": 0.12, "learning_rate": 9.98004612048091e-07, "logits/chosen": -0.5388430953025818, "logits/rejected": -0.5184017419815063, "logps/chosen": -78.68995666503906, "logps/rejected": -23.85519027709961, "loss": 0.5703, "rewards/accuracies": 1.0, "rewards/chosen": 0.5385376214981079, "rewards/margins": 0.3500898480415344, "rewards/rejected": 0.1884477585554123, "step": 710 }, { "epoch": 0.12, "learning_rate": 9.979928650815795e-07, "logits/chosen": -0.7701398134231567, "logits/rejected": -0.72078937292099, "logps/chosen": -69.70431518554688, "logps/rejected": -96.922119140625, "loss": 0.9499, "rewards/accuracies": 0.0, "rewards/chosen": 0.8841491937637329, "rewards/margins": -0.33277738094329834, "rewards/rejected": 1.2169265747070312, "step": 711 }, { "epoch": 0.12, "learning_rate": 9.97981083708473e-07, "logits/chosen": -0.04058830067515373, "logits/rejected": -0.04058830067515373, "logps/chosen": -114.87493133544922, "logps/rejected": -114.87493133544922, "loss": 0.6406, "rewards/accuracies": 0.0, "rewards/chosen": 0.0905052199959755, "rewards/margins": 0.0, "rewards/rejected": 0.0905052199959755, "step": 712 }, { "epoch": 0.12, "learning_rate": 9.979692679295854e-07, "logits/chosen": -0.11830458045005798, "logits/rejected": -0.11803644150495529, "logps/chosen": -3.7596969604492188, "logps/rejected": -6.305907249450684, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.23742714524269104, "rewards/margins": -0.03689965605735779, "rewards/rejected": 0.27432680130004883, "step": 713 }, { "epoch": 0.12, "learning_rate": 9.979574177457335e-07, "logits/chosen": -0.4660382866859436, "logits/rejected": -0.5202248692512512, "logps/chosen": -67.275634765625, "logps/rejected": -52.280853271484375, "loss": 0.5992, "rewards/accuracies": 0.0, "rewards/chosen": 0.29720765352249146, "rewards/margins": -0.18653830885887146, "rewards/rejected": 0.4837459623813629, "step": 714 }, { "epoch": 0.12, "learning_rate": 9.979455331577359e-07, "logits/chosen": -0.23615677654743195, "logits/rejected": -0.21863707900047302, "logps/chosen": -83.42327880859375, "logps/rejected": -71.40453338623047, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.7390373349189758, "rewards/margins": 0.2990669310092926, "rewards/rejected": 0.4399704039096832, "step": 715 }, { "epoch": 0.12, "learning_rate": 9.979336141664131e-07, "logits/chosen": -0.34351304173469543, "logits/rejected": -0.33148032426834106, "logps/chosen": -114.49085235595703, "logps/rejected": -71.30384063720703, "loss": 0.9489, "rewards/accuracies": 0.0, "rewards/chosen": 0.052614595741033554, "rewards/margins": -0.9464393854141235, "rewards/rejected": 0.999053955078125, "step": 716 }, { "epoch": 0.12, "learning_rate": 9.979216607725894e-07, "logits/chosen": -0.5129144191741943, "logits/rejected": -0.6506532430648804, "logps/chosen": -190.70204162597656, "logps/rejected": -121.3319320678711, "loss": 0.78, "rewards/accuracies": 0.0, "rewards/chosen": 0.6799820065498352, "rewards/margins": -0.45180433988571167, "rewards/rejected": 1.1317863464355469, "step": 717 }, { "epoch": 0.12, "learning_rate": 9.979096729770901e-07, "logits/chosen": -0.1812610924243927, "logits/rejected": -0.15224818885326385, "logps/chosen": -123.86058807373047, "logps/rejected": -152.9120330810547, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 0.7466530203819275, "rewards/margins": 0.31987231969833374, "rewards/rejected": 0.42678070068359375, "step": 718 }, { "epoch": 0.12, "learning_rate": 9.978976507807437e-07, "logits/chosen": -0.4619564414024353, "logits/rejected": -0.4658797085285187, "logps/chosen": -100.67180633544922, "logps/rejected": -175.77978515625, "loss": 0.86, "rewards/accuracies": 0.0, "rewards/chosen": 1.0411598682403564, "rewards/margins": -0.859246015548706, "rewards/rejected": 1.9004058837890625, "step": 719 }, { "epoch": 0.12, "learning_rate": 9.97885594184381e-07, "logits/chosen": -0.5464891791343689, "logits/rejected": -0.4994102716445923, "logps/chosen": -93.2345199584961, "logps/rejected": -89.16522216796875, "loss": 0.4141, "rewards/accuracies": 1.0, "rewards/chosen": 1.0507110357284546, "rewards/margins": 0.3249663710594177, "rewards/rejected": 0.7257446646690369, "step": 720 }, { "epoch": 0.12, "learning_rate": 9.978735031888345e-07, "logits/chosen": -0.38428372144699097, "logits/rejected": -0.36756035685539246, "logps/chosen": -65.10086822509766, "logps/rejected": -95.91710662841797, "loss": 0.8405, "rewards/accuracies": 1.0, "rewards/chosen": 0.8310295343399048, "rewards/margins": 0.4019058346748352, "rewards/rejected": 0.4291236996650696, "step": 721 }, { "epoch": 0.12, "learning_rate": 9.9786137779494e-07, "logits/chosen": -0.605602502822876, "logits/rejected": -0.5725057125091553, "logps/chosen": -100.98318481445312, "logps/rejected": -31.332910537719727, "loss": 0.5922, "rewards/accuracies": 1.0, "rewards/chosen": 0.2633415162563324, "rewards/margins": 0.1641576588153839, "rewards/rejected": 0.09918384999036789, "step": 722 }, { "epoch": 0.12, "learning_rate": 9.97849218003535e-07, "logits/chosen": -0.3704640567302704, "logits/rejected": -0.3467597961425781, "logps/chosen": -64.80332946777344, "logps/rejected": -78.84757995605469, "loss": 0.8468, "rewards/accuracies": 0.0, "rewards/chosen": 0.6719909906387329, "rewards/margins": -0.1323501467704773, "rewards/rejected": 0.8043411374092102, "step": 723 }, { "epoch": 0.12, "learning_rate": 9.9783702381546e-07, "logits/chosen": -0.4217956066131592, "logits/rejected": -0.43497976660728455, "logps/chosen": -112.00299835205078, "logps/rejected": -154.77430725097656, "loss": 0.6244, "rewards/accuracies": 0.0, "rewards/chosen": 0.7757682800292969, "rewards/margins": -0.034954071044921875, "rewards/rejected": 0.8107223510742188, "step": 724 }, { "epoch": 0.12, "learning_rate": 9.978247952315568e-07, "logits/chosen": -1.0639535188674927, "logits/rejected": -1.0430701971054077, "logps/chosen": -92.4710922241211, "logps/rejected": -81.90132904052734, "loss": 0.5833, "rewards/accuracies": 0.0, "rewards/chosen": 0.2943015992641449, "rewards/margins": -0.12919771671295166, "rewards/rejected": 0.42349931597709656, "step": 725 }, { "epoch": 0.12, "learning_rate": 9.97812532252671e-07, "logits/chosen": -0.6223391890525818, "logits/rejected": -0.6200693845748901, "logps/chosen": -38.533626556396484, "logps/rejected": -78.00508880615234, "loss": 0.8526, "rewards/accuracies": 0.0, "rewards/chosen": 0.5838283896446228, "rewards/margins": -0.15699422359466553, "rewards/rejected": 0.7408226132392883, "step": 726 }, { "epoch": 0.12, "learning_rate": 9.978002348796494e-07, "logits/chosen": -0.6066798567771912, "logits/rejected": -0.5684143304824829, "logps/chosen": -65.95474243164062, "logps/rejected": -102.86715698242188, "loss": 0.8189, "rewards/accuracies": 0.0, "rewards/chosen": 0.005368042271584272, "rewards/margins": -0.2995010316371918, "rewards/rejected": 0.3048690855503082, "step": 727 }, { "epoch": 0.12, "learning_rate": 9.97787903113342e-07, "logits/chosen": -0.19260936975479126, "logits/rejected": -0.08051933348178864, "logps/chosen": -52.46713638305664, "logps/rejected": -22.013778686523438, "loss": 0.4188, "rewards/accuracies": 1.0, "rewards/chosen": 0.8623634576797485, "rewards/margins": 0.751568615436554, "rewards/rejected": 0.11079483479261398, "step": 728 }, { "epoch": 0.12, "learning_rate": 9.977755369546006e-07, "logits/chosen": -0.5247380137443542, "logits/rejected": -0.5247380137443542, "logps/chosen": -46.1383056640625, "logps/rejected": -46.1383056640625, "loss": 0.7359, "rewards/accuracies": 0.0, "rewards/chosen": 0.5847198367118835, "rewards/margins": 0.0, "rewards/rejected": 0.5847198367118835, "step": 729 }, { "epoch": 0.12, "learning_rate": 9.977631364042794e-07, "logits/chosen": -0.3161120116710663, "logits/rejected": -0.22797030210494995, "logps/chosen": -103.64878845214844, "logps/rejected": -101.72062683105469, "loss": 0.8097, "rewards/accuracies": 0.0, "rewards/chosen": 0.11351318657398224, "rewards/margins": -0.2969474792480469, "rewards/rejected": 0.4104606807231903, "step": 730 }, { "epoch": 0.12, "learning_rate": 9.977507014632355e-07, "logits/chosen": -0.4113922417163849, "logits/rejected": -0.39817363023757935, "logps/chosen": -102.00245666503906, "logps/rejected": -80.34123229980469, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": 0.7138771414756775, "rewards/margins": 0.18900835514068604, "rewards/rejected": 0.5248687863349915, "step": 731 }, { "epoch": 0.12, "learning_rate": 9.977382321323277e-07, "logits/chosen": -0.46865397691726685, "logits/rejected": -0.45022499561309814, "logps/chosen": -29.956546783447266, "logps/rejected": -26.181806564331055, "loss": 0.5908, "rewards/accuracies": 1.0, "rewards/chosen": 0.4070892333984375, "rewards/margins": 0.0344032347202301, "rewards/rejected": 0.3726859986782074, "step": 732 }, { "epoch": 0.12, "learning_rate": 9.97725728412418e-07, "logits/chosen": -0.6061071753501892, "logits/rejected": -0.6172945499420166, "logps/chosen": -62.578163146972656, "logps/rejected": -14.934682846069336, "loss": 0.5141, "rewards/accuracies": 0.0, "rewards/chosen": 0.2675109803676605, "rewards/margins": -0.05823346972465515, "rewards/rejected": 0.3257444500923157, "step": 733 }, { "epoch": 0.12, "learning_rate": 9.977131903043698e-07, "logits/chosen": -0.457844078540802, "logits/rejected": -0.428212434053421, "logps/chosen": -103.40982055664062, "logps/rejected": -44.147193908691406, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 0.9439491629600525, "rewards/margins": 0.5043430328369141, "rewards/rejected": 0.43960610032081604, "step": 734 }, { "epoch": 0.12, "learning_rate": 9.977006178090497e-07, "logits/chosen": -0.6679788827896118, "logits/rejected": -0.5944578647613525, "logps/chosen": -187.63641357421875, "logps/rejected": -256.77655029296875, "loss": 0.9933, "rewards/accuracies": 0.0, "rewards/chosen": 0.603894054889679, "rewards/margins": -1.2874970436096191, "rewards/rejected": 1.8913910388946533, "step": 735 }, { "epoch": 0.12, "learning_rate": 9.97688010927326e-07, "logits/chosen": -0.45311239361763, "logits/rejected": -0.43051227927207947, "logps/chosen": -145.81483459472656, "logps/rejected": -87.71959686279297, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 1.290370225906372, "rewards/margins": 0.4533226490020752, "rewards/rejected": 0.8370475769042969, "step": 736 }, { "epoch": 0.12, "learning_rate": 9.9767536966007e-07, "logits/chosen": -0.5495651960372925, "logits/rejected": -0.6005272269248962, "logps/chosen": -147.47535705566406, "logps/rejected": -159.81585693359375, "loss": 0.9404, "rewards/accuracies": 0.0, "rewards/chosen": 0.40548402070999146, "rewards/margins": -0.7348083853721619, "rewards/rejected": 1.1402924060821533, "step": 737 }, { "epoch": 0.12, "learning_rate": 9.97662694008155e-07, "logits/chosen": -0.2920229434967041, "logits/rejected": -0.2921912670135498, "logps/chosen": -74.05195617675781, "logps/rejected": -151.30606079101562, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 0.5079727172851562, "rewards/margins": 0.6341232061386108, "rewards/rejected": -0.12615051865577698, "step": 738 }, { "epoch": 0.12, "learning_rate": 9.976499839724569e-07, "logits/chosen": -0.05212375149130821, "logits/rejected": -0.030256086960434914, "logps/chosen": -116.79753112792969, "logps/rejected": -71.09473419189453, "loss": 1.0856, "rewards/accuracies": 0.0, "rewards/chosen": 0.5232086181640625, "rewards/margins": -0.09567338228225708, "rewards/rejected": 0.6188820004463196, "step": 739 }, { "epoch": 0.12, "learning_rate": 9.976372395538535e-07, "logits/chosen": -0.483311265707016, "logits/rejected": -0.41479983925819397, "logps/chosen": -32.730751037597656, "logps/rejected": -61.81577682495117, "loss": 0.8811, "rewards/accuracies": 0.0, "rewards/chosen": 0.13773345947265625, "rewards/margins": -0.8400676846504211, "rewards/rejected": 0.9778011441230774, "step": 740 }, { "epoch": 0.12, "learning_rate": 9.976244607532257e-07, "logits/chosen": -0.33684757351875305, "logits/rejected": -0.38321056962013245, "logps/chosen": -169.9423828125, "logps/rejected": -58.76321792602539, "loss": 0.6028, "rewards/accuracies": 0.0, "rewards/chosen": 0.5610870718955994, "rewards/margins": -0.21153903007507324, "rewards/rejected": 0.7726261019706726, "step": 741 }, { "epoch": 0.12, "learning_rate": 9.976116475714563e-07, "logits/chosen": -0.4751092791557312, "logits/rejected": -0.45506560802459717, "logps/chosen": -75.8924560546875, "logps/rejected": -37.3307991027832, "loss": 0.3295, "rewards/accuracies": 1.0, "rewards/chosen": 0.8578140139579773, "rewards/margins": 0.7764759063720703, "rewards/rejected": 0.08133812248706818, "step": 742 }, { "epoch": 0.12, "learning_rate": 9.9759880000943e-07, "logits/chosen": -0.3695339858531952, "logits/rejected": -0.35268262028694153, "logps/chosen": -68.9068374633789, "logps/rejected": -49.522579193115234, "loss": 0.653, "rewards/accuracies": 1.0, "rewards/chosen": 0.49451372027397156, "rewards/margins": 0.07325783371925354, "rewards/rejected": 0.421255886554718, "step": 743 }, { "epoch": 0.12, "learning_rate": 9.975859180680355e-07, "logits/chosen": -0.5421739816665649, "logits/rejected": -0.36857837438583374, "logps/chosen": -130.13172912597656, "logps/rejected": -124.76920318603516, "loss": 0.7833, "rewards/accuracies": 0.0, "rewards/chosen": 1.1854079961776733, "rewards/margins": -0.12786483764648438, "rewards/rejected": 1.3132728338241577, "step": 744 }, { "epoch": 0.12, "learning_rate": 9.97573001748162e-07, "logits/chosen": -0.36686262488365173, "logits/rejected": -0.3751752972602844, "logps/chosen": -77.79859924316406, "logps/rejected": -98.76263427734375, "loss": 0.5405, "rewards/accuracies": 0.0, "rewards/chosen": 0.6707077026367188, "rewards/margins": -0.2740311026573181, "rewards/rejected": 0.9447388052940369, "step": 745 }, { "epoch": 0.12, "learning_rate": 9.975600510507024e-07, "logits/chosen": -0.796753466129303, "logits/rejected": -0.7228917479515076, "logps/chosen": -54.589599609375, "logps/rejected": -128.6192626953125, "loss": 0.7643, "rewards/accuracies": 0.0, "rewards/chosen": 0.825592041015625, "rewards/margins": -0.07908022403717041, "rewards/rejected": 0.9046722650527954, "step": 746 }, { "epoch": 0.12, "learning_rate": 9.97547065976551e-07, "logits/chosen": -0.21084356307983398, "logits/rejected": -0.10243070870637894, "logps/chosen": -60.54407501220703, "logps/rejected": -60.07172775268555, "loss": 0.699, "rewards/accuracies": 1.0, "rewards/chosen": 0.8054115176200867, "rewards/margins": 0.05205798149108887, "rewards/rejected": 0.7533535361289978, "step": 747 }, { "epoch": 0.12, "learning_rate": 9.975340465266053e-07, "logits/chosen": -0.7200853228569031, "logits/rejected": -0.6782549023628235, "logps/chosen": -50.355445861816406, "logps/rejected": -58.05514907836914, "loss": 0.5626, "rewards/accuracies": 1.0, "rewards/chosen": 0.9633850455284119, "rewards/margins": 0.4960213005542755, "rewards/rejected": 0.46736374497413635, "step": 748 }, { "epoch": 0.12, "learning_rate": 9.975209927017646e-07, "logits/chosen": -0.4280184507369995, "logits/rejected": -0.39317816495895386, "logps/chosen": -126.66816711425781, "logps/rejected": -100.84758758544922, "loss": 0.7866, "rewards/accuracies": 0.0, "rewards/chosen": 0.34023284912109375, "rewards/margins": -0.34557878971099854, "rewards/rejected": 0.6858116388320923, "step": 749 }, { "epoch": 0.12, "learning_rate": 9.97507904502931e-07, "logits/chosen": -0.10516387969255447, "logits/rejected": -0.01997874490916729, "logps/chosen": -149.845703125, "logps/rejected": -38.91858673095703, "loss": 0.6164, "rewards/accuracies": 1.0, "rewards/chosen": 1.63078773021698, "rewards/margins": 1.3846542835235596, "rewards/rejected": 0.24613343179225922, "step": 750 }, { "epoch": 0.12, "learning_rate": 9.974947819310084e-07, "logits/chosen": -0.04093051329255104, "logits/rejected": -0.04670485109090805, "logps/chosen": -9.80473804473877, "logps/rejected": -6.694552898406982, "loss": 0.7765, "rewards/accuracies": 0.0, "rewards/chosen": -0.02672596089541912, "rewards/margins": -0.14647512137889862, "rewards/rejected": 0.11974916607141495, "step": 751 }, { "epoch": 0.12, "learning_rate": 9.97481624986904e-07, "logits/chosen": -0.5759952664375305, "logits/rejected": -0.5681008696556091, "logps/chosen": -126.63766479492188, "logps/rejected": -87.91337585449219, "loss": 0.844, "rewards/accuracies": 0.0, "rewards/chosen": 0.25982666015625, "rewards/margins": -0.5296714901924133, "rewards/rejected": 0.7894981503486633, "step": 752 }, { "epoch": 0.12, "learning_rate": 9.974684336715264e-07, "logits/chosen": -0.23386946320533752, "logits/rejected": -0.32755765318870544, "logps/chosen": -111.52996826171875, "logps/rejected": -181.73045349121094, "loss": 0.7209, "rewards/accuracies": 0.0, "rewards/chosen": 0.33037492632865906, "rewards/margins": -0.582262396812439, "rewards/rejected": 0.9126373529434204, "step": 753 }, { "epoch": 0.12, "learning_rate": 9.974552079857871e-07, "logits/chosen": -0.414320707321167, "logits/rejected": -0.4499983787536621, "logps/chosen": -45.30230712890625, "logps/rejected": -39.03960037231445, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.6954239010810852, "rewards/margins": 0.03353464603424072, "rewards/rejected": 0.6618892550468445, "step": 754 }, { "epoch": 0.12, "learning_rate": 9.974419479306e-07, "logits/chosen": -0.3394632339477539, "logits/rejected": -0.23288725316524506, "logps/chosen": -206.01806640625, "logps/rejected": -53.81181335449219, "loss": 0.46, "rewards/accuracies": 1.0, "rewards/chosen": 0.96881103515625, "rewards/margins": 0.5592395663261414, "rewards/rejected": 0.40957146883010864, "step": 755 }, { "epoch": 0.12, "learning_rate": 9.97428653506881e-07, "logits/chosen": -0.48279714584350586, "logits/rejected": -0.49925217032432556, "logps/chosen": -70.1395263671875, "logps/rejected": -100.5390625, "loss": 0.8094, "rewards/accuracies": 1.0, "rewards/chosen": 0.22467041015625, "rewards/margins": 0.12920531630516052, "rewards/rejected": 0.09546508640050888, "step": 756 }, { "epoch": 0.12, "learning_rate": 9.974153247155487e-07, "logits/chosen": -0.3266132175922394, "logits/rejected": -0.3389873802661896, "logps/chosen": -88.65182495117188, "logps/rejected": -72.23348999023438, "loss": 0.8601, "rewards/accuracies": 0.0, "rewards/chosen": 0.6469619870185852, "rewards/margins": -0.4846206307411194, "rewards/rejected": 1.1315826177597046, "step": 757 }, { "epoch": 0.12, "learning_rate": 9.974019615575243e-07, "logits/chosen": 0.034758467227220535, "logits/rejected": 0.07353061437606812, "logps/chosen": -54.5823974609375, "logps/rejected": -53.49596405029297, "loss": 0.4314, "rewards/accuracies": 1.0, "rewards/chosen": 0.7435138821601868, "rewards/margins": 0.6531990170478821, "rewards/rejected": 0.09031486511230469, "step": 758 }, { "epoch": 0.12, "learning_rate": 9.973885640337307e-07, "logits/chosen": -0.7048351764678955, "logits/rejected": -0.7010761499404907, "logps/chosen": -120.51258087158203, "logps/rejected": -17.69342041015625, "loss": 0.5551, "rewards/accuracies": 1.0, "rewards/chosen": 0.5257034301757812, "rewards/margins": 0.4248100221157074, "rewards/rejected": 0.10089340060949326, "step": 759 }, { "epoch": 0.12, "learning_rate": 9.973751321450935e-07, "logits/chosen": -0.3681710958480835, "logits/rejected": -0.3681710958480835, "logps/chosen": -192.09478759765625, "logps/rejected": -192.09478759765625, "loss": 0.6035, "rewards/accuracies": 0.0, "rewards/chosen": 0.31522828340530396, "rewards/margins": 0.0, "rewards/rejected": 0.31522828340530396, "step": 760 }, { "epoch": 0.12, "learning_rate": 9.973616658925412e-07, "logits/chosen": -0.6414446234703064, "logits/rejected": -0.16503234207630157, "logps/chosen": -93.1842041015625, "logps/rejected": -118.18673706054688, "loss": 1.1264, "rewards/accuracies": 0.0, "rewards/chosen": 0.594678521156311, "rewards/margins": -0.36774975061416626, "rewards/rejected": 0.9624282717704773, "step": 761 }, { "epoch": 0.12, "learning_rate": 9.973481652770038e-07, "logits/chosen": -0.429164856672287, "logits/rejected": -0.42606404423713684, "logps/chosen": -63.87112808227539, "logps/rejected": -79.70624542236328, "loss": 0.7591, "rewards/accuracies": 0.0, "rewards/chosen": 0.8527935147285461, "rewards/margins": -0.17255741357803345, "rewards/rejected": 1.0253509283065796, "step": 762 }, { "epoch": 0.12, "learning_rate": 9.973346302994139e-07, "logits/chosen": -0.38256317377090454, "logits/rejected": -0.3847081959247589, "logps/chosen": -164.8197021484375, "logps/rejected": -133.97500610351562, "loss": 0.5834, "rewards/accuracies": 0.0, "rewards/chosen": 0.7063751220703125, "rewards/margins": -0.3323456048965454, "rewards/rejected": 1.038720726966858, "step": 763 }, { "epoch": 0.12, "learning_rate": 9.97321060960707e-07, "logits/chosen": -0.23031611740589142, "logits/rejected": -0.3106729984283447, "logps/chosen": -159.4802703857422, "logps/rejected": -182.397705078125, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 0.6884750723838806, "rewards/margins": 0.5418747067451477, "rewards/rejected": 0.14660035073757172, "step": 764 }, { "epoch": 0.12, "learning_rate": 9.973074572618204e-07, "logits/chosen": -0.541104793548584, "logits/rejected": -0.52244633436203, "logps/chosen": -112.29867553710938, "logps/rejected": -130.71157836914062, "loss": 0.7406, "rewards/accuracies": 0.0, "rewards/chosen": 0.4445663392543793, "rewards/margins": -0.3689377009868622, "rewards/rejected": 0.8135040402412415, "step": 765 }, { "epoch": 0.12, "learning_rate": 9.972938192036944e-07, "logits/chosen": -0.047792695462703705, "logits/rejected": -0.12870632112026215, "logps/chosen": -217.7996368408203, "logps/rejected": -59.451683044433594, "loss": 0.3059, "rewards/accuracies": 1.0, "rewards/chosen": 1.902607798576355, "rewards/margins": 1.3427612781524658, "rewards/rejected": 0.5598465204238892, "step": 766 }, { "epoch": 0.12, "learning_rate": 9.972801467872705e-07, "logits/chosen": -0.05902973935008049, "logits/rejected": -0.05902973935008049, "logps/chosen": -77.4661865234375, "logps/rejected": -77.4661865234375, "loss": 0.7376, "rewards/accuracies": 0.0, "rewards/chosen": 0.10740890353918076, "rewards/margins": 0.0, "rewards/rejected": 0.10740890353918076, "step": 767 }, { "epoch": 0.12, "learning_rate": 9.97266440013494e-07, "logits/chosen": -0.07127434760332108, "logits/rejected": -0.07176133245229721, "logps/chosen": -98.77645111083984, "logps/rejected": -71.8453598022461, "loss": 1.0982, "rewards/accuracies": 0.0, "rewards/chosen": 0.25070419907569885, "rewards/margins": -0.4439682066440582, "rewards/rejected": 0.6946724057197571, "step": 768 }, { "epoch": 0.12, "learning_rate": 9.972526988833117e-07, "logits/chosen": -0.20745854079723358, "logits/rejected": -0.2546190619468689, "logps/chosen": -143.59632873535156, "logps/rejected": -74.17025756835938, "loss": 1.0094, "rewards/accuracies": 1.0, "rewards/chosen": 0.280792236328125, "rewards/margins": 0.23261871933937073, "rewards/rejected": 0.04817352443933487, "step": 769 }, { "epoch": 0.12, "learning_rate": 9.972389233976729e-07, "logits/chosen": -0.2568525969982147, "logits/rejected": -0.28050610423088074, "logps/chosen": -123.16567993164062, "logps/rejected": -113.48464965820312, "loss": 0.5462, "rewards/accuracies": 1.0, "rewards/chosen": 0.02975006215274334, "rewards/margins": 0.00642242468893528, "rewards/rejected": 0.02332763746380806, "step": 770 }, { "epoch": 0.13, "learning_rate": 9.972251135575293e-07, "logits/chosen": -0.2518429458141327, "logits/rejected": -0.2513241171836853, "logps/chosen": -4.733585834503174, "logps/rejected": -3.3229875564575195, "loss": 1.0183, "rewards/accuracies": 0.0, "rewards/chosen": 0.0758490115404129, "rewards/margins": -0.07427649199962616, "rewards/rejected": 0.15012550354003906, "step": 771 }, { "epoch": 0.13, "learning_rate": 9.972112693638352e-07, "logits/chosen": -0.4002718925476074, "logits/rejected": -0.39284855127334595, "logps/chosen": -34.39548110961914, "logps/rejected": -43.639869689941406, "loss": 0.7154, "rewards/accuracies": 1.0, "rewards/chosen": 0.6367992758750916, "rewards/margins": 0.15393564105033875, "rewards/rejected": 0.4828636348247528, "step": 772 }, { "epoch": 0.13, "learning_rate": 9.971973908175471e-07, "logits/chosen": -0.5363918542861938, "logits/rejected": -0.48046720027923584, "logps/chosen": -57.45812225341797, "logps/rejected": -84.986572265625, "loss": 0.5469, "rewards/accuracies": 1.0, "rewards/chosen": 0.2690528929233551, "rewards/margins": 0.21659547090530396, "rewards/rejected": 0.052457429468631744, "step": 773 }, { "epoch": 0.13, "learning_rate": 9.971834779196237e-07, "logits/chosen": -0.3558809757232666, "logits/rejected": -0.34273645281791687, "logps/chosen": -67.72076416015625, "logps/rejected": -45.52008819580078, "loss": 0.8186, "rewards/accuracies": 0.0, "rewards/chosen": 0.7190643548965454, "rewards/margins": -0.15457457304000854, "rewards/rejected": 0.873638927936554, "step": 774 }, { "epoch": 0.13, "learning_rate": 9.971695306710267e-07, "logits/chosen": -0.2237321436405182, "logits/rejected": -0.23206360638141632, "logps/chosen": -43.4703254699707, "logps/rejected": -80.5172348022461, "loss": 1.0987, "rewards/accuracies": 0.0, "rewards/chosen": 0.7287918329238892, "rewards/margins": -0.160649836063385, "rewards/rejected": 0.8894416689872742, "step": 775 }, { "epoch": 0.13, "learning_rate": 9.97155549072719e-07, "logits/chosen": -0.09577123075723648, "logits/rejected": -0.09577123075723648, "logps/chosen": -20.042652130126953, "logps/rejected": -20.042652130126953, "loss": 0.6774, "rewards/accuracies": 0.0, "rewards/chosen": 0.03932991251349449, "rewards/margins": 0.0, "rewards/rejected": 0.03932991251349449, "step": 776 }, { "epoch": 0.13, "learning_rate": 9.971415331256672e-07, "logits/chosen": -0.6027885675430298, "logits/rejected": -1.0340569019317627, "logps/chosen": -118.9892578125, "logps/rejected": -38.865360260009766, "loss": 0.773, "rewards/accuracies": 1.0, "rewards/chosen": 0.0050292969681322575, "rewards/margins": 0.008815002627670765, "rewards/rejected": -0.0037857056595385075, "step": 777 }, { "epoch": 0.13, "learning_rate": 9.971274828308393e-07, "logits/chosen": -0.20986908674240112, "logits/rejected": -0.20574577152729034, "logps/chosen": -72.63373565673828, "logps/rejected": -69.53253173828125, "loss": 0.522, "rewards/accuracies": 1.0, "rewards/chosen": 0.9270698428153992, "rewards/margins": 0.06642073392868042, "rewards/rejected": 0.8606491088867188, "step": 778 }, { "epoch": 0.13, "learning_rate": 9.971133981892065e-07, "logits/chosen": -0.09715074300765991, "logits/rejected": -0.10380624234676361, "logps/chosen": -89.74010467529297, "logps/rejected": -115.85496520996094, "loss": 0.7614, "rewards/accuracies": 0.0, "rewards/chosen": 0.5487373471260071, "rewards/margins": -0.3765457272529602, "rewards/rejected": 0.9252830743789673, "step": 779 }, { "epoch": 0.13, "learning_rate": 9.970992792017412e-07, "logits/chosen": -0.46050944924354553, "logits/rejected": -0.45501115918159485, "logps/chosen": -48.133758544921875, "logps/rejected": -34.145137786865234, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 0.44012412428855896, "rewards/margins": 0.35218122601509094, "rewards/rejected": 0.08794289082288742, "step": 780 }, { "epoch": 0.13, "learning_rate": 9.970851258694197e-07, "logits/chosen": -0.5232816338539124, "logits/rejected": -0.5138660669326782, "logps/chosen": -73.90129852294922, "logps/rejected": -186.27923583984375, "loss": 1.168, "rewards/accuracies": 0.0, "rewards/chosen": 0.3639259338378906, "rewards/margins": -1.671933889389038, "rewards/rejected": 2.0358598232269287, "step": 781 }, { "epoch": 0.13, "learning_rate": 9.970709381932192e-07, "logits/chosen": -0.6932531595230103, "logits/rejected": -0.7118358612060547, "logps/chosen": -61.619911193847656, "logps/rejected": -110.19081115722656, "loss": 0.6117, "rewards/accuracies": 1.0, "rewards/chosen": 0.4483962953090668, "rewards/margins": 0.17683562636375427, "rewards/rejected": 0.2715606689453125, "step": 782 }, { "epoch": 0.13, "learning_rate": 9.970567161741204e-07, "logits/chosen": -0.5234143137931824, "logits/rejected": -1.0339962244033813, "logps/chosen": -110.66999053955078, "logps/rejected": -36.07526397705078, "loss": 0.7569, "rewards/accuracies": 1.0, "rewards/chosen": 0.1511695832014084, "rewards/margins": 0.0641101822257042, "rewards/rejected": 0.0870594009757042, "step": 783 }, { "epoch": 0.13, "learning_rate": 9.970424598131056e-07, "logits/chosen": -0.4525999128818512, "logits/rejected": -0.45364877581596375, "logps/chosen": -43.96698760986328, "logps/rejected": -36.13243865966797, "loss": 0.7476, "rewards/accuracies": 1.0, "rewards/chosen": 0.8001198172569275, "rewards/margins": 0.20113641023635864, "rewards/rejected": 0.5989834070205688, "step": 784 }, { "epoch": 0.13, "learning_rate": 9.970281691111597e-07, "logits/chosen": -0.8321750164031982, "logits/rejected": -0.8334233164787292, "logps/chosen": -90.92608642578125, "logps/rejected": -41.84989547729492, "loss": 0.7951, "rewards/accuracies": 0.0, "rewards/chosen": 0.06137847900390625, "rewards/margins": -0.007473371922969818, "rewards/rejected": 0.06885185092687607, "step": 785 }, { "epoch": 0.13, "learning_rate": 9.970138440692705e-07, "logits/chosen": -0.12730209529399872, "logits/rejected": -0.1563677191734314, "logps/chosen": -24.121315002441406, "logps/rejected": -103.30945587158203, "loss": 0.8852, "rewards/accuracies": 0.0, "rewards/chosen": -0.03550910949707031, "rewards/margins": -0.2378520965576172, "rewards/rejected": 0.20234298706054688, "step": 786 }, { "epoch": 0.13, "learning_rate": 9.969994846884273e-07, "logits/chosen": -0.550929605960846, "logits/rejected": -0.5494706630706787, "logps/chosen": -65.6530532836914, "logps/rejected": -72.97065734863281, "loss": 0.7245, "rewards/accuracies": 0.0, "rewards/chosen": 0.7948967218399048, "rewards/margins": -0.2394249439239502, "rewards/rejected": 1.034321665763855, "step": 787 }, { "epoch": 0.13, "learning_rate": 9.969850909696224e-07, "logits/chosen": -0.2818826138973236, "logits/rejected": -0.4378557503223419, "logps/chosen": -133.05926513671875, "logps/rejected": -79.60264587402344, "loss": 0.609, "rewards/accuracies": 1.0, "rewards/chosen": 0.8889800906181335, "rewards/margins": 0.3971000611782074, "rewards/rejected": 0.49188002943992615, "step": 788 }, { "epoch": 0.13, "learning_rate": 9.969706629138503e-07, "logits/chosen": -0.5875250697135925, "logits/rejected": -0.5945225954055786, "logps/chosen": -247.47035217285156, "logps/rejected": -60.4812126159668, "loss": 0.8166, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070846438407898, "rewards/margins": 0.710384726524353, "rewards/rejected": 0.19669990241527557, "step": 789 }, { "epoch": 0.13, "learning_rate": 9.969562005221078e-07, "logits/chosen": -0.6754804849624634, "logits/rejected": -0.6754804849624634, "logps/chosen": -61.02031707763672, "logps/rejected": -61.02031707763672, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.5703712701797485, "rewards/margins": 0.0, "rewards/rejected": 0.5703712701797485, "step": 790 }, { "epoch": 0.13, "learning_rate": 9.96941703795394e-07, "logits/chosen": -0.25387343764305115, "logits/rejected": -0.240218386054039, "logps/chosen": -82.54811096191406, "logps/rejected": -83.73661804199219, "loss": 0.6623, "rewards/accuracies": 0.0, "rewards/chosen": 0.5076629519462585, "rewards/margins": -0.3194824457168579, "rewards/rejected": 0.8271453976631165, "step": 791 }, { "epoch": 0.13, "learning_rate": 9.969271727347107e-07, "logits/chosen": -0.4430924355983734, "logits/rejected": -0.32681915163993835, "logps/chosen": -186.3129119873047, "logps/rejected": -30.263992309570312, "loss": 0.3426, "rewards/accuracies": 1.0, "rewards/chosen": 1.3787415027618408, "rewards/margins": 0.855146050453186, "rewards/rejected": 0.5235954523086548, "step": 792 }, { "epoch": 0.13, "learning_rate": 9.969126073410617e-07, "logits/chosen": -0.23077739775180817, "logits/rejected": -0.16643226146697998, "logps/chosen": -44.985992431640625, "logps/rejected": -48.747955322265625, "loss": 0.5639, "rewards/accuracies": 0.0, "rewards/chosen": 0.19896316528320312, "rewards/margins": -0.05914077162742615, "rewards/rejected": 0.2581039369106293, "step": 793 }, { "epoch": 0.13, "learning_rate": 9.96898007615453e-07, "logits/chosen": -0.39376917481422424, "logits/rejected": -0.36954107880592346, "logps/chosen": -89.03296661376953, "logps/rejected": -78.38948059082031, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 0.8008713126182556, "rewards/margins": 0.39098894596099854, "rewards/rejected": 0.4098823666572571, "step": 794 }, { "epoch": 0.13, "learning_rate": 9.968833735588942e-07, "logits/chosen": 0.053810615092515945, "logits/rejected": 0.08409629762172699, "logps/chosen": -62.334190368652344, "logps/rejected": -100.45085144042969, "loss": 0.6084, "rewards/accuracies": 1.0, "rewards/chosen": 0.9335098266601562, "rewards/margins": 0.005128443241119385, "rewards/rejected": 0.9283813834190369, "step": 795 }, { "epoch": 0.13, "learning_rate": 9.968687051723956e-07, "logits/chosen": -0.32674774527549744, "logits/rejected": -0.2971176207065582, "logps/chosen": -113.14930725097656, "logps/rejected": -83.67604064941406, "loss": 0.9051, "rewards/accuracies": 1.0, "rewards/chosen": 0.2170562744140625, "rewards/margins": 0.09336700290441513, "rewards/rejected": 0.12368927150964737, "step": 796 }, { "epoch": 0.13, "learning_rate": 9.968540024569708e-07, "logits/chosen": -0.4328787624835968, "logits/rejected": -0.41627225279808044, "logps/chosen": -106.44602966308594, "logps/rejected": -151.45013427734375, "loss": 0.6265, "rewards/accuracies": 0.0, "rewards/chosen": 0.19668197631835938, "rewards/margins": -0.09289932250976562, "rewards/rejected": 0.289581298828125, "step": 797 }, { "epoch": 0.13, "learning_rate": 9.96839265413636e-07, "logits/chosen": -0.18812952935695648, "logits/rejected": -0.2439774125814438, "logps/chosen": -227.44070434570312, "logps/rejected": -95.73785400390625, "loss": 0.6249, "rewards/accuracies": 0.0, "rewards/chosen": 0.38565826416015625, "rewards/margins": -0.1630951166152954, "rewards/rejected": 0.5487533807754517, "step": 798 }, { "epoch": 0.13, "learning_rate": 9.968244940434088e-07, "logits/chosen": -0.1693282574415207, "logits/rejected": -0.16397391259670258, "logps/chosen": -1.4503414630889893, "logps/rejected": -7.094103813171387, "loss": 0.5734, "rewards/accuracies": 1.0, "rewards/chosen": 0.09803807735443115, "rewards/margins": 0.05663425847887993, "rewards/rejected": 0.041403818875551224, "step": 799 }, { "epoch": 0.13, "learning_rate": 9.968096883473103e-07, "logits/chosen": -0.3117412030696869, "logits/rejected": -0.30245721340179443, "logps/chosen": -116.06790924072266, "logps/rejected": -146.84896850585938, "loss": 1.1206, "rewards/accuracies": 0.0, "rewards/chosen": -0.0051521300338208675, "rewards/margins": -1.4998924732208252, "rewards/rejected": 1.49474036693573, "step": 800 }, { "epoch": 0.13, "learning_rate": 9.96794848326363e-07, "logits/chosen": -0.3441537022590637, "logits/rejected": -0.40259209275245667, "logps/chosen": -169.59228515625, "logps/rejected": -205.98129272460938, "loss": 0.4052, "rewards/accuracies": 1.0, "rewards/chosen": 2.30289626121521, "rewards/margins": 0.5908372402191162, "rewards/rejected": 1.7120590209960938, "step": 801 }, { "epoch": 0.13, "learning_rate": 9.967799739815924e-07, "logits/chosen": -0.516396164894104, "logits/rejected": -0.4818513095378876, "logps/chosen": -39.08819580078125, "logps/rejected": -96.42082977294922, "loss": 0.8299, "rewards/accuracies": 0.0, "rewards/chosen": 0.434967041015625, "rewards/margins": -0.7141708135604858, "rewards/rejected": 1.1491378545761108, "step": 802 }, { "epoch": 0.13, "learning_rate": 9.967650653140262e-07, "logits/chosen": -0.2070559561252594, "logits/rejected": -0.16315297782421112, "logps/chosen": -32.63996505737305, "logps/rejected": -9.253886222839355, "loss": 1.2236, "rewards/accuracies": 0.0, "rewards/chosen": -0.01727752760052681, "rewards/margins": -0.41076424717903137, "rewards/rejected": 0.39348670840263367, "step": 803 }, { "epoch": 0.13, "learning_rate": 9.967501223246945e-07, "logits/chosen": -0.7289486527442932, "logits/rejected": -0.7199764847755432, "logps/chosen": -111.11685180664062, "logps/rejected": -103.3882064819336, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.31310805678367615, "rewards/margins": -0.15324097871780396, "rewards/rejected": 0.4663490355014801, "step": 804 }, { "epoch": 0.13, "learning_rate": 9.967351450146296e-07, "logits/chosen": -0.44597524404525757, "logits/rejected": -0.4051373600959778, "logps/chosen": -121.95081329345703, "logps/rejected": -126.53258514404297, "loss": 0.7825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879234433174133, "rewards/margins": 0.05019533634185791, "rewards/rejected": 0.9377281069755554, "step": 805 }, { "epoch": 0.13, "learning_rate": 9.967201333848664e-07, "logits/chosen": -0.11627703905105591, "logits/rejected": -0.11485230922698975, "logps/chosen": -6.688724517822266, "logps/rejected": -4.801177978515625, "loss": 0.5677, "rewards/accuracies": 0.0, "rewards/chosen": 0.16020575165748596, "rewards/margins": -0.09005141258239746, "rewards/rejected": 0.2502571642398834, "step": 806 }, { "epoch": 0.13, "learning_rate": 9.967050874364418e-07, "logits/chosen": -0.2384522557258606, "logits/rejected": -0.2384522557258606, "logps/chosen": -62.59231948852539, "logps/rejected": -62.59231948852539, "loss": 0.7125, "rewards/accuracies": 0.0, "rewards/chosen": 0.7826305627822876, "rewards/margins": 0.0, "rewards/rejected": 0.7826305627822876, "step": 807 }, { "epoch": 0.13, "learning_rate": 9.966900071703957e-07, "logits/chosen": -0.1389133185148239, "logits/rejected": -0.130196675658226, "logps/chosen": -79.94691467285156, "logps/rejected": -77.58601379394531, "loss": 0.7882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8199173212051392, "rewards/margins": 0.25579833984375, "rewards/rejected": 0.5641189813613892, "step": 808 }, { "epoch": 0.13, "learning_rate": 9.966748925877696e-07, "logits/chosen": -0.402527779340744, "logits/rejected": -0.39304307103157043, "logps/chosen": -32.18494415283203, "logps/rejected": -23.96788787841797, "loss": 1.235, "rewards/accuracies": 0.0, "rewards/chosen": 0.2868820130825043, "rewards/margins": -0.012447565793991089, "rewards/rejected": 0.29932957887649536, "step": 809 }, { "epoch": 0.13, "learning_rate": 9.966597436896082e-07, "logits/chosen": -0.35898473858833313, "logits/rejected": -0.37602105736732483, "logps/chosen": -9.948938369750977, "logps/rejected": -6.683191299438477, "loss": 0.6141, "rewards/accuracies": 0.0, "rewards/chosen": 0.13830576837062836, "rewards/margins": -0.22784213721752167, "rewards/rejected": 0.36614790558815, "step": 810 }, { "epoch": 0.13, "learning_rate": 9.96644560476958e-07, "logits/chosen": -0.3100639283657074, "logits/rejected": -0.22948095202445984, "logps/chosen": -49.818260192871094, "logps/rejected": -26.801239013671875, "loss": 0.8756, "rewards/accuracies": 1.0, "rewards/chosen": 0.4335426390171051, "rewards/margins": 0.0630294680595398, "rewards/rejected": 0.3705131709575653, "step": 811 }, { "epoch": 0.13, "learning_rate": 9.966293429508678e-07, "logits/chosen": -0.11610997468233109, "logits/rejected": -0.12619788944721222, "logps/chosen": -6.5059356689453125, "logps/rejected": -4.947939872741699, "loss": 0.4685, "rewards/accuracies": 0.0, "rewards/chosen": 0.2063087522983551, "rewards/margins": -0.05563837289810181, "rewards/rejected": 0.2619471251964569, "step": 812 }, { "epoch": 0.13, "learning_rate": 9.966140911123893e-07, "logits/chosen": -0.6490569114685059, "logits/rejected": -0.633895993232727, "logps/chosen": -73.5419921875, "logps/rejected": -49.15940856933594, "loss": 0.6441, "rewards/accuracies": 1.0, "rewards/chosen": 0.8345497250556946, "rewards/margins": 0.5605624914169312, "rewards/rejected": 0.27398720383644104, "step": 813 }, { "epoch": 0.13, "learning_rate": 9.96598804962576e-07, "logits/chosen": -0.04622531682252884, "logits/rejected": -0.09361475706100464, "logps/chosen": -58.229698181152344, "logps/rejected": -115.7242660522461, "loss": 0.4773, "rewards/accuracies": 0.0, "rewards/chosen": 0.3126232326030731, "rewards/margins": -0.05061149597167969, "rewards/rejected": 0.3632347285747528, "step": 814 }, { "epoch": 0.13, "learning_rate": 9.965834845024842e-07, "logits/chosen": -0.5636569261550903, "logits/rejected": -0.5305889844894409, "logps/chosen": -135.0983123779297, "logps/rejected": -112.79049682617188, "loss": 0.8739, "rewards/accuracies": 0.0, "rewards/chosen": -0.11974640190601349, "rewards/margins": -0.4103187918663025, "rewards/rejected": 0.2905723750591278, "step": 815 }, { "epoch": 0.13, "learning_rate": 9.965681297331725e-07, "logits/chosen": -0.9013403654098511, "logits/rejected": -0.9424665570259094, "logps/chosen": -101.89120483398438, "logps/rejected": -165.37132263183594, "loss": 0.9119, "rewards/accuracies": 0.0, "rewards/chosen": 0.16506576538085938, "rewards/margins": -0.9254051446914673, "rewards/rejected": 1.0904709100723267, "step": 816 }, { "epoch": 0.13, "learning_rate": 9.965527406557013e-07, "logits/chosen": -0.255649596452713, "logits/rejected": -0.23178422451019287, "logps/chosen": -49.488059997558594, "logps/rejected": -63.11753845214844, "loss": 0.557, "rewards/accuracies": 1.0, "rewards/chosen": 0.7517383694648743, "rewards/margins": 0.6974140405654907, "rewards/rejected": 0.05432434007525444, "step": 817 }, { "epoch": 0.13, "learning_rate": 9.965373172711343e-07, "logits/chosen": -0.4363330602645874, "logits/rejected": -0.41929641366004944, "logps/chosen": -125.92317199707031, "logps/rejected": -69.1899642944336, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": 1.487141489982605, "rewards/margins": 1.287471055984497, "rewards/rejected": 0.19967041909694672, "step": 818 }, { "epoch": 0.13, "learning_rate": 9.96521859580537e-07, "logits/chosen": -0.44846436381340027, "logits/rejected": -0.4814274311065674, "logps/chosen": -55.2965087890625, "logps/rejected": -86.89849853515625, "loss": 0.9412, "rewards/accuracies": 0.0, "rewards/chosen": 0.1290901154279709, "rewards/margins": -0.19767381250858307, "rewards/rejected": 0.32676392793655396, "step": 819 }, { "epoch": 0.13, "learning_rate": 9.965063675849773e-07, "logits/chosen": -0.4196013808250427, "logits/rejected": -0.3996308445930481, "logps/chosen": -124.09219360351562, "logps/rejected": -66.8915023803711, "loss": 0.5968, "rewards/accuracies": 1.0, "rewards/chosen": 0.42964935302734375, "rewards/margins": 0.3321235775947571, "rewards/rejected": 0.09752579033374786, "step": 820 }, { "epoch": 0.13, "learning_rate": 9.964908412855255e-07, "logits/chosen": -0.4861706793308258, "logits/rejected": -0.4833866059780121, "logps/chosen": -167.942626953125, "logps/rejected": -205.67092895507812, "loss": 0.5716, "rewards/accuracies": 0.0, "rewards/chosen": 1.7120087146759033, "rewards/margins": -0.00231015682220459, "rewards/rejected": 1.714318871498108, "step": 821 }, { "epoch": 0.13, "learning_rate": 9.964752806832543e-07, "logits/chosen": -0.04459530860185623, "logits/rejected": -0.04459530860185623, "logps/chosen": -30.768024444580078, "logps/rejected": -30.768024444580078, "loss": 0.5934, "rewards/accuracies": 0.0, "rewards/chosen": 0.5640503168106079, "rewards/margins": 0.0, "rewards/rejected": 0.5640503168106079, "step": 822 }, { "epoch": 0.13, "learning_rate": 9.96459685779239e-07, "logits/chosen": -0.47316616773605347, "logits/rejected": -0.44635334610939026, "logps/chosen": -173.29725646972656, "logps/rejected": -154.33148193359375, "loss": 0.9037, "rewards/accuracies": 0.0, "rewards/chosen": 1.2097091674804688, "rewards/margins": -0.43008577823638916, "rewards/rejected": 1.639794945716858, "step": 823 }, { "epoch": 0.13, "learning_rate": 9.964440565745573e-07, "logits/chosen": -0.32888445258140564, "logits/rejected": -0.19324250519275665, "logps/chosen": -96.23228454589844, "logps/rejected": -92.64715576171875, "loss": 0.7477, "rewards/accuracies": 0.0, "rewards/chosen": 0.7052200436592102, "rewards/margins": -0.24965441226959229, "rewards/rejected": 0.9548744559288025, "step": 824 }, { "epoch": 0.13, "learning_rate": 9.964283930702884e-07, "logits/chosen": -0.11873294413089752, "logits/rejected": -0.16079770028591156, "logps/chosen": -55.650089263916016, "logps/rejected": -140.3531036376953, "loss": 0.6376, "rewards/accuracies": 1.0, "rewards/chosen": 0.23539619147777557, "rewards/margins": 0.4613773226737976, "rewards/rejected": -0.22598114609718323, "step": 825 }, { "epoch": 0.13, "learning_rate": 9.964126952675147e-07, "logits/chosen": -0.3093854784965515, "logits/rejected": -0.202581524848938, "logps/chosen": -65.18344116210938, "logps/rejected": -25.644527435302734, "loss": 0.592, "rewards/accuracies": 1.0, "rewards/chosen": 0.6569374203681946, "rewards/margins": 0.23315143585205078, "rewards/rejected": 0.4237859845161438, "step": 826 }, { "epoch": 0.13, "learning_rate": 9.96396963167321e-07, "logits/chosen": 0.07872473448514938, "logits/rejected": 0.058022890239953995, "logps/chosen": -5.925958633422852, "logps/rejected": -33.95680236816406, "loss": 0.7431, "rewards/accuracies": 1.0, "rewards/chosen": 0.07521224021911621, "rewards/margins": 0.11471191048622131, "rewards/rejected": -0.039499666541814804, "step": 827 }, { "epoch": 0.13, "learning_rate": 9.96381196770794e-07, "logits/chosen": -0.7649343609809875, "logits/rejected": -0.7921329736709595, "logps/chosen": -120.03811645507812, "logps/rejected": -101.83885192871094, "loss": 1.1648, "rewards/accuracies": 0.0, "rewards/chosen": 0.5528961420059204, "rewards/margins": -0.20869213342666626, "rewards/rejected": 0.7615882754325867, "step": 828 }, { "epoch": 0.13, "learning_rate": 9.963653960790232e-07, "logits/chosen": -0.3653430938720703, "logits/rejected": -0.31632357835769653, "logps/chosen": -70.08782958984375, "logps/rejected": -175.33154296875, "loss": 0.8174, "rewards/accuracies": 0.0, "rewards/chosen": 1.1054810285568237, "rewards/margins": -0.7433196306228638, "rewards/rejected": 1.8488006591796875, "step": 829 }, { "epoch": 0.13, "learning_rate": 9.963495610931001e-07, "logits/chosen": -0.5587300062179565, "logits/rejected": -0.5802493691444397, "logps/chosen": -157.15072631835938, "logps/rejected": -70.83100128173828, "loss": 0.3727, "rewards/accuracies": 1.0, "rewards/chosen": 1.1703033447265625, "rewards/margins": 0.6289970278739929, "rewards/rejected": 0.5413063168525696, "step": 830 }, { "epoch": 0.13, "learning_rate": 9.96333691814119e-07, "logits/chosen": -0.32623887062072754, "logits/rejected": -0.325192391872406, "logps/chosen": -3.223531723022461, "logps/rejected": -21.324932098388672, "loss": 0.5482, "rewards/accuracies": 1.0, "rewards/chosen": 0.1549907773733139, "rewards/margins": 0.11683178693056107, "rewards/rejected": 0.03815899044275284, "step": 831 }, { "epoch": 0.14, "learning_rate": 9.96317788243176e-07, "logits/chosen": -0.703837513923645, "logits/rejected": -0.6909640431404114, "logps/chosen": -21.759851455688477, "logps/rejected": -112.31059265136719, "loss": 0.8579, "rewards/accuracies": 0.0, "rewards/chosen": 0.19068698585033417, "rewards/margins": -1.1241720914840698, "rewards/rejected": 1.3148590326309204, "step": 832 }, { "epoch": 0.14, "learning_rate": 9.963018503813698e-07, "logits/chosen": -0.34208330512046814, "logits/rejected": -0.35877570509910583, "logps/chosen": -111.10383605957031, "logps/rejected": -101.18287658691406, "loss": 0.8559, "rewards/accuracies": 0.0, "rewards/chosen": 0.2167098969221115, "rewards/margins": -0.3075965642929077, "rewards/rejected": 0.5243064761161804, "step": 833 }, { "epoch": 0.14, "learning_rate": 9.962858782298023e-07, "logits/chosen": -0.7594357132911682, "logits/rejected": -0.7307891249656677, "logps/chosen": -144.82456970214844, "logps/rejected": -170.2674102783203, "loss": 0.9507, "rewards/accuracies": 0.0, "rewards/chosen": 0.9377639889717102, "rewards/margins": -0.8706527352333069, "rewards/rejected": 1.808416724205017, "step": 834 }, { "epoch": 0.14, "learning_rate": 9.962698717895761e-07, "logits/chosen": -0.6739681363105774, "logits/rejected": -0.7309385538101196, "logps/chosen": -145.13897705078125, "logps/rejected": -24.929054260253906, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 0.05103454738855362, "rewards/margins": 0.058455660939216614, "rewards/rejected": -0.0074211121536791325, "step": 835 }, { "epoch": 0.14, "learning_rate": 9.962538310617976e-07, "logits/chosen": -0.6675406694412231, "logits/rejected": -0.7072100639343262, "logps/chosen": -216.02096557617188, "logps/rejected": -58.397727966308594, "loss": 0.8005, "rewards/accuracies": 0.0, "rewards/chosen": 0.15060119330883026, "rewards/margins": -0.3912792205810547, "rewards/rejected": 0.5418804287910461, "step": 836 }, { "epoch": 0.14, "learning_rate": 9.962377560475751e-07, "logits/chosen": -0.5990515351295471, "logits/rejected": -0.5233457088470459, "logps/chosen": -196.16305541992188, "logps/rejected": -42.39734649658203, "loss": 0.4304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9808899164199829, "rewards/margins": 0.5463279485702515, "rewards/rejected": 0.43456193804740906, "step": 837 }, { "epoch": 0.14, "learning_rate": 9.96221646748019e-07, "logits/chosen": -0.031968507915735245, "logits/rejected": -0.05894197151064873, "logps/chosen": -66.01029968261719, "logps/rejected": -84.78271484375, "loss": 0.7549, "rewards/accuracies": 0.0, "rewards/chosen": 0.4694015681743622, "rewards/margins": -0.17612454295158386, "rewards/rejected": 0.645526111125946, "step": 838 }, { "epoch": 0.14, "learning_rate": 9.962055031642425e-07, "logits/chosen": -0.3290163576602936, "logits/rejected": -0.27278104424476624, "logps/chosen": -198.15762329101562, "logps/rejected": -232.99609375, "loss": 0.6017, "rewards/accuracies": 0.0, "rewards/chosen": 1.537379503250122, "rewards/margins": -0.20121002197265625, "rewards/rejected": 1.7385895252227783, "step": 839 }, { "epoch": 0.14, "learning_rate": 9.961893252973609e-07, "logits/chosen": -0.3809666931629181, "logits/rejected": -0.4026184678077698, "logps/chosen": -94.09613037109375, "logps/rejected": -101.27318572998047, "loss": 0.5776, "rewards/accuracies": 0.0, "rewards/chosen": 1.039753794670105, "rewards/margins": -0.0016448497772216797, "rewards/rejected": 1.0413986444473267, "step": 840 }, { "epoch": 0.14, "learning_rate": 9.96173113148492e-07, "logits/chosen": -0.318569153547287, "logits/rejected": -0.3019709587097168, "logps/chosen": -23.962081909179688, "logps/rejected": -6.215919494628906, "loss": 0.6407, "rewards/accuracies": 0.0, "rewards/chosen": -0.06935005635023117, "rewards/margins": -0.18456760048866272, "rewards/rejected": 0.11521754413843155, "step": 841 }, { "epoch": 0.14, "learning_rate": 9.961568667187554e-07, "logits/chosen": -0.3812316358089447, "logits/rejected": -0.36287280917167664, "logps/chosen": -88.70435333251953, "logps/rejected": -97.1252212524414, "loss": 0.5514, "rewards/accuracies": 1.0, "rewards/chosen": 0.9129859805107117, "rewards/margins": 0.29407191276550293, "rewards/rejected": 0.6189140677452087, "step": 842 }, { "epoch": 0.14, "learning_rate": 9.961405860092743e-07, "logits/chosen": -0.25713643431663513, "logits/rejected": -0.21567492187023163, "logps/chosen": -62.73847961425781, "logps/rejected": -76.22887420654297, "loss": 0.3779, "rewards/accuracies": 1.0, "rewards/chosen": 0.7175788879394531, "rewards/margins": 0.44202423095703125, "rewards/rejected": 0.2755546569824219, "step": 843 }, { "epoch": 0.14, "learning_rate": 9.961242710211732e-07, "logits/chosen": -0.15335851907730103, "logits/rejected": -0.1829715222120285, "logps/chosen": -108.21049499511719, "logps/rejected": -101.20170593261719, "loss": 0.6424, "rewards/accuracies": 0.0, "rewards/chosen": 0.7849617004394531, "rewards/margins": -0.3774566650390625, "rewards/rejected": 1.1624183654785156, "step": 844 }, { "epoch": 0.14, "learning_rate": 9.961079217555791e-07, "logits/chosen": -0.4817810356616974, "logits/rejected": -0.4713656008243561, "logps/chosen": -88.25182342529297, "logps/rejected": -150.5478515625, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 0.511884331703186, "rewards/margins": 0.10263901948928833, "rewards/rejected": 0.4092453122138977, "step": 845 }, { "epoch": 0.14, "learning_rate": 9.960915382136222e-07, "logits/chosen": -0.1713639348745346, "logits/rejected": -0.1713639348745346, "logps/chosen": -105.3500747680664, "logps/rejected": -105.3500747680664, "loss": 0.5465, "rewards/accuracies": 0.0, "rewards/chosen": 0.550872802734375, "rewards/margins": 0.0, "rewards/rejected": 0.550872802734375, "step": 846 }, { "epoch": 0.14, "learning_rate": 9.96075120396434e-07, "logits/chosen": -0.3822229504585266, "logits/rejected": -0.4049735963344574, "logps/chosen": -182.91815185546875, "logps/rejected": -115.13713073730469, "loss": 0.6369, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280579328536987, "rewards/margins": 0.28025978803634644, "rewards/rejected": 0.7477981448173523, "step": 847 }, { "epoch": 0.14, "learning_rate": 9.960586683051486e-07, "logits/chosen": -0.4764713644981384, "logits/rejected": -0.3671761751174927, "logps/chosen": -87.54388427734375, "logps/rejected": -22.219303131103516, "loss": 0.3588, "rewards/accuracies": 1.0, "rewards/chosen": 1.1331154108047485, "rewards/margins": 0.9489883780479431, "rewards/rejected": 0.1841270476579666, "step": 848 }, { "epoch": 0.14, "learning_rate": 9.960421819409033e-07, "logits/chosen": -0.0789894163608551, "logits/rejected": -0.17697739601135254, "logps/chosen": -55.239906311035156, "logps/rejected": -268.67828369140625, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.7116214632987976, "rewards/margins": 0.1495395302772522, "rewards/rejected": 0.5620819330215454, "step": 849 }, { "epoch": 0.14, "learning_rate": 9.960256613048367e-07, "logits/chosen": -0.6167064309120178, "logits/rejected": -0.6181465983390808, "logps/chosen": -230.1397705078125, "logps/rejected": -159.3177490234375, "loss": 0.6507, "rewards/accuracies": 0.0, "rewards/chosen": 0.4298355281352997, "rewards/margins": -0.8273712396621704, "rewards/rejected": 1.2572067975997925, "step": 850 }, { "epoch": 0.14, "learning_rate": 9.960091063980903e-07, "logits/chosen": -0.41319143772125244, "logits/rejected": -0.4247713088989258, "logps/chosen": -154.75791931152344, "logps/rejected": -79.88200378417969, "loss": 1.0722, "rewards/accuracies": 1.0, "rewards/chosen": 1.8833054304122925, "rewards/margins": 0.7005096673965454, "rewards/rejected": 1.182795763015747, "step": 851 }, { "epoch": 0.14, "learning_rate": 9.959925172218078e-07, "logits/chosen": -0.38077932596206665, "logits/rejected": -0.37190327048301697, "logps/chosen": -52.68772506713867, "logps/rejected": -28.013080596923828, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 0.5117241144180298, "rewards/margins": 0.21366140246391296, "rewards/rejected": 0.2980627119541168, "step": 852 }, { "epoch": 0.14, "learning_rate": 9.959758937771356e-07, "logits/chosen": -0.5144363641738892, "logits/rejected": -0.40828192234039307, "logps/chosen": -176.04637145996094, "logps/rejected": -54.07648468017578, "loss": 0.4362, "rewards/accuracies": 1.0, "rewards/chosen": 1.3706191778182983, "rewards/margins": 0.6269172430038452, "rewards/rejected": 0.7437019348144531, "step": 853 }, { "epoch": 0.14, "learning_rate": 9.959592360652222e-07, "logits/chosen": -0.5795232653617859, "logits/rejected": -0.5222234129905701, "logps/chosen": -109.16864776611328, "logps/rejected": -232.59896850585938, "loss": 0.5275, "rewards/accuracies": 1.0, "rewards/chosen": 1.457140326499939, "rewards/margins": 0.21428298950195312, "rewards/rejected": 1.2428573369979858, "step": 854 }, { "epoch": 0.14, "learning_rate": 9.959425440872184e-07, "logits/chosen": -0.39544677734375, "logits/rejected": -0.32761454582214355, "logps/chosen": -192.96852111816406, "logps/rejected": -27.39053726196289, "loss": 0.4851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9805251955986023, "rewards/margins": 0.916905403137207, "rewards/rejected": 0.06361980736255646, "step": 855 }, { "epoch": 0.14, "learning_rate": 9.959258178442774e-07, "logits/chosen": -0.2807011008262634, "logits/rejected": -0.2807011008262634, "logps/chosen": -109.12490844726562, "logps/rejected": -109.12490844726562, "loss": 0.8704, "rewards/accuracies": 0.0, "rewards/chosen": 0.19770660996437073, "rewards/margins": 0.0, "rewards/rejected": 0.19770660996437073, "step": 856 }, { "epoch": 0.14, "learning_rate": 9.95909057337555e-07, "logits/chosen": -0.43670615553855896, "logits/rejected": -0.44521161913871765, "logps/chosen": -22.216684341430664, "logps/rejected": -33.64244842529297, "loss": 0.589, "rewards/accuracies": 0.0, "rewards/chosen": 0.030192185193300247, "rewards/margins": -0.04195671156048775, "rewards/rejected": 0.072148896753788, "step": 857 }, { "epoch": 0.14, "learning_rate": 9.958922625682087e-07, "logits/chosen": -0.2762172818183899, "logits/rejected": -0.2726125717163086, "logps/chosen": -78.78364562988281, "logps/rejected": -80.02920532226562, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.893811047077179, "rewards/margins": 0.3073539733886719, "rewards/rejected": 0.5864570736885071, "step": 858 }, { "epoch": 0.14, "learning_rate": 9.958754335373996e-07, "logits/chosen": -0.37192854285240173, "logits/rejected": -0.3729384243488312, "logps/chosen": -84.01528930664062, "logps/rejected": -117.87653350830078, "loss": 0.4586, "rewards/accuracies": 0.0, "rewards/chosen": 0.04118957743048668, "rewards/margins": -0.09089431166648865, "rewards/rejected": 0.13208389282226562, "step": 859 }, { "epoch": 0.14, "learning_rate": 9.9585857024629e-07, "logits/chosen": -0.558224618434906, "logits/rejected": -0.5460222959518433, "logps/chosen": -77.78107452392578, "logps/rejected": -84.72422790527344, "loss": 0.9837, "rewards/accuracies": 0.0, "rewards/chosen": 0.32595139741897583, "rewards/margins": -0.642351508140564, "rewards/rejected": 0.9683029055595398, "step": 860 }, { "epoch": 0.14, "learning_rate": 9.958416726960451e-07, "logits/chosen": -0.27610161900520325, "logits/rejected": -0.04166589677333832, "logps/chosen": -147.3665771484375, "logps/rejected": -44.13640213012695, "loss": 0.4226, "rewards/accuracies": 1.0, "rewards/chosen": 1.46588134765625, "rewards/margins": 1.213191270828247, "rewards/rejected": 0.2526901364326477, "step": 861 }, { "epoch": 0.14, "learning_rate": 9.95824740887832e-07, "logits/chosen": -0.3027298152446747, "logits/rejected": -0.2733525037765503, "logps/chosen": -155.21820068359375, "logps/rejected": -94.85063171386719, "loss": 0.6704, "rewards/accuracies": 0.0, "rewards/chosen": 0.23126831650733948, "rewards/margins": -0.6626739501953125, "rewards/rejected": 0.8939422965049744, "step": 862 }, { "epoch": 0.14, "learning_rate": 9.95807774822821e-07, "logits/chosen": -0.51041579246521, "logits/rejected": -0.5335700511932373, "logps/chosen": -153.57078552246094, "logps/rejected": -69.89131927490234, "loss": 0.9424, "rewards/accuracies": 0.0, "rewards/chosen": 0.7707443237304688, "rewards/margins": -0.2400658130645752, "rewards/rejected": 1.010810136795044, "step": 863 }, { "epoch": 0.14, "learning_rate": 9.957907745021843e-07, "logits/chosen": -0.456840842962265, "logits/rejected": -0.47160160541534424, "logps/chosen": -101.30113983154297, "logps/rejected": -98.21746826171875, "loss": 1.0479, "rewards/accuracies": 0.0, "rewards/chosen": 0.046634674072265625, "rewards/margins": -0.0948280394077301, "rewards/rejected": 0.14146271347999573, "step": 864 }, { "epoch": 0.14, "learning_rate": 9.957737399270962e-07, "logits/chosen": -0.7483118176460266, "logits/rejected": -0.689399778842926, "logps/chosen": -186.97967529296875, "logps/rejected": -189.18136596679688, "loss": 0.8915, "rewards/accuracies": 0.0, "rewards/chosen": 1.9491180181503296, "rewards/margins": -0.5279420614242554, "rewards/rejected": 2.477060079574585, "step": 865 }, { "epoch": 0.14, "learning_rate": 9.957566710987337e-07, "logits/chosen": -0.37102648615837097, "logits/rejected": -0.3396016061306, "logps/chosen": -75.7564697265625, "logps/rejected": -78.76136779785156, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 1.5609550476074219, "rewards/margins": 0.20337367057800293, "rewards/rejected": 1.357581377029419, "step": 866 }, { "epoch": 0.14, "learning_rate": 9.95739568018276e-07, "logits/chosen": -0.3558119237422943, "logits/rejected": -0.20925067365169525, "logps/chosen": -124.6610107421875, "logps/rejected": -115.88261413574219, "loss": 1.3031, "rewards/accuracies": 0.0, "rewards/chosen": 0.16158294677734375, "rewards/margins": -1.5642975568771362, "rewards/rejected": 1.72588050365448, "step": 867 }, { "epoch": 0.14, "learning_rate": 9.957224306869053e-07, "logits/chosen": -0.21956095099449158, "logits/rejected": -0.21166886389255524, "logps/chosen": -41.0999641418457, "logps/rejected": -17.494688034057617, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.67190021276474, "rewards/margins": 0.045946359634399414, "rewards/rejected": 0.6259538531303406, "step": 868 }, { "epoch": 0.14, "learning_rate": 9.957052591058048e-07, "logits/chosen": -0.5262841582298279, "logits/rejected": -0.4879225194454193, "logps/chosen": -44.11981201171875, "logps/rejected": -11.244254112243652, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 1.1033695936203003, "rewards/margins": 0.7874447107315063, "rewards/rejected": 0.31592485308647156, "step": 869 }, { "epoch": 0.14, "learning_rate": 9.956880532761614e-07, "logits/chosen": -0.5785408616065979, "logits/rejected": -0.5738450288772583, "logps/chosen": -92.42717742919922, "logps/rejected": -115.57331848144531, "loss": 0.6077, "rewards/accuracies": 0.0, "rewards/chosen": 0.4741813838481903, "rewards/margins": -0.472525030374527, "rewards/rejected": 0.9467064142227173, "step": 870 }, { "epoch": 0.14, "learning_rate": 9.956708131991639e-07, "logits/chosen": -0.7218930125236511, "logits/rejected": -0.5727027654647827, "logps/chosen": -85.49542236328125, "logps/rejected": -197.44790649414062, "loss": 0.9515, "rewards/accuracies": 0.0, "rewards/chosen": 0.8014511466026306, "rewards/margins": -0.13384246826171875, "rewards/rejected": 0.9352936148643494, "step": 871 }, { "epoch": 0.14, "learning_rate": 9.95653538876003e-07, "logits/chosen": -0.7538942098617554, "logits/rejected": -0.6342383027076721, "logps/chosen": -202.48483276367188, "logps/rejected": -160.192626953125, "loss": 0.9898, "rewards/accuracies": 0.0, "rewards/chosen": 0.46318361163139343, "rewards/margins": -1.4793319702148438, "rewards/rejected": 1.9425156116485596, "step": 872 }, { "epoch": 0.14, "learning_rate": 9.956362303078727e-07, "logits/chosen": -0.3994273543357849, "logits/rejected": -0.3632078468799591, "logps/chosen": -77.31124877929688, "logps/rejected": -117.94790649414062, "loss": 0.5316, "rewards/accuracies": 1.0, "rewards/chosen": 1.3733261823654175, "rewards/margins": 0.23870396614074707, "rewards/rejected": 1.1346222162246704, "step": 873 }, { "epoch": 0.14, "learning_rate": 9.956188874959686e-07, "logits/chosen": -0.5617687106132507, "logits/rejected": -0.5727416276931763, "logps/chosen": -117.4634780883789, "logps/rejected": -43.42066192626953, "loss": 0.5941, "rewards/accuracies": 0.0, "rewards/chosen": 0.5170921683311462, "rewards/margins": -0.33403927087783813, "rewards/rejected": 0.8511314392089844, "step": 874 }, { "epoch": 0.14, "learning_rate": 9.95601510441489e-07, "logits/chosen": -0.012782417237758636, "logits/rejected": 0.04275662824511528, "logps/chosen": -60.52140426635742, "logps/rejected": -57.350494384765625, "loss": 0.8278, "rewards/accuracies": 1.0, "rewards/chosen": 0.4990215301513672, "rewards/margins": 0.27125129103660583, "rewards/rejected": 0.22777023911476135, "step": 875 }, { "epoch": 0.14, "learning_rate": 9.955840991456343e-07, "logits/chosen": -0.4574638605117798, "logits/rejected": -0.4151175320148468, "logps/chosen": -109.81353759765625, "logps/rejected": -77.64866638183594, "loss": 0.5032, "rewards/accuracies": 1.0, "rewards/chosen": 1.2744873762130737, "rewards/margins": 0.3804169297218323, "rewards/rejected": 0.8940704464912415, "step": 876 }, { "epoch": 0.14, "learning_rate": 9.955666536096078e-07, "logits/chosen": -0.47563600540161133, "logits/rejected": -0.5003131628036499, "logps/chosen": -168.66973876953125, "logps/rejected": -96.06533813476562, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 1.453637719154358, "rewards/margins": 0.363616943359375, "rewards/rejected": 1.090020775794983, "step": 877 }, { "epoch": 0.14, "learning_rate": 9.955491738346146e-07, "logits/chosen": -0.4366306960582733, "logits/rejected": -0.45187321305274963, "logps/chosen": -108.33055877685547, "logps/rejected": -94.74929809570312, "loss": 0.5212, "rewards/accuracies": 1.0, "rewards/chosen": 1.2615333795547485, "rewards/margins": 0.32506483793258667, "rewards/rejected": 0.9364685416221619, "step": 878 }, { "epoch": 0.14, "learning_rate": 9.955316598218623e-07, "logits/chosen": -0.1017032340168953, "logits/rejected": -0.0860963985323906, "logps/chosen": -102.34095001220703, "logps/rejected": -92.81253051757812, "loss": 0.7303, "rewards/accuracies": 0.0, "rewards/chosen": 0.20201264321804047, "rewards/margins": -0.21338044106960297, "rewards/rejected": 0.41539308428764343, "step": 879 }, { "epoch": 0.14, "learning_rate": 9.955141115725611e-07, "logits/chosen": -0.24168355762958527, "logits/rejected": -0.25559791922569275, "logps/chosen": -120.28447723388672, "logps/rejected": -173.46978759765625, "loss": 1.1303, "rewards/accuracies": 0.0, "rewards/chosen": 0.24670791625976562, "rewards/margins": -1.5099648237228394, "rewards/rejected": 1.756672739982605, "step": 880 }, { "epoch": 0.14, "learning_rate": 9.954965290879236e-07, "logits/chosen": -0.20295187830924988, "logits/rejected": -0.13227376341819763, "logps/chosen": -188.57994079589844, "logps/rejected": -92.73194885253906, "loss": 0.3661, "rewards/accuracies": 1.0, "rewards/chosen": 1.3667755126953125, "rewards/margins": 0.20475304126739502, "rewards/rejected": 1.1620224714279175, "step": 881 }, { "epoch": 0.14, "learning_rate": 9.95478912369164e-07, "logits/chosen": -0.386095255613327, "logits/rejected": -0.3590834140777588, "logps/chosen": -99.25933837890625, "logps/rejected": -58.51258850097656, "loss": 0.728, "rewards/accuracies": 1.0, "rewards/chosen": 1.2540940046310425, "rewards/margins": 0.3677605390548706, "rewards/rejected": 0.8863334655761719, "step": 882 }, { "epoch": 0.14, "learning_rate": 9.954612614175002e-07, "logits/chosen": -0.8223049640655518, "logits/rejected": -0.7153063416481018, "logps/chosen": -73.88893127441406, "logps/rejected": -105.9329833984375, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 1.4282379150390625, "rewards/margins": 0.13816606998443604, "rewards/rejected": 1.2900718450546265, "step": 883 }, { "epoch": 0.14, "learning_rate": 9.954435762341512e-07, "logits/chosen": -0.27646520733833313, "logits/rejected": -0.24036554992198944, "logps/chosen": -130.12008666992188, "logps/rejected": -52.41046905517578, "loss": 0.6171, "rewards/accuracies": 1.0, "rewards/chosen": 0.4963851869106293, "rewards/margins": 0.033923715353012085, "rewards/rejected": 0.4624614715576172, "step": 884 }, { "epoch": 0.14, "learning_rate": 9.95425856820339e-07, "logits/chosen": -0.2966753840446472, "logits/rejected": -0.2959355413913727, "logps/chosen": -3.851320743560791, "logps/rejected": -3.7792935371398926, "loss": 0.787, "rewards/accuracies": 0.0, "rewards/chosen": 0.20741939544677734, "rewards/margins": -0.04178909957408905, "rewards/rejected": 0.2492084950208664, "step": 885 }, { "epoch": 0.14, "learning_rate": 9.954081031772877e-07, "logits/chosen": -0.3969089090824127, "logits/rejected": -0.40509557723999023, "logps/chosen": -181.86195373535156, "logps/rejected": -109.42304992675781, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 1.7627090215682983, "rewards/margins": 0.40789794921875, "rewards/rejected": 1.3548110723495483, "step": 886 }, { "epoch": 0.14, "learning_rate": 9.953903153062242e-07, "logits/chosen": -0.4427596628665924, "logits/rejected": -0.41996046900749207, "logps/chosen": -95.97491455078125, "logps/rejected": -76.73027801513672, "loss": 0.6318, "rewards/accuracies": 0.0, "rewards/chosen": 0.5210129022598267, "rewards/margins": -0.05923539400100708, "rewards/rejected": 0.5802482962608337, "step": 887 }, { "epoch": 0.14, "learning_rate": 9.953724932083774e-07, "logits/chosen": -0.44107550382614136, "logits/rejected": -0.3660423159599304, "logps/chosen": -107.90498352050781, "logps/rejected": -44.219547271728516, "loss": 0.9356, "rewards/accuracies": 0.0, "rewards/chosen": 0.4426254332065582, "rewards/margins": -0.3853221833705902, "rewards/rejected": 0.8279476165771484, "step": 888 }, { "epoch": 0.14, "learning_rate": 9.953546368849786e-07, "logits/chosen": -0.1337607055902481, "logits/rejected": 0.01177042257040739, "logps/chosen": -89.50822448730469, "logps/rejected": -45.6734619140625, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8602539300918579, "rewards/margins": 0.41813012957572937, "rewards/rejected": 0.44212380051612854, "step": 889 }, { "epoch": 0.14, "learning_rate": 9.953367463372613e-07, "logits/chosen": -0.2632530927658081, "logits/rejected": -0.2632530927658081, "logps/chosen": -92.22377014160156, "logps/rejected": -92.22377014160156, "loss": 0.5219, "rewards/accuracies": 0.0, "rewards/chosen": 0.05281524732708931, "rewards/margins": 0.0, "rewards/rejected": 0.05281524732708931, "step": 890 }, { "epoch": 0.14, "learning_rate": 9.953188215664618e-07, "logits/chosen": -0.16956041753292084, "logits/rejected": 0.007526427507400513, "logps/chosen": -209.75338745117188, "logps/rejected": -24.45693016052246, "loss": 0.7022, "rewards/accuracies": 1.0, "rewards/chosen": 0.9206924438476562, "rewards/margins": 0.7295244336128235, "rewards/rejected": 0.19116802513599396, "step": 891 }, { "epoch": 0.14, "learning_rate": 9.953008625738184e-07, "logits/chosen": -0.2987687587738037, "logits/rejected": -0.2253357619047165, "logps/chosen": -76.69281005859375, "logps/rejected": -13.642353057861328, "loss": 0.2068, "rewards/accuracies": 1.0, "rewards/chosen": 1.4167206287384033, "rewards/margins": 1.164326548576355, "rewards/rejected": 0.2523941099643707, "step": 892 }, { "epoch": 0.14, "learning_rate": 9.952828693605722e-07, "logits/chosen": -0.4985131621360779, "logits/rejected": -0.5187339782714844, "logps/chosen": -136.60475158691406, "logps/rejected": -61.68494415283203, "loss": 0.3262, "rewards/accuracies": 1.0, "rewards/chosen": 2.005053758621216, "rewards/margins": 1.289283037185669, "rewards/rejected": 0.7157707214355469, "step": 893 }, { "epoch": 0.15, "learning_rate": 9.952648419279662e-07, "logits/chosen": -0.5156154036521912, "logits/rejected": -0.5615261793136597, "logps/chosen": -138.9089813232422, "logps/rejected": -22.907489776611328, "loss": 0.5162, "rewards/accuracies": 1.0, "rewards/chosen": 1.0536636114120483, "rewards/margins": 0.8669330477714539, "rewards/rejected": 0.18673057854175568, "step": 894 }, { "epoch": 0.15, "learning_rate": 9.952467802772454e-07, "logits/chosen": -0.4343058168888092, "logits/rejected": -0.41274961829185486, "logps/chosen": -27.382381439208984, "logps/rejected": -48.21680450439453, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": 0.11945648491382599, "rewards/margins": 0.25012511014938354, "rewards/rejected": -0.13066864013671875, "step": 895 }, { "epoch": 0.15, "learning_rate": 9.952286844096587e-07, "logits/chosen": -0.5207216143608093, "logits/rejected": -0.5679858326911926, "logps/chosen": -182.28921508789062, "logps/rejected": -69.83648681640625, "loss": 0.3008, "rewards/accuracies": 1.0, "rewards/chosen": 1.5404328107833862, "rewards/margins": 1.0920441150665283, "rewards/rejected": 0.4483886659145355, "step": 896 }, { "epoch": 0.15, "learning_rate": 9.952105543264556e-07, "logits/chosen": -0.6681405901908875, "logits/rejected": -0.5698761343955994, "logps/chosen": -143.09986877441406, "logps/rejected": -83.71878051757812, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": 1.5304336547851562, "rewards/margins": 0.8244865536689758, "rewards/rejected": 0.7059471011161804, "step": 897 }, { "epoch": 0.15, "learning_rate": 9.951923900288888e-07, "logits/chosen": -0.7295212745666504, "logits/rejected": -0.6670010089874268, "logps/chosen": -120.31796264648438, "logps/rejected": -193.57879638671875, "loss": 1.1575, "rewards/accuracies": 0.0, "rewards/chosen": 0.29641494154930115, "rewards/margins": -1.0717124938964844, "rewards/rejected": 1.368127465248108, "step": 898 }, { "epoch": 0.15, "learning_rate": 9.951741915182134e-07, "logits/chosen": -0.3865586519241333, "logits/rejected": -0.3191562592983246, "logps/chosen": -70.9472885131836, "logps/rejected": -46.15736389160156, "loss": 0.5685, "rewards/accuracies": 0.0, "rewards/chosen": 0.3513481318950653, "rewards/margins": -0.0053119659423828125, "rewards/rejected": 0.3566600978374481, "step": 899 }, { "epoch": 0.15, "learning_rate": 9.951559587956868e-07, "logits/chosen": 0.0989215075969696, "logits/rejected": 0.0957353413105011, "logps/chosen": -3.737154245376587, "logps/rejected": -8.845479965209961, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.14069555699825287, "rewards/margins": 0.1271091252565384, "rewards/rejected": 0.01358642615377903, "step": 900 }, { "epoch": 0.15, "learning_rate": 9.951376918625686e-07, "logits/chosen": -0.6902676224708557, "logits/rejected": -0.6433651447296143, "logps/chosen": -102.97773742675781, "logps/rejected": -78.44508361816406, "loss": 0.8687, "rewards/accuracies": 0.0, "rewards/chosen": 1.2665375471115112, "rewards/margins": -0.10231161117553711, "rewards/rejected": 1.3688491582870483, "step": 901 }, { "epoch": 0.15, "learning_rate": 9.951193907201212e-07, "logits/chosen": -0.3455348312854767, "logits/rejected": -0.34734436869621277, "logps/chosen": -8.7511625289917, "logps/rejected": -1.8650290966033936, "loss": 0.7581, "rewards/accuracies": 0.0, "rewards/chosen": -0.0070747374556958675, "rewards/margins": -0.14461544156074524, "rewards/rejected": 0.13754069805145264, "step": 902 }, { "epoch": 0.15, "learning_rate": 9.951010553696085e-07, "logits/chosen": -0.5164685249328613, "logits/rejected": -0.5866096615791321, "logps/chosen": -201.54808044433594, "logps/rejected": -53.56258010864258, "loss": 0.5238, "rewards/accuracies": 1.0, "rewards/chosen": 2.0338470935821533, "rewards/margins": 0.8247203826904297, "rewards/rejected": 1.2091267108917236, "step": 903 }, { "epoch": 0.15, "learning_rate": 9.950826858122976e-07, "logits/chosen": -0.20234179496765137, "logits/rejected": -0.2170984297990799, "logps/chosen": -5.61583948135376, "logps/rejected": -3.6135404109954834, "loss": 1.3216, "rewards/accuracies": 0.0, "rewards/chosen": -0.03534236177802086, "rewards/margins": -0.14145897328853607, "rewards/rejected": 0.10611660778522491, "step": 904 }, { "epoch": 0.15, "learning_rate": 9.950642820494577e-07, "logits/chosen": -0.4192408621311188, "logits/rejected": -0.3408571779727936, "logps/chosen": -113.86698913574219, "logps/rejected": -92.71470642089844, "loss": 0.633, "rewards/accuracies": 0.0, "rewards/chosen": 0.08964996784925461, "rewards/margins": -0.24982300400733948, "rewards/rejected": 0.3394729793071747, "step": 905 }, { "epoch": 0.15, "learning_rate": 9.9504584408236e-07, "logits/chosen": -0.5479927062988281, "logits/rejected": -0.5637657046318054, "logps/chosen": -87.01516723632812, "logps/rejected": -48.0171012878418, "loss": 1.1578, "rewards/accuracies": 0.0, "rewards/chosen": 0.11530838161706924, "rewards/margins": -0.3401363492012024, "rewards/rejected": 0.45544472336769104, "step": 906 }, { "epoch": 0.15, "learning_rate": 9.950273719122791e-07, "logits/chosen": -0.4147382378578186, "logits/rejected": -0.39167654514312744, "logps/chosen": -54.7701416015625, "logps/rejected": -79.85029602050781, "loss": 0.8659, "rewards/accuracies": 0.0, "rewards/chosen": 1.120409369468689, "rewards/margins": -0.06336450576782227, "rewards/rejected": 1.1837738752365112, "step": 907 }, { "epoch": 0.15, "learning_rate": 9.950088655404905e-07, "logits/chosen": -0.3737626373767853, "logits/rejected": -0.3070128262042999, "logps/chosen": -51.913169860839844, "logps/rejected": -75.71819305419922, "loss": 0.7534, "rewards/accuracies": 1.0, "rewards/chosen": 1.0536144971847534, "rewards/margins": 0.14731109142303467, "rewards/rejected": 0.9063034057617188, "step": 908 }, { "epoch": 0.15, "learning_rate": 9.949903249682733e-07, "logits/chosen": -0.4105655550956726, "logits/rejected": -0.4147549569606781, "logps/chosen": -89.23606872558594, "logps/rejected": -172.97113037109375, "loss": 0.8382, "rewards/accuracies": 0.0, "rewards/chosen": 0.24129639565944672, "rewards/margins": -0.5148254036903381, "rewards/rejected": 0.756121814250946, "step": 909 }, { "epoch": 0.15, "learning_rate": 9.949717501969079e-07, "logits/chosen": -0.4588710367679596, "logits/rejected": -0.37081071734428406, "logps/chosen": -206.685791015625, "logps/rejected": -131.1126251220703, "loss": 0.5768, "rewards/accuracies": 1.0, "rewards/chosen": 1.8346771001815796, "rewards/margins": 0.11620938777923584, "rewards/rejected": 1.7184677124023438, "step": 910 }, { "epoch": 0.15, "learning_rate": 9.949531412276784e-07, "logits/chosen": -0.4114108383655548, "logits/rejected": -0.3957681357860565, "logps/chosen": -97.885498046875, "logps/rejected": -10.307147026062012, "loss": 0.3572, "rewards/accuracies": 1.0, "rewards/chosen": 1.204365611076355, "rewards/margins": 0.6972243785858154, "rewards/rejected": 0.5071412324905396, "step": 911 }, { "epoch": 0.15, "learning_rate": 9.949344980618698e-07, "logits/chosen": -0.5057030320167542, "logits/rejected": -0.4752275347709656, "logps/chosen": -113.69500732421875, "logps/rejected": -75.25897216796875, "loss": 0.6292, "rewards/accuracies": 0.0, "rewards/chosen": 0.30383530259132385, "rewards/margins": -0.2156257927417755, "rewards/rejected": 0.5194610953330994, "step": 912 }, { "epoch": 0.15, "learning_rate": 9.949158207007709e-07, "logits/chosen": -0.6568328142166138, "logits/rejected": -0.649233877658844, "logps/chosen": -198.23153686523438, "logps/rejected": -100.40364837646484, "loss": 0.8777, "rewards/accuracies": 0.0, "rewards/chosen": 0.340982049703598, "rewards/margins": -0.14442062377929688, "rewards/rejected": 0.4854026734828949, "step": 913 }, { "epoch": 0.15, "learning_rate": 9.948971091456714e-07, "logits/chosen": -0.3276211619377136, "logits/rejected": -0.32786422967910767, "logps/chosen": -275.92596435546875, "logps/rejected": -187.87747192382812, "loss": 0.5946, "rewards/accuracies": 1.0, "rewards/chosen": 2.558734178543091, "rewards/margins": 0.3237426280975342, "rewards/rejected": 2.2349915504455566, "step": 914 }, { "epoch": 0.15, "learning_rate": 9.948783633978647e-07, "logits/chosen": -0.5147761702537537, "logits/rejected": -0.4105892479419708, "logps/chosen": -233.70730590820312, "logps/rejected": -245.86782836914062, "loss": 0.9313, "rewards/accuracies": 0.0, "rewards/chosen": 1.2250488996505737, "rewards/margins": -1.3802520036697388, "rewards/rejected": 2.6053009033203125, "step": 915 }, { "epoch": 0.15, "learning_rate": 9.948595834586455e-07, "logits/chosen": -0.46896660327911377, "logits/rejected": -0.48480963706970215, "logps/chosen": -59.37959671020508, "logps/rejected": -51.97996520996094, "loss": 0.9028, "rewards/accuracies": 0.0, "rewards/chosen": 0.8681003451347351, "rewards/margins": -0.6457729935646057, "rewards/rejected": 1.5138733386993408, "step": 916 }, { "epoch": 0.15, "learning_rate": 9.948407693293116e-07, "logits/chosen": -0.28651636838912964, "logits/rejected": -0.28038570284843445, "logps/chosen": -138.8177947998047, "logps/rejected": -165.98178100585938, "loss": 0.944, "rewards/accuracies": 0.0, "rewards/chosen": 1.80693519115448, "rewards/margins": -0.74791419506073, "rewards/rejected": 2.55484938621521, "step": 917 }, { "epoch": 0.15, "learning_rate": 9.948219210111626e-07, "logits/chosen": -0.48579874634742737, "logits/rejected": -0.40771934390068054, "logps/chosen": -230.5446014404297, "logps/rejected": -88.3382568359375, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 1.274267554283142, "rewards/margins": 0.4840911626815796, "rewards/rejected": 0.7901763916015625, "step": 918 }, { "epoch": 0.15, "learning_rate": 9.94803038505501e-07, "logits/chosen": -0.8292404413223267, "logits/rejected": -0.8292404413223267, "logps/chosen": -47.023841857910156, "logps/rejected": -47.023841857910156, "loss": 0.7977, "rewards/accuracies": 0.0, "rewards/chosen": 0.12404900044202805, "rewards/margins": 0.0, "rewards/rejected": 0.12404900044202805, "step": 919 }, { "epoch": 0.15, "learning_rate": 9.947841218136314e-07, "logits/chosen": -0.40569254755973816, "logits/rejected": -0.24301646649837494, "logps/chosen": -75.38636779785156, "logps/rejected": -74.13532257080078, "loss": 0.5704, "rewards/accuracies": 1.0, "rewards/chosen": 0.5742378234863281, "rewards/margins": 0.13100889325141907, "rewards/rejected": 0.44322893023490906, "step": 920 }, { "epoch": 0.15, "learning_rate": 9.947651709368604e-07, "logits/chosen": -0.39979100227355957, "logits/rejected": -0.404857337474823, "logps/chosen": -20.694564819335938, "logps/rejected": -20.328052520751953, "loss": 0.7408, "rewards/accuracies": 0.0, "rewards/chosen": 0.4203033447265625, "rewards/margins": -0.1704128384590149, "rewards/rejected": 0.5907161831855774, "step": 921 }, { "epoch": 0.15, "learning_rate": 9.947461858764977e-07, "logits/chosen": -0.8605642914772034, "logits/rejected": -0.8731006979942322, "logps/chosen": -144.7519073486328, "logps/rejected": -77.6094970703125, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 2.37168288230896, "rewards/margins": 1.1924813985824585, "rewards/rejected": 1.1792014837265015, "step": 922 }, { "epoch": 0.15, "learning_rate": 9.94727166633855e-07, "logits/chosen": -0.3461511433124542, "logits/rejected": -0.3615994453430176, "logps/chosen": -32.584320068359375, "logps/rejected": -55.1397590637207, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.05562744289636612, "rewards/margins": -0.26142922043800354, "rewards/rejected": 0.31705665588378906, "step": 923 }, { "epoch": 0.15, "learning_rate": 9.947081132102462e-07, "logits/chosen": -0.10010475665330887, "logits/rejected": -0.07564137876033783, "logps/chosen": -66.3153076171875, "logps/rejected": -59.75578308105469, "loss": 0.6477, "rewards/accuracies": 1.0, "rewards/chosen": 0.9603561758995056, "rewards/margins": 0.5421615839004517, "rewards/rejected": 0.41819459199905396, "step": 924 }, { "epoch": 0.15, "learning_rate": 9.946890256069877e-07, "logits/chosen": -0.24433478713035583, "logits/rejected": -0.16555146872997284, "logps/chosen": -64.20942687988281, "logps/rejected": -14.997598648071289, "loss": 0.5026, "rewards/accuracies": 1.0, "rewards/chosen": 0.8850181698799133, "rewards/margins": 0.5890001058578491, "rewards/rejected": 0.2960180342197418, "step": 925 }, { "epoch": 0.15, "learning_rate": 9.946699038253984e-07, "logits/chosen": -0.40506866574287415, "logits/rejected": -0.40415850281715393, "logps/chosen": -10.619102478027344, "logps/rejected": -5.348773002624512, "loss": 1.0912, "rewards/accuracies": 0.0, "rewards/chosen": 0.13226966559886932, "rewards/margins": -0.12491194903850555, "rewards/rejected": 0.2571816146373749, "step": 926 }, { "epoch": 0.15, "learning_rate": 9.946507478667995e-07, "logits/chosen": -0.44744935631752014, "logits/rejected": -0.4730706810951233, "logps/chosen": -84.55058288574219, "logps/rejected": -67.25149536132812, "loss": 1.5634, "rewards/accuracies": 0.0, "rewards/chosen": -0.02333221398293972, "rewards/margins": -0.345458984375, "rewards/rejected": 0.3221267759799957, "step": 927 }, { "epoch": 0.15, "learning_rate": 9.946315577325139e-07, "logits/chosen": -0.3891061842441559, "logits/rejected": -0.34948354959487915, "logps/chosen": -110.11279296875, "logps/rejected": -76.33999633789062, "loss": 0.46, "rewards/accuracies": 1.0, "rewards/chosen": 1.3999977111816406, "rewards/margins": 1.010279893875122, "rewards/rejected": 0.38971787691116333, "step": 928 }, { "epoch": 0.15, "learning_rate": 9.946123334238683e-07, "logits/chosen": -0.42204225063323975, "logits/rejected": -0.4049982726573944, "logps/chosen": -131.84225463867188, "logps/rejected": -57.565032958984375, "loss": 0.5466, "rewards/accuracies": 1.0, "rewards/chosen": 1.2913391590118408, "rewards/margins": 0.49126744270324707, "rewards/rejected": 0.8000717163085938, "step": 929 }, { "epoch": 0.15, "learning_rate": 9.945930749421902e-07, "logits/chosen": -0.5965118408203125, "logits/rejected": -0.5251451134681702, "logps/chosen": -75.2655029296875, "logps/rejected": -180.35467529296875, "loss": 1.4472, "rewards/accuracies": 0.0, "rewards/chosen": 0.4543258845806122, "rewards/margins": -1.5080214738845825, "rewards/rejected": 1.962347388267517, "step": 930 }, { "epoch": 0.15, "learning_rate": 9.945737822888109e-07, "logits/chosen": -0.49803948402404785, "logits/rejected": -0.3925989866256714, "logps/chosen": -82.00016784667969, "logps/rejected": -168.44186401367188, "loss": 0.6751, "rewards/accuracies": 0.0, "rewards/chosen": 0.99920654296875, "rewards/margins": -0.46339118480682373, "rewards/rejected": 1.4625977277755737, "step": 931 }, { "epoch": 0.15, "learning_rate": 9.945544554650626e-07, "logits/chosen": -0.38248974084854126, "logits/rejected": 0.05921003222465515, "logps/chosen": -54.25990295410156, "logps/rejected": -107.2830810546875, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 0.985089123249054, "rewards/margins": 0.570220947265625, "rewards/rejected": 0.41486817598342896, "step": 932 }, { "epoch": 0.15, "learning_rate": 9.945350944722812e-07, "logits/chosen": -0.14082784950733185, "logits/rejected": -0.14082784950733185, "logps/chosen": -80.62692260742188, "logps/rejected": -80.62692260742188, "loss": 0.6893, "rewards/accuracies": 0.0, "rewards/chosen": 0.21582642197608948, "rewards/margins": 0.0, "rewards/rejected": 0.21582642197608948, "step": 933 }, { "epoch": 0.15, "learning_rate": 9.945156993118041e-07, "logits/chosen": -0.9214445352554321, "logits/rejected": -0.9079039096832275, "logps/chosen": -122.96182250976562, "logps/rejected": -39.31532287597656, "loss": 0.6577, "rewards/accuracies": 1.0, "rewards/chosen": 0.1896827667951584, "rewards/margins": 0.10861663520336151, "rewards/rejected": 0.08106613159179688, "step": 934 }, { "epoch": 0.15, "learning_rate": 9.94496269984971e-07, "logits/chosen": -0.5052337050437927, "logits/rejected": -0.475515753030777, "logps/chosen": -46.999324798583984, "logps/rejected": -40.43523025512695, "loss": 0.4487, "rewards/accuracies": 1.0, "rewards/chosen": 0.7841945886611938, "rewards/margins": 0.38199082016944885, "rewards/rejected": 0.402203768491745, "step": 935 }, { "epoch": 0.15, "learning_rate": 9.94476806493125e-07, "logits/chosen": -0.5669957399368286, "logits/rejected": -0.5707350373268127, "logps/chosen": -99.07353210449219, "logps/rejected": -93.02027130126953, "loss": 0.6614, "rewards/accuracies": 0.0, "rewards/chosen": 0.29829636216163635, "rewards/margins": -0.614250898361206, "rewards/rejected": 0.9125472903251648, "step": 936 }, { "epoch": 0.15, "learning_rate": 9.944573088376102e-07, "logits/chosen": -0.5822595953941345, "logits/rejected": -0.6538894772529602, "logps/chosen": -224.86996459960938, "logps/rejected": -91.8403549194336, "loss": 0.9241, "rewards/accuracies": 0.0, "rewards/chosen": 1.0449494123458862, "rewards/margins": -0.4832862615585327, "rewards/rejected": 1.528235673904419, "step": 937 }, { "epoch": 0.15, "learning_rate": 9.94437777019774e-07, "logits/chosen": -0.5956529378890991, "logits/rejected": -0.5936639904975891, "logps/chosen": -117.74163055419922, "logps/rejected": -15.37677001953125, "loss": 0.7011, "rewards/accuracies": 1.0, "rewards/chosen": 0.5773887634277344, "rewards/margins": 0.19284304976463318, "rewards/rejected": 0.3845457136631012, "step": 938 }, { "epoch": 0.15, "learning_rate": 9.94418211040966e-07, "logits/chosen": -0.5193156599998474, "logits/rejected": -0.5196067094802856, "logps/chosen": -203.67532348632812, "logps/rejected": -115.69783020019531, "loss": 0.4622, "rewards/accuracies": 1.0, "rewards/chosen": 0.712261974811554, "rewards/margins": 0.18748629093170166, "rewards/rejected": 0.5247756838798523, "step": 939 }, { "epoch": 0.15, "learning_rate": 9.943986109025377e-07, "logits/chosen": -0.12168669700622559, "logits/rejected": -0.02634008228778839, "logps/chosen": -175.95791625976562, "logps/rejected": -20.69942283630371, "loss": 0.497, "rewards/accuracies": 1.0, "rewards/chosen": 0.8828399777412415, "rewards/margins": 0.6744630932807922, "rewards/rejected": 0.20837688446044922, "step": 940 }, { "epoch": 0.15, "learning_rate": 9.943789766058434e-07, "logits/chosen": -0.43269675970077515, "logits/rejected": -0.46744459867477417, "logps/chosen": -96.24032592773438, "logps/rejected": -160.405029296875, "loss": 1.2507, "rewards/accuracies": 0.0, "rewards/chosen": 0.5585678219795227, "rewards/margins": -1.177537441253662, "rewards/rejected": 1.7361053228378296, "step": 941 }, { "epoch": 0.15, "learning_rate": 9.943593081522397e-07, "logits/chosen": -0.3124265968799591, "logits/rejected": -0.31309929490089417, "logps/chosen": -118.59629821777344, "logps/rejected": -126.67513275146484, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": 1.233912706375122, "rewards/margins": 0.1143333911895752, "rewards/rejected": 1.1195793151855469, "step": 942 }, { "epoch": 0.15, "learning_rate": 9.943396055430855e-07, "logits/chosen": -0.45800116658210754, "logits/rejected": -0.376458078622818, "logps/chosen": -89.06057739257812, "logps/rejected": -115.25715637207031, "loss": 1.5273, "rewards/accuracies": 0.0, "rewards/chosen": 0.8768966794013977, "rewards/margins": -0.13019102811813354, "rewards/rejected": 1.0070877075195312, "step": 943 }, { "epoch": 0.15, "learning_rate": 9.943198687797421e-07, "logits/chosen": -0.6856749653816223, "logits/rejected": -0.6932579874992371, "logps/chosen": -117.07266235351562, "logps/rejected": -134.60716247558594, "loss": 1.1836, "rewards/accuracies": 0.0, "rewards/chosen": 0.21651001274585724, "rewards/margins": -1.466406226158142, "rewards/rejected": 1.682916283607483, "step": 944 }, { "epoch": 0.15, "learning_rate": 9.94300097863573e-07, "logits/chosen": -0.5926874876022339, "logits/rejected": -0.6097602248191833, "logps/chosen": -116.20455169677734, "logps/rejected": -104.01321411132812, "loss": 0.5929, "rewards/accuracies": 0.0, "rewards/chosen": 0.47669145464897156, "rewards/margins": -0.2409515082836151, "rewards/rejected": 0.7176429629325867, "step": 945 }, { "epoch": 0.15, "learning_rate": 9.942802927959442e-07, "logits/chosen": -0.5079804062843323, "logits/rejected": -0.4924389123916626, "logps/chosen": -152.2129669189453, "logps/rejected": -151.73155212402344, "loss": 0.2782, "rewards/accuracies": 1.0, "rewards/chosen": 2.1201157569885254, "rewards/margins": 0.9481415748596191, "rewards/rejected": 1.1719741821289062, "step": 946 }, { "epoch": 0.15, "learning_rate": 9.942604535782242e-07, "logits/chosen": -0.5607247948646545, "logits/rejected": -0.5795480012893677, "logps/chosen": -54.6960563659668, "logps/rejected": -85.35992431640625, "loss": 0.5624, "rewards/accuracies": 0.0, "rewards/chosen": 0.7491123080253601, "rewards/margins": -0.2338329553604126, "rewards/rejected": 0.9829452633857727, "step": 947 }, { "epoch": 0.15, "learning_rate": 9.942405802117834e-07, "logits/chosen": -0.4895090162754059, "logits/rejected": -0.44684112071990967, "logps/chosen": -189.93402099609375, "logps/rejected": -96.275634765625, "loss": 0.2439, "rewards/accuracies": 1.0, "rewards/chosen": 1.4942978620529175, "rewards/margins": 0.9839691519737244, "rewards/rejected": 0.5103287100791931, "step": 948 }, { "epoch": 0.15, "learning_rate": 9.942206726979954e-07, "logits/chosen": -0.5371648669242859, "logits/rejected": -0.5448684096336365, "logps/chosen": -73.86761474609375, "logps/rejected": -118.4504623413086, "loss": 0.8618, "rewards/accuracies": 0.0, "rewards/chosen": 1.0394783020019531, "rewards/margins": -0.9952285289764404, "rewards/rejected": 2.0347068309783936, "step": 949 }, { "epoch": 0.15, "learning_rate": 9.94200731038235e-07, "logits/chosen": -0.9107662439346313, "logits/rejected": -0.8948917388916016, "logps/chosen": -115.04891967773438, "logps/rejected": -76.97254943847656, "loss": 1.0759, "rewards/accuracies": 0.0, "rewards/chosen": -0.02426605299115181, "rewards/margins": -0.4450134336948395, "rewards/rejected": 0.4207473695278168, "step": 950 }, { "epoch": 0.15, "learning_rate": 9.941807552338803e-07, "logits/chosen": -0.07184635102748871, "logits/rejected": -0.07043439894914627, "logps/chosen": -107.02757263183594, "logps/rejected": -69.46292877197266, "loss": 0.9223, "rewards/accuracies": 0.0, "rewards/chosen": 0.6289207339286804, "rewards/margins": -0.2199035882949829, "rewards/rejected": 0.8488243222236633, "step": 951 }, { "epoch": 0.15, "learning_rate": 9.941607452863115e-07, "logits/chosen": -0.5790843963623047, "logits/rejected": -0.5673204064369202, "logps/chosen": -79.02676391601562, "logps/rejected": -85.5396728515625, "loss": 0.7499, "rewards/accuracies": 0.0, "rewards/chosen": 0.2692100703716278, "rewards/margins": -0.3073265254497528, "rewards/rejected": 0.5765365958213806, "step": 952 }, { "epoch": 0.15, "learning_rate": 9.94140701196911e-07, "logits/chosen": -0.13732396066188812, "logits/rejected": -0.16906829178333282, "logps/chosen": -21.035537719726562, "logps/rejected": -40.498504638671875, "loss": 0.6379, "rewards/accuracies": 0.0, "rewards/chosen": 0.12800446152687073, "rewards/margins": -0.09317168593406677, "rewards/rejected": 0.2211761474609375, "step": 953 }, { "epoch": 0.15, "learning_rate": 9.941206229670634e-07, "logits/chosen": -0.4592660665512085, "logits/rejected": -0.3887856602668762, "logps/chosen": -99.10505676269531, "logps/rejected": -121.32941436767578, "loss": 0.8027, "rewards/accuracies": 0.0, "rewards/chosen": 1.228417992591858, "rewards/margins": -0.5401756763458252, "rewards/rejected": 1.768593668937683, "step": 954 }, { "epoch": 0.16, "learning_rate": 9.941005105981563e-07, "logits/chosen": -0.48022496700286865, "logits/rejected": -0.45139846205711365, "logps/chosen": -74.14089965820312, "logps/rejected": -160.3946075439453, "loss": 1.0635, "rewards/accuracies": 0.0, "rewards/chosen": 0.1295875608921051, "rewards/margins": -0.2211097776889801, "rewards/rejected": 0.3506973385810852, "step": 955 }, { "epoch": 0.16, "learning_rate": 9.940803640915792e-07, "logits/chosen": -0.3791871666908264, "logits/rejected": -0.34950339794158936, "logps/chosen": -183.83358764648438, "logps/rejected": -139.1790313720703, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 2.0017471313476562, "rewards/margins": -0.32537388801574707, "rewards/rejected": 2.3271210193634033, "step": 956 }, { "epoch": 0.16, "learning_rate": 9.94060183448724e-07, "logits/chosen": -0.36404338479042053, "logits/rejected": -0.3299616575241089, "logps/chosen": -90.87330627441406, "logps/rejected": -102.36058807373047, "loss": 1.0985, "rewards/accuracies": 0.0, "rewards/chosen": 0.982495129108429, "rewards/margins": -0.6122002005577087, "rewards/rejected": 1.5946953296661377, "step": 957 }, { "epoch": 0.16, "learning_rate": 9.940399686709848e-07, "logits/chosen": -0.342166543006897, "logits/rejected": -0.34286677837371826, "logps/chosen": -103.23927307128906, "logps/rejected": -52.29560852050781, "loss": 0.8187, "rewards/accuracies": 0.0, "rewards/chosen": 0.18619003891944885, "rewards/margins": -0.8800922632217407, "rewards/rejected": 1.0662822723388672, "step": 958 }, { "epoch": 0.16, "learning_rate": 9.940197197597587e-07, "logits/chosen": -0.10380646586418152, "logits/rejected": -0.006333056837320328, "logps/chosen": -64.49095153808594, "logps/rejected": -53.41801452636719, "loss": 0.803, "rewards/accuracies": 1.0, "rewards/chosen": 0.9815826416015625, "rewards/margins": 0.4537234902381897, "rewards/rejected": 0.5278591513633728, "step": 959 }, { "epoch": 0.16, "learning_rate": 9.939994367164442e-07, "logits/chosen": -0.5355158448219299, "logits/rejected": -0.5196865200996399, "logps/chosen": -101.45866394042969, "logps/rejected": -123.64315795898438, "loss": 0.7814, "rewards/accuracies": 0.0, "rewards/chosen": 0.2761543393135071, "rewards/margins": -0.5191658139228821, "rewards/rejected": 0.7953201532363892, "step": 960 }, { "epoch": 0.16, "learning_rate": 9.93979119542443e-07, "logits/chosen": -0.22544559836387634, "logits/rejected": -0.22544559836387634, "logps/chosen": -52.17161178588867, "logps/rejected": -52.17161178588867, "loss": 0.9486, "rewards/accuracies": 0.0, "rewards/chosen": 1.0759124755859375, "rewards/margins": 0.0, "rewards/rejected": 1.0759124755859375, "step": 961 }, { "epoch": 0.16, "learning_rate": 9.939587682391586e-07, "logits/chosen": -0.515507161617279, "logits/rejected": -0.5055263638496399, "logps/chosen": -85.7392807006836, "logps/rejected": -58.91972351074219, "loss": 0.6671, "rewards/accuracies": 1.0, "rewards/chosen": 1.34502112865448, "rewards/margins": 0.10690999031066895, "rewards/rejected": 1.238111138343811, "step": 962 }, { "epoch": 0.16, "learning_rate": 9.939383828079972e-07, "logits/chosen": -0.4864521622657776, "logits/rejected": -0.4506518542766571, "logps/chosen": -159.15167236328125, "logps/rejected": -72.22525024414062, "loss": 0.7664, "rewards/accuracies": 0.0, "rewards/chosen": 1.30144202709198, "rewards/margins": -0.17971110343933105, "rewards/rejected": 1.481153130531311, "step": 963 }, { "epoch": 0.16, "learning_rate": 9.939179632503672e-07, "logits/chosen": -0.3851938545703888, "logits/rejected": -0.4024116098880768, "logps/chosen": -87.12199401855469, "logps/rejected": -32.099937438964844, "loss": 0.6584, "rewards/accuracies": 1.0, "rewards/chosen": 0.5276802182197571, "rewards/margins": 0.38838309049606323, "rewards/rejected": 0.13929711282253265, "step": 964 }, { "epoch": 0.16, "learning_rate": 9.938975095676797e-07, "logits/chosen": -0.5801361203193665, "logits/rejected": -0.6386829018592834, "logps/chosen": -219.49771118164062, "logps/rejected": -108.0543212890625, "loss": 0.7541, "rewards/accuracies": 1.0, "rewards/chosen": 0.5457214713096619, "rewards/margins": 0.4547729790210724, "rewards/rejected": 0.09094848483800888, "step": 965 }, { "epoch": 0.16, "learning_rate": 9.938770217613472e-07, "logits/chosen": -0.35314279794692993, "logits/rejected": -0.21471287310123444, "logps/chosen": -284.7803039550781, "logps/rejected": -27.098344802856445, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.593994140625, "rewards/margins": 0.5698179006576538, "rewards/rejected": 0.02417621575295925, "step": 966 }, { "epoch": 0.16, "learning_rate": 9.938564998327858e-07, "logits/chosen": -0.12375377863645554, "logits/rejected": -0.06103141978383064, "logps/chosen": -78.57330322265625, "logps/rejected": -63.53516387939453, "loss": 0.6579, "rewards/accuracies": 1.0, "rewards/chosen": 1.0489929914474487, "rewards/margins": 0.08490151166915894, "rewards/rejected": 0.9640914797782898, "step": 967 }, { "epoch": 0.16, "learning_rate": 9.938359437834133e-07, "logits/chosen": -0.3243097960948944, "logits/rejected": -0.3298949599266052, "logps/chosen": -122.7320556640625, "logps/rejected": -87.36099243164062, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": 2.3668930530548096, "rewards/margins": 1.2787460088729858, "rewards/rejected": 1.0881470441818237, "step": 968 }, { "epoch": 0.16, "learning_rate": 9.938153536146497e-07, "logits/chosen": -0.7549344897270203, "logits/rejected": -0.7552939653396606, "logps/chosen": -209.6094970703125, "logps/rejected": -124.63658905029297, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": 0.18392638862133026, "rewards/margins": 0.05450209975242615, "rewards/rejected": 0.1294242888689041, "step": 969 }, { "epoch": 0.16, "learning_rate": 9.937947293279175e-07, "logits/chosen": -0.44254887104034424, "logits/rejected": -0.43124791979789734, "logps/chosen": -85.40834045410156, "logps/rejected": -71.65155792236328, "loss": 0.9118, "rewards/accuracies": 0.0, "rewards/chosen": 0.6457611322402954, "rewards/margins": -0.3762824535369873, "rewards/rejected": 1.0220435857772827, "step": 970 }, { "epoch": 0.16, "learning_rate": 9.937740709246422e-07, "logits/chosen": -0.5219714641571045, "logits/rejected": -0.46556609869003296, "logps/chosen": -158.9251708984375, "logps/rejected": -85.85003662109375, "loss": 0.49, "rewards/accuracies": 1.0, "rewards/chosen": 1.2406189441680908, "rewards/margins": 0.2694237232208252, "rewards/rejected": 0.9711952209472656, "step": 971 }, { "epoch": 0.16, "learning_rate": 9.937533784062505e-07, "logits/chosen": -0.5166221857070923, "logits/rejected": -0.4996938705444336, "logps/chosen": -101.58123016357422, "logps/rejected": -30.599348068237305, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.493843913078308, "rewards/margins": 1.118759036064148, "rewards/rejected": 0.37508487701416016, "step": 972 }, { "epoch": 0.16, "learning_rate": 9.937326517741723e-07, "logits/chosen": -0.15984299778938293, "logits/rejected": -0.17089354991912842, "logps/chosen": -78.25602722167969, "logps/rejected": -64.6107177734375, "loss": 0.6116, "rewards/accuracies": 0.0, "rewards/chosen": 0.7486435174942017, "rewards/margins": -0.21893233060836792, "rewards/rejected": 0.9675758481025696, "step": 973 }, { "epoch": 0.16, "learning_rate": 9.937118910298396e-07, "logits/chosen": -0.5768565535545349, "logits/rejected": -0.5263800024986267, "logps/chosen": -63.676414489746094, "logps/rejected": -82.50157165527344, "loss": 0.6385, "rewards/accuracies": 1.0, "rewards/chosen": 0.9449653625488281, "rewards/margins": 0.2639358639717102, "rewards/rejected": 0.6810294985771179, "step": 974 }, { "epoch": 0.16, "learning_rate": 9.93691096174687e-07, "logits/chosen": -0.1733677238225937, "logits/rejected": -0.14543457329273224, "logps/chosen": -130.0234375, "logps/rejected": -68.12568664550781, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": 0.5286346673965454, "rewards/margins": 0.39167481660842896, "rewards/rejected": 0.13695983588695526, "step": 975 }, { "epoch": 0.16, "learning_rate": 9.936702672101507e-07, "logits/chosen": -0.34680819511413574, "logits/rejected": -0.36094170808792114, "logps/chosen": -85.54222869873047, "logps/rejected": -173.01528930664062, "loss": 0.6282, "rewards/accuracies": 0.0, "rewards/chosen": 0.7764305472373962, "rewards/margins": -0.3305671811103821, "rewards/rejected": 1.1069977283477783, "step": 976 }, { "epoch": 0.16, "learning_rate": 9.936494041376701e-07, "logits/chosen": -0.6369107365608215, "logits/rejected": -0.6286468505859375, "logps/chosen": -89.34121704101562, "logps/rejected": -92.8769302368164, "loss": 0.588, "rewards/accuracies": 0.0, "rewards/chosen": 0.6406364440917969, "rewards/margins": -0.31227874755859375, "rewards/rejected": 0.9529151916503906, "step": 977 }, { "epoch": 0.16, "learning_rate": 9.936285069586869e-07, "logits/chosen": -0.10867006331682205, "logits/rejected": -0.10824155062437057, "logps/chosen": -2.787602186203003, "logps/rejected": -3.2179603576660156, "loss": 0.5171, "rewards/accuracies": 1.0, "rewards/chosen": 0.13996021449565887, "rewards/margins": 0.10595929622650146, "rewards/rejected": 0.03400092199444771, "step": 978 }, { "epoch": 0.16, "learning_rate": 9.936075756746443e-07, "logits/chosen": -0.4482247233390808, "logits/rejected": -0.43560537695884705, "logps/chosen": -128.6011505126953, "logps/rejected": -76.37197875976562, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 1.7035369873046875, "rewards/margins": 0.6939681768417358, "rewards/rejected": 1.0095688104629517, "step": 979 }, { "epoch": 0.16, "learning_rate": 9.93586610286989e-07, "logits/chosen": -0.33834201097488403, "logits/rejected": -0.24495331943035126, "logps/chosen": -183.59756469726562, "logps/rejected": -72.61561584472656, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 1.7992767095565796, "rewards/margins": 1.0759490728378296, "rewards/rejected": 0.72332763671875, "step": 980 }, { "epoch": 0.16, "learning_rate": 9.935656107971693e-07, "logits/chosen": -0.7277691960334778, "logits/rejected": -0.7041890025138855, "logps/chosen": -318.31298828125, "logps/rejected": -122.59381103515625, "loss": 0.9294, "rewards/accuracies": 0.0, "rewards/chosen": 0.25921937823295593, "rewards/margins": -1.107031226158142, "rewards/rejected": 1.3662506341934204, "step": 981 }, { "epoch": 0.16, "learning_rate": 9.93544577206636e-07, "logits/chosen": -0.296669065952301, "logits/rejected": -0.24456891417503357, "logps/chosen": -100.98211669921875, "logps/rejected": -94.35081481933594, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 1.4134224653244019, "rewards/margins": 0.03264617919921875, "rewards/rejected": 1.380776286125183, "step": 982 }, { "epoch": 0.16, "learning_rate": 9.935235095168423e-07, "logits/chosen": -0.580673098564148, "logits/rejected": -0.5844411849975586, "logps/chosen": -80.85537719726562, "logps/rejected": -62.26264190673828, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 1.4675788879394531, "rewards/margins": 0.13373947143554688, "rewards/rejected": 1.3338394165039062, "step": 983 }, { "epoch": 0.16, "learning_rate": 9.93502407729244e-07, "logits/chosen": -0.5120640397071838, "logits/rejected": -0.5496660470962524, "logps/chosen": -320.3173828125, "logps/rejected": -159.58416748046875, "loss": 0.7166, "rewards/accuracies": 0.0, "rewards/chosen": 1.495306372642517, "rewards/margins": -0.15372776985168457, "rewards/rejected": 1.6490341424942017, "step": 984 }, { "epoch": 0.16, "learning_rate": 9.934812718452987e-07, "logits/chosen": -0.2323881983757019, "logits/rejected": -0.2367582619190216, "logps/chosen": -5.927206039428711, "logps/rejected": -15.88182258605957, "loss": 0.7313, "rewards/accuracies": 1.0, "rewards/chosen": 0.08683443069458008, "rewards/margins": 0.11778726428747177, "rewards/rejected": -0.030952835455536842, "step": 985 }, { "epoch": 0.16, "learning_rate": 9.93460101866467e-07, "logits/chosen": -0.4207488000392914, "logits/rejected": -0.4513551592826843, "logps/chosen": -69.35585021972656, "logps/rejected": -55.34226989746094, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.048603057861328125, "rewards/margins": -0.6561573147773743, "rewards/rejected": 0.7047603726387024, "step": 986 }, { "epoch": 0.16, "learning_rate": 9.934388977942114e-07, "logits/chosen": -0.41420385241508484, "logits/rejected": -0.4084748327732086, "logps/chosen": -107.12217712402344, "logps/rejected": -72.1055908203125, "loss": 0.608, "rewards/accuracies": 0.0, "rewards/chosen": 0.4883979856967926, "rewards/margins": -0.2030433714389801, "rewards/rejected": 0.6914413571357727, "step": 987 }, { "epoch": 0.16, "learning_rate": 9.93417659629997e-07, "logits/chosen": -0.6196743249893188, "logits/rejected": -0.6179621815681458, "logps/chosen": -93.62236785888672, "logps/rejected": -58.717613220214844, "loss": 1.0452, "rewards/accuracies": 0.0, "rewards/chosen": -0.004673004150390625, "rewards/margins": -1.2105156183242798, "rewards/rejected": 1.2058426141738892, "step": 988 }, { "epoch": 0.16, "learning_rate": 9.933963873752909e-07, "logits/chosen": -0.5029667615890503, "logits/rejected": -0.4705218970775604, "logps/chosen": -100.97657012939453, "logps/rejected": -63.683013916015625, "loss": 0.4379, "rewards/accuracies": 1.0, "rewards/chosen": 1.0701721906661987, "rewards/margins": 0.04936373233795166, "rewards/rejected": 1.020808458328247, "step": 989 }, { "epoch": 0.16, "learning_rate": 9.93375081031563e-07, "logits/chosen": -0.26676028966903687, "logits/rejected": -0.23943547904491425, "logps/chosen": -77.44954681396484, "logps/rejected": -119.55267333984375, "loss": 0.5117, "rewards/accuracies": 1.0, "rewards/chosen": 1.6868667602539062, "rewards/margins": 0.36399686336517334, "rewards/rejected": 1.322869896888733, "step": 990 }, { "epoch": 0.16, "learning_rate": 9.933537406002857e-07, "logits/chosen": -0.6293829679489136, "logits/rejected": -0.6676647067070007, "logps/chosen": -259.6576843261719, "logps/rejected": -178.66845703125, "loss": 0.8438, "rewards/accuracies": 0.0, "rewards/chosen": 0.9794372916221619, "rewards/margins": -1.0784943103790283, "rewards/rejected": 2.057931661605835, "step": 991 }, { "epoch": 0.16, "learning_rate": 9.933323660829328e-07, "logits/chosen": -0.41742879152297974, "logits/rejected": -0.4440658390522003, "logps/chosen": -25.505599975585938, "logps/rejected": -41.077239990234375, "loss": 0.6064, "rewards/accuracies": 0.0, "rewards/chosen": 0.6304513812065125, "rewards/margins": -0.3426298499107361, "rewards/rejected": 0.9730812311172485, "step": 992 }, { "epoch": 0.16, "learning_rate": 9.933109574809812e-07, "logits/chosen": -0.7493341565132141, "logits/rejected": -0.7527512907981873, "logps/chosen": -91.28418731689453, "logps/rejected": -47.76891326904297, "loss": 0.9542, "rewards/accuracies": 0.0, "rewards/chosen": 0.42657241225242615, "rewards/margins": -0.8402427434921265, "rewards/rejected": 1.266815185546875, "step": 993 }, { "epoch": 0.16, "learning_rate": 9.932895147959104e-07, "logits/chosen": -0.2597554326057434, "logits/rejected": -0.3459877371788025, "logps/chosen": -79.62229919433594, "logps/rejected": -67.37490844726562, "loss": 0.7855, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334152102470398, "rewards/margins": 0.1423095464706421, "rewards/rejected": 0.6911056637763977, "step": 994 }, { "epoch": 0.16, "learning_rate": 9.932680380292019e-07, "logits/chosen": -0.7703312635421753, "logits/rejected": -0.7422247529029846, "logps/chosen": -166.34609985351562, "logps/rejected": -21.26439094543457, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": 2.0983827114105225, "rewards/margins": 1.8781754970550537, "rewards/rejected": 0.22020721435546875, "step": 995 }, { "epoch": 0.16, "learning_rate": 9.932465271823389e-07, "logits/chosen": -0.5409230589866638, "logits/rejected": -0.5425909757614136, "logps/chosen": -56.323883056640625, "logps/rejected": -56.019168853759766, "loss": 0.8705, "rewards/accuracies": 0.0, "rewards/chosen": 0.4402538239955902, "rewards/margins": -0.7505401372909546, "rewards/rejected": 1.1907939910888672, "step": 996 }, { "epoch": 0.16, "learning_rate": 9.932249822568084e-07, "logits/chosen": -0.3714655339717865, "logits/rejected": -0.3756244480609894, "logps/chosen": -109.20484161376953, "logps/rejected": -83.46378326416016, "loss": 0.835, "rewards/accuracies": 0.0, "rewards/chosen": 0.08535843342542648, "rewards/margins": -0.4131622314453125, "rewards/rejected": 0.4985206723213196, "step": 997 }, { "epoch": 0.16, "learning_rate": 9.932034032540983e-07, "logits/chosen": -0.25705933570861816, "logits/rejected": -0.1691206842660904, "logps/chosen": -39.862548828125, "logps/rejected": -53.19675827026367, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 1.2717891931533813, "rewards/margins": 0.2665752172470093, "rewards/rejected": 1.005213975906372, "step": 998 }, { "epoch": 0.16, "learning_rate": 9.931817901756997e-07, "logits/chosen": -0.2885593771934509, "logits/rejected": -0.10807360708713531, "logps/chosen": -156.766845703125, "logps/rejected": -76.66605377197266, "loss": 0.3576, "rewards/accuracies": 1.0, "rewards/chosen": 1.2404053211212158, "rewards/margins": 0.1662842035293579, "rewards/rejected": 1.074121117591858, "step": 999 }, { "epoch": 0.16, "learning_rate": 9.931601430231062e-07, "logits/chosen": -0.5493323802947998, "logits/rejected": -0.5070179104804993, "logps/chosen": -62.728675842285156, "logps/rejected": -83.24513244628906, "loss": 1.1054, "rewards/accuracies": 0.0, "rewards/chosen": 0.323110967874527, "rewards/margins": -0.8472298383712769, "rewards/rejected": 1.1703407764434814, "step": 1000 }, { "epoch": 0.16, "learning_rate": 9.931384617978129e-07, "logits/chosen": -0.23271068930625916, "logits/rejected": -0.25712335109710693, "logps/chosen": -65.48970031738281, "logps/rejected": -73.76498413085938, "loss": 0.6345, "rewards/accuracies": 1.0, "rewards/chosen": 1.571088433265686, "rewards/margins": 0.848065972328186, "rewards/rejected": 0.7230224609375, "step": 1001 }, { "epoch": 0.16, "learning_rate": 9.931167465013182e-07, "logits/chosen": -0.6093865036964417, "logits/rejected": -0.5949302911758423, "logps/chosen": -118.96907043457031, "logps/rejected": -56.784828186035156, "loss": 0.9832, "rewards/accuracies": 0.0, "rewards/chosen": 0.09792633354663849, "rewards/margins": -0.38657915592193604, "rewards/rejected": 0.48450547456741333, "step": 1002 }, { "epoch": 0.16, "learning_rate": 9.930949971351221e-07, "logits/chosen": -0.1241183951497078, "logits/rejected": -0.11682067066431046, "logps/chosen": -5.069598197937012, "logps/rejected": -8.72120189666748, "loss": 0.9094, "rewards/accuracies": 1.0, "rewards/chosen": 0.19748011231422424, "rewards/margins": 0.26866990327835083, "rewards/rejected": -0.07118978351354599, "step": 1003 }, { "epoch": 0.16, "learning_rate": 9.930732137007275e-07, "logits/chosen": -0.3930560052394867, "logits/rejected": -0.3577191233634949, "logps/chosen": -34.97254180908203, "logps/rejected": -64.83873748779297, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.8250755667686462, "rewards/margins": 0.17308425903320312, "rewards/rejected": 0.6519913077354431, "step": 1004 }, { "epoch": 0.16, "learning_rate": 9.930513961996393e-07, "logits/chosen": -0.4500257074832916, "logits/rejected": -0.43798086047172546, "logps/chosen": -84.14128112792969, "logps/rejected": -38.852508544921875, "loss": 0.7606, "rewards/accuracies": 0.0, "rewards/chosen": -0.02609710767865181, "rewards/margins": -0.11770020425319672, "rewards/rejected": 0.09160309284925461, "step": 1005 }, { "epoch": 0.16, "learning_rate": 9.930295446333647e-07, "logits/chosen": -0.48030948638916016, "logits/rejected": -0.2616868019104004, "logps/chosen": -65.81604766845703, "logps/rejected": -130.84024047851562, "loss": 0.5884, "rewards/accuracies": 0.0, "rewards/chosen": 0.6681190729141235, "rewards/margins": -0.24697798490524292, "rewards/rejected": 0.9150970578193665, "step": 1006 }, { "epoch": 0.16, "learning_rate": 9.93007659003414e-07, "logits/chosen": -0.0788596048951149, "logits/rejected": -0.091781385242939, "logps/chosen": -96.46002960205078, "logps/rejected": -57.07160949707031, "loss": 0.7841, "rewards/accuracies": 0.0, "rewards/chosen": 0.7460846304893494, "rewards/margins": -0.33576351404190063, "rewards/rejected": 1.08184814453125, "step": 1007 }, { "epoch": 0.16, "learning_rate": 9.92985739311299e-07, "logits/chosen": -0.4523719251155853, "logits/rejected": -0.41999614238739014, "logps/chosen": -60.63468551635742, "logps/rejected": -60.344539642333984, "loss": 0.7255, "rewards/accuracies": 1.0, "rewards/chosen": 1.0206745862960815, "rewards/margins": 0.38135915994644165, "rewards/rejected": 0.6393154263496399, "step": 1008 }, { "epoch": 0.16, "learning_rate": 9.929637855585336e-07, "logits/chosen": -0.16148264706134796, "logits/rejected": -0.036919593811035156, "logps/chosen": -43.858646392822266, "logps/rejected": -70.97254943847656, "loss": 1.0097, "rewards/accuracies": 0.0, "rewards/chosen": 0.5931072235107422, "rewards/margins": -0.08003884553909302, "rewards/rejected": 0.6731460690498352, "step": 1009 }, { "epoch": 0.16, "learning_rate": 9.929417977466354e-07, "logits/chosen": -0.38984256982803345, "logits/rejected": -0.403330534696579, "logps/chosen": -126.68142700195312, "logps/rejected": -112.24662780761719, "loss": 0.6113, "rewards/accuracies": 0.0, "rewards/chosen": 0.8318115472793579, "rewards/margins": -0.40757906436920166, "rewards/rejected": 1.2393906116485596, "step": 1010 }, { "epoch": 0.16, "learning_rate": 9.929197758771233e-07, "logits/chosen": -0.18776647746562958, "logits/rejected": -0.18776647746562958, "logps/chosen": -72.83100891113281, "logps/rejected": -72.83100891113281, "loss": 0.7291, "rewards/accuracies": 0.0, "rewards/chosen": 1.0191948413848877, "rewards/margins": 0.0, "rewards/rejected": 1.0191948413848877, "step": 1011 }, { "epoch": 0.16, "learning_rate": 9.928977199515184e-07, "logits/chosen": -0.583915650844574, "logits/rejected": -0.5390820503234863, "logps/chosen": -239.6055908203125, "logps/rejected": -43.07021713256836, "loss": 0.8909, "rewards/accuracies": 1.0, "rewards/chosen": 0.23450012505054474, "rewards/margins": 0.14353370666503906, "rewards/rejected": 0.09096641838550568, "step": 1012 }, { "epoch": 0.16, "learning_rate": 9.928756299713453e-07, "logits/chosen": -0.31415635347366333, "logits/rejected": -0.3102168142795563, "logps/chosen": -72.02223205566406, "logps/rejected": -66.1932601928711, "loss": 0.7406, "rewards/accuracies": 1.0, "rewards/chosen": 1.3009796142578125, "rewards/margins": 0.4719047546386719, "rewards/rejected": 0.8290748596191406, "step": 1013 }, { "epoch": 0.16, "learning_rate": 9.928535059381297e-07, "logits/chosen": -0.5675497651100159, "logits/rejected": -0.4943305253982544, "logps/chosen": -80.72492980957031, "logps/rejected": -88.78587341308594, "loss": 0.6351, "rewards/accuracies": 0.0, "rewards/chosen": 0.7909942865371704, "rewards/margins": -0.3647491931915283, "rewards/rejected": 1.1557434797286987, "step": 1014 }, { "epoch": 0.16, "learning_rate": 9.928313478534002e-07, "logits/chosen": -0.4950714707374573, "logits/rejected": -0.4950714707374573, "logps/chosen": -75.36478424072266, "logps/rejected": -75.36478424072266, "loss": 1.186, "rewards/accuracies": 0.0, "rewards/chosen": 0.7094032168388367, "rewards/margins": 0.0, "rewards/rejected": 0.7094032168388367, "step": 1015 }, { "epoch": 0.16, "learning_rate": 9.928091557186877e-07, "logits/chosen": -0.6152698993682861, "logits/rejected": -0.5949714183807373, "logps/chosen": -128.81771850585938, "logps/rejected": -82.22669982910156, "loss": 1.0406, "rewards/accuracies": 0.0, "rewards/chosen": 0.4434265196323395, "rewards/margins": -0.12753751873970032, "rewards/rejected": 0.5709640383720398, "step": 1016 }, { "epoch": 0.17, "learning_rate": 9.927869295355257e-07, "logits/chosen": -0.6302343010902405, "logits/rejected": -0.6328955888748169, "logps/chosen": -108.23223114013672, "logps/rejected": -25.65082359313965, "loss": 0.5442, "rewards/accuracies": 1.0, "rewards/chosen": 0.7207992672920227, "rewards/margins": 0.49836617708206177, "rewards/rejected": 0.22243309020996094, "step": 1017 }, { "epoch": 0.17, "learning_rate": 9.927646693054495e-07, "logits/chosen": -0.33533620834350586, "logits/rejected": -0.33920806646347046, "logps/chosen": -118.9428939819336, "logps/rejected": -75.58522033691406, "loss": 1.2089, "rewards/accuracies": 0.0, "rewards/chosen": 0.10716324299573898, "rewards/margins": -0.4777717590332031, "rewards/rejected": 0.5849350094795227, "step": 1018 }, { "epoch": 0.17, "learning_rate": 9.927423750299974e-07, "logits/chosen": -0.29527613520622253, "logits/rejected": -0.27842476963996887, "logps/chosen": -77.75000762939453, "logps/rejected": -110.47866821289062, "loss": 1.0942, "rewards/accuracies": 1.0, "rewards/chosen": 1.0439437627792358, "rewards/margins": 0.04334104061126709, "rewards/rejected": 1.0006027221679688, "step": 1019 }, { "epoch": 0.17, "learning_rate": 9.927200467107095e-07, "logits/chosen": -0.3606013059616089, "logits/rejected": -0.3731048107147217, "logps/chosen": -196.7132568359375, "logps/rejected": -163.82525634765625, "loss": 0.9723, "rewards/accuracies": 0.0, "rewards/chosen": 1.4687988758087158, "rewards/margins": -1.2345428466796875, "rewards/rejected": 2.7033417224884033, "step": 1020 }, { "epoch": 0.17, "learning_rate": 9.926976843491285e-07, "logits/chosen": -0.22577032446861267, "logits/rejected": -0.2267681360244751, "logps/chosen": -30.555757522583008, "logps/rejected": -44.793212890625, "loss": 0.583, "rewards/accuracies": 1.0, "rewards/chosen": 0.2996233105659485, "rewards/margins": 0.004963487386703491, "rewards/rejected": 0.294659823179245, "step": 1021 }, { "epoch": 0.17, "learning_rate": 9.926752879467995e-07, "logits/chosen": -0.4087907373905182, "logits/rejected": -0.36463603377342224, "logps/chosen": -135.22796630859375, "logps/rejected": -116.44715881347656, "loss": 0.7534, "rewards/accuracies": 0.0, "rewards/chosen": 1.1947494745254517, "rewards/margins": -0.6950225830078125, "rewards/rejected": 1.8897720575332642, "step": 1022 }, { "epoch": 0.17, "learning_rate": 9.926528575052698e-07, "logits/chosen": -0.4020395874977112, "logits/rejected": -0.4102640450000763, "logps/chosen": -152.29183959960938, "logps/rejected": -106.45391845703125, "loss": 0.9401, "rewards/accuracies": 1.0, "rewards/chosen": 2.1653687953948975, "rewards/margins": 0.4547775983810425, "rewards/rejected": 1.710591197013855, "step": 1023 }, { "epoch": 0.17, "learning_rate": 9.92630393026089e-07, "logits/chosen": -0.21747955679893494, "logits/rejected": -0.21747955679893494, "logps/chosen": -56.59613037109375, "logps/rejected": -56.59613037109375, "loss": 0.4467, "rewards/accuracies": 0.0, "rewards/chosen": -0.0026210786309093237, "rewards/margins": 0.0, "rewards/rejected": -0.0026210786309093237, "step": 1024 }, { "epoch": 0.17, "learning_rate": 9.926078945108097e-07, "logits/chosen": -0.3183557093143463, "logits/rejected": -0.3001355528831482, "logps/chosen": -29.59823226928711, "logps/rejected": -26.095882415771484, "loss": 1.2389, "rewards/accuracies": 1.0, "rewards/chosen": 0.26988181471824646, "rewards/margins": 0.049017712473869324, "rewards/rejected": 0.22086410224437714, "step": 1025 }, { "epoch": 0.17, "learning_rate": 9.925853619609857e-07, "logits/chosen": -0.19865445792675018, "logits/rejected": -0.19865445792675018, "logps/chosen": -17.766237258911133, "logps/rejected": -17.766237258911133, "loss": 0.6182, "rewards/accuracies": 0.0, "rewards/chosen": 0.238536074757576, "rewards/margins": 0.0, "rewards/rejected": 0.238536074757576, "step": 1026 }, { "epoch": 0.17, "learning_rate": 9.925627953781742e-07, "logits/chosen": -0.13722367584705353, "logits/rejected": -0.13722367584705353, "logps/chosen": -109.56100463867188, "logps/rejected": -109.56100463867188, "loss": 0.606, "rewards/accuracies": 0.0, "rewards/chosen": 0.7928726077079773, "rewards/margins": 0.0, "rewards/rejected": 0.7928726077079773, "step": 1027 }, { "epoch": 0.17, "learning_rate": 9.925401947639343e-07, "logits/chosen": -0.3230677545070648, "logits/rejected": -0.33056017756462097, "logps/chosen": -50.45677185058594, "logps/rejected": -87.78773498535156, "loss": 0.866, "rewards/accuracies": 1.0, "rewards/chosen": 0.15296363830566406, "rewards/margins": 0.010626599192619324, "rewards/rejected": 0.14233703911304474, "step": 1028 }, { "epoch": 0.17, "learning_rate": 9.925175601198272e-07, "logits/chosen": -0.02813798189163208, "logits/rejected": 0.0076142181642353535, "logps/chosen": -26.203357696533203, "logps/rejected": -18.023311614990234, "loss": 0.6754, "rewards/accuracies": 0.0, "rewards/chosen": 0.20364724099636078, "rewards/margins": -0.007316768169403076, "rewards/rejected": 0.21096400916576385, "step": 1029 }, { "epoch": 0.17, "learning_rate": 9.924948914474172e-07, "logits/chosen": -0.07462120801210403, "logits/rejected": -0.08792400360107422, "logps/chosen": -96.32737731933594, "logps/rejected": -149.26950073242188, "loss": 0.5611, "rewards/accuracies": 1.0, "rewards/chosen": 0.4316246211528778, "rewards/margins": 0.33620530366897583, "rewards/rejected": 0.09541931003332138, "step": 1030 }, { "epoch": 0.17, "learning_rate": 9.9247218874827e-07, "logits/chosen": -0.07954147458076477, "logits/rejected": -0.11709862947463989, "logps/chosen": -85.30599975585938, "logps/rejected": -182.08474731445312, "loss": 0.9647, "rewards/accuracies": 0.0, "rewards/chosen": 0.4644317626953125, "rewards/margins": -1.5392043590545654, "rewards/rejected": 2.003636121749878, "step": 1031 }, { "epoch": 0.17, "learning_rate": 9.924494520239546e-07, "logits/chosen": -0.6282697916030884, "logits/rejected": -0.6418468952178955, "logps/chosen": -275.35467529296875, "logps/rejected": -146.9017333984375, "loss": 0.5169, "rewards/accuracies": 0.0, "rewards/chosen": 1.7225220203399658, "rewards/margins": -0.24613642692565918, "rewards/rejected": 1.968658447265625, "step": 1032 }, { "epoch": 0.17, "learning_rate": 9.924266812760414e-07, "logits/chosen": -1.0860233306884766, "logits/rejected": -1.056396484375, "logps/chosen": -165.00205993652344, "logps/rejected": -166.30877685546875, "loss": 0.6159, "rewards/accuracies": 1.0, "rewards/chosen": 2.012922763824463, "rewards/margins": 0.16916513442993164, "rewards/rejected": 1.8437576293945312, "step": 1033 }, { "epoch": 0.17, "learning_rate": 9.92403876506104e-07, "logits/chosen": -0.7171021103858948, "logits/rejected": -0.7108219265937805, "logps/chosen": -160.2376251220703, "logps/rejected": -29.120288848876953, "loss": 0.4295, "rewards/accuracies": 1.0, "rewards/chosen": 0.33477020263671875, "rewards/margins": 0.28831425309181213, "rewards/rejected": 0.04645595699548721, "step": 1034 }, { "epoch": 0.17, "learning_rate": 9.92381037715718e-07, "logits/chosen": -0.14708052575588226, "logits/rejected": -0.02751627191901207, "logps/chosen": -69.61448669433594, "logps/rejected": -44.067134857177734, "loss": 0.4781, "rewards/accuracies": 1.0, "rewards/chosen": 1.0176368951797485, "rewards/margins": 0.7563751339912415, "rewards/rejected": 0.2612617611885071, "step": 1035 }, { "epoch": 0.17, "learning_rate": 9.92358164906461e-07, "logits/chosen": -0.6074308753013611, "logits/rejected": -0.5826221108436584, "logps/chosen": -343.801513671875, "logps/rejected": -187.26614379882812, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": -0.05085449293255806, "rewards/margins": -0.6857773065567017, "rewards/rejected": 0.6349228024482727, "step": 1036 }, { "epoch": 0.17, "learning_rate": 9.923352580799138e-07, "logits/chosen": -0.44094234704971313, "logits/rejected": -0.36622360348701477, "logps/chosen": -100.77439880371094, "logps/rejected": -17.912654876708984, "loss": 0.8587, "rewards/accuracies": 0.0, "rewards/chosen": 0.2608848512172699, "rewards/margins": -0.23786908388137817, "rewards/rejected": 0.49875393509864807, "step": 1037 }, { "epoch": 0.17, "learning_rate": 9.923123172376587e-07, "logits/chosen": -0.405450701713562, "logits/rejected": -0.4046860933303833, "logps/chosen": -61.938743591308594, "logps/rejected": -26.461278915405273, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0478118658065796, "rewards/margins": 0.5690346956253052, "rewards/rejected": 0.478777140378952, "step": 1038 }, { "epoch": 0.17, "learning_rate": 9.922893423812808e-07, "logits/chosen": -0.4157545864582062, "logits/rejected": -0.5009247660636902, "logps/chosen": -142.75265502929688, "logps/rejected": -81.280029296875, "loss": 0.2252, "rewards/accuracies": 1.0, "rewards/chosen": 2.349679708480835, "rewards/margins": 1.0627480745315552, "rewards/rejected": 1.2869316339492798, "step": 1039 }, { "epoch": 0.17, "learning_rate": 9.922663335123672e-07, "logits/chosen": -0.3921615779399872, "logits/rejected": -0.3921615779399872, "logps/chosen": -62.96034240722656, "logps/rejected": -62.96034240722656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.000158667564392, "rewards/margins": 0.0, "rewards/rejected": 1.000158667564392, "step": 1040 }, { "epoch": 0.17, "learning_rate": 9.922432906325082e-07, "logits/chosen": -0.6973286271095276, "logits/rejected": -0.40386900305747986, "logps/chosen": -84.9886703491211, "logps/rejected": -41.351051330566406, "loss": 0.2078, "rewards/accuracies": 1.0, "rewards/chosen": 1.6637924909591675, "rewards/margins": 1.2010014057159424, "rewards/rejected": 0.4627910554409027, "step": 1041 }, { "epoch": 0.17, "learning_rate": 9.922202137432953e-07, "logits/chosen": -0.409273236989975, "logits/rejected": -0.3342900574207306, "logps/chosen": -127.36961364746094, "logps/rejected": -91.213623046875, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.8080551624298096, "rewards/margins": 1.3178268671035767, "rewards/rejected": 0.4902282655239105, "step": 1042 }, { "epoch": 0.17, "learning_rate": 9.92197102846323e-07, "logits/chosen": -0.8661837577819824, "logits/rejected": -0.8808964490890503, "logps/chosen": -83.35015869140625, "logps/rejected": -70.60086059570312, "loss": 0.4728, "rewards/accuracies": 0.0, "rewards/chosen": 0.5400993227958679, "rewards/margins": -0.28887104988098145, "rewards/rejected": 0.8289703726768494, "step": 1043 }, { "epoch": 0.17, "learning_rate": 9.921739579431882e-07, "logits/chosen": -0.6395056247711182, "logits/rejected": -0.6289028525352478, "logps/chosen": -122.50667572021484, "logps/rejected": -144.28822326660156, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 1.1968437433242798, "rewards/margins": -0.8929740190505981, "rewards/rejected": 2.089817762374878, "step": 1044 }, { "epoch": 0.17, "learning_rate": 9.9215077903549e-07, "logits/chosen": -0.31018686294555664, "logits/rejected": -0.227552130818367, "logps/chosen": -127.95526123046875, "logps/rejected": -99.58397674560547, "loss": 0.5673, "rewards/accuracies": 1.0, "rewards/chosen": 1.863215684890747, "rewards/margins": 0.18480455875396729, "rewards/rejected": 1.6784111261367798, "step": 1045 }, { "epoch": 0.17, "learning_rate": 9.921275661248294e-07, "logits/chosen": -0.4148564040660858, "logits/rejected": -0.41592690348625183, "logps/chosen": -127.1934585571289, "logps/rejected": -152.3587646484375, "loss": 0.8641, "rewards/accuracies": 0.0, "rewards/chosen": 0.8397758603096008, "rewards/margins": -1.004361629486084, "rewards/rejected": 1.8441375494003296, "step": 1046 }, { "epoch": 0.17, "learning_rate": 9.92104319212811e-07, "logits/chosen": -0.2996574938297272, "logits/rejected": -0.2996574938297272, "logps/chosen": -58.31909942626953, "logps/rejected": -58.31909942626953, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.5932213068008423, "rewards/margins": 0.0, "rewards/rejected": 0.5932213068008423, "step": 1047 }, { "epoch": 0.17, "learning_rate": 9.9208103830104e-07, "logits/chosen": -0.246074378490448, "logits/rejected": -0.2846619486808777, "logps/chosen": -91.54393005371094, "logps/rejected": -83.8204116821289, "loss": 0.796, "rewards/accuracies": 0.0, "rewards/chosen": 0.5894684195518494, "rewards/margins": -0.5574050545692444, "rewards/rejected": 1.1468734741210938, "step": 1048 }, { "epoch": 0.17, "learning_rate": 9.920577233911256e-07, "logits/chosen": -0.38024672865867615, "logits/rejected": -0.3319031596183777, "logps/chosen": -152.57769775390625, "logps/rejected": -138.62110900878906, "loss": 1.26, "rewards/accuracies": 0.0, "rewards/chosen": 1.1436340808868408, "rewards/margins": -1.1300079822540283, "rewards/rejected": 2.273642063140869, "step": 1049 }, { "epoch": 0.17, "learning_rate": 9.920343744846783e-07, "logits/chosen": 0.1731480062007904, "logits/rejected": 0.1731480062007904, "logps/chosen": -20.581581115722656, "logps/rejected": -20.581581115722656, "loss": 1.334, "rewards/accuracies": 0.0, "rewards/chosen": 0.021617699414491653, "rewards/margins": 0.0, "rewards/rejected": 0.021617699414491653, "step": 1050 }, { "epoch": 0.17, "learning_rate": 9.920109915833117e-07, "logits/chosen": -0.16657096147537231, "logits/rejected": -0.17784558236598969, "logps/chosen": -112.76589965820312, "logps/rejected": -74.92650604248047, "loss": 0.2832, "rewards/accuracies": 1.0, "rewards/chosen": 1.7432113885879517, "rewards/margins": 1.1309089660644531, "rewards/rejected": 0.6123024225234985, "step": 1051 }, { "epoch": 0.17, "learning_rate": 9.919875746886408e-07, "logits/chosen": -0.9188907742500305, "logits/rejected": -0.8029143810272217, "logps/chosen": -84.72319030761719, "logps/rejected": -294.2152099609375, "loss": 0.9703, "rewards/accuracies": 0.0, "rewards/chosen": 0.13088226318359375, "rewards/margins": -0.5444076657295227, "rewards/rejected": 0.6752899289131165, "step": 1052 }, { "epoch": 0.17, "learning_rate": 9.919641238022837e-07, "logits/chosen": -0.562254011631012, "logits/rejected": -0.11133623123168945, "logps/chosen": -92.84465026855469, "logps/rejected": -111.85208129882812, "loss": 0.6009, "rewards/accuracies": 1.0, "rewards/chosen": 0.6809318661689758, "rewards/margins": 0.1831199824810028, "rewards/rejected": 0.497811883687973, "step": 1053 }, { "epoch": 0.17, "learning_rate": 9.919406389258606e-07, "logits/chosen": -0.28027471899986267, "logits/rejected": -0.2565336525440216, "logps/chosen": -55.52204513549805, "logps/rejected": -4.4037065505981445, "loss": 0.6325, "rewards/accuracies": 1.0, "rewards/chosen": 0.3523281216621399, "rewards/margins": 0.021842479705810547, "rewards/rejected": 0.33048564195632935, "step": 1054 }, { "epoch": 0.17, "learning_rate": 9.919171200609945e-07, "logits/chosen": -0.904606282711029, "logits/rejected": -0.8968008756637573, "logps/chosen": -94.19231414794922, "logps/rejected": -91.69637298583984, "loss": 0.528, "rewards/accuracies": 0.0, "rewards/chosen": 0.8575004935264587, "rewards/margins": -0.3643936514854431, "rewards/rejected": 1.2218941450119019, "step": 1055 }, { "epoch": 0.17, "learning_rate": 9.918935672093095e-07, "logits/chosen": -0.28322795033454895, "logits/rejected": -0.19235378503799438, "logps/chosen": -121.42769622802734, "logps/rejected": -60.17466735839844, "loss": 0.278, "rewards/accuracies": 1.0, "rewards/chosen": 1.8843849897384644, "rewards/margins": 0.9953308701515198, "rewards/rejected": 0.8890541195869446, "step": 1056 }, { "epoch": 0.17, "learning_rate": 9.918699803724335e-07, "logits/chosen": -0.2826189398765564, "logits/rejected": -0.29352596402168274, "logps/chosen": -85.60387420654297, "logps/rejected": -141.376708984375, "loss": 0.7643, "rewards/accuracies": 0.0, "rewards/chosen": 1.1772987842559814, "rewards/margins": -0.8383095264434814, "rewards/rejected": 2.015608310699463, "step": 1057 }, { "epoch": 0.17, "learning_rate": 9.918463595519962e-07, "logits/chosen": -0.5772196054458618, "logits/rejected": -0.629236102104187, "logps/chosen": -111.77813720703125, "logps/rejected": -72.05854034423828, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 0.778730034828186, "rewards/margins": 0.2376205325126648, "rewards/rejected": 0.5411095023155212, "step": 1058 }, { "epoch": 0.17, "learning_rate": 9.91822704749629e-07, "logits/chosen": -0.19430404901504517, "logits/rejected": -0.21688266098499298, "logps/chosen": -85.91252899169922, "logps/rejected": -114.08782958984375, "loss": 0.8522, "rewards/accuracies": 1.0, "rewards/chosen": 0.7225181460380554, "rewards/margins": 0.3090988099575043, "rewards/rejected": 0.41341933608055115, "step": 1059 }, { "epoch": 0.17, "learning_rate": 9.917990159669668e-07, "logits/chosen": -0.19304104149341583, "logits/rejected": -0.1652185022830963, "logps/chosen": -158.66390991210938, "logps/rejected": -90.35047912597656, "loss": 0.2957, "rewards/accuracies": 1.0, "rewards/chosen": 1.7576751708984375, "rewards/margins": 1.1961784362792969, "rewards/rejected": 0.5614967346191406, "step": 1060 }, { "epoch": 0.17, "learning_rate": 9.91775293205646e-07, "logits/chosen": -0.48693734407424927, "logits/rejected": -0.5264886617660522, "logps/chosen": -107.85990905761719, "logps/rejected": -69.54674530029297, "loss": 1.0006, "rewards/accuracies": 0.0, "rewards/chosen": 0.41315004229545593, "rewards/margins": -0.40333327651023865, "rewards/rejected": 0.8164833188056946, "step": 1061 }, { "epoch": 0.17, "learning_rate": 9.917515364673057e-07, "logits/chosen": -0.4664866626262665, "logits/rejected": -0.4879758059978485, "logps/chosen": -105.9609603881836, "logps/rejected": -88.76824188232422, "loss": 0.7308, "rewards/accuracies": 0.0, "rewards/chosen": 0.5326545834541321, "rewards/margins": -0.7149223685264587, "rewards/rejected": 1.2475769519805908, "step": 1062 }, { "epoch": 0.17, "learning_rate": 9.917277457535871e-07, "logits/chosen": -0.7362555265426636, "logits/rejected": -0.7354215383529663, "logps/chosen": -248.83660888671875, "logps/rejected": -183.1244659423828, "loss": 0.9125, "rewards/accuracies": 0.0, "rewards/chosen": 1.3065093755722046, "rewards/margins": -1.2015184164047241, "rewards/rejected": 2.5080277919769287, "step": 1063 }, { "epoch": 0.17, "learning_rate": 9.91703921066134e-07, "logits/chosen": -0.22481150925159454, "logits/rejected": -0.27481067180633545, "logps/chosen": -112.39921569824219, "logps/rejected": -68.90438842773438, "loss": 1.5067, "rewards/accuracies": 0.0, "rewards/chosen": 0.407156378030777, "rewards/margins": -0.9979088306427002, "rewards/rejected": 1.4050651788711548, "step": 1064 }, { "epoch": 0.17, "learning_rate": 9.916800624065926e-07, "logits/chosen": -0.35968688130378723, "logits/rejected": -0.3680065870285034, "logps/chosen": -151.7313995361328, "logps/rejected": -66.28826141357422, "loss": 0.5962, "rewards/accuracies": 1.0, "rewards/chosen": 1.558160424232483, "rewards/margins": 0.6320198178291321, "rewards/rejected": 0.9261406064033508, "step": 1065 }, { "epoch": 0.17, "learning_rate": 9.916561697766112e-07, "logits/chosen": -0.27163541316986084, "logits/rejected": -0.27163541316986084, "logps/chosen": -127.73519134521484, "logps/rejected": -127.73519134521484, "loss": 0.6768, "rewards/accuracies": 0.0, "rewards/chosen": 0.24316178262233734, "rewards/margins": 0.0, "rewards/rejected": 0.24316178262233734, "step": 1066 }, { "epoch": 0.17, "learning_rate": 9.916322431778406e-07, "logits/chosen": -0.09308800846338272, "logits/rejected": -0.09308800846338272, "logps/chosen": -84.638916015625, "logps/rejected": -84.638916015625, "loss": 0.5318, "rewards/accuracies": 0.0, "rewards/chosen": 0.9943863153457642, "rewards/margins": 0.0, "rewards/rejected": 0.9943863153457642, "step": 1067 }, { "epoch": 0.17, "learning_rate": 9.916082826119338e-07, "logits/chosen": -0.24227428436279297, "logits/rejected": -0.22529536485671997, "logps/chosen": -6.624048233032227, "logps/rejected": -35.2451171875, "loss": 0.9553, "rewards/accuracies": 0.0, "rewards/chosen": 0.2542814314365387, "rewards/margins": -0.15361005067825317, "rewards/rejected": 0.40789148211479187, "step": 1068 }, { "epoch": 0.17, "learning_rate": 9.915842880805464e-07, "logits/chosen": -0.3944930136203766, "logits/rejected": -0.33782443404197693, "logps/chosen": -124.9239501953125, "logps/rejected": -47.917781829833984, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.3093003034591675, "rewards/margins": 0.5915939807891846, "rewards/rejected": 0.7177063226699829, "step": 1069 }, { "epoch": 0.17, "learning_rate": 9.91560259585336e-07, "logits/chosen": -0.3142920732498169, "logits/rejected": -0.2987678349018097, "logps/chosen": -120.44816589355469, "logps/rejected": -68.74559783935547, "loss": 0.6261, "rewards/accuracies": 0.0, "rewards/chosen": 0.768261730670929, "rewards/margins": -0.08919143676757812, "rewards/rejected": 0.8574531674385071, "step": 1070 }, { "epoch": 0.17, "learning_rate": 9.91536197127963e-07, "logits/chosen": -0.5708458423614502, "logits/rejected": -0.5644647479057312, "logps/chosen": -91.08546447753906, "logps/rejected": -67.53773498535156, "loss": 0.4884, "rewards/accuracies": 1.0, "rewards/chosen": 1.7946137189865112, "rewards/margins": 0.7218246459960938, "rewards/rejected": 1.0727890729904175, "step": 1071 }, { "epoch": 0.17, "learning_rate": 9.915121007100898e-07, "logits/chosen": -0.2961437702178955, "logits/rejected": -0.2961437702178955, "logps/chosen": -56.651588439941406, "logps/rejected": -56.651588439941406, "loss": 0.6598, "rewards/accuracies": 0.0, "rewards/chosen": 0.21083946526050568, "rewards/margins": 0.0, "rewards/rejected": 0.21083946526050568, "step": 1072 }, { "epoch": 0.17, "learning_rate": 9.914879703333809e-07, "logits/chosen": -0.7570253014564514, "logits/rejected": -0.643328845500946, "logps/chosen": -84.39578247070312, "logps/rejected": -203.39251708984375, "loss": 1.4599, "rewards/accuracies": 0.0, "rewards/chosen": 0.24763183295726776, "rewards/margins": -2.480038642883301, "rewards/rejected": 2.727670431137085, "step": 1073 }, { "epoch": 0.17, "learning_rate": 9.914638059995038e-07, "logits/chosen": -0.5214598178863525, "logits/rejected": -0.5001420974731445, "logps/chosen": -101.21138763427734, "logps/rejected": -85.35009765625, "loss": 0.6602, "rewards/accuracies": 1.0, "rewards/chosen": 1.7582359313964844, "rewards/margins": 0.4713134765625, "rewards/rejected": 1.2869224548339844, "step": 1074 }, { "epoch": 0.17, "learning_rate": 9.914396077101282e-07, "logits/chosen": -0.6740940809249878, "logits/rejected": -0.6598462462425232, "logps/chosen": -90.60212707519531, "logps/rejected": -157.45741271972656, "loss": 1.0278, "rewards/accuracies": 0.0, "rewards/chosen": 0.2849128842353821, "rewards/margins": -0.9901695847511292, "rewards/rejected": 1.2750824689865112, "step": 1075 }, { "epoch": 0.17, "learning_rate": 9.914153754669253e-07, "logits/chosen": -0.5827136635780334, "logits/rejected": -0.5389756560325623, "logps/chosen": -197.93905639648438, "logps/rejected": -73.91951751708984, "loss": 0.7752, "rewards/accuracies": 1.0, "rewards/chosen": 2.8528122901916504, "rewards/margins": 1.5723930597305298, "rewards/rejected": 1.2804192304611206, "step": 1076 }, { "epoch": 0.17, "learning_rate": 9.913911092715702e-07, "logits/chosen": -0.3608291745185852, "logits/rejected": -0.3418923020362854, "logps/chosen": -173.2765350341797, "logps/rejected": -66.88250732421875, "loss": 0.3749, "rewards/accuracies": 1.0, "rewards/chosen": 1.8875137567520142, "rewards/margins": 1.0241234302520752, "rewards/rejected": 0.8633903861045837, "step": 1077 }, { "epoch": 0.17, "learning_rate": 9.91366809125739e-07, "logits/chosen": -0.29192379117012024, "logits/rejected": -0.2698904871940613, "logps/chosen": -71.0572280883789, "logps/rejected": -135.02569580078125, "loss": 0.982, "rewards/accuracies": 0.0, "rewards/chosen": 0.6270614862442017, "rewards/margins": -1.1028366088867188, "rewards/rejected": 1.7298980951309204, "step": 1078 }, { "epoch": 0.18, "learning_rate": 9.913424750311106e-07, "logits/chosen": -0.3944908380508423, "logits/rejected": -0.3638738691806793, "logps/chosen": -24.043418884277344, "logps/rejected": -34.255252838134766, "loss": 0.6694, "rewards/accuracies": 0.0, "rewards/chosen": 0.8140282034873962, "rewards/margins": -0.024000167846679688, "rewards/rejected": 0.8380283713340759, "step": 1079 }, { "epoch": 0.18, "learning_rate": 9.913181069893662e-07, "logits/chosen": -0.5997815728187561, "logits/rejected": -0.545339047908783, "logps/chosen": -94.2874984741211, "logps/rejected": -64.69173431396484, "loss": 0.97, "rewards/accuracies": 0.0, "rewards/chosen": 0.24972915649414062, "rewards/margins": -0.2455742061138153, "rewards/rejected": 0.49530336260795593, "step": 1080 }, { "epoch": 0.18, "learning_rate": 9.912937050021894e-07, "logits/chosen": -0.17743448913097382, "logits/rejected": -0.17743448913097382, "logps/chosen": -50.58230972290039, "logps/rejected": -50.58230972290039, "loss": 1.1063, "rewards/accuracies": 0.0, "rewards/chosen": 0.5113574862480164, "rewards/margins": 0.0, "rewards/rejected": 0.5113574862480164, "step": 1081 }, { "epoch": 0.18, "learning_rate": 9.912692690712665e-07, "logits/chosen": -0.5585412979125977, "logits/rejected": -0.5202597379684448, "logps/chosen": -84.85609436035156, "logps/rejected": -34.693756103515625, "loss": 0.8442, "rewards/accuracies": 1.0, "rewards/chosen": 0.7459152340888977, "rewards/margins": 0.33181533217430115, "rewards/rejected": 0.41409990191459656, "step": 1082 }, { "epoch": 0.18, "learning_rate": 9.912447991982855e-07, "logits/chosen": -0.32755762338638306, "logits/rejected": -0.28905194997787476, "logps/chosen": -227.8641357421875, "logps/rejected": -110.88328552246094, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 1.8287994861602783, "rewards/margins": 1.034753441810608, "rewards/rejected": 0.7940460443496704, "step": 1083 }, { "epoch": 0.18, "learning_rate": 9.912202953849368e-07, "logits/chosen": -0.5795559287071228, "logits/rejected": -0.5960627794265747, "logps/chosen": -67.6467056274414, "logps/rejected": -145.93258666992188, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": 0.7924186587333679, "rewards/margins": 1.0132744312286377, "rewards/rejected": -0.220855712890625, "step": 1084 }, { "epoch": 0.18, "learning_rate": 9.91195757632914e-07, "logits/chosen": -0.2805430293083191, "logits/rejected": -0.25361254811286926, "logps/chosen": -94.81950378417969, "logps/rejected": -70.2294921875, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 1.766944169998169, "rewards/margins": 1.3242378234863281, "rewards/rejected": 0.44270631670951843, "step": 1085 }, { "epoch": 0.18, "learning_rate": 9.91171185943912e-07, "logits/chosen": -0.15456125140190125, "logits/rejected": -0.19573107361793518, "logps/chosen": -55.616172790527344, "logps/rejected": -80.23635864257812, "loss": 0.8716, "rewards/accuracies": 1.0, "rewards/chosen": 0.197886660695076, "rewards/margins": 0.13879089057445526, "rewards/rejected": 0.05909576639533043, "step": 1086 }, { "epoch": 0.18, "learning_rate": 9.911465803196284e-07, "logits/chosen": -0.29377439618110657, "logits/rejected": -0.29395556449890137, "logps/chosen": -105.49588775634766, "logps/rejected": -116.01126861572266, "loss": 0.5988, "rewards/accuracies": 1.0, "rewards/chosen": 0.40981826186180115, "rewards/margins": 0.12123870849609375, "rewards/rejected": 0.2885795533657074, "step": 1087 }, { "epoch": 0.18, "learning_rate": 9.911219407617635e-07, "logits/chosen": -0.525534987449646, "logits/rejected": -0.5333901643753052, "logps/chosen": -79.97029876708984, "logps/rejected": -29.726823806762695, "loss": 0.7246, "rewards/accuracies": 0.0, "rewards/chosen": 0.9438576102256775, "rewards/margins": -0.21111053228378296, "rewards/rejected": 1.1549681425094604, "step": 1088 }, { "epoch": 0.18, "learning_rate": 9.910972672720196e-07, "logits/chosen": -0.278123676776886, "logits/rejected": -0.2827489376068115, "logps/chosen": -4.246705055236816, "logps/rejected": -3.110701322555542, "loss": 0.6681, "rewards/accuracies": 0.0, "rewards/chosen": 0.1427055448293686, "rewards/margins": -0.1075231283903122, "rewards/rejected": 0.2502286732196808, "step": 1089 }, { "epoch": 0.18, "learning_rate": 9.910725598521012e-07, "logits/chosen": -0.26564088463783264, "logits/rejected": -0.21248063445091248, "logps/chosen": -64.83082580566406, "logps/rejected": -52.632232666015625, "loss": 0.5723, "rewards/accuracies": 1.0, "rewards/chosen": 1.1262520551681519, "rewards/margins": 0.2752155661582947, "rewards/rejected": 0.8510364890098572, "step": 1090 }, { "epoch": 0.18, "learning_rate": 9.910478185037156e-07, "logits/chosen": -0.5186875462532043, "logits/rejected": -0.5064365267753601, "logps/chosen": -100.34199523925781, "logps/rejected": -103.45226287841797, "loss": 1.8263, "rewards/accuracies": 0.0, "rewards/chosen": 1.0706466436386108, "rewards/margins": -0.49556124210357666, "rewards/rejected": 1.5662078857421875, "step": 1091 }, { "epoch": 0.18, "learning_rate": 9.91023043228572e-07, "logits/chosen": -0.6123014092445374, "logits/rejected": -0.5926774740219116, "logps/chosen": -86.24017333984375, "logps/rejected": -75.14218139648438, "loss": 0.7537, "rewards/accuracies": 0.0, "rewards/chosen": 0.5666366815567017, "rewards/margins": -0.22914350032806396, "rewards/rejected": 0.7957801818847656, "step": 1092 }, { "epoch": 0.18, "learning_rate": 9.909982340283822e-07, "logits/chosen": -0.7121622562408447, "logits/rejected": -0.7321634292602539, "logps/chosen": -111.97782135009766, "logps/rejected": -80.87966918945312, "loss": 0.9682, "rewards/accuracies": 0.0, "rewards/chosen": 0.5718017816543579, "rewards/margins": -0.6657737493515015, "rewards/rejected": 1.2375755310058594, "step": 1093 }, { "epoch": 0.18, "learning_rate": 9.909733909048605e-07, "logits/chosen": -0.32038551568984985, "logits/rejected": -0.32098624110221863, "logps/chosen": -19.85762596130371, "logps/rejected": -3.21468448638916, "loss": 1.1917, "rewards/accuracies": 0.0, "rewards/chosen": 0.2264341413974762, "rewards/margins": -0.004045858979225159, "rewards/rejected": 0.23048000037670135, "step": 1094 }, { "epoch": 0.18, "learning_rate": 9.909485138597228e-07, "logits/chosen": -0.04872681200504303, "logits/rejected": -0.04043776914477348, "logps/chosen": -99.9395980834961, "logps/rejected": -83.21890258789062, "loss": 0.5207, "rewards/accuracies": 1.0, "rewards/chosen": 1.123085856437683, "rewards/margins": 0.09442520141601562, "rewards/rejected": 1.0286606550216675, "step": 1095 }, { "epoch": 0.18, "learning_rate": 9.909236028946885e-07, "logits/chosen": -0.19589553773403168, "logits/rejected": -0.19192133843898773, "logps/chosen": -56.96478271484375, "logps/rejected": -88.56141662597656, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 0.4134574830532074, "rewards/margins": 0.4234474003314972, "rewards/rejected": -0.009989929385483265, "step": 1096 }, { "epoch": 0.18, "learning_rate": 9.908986580114783e-07, "logits/chosen": -0.47958171367645264, "logits/rejected": -0.47958171367645264, "logps/chosen": -49.0648193359375, "logps/rejected": -49.0648193359375, "loss": 0.7151, "rewards/accuracies": 0.0, "rewards/chosen": 0.6972442865371704, "rewards/margins": 0.0, "rewards/rejected": 0.6972442865371704, "step": 1097 }, { "epoch": 0.18, "learning_rate": 9.908736792118157e-07, "logits/chosen": -0.8389751315116882, "logits/rejected": -0.8178064227104187, "logps/chosen": -104.4449462890625, "logps/rejected": -125.77339172363281, "loss": 0.6737, "rewards/accuracies": 0.0, "rewards/chosen": 1.2473068237304688, "rewards/margins": -0.5899628400802612, "rewards/rejected": 1.83726966381073, "step": 1098 }, { "epoch": 0.18, "learning_rate": 9.908486664974265e-07, "logits/chosen": -0.36566194891929626, "logits/rejected": -0.4104372262954712, "logps/chosen": -135.22073364257812, "logps/rejected": -82.80844116210938, "loss": 1.5129, "rewards/accuracies": 0.0, "rewards/chosen": -0.1381378173828125, "rewards/margins": -1.3492599725723267, "rewards/rejected": 1.2111221551895142, "step": 1099 }, { "epoch": 0.18, "learning_rate": 9.90823619870039e-07, "logits/chosen": -0.5623743534088135, "logits/rejected": -0.5274909138679504, "logps/chosen": -93.12384033203125, "logps/rejected": -159.01065063476562, "loss": 0.9881, "rewards/accuracies": 1.0, "rewards/chosen": 0.4364151060581207, "rewards/margins": 0.19290924072265625, "rewards/rejected": 0.24350586533546448, "step": 1100 }, { "epoch": 0.18, "learning_rate": 9.907985393313836e-07, "logits/chosen": -0.2355438470840454, "logits/rejected": -0.24857237935066223, "logps/chosen": -80.01348114013672, "logps/rejected": -45.113197326660156, "loss": 0.4778, "rewards/accuracies": 1.0, "rewards/chosen": 0.29090043902397156, "rewards/margins": 0.2556503415107727, "rewards/rejected": 0.035250093787908554, "step": 1101 }, { "epoch": 0.18, "learning_rate": 9.907734248831928e-07, "logits/chosen": -0.5714982748031616, "logits/rejected": -0.3659721910953522, "logps/chosen": -93.89228820800781, "logps/rejected": -107.24057006835938, "loss": 0.59, "rewards/accuracies": 0.0, "rewards/chosen": 0.5031768679618835, "rewards/margins": -0.6794052720069885, "rewards/rejected": 1.182582139968872, "step": 1102 }, { "epoch": 0.18, "learning_rate": 9.907482765272025e-07, "logits/chosen": -0.7375156283378601, "logits/rejected": -0.7081091403961182, "logps/chosen": -127.44627380371094, "logps/rejected": -132.630859375, "loss": 0.7758, "rewards/accuracies": 0.0, "rewards/chosen": 1.6810073852539062, "rewards/margins": -0.7595260143280029, "rewards/rejected": 2.440533399581909, "step": 1103 }, { "epoch": 0.18, "learning_rate": 9.907230942651497e-07, "logits/chosen": -0.1794503927230835, "logits/rejected": -0.1345766931772232, "logps/chosen": -58.36529541015625, "logps/rejected": -27.43830680847168, "loss": 0.7224, "rewards/accuracies": 1.0, "rewards/chosen": 1.4424759149551392, "rewards/margins": 0.8940057754516602, "rewards/rejected": 0.548470139503479, "step": 1104 }, { "epoch": 0.18, "learning_rate": 9.906978780987742e-07, "logits/chosen": -0.46986016631126404, "logits/rejected": -0.42616692185401917, "logps/chosen": -77.31205749511719, "logps/rejected": -71.40448760986328, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 1.406398057937622, "rewards/margins": 0.05418097972869873, "rewards/rejected": 1.3522170782089233, "step": 1105 }, { "epoch": 0.18, "learning_rate": 9.906726280298184e-07, "logits/chosen": 0.01965969055891037, "logits/rejected": 0.01965969055891037, "logps/chosen": -21.434223175048828, "logps/rejected": -21.434223175048828, "loss": 0.7382, "rewards/accuracies": 0.0, "rewards/chosen": 0.008532143197953701, "rewards/margins": 0.0, "rewards/rejected": 0.008532143197953701, "step": 1106 }, { "epoch": 0.18, "learning_rate": 9.90647344060027e-07, "logits/chosen": -0.16633114218711853, "logits/rejected": -0.002491746563464403, "logps/chosen": -103.7568359375, "logps/rejected": -15.391504287719727, "loss": 0.6325, "rewards/accuracies": 1.0, "rewards/chosen": 0.8268486261367798, "rewards/margins": 0.5004106760025024, "rewards/rejected": 0.32643795013427734, "step": 1107 }, { "epoch": 0.18, "learning_rate": 9.906220261911465e-07, "logits/chosen": -0.4548496901988983, "logits/rejected": -0.4645920693874359, "logps/chosen": -88.6129379272461, "logps/rejected": -93.0587158203125, "loss": 0.7326, "rewards/accuracies": 1.0, "rewards/chosen": 1.042395830154419, "rewards/margins": 0.5977959036827087, "rewards/rejected": 0.4445999264717102, "step": 1108 }, { "epoch": 0.18, "learning_rate": 9.90596674424926e-07, "logits/chosen": -0.4435791075229645, "logits/rejected": -0.3753543794155121, "logps/chosen": -69.19392395019531, "logps/rejected": -185.57708740234375, "loss": 0.8883, "rewards/accuracies": 0.0, "rewards/chosen": 0.9223907589912415, "rewards/margins": -1.0600998401641846, "rewards/rejected": 1.9824905395507812, "step": 1109 }, { "epoch": 0.18, "learning_rate": 9.90571288763118e-07, "logits/chosen": -0.30856162309646606, "logits/rejected": -0.31174391508102417, "logps/chosen": -47.591148376464844, "logps/rejected": -112.91600036621094, "loss": 0.536, "rewards/accuracies": 1.0, "rewards/chosen": 0.6643520593643188, "rewards/margins": 0.558823823928833, "rewards/rejected": 0.10552825778722763, "step": 1110 }, { "epoch": 0.18, "learning_rate": 9.905458692074754e-07, "logits/chosen": -0.3738526403903961, "logits/rejected": -0.3785080909729004, "logps/chosen": -133.34597778320312, "logps/rejected": -52.80952835083008, "loss": 0.5738, "rewards/accuracies": 1.0, "rewards/chosen": 1.237335205078125, "rewards/margins": 0.718257486820221, "rewards/rejected": 0.519077718257904, "step": 1111 }, { "epoch": 0.18, "learning_rate": 9.905204157597548e-07, "logits/chosen": -0.6434870362281799, "logits/rejected": -0.6567153930664062, "logps/chosen": -297.31878662109375, "logps/rejected": -59.190677642822266, "loss": 1.0442, "rewards/accuracies": 0.0, "rewards/chosen": 0.6183807253837585, "rewards/margins": -1.0140292644500732, "rewards/rejected": 1.6324100494384766, "step": 1112 }, { "epoch": 0.18, "learning_rate": 9.904949284217147e-07, "logits/chosen": -1.1007403135299683, "logits/rejected": -1.134466528892517, "logps/chosen": -100.1493148803711, "logps/rejected": -41.701175689697266, "loss": 0.4023, "rewards/accuracies": 1.0, "rewards/chosen": 0.18588562309741974, "rewards/margins": 0.18555870652198792, "rewards/rejected": 0.0003269195731263608, "step": 1113 }, { "epoch": 0.18, "learning_rate": 9.904694071951165e-07, "logits/chosen": 0.026365036144852638, "logits/rejected": 0.02952093631029129, "logps/chosen": -1.5906262397766113, "logps/rejected": -10.325618743896484, "loss": 0.8708, "rewards/accuracies": 1.0, "rewards/chosen": 0.19639526307582855, "rewards/margins": 0.07956615090370178, "rewards/rejected": 0.11682911217212677, "step": 1114 }, { "epoch": 0.18, "learning_rate": 9.90443852081723e-07, "logits/chosen": -0.5760404467582703, "logits/rejected": -0.5751544833183289, "logps/chosen": -81.42008972167969, "logps/rejected": -82.8154525756836, "loss": 0.5973, "rewards/accuracies": 1.0, "rewards/chosen": 0.36462098360061646, "rewards/margins": 0.04893723130226135, "rewards/rejected": 0.3156837522983551, "step": 1115 }, { "epoch": 0.18, "learning_rate": 9.904182630832997e-07, "logits/chosen": -0.7663438320159912, "logits/rejected": -0.7972869873046875, "logps/chosen": -35.62838363647461, "logps/rejected": -16.718727111816406, "loss": 0.7466, "rewards/accuracies": 0.0, "rewards/chosen": -0.05736427381634712, "rewards/margins": -0.2652839720249176, "rewards/rejected": 0.20791970193386078, "step": 1116 }, { "epoch": 0.18, "learning_rate": 9.90392640201615e-07, "logits/chosen": -0.22777783870697021, "logits/rejected": -0.2501894533634186, "logps/chosen": -115.80633544921875, "logps/rejected": -108.03987884521484, "loss": 0.8588, "rewards/accuracies": 0.0, "rewards/chosen": 0.20889587700366974, "rewards/margins": -0.28079909086227417, "rewards/rejected": 0.4896949827671051, "step": 1117 }, { "epoch": 0.18, "learning_rate": 9.903669834384391e-07, "logits/chosen": -0.5301719307899475, "logits/rejected": -0.4833606481552124, "logps/chosen": -52.78116989135742, "logps/rejected": -44.33161926269531, "loss": 0.9146, "rewards/accuracies": 1.0, "rewards/chosen": 1.3345211744308472, "rewards/margins": 0.06521916389465332, "rewards/rejected": 1.2693020105361938, "step": 1118 }, { "epoch": 0.18, "learning_rate": 9.903412927955445e-07, "logits/chosen": -0.0837126225233078, "logits/rejected": -0.0837126225233078, "logps/chosen": -58.93674087524414, "logps/rejected": -58.93674087524414, "loss": 0.5175, "rewards/accuracies": 0.0, "rewards/chosen": 0.8198375701904297, "rewards/margins": 0.0, "rewards/rejected": 0.8198375701904297, "step": 1119 }, { "epoch": 0.18, "learning_rate": 9.903155682747061e-07, "logits/chosen": -0.46364665031433105, "logits/rejected": -0.4266453683376312, "logps/chosen": -116.35569763183594, "logps/rejected": -109.51234436035156, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 0.8369209170341492, "rewards/margins": 0.37392348051071167, "rewards/rejected": 0.4629974365234375, "step": 1120 }, { "epoch": 0.18, "learning_rate": 9.902898098777015e-07, "logits/chosen": -0.3519148826599121, "logits/rejected": -0.38475465774536133, "logps/chosen": -193.7860565185547, "logps/rejected": -56.238887786865234, "loss": 0.4622, "rewards/accuracies": 0.0, "rewards/chosen": 0.598785400390625, "rewards/margins": -0.03880232572555542, "rewards/rejected": 0.6375877261161804, "step": 1121 }, { "epoch": 0.18, "learning_rate": 9.9026401760631e-07, "logits/chosen": -0.3483573794364929, "logits/rejected": -0.3704584538936615, "logps/chosen": -80.67366027832031, "logps/rejected": -104.60710144042969, "loss": 0.9071, "rewards/accuracies": 0.0, "rewards/chosen": 0.8974037170410156, "rewards/margins": -0.6434425115585327, "rewards/rejected": 1.5408462285995483, "step": 1122 }, { "epoch": 0.18, "learning_rate": 9.902381914623142e-07, "logits/chosen": -0.40171104669570923, "logits/rejected": -0.42871105670928955, "logps/chosen": -55.34581756591797, "logps/rejected": -57.99383544921875, "loss": 0.9101, "rewards/accuracies": 0.0, "rewards/chosen": -0.0014541626442223787, "rewards/margins": -1.0602684020996094, "rewards/rejected": 1.058814287185669, "step": 1123 }, { "epoch": 0.18, "learning_rate": 9.902123314474977e-07, "logits/chosen": -0.4592708349227905, "logits/rejected": -0.46156832575798035, "logps/chosen": -148.65521240234375, "logps/rejected": -70.05979919433594, "loss": 0.89, "rewards/accuracies": 0.0, "rewards/chosen": 0.2330474853515625, "rewards/margins": -0.8380492925643921, "rewards/rejected": 1.0710967779159546, "step": 1124 }, { "epoch": 0.18, "learning_rate": 9.901864375636476e-07, "logits/chosen": -0.6074013113975525, "logits/rejected": -0.47215914726257324, "logps/chosen": -155.19573974609375, "logps/rejected": -136.18319702148438, "loss": 0.5337, "rewards/accuracies": 0.0, "rewards/chosen": 1.6804672479629517, "rewards/margins": -0.22575688362121582, "rewards/rejected": 1.9062241315841675, "step": 1125 }, { "epoch": 0.18, "learning_rate": 9.901605098125526e-07, "logits/chosen": -0.5742756724357605, "logits/rejected": -0.5165364146232605, "logps/chosen": -96.95001220703125, "logps/rejected": -47.03111267089844, "loss": 0.736, "rewards/accuracies": 1.0, "rewards/chosen": 1.5392318964004517, "rewards/margins": 0.20423626899719238, "rewards/rejected": 1.3349956274032593, "step": 1126 }, { "epoch": 0.18, "learning_rate": 9.901345481960047e-07, "logits/chosen": -0.9715943336486816, "logits/rejected": -0.9487155079841614, "logps/chosen": -134.00062561035156, "logps/rejected": -74.09966278076172, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.4434524476528168, "rewards/margins": -0.8399436473846436, "rewards/rejected": 1.2833961248397827, "step": 1127 }, { "epoch": 0.18, "learning_rate": 9.90108552715797e-07, "logits/chosen": -0.039249613881111145, "logits/rejected": -0.044164758175611496, "logps/chosen": -2.2928686141967773, "logps/rejected": -25.64116859436035, "loss": 0.3483, "rewards/accuracies": 1.0, "rewards/chosen": 0.25802063941955566, "rewards/margins": 0.27202802896499634, "rewards/rejected": -0.014007377438247204, "step": 1128 }, { "epoch": 0.18, "learning_rate": 9.90082523373726e-07, "logits/chosen": -0.6805886030197144, "logits/rejected": -0.6664426326751709, "logps/chosen": -143.561279296875, "logps/rejected": -97.566650390625, "loss": 1.2904, "rewards/accuracies": 0.0, "rewards/chosen": 0.490548700094223, "rewards/margins": -1.2314071655273438, "rewards/rejected": 1.7219558954238892, "step": 1129 }, { "epoch": 0.18, "learning_rate": 9.900564601715897e-07, "logits/chosen": -0.19831927120685577, "logits/rejected": -0.08983546495437622, "logps/chosen": -75.98820495605469, "logps/rejected": -83.58917236328125, "loss": 0.6827, "rewards/accuracies": 0.0, "rewards/chosen": 0.5183960199356079, "rewards/margins": -0.2929244637489319, "rewards/rejected": 0.8113204836845398, "step": 1130 }, { "epoch": 0.18, "learning_rate": 9.900303631111887e-07, "logits/chosen": 0.21420301496982574, "logits/rejected": 0.21420301496982574, "logps/chosen": -51.340293884277344, "logps/rejected": -51.340293884277344, "loss": 0.648, "rewards/accuracies": 0.0, "rewards/chosen": -0.03967895731329918, "rewards/margins": 0.0, "rewards/rejected": -0.03967895731329918, "step": 1131 }, { "epoch": 0.18, "learning_rate": 9.900042321943267e-07, "logits/chosen": -0.37735897302627563, "logits/rejected": -0.24792903661727905, "logps/chosen": -49.67412185668945, "logps/rejected": -104.68168640136719, "loss": 0.4918, "rewards/accuracies": 0.0, "rewards/chosen": 0.9573249816894531, "rewards/margins": -0.3349212408065796, "rewards/rejected": 1.2922462224960327, "step": 1132 }, { "epoch": 0.18, "learning_rate": 9.899780674228086e-07, "logits/chosen": -0.5949997305870056, "logits/rejected": -0.48240554332733154, "logps/chosen": -69.39250183105469, "logps/rejected": -76.32015228271484, "loss": 0.8271, "rewards/accuracies": 0.0, "rewards/chosen": 0.5402763485908508, "rewards/margins": -0.6575302481651306, "rewards/rejected": 1.1978065967559814, "step": 1133 }, { "epoch": 0.18, "learning_rate": 9.899518687984422e-07, "logits/chosen": -0.25855591893196106, "logits/rejected": -0.28073957562446594, "logps/chosen": -39.6978645324707, "logps/rejected": -59.79630661010742, "loss": 0.7883, "rewards/accuracies": 0.0, "rewards/chosen": 0.9580677151679993, "rewards/margins": -0.3457443118095398, "rewards/rejected": 1.303812026977539, "step": 1134 }, { "epoch": 0.18, "learning_rate": 9.899256363230377e-07, "logits/chosen": -0.4183785915374756, "logits/rejected": -0.3993920683860779, "logps/chosen": -117.29902648925781, "logps/rejected": -62.45595169067383, "loss": 0.2769, "rewards/accuracies": 1.0, "rewards/chosen": 1.9795273542404175, "rewards/margins": 0.9536930322647095, "rewards/rejected": 1.025834321975708, "step": 1135 }, { "epoch": 0.18, "learning_rate": 9.898993699984075e-07, "logits/chosen": -0.28417423367500305, "logits/rejected": -0.38990718126296997, "logps/chosen": -39.25904846191406, "logps/rejected": -51.813621520996094, "loss": 0.585, "rewards/accuracies": 0.0, "rewards/chosen": 0.9647247195243835, "rewards/margins": -0.31084829568862915, "rewards/rejected": 1.2755730152130127, "step": 1136 }, { "epoch": 0.18, "learning_rate": 9.89873069826366e-07, "logits/chosen": -0.27015846967697144, "logits/rejected": -0.22139064967632294, "logps/chosen": -169.8890380859375, "logps/rejected": -115.35292053222656, "loss": 0.5049, "rewards/accuracies": 1.0, "rewards/chosen": 1.5540603399276733, "rewards/margins": 0.8260322213172913, "rewards/rejected": 0.7280281186103821, "step": 1137 }, { "epoch": 0.18, "learning_rate": 9.898467358087308e-07, "logits/chosen": -0.1444997638463974, "logits/rejected": -0.17868073284626007, "logps/chosen": -40.646484375, "logps/rejected": -115.69585418701172, "loss": 0.6824, "rewards/accuracies": 0.0, "rewards/chosen": 0.25253602862358093, "rewards/margins": -0.08033674955368042, "rewards/rejected": 0.33287277817726135, "step": 1138 }, { "epoch": 0.18, "learning_rate": 9.898203679473213e-07, "logits/chosen": -0.511301577091217, "logits/rejected": -0.5153087973594666, "logps/chosen": -228.46533203125, "logps/rejected": -73.12511444091797, "loss": 0.4886, "rewards/accuracies": 1.0, "rewards/chosen": 1.8753875494003296, "rewards/margins": 0.2833595275878906, "rewards/rejected": 1.592028021812439, "step": 1139 }, { "epoch": 0.19, "learning_rate": 9.89793966243959e-07, "logits/chosen": -0.7511119246482849, "logits/rejected": -0.7635484933853149, "logps/chosen": -67.41392517089844, "logps/rejected": -21.01919937133789, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 1.225110650062561, "rewards/margins": 0.8695785999298096, "rewards/rejected": 0.35553207993507385, "step": 1140 }, { "epoch": 0.19, "learning_rate": 9.897675307004679e-07, "logits/chosen": -0.3729150891304016, "logits/rejected": -0.47588568925857544, "logps/chosen": -169.99502563476562, "logps/rejected": -72.19965362548828, "loss": 0.734, "rewards/accuracies": 1.0, "rewards/chosen": 1.719722032546997, "rewards/margins": 0.8703773617744446, "rewards/rejected": 0.8493446707725525, "step": 1141 }, { "epoch": 0.19, "learning_rate": 9.897410613186748e-07, "logits/chosen": -0.5816178917884827, "logits/rejected": -0.5305801033973694, "logps/chosen": -105.23365783691406, "logps/rejected": -90.89494323730469, "loss": 0.9146, "rewards/accuracies": 0.0, "rewards/chosen": 0.8934555053710938, "rewards/margins": -0.5326584577560425, "rewards/rejected": 1.4261139631271362, "step": 1142 }, { "epoch": 0.19, "learning_rate": 9.897145581004085e-07, "logits/chosen": -0.4189237952232361, "logits/rejected": -0.4189237952232361, "logps/chosen": -73.77305603027344, "logps/rejected": -73.77305603027344, "loss": 0.5499, "rewards/accuracies": 0.0, "rewards/chosen": 0.6040847897529602, "rewards/margins": 0.0, "rewards/rejected": 0.6040847897529602, "step": 1143 }, { "epoch": 0.19, "learning_rate": 9.896880210474996e-07, "logits/chosen": -0.6888584494590759, "logits/rejected": -0.6280882954597473, "logps/chosen": -160.4983673095703, "logps/rejected": -75.65214538574219, "loss": 1.1631, "rewards/accuracies": 0.0, "rewards/chosen": 0.3895919919013977, "rewards/margins": -0.3540130853652954, "rewards/rejected": 0.7436050772666931, "step": 1144 }, { "epoch": 0.19, "learning_rate": 9.896614501617822e-07, "logits/chosen": -0.5730430483818054, "logits/rejected": -0.5349216461181641, "logps/chosen": -197.21478271484375, "logps/rejected": -37.90321731567383, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": 1.4126068353652954, "rewards/margins": 0.36113858222961426, "rewards/rejected": 1.0514682531356812, "step": 1145 }, { "epoch": 0.19, "learning_rate": 9.896348454450918e-07, "logits/chosen": -0.1264115571975708, "logits/rejected": -0.2261708676815033, "logps/chosen": -129.5434112548828, "logps/rejected": -158.63131713867188, "loss": 1.3163, "rewards/accuracies": 0.0, "rewards/chosen": 0.4316726624965668, "rewards/margins": -1.3089370727539062, "rewards/rejected": 1.7406097650527954, "step": 1146 }, { "epoch": 0.19, "learning_rate": 9.896082068992664e-07, "logits/chosen": -0.604041576385498, "logits/rejected": -0.5767216086387634, "logps/chosen": -93.19039916992188, "logps/rejected": -88.00509643554688, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 0.500666081905365, "rewards/margins": 0.43034443259239197, "rewards/rejected": 0.07032165676355362, "step": 1147 }, { "epoch": 0.19, "learning_rate": 9.895815345261467e-07, "logits/chosen": -0.573249101638794, "logits/rejected": -0.9914124608039856, "logps/chosen": -157.3268585205078, "logps/rejected": -55.610595703125, "loss": 0.1719, "rewards/accuracies": 1.0, "rewards/chosen": 2.0801315307617188, "rewards/margins": 2.037982940673828, "rewards/rejected": 0.042148590087890625, "step": 1148 }, { "epoch": 0.19, "learning_rate": 9.895548283275754e-07, "logits/chosen": -0.5845385789871216, "logits/rejected": -0.5442959666252136, "logps/chosen": -99.84114074707031, "logps/rejected": -171.71859741210938, "loss": 0.8689, "rewards/accuracies": 1.0, "rewards/chosen": 0.3238639831542969, "rewards/margins": 0.05687180161476135, "rewards/rejected": 0.2669921815395355, "step": 1149 }, { "epoch": 0.19, "learning_rate": 9.895280883053976e-07, "logits/chosen": -0.7325261235237122, "logits/rejected": -0.6745345592498779, "logps/chosen": -76.62808990478516, "logps/rejected": -39.87390899658203, "loss": 0.4161, "rewards/accuracies": 1.0, "rewards/chosen": 0.3168601989746094, "rewards/margins": 0.12065429985523224, "rewards/rejected": 0.19620589911937714, "step": 1150 }, { "epoch": 0.19, "learning_rate": 9.89501314461461e-07, "logits/chosen": -0.5070750713348389, "logits/rejected": -0.48598963022232056, "logps/chosen": -69.8701400756836, "logps/rejected": -59.445518493652344, "loss": 0.4849, "rewards/accuracies": 0.0, "rewards/chosen": 0.7339324951171875, "rewards/margins": -0.17673265933990479, "rewards/rejected": 0.9106651544570923, "step": 1151 }, { "epoch": 0.19, "learning_rate": 9.894745067976152e-07, "logits/chosen": -0.1785472184419632, "logits/rejected": -0.1785472184419632, "logps/chosen": -44.4499397277832, "logps/rejected": -44.4499397277832, "loss": 0.4053, "rewards/accuracies": 0.0, "rewards/chosen": 0.07164420932531357, "rewards/margins": 0.0, "rewards/rejected": 0.07164420932531357, "step": 1152 }, { "epoch": 0.19, "learning_rate": 9.894476653157125e-07, "logits/chosen": -1.1095929145812988, "logits/rejected": -1.153794527053833, "logps/chosen": -88.13075256347656, "logps/rejected": -49.065208435058594, "loss": 0.4358, "rewards/accuracies": 1.0, "rewards/chosen": 0.19693146646022797, "rewards/margins": 0.11655198037624359, "rewards/rejected": 0.08037948608398438, "step": 1153 }, { "epoch": 0.19, "learning_rate": 9.894207900176073e-07, "logits/chosen": -0.29568204283714294, "logits/rejected": -0.2748239040374756, "logps/chosen": -69.17466735839844, "logps/rejected": -17.061344146728516, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.4268081784248352, "rewards/margins": 0.26607704162597656, "rewards/rejected": 0.16073112189769745, "step": 1154 }, { "epoch": 0.19, "learning_rate": 9.893938809051564e-07, "logits/chosen": -0.7871775031089783, "logits/rejected": -0.7962561845779419, "logps/chosen": -79.58930969238281, "logps/rejected": -19.266971588134766, "loss": 0.7888, "rewards/accuracies": 1.0, "rewards/chosen": 0.21578751504421234, "rewards/margins": 0.09409066289663315, "rewards/rejected": 0.1216968521475792, "step": 1155 }, { "epoch": 0.19, "learning_rate": 9.89366937980219e-07, "logits/chosen": -0.5960104465484619, "logits/rejected": -0.4741549491882324, "logps/chosen": -81.316650390625, "logps/rejected": -278.4056701660156, "loss": 1.2655, "rewards/accuracies": 0.0, "rewards/chosen": 1.5606743097305298, "rewards/margins": -1.6644295454025269, "rewards/rejected": 3.2251038551330566, "step": 1156 }, { "epoch": 0.19, "learning_rate": 9.893399612446566e-07, "logits/chosen": -0.6684827208518982, "logits/rejected": -0.6726964116096497, "logps/chosen": -49.62928009033203, "logps/rejected": -98.35540771484375, "loss": 0.4923, "rewards/accuracies": 0.0, "rewards/chosen": 0.060292817652225494, "rewards/margins": -0.31664353609085083, "rewards/rejected": 0.3769363462924957, "step": 1157 }, { "epoch": 0.19, "learning_rate": 9.893129507003334e-07, "logits/chosen": -0.4817725419998169, "logits/rejected": -0.4564354121685028, "logps/chosen": -165.4744873046875, "logps/rejected": -151.75900268554688, "loss": 1.0139, "rewards/accuracies": 0.0, "rewards/chosen": 1.1610275506973267, "rewards/margins": -0.2377777099609375, "rewards/rejected": 1.3988052606582642, "step": 1158 }, { "epoch": 0.19, "learning_rate": 9.892859063491147e-07, "logits/chosen": -0.6497545838356018, "logits/rejected": -0.5707362294197083, "logps/chosen": -113.96758270263672, "logps/rejected": -16.42338752746582, "loss": 0.3112, "rewards/accuracies": 1.0, "rewards/chosen": 0.5417534112930298, "rewards/margins": 0.3660110831260681, "rewards/rejected": 0.17574234306812286, "step": 1159 }, { "epoch": 0.19, "learning_rate": 9.892588281928698e-07, "logits/chosen": -0.3116728365421295, "logits/rejected": -0.314928263425827, "logps/chosen": -70.45352172851562, "logps/rejected": -145.18862915039062, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 0.03512268140912056, "rewards/margins": 0.30496981739997864, "rewards/rejected": -0.2698471248149872, "step": 1160 }, { "epoch": 0.19, "learning_rate": 9.892317162334694e-07, "logits/chosen": -0.3399292528629303, "logits/rejected": -0.338259220123291, "logps/chosen": -111.79063415527344, "logps/rejected": -70.68009948730469, "loss": 0.8566, "rewards/accuracies": 0.0, "rewards/chosen": 0.55755615234375, "rewards/margins": -1.0294952392578125, "rewards/rejected": 1.5870513916015625, "step": 1161 }, { "epoch": 0.19, "learning_rate": 9.892045704727863e-07, "logits/chosen": -0.4842694103717804, "logits/rejected": -0.4836931824684143, "logps/chosen": -110.75834655761719, "logps/rejected": -102.10833740234375, "loss": 0.5001, "rewards/accuracies": 0.0, "rewards/chosen": 1.1449187994003296, "rewards/margins": -0.16673433780670166, "rewards/rejected": 1.3116531372070312, "step": 1162 }, { "epoch": 0.19, "learning_rate": 9.891773909126963e-07, "logits/chosen": -0.05372314900159836, "logits/rejected": -0.09203779697418213, "logps/chosen": -23.899009704589844, "logps/rejected": -31.63898277282715, "loss": 0.8406, "rewards/accuracies": 0.0, "rewards/chosen": 0.004469871520996094, "rewards/margins": -0.10377883911132812, "rewards/rejected": 0.10824871063232422, "step": 1163 }, { "epoch": 0.19, "learning_rate": 9.891501775550774e-07, "logits/chosen": -0.3188689053058624, "logits/rejected": -0.2912084758281708, "logps/chosen": -71.32563781738281, "logps/rejected": -34.262596130371094, "loss": 0.8181, "rewards/accuracies": 1.0, "rewards/chosen": 0.3583633601665497, "rewards/margins": 0.10830917954444885, "rewards/rejected": 0.25005418062210083, "step": 1164 }, { "epoch": 0.19, "learning_rate": 9.891229304018097e-07, "logits/chosen": -0.10553905367851257, "logits/rejected": -0.05122850090265274, "logps/chosen": -80.37718200683594, "logps/rejected": -52.87947463989258, "loss": 0.7356, "rewards/accuracies": 1.0, "rewards/chosen": 0.7656692862510681, "rewards/margins": 0.009177446365356445, "rewards/rejected": 0.7564918398857117, "step": 1165 }, { "epoch": 0.19, "learning_rate": 9.890956494547754e-07, "logits/chosen": -0.5952911376953125, "logits/rejected": -0.5511776804924011, "logps/chosen": -76.6153335571289, "logps/rejected": -123.05248260498047, "loss": 1.4646, "rewards/accuracies": 0.0, "rewards/chosen": 1.4213660955429077, "rewards/margins": -0.6509827375411987, "rewards/rejected": 2.0723488330841064, "step": 1166 }, { "epoch": 0.19, "learning_rate": 9.890683347158596e-07, "logits/chosen": -0.6548992395401001, "logits/rejected": -1.0295675992965698, "logps/chosen": -68.82879638671875, "logps/rejected": -52.07386779785156, "loss": 0.6634, "rewards/accuracies": 1.0, "rewards/chosen": 1.13444983959198, "rewards/margins": 0.9781445264816284, "rewards/rejected": 0.15630531311035156, "step": 1167 }, { "epoch": 0.19, "learning_rate": 9.890409861869495e-07, "logits/chosen": -0.40105554461479187, "logits/rejected": -0.4174901247024536, "logps/chosen": -155.02642822265625, "logps/rejected": -165.89852905273438, "loss": 0.5121, "rewards/accuracies": 0.0, "rewards/chosen": 1.3845398426055908, "rewards/margins": -0.44414055347442627, "rewards/rejected": 1.828680396080017, "step": 1168 }, { "epoch": 0.19, "learning_rate": 9.89013603869935e-07, "logits/chosen": -0.5766575336456299, "logits/rejected": -0.4340870976448059, "logps/chosen": -181.19638061523438, "logps/rejected": -87.19969177246094, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": 2.2325103282928467, "rewards/margins": 0.6000838279724121, "rewards/rejected": 1.6324265003204346, "step": 1169 }, { "epoch": 0.19, "learning_rate": 9.889861877667069e-07, "logits/chosen": -0.22546857595443726, "logits/rejected": -0.22546857595443726, "logps/chosen": -61.234580993652344, "logps/rejected": -61.234580993652344, "loss": 0.5598, "rewards/accuracies": 0.0, "rewards/chosen": -0.08960914611816406, "rewards/margins": 0.0, "rewards/rejected": -0.08960914611816406, "step": 1170 }, { "epoch": 0.19, "learning_rate": 9.889587378791604e-07, "logits/chosen": -0.8704437613487244, "logits/rejected": -0.8859503865242004, "logps/chosen": -87.44459533691406, "logps/rejected": -39.12553405761719, "loss": 0.739, "rewards/accuracies": 1.0, "rewards/chosen": 0.3753952085971832, "rewards/margins": 0.3165481686592102, "rewards/rejected": 0.05884704738855362, "step": 1171 }, { "epoch": 0.19, "learning_rate": 9.889312542091916e-07, "logits/chosen": -0.48549219965934753, "logits/rejected": -0.5111453533172607, "logps/chosen": -42.95692443847656, "logps/rejected": -131.51902770996094, "loss": 0.6638, "rewards/accuracies": 1.0, "rewards/chosen": 0.35358086228370667, "rewards/margins": 0.12099571526050568, "rewards/rejected": 0.232585147023201, "step": 1172 }, { "epoch": 0.19, "learning_rate": 9.889037367586997e-07, "logits/chosen": -0.4478459358215332, "logits/rejected": -0.4478459358215332, "logps/chosen": -29.930953979492188, "logps/rejected": -29.930953979492188, "loss": 0.736, "rewards/accuracies": 0.0, "rewards/chosen": 0.8824047446250916, "rewards/margins": 0.0, "rewards/rejected": 0.8824047446250916, "step": 1173 }, { "epoch": 0.19, "learning_rate": 9.888761855295852e-07, "logits/chosen": -0.42442598938941956, "logits/rejected": -0.3863997161388397, "logps/chosen": -138.6205596923828, "logps/rejected": -81.89195251464844, "loss": 0.3133, "rewards/accuracies": 1.0, "rewards/chosen": 2.137120008468628, "rewards/margins": 0.30560147762298584, "rewards/rejected": 1.831518530845642, "step": 1174 }, { "epoch": 0.19, "learning_rate": 9.888486005237523e-07, "logits/chosen": -0.7922078967094421, "logits/rejected": -0.7406747341156006, "logps/chosen": -79.328369140625, "logps/rejected": -32.203861236572266, "loss": 0.999, "rewards/accuracies": 1.0, "rewards/chosen": 0.4284301698207855, "rewards/margins": 0.3752933442592621, "rewards/rejected": 0.05313682556152344, "step": 1175 }, { "epoch": 0.19, "learning_rate": 9.888209817431068e-07, "logits/chosen": -0.1722847819328308, "logits/rejected": -0.1722847819328308, "logps/chosen": -55.585487365722656, "logps/rejected": -55.585487365722656, "loss": 0.8708, "rewards/accuracies": 0.0, "rewards/chosen": 0.7325920462608337, "rewards/margins": 0.0, "rewards/rejected": 0.7325920462608337, "step": 1176 }, { "epoch": 0.19, "learning_rate": 9.887933291895565e-07, "logits/chosen": -0.5553943514823914, "logits/rejected": -0.5589291453361511, "logps/chosen": -137.69638061523438, "logps/rejected": -111.8931884765625, "loss": 0.5528, "rewards/accuracies": 0.0, "rewards/chosen": 1.9153763055801392, "rewards/margins": -0.425251841545105, "rewards/rejected": 2.340628147125244, "step": 1177 }, { "epoch": 0.19, "learning_rate": 9.887656428650121e-07, "logits/chosen": -0.6359970569610596, "logits/rejected": -0.6198198795318604, "logps/chosen": -73.91532897949219, "logps/rejected": -79.82489013671875, "loss": 0.5623, "rewards/accuracies": 1.0, "rewards/chosen": 0.3157394528388977, "rewards/margins": 0.19823609292507172, "rewards/rejected": 0.11750335991382599, "step": 1178 }, { "epoch": 0.19, "learning_rate": 9.887379227713868e-07, "logits/chosen": -0.2987867295742035, "logits/rejected": -0.24743618071079254, "logps/chosen": -67.02252197265625, "logps/rejected": -69.40731811523438, "loss": 0.8566, "rewards/accuracies": 1.0, "rewards/chosen": 0.8647903800010681, "rewards/margins": 0.15222781896591187, "rewards/rejected": 0.7125625610351562, "step": 1179 }, { "epoch": 0.19, "learning_rate": 9.887101689105953e-07, "logits/chosen": -0.7787590622901917, "logits/rejected": -0.7964873313903809, "logps/chosen": -58.59750747680664, "logps/rejected": -83.54240417480469, "loss": 0.446, "rewards/accuracies": 0.0, "rewards/chosen": 0.259927362203598, "rewards/margins": -0.002683281898498535, "rewards/rejected": 0.26261064410209656, "step": 1180 }, { "epoch": 0.19, "learning_rate": 9.886823812845555e-07, "logits/chosen": -0.08070815354585648, "logits/rejected": -0.043988555669784546, "logps/chosen": -116.38481140136719, "logps/rejected": -63.8331413269043, "loss": 0.3705, "rewards/accuracies": 1.0, "rewards/chosen": 1.6577576398849487, "rewards/margins": 0.6374539136886597, "rewards/rejected": 1.020303726196289, "step": 1181 }, { "epoch": 0.19, "learning_rate": 9.88654559895187e-07, "logits/chosen": -0.1769258677959442, "logits/rejected": -0.17945203185081482, "logps/chosen": -79.7056884765625, "logps/rejected": -133.53355407714844, "loss": 0.8343, "rewards/accuracies": 1.0, "rewards/chosen": 0.6793052554130554, "rewards/margins": 0.022209882736206055, "rewards/rejected": 0.6570953726768494, "step": 1182 }, { "epoch": 0.19, "learning_rate": 9.886267047444122e-07, "logits/chosen": -0.3688494563102722, "logits/rejected": -0.35018017888069153, "logps/chosen": -39.612918853759766, "logps/rejected": -6.521529197692871, "loss": 0.7884, "rewards/accuracies": 0.0, "rewards/chosen": 0.12296028435230255, "rewards/margins": -0.10300521552562714, "rewards/rejected": 0.2259654998779297, "step": 1183 }, { "epoch": 0.19, "learning_rate": 9.885988158341553e-07, "logits/chosen": -0.0976376011967659, "logits/rejected": -0.06193581968545914, "logps/chosen": -63.40689468383789, "logps/rejected": -51.13249206542969, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": 1.1766560077667236, "rewards/margins": 0.31213879585266113, "rewards/rejected": 0.8645172119140625, "step": 1184 }, { "epoch": 0.19, "learning_rate": 9.885708931663436e-07, "logits/chosen": -0.2099180966615677, "logits/rejected": -0.19395272433757782, "logps/chosen": -77.90357971191406, "logps/rejected": -88.45032501220703, "loss": 0.8556, "rewards/accuracies": 0.0, "rewards/chosen": 0.40392228960990906, "rewards/margins": -0.32838132977485657, "rewards/rejected": 0.7323036193847656, "step": 1185 }, { "epoch": 0.19, "learning_rate": 9.885429367429062e-07, "logits/chosen": -0.6309916377067566, "logits/rejected": -0.6306880712509155, "logps/chosen": -83.96595764160156, "logps/rejected": -66.30905151367188, "loss": 0.5263, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474014639854431, "rewards/margins": 0.15426105260849, "rewards/rejected": 0.7931404113769531, "step": 1186 }, { "epoch": 0.19, "learning_rate": 9.885149465657741e-07, "logits/chosen": -0.465440034866333, "logits/rejected": -0.6614287495613098, "logps/chosen": -145.38595581054688, "logps/rejected": -124.5038070678711, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 1.9086426496505737, "rewards/margins": 0.3464469909667969, "rewards/rejected": 1.5621956586837769, "step": 1187 }, { "epoch": 0.19, "learning_rate": 9.884869226368819e-07, "logits/chosen": -0.31187930703163147, "logits/rejected": -0.17790982127189636, "logps/chosen": -135.6468963623047, "logps/rejected": -55.15559387207031, "loss": 0.9336, "rewards/accuracies": 1.0, "rewards/chosen": 1.4710251092910767, "rewards/margins": 0.4414947032928467, "rewards/rejected": 1.02953040599823, "step": 1188 }, { "epoch": 0.19, "learning_rate": 9.884588649581654e-07, "logits/chosen": -0.46989762783050537, "logits/rejected": -0.39890649914741516, "logps/chosen": -108.11729431152344, "logps/rejected": -169.99497985839844, "loss": 0.671, "rewards/accuracies": 0.0, "rewards/chosen": 1.5447219610214233, "rewards/margins": -0.1167144775390625, "rewards/rejected": 1.6614364385604858, "step": 1189 }, { "epoch": 0.19, "learning_rate": 9.884307735315632e-07, "logits/chosen": -0.5394408702850342, "logits/rejected": -0.46658313274383545, "logps/chosen": -110.28559875488281, "logps/rejected": -139.8318634033203, "loss": 0.7796, "rewards/accuracies": 0.0, "rewards/chosen": 0.5480591058731079, "rewards/margins": -1.1296554803848267, "rewards/rejected": 1.6777145862579346, "step": 1190 }, { "epoch": 0.19, "learning_rate": 9.88402648359016e-07, "logits/chosen": -0.5077910423278809, "logits/rejected": -0.5476176142692566, "logps/chosen": -102.08454895019531, "logps/rejected": -118.36024475097656, "loss": 0.6489, "rewards/accuracies": 0.0, "rewards/chosen": 1.21812903881073, "rewards/margins": -0.7932952642440796, "rewards/rejected": 2.0114243030548096, "step": 1191 }, { "epoch": 0.19, "learning_rate": 9.88374489442467e-07, "logits/chosen": -0.2881815731525421, "logits/rejected": -0.28026485443115234, "logps/chosen": -24.29645538330078, "logps/rejected": -23.953367233276367, "loss": 0.7709, "rewards/accuracies": 0.0, "rewards/chosen": -0.0034271241165697575, "rewards/margins": -0.2900167405605316, "rewards/rejected": 0.2865896224975586, "step": 1192 }, { "epoch": 0.19, "learning_rate": 9.88346296783862e-07, "logits/chosen": -0.28474026918411255, "logits/rejected": -0.30167555809020996, "logps/chosen": -17.28350830078125, "logps/rejected": -22.161718368530273, "loss": 0.8321, "rewards/accuracies": 1.0, "rewards/chosen": 0.23409461975097656, "rewards/margins": 0.04590281844139099, "rewards/rejected": 0.18819180130958557, "step": 1193 }, { "epoch": 0.19, "learning_rate": 9.883180703851487e-07, "logits/chosen": -0.4543224573135376, "logits/rejected": -0.4793076515197754, "logps/chosen": -69.01280212402344, "logps/rejected": -128.87159729003906, "loss": 1.5921, "rewards/accuracies": 0.0, "rewards/chosen": 0.5055877566337585, "rewards/margins": -1.8966476917266846, "rewards/rejected": 2.402235507965088, "step": 1194 }, { "epoch": 0.19, "learning_rate": 9.882898102482772e-07, "logits/chosen": -0.3298039734363556, "logits/rejected": -0.3334101438522339, "logps/chosen": -74.71156311035156, "logps/rejected": -74.68991088867188, "loss": 0.7847, "rewards/accuracies": 0.0, "rewards/chosen": 0.13470458984375, "rewards/margins": -0.4933463931083679, "rewards/rejected": 0.6280509829521179, "step": 1195 }, { "epoch": 0.19, "learning_rate": 9.882615163751999e-07, "logits/chosen": -0.1558692306280136, "logits/rejected": -0.1661451756954193, "logps/chosen": -29.22759246826172, "logps/rejected": -117.72996520996094, "loss": 0.9449, "rewards/accuracies": 0.0, "rewards/chosen": 0.5442497134208679, "rewards/margins": -0.7824088931083679, "rewards/rejected": 1.3266586065292358, "step": 1196 }, { "epoch": 0.19, "learning_rate": 9.88233188767872e-07, "logits/chosen": -0.8009697794914246, "logits/rejected": -0.7856727838516235, "logps/chosen": -326.1757507324219, "logps/rejected": -140.9679412841797, "loss": 1.2617, "rewards/accuracies": 0.0, "rewards/chosen": 1.1504913568496704, "rewards/margins": -1.9158889055252075, "rewards/rejected": 3.066380262374878, "step": 1197 }, { "epoch": 0.19, "learning_rate": 9.882048274282505e-07, "logits/chosen": -0.38310056924819946, "logits/rejected": -0.5709920525550842, "logps/chosen": -72.41400909423828, "logps/rejected": -71.09922790527344, "loss": 1.4439, "rewards/accuracies": 1.0, "rewards/chosen": 1.1715141534805298, "rewards/margins": 0.4387916922569275, "rewards/rejected": 0.7327224612236023, "step": 1198 }, { "epoch": 0.19, "learning_rate": 9.881764323582947e-07, "logits/chosen": -0.8441730737686157, "logits/rejected": -0.8176664113998413, "logps/chosen": -116.97259521484375, "logps/rejected": -101.83348083496094, "loss": 1.0248, "rewards/accuracies": 0.0, "rewards/chosen": 0.6769180297851562, "rewards/margins": -0.8249756097793579, "rewards/rejected": 1.5018936395645142, "step": 1199 }, { "epoch": 0.19, "learning_rate": 9.881480035599666e-07, "logits/chosen": -0.47430649399757385, "logits/rejected": -0.4562143087387085, "logps/chosen": -82.81914520263672, "logps/rejected": -70.05026245117188, "loss": 0.807, "rewards/accuracies": 0.0, "rewards/chosen": 0.34907761216163635, "rewards/margins": -0.6996521949768066, "rewards/rejected": 1.0487297773361206, "step": 1200 }, { "epoch": 0.19, "learning_rate": 9.881195410352304e-07, "logits/chosen": -0.5839294195175171, "logits/rejected": -0.5768266916275024, "logps/chosen": -184.8176727294922, "logps/rejected": -185.75848388671875, "loss": 1.2028, "rewards/accuracies": 0.0, "rewards/chosen": 1.4933136701583862, "rewards/margins": -2.2392959594726562, "rewards/rejected": 3.732609510421753, "step": 1201 }, { "epoch": 0.2, "learning_rate": 9.880910447860527e-07, "logits/chosen": -0.8172029256820679, "logits/rejected": -0.8374016880989075, "logps/chosen": -71.25817108154297, "logps/rejected": -21.777061462402344, "loss": 0.2819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8029541373252869, "rewards/margins": 0.6425678730010986, "rewards/rejected": 0.16038627922534943, "step": 1202 }, { "epoch": 0.2, "learning_rate": 9.88062514814402e-07, "logits/chosen": -0.2635476291179657, "logits/rejected": -0.2755117118358612, "logps/chosen": -135.70419311523438, "logps/rejected": -68.11115264892578, "loss": 0.6858, "rewards/accuracies": 0.0, "rewards/chosen": 0.19317626953125, "rewards/margins": -0.811492919921875, "rewards/rejected": 1.004669189453125, "step": 1203 }, { "epoch": 0.2, "learning_rate": 9.880339511222494e-07, "logits/chosen": -0.6294224262237549, "logits/rejected": -0.6067074537277222, "logps/chosen": -91.8056640625, "logps/rejected": -85.07644653320312, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.8937286138534546, "rewards/margins": 0.470744252204895, "rewards/rejected": 1.4229843616485596, "step": 1204 }, { "epoch": 0.2, "learning_rate": 9.880053537115688e-07, "logits/chosen": -0.4477231502532959, "logits/rejected": -0.4477231502532959, "logps/chosen": -62.00604248046875, "logps/rejected": -62.00604248046875, "loss": 0.8838, "rewards/accuracies": 0.0, "rewards/chosen": 0.8200897574424744, "rewards/margins": 0.0, "rewards/rejected": 0.8200897574424744, "step": 1205 }, { "epoch": 0.2, "learning_rate": 9.87976722584336e-07, "logits/chosen": -0.14402344822883606, "logits/rejected": -0.10783669352531433, "logps/chosen": -51.98916244506836, "logps/rejected": -67.693359375, "loss": 0.6257, "rewards/accuracies": 1.0, "rewards/chosen": 1.4076801538467407, "rewards/margins": 0.36363720893859863, "rewards/rejected": 1.044042944908142, "step": 1206 }, { "epoch": 0.2, "learning_rate": 9.879480577425288e-07, "logits/chosen": -0.05932099372148514, "logits/rejected": -0.10423309355974197, "logps/chosen": -112.13836669921875, "logps/rejected": -85.39289855957031, "loss": 0.584, "rewards/accuracies": 0.0, "rewards/chosen": -0.1295936554670334, "rewards/margins": -0.31627577543258667, "rewards/rejected": 0.18668213486671448, "step": 1207 }, { "epoch": 0.2, "learning_rate": 9.879193591881278e-07, "logits/chosen": -0.6668928265571594, "logits/rejected": -0.6133800745010376, "logps/chosen": -96.29199981689453, "logps/rejected": -70.99584197998047, "loss": 0.578, "rewards/accuracies": 0.0, "rewards/chosen": 0.33350449800491333, "rewards/margins": -0.2641647458076477, "rewards/rejected": 0.597669243812561, "step": 1208 }, { "epoch": 0.2, "learning_rate": 9.878906269231158e-07, "logits/chosen": -0.13816043734550476, "logits/rejected": -0.1358155906200409, "logps/chosen": -91.94453430175781, "logps/rejected": -109.92097473144531, "loss": 1.3165, "rewards/accuracies": 0.0, "rewards/chosen": -0.13297729194164276, "rewards/margins": -0.8840126395225525, "rewards/rejected": 0.7510353326797485, "step": 1209 }, { "epoch": 0.2, "learning_rate": 9.87861860949478e-07, "logits/chosen": -0.4143330454826355, "logits/rejected": -0.44950371980667114, "logps/chosen": -96.56611633300781, "logps/rejected": -133.31312561035156, "loss": 0.8667, "rewards/accuracies": 0.0, "rewards/chosen": 0.7079742550849915, "rewards/margins": -0.6419647336006165, "rewards/rejected": 1.349938988685608, "step": 1210 }, { "epoch": 0.2, "learning_rate": 9.878330612692017e-07, "logits/chosen": -0.3852238953113556, "logits/rejected": -0.33100661635398865, "logps/chosen": -61.09321212768555, "logps/rejected": -84.29965209960938, "loss": 0.8829, "rewards/accuracies": 0.0, "rewards/chosen": 0.4223102629184723, "rewards/margins": -0.5985897779464722, "rewards/rejected": 1.020900011062622, "step": 1211 }, { "epoch": 0.2, "learning_rate": 9.878042278842769e-07, "logits/chosen": -0.4307001531124115, "logits/rejected": -0.44242268800735474, "logps/chosen": -14.987231254577637, "logps/rejected": -4.6141676902771, "loss": 0.5902, "rewards/accuracies": 0.0, "rewards/chosen": 0.048262786120176315, "rewards/margins": -0.13137078285217285, "rewards/rejected": 0.17963357269763947, "step": 1212 }, { "epoch": 0.2, "learning_rate": 9.877753607966953e-07, "logits/chosen": -0.36621391773223877, "logits/rejected": -0.36621391773223877, "logps/chosen": -1.582680106163025, "logps/rejected": -1.582680106163025, "loss": 0.7095, "rewards/accuracies": 0.0, "rewards/chosen": 0.06272393465042114, "rewards/margins": 0.0, "rewards/rejected": 0.06272393465042114, "step": 1213 }, { "epoch": 0.2, "learning_rate": 9.877464600084522e-07, "logits/chosen": -0.1303483247756958, "logits/rejected": -0.14432069659233093, "logps/chosen": -98.00958251953125, "logps/rejected": -62.092384338378906, "loss": 1.0397, "rewards/accuracies": 0.0, "rewards/chosen": 0.1572616547346115, "rewards/margins": -1.2243927717208862, "rewards/rejected": 1.3816543817520142, "step": 1214 }, { "epoch": 0.2, "learning_rate": 9.877175255215434e-07, "logits/chosen": -0.8130632042884827, "logits/rejected": -0.8028640151023865, "logps/chosen": -93.41978454589844, "logps/rejected": -91.61035919189453, "loss": 1.1744, "rewards/accuracies": 0.0, "rewards/chosen": 0.6835487484931946, "rewards/margins": -0.29663926362991333, "rewards/rejected": 0.9801880121231079, "step": 1215 }, { "epoch": 0.2, "learning_rate": 9.876885573379685e-07, "logits/chosen": -0.9678093194961548, "logits/rejected": -0.9022422432899475, "logps/chosen": -38.71766662597656, "logps/rejected": -181.3072509765625, "loss": 1.407, "rewards/accuracies": 0.0, "rewards/chosen": 1.1709450483322144, "rewards/margins": -0.44051432609558105, "rewards/rejected": 1.6114593744277954, "step": 1216 }, { "epoch": 0.2, "learning_rate": 9.876595554597288e-07, "logits/chosen": -0.6476669907569885, "logits/rejected": -0.649786114692688, "logps/chosen": -70.61909484863281, "logps/rejected": -33.03480529785156, "loss": 0.4246, "rewards/accuracies": 1.0, "rewards/chosen": 1.3310272693634033, "rewards/margins": 0.5847046375274658, "rewards/rejected": 0.7463226318359375, "step": 1217 }, { "epoch": 0.2, "learning_rate": 9.876305198888282e-07, "logits/chosen": -0.16086922585964203, "logits/rejected": -0.21342743933200836, "logps/chosen": -136.86746215820312, "logps/rejected": -114.10140991210938, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 1.9315887689590454, "rewards/margins": -0.36692512035369873, "rewards/rejected": 2.298513889312744, "step": 1218 }, { "epoch": 0.2, "learning_rate": 9.876014506272726e-07, "logits/chosen": -0.31132444739341736, "logits/rejected": -0.28290513157844543, "logps/chosen": -59.14315414428711, "logps/rejected": -17.899837493896484, "loss": 1.48, "rewards/accuracies": 1.0, "rewards/chosen": 1.0762226581573486, "rewards/margins": 0.7685608267784119, "rewards/rejected": 0.30766183137893677, "step": 1219 }, { "epoch": 0.2, "learning_rate": 9.875723476770705e-07, "logits/chosen": -0.5198438167572021, "logits/rejected": -0.45054617524147034, "logps/chosen": -166.59698486328125, "logps/rejected": -38.309139251708984, "loss": 0.0861, "rewards/accuracies": 1.0, "rewards/chosen": 2.415052890777588, "rewards/margins": 2.2577991485595703, "rewards/rejected": 0.15725365281105042, "step": 1220 }, { "epoch": 0.2, "learning_rate": 9.875432110402326e-07, "logits/chosen": -0.45662617683410645, "logits/rejected": -0.45662617683410645, "logps/chosen": -24.435495376586914, "logps/rejected": -24.435495376586914, "loss": 0.7072, "rewards/accuracies": 0.0, "rewards/chosen": 0.018752098083496094, "rewards/margins": 0.0, "rewards/rejected": 0.018752098083496094, "step": 1221 }, { "epoch": 0.2, "learning_rate": 9.87514040718772e-07, "logits/chosen": -0.42296358942985535, "logits/rejected": -0.40535518527030945, "logps/chosen": -111.7411880493164, "logps/rejected": -55.427833557128906, "loss": 1.4582, "rewards/accuracies": 1.0, "rewards/chosen": 0.721387505531311, "rewards/margins": 0.19583243131637573, "rewards/rejected": 0.5255550742149353, "step": 1222 }, { "epoch": 0.2, "learning_rate": 9.874848367147043e-07, "logits/chosen": -0.5736357569694519, "logits/rejected": -0.4893297851085663, "logps/chosen": -173.19781494140625, "logps/rejected": -61.597145080566406, "loss": 0.4276, "rewards/accuracies": 1.0, "rewards/chosen": 1.4454621076583862, "rewards/margins": 0.08166277408599854, "rewards/rejected": 1.3637993335723877, "step": 1223 }, { "epoch": 0.2, "learning_rate": 9.874555990300469e-07, "logits/chosen": -0.43188485503196716, "logits/rejected": -0.4115505516529083, "logps/chosen": -89.581298828125, "logps/rejected": -69.5991439819336, "loss": 0.9921, "rewards/accuracies": 0.0, "rewards/chosen": 0.43486329913139343, "rewards/margins": -0.5575195550918579, "rewards/rejected": 0.992382824420929, "step": 1224 }, { "epoch": 0.2, "learning_rate": 9.874263276668199e-07, "logits/chosen": -0.39529722929000854, "logits/rejected": -0.39529722929000854, "logps/chosen": -16.879121780395508, "logps/rejected": -16.879121780395508, "loss": 0.685, "rewards/accuracies": 0.0, "rewards/chosen": 0.22990131378173828, "rewards/margins": 0.0, "rewards/rejected": 0.22990131378173828, "step": 1225 }, { "epoch": 0.2, "learning_rate": 9.873970226270456e-07, "logits/chosen": -0.2934166193008423, "logits/rejected": -0.22952772676944733, "logps/chosen": -291.0242004394531, "logps/rejected": -70.53750610351562, "loss": 0.3285, "rewards/accuracies": 1.0, "rewards/chosen": 1.854949951171875, "rewards/margins": 0.24741661548614502, "rewards/rejected": 1.60753333568573, "step": 1226 }, { "epoch": 0.2, "learning_rate": 9.87367683912749e-07, "logits/chosen": -0.6780602931976318, "logits/rejected": -1.0685304403305054, "logps/chosen": -123.32170104980469, "logps/rejected": -35.84986114501953, "loss": 0.8817, "rewards/accuracies": 1.0, "rewards/chosen": 0.234375, "rewards/margins": 0.15053406357765198, "rewards/rejected": 0.08384094387292862, "step": 1227 }, { "epoch": 0.2, "learning_rate": 9.873383115259569e-07, "logits/chosen": -0.41170820593833923, "logits/rejected": -0.36089542508125305, "logps/chosen": -35.90580749511719, "logps/rejected": -15.473474502563477, "loss": 0.5358, "rewards/accuracies": 1.0, "rewards/chosen": 0.7199699282646179, "rewards/margins": 0.06597614288330078, "rewards/rejected": 0.6539937853813171, "step": 1228 }, { "epoch": 0.2, "learning_rate": 9.873089054686988e-07, "logits/chosen": -0.4581577181816101, "logits/rejected": -0.5081145167350769, "logps/chosen": -92.42779541015625, "logps/rejected": -94.8414535522461, "loss": 0.5817, "rewards/accuracies": 0.0, "rewards/chosen": 1.4213173389434814, "rewards/margins": -0.44824981689453125, "rewards/rejected": 1.8695671558380127, "step": 1229 }, { "epoch": 0.2, "learning_rate": 9.87279465743006e-07, "logits/chosen": -0.47390633821487427, "logits/rejected": -0.47390633821487427, "logps/chosen": -68.43875122070312, "logps/rejected": -68.43875122070312, "loss": 0.6447, "rewards/accuracies": 0.0, "rewards/chosen": 0.7421218752861023, "rewards/margins": 0.0, "rewards/rejected": 0.7421218752861023, "step": 1230 }, { "epoch": 0.2, "learning_rate": 9.872499923509132e-07, "logits/chosen": -0.3198246657848358, "logits/rejected": -0.3198246657848358, "logps/chosen": -25.761415481567383, "logps/rejected": -25.761415481567383, "loss": 0.5331, "rewards/accuracies": 0.0, "rewards/chosen": 0.6952390670776367, "rewards/margins": 0.0, "rewards/rejected": 0.6952390670776367, "step": 1231 }, { "epoch": 0.2, "learning_rate": 9.872204852944561e-07, "logits/chosen": -0.3779910206794739, "logits/rejected": -0.3779910206794739, "logps/chosen": -88.84390258789062, "logps/rejected": -88.84390258789062, "loss": 0.8008, "rewards/accuracies": 0.0, "rewards/chosen": 0.8876304626464844, "rewards/margins": 0.0, "rewards/rejected": 0.8876304626464844, "step": 1232 }, { "epoch": 0.2, "learning_rate": 9.871909445756735e-07, "logits/chosen": -0.6046261191368103, "logits/rejected": -0.6098018884658813, "logps/chosen": -79.11894989013672, "logps/rejected": -86.86235046386719, "loss": 0.6839, "rewards/accuracies": 0.0, "rewards/chosen": 0.7212883234024048, "rewards/margins": -0.3668152093887329, "rewards/rejected": 1.0881035327911377, "step": 1233 }, { "epoch": 0.2, "learning_rate": 9.871613701966066e-07, "logits/chosen": -0.4397522509098053, "logits/rejected": -0.5078603029251099, "logps/chosen": -148.30474853515625, "logps/rejected": -62.433982849121094, "loss": 0.7176, "rewards/accuracies": 1.0, "rewards/chosen": 2.0461976528167725, "rewards/margins": 1.1234192848205566, "rewards/rejected": 0.922778308391571, "step": 1234 }, { "epoch": 0.2, "learning_rate": 9.871317621592986e-07, "logits/chosen": -0.6016806364059448, "logits/rejected": -0.6125116348266602, "logps/chosen": -86.89657592773438, "logps/rejected": -47.26948928833008, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 1.1883400678634644, "rewards/margins": 0.22518432140350342, "rewards/rejected": 0.9631557464599609, "step": 1235 }, { "epoch": 0.2, "learning_rate": 9.871021204657952e-07, "logits/chosen": -0.6589880585670471, "logits/rejected": -0.5525935888290405, "logps/chosen": -192.63113403320312, "logps/rejected": -37.7963981628418, "loss": 0.2796, "rewards/accuracies": 1.0, "rewards/chosen": 0.7431671023368835, "rewards/margins": 0.4972400665283203, "rewards/rejected": 0.24592705070972443, "step": 1236 }, { "epoch": 0.2, "learning_rate": 9.87072445118144e-07, "logits/chosen": -0.04711788147687912, "logits/rejected": -0.04711788147687912, "logps/chosen": -60.311744689941406, "logps/rejected": -60.311744689941406, "loss": 0.5387, "rewards/accuracies": 0.0, "rewards/chosen": 0.4746742248535156, "rewards/margins": 0.0, "rewards/rejected": 0.4746742248535156, "step": 1237 }, { "epoch": 0.2, "learning_rate": 9.870427361183958e-07, "logits/chosen": -0.4613285958766937, "logits/rejected": -0.3850577473640442, "logps/chosen": -116.28282165527344, "logps/rejected": -77.78157043457031, "loss": 0.8218, "rewards/accuracies": 0.0, "rewards/chosen": 0.23793335258960724, "rewards/margins": -0.7875068783760071, "rewards/rejected": 1.0254402160644531, "step": 1238 }, { "epoch": 0.2, "learning_rate": 9.870129934686028e-07, "logits/chosen": -0.8611461520195007, "logits/rejected": -0.8062282204627991, "logps/chosen": -114.8975830078125, "logps/rejected": -123.02522277832031, "loss": 1.5205, "rewards/accuracies": 0.0, "rewards/chosen": 0.7685806155204773, "rewards/margins": -0.5581589341163635, "rewards/rejected": 1.3267395496368408, "step": 1239 }, { "epoch": 0.2, "learning_rate": 9.869832171708203e-07, "logits/chosen": -0.4598104953765869, "logits/rejected": -0.4420849084854126, "logps/chosen": -61.97636413574219, "logps/rejected": -107.2048568725586, "loss": 0.9712, "rewards/accuracies": 0.0, "rewards/chosen": 1.564173936843872, "rewards/margins": -1.0572516918182373, "rewards/rejected": 2.6214256286621094, "step": 1240 }, { "epoch": 0.2, "learning_rate": 9.869534072271053e-07, "logits/chosen": -0.2726703882217407, "logits/rejected": -0.2629311978816986, "logps/chosen": -64.09538269042969, "logps/rejected": -106.52217864990234, "loss": 0.9443, "rewards/accuracies": 0.0, "rewards/chosen": 0.8283584713935852, "rewards/margins": -1.4876337051391602, "rewards/rejected": 2.3159921169281006, "step": 1241 }, { "epoch": 0.2, "learning_rate": 9.869235636395176e-07, "logits/chosen": -0.751497209072113, "logits/rejected": -0.6426761150360107, "logps/chosen": -139.93324279785156, "logps/rejected": -69.9773941040039, "loss": 0.502, "rewards/accuracies": 0.0, "rewards/chosen": 1.1521072387695312, "rewards/margins": -0.35932767391204834, "rewards/rejected": 1.5114349126815796, "step": 1242 }, { "epoch": 0.2, "learning_rate": 9.868936864101187e-07, "logits/chosen": -0.5171256065368652, "logits/rejected": -0.48435157537460327, "logps/chosen": -50.65699005126953, "logps/rejected": -39.793701171875, "loss": 0.1974, "rewards/accuracies": 1.0, "rewards/chosen": 1.5390510559082031, "rewards/margins": 1.1866748332977295, "rewards/rejected": 0.35237619280815125, "step": 1243 }, { "epoch": 0.2, "learning_rate": 9.868637755409733e-07, "logits/chosen": -0.45068174600601196, "logits/rejected": -0.4448584318161011, "logps/chosen": -107.67839050292969, "logps/rejected": -109.34767150878906, "loss": 0.4198, "rewards/accuracies": 1.0, "rewards/chosen": 1.1628211736679077, "rewards/margins": 0.13412857055664062, "rewards/rejected": 1.028692603111267, "step": 1244 }, { "epoch": 0.2, "learning_rate": 9.868338310341477e-07, "logits/chosen": -0.48302575945854187, "logits/rejected": -0.46671000123023987, "logps/chosen": -119.41309356689453, "logps/rejected": -157.98251342773438, "loss": 0.8988, "rewards/accuracies": 0.0, "rewards/chosen": 1.3997688293457031, "rewards/margins": -1.3765647411346436, "rewards/rejected": 2.7763335704803467, "step": 1245 }, { "epoch": 0.2, "learning_rate": 9.868038528917108e-07, "logits/chosen": -0.2901720106601715, "logits/rejected": -0.3353659510612488, "logps/chosen": -36.93315887451172, "logps/rejected": -55.246742248535156, "loss": 0.8086, "rewards/accuracies": 0.0, "rewards/chosen": 0.07449112087488174, "rewards/margins": -0.12209624797105789, "rewards/rejected": 0.19658736884593964, "step": 1246 }, { "epoch": 0.2, "learning_rate": 9.867738411157338e-07, "logits/chosen": -0.13991312682628632, "logits/rejected": -0.10474184155464172, "logps/chosen": -183.58224487304688, "logps/rejected": -104.30455017089844, "loss": 0.6401, "rewards/accuracies": 0.0, "rewards/chosen": 1.1596587896347046, "rewards/margins": -0.6993370056152344, "rewards/rejected": 1.858995795249939, "step": 1247 }, { "epoch": 0.2, "learning_rate": 9.867437957082904e-07, "logits/chosen": -0.23662731051445007, "logits/rejected": -0.22310420870780945, "logps/chosen": -86.26705169677734, "logps/rejected": -62.570091247558594, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 0.43117448687553406, "rewards/margins": 0.28566133975982666, "rewards/rejected": 0.1455131620168686, "step": 1248 }, { "epoch": 0.2, "learning_rate": 9.867137166714564e-07, "logits/chosen": -0.45475882291793823, "logits/rejected": -0.4503639340400696, "logps/chosen": -2.7729852199554443, "logps/rejected": -8.783554077148438, "loss": 0.8881, "rewards/accuracies": 1.0, "rewards/chosen": 0.15307562053203583, "rewards/margins": 0.02241174876689911, "rewards/rejected": 0.13066387176513672, "step": 1249 }, { "epoch": 0.2, "learning_rate": 9.866836040073097e-07, "logits/chosen": -0.8972603678703308, "logits/rejected": -0.8292879462242126, "logps/chosen": -141.3773193359375, "logps/rejected": -26.01650619506836, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/chosen": 2.1198060512542725, "rewards/margins": 2.0201919078826904, "rewards/rejected": 0.09961414337158203, "step": 1250 }, { "epoch": 0.2, "learning_rate": 9.86653457717931e-07, "logits/chosen": -0.3803999722003937, "logits/rejected": -0.4021607041358948, "logps/chosen": -155.33290100097656, "logps/rejected": -82.77470397949219, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 3.2185730934143066, "rewards/margins": 1.5598130226135254, "rewards/rejected": 1.6587600708007812, "step": 1251 }, { "epoch": 0.2, "learning_rate": 9.866232778054032e-07, "logits/chosen": -0.18513470888137817, "logits/rejected": -0.1701376736164093, "logps/chosen": -16.725683212280273, "logps/rejected": -3.604881525039673, "loss": 1.1037, "rewards/accuracies": 0.0, "rewards/chosen": 0.09175491333007812, "rewards/margins": -0.12983348965644836, "rewards/rejected": 0.2215884029865265, "step": 1252 }, { "epoch": 0.2, "learning_rate": 9.865930642718114e-07, "logits/chosen": -0.6524350047111511, "logits/rejected": -0.6014057993888855, "logps/chosen": -114.28044128417969, "logps/rejected": -47.50286865234375, "loss": 0.5384, "rewards/accuracies": 0.0, "rewards/chosen": -0.13003693521022797, "rewards/margins": -0.13546715676784515, "rewards/rejected": 0.0054302215576171875, "step": 1253 }, { "epoch": 0.2, "learning_rate": 9.86562817119243e-07, "logits/chosen": -0.3419865667819977, "logits/rejected": -0.38550475239753723, "logps/chosen": -80.78890991210938, "logps/rejected": -91.10581970214844, "loss": 0.5523, "rewards/accuracies": 0.0, "rewards/chosen": 0.17000122368335724, "rewards/margins": -0.2795776128768921, "rewards/rejected": 0.4495788514614105, "step": 1254 }, { "epoch": 0.2, "learning_rate": 9.86532536349788e-07, "logits/chosen": -0.9379235506057739, "logits/rejected": -0.7849189043045044, "logps/chosen": -79.77680969238281, "logps/rejected": -203.62591552734375, "loss": 1.4395, "rewards/accuracies": 0.0, "rewards/chosen": 1.5345817804336548, "rewards/margins": -2.379960060119629, "rewards/rejected": 3.914541721343994, "step": 1255 }, { "epoch": 0.2, "learning_rate": 9.865022219655383e-07, "logits/chosen": -0.902962863445282, "logits/rejected": -0.8887812495231628, "logps/chosen": -123.83338165283203, "logps/rejected": -158.36314392089844, "loss": 0.8151, "rewards/accuracies": 0.0, "rewards/chosen": 0.14940719306468964, "rewards/margins": -0.43985670804977417, "rewards/rejected": 0.589263916015625, "step": 1256 }, { "epoch": 0.2, "learning_rate": 9.864718739685882e-07, "logits/chosen": -0.4453403949737549, "logits/rejected": -0.42248430848121643, "logps/chosen": -85.94219970703125, "logps/rejected": -84.18527221679688, "loss": 0.531, "rewards/accuracies": 0.0, "rewards/chosen": 1.3616409301757812, "rewards/margins": -0.2988159656524658, "rewards/rejected": 1.660456895828247, "step": 1257 }, { "epoch": 0.2, "learning_rate": 9.864414923610348e-07, "logits/chosen": -0.3452390730381012, "logits/rejected": -0.3556860387325287, "logps/chosen": -3.8674211502075195, "logps/rejected": -18.68110466003418, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": 0.38328391313552856, "rewards/margins": 0.08516129851341248, "rewards/rejected": 0.2981226146221161, "step": 1258 }, { "epoch": 0.2, "learning_rate": 9.86411077144977e-07, "logits/chosen": -0.7294653058052063, "logits/rejected": -0.7348421216011047, "logps/chosen": -97.78897094726562, "logps/rejected": -123.75485229492188, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 2.0898056030273438, "rewards/margins": 0.5542739629745483, "rewards/rejected": 1.5355316400527954, "step": 1259 }, { "epoch": 0.2, "learning_rate": 9.863806283225163e-07, "logits/chosen": -0.6268986463546753, "logits/rejected": -0.6812489628791809, "logps/chosen": -150.5030517578125, "logps/rejected": -102.64503479003906, "loss": 0.3467, "rewards/accuracies": 1.0, "rewards/chosen": 3.2447526454925537, "rewards/margins": 2.382838487625122, "rewards/rejected": 0.8619140982627869, "step": 1260 }, { "epoch": 0.2, "learning_rate": 9.86350145895756e-07, "logits/chosen": 0.056259725242853165, "logits/rejected": 0.06648769974708557, "logps/chosen": -5.526655673980713, "logps/rejected": -3.8261494636535645, "loss": 0.7755, "rewards/accuracies": 1.0, "rewards/chosen": 0.14171944558620453, "rewards/margins": 0.003881514072418213, "rewards/rejected": 0.13783793151378632, "step": 1261 }, { "epoch": 0.2, "learning_rate": 9.86319629866803e-07, "logits/chosen": -0.538886308670044, "logits/rejected": -0.4275452792644501, "logps/chosen": -152.59506225585938, "logps/rejected": -91.40225219726562, "loss": 1.0178, "rewards/accuracies": 1.0, "rewards/chosen": 0.8514190912246704, "rewards/margins": 0.08927768468856812, "rewards/rejected": 0.7621414065361023, "step": 1262 }, { "epoch": 0.2, "learning_rate": 9.86289080237765e-07, "logits/chosen": -0.549251914024353, "logits/rejected": -0.5664284229278564, "logps/chosen": -26.18698501586914, "logps/rejected": -19.808610916137695, "loss": 0.7442, "rewards/accuracies": 0.0, "rewards/chosen": -0.02088909223675728, "rewards/margins": -0.23317165672779083, "rewards/rejected": 0.21228256821632385, "step": 1263 }, { "epoch": 0.21, "learning_rate": 9.862584970107528e-07, "logits/chosen": -0.36574456095695496, "logits/rejected": -0.36574456095695496, "logps/chosen": -87.51752471923828, "logps/rejected": -87.51752471923828, "loss": 1.0874, "rewards/accuracies": 0.0, "rewards/chosen": 0.8834366202354431, "rewards/margins": 0.0, "rewards/rejected": 0.8834366202354431, "step": 1264 }, { "epoch": 0.21, "learning_rate": 9.862278801878794e-07, "logits/chosen": -0.3822588324546814, "logits/rejected": -0.3822588324546814, "logps/chosen": -1.9558041095733643, "logps/rejected": -1.9558041095733643, "loss": 0.5747, "rewards/accuracies": 0.0, "rewards/chosen": 0.3457604646682739, "rewards/margins": 0.0, "rewards/rejected": 0.3457604646682739, "step": 1265 }, { "epoch": 0.21, "learning_rate": 9.861972297712603e-07, "logits/chosen": -0.4226459562778473, "logits/rejected": -0.4258221983909607, "logps/chosen": -59.87858581542969, "logps/rejected": -31.682416915893555, "loss": 0.3771, "rewards/accuracies": 1.0, "rewards/chosen": 1.3378952741622925, "rewards/margins": 0.6249873042106628, "rewards/rejected": 0.7129079699516296, "step": 1266 }, { "epoch": 0.21, "learning_rate": 9.861665457630133e-07, "logits/chosen": -0.3686637878417969, "logits/rejected": -0.3453827500343323, "logps/chosen": -69.20207214355469, "logps/rejected": -62.77857971191406, "loss": 0.7804, "rewards/accuracies": 0.0, "rewards/chosen": 1.2550872564315796, "rewards/margins": -0.11429905891418457, "rewards/rejected": 1.3693863153457642, "step": 1267 }, { "epoch": 0.21, "learning_rate": 9.86135828165258e-07, "logits/chosen": -0.636150062084198, "logits/rejected": -0.5241295099258423, "logps/chosen": -332.20281982421875, "logps/rejected": -207.34396362304688, "loss": 0.9942, "rewards/accuracies": 0.0, "rewards/chosen": 1.9324921369552612, "rewards/margins": -0.7836395502090454, "rewards/rejected": 2.7161316871643066, "step": 1268 }, { "epoch": 0.21, "learning_rate": 9.861050769801166e-07, "logits/chosen": -0.13887731730937958, "logits/rejected": -0.13887731730937958, "logps/chosen": -35.53938293457031, "logps/rejected": -35.53938293457031, "loss": 0.5509, "rewards/accuracies": 0.0, "rewards/chosen": 1.3572101593017578, "rewards/margins": 0.0, "rewards/rejected": 1.3572101593017578, "step": 1269 }, { "epoch": 0.21, "learning_rate": 9.86074292209714e-07, "logits/chosen": -0.47901710867881775, "logits/rejected": -0.45914196968078613, "logps/chosen": -82.08970642089844, "logps/rejected": -75.7100601196289, "loss": 0.8067, "rewards/accuracies": 0.0, "rewards/chosen": 0.2740432918071747, "rewards/margins": -0.958753228187561, "rewards/rejected": 1.232796549797058, "step": 1270 }, { "epoch": 0.21, "learning_rate": 9.860434738561773e-07, "logits/chosen": -0.06198665127158165, "logits/rejected": -0.1036984995007515, "logps/chosen": -57.26723098754883, "logps/rejected": -74.88909912109375, "loss": 0.9547, "rewards/accuracies": 0.0, "rewards/chosen": 0.2712768614292145, "rewards/margins": -0.7823303937911987, "rewards/rejected": 1.0536072254180908, "step": 1271 }, { "epoch": 0.21, "learning_rate": 9.860126219216353e-07, "logits/chosen": -0.40991267561912537, "logits/rejected": -0.40041956305503845, "logps/chosen": -34.28244400024414, "logps/rejected": -8.151969909667969, "loss": 0.6641, "rewards/accuracies": 0.0, "rewards/chosen": 0.11013183742761612, "rewards/margins": -0.2778116464614868, "rewards/rejected": 0.38794347643852234, "step": 1272 }, { "epoch": 0.21, "learning_rate": 9.859817364082201e-07, "logits/chosen": -0.4636315703392029, "logits/rejected": -0.41743046045303345, "logps/chosen": -64.26508331298828, "logps/rejected": -122.46928405761719, "loss": 1.2487, "rewards/accuracies": 0.0, "rewards/chosen": 0.9707099795341492, "rewards/margins": -0.18102043867111206, "rewards/rejected": 1.1517304182052612, "step": 1273 }, { "epoch": 0.21, "learning_rate": 9.859508173180653e-07, "logits/chosen": -0.5049199461936951, "logits/rejected": -0.5387250185012817, "logps/chosen": -123.87734985351562, "logps/rejected": -68.85706329345703, "loss": 1.1069, "rewards/accuracies": 0.0, "rewards/chosen": 0.6925918459892273, "rewards/margins": -0.4737747311592102, "rewards/rejected": 1.1663665771484375, "step": 1274 }, { "epoch": 0.21, "learning_rate": 9.85919864653307e-07, "logits/chosen": -0.6043634414672852, "logits/rejected": -0.4645653963088989, "logps/chosen": -88.52227783203125, "logps/rejected": -77.4135971069336, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.4481651782989502, "rewards/margins": 0.29278719425201416, "rewards/rejected": 1.155377984046936, "step": 1275 }, { "epoch": 0.21, "learning_rate": 9.858888784160836e-07, "logits/chosen": -0.6577765941619873, "logits/rejected": -0.6551858186721802, "logps/chosen": -156.62161254882812, "logps/rejected": -113.24143981933594, "loss": 0.4237, "rewards/accuracies": 1.0, "rewards/chosen": 2.549053907394409, "rewards/margins": 0.27184128761291504, "rewards/rejected": 2.277212619781494, "step": 1276 }, { "epoch": 0.21, "learning_rate": 9.858578586085366e-07, "logits/chosen": -0.3929704427719116, "logits/rejected": -0.43385666608810425, "logps/chosen": -111.01636505126953, "logps/rejected": -69.62696838378906, "loss": 0.9063, "rewards/accuracies": 0.0, "rewards/chosen": 0.08508529514074326, "rewards/margins": -1.2741097211837769, "rewards/rejected": 1.3591949939727783, "step": 1277 }, { "epoch": 0.21, "learning_rate": 9.858268052328088e-07, "logits/chosen": -0.3254806697368622, "logits/rejected": -0.07858286052942276, "logps/chosen": -133.25022888183594, "logps/rejected": -50.02680969238281, "loss": 0.3029, "rewards/accuracies": 1.0, "rewards/chosen": 2.254608154296875, "rewards/margins": 0.8524513244628906, "rewards/rejected": 1.4021568298339844, "step": 1278 }, { "epoch": 0.21, "learning_rate": 9.857957182910455e-07, "logits/chosen": -0.5037498474121094, "logits/rejected": -0.488595187664032, "logps/chosen": -113.5480728149414, "logps/rejected": -70.91287994384766, "loss": 0.6551, "rewards/accuracies": 0.0, "rewards/chosen": 0.13019028306007385, "rewards/margins": -0.18614807724952698, "rewards/rejected": 0.31633836030960083, "step": 1279 }, { "epoch": 0.21, "learning_rate": 9.857645977853947e-07, "logits/chosen": -0.11698433756828308, "logits/rejected": -0.10982561111450195, "logps/chosen": -54.71002960205078, "logps/rejected": -120.81205749511719, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 0.7861156463623047, "rewards/margins": 1.1389553546905518, "rewards/rejected": -0.3528396785259247, "step": 1280 }, { "epoch": 0.21, "learning_rate": 9.857334437180068e-07, "logits/chosen": -0.2465651035308838, "logits/rejected": -0.27262961864471436, "logps/chosen": -62.42771911621094, "logps/rejected": -108.37708282470703, "loss": 0.5419, "rewards/accuracies": 1.0, "rewards/chosen": 0.684478759765625, "rewards/margins": 0.484506219625473, "rewards/rejected": 0.19997254014015198, "step": 1281 }, { "epoch": 0.21, "learning_rate": 9.857022560910337e-07, "logits/chosen": -0.817693293094635, "logits/rejected": -0.8719503879547119, "logps/chosen": -133.88665771484375, "logps/rejected": -143.79605102539062, "loss": 0.7646, "rewards/accuracies": 0.0, "rewards/chosen": 0.36428529024124146, "rewards/margins": -0.007977277040481567, "rewards/rejected": 0.372262567281723, "step": 1282 }, { "epoch": 0.21, "learning_rate": 9.856710349066307e-07, "logits/chosen": -0.5410975813865662, "logits/rejected": -0.5194955468177795, "logps/chosen": -86.45942687988281, "logps/rejected": -74.0710220336914, "loss": 0.3323, "rewards/accuracies": 1.0, "rewards/chosen": 1.6906356811523438, "rewards/margins": 0.7017234563827515, "rewards/rejected": 0.9889122247695923, "step": 1283 }, { "epoch": 0.21, "learning_rate": 9.856397801669545e-07, "logits/chosen": -0.45541471242904663, "logits/rejected": -0.3958711624145508, "logps/chosen": -113.05215454101562, "logps/rejected": -224.3528594970703, "loss": 1.652, "rewards/accuracies": 0.0, "rewards/chosen": 0.9471023678779602, "rewards/margins": -2.595782518386841, "rewards/rejected": 3.5428848266601562, "step": 1284 }, { "epoch": 0.21, "learning_rate": 9.856084918741648e-07, "logits/chosen": -0.38253262639045715, "logits/rejected": -0.3916466534137726, "logps/chosen": -143.44863891601562, "logps/rejected": -43.03330612182617, "loss": 1.4379, "rewards/accuracies": 0.0, "rewards/chosen": -0.3626968562602997, "rewards/margins": -1.2667640447616577, "rewards/rejected": 0.9040672183036804, "step": 1285 }, { "epoch": 0.21, "learning_rate": 9.85577170030423e-07, "logits/chosen": -0.07019833475351334, "logits/rejected": -0.03311920538544655, "logps/chosen": -86.51581573486328, "logps/rejected": -81.04593658447266, "loss": 0.879, "rewards/accuracies": 0.0, "rewards/chosen": 0.7241409420967102, "rewards/margins": -0.06275254487991333, "rewards/rejected": 0.7868934869766235, "step": 1286 }, { "epoch": 0.21, "learning_rate": 9.855458146378934e-07, "logits/chosen": -0.25881677865982056, "logits/rejected": -0.25486844778060913, "logps/chosen": -135.473876953125, "logps/rejected": -89.73114013671875, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 2.0095155239105225, "rewards/margins": 0.3466752767562866, "rewards/rejected": 1.6628402471542358, "step": 1287 }, { "epoch": 0.21, "learning_rate": 9.855144256987423e-07, "logits/chosen": -0.45260879397392273, "logits/rejected": -0.2671670913696289, "logps/chosen": -177.3347625732422, "logps/rejected": -61.68048095703125, "loss": 1.04, "rewards/accuracies": 1.0, "rewards/chosen": 1.2288284301757812, "rewards/margins": 0.5650947690010071, "rewards/rejected": 0.6637336611747742, "step": 1288 }, { "epoch": 0.21, "learning_rate": 9.854830032151383e-07, "logits/chosen": -0.6216235756874084, "logits/rejected": -0.617688775062561, "logps/chosen": -79.10456848144531, "logps/rejected": -70.6999282836914, "loss": 0.4714, "rewards/accuracies": 0.0, "rewards/chosen": 0.661785900592804, "rewards/margins": -0.19921648502349854, "rewards/rejected": 0.8610023856163025, "step": 1289 }, { "epoch": 0.21, "learning_rate": 9.854515471892526e-07, "logits/chosen": -0.29305317997932434, "logits/rejected": -0.28616297245025635, "logps/chosen": -71.92244720458984, "logps/rejected": -85.71249389648438, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.7527297735214233, "rewards/margins": 0.6502288579940796, "rewards/rejected": 1.1025009155273438, "step": 1290 }, { "epoch": 0.21, "learning_rate": 9.854200576232582e-07, "logits/chosen": -0.40703535079956055, "logits/rejected": -0.36656180024147034, "logps/chosen": -78.24003601074219, "logps/rejected": -57.73876953125, "loss": 0.6246, "rewards/accuracies": 0.0, "rewards/chosen": 1.446790337562561, "rewards/margins": -0.5248985290527344, "rewards/rejected": 1.9716888666152954, "step": 1291 }, { "epoch": 0.21, "learning_rate": 9.853885345193312e-07, "logits/chosen": -0.34075719118118286, "logits/rejected": -0.34830328822135925, "logps/chosen": -6.689882278442383, "logps/rejected": -1.5236073732376099, "loss": 0.8757, "rewards/accuracies": 0.0, "rewards/chosen": -0.09751339256763458, "rewards/margins": -0.2893235683441162, "rewards/rejected": 0.19181017577648163, "step": 1292 }, { "epoch": 0.21, "learning_rate": 9.853569778796488e-07, "logits/chosen": -0.62345951795578, "logits/rejected": -0.5188924670219421, "logps/chosen": -128.4679718017578, "logps/rejected": -80.11509704589844, "loss": 0.7843, "rewards/accuracies": 1.0, "rewards/chosen": 1.5698318481445312, "rewards/margins": 0.5583206415176392, "rewards/rejected": 1.011511206626892, "step": 1293 }, { "epoch": 0.21, "learning_rate": 9.85325387706392e-07, "logits/chosen": -0.6835158467292786, "logits/rejected": -0.6407589316368103, "logps/chosen": -87.25785827636719, "logps/rejected": -29.350845336914062, "loss": 1.0813, "rewards/accuracies": 0.0, "rewards/chosen": 0.4563339352607727, "rewards/margins": -0.35273629426956177, "rewards/rejected": 0.8090702295303345, "step": 1294 }, { "epoch": 0.21, "learning_rate": 9.852937640017431e-07, "logits/chosen": -0.19655893743038177, "logits/rejected": -0.19655893743038177, "logps/chosen": -0.9654375910758972, "logps/rejected": -0.9654375910758972, "loss": 0.7791, "rewards/accuracies": 0.0, "rewards/chosen": 0.1774493306875229, "rewards/margins": 0.0, "rewards/rejected": 0.1774493306875229, "step": 1295 }, { "epoch": 0.21, "learning_rate": 9.85262106767887e-07, "logits/chosen": -0.3474274277687073, "logits/rejected": -0.4020792245864868, "logps/chosen": -127.4933090209961, "logps/rejected": -74.62776184082031, "loss": 0.5422, "rewards/accuracies": 1.0, "rewards/chosen": 2.9709084033966064, "rewards/margins": 1.7690986394882202, "rewards/rejected": 1.2018097639083862, "step": 1296 }, { "epoch": 0.21, "learning_rate": 9.852304160070108e-07, "logits/chosen": -0.6322745680809021, "logits/rejected": -0.6467583775520325, "logps/chosen": -108.74024200439453, "logps/rejected": -57.088226318359375, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 1.1525566577911377, "rewards/margins": -0.5076881647109985, "rewards/rejected": 1.6602448225021362, "step": 1297 }, { "epoch": 0.21, "learning_rate": 9.851986917213044e-07, "logits/chosen": -0.10550040006637573, "logits/rejected": -0.11594866216182709, "logps/chosen": -5.956380844116211, "logps/rejected": -4.821430683135986, "loss": 1.2571, "rewards/accuracies": 0.0, "rewards/chosen": 0.1476546823978424, "rewards/margins": -0.06692351400852203, "rewards/rejected": 0.21457819640636444, "step": 1298 }, { "epoch": 0.21, "learning_rate": 9.851669339129591e-07, "logits/chosen": -0.6431034207344055, "logits/rejected": -0.6272035837173462, "logps/chosen": -77.336181640625, "logps/rejected": -45.136566162109375, "loss": 0.6644, "rewards/accuracies": 0.0, "rewards/chosen": -0.12632369995117188, "rewards/margins": -0.8051986694335938, "rewards/rejected": 0.6788749694824219, "step": 1299 }, { "epoch": 0.21, "learning_rate": 9.851351425841695e-07, "logits/chosen": -0.45718827843666077, "logits/rejected": -0.4224272072315216, "logps/chosen": -75.13187408447266, "logps/rejected": -19.980045318603516, "loss": 0.6525, "rewards/accuracies": 1.0, "rewards/chosen": 1.7920852899551392, "rewards/margins": 1.510664939880371, "rewards/rejected": 0.2814203202724457, "step": 1300 }, { "epoch": 0.21, "learning_rate": 9.851033177371319e-07, "logits/chosen": -0.4555315673351288, "logits/rejected": -0.38040420413017273, "logps/chosen": -216.48684692382812, "logps/rejected": -22.74610137939453, "loss": 0.4722, "rewards/accuracies": 1.0, "rewards/chosen": 2.4178130626678467, "rewards/margins": 1.893632173538208, "rewards/rejected": 0.5241808295249939, "step": 1301 }, { "epoch": 0.21, "learning_rate": 9.850714593740453e-07, "logits/chosen": -0.28248831629753113, "logits/rejected": -0.2546045482158661, "logps/chosen": -126.01966857910156, "logps/rejected": -54.77052307128906, "loss": 0.7388, "rewards/accuracies": 1.0, "rewards/chosen": 1.6287002563476562, "rewards/margins": 0.311259388923645, "rewards/rejected": 1.3174408674240112, "step": 1302 }, { "epoch": 0.21, "learning_rate": 9.850395674971105e-07, "logits/chosen": -0.3062770664691925, "logits/rejected": -0.24623623490333557, "logps/chosen": -110.73905944824219, "logps/rejected": -82.68816375732422, "loss": 0.8304, "rewards/accuracies": 1.0, "rewards/chosen": 0.8646721243858337, "rewards/margins": 0.4281250238418579, "rewards/rejected": 0.43654710054397583, "step": 1303 }, { "epoch": 0.21, "learning_rate": 9.850076421085308e-07, "logits/chosen": -0.8355301022529602, "logits/rejected": -0.8143122792243958, "logps/chosen": -118.64167785644531, "logps/rejected": -94.5888671875, "loss": 0.9314, "rewards/accuracies": 0.0, "rewards/chosen": 0.42634353041648865, "rewards/margins": -0.9603195190429688, "rewards/rejected": 1.3866630792617798, "step": 1304 }, { "epoch": 0.21, "learning_rate": 9.849756832105128e-07, "logits/chosen": -0.05201683193445206, "logits/rejected": -0.06024598702788353, "logps/chosen": -68.00965118408203, "logps/rejected": -62.84453582763672, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 0.1664894074201584, "rewards/margins": -0.20290909707546234, "rewards/rejected": 0.3693985044956207, "step": 1305 }, { "epoch": 0.21, "learning_rate": 9.849436908052636e-07, "logits/chosen": -0.2745262682437897, "logits/rejected": -0.18962819874286652, "logps/chosen": -189.99697875976562, "logps/rejected": -115.95201873779297, "loss": 0.4535, "rewards/accuracies": 0.0, "rewards/chosen": 1.4070907831192017, "rewards/margins": -0.13634562492370605, "rewards/rejected": 1.5434364080429077, "step": 1306 }, { "epoch": 0.21, "learning_rate": 9.84911664894994e-07, "logits/chosen": -0.6824490427970886, "logits/rejected": -0.46838030219078064, "logps/chosen": -77.10163879394531, "logps/rejected": -110.73391723632812, "loss": 0.8227, "rewards/accuracies": 0.0, "rewards/chosen": 0.432821661233902, "rewards/margins": -0.3742172420024872, "rewards/rejected": 0.8070389032363892, "step": 1307 }, { "epoch": 0.21, "learning_rate": 9.848796054819165e-07, "logits/chosen": -0.36875030398368835, "logits/rejected": -0.3063512146472931, "logps/chosen": -162.49905395507812, "logps/rejected": -122.50506591796875, "loss": 0.7158, "rewards/accuracies": 1.0, "rewards/chosen": 0.6105316281318665, "rewards/margins": 0.4460304379463196, "rewards/rejected": 0.16450119018554688, "step": 1308 }, { "epoch": 0.21, "learning_rate": 9.848475125682465e-07, "logits/chosen": -0.2712548077106476, "logits/rejected": -0.2561452388763428, "logps/chosen": -186.17050170898438, "logps/rejected": -163.36831665039062, "loss": 0.5666, "rewards/accuracies": 0.0, "rewards/chosen": 2.632341146469116, "rewards/margins": -0.6994948387145996, "rewards/rejected": 3.331835985183716, "step": 1309 }, { "epoch": 0.21, "learning_rate": 9.84815386156201e-07, "logits/chosen": -0.273004949092865, "logits/rejected": -0.0977213904261589, "logps/chosen": -56.91221618652344, "logps/rejected": -85.42823791503906, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": 1.5080406665802002, "rewards/margins": 0.5284218192100525, "rewards/rejected": 0.9796188473701477, "step": 1310 }, { "epoch": 0.21, "learning_rate": 9.847832262479996e-07, "logits/chosen": -0.3941684067249298, "logits/rejected": -0.4356832206249237, "logps/chosen": -136.21981811523438, "logps/rejected": -121.14960479736328, "loss": 0.292, "rewards/accuracies": 1.0, "rewards/chosen": 1.2309478521347046, "rewards/margins": 0.6441985964775085, "rewards/rejected": 0.586749255657196, "step": 1311 }, { "epoch": 0.21, "learning_rate": 9.847510328458642e-07, "logits/chosen": -0.37009650468826294, "logits/rejected": -0.31439927220344543, "logps/chosen": -118.85527038574219, "logps/rejected": -152.49337768554688, "loss": 1.1604, "rewards/accuracies": 0.0, "rewards/chosen": 1.3028243780136108, "rewards/margins": -1.754553198814392, "rewards/rejected": 3.057377576828003, "step": 1312 }, { "epoch": 0.21, "learning_rate": 9.847188059520193e-07, "logits/chosen": -0.6574454307556152, "logits/rejected": -0.6450192928314209, "logps/chosen": -178.27337646484375, "logps/rejected": -76.99536895751953, "loss": 0.4601, "rewards/accuracies": 1.0, "rewards/chosen": 2.808825731277466, "rewards/margins": 1.4431098699569702, "rewards/rejected": 1.3657158613204956, "step": 1313 }, { "epoch": 0.21, "learning_rate": 9.846865455686914e-07, "logits/chosen": -0.8305000066757202, "logits/rejected": -0.8119020462036133, "logps/chosen": -180.97178649902344, "logps/rejected": -34.87730407714844, "loss": 0.2335, "rewards/accuracies": 1.0, "rewards/chosen": 2.5868821144104004, "rewards/margins": 2.1391007900238037, "rewards/rejected": 0.44778138399124146, "step": 1314 }, { "epoch": 0.21, "learning_rate": 9.846542516981093e-07, "logits/chosen": -0.4014389216899872, "logits/rejected": -0.39159539341926575, "logps/chosen": -93.40049743652344, "logps/rejected": -136.9912567138672, "loss": 1.0388, "rewards/accuracies": 0.0, "rewards/chosen": 0.7941253781318665, "rewards/margins": -0.4620833992958069, "rewards/rejected": 1.2562087774276733, "step": 1315 }, { "epoch": 0.21, "learning_rate": 9.846219243425045e-07, "logits/chosen": -0.8173195123672485, "logits/rejected": -0.65497225522995, "logps/chosen": -172.84414672851562, "logps/rejected": -141.78860473632812, "loss": 0.6912, "rewards/accuracies": 0.0, "rewards/chosen": 2.306715488433838, "rewards/margins": -1.026676893234253, "rewards/rejected": 3.333392381668091, "step": 1316 }, { "epoch": 0.21, "learning_rate": 9.8458956350411e-07, "logits/chosen": -0.5176631808280945, "logits/rejected": -0.4561782479286194, "logps/chosen": -112.1124496459961, "logps/rejected": -132.9437255859375, "loss": 0.8862, "rewards/accuracies": 0.0, "rewards/chosen": 0.8614738583564758, "rewards/margins": -0.5534889101982117, "rewards/rejected": 1.4149627685546875, "step": 1317 }, { "epoch": 0.21, "learning_rate": 9.84557169185162e-07, "logits/chosen": -0.3420025110244751, "logits/rejected": -0.2904970943927765, "logps/chosen": -53.964595794677734, "logps/rejected": -56.71902084350586, "loss": 0.3378, "rewards/accuracies": 1.0, "rewards/chosen": 0.7219768762588501, "rewards/margins": 0.43017733097076416, "rewards/rejected": 0.29179954528808594, "step": 1318 }, { "epoch": 0.21, "learning_rate": 9.845247413878982e-07, "logits/chosen": -0.648476243019104, "logits/rejected": -0.5540993809700012, "logps/chosen": -175.31671142578125, "logps/rejected": -53.760658264160156, "loss": 0.4757, "rewards/accuracies": 1.0, "rewards/chosen": 2.102731466293335, "rewards/margins": 1.222213864326477, "rewards/rejected": 0.8805176019668579, "step": 1319 }, { "epoch": 0.21, "learning_rate": 9.8449228011456e-07, "logits/chosen": -0.6852661371231079, "logits/rejected": -0.5203390717506409, "logps/chosen": -93.50968933105469, "logps/rejected": -88.49363708496094, "loss": 0.8484, "rewards/accuracies": 1.0, "rewards/chosen": 1.2060562372207642, "rewards/margins": 0.3827980160713196, "rewards/rejected": 0.8232582211494446, "step": 1320 }, { "epoch": 0.21, "learning_rate": 9.84459785367389e-07, "logits/chosen": -0.32402822375297546, "logits/rejected": -0.3056194484233856, "logps/chosen": -77.30584716796875, "logps/rejected": -84.71485137939453, "loss": 0.7322, "rewards/accuracies": 1.0, "rewards/chosen": 0.46162644028663635, "rewards/margins": 0.23174361884593964, "rewards/rejected": 0.22988282144069672, "step": 1321 }, { "epoch": 0.21, "learning_rate": 9.844272571486311e-07, "logits/chosen": -0.6542015075683594, "logits/rejected": -0.5578145980834961, "logps/chosen": -203.26931762695312, "logps/rejected": -59.60882568359375, "loss": 0.6548, "rewards/accuracies": 1.0, "rewards/chosen": 1.794470191001892, "rewards/margins": 0.7911415100097656, "rewards/rejected": 1.0033286809921265, "step": 1322 }, { "epoch": 0.21, "learning_rate": 9.843946954605333e-07, "logits/chosen": -0.795970618724823, "logits/rejected": -0.7863433957099915, "logps/chosen": -90.59911346435547, "logps/rejected": -85.33492279052734, "loss": 1.2194, "rewards/accuracies": 0.0, "rewards/chosen": 0.6140945553779602, "rewards/margins": -0.8347283005714417, "rewards/rejected": 1.4488228559494019, "step": 1323 }, { "epoch": 0.21, "learning_rate": 9.843621003053454e-07, "logits/chosen": -0.44153788685798645, "logits/rejected": -0.4233315587043762, "logps/chosen": -98.70591735839844, "logps/rejected": -74.69821166992188, "loss": 0.4437, "rewards/accuracies": 1.0, "rewards/chosen": 0.5064300894737244, "rewards/margins": 0.1977432668209076, "rewards/rejected": 0.3086868226528168, "step": 1324 }, { "epoch": 0.22, "learning_rate": 9.843294716853197e-07, "logits/chosen": -0.9672062397003174, "logits/rejected": -0.9715983271598816, "logps/chosen": -65.78947448730469, "logps/rejected": -14.799239158630371, "loss": 0.1706, "rewards/accuracies": 1.0, "rewards/chosen": 1.3447303771972656, "rewards/margins": 1.1971142292022705, "rewards/rejected": 0.14761610329151154, "step": 1325 }, { "epoch": 0.22, "learning_rate": 9.8429680960271e-07, "logits/chosen": -0.35051867365837097, "logits/rejected": -0.3915408253669739, "logps/chosen": -92.17752075195312, "logps/rejected": -104.82492065429688, "loss": 1.6228, "rewards/accuracies": 0.0, "rewards/chosen": 0.9749679565429688, "rewards/margins": -1.3671448230743408, "rewards/rejected": 2.3421127796173096, "step": 1326 }, { "epoch": 0.22, "learning_rate": 9.842641140597733e-07, "logits/chosen": -0.3700173497200012, "logits/rejected": -0.3425348699092865, "logps/chosen": -75.69185638427734, "logps/rejected": -61.76576232910156, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": 0.059668730944395065, "rewards/margins": 0.18414458632469177, "rewards/rejected": -0.124475859105587, "step": 1327 }, { "epoch": 0.22, "learning_rate": 9.842313850587687e-07, "logits/chosen": -0.6602933406829834, "logits/rejected": -0.6357975602149963, "logps/chosen": -115.44972229003906, "logps/rejected": -18.4945068359375, "loss": 0.8344, "rewards/accuracies": 1.0, "rewards/chosen": 0.6575607657432556, "rewards/margins": 0.5316148996353149, "rewards/rejected": 0.12594585120677948, "step": 1328 }, { "epoch": 0.22, "learning_rate": 9.84198622601957e-07, "logits/chosen": -0.7124059796333313, "logits/rejected": -0.6720041632652283, "logps/chosen": -91.67815399169922, "logps/rejected": -94.5013427734375, "loss": 0.5261, "rewards/accuracies": 0.0, "rewards/chosen": 1.1741493940353394, "rewards/margins": -0.2906409502029419, "rewards/rejected": 1.4647903442382812, "step": 1329 }, { "epoch": 0.22, "learning_rate": 9.84165826691602e-07, "logits/chosen": -0.4299750328063965, "logits/rejected": -0.21811619400978088, "logps/chosen": -142.31808471679688, "logps/rejected": -22.63555335998535, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": 3.395611524581909, "rewards/margins": 3.1239209175109863, "rewards/rejected": 0.27169057726860046, "step": 1330 }, { "epoch": 0.22, "learning_rate": 9.841329973299697e-07, "logits/chosen": -0.4032009243965149, "logits/rejected": -0.326673686504364, "logps/chosen": -76.81861877441406, "logps/rejected": -18.608104705810547, "loss": 0.736, "rewards/accuracies": 0.0, "rewards/chosen": 0.407662957906723, "rewards/margins": -0.16450348496437073, "rewards/rejected": 0.5721664428710938, "step": 1331 }, { "epoch": 0.22, "learning_rate": 9.84100134519328e-07, "logits/chosen": -0.4917982816696167, "logits/rejected": -0.5037323832511902, "logps/chosen": -127.33686828613281, "logps/rejected": -89.54200744628906, "loss": 0.5082, "rewards/accuracies": 0.0, "rewards/chosen": 0.19371338188648224, "rewards/margins": -0.2305252104997635, "rewards/rejected": 0.4242385923862457, "step": 1332 }, { "epoch": 0.22, "learning_rate": 9.840672382619477e-07, "logits/chosen": -0.5050503611564636, "logits/rejected": -0.5062116980552673, "logps/chosen": -97.11372375488281, "logps/rejected": -53.85462188720703, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 1.9519462585449219, "rewards/margins": 1.0145610570907593, "rewards/rejected": 0.9373852014541626, "step": 1333 }, { "epoch": 0.22, "learning_rate": 9.840343085601016e-07, "logits/chosen": -0.5239220857620239, "logits/rejected": -0.4290807843208313, "logps/chosen": -173.54754638671875, "logps/rejected": -23.116596221923828, "loss": 0.2013, "rewards/accuracies": 1.0, "rewards/chosen": 2.3465301990509033, "rewards/margins": 1.9614008665084839, "rewards/rejected": 0.3851293623447418, "step": 1334 }, { "epoch": 0.22, "learning_rate": 9.840013454160646e-07, "logits/chosen": -0.4286634922027588, "logits/rejected": -0.4600289463996887, "logps/chosen": -98.2183837890625, "logps/rejected": -129.23875427246094, "loss": 1.0238, "rewards/accuracies": 0.0, "rewards/chosen": 0.7009963989257812, "rewards/margins": -1.4688186645507812, "rewards/rejected": 2.1698150634765625, "step": 1335 }, { "epoch": 0.22, "learning_rate": 9.839683488321143e-07, "logits/chosen": -0.4616558849811554, "logits/rejected": -0.34021174907684326, "logps/chosen": -138.510986328125, "logps/rejected": -73.8784408569336, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.1638443022966385, "rewards/margins": -0.5576454401016235, "rewards/rejected": 0.7214897274971008, "step": 1336 }, { "epoch": 0.22, "learning_rate": 9.83935318810531e-07, "logits/chosen": -0.17981067299842834, "logits/rejected": -0.1344897449016571, "logps/chosen": -58.969207763671875, "logps/rejected": -7.481453895568848, "loss": 0.333, "rewards/accuracies": 1.0, "rewards/chosen": 1.0601905584335327, "rewards/margins": 0.4854122996330261, "rewards/rejected": 0.5747782588005066, "step": 1337 }, { "epoch": 0.22, "learning_rate": 9.839022553535954e-07, "logits/chosen": -0.36472901701927185, "logits/rejected": -0.37079817056655884, "logps/chosen": -104.5933837890625, "logps/rejected": -44.84178161621094, "loss": 0.8861, "rewards/accuracies": 0.0, "rewards/chosen": 0.12237091362476349, "rewards/margins": -0.6155197620391846, "rewards/rejected": 0.7378906607627869, "step": 1338 }, { "epoch": 0.22, "learning_rate": 9.838691584635931e-07, "logits/chosen": -0.3254995048046112, "logits/rejected": -0.32519909739494324, "logps/chosen": -56.44773864746094, "logps/rejected": -78.71129608154297, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": 1.3444069623947144, "rewards/margins": 0.2561173439025879, "rewards/rejected": 1.0882896184921265, "step": 1339 }, { "epoch": 0.22, "learning_rate": 9.838360281428104e-07, "logits/chosen": -0.5931112170219421, "logits/rejected": -0.588306725025177, "logps/chosen": -107.35896301269531, "logps/rejected": -130.169921875, "loss": 1.0192, "rewards/accuracies": 1.0, "rewards/chosen": 0.5579666495323181, "rewards/margins": 0.029386937618255615, "rewards/rejected": 0.5285797119140625, "step": 1340 }, { "epoch": 0.22, "learning_rate": 9.838028643935362e-07, "logits/chosen": -0.6423364877700806, "logits/rejected": -0.6229010224342346, "logps/chosen": -89.94169616699219, "logps/rejected": -71.81006622314453, "loss": 0.7073, "rewards/accuracies": 1.0, "rewards/chosen": 0.670666515827179, "rewards/margins": 0.19363632798194885, "rewards/rejected": 0.4770301878452301, "step": 1341 }, { "epoch": 0.22, "learning_rate": 9.837696672180618e-07, "logits/chosen": -0.3365236818790436, "logits/rejected": -0.25268441438674927, "logps/chosen": -205.29832458496094, "logps/rejected": -40.390342712402344, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 3.228593587875366, "rewards/margins": 2.8171274662017822, "rewards/rejected": 0.41146621108055115, "step": 1342 }, { "epoch": 0.22, "learning_rate": 9.837364366186807e-07, "logits/chosen": -0.806862473487854, "logits/rejected": -0.2982918620109558, "logps/chosen": -7.7129292488098145, "logps/rejected": -213.42767333984375, "loss": 0.8254, "rewards/accuracies": 0.0, "rewards/chosen": 0.13388848304748535, "rewards/margins": -0.9400769472122192, "rewards/rejected": 1.0739654302597046, "step": 1343 }, { "epoch": 0.22, "learning_rate": 9.837031725976892e-07, "logits/chosen": -0.28143414855003357, "logits/rejected": -0.25517743825912476, "logps/chosen": -49.77579879760742, "logps/rejected": -21.28982925415039, "loss": 0.805, "rewards/accuracies": 0.0, "rewards/chosen": 0.21474723517894745, "rewards/margins": -0.09177990257740021, "rewards/rejected": 0.30652713775634766, "step": 1344 }, { "epoch": 0.22, "learning_rate": 9.836698751573854e-07, "logits/chosen": -0.6094515323638916, "logits/rejected": -0.44840767979621887, "logps/chosen": -170.33428955078125, "logps/rejected": -69.50616455078125, "loss": 0.8279, "rewards/accuracies": 0.0, "rewards/chosen": 0.468109130859375, "rewards/margins": -0.42133182287216187, "rewards/rejected": 0.8894409537315369, "step": 1345 }, { "epoch": 0.22, "learning_rate": 9.836365443000697e-07, "logits/chosen": -0.01922186277806759, "logits/rejected": -0.01922186277806759, "logps/chosen": -45.299781799316406, "logps/rejected": -45.299781799316406, "loss": 0.6624, "rewards/accuracies": 0.0, "rewards/chosen": 0.6362465023994446, "rewards/margins": 0.0, "rewards/rejected": 0.6362465023994446, "step": 1346 }, { "epoch": 0.22, "learning_rate": 9.83603180028045e-07, "logits/chosen": -0.8073794841766357, "logits/rejected": -0.693332314491272, "logps/chosen": -164.07940673828125, "logps/rejected": -185.06619262695312, "loss": 0.1719, "rewards/accuracies": 1.0, "rewards/chosen": 4.744392395019531, "rewards/margins": 1.9648878574371338, "rewards/rejected": 2.7795045375823975, "step": 1347 }, { "epoch": 0.22, "learning_rate": 9.835697823436161e-07, "logits/chosen": -0.6483800411224365, "logits/rejected": -0.682334840297699, "logps/chosen": -74.06906127929688, "logps/rejected": -111.71397399902344, "loss": 0.859, "rewards/accuracies": 0.0, "rewards/chosen": 0.3676818907260895, "rewards/margins": -0.03336945176124573, "rewards/rejected": 0.4010513424873352, "step": 1348 }, { "epoch": 0.22, "learning_rate": 9.835363512490912e-07, "logits/chosen": -0.2160835862159729, "logits/rejected": -0.07472400367259979, "logps/chosen": -131.63729858398438, "logps/rejected": -66.0794677734375, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": 2.7729034423828125, "rewards/margins": 1.7901489734649658, "rewards/rejected": 0.9827545285224915, "step": 1349 }, { "epoch": 0.22, "learning_rate": 9.835028867467797e-07, "logits/chosen": -0.027316460385918617, "logits/rejected": -0.03046341799199581, "logps/chosen": -2.542248249053955, "logps/rejected": -3.293822765350342, "loss": 0.4522, "rewards/accuracies": 0.0, "rewards/chosen": 0.22814670205116272, "rewards/margins": -0.03141993284225464, "rewards/rejected": 0.25956663489341736, "step": 1350 }, { "epoch": 0.22, "learning_rate": 9.834693888389934e-07, "logits/chosen": -0.4427025318145752, "logits/rejected": -0.44515907764434814, "logps/chosen": -71.86045837402344, "logps/rejected": -142.85446166992188, "loss": 0.7191, "rewards/accuracies": 0.0, "rewards/chosen": 1.55803382396698, "rewards/margins": -0.33922576904296875, "rewards/rejected": 1.8972595930099487, "step": 1351 }, { "epoch": 0.22, "learning_rate": 9.834358575280473e-07, "logits/chosen": -0.4976502060890198, "logits/rejected": -0.4858955442905426, "logps/chosen": -87.41216278076172, "logps/rejected": -55.792701721191406, "loss": 0.6117, "rewards/accuracies": 1.0, "rewards/chosen": 0.5513916015625, "rewards/margins": 0.02663499116897583, "rewards/rejected": 0.5247566103935242, "step": 1352 }, { "epoch": 0.22, "learning_rate": 9.834022928162575e-07, "logits/chosen": -0.6507996320724487, "logits/rejected": -0.577467679977417, "logps/chosen": -92.41744232177734, "logps/rejected": -167.82595825195312, "loss": 2.2339, "rewards/accuracies": 0.0, "rewards/chosen": 1.0734604597091675, "rewards/margins": -2.8903884887695312, "rewards/rejected": 3.963848829269409, "step": 1353 }, { "epoch": 0.22, "learning_rate": 9.833686947059434e-07, "logits/chosen": -0.5141868591308594, "logits/rejected": -0.4882294833660126, "logps/chosen": -78.75053405761719, "logps/rejected": -61.23960494995117, "loss": 0.5329, "rewards/accuracies": 1.0, "rewards/chosen": 1.3819115161895752, "rewards/margins": 0.33042263984680176, "rewards/rejected": 1.0514888763427734, "step": 1354 }, { "epoch": 0.22, "learning_rate": 9.83335063199426e-07, "logits/chosen": -0.7000033855438232, "logits/rejected": -0.49708834290504456, "logps/chosen": -58.2700309753418, "logps/rejected": -95.55387878417969, "loss": 0.5288, "rewards/accuracies": 0.0, "rewards/chosen": 1.6030505895614624, "rewards/margins": -0.44511234760284424, "rewards/rejected": 2.0481629371643066, "step": 1355 }, { "epoch": 0.22, "learning_rate": 9.833013982990293e-07, "logits/chosen": -0.45284348726272583, "logits/rejected": -0.3650353252887726, "logps/chosen": -91.00711059570312, "logps/rejected": -108.66496276855469, "loss": 0.9551, "rewards/accuracies": 0.0, "rewards/chosen": 1.9853874444961548, "rewards/margins": -0.8659569025039673, "rewards/rejected": 2.851344347000122, "step": 1356 }, { "epoch": 0.22, "learning_rate": 9.83267700007079e-07, "logits/chosen": -0.6498560905456543, "logits/rejected": -0.6165614128112793, "logps/chosen": -63.121063232421875, "logps/rejected": -56.938167572021484, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 1.19561767578125, "rewards/margins": 0.1273517608642578, "rewards/rejected": 1.0682659149169922, "step": 1357 }, { "epoch": 0.22, "learning_rate": 9.832339683259033e-07, "logits/chosen": -0.24721334874629974, "logits/rejected": -0.22540754079818726, "logps/chosen": -98.84606170654297, "logps/rejected": -28.310218811035156, "loss": 0.513, "rewards/accuracies": 0.0, "rewards/chosen": 0.9867507815361023, "rewards/margins": -0.2904091477394104, "rewards/rejected": 1.2771599292755127, "step": 1358 }, { "epoch": 0.22, "learning_rate": 9.832002032578328e-07, "logits/chosen": -0.8121322989463806, "logits/rejected": -0.7666911482810974, "logps/chosen": -104.34213256835938, "logps/rejected": -55.17807388305664, "loss": 1.0359, "rewards/accuracies": 0.0, "rewards/chosen": 0.107574462890625, "rewards/margins": -0.3903999328613281, "rewards/rejected": 0.4979743957519531, "step": 1359 }, { "epoch": 0.22, "learning_rate": 9.831664048052002e-07, "logits/chosen": -0.3136973977088928, "logits/rejected": -0.3599819839000702, "logps/chosen": -116.41938781738281, "logps/rejected": -115.68038177490234, "loss": 1.4913, "rewards/accuracies": 0.0, "rewards/chosen": 0.01989593543112278, "rewards/margins": -1.1445000171661377, "rewards/rejected": 1.1643959283828735, "step": 1360 }, { "epoch": 0.22, "learning_rate": 9.83132572970341e-07, "logits/chosen": -1.2706717252731323, "logits/rejected": -1.3530064821243286, "logps/chosen": -180.96908569335938, "logps/rejected": -198.55970764160156, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 2.768850803375244, "rewards/margins": -0.39293360710144043, "rewards/rejected": 3.1617844104766846, "step": 1361 }, { "epoch": 0.22, "learning_rate": 9.830987077555924e-07, "logits/chosen": -0.39650171995162964, "logits/rejected": -0.4106297194957733, "logps/chosen": -59.81099319458008, "logps/rejected": -160.8832550048828, "loss": 1.3586, "rewards/accuracies": 0.0, "rewards/chosen": 1.792367935180664, "rewards/margins": -0.9617640972137451, "rewards/rejected": 2.754132032394409, "step": 1362 }, { "epoch": 0.22, "learning_rate": 9.83064809163294e-07, "logits/chosen": -0.45861151814460754, "logits/rejected": -0.4232015609741211, "logps/chosen": -156.95211791992188, "logps/rejected": -132.54580688476562, "loss": 0.3038, "rewards/accuracies": 1.0, "rewards/chosen": 3.4209015369415283, "rewards/margins": 0.8921904563903809, "rewards/rejected": 2.5287110805511475, "step": 1363 }, { "epoch": 0.22, "learning_rate": 9.830308771957883e-07, "logits/chosen": -0.19891110062599182, "logits/rejected": -0.21636420488357544, "logps/chosen": -3.600884199142456, "logps/rejected": -3.8660051822662354, "loss": 0.6464, "rewards/accuracies": 0.0, "rewards/chosen": 0.09727232903242111, "rewards/margins": -0.030587054789066315, "rewards/rejected": 0.12785938382148743, "step": 1364 }, { "epoch": 0.22, "learning_rate": 9.829969118554195e-07, "logits/chosen": -0.6332648992538452, "logits/rejected": -0.5643330812454224, "logps/chosen": -89.80810546875, "logps/rejected": -98.1880111694336, "loss": 1.1499, "rewards/accuracies": 0.0, "rewards/chosen": 0.7119979858398438, "rewards/margins": -1.3827934265136719, "rewards/rejected": 2.0947914123535156, "step": 1365 }, { "epoch": 0.22, "learning_rate": 9.82962913144534e-07, "logits/chosen": -0.3479940891265869, "logits/rejected": -0.3563497066497803, "logps/chosen": -3.228423833847046, "logps/rejected": -4.850451946258545, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 0.2195831537246704, "rewards/margins": -0.00730326771736145, "rewards/rejected": 0.22688642144203186, "step": 1366 }, { "epoch": 0.22, "learning_rate": 9.829288810654814e-07, "logits/chosen": -0.4076909124851227, "logits/rejected": -0.4142122268676758, "logps/chosen": -117.39945983886719, "logps/rejected": -126.24991607666016, "loss": 1.2703, "rewards/accuracies": 0.0, "rewards/chosen": 1.1369231939315796, "rewards/margins": -1.5530143976211548, "rewards/rejected": 2.6899375915527344, "step": 1367 }, { "epoch": 0.22, "learning_rate": 9.828948156206124e-07, "logits/chosen": -0.6638469099998474, "logits/rejected": -0.6693843603134155, "logps/chosen": -93.07975769042969, "logps/rejected": -63.54533767700195, "loss": 0.5355, "rewards/accuracies": 0.0, "rewards/chosen": 0.3568115234375, "rewards/margins": -0.4667797088623047, "rewards/rejected": 0.8235912322998047, "step": 1368 }, { "epoch": 0.22, "learning_rate": 9.828607168122808e-07, "logits/chosen": -1.0056339502334595, "logits/rejected": -0.9211031794548035, "logps/chosen": -102.95284271240234, "logps/rejected": -58.80530548095703, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": 2.3098182678222656, "rewards/margins": 0.9698845148086548, "rewards/rejected": 1.3399337530136108, "step": 1369 }, { "epoch": 0.22, "learning_rate": 9.828265846428428e-07, "logits/chosen": -0.8519511222839355, "logits/rejected": -0.7182387113571167, "logps/chosen": -120.16197967529297, "logps/rejected": -75.01127624511719, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": 2.2943336963653564, "rewards/margins": 0.5354652404785156, "rewards/rejected": 1.7588684558868408, "step": 1370 }, { "epoch": 0.22, "learning_rate": 9.827924191146561e-07, "logits/chosen": -0.3895043432712555, "logits/rejected": -0.34391334652900696, "logps/chosen": -79.1278076171875, "logps/rejected": -140.4169921875, "loss": 1.6865, "rewards/accuracies": 0.0, "rewards/chosen": 0.6022842526435852, "rewards/margins": -2.4532241821289062, "rewards/rejected": 3.0555083751678467, "step": 1371 }, { "epoch": 0.22, "learning_rate": 9.827582202300815e-07, "logits/chosen": -0.8222745060920715, "logits/rejected": -0.7590322494506836, "logps/chosen": -163.7850799560547, "logps/rejected": -79.74127197265625, "loss": 0.6337, "rewards/accuracies": 0.0, "rewards/chosen": 0.6350769400596619, "rewards/margins": -0.4926223158836365, "rewards/rejected": 1.1276992559432983, "step": 1372 }, { "epoch": 0.22, "learning_rate": 9.827239879914817e-07, "logits/chosen": -0.43860676884651184, "logits/rejected": -0.43860676884651184, "logps/chosen": -79.72373962402344, "logps/rejected": -79.72373962402344, "loss": 0.6555, "rewards/accuracies": 0.0, "rewards/chosen": 1.107934594154358, "rewards/margins": 0.0, "rewards/rejected": 1.107934594154358, "step": 1373 }, { "epoch": 0.22, "learning_rate": 9.82689722401222e-07, "logits/chosen": -0.017835678532719612, "logits/rejected": -0.007726313546299934, "logps/chosen": -4.6884589195251465, "logps/rejected": -31.25092315673828, "loss": 0.5131, "rewards/accuracies": 1.0, "rewards/chosen": 0.20597945153713226, "rewards/margins": 0.14587222039699554, "rewards/rejected": 0.06010723114013672, "step": 1374 }, { "epoch": 0.22, "learning_rate": 9.8265542346167e-07, "logits/chosen": -0.5098567008972168, "logits/rejected": -0.5505213141441345, "logps/chosen": -96.85954284667969, "logps/rejected": -127.58499908447266, "loss": 1.1332, "rewards/accuracies": 0.0, "rewards/chosen": 0.21611785888671875, "rewards/margins": -0.7764900326728821, "rewards/rejected": 0.9926078915596008, "step": 1375 }, { "epoch": 0.22, "learning_rate": 9.826210911751949e-07, "logits/chosen": -0.711003839969635, "logits/rejected": -0.6992810368537903, "logps/chosen": -66.55110931396484, "logps/rejected": -142.20693969726562, "loss": 2.4709, "rewards/accuracies": 0.0, "rewards/chosen": 1.4366806745529175, "rewards/margins": -2.6368026733398438, "rewards/rejected": 4.073483467102051, "step": 1376 }, { "epoch": 0.22, "learning_rate": 9.825867255441688e-07, "logits/chosen": -0.8704630136489868, "logits/rejected": -0.799465537071228, "logps/chosen": -71.05170440673828, "logps/rejected": -126.44818878173828, "loss": 1.0826, "rewards/accuracies": 0.0, "rewards/chosen": 1.6734733581542969, "rewards/margins": -1.5698883533477783, "rewards/rejected": 3.243361711502075, "step": 1377 }, { "epoch": 0.22, "learning_rate": 9.825523265709665e-07, "logits/chosen": -0.4386165142059326, "logits/rejected": -0.42633184790611267, "logps/chosen": -121.50675964355469, "logps/rejected": -149.17465209960938, "loss": 0.9325, "rewards/accuracies": 0.0, "rewards/chosen": 1.9471969604492188, "rewards/margins": -1.0005478858947754, "rewards/rejected": 2.947744846343994, "step": 1378 }, { "epoch": 0.22, "learning_rate": 9.825178942579645e-07, "logits/chosen": -0.7335240244865417, "logits/rejected": -0.6319056749343872, "logps/chosen": -163.58737182617188, "logps/rejected": -48.919673919677734, "loss": 0.2114, "rewards/accuracies": 1.0, "rewards/chosen": 2.904209852218628, "rewards/margins": 1.6107691526412964, "rewards/rejected": 1.2934406995773315, "step": 1379 }, { "epoch": 0.22, "learning_rate": 9.824834286075413e-07, "logits/chosen": -0.5149381756782532, "logits/rejected": -0.4646507203578949, "logps/chosen": -163.87001037597656, "logps/rejected": -162.03445434570312, "loss": 0.5723, "rewards/accuracies": 0.0, "rewards/chosen": 1.7953567504882812, "rewards/margins": -0.4981400966644287, "rewards/rejected": 2.29349684715271, "step": 1380 }, { "epoch": 0.22, "learning_rate": 9.824489296220788e-07, "logits/chosen": -0.47735461592674255, "logits/rejected": -0.4762008488178253, "logps/chosen": -3.6217751502990723, "logps/rejected": -3.4178009033203125, "loss": 0.8707, "rewards/accuracies": 0.0, "rewards/chosen": 0.4023279845714569, "rewards/margins": -0.05865311622619629, "rewards/rejected": 0.4609811007976532, "step": 1381 }, { "epoch": 0.22, "learning_rate": 9.824143973039602e-07, "logits/chosen": -0.5892009735107422, "logits/rejected": -0.5053977370262146, "logps/chosen": -114.5240478515625, "logps/rejected": -30.050996780395508, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 0.1824180632829666, "rewards/margins": 0.14709797501564026, "rewards/rejected": 0.03532009199261665, "step": 1382 }, { "epoch": 0.22, "learning_rate": 9.823798316555712e-07, "logits/chosen": -0.2999570667743683, "logits/rejected": -0.2914380431175232, "logps/chosen": -143.6942138671875, "logps/rejected": -82.43350219726562, "loss": 0.981, "rewards/accuracies": 0.0, "rewards/chosen": 0.09352722018957138, "rewards/margins": -0.9890503883361816, "rewards/rejected": 1.0825775861740112, "step": 1383 }, { "epoch": 0.22, "learning_rate": 9.823452326793002e-07, "logits/chosen": -0.7240189909934998, "logits/rejected": -0.6458162069320679, "logps/chosen": -106.61691284179688, "logps/rejected": -86.27361297607422, "loss": 1.4036, "rewards/accuracies": 0.0, "rewards/chosen": 0.9010559320449829, "rewards/margins": -1.119458794593811, "rewards/rejected": 2.020514726638794, "step": 1384 }, { "epoch": 0.22, "learning_rate": 9.823106003775378e-07, "logits/chosen": -0.5347172021865845, "logits/rejected": -0.47331422567367554, "logps/chosen": -104.59697723388672, "logps/rejected": -53.96590042114258, "loss": 1.1552, "rewards/accuracies": 0.0, "rewards/chosen": 0.1959220916032791, "rewards/margins": -0.7191272974014282, "rewards/rejected": 0.9150493741035461, "step": 1385 }, { "epoch": 0.22, "learning_rate": 9.822759347526765e-07, "logits/chosen": -0.3467366695404053, "logits/rejected": -0.36061587929725647, "logps/chosen": -95.92015075683594, "logps/rejected": -170.83584594726562, "loss": 0.9977, "rewards/accuracies": 0.0, "rewards/chosen": 1.4134567975997925, "rewards/margins": -1.7024184465408325, "rewards/rejected": 3.115875244140625, "step": 1386 }, { "epoch": 0.23, "learning_rate": 9.822412358071113e-07, "logits/chosen": -0.33478137850761414, "logits/rejected": -0.41133058071136475, "logps/chosen": -189.7172088623047, "logps/rejected": -122.1552734375, "loss": 0.9483, "rewards/accuracies": 0.0, "rewards/chosen": 1.1670058965682983, "rewards/margins": -0.9634231328964233, "rewards/rejected": 2.1304290294647217, "step": 1387 }, { "epoch": 0.23, "learning_rate": 9.822065035432398e-07, "logits/chosen": -0.6807522773742676, "logits/rejected": -0.6597549915313721, "logps/chosen": -40.57111358642578, "logps/rejected": -45.53611373901367, "loss": 0.5247, "rewards/accuracies": 1.0, "rewards/chosen": 1.3293808698654175, "rewards/margins": 0.5636215806007385, "rewards/rejected": 0.765759289264679, "step": 1388 }, { "epoch": 0.23, "learning_rate": 9.821717379634616e-07, "logits/chosen": -0.6055023670196533, "logits/rejected": -0.4669385850429535, "logps/chosen": -169.763916015625, "logps/rejected": -78.83952331542969, "loss": 0.4504, "rewards/accuracies": 1.0, "rewards/chosen": 3.2962403297424316, "rewards/margins": 1.8056312799453735, "rewards/rejected": 1.490609049797058, "step": 1389 }, { "epoch": 0.23, "learning_rate": 9.821369390701787e-07, "logits/chosen": -0.6219010949134827, "logits/rejected": -0.5279162526130676, "logps/chosen": -84.49522399902344, "logps/rejected": -169.21737670898438, "loss": 1.3089, "rewards/accuracies": 0.0, "rewards/chosen": 1.4625694751739502, "rewards/margins": -2.146713972091675, "rewards/rejected": 3.609283447265625, "step": 1390 }, { "epoch": 0.23, "learning_rate": 9.821021068657953e-07, "logits/chosen": -0.262783020734787, "logits/rejected": -0.22057828307151794, "logps/chosen": -45.52992248535156, "logps/rejected": -107.1380615234375, "loss": 0.9898, "rewards/accuracies": 0.0, "rewards/chosen": 1.6548118591308594, "rewards/margins": -0.6900413036346436, "rewards/rejected": 2.344853162765503, "step": 1391 }, { "epoch": 0.23, "learning_rate": 9.82067241352718e-07, "logits/chosen": -0.22851374745368958, "logits/rejected": -0.16853941977024078, "logps/chosen": -56.552345275878906, "logps/rejected": -13.883028984069824, "loss": 0.1588, "rewards/accuracies": 1.0, "rewards/chosen": 1.6531181335449219, "rewards/margins": 1.4764955043792725, "rewards/rejected": 0.176622673869133, "step": 1392 }, { "epoch": 0.23, "learning_rate": 9.820323425333556e-07, "logits/chosen": -0.6200956702232361, "logits/rejected": -0.635516345500946, "logps/chosen": -43.75325393676758, "logps/rejected": -32.623878479003906, "loss": 1.1242, "rewards/accuracies": 1.0, "rewards/chosen": 1.2028850317001343, "rewards/margins": 0.11657631397247314, "rewards/rejected": 1.0863087177276611, "step": 1393 }, { "epoch": 0.23, "learning_rate": 9.819974104101196e-07, "logits/chosen": -0.5516936182975769, "logits/rejected": -0.5516936182975769, "logps/chosen": -6.591100215911865, "logps/rejected": -6.591100215911865, "loss": 0.5995, "rewards/accuracies": 0.0, "rewards/chosen": 0.15361113846302032, "rewards/margins": 0.0, "rewards/rejected": 0.15361113846302032, "step": 1394 }, { "epoch": 0.23, "learning_rate": 9.819624449854231e-07, "logits/chosen": -0.4418380558490753, "logits/rejected": -0.4418380558490753, "logps/chosen": -22.415016174316406, "logps/rejected": -22.415016174316406, "loss": 0.6128, "rewards/accuracies": 0.0, "rewards/chosen": 1.1320785284042358, "rewards/margins": 0.0, "rewards/rejected": 1.1320785284042358, "step": 1395 }, { "epoch": 0.23, "learning_rate": 9.81927446261682e-07, "logits/chosen": -0.3586269021034241, "logits/rejected": -0.3229885697364807, "logps/chosen": -57.422271728515625, "logps/rejected": -74.5110855102539, "loss": 0.301, "rewards/accuracies": 1.0, "rewards/chosen": 1.2818984985351562, "rewards/margins": 0.7811301946640015, "rewards/rejected": 0.5007683038711548, "step": 1396 }, { "epoch": 0.23, "learning_rate": 9.818924142413143e-07, "logits/chosen": -0.6568360924720764, "logits/rejected": -0.6568360924720764, "logps/chosen": -110.06710815429688, "logps/rejected": -110.06710815429688, "loss": 0.4533, "rewards/accuracies": 0.0, "rewards/chosen": 0.3540504574775696, "rewards/margins": 0.0, "rewards/rejected": 0.3540504574775696, "step": 1397 }, { "epoch": 0.23, "learning_rate": 9.818573489267406e-07, "logits/chosen": -0.7821238040924072, "logits/rejected": -0.653771162033081, "logps/chosen": -194.27377319335938, "logps/rejected": -72.71194458007812, "loss": 0.7381, "rewards/accuracies": 1.0, "rewards/chosen": 1.4810622930526733, "rewards/margins": 0.22031402587890625, "rewards/rejected": 1.260748267173767, "step": 1398 }, { "epoch": 0.23, "learning_rate": 9.818222503203835e-07, "logits/chosen": -0.2360311895608902, "logits/rejected": -0.19948557019233704, "logps/chosen": -109.42897033691406, "logps/rejected": -50.985355377197266, "loss": 1.2647, "rewards/accuracies": 0.0, "rewards/chosen": 0.6430206298828125, "rewards/margins": -0.584612250328064, "rewards/rejected": 1.2276328802108765, "step": 1399 }, { "epoch": 0.23, "learning_rate": 9.81787118424668e-07, "logits/chosen": -0.957625687122345, "logits/rejected": -0.961887538433075, "logps/chosen": -179.19189453125, "logps/rejected": -105.59867858886719, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 1.805395483970642, "rewards/margins": 0.7663352489471436, "rewards/rejected": 1.0390602350234985, "step": 1400 }, { "epoch": 0.23, "learning_rate": 9.817519532420212e-07, "logits/chosen": -0.6105149388313293, "logits/rejected": -0.5785433650016785, "logps/chosen": -70.60468292236328, "logps/rejected": -113.2984619140625, "loss": 0.7222, "rewards/accuracies": 1.0, "rewards/chosen": 0.7422569394111633, "rewards/margins": 0.3125297725200653, "rewards/rejected": 0.429727166891098, "step": 1401 }, { "epoch": 0.23, "learning_rate": 9.817167547748728e-07, "logits/chosen": -0.7758787274360657, "logits/rejected": -0.7108023166656494, "logps/chosen": -113.49232482910156, "logps/rejected": -120.90754699707031, "loss": 0.8584, "rewards/accuracies": 0.0, "rewards/chosen": -0.0011108398903161287, "rewards/margins": -1.0869919061660767, "rewards/rejected": 1.0858811140060425, "step": 1402 }, { "epoch": 0.23, "learning_rate": 9.816815230256548e-07, "logits/chosen": -0.9530332684516907, "logits/rejected": -0.9331129193305969, "logps/chosen": -100.39851379394531, "logps/rejected": -45.91630554199219, "loss": 0.973, "rewards/accuracies": 1.0, "rewards/chosen": 0.7231033444404602, "rewards/margins": 0.5386508703231812, "rewards/rejected": 0.18445244431495667, "step": 1403 }, { "epoch": 0.23, "learning_rate": 9.816462579968013e-07, "logits/chosen": -0.4412338137626648, "logits/rejected": -0.3910749554634094, "logps/chosen": -47.2672004699707, "logps/rejected": -18.509672164916992, "loss": 0.6366, "rewards/accuracies": 1.0, "rewards/chosen": 0.6185207366943359, "rewards/margins": 0.24078959226608276, "rewards/rejected": 0.3777311444282532, "step": 1404 }, { "epoch": 0.23, "learning_rate": 9.816109596907485e-07, "logits/chosen": -0.5180099010467529, "logits/rejected": -0.4206198751926422, "logps/chosen": -199.81591796875, "logps/rejected": -132.82142639160156, "loss": 1.0571, "rewards/accuracies": 1.0, "rewards/chosen": 3.4837143421173096, "rewards/margins": 1.4182114601135254, "rewards/rejected": 2.065502882003784, "step": 1405 }, { "epoch": 0.23, "learning_rate": 9.815756281099357e-07, "logits/chosen": -0.5503718852996826, "logits/rejected": -0.5473297238349915, "logps/chosen": -41.01939010620117, "logps/rejected": -56.715171813964844, "loss": 1.6076, "rewards/accuracies": 0.0, "rewards/chosen": 0.8530673980712891, "rewards/margins": -0.48741722106933594, "rewards/rejected": 1.340484619140625, "step": 1406 }, { "epoch": 0.23, "learning_rate": 9.815402632568035e-07, "logits/chosen": -0.40803754329681396, "logits/rejected": -0.4032101333141327, "logps/chosen": -116.49093627929688, "logps/rejected": -85.16402435302734, "loss": 0.6207, "rewards/accuracies": 0.0, "rewards/chosen": -0.01448135357350111, "rewards/margins": -0.0017242431640625, "rewards/rejected": -0.01275711040943861, "step": 1407 }, { "epoch": 0.23, "learning_rate": 9.815048651337956e-07, "logits/chosen": -0.5135939717292786, "logits/rejected": -0.5135939717292786, "logps/chosen": -59.27369689941406, "logps/rejected": -59.27369689941406, "loss": 1.0609, "rewards/accuracies": 0.0, "rewards/chosen": 1.47966468334198, "rewards/margins": 0.0, "rewards/rejected": 1.47966468334198, "step": 1408 }, { "epoch": 0.23, "learning_rate": 9.814694337433576e-07, "logits/chosen": -0.3139709532260895, "logits/rejected": -0.36785393953323364, "logps/chosen": -86.2626953125, "logps/rejected": -54.743934631347656, "loss": 0.7611, "rewards/accuracies": 0.0, "rewards/chosen": 0.4347282350063324, "rewards/margins": -0.9102340936660767, "rewards/rejected": 1.3449623584747314, "step": 1409 }, { "epoch": 0.23, "learning_rate": 9.814339690879374e-07, "logits/chosen": -0.5231918096542358, "logits/rejected": -0.5054978728294373, "logps/chosen": -181.243408203125, "logps/rejected": -62.931705474853516, "loss": 0.7154, "rewards/accuracies": 0.0, "rewards/chosen": -0.3828063905239105, "rewards/margins": -0.6063305139541626, "rewards/rejected": 0.2235240936279297, "step": 1410 }, { "epoch": 0.23, "learning_rate": 9.813984711699852e-07, "logits/chosen": -0.7221665978431702, "logits/rejected": -1.087464451789856, "logps/chosen": -132.43577575683594, "logps/rejected": -38.03738784790039, "loss": 0.6448, "rewards/accuracies": 1.0, "rewards/chosen": 0.19570617377758026, "rewards/margins": 0.0717342346906662, "rewards/rejected": 0.12397193908691406, "step": 1411 }, { "epoch": 0.23, "learning_rate": 9.813629399919539e-07, "logits/chosen": -0.40184542536735535, "logits/rejected": -0.4907773733139038, "logps/chosen": -51.8978385925293, "logps/rejected": -127.61241149902344, "loss": 0.3465, "rewards/accuracies": 1.0, "rewards/chosen": 1.330575942993164, "rewards/margins": 0.49018746614456177, "rewards/rejected": 0.8403884768486023, "step": 1412 }, { "epoch": 0.23, "learning_rate": 9.81327375556298e-07, "logits/chosen": -0.683602511882782, "logits/rejected": -0.6436113119125366, "logps/chosen": -134.6607666015625, "logps/rejected": -92.41655731201172, "loss": 1.2114, "rewards/accuracies": 0.0, "rewards/chosen": -0.3419204652309418, "rewards/margins": -2.1370606422424316, "rewards/rejected": 1.7951401472091675, "step": 1413 }, { "epoch": 0.23, "learning_rate": 9.812917778654747e-07, "logits/chosen": -0.19775529205799103, "logits/rejected": -0.21998600661754608, "logps/chosen": -48.01370620727539, "logps/rejected": -78.63860321044922, "loss": 0.3611, "rewards/accuracies": 1.0, "rewards/chosen": 1.1053398847579956, "rewards/margins": 0.1828194260597229, "rewards/rejected": 0.9225204586982727, "step": 1414 }, { "epoch": 0.23, "learning_rate": 9.812561469219439e-07, "logits/chosen": -0.3523884415626526, "logits/rejected": -0.4635542333126068, "logps/chosen": -62.975563049316406, "logps/rejected": -66.16021728515625, "loss": 0.7535, "rewards/accuracies": 0.0, "rewards/chosen": 0.443695068359375, "rewards/margins": -0.036144256591796875, "rewards/rejected": 0.4798393249511719, "step": 1415 }, { "epoch": 0.23, "learning_rate": 9.812204827281667e-07, "logits/chosen": -0.17731332778930664, "logits/rejected": -0.8934669494628906, "logps/chosen": -82.25585174560547, "logps/rejected": -35.53687286376953, "loss": 0.5382, "rewards/accuracies": 1.0, "rewards/chosen": 0.7643791437149048, "rewards/margins": 0.6373192071914673, "rewards/rejected": 0.1270599365234375, "step": 1416 }, { "epoch": 0.23, "learning_rate": 9.811847852866078e-07, "logits/chosen": -0.18498995900154114, "logits/rejected": -0.18262141942977905, "logps/chosen": -51.22563552856445, "logps/rejected": -56.9819221496582, "loss": 0.8001, "rewards/accuracies": 0.0, "rewards/chosen": 1.0314239263534546, "rewards/margins": -0.3857541084289551, "rewards/rejected": 1.4171780347824097, "step": 1417 }, { "epoch": 0.23, "learning_rate": 9.811490545997329e-07, "logits/chosen": -0.8118199110031128, "logits/rejected": -0.7766611576080322, "logps/chosen": -85.451171875, "logps/rejected": -79.84553527832031, "loss": 0.9793, "rewards/accuracies": 1.0, "rewards/chosen": 1.609107255935669, "rewards/margins": 0.17722856998443604, "rewards/rejected": 1.431878685951233, "step": 1418 }, { "epoch": 0.23, "learning_rate": 9.811132906700113e-07, "logits/chosen": -0.2459496706724167, "logits/rejected": -0.21387560665607452, "logps/chosen": -110.12971496582031, "logps/rejected": -137.34286499023438, "loss": 0.6482, "rewards/accuracies": 0.0, "rewards/chosen": 0.050875093787908554, "rewards/margins": -0.33279189467430115, "rewards/rejected": 0.3836669921875, "step": 1419 }, { "epoch": 0.23, "learning_rate": 9.810774934999136e-07, "logits/chosen": -0.8189704418182373, "logits/rejected": -0.7418544888496399, "logps/chosen": -42.83228302001953, "logps/rejected": -103.48400115966797, "loss": 0.2924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9652507901191711, "rewards/margins": 0.5058582425117493, "rewards/rejected": 0.4593925476074219, "step": 1420 }, { "epoch": 0.23, "learning_rate": 9.810416630919129e-07, "logits/chosen": -0.8419285416603088, "logits/rejected": -0.841719388961792, "logps/chosen": -331.6182556152344, "logps/rejected": -51.58759689331055, "loss": 0.4623, "rewards/accuracies": 0.0, "rewards/chosen": 0.02850036695599556, "rewards/margins": -0.15042763948440552, "rewards/rejected": 0.17892800271511078, "step": 1421 }, { "epoch": 0.23, "learning_rate": 9.81005799448485e-07, "logits/chosen": -0.7736119031906128, "logits/rejected": -0.6490102410316467, "logps/chosen": -186.9595184326172, "logps/rejected": -148.95266723632812, "loss": 0.8141, "rewards/accuracies": 0.0, "rewards/chosen": 3.237112522125244, "rewards/margins": -0.6824691295623779, "rewards/rejected": 3.919581651687622, "step": 1422 }, { "epoch": 0.23, "learning_rate": 9.80969902572108e-07, "logits/chosen": -0.30137547850608826, "logits/rejected": -0.3228560984134674, "logps/chosen": -56.21546936035156, "logps/rejected": -71.63304901123047, "loss": 1.0314, "rewards/accuracies": 1.0, "rewards/chosen": 0.3085647523403168, "rewards/margins": 0.15069884061813354, "rewards/rejected": 0.15786591172218323, "step": 1423 }, { "epoch": 0.23, "learning_rate": 9.809339724652612e-07, "logits/chosen": -0.5752702355384827, "logits/rejected": -0.5771369338035583, "logps/chosen": -2.447906732559204, "logps/rejected": -1.83065664768219, "loss": 0.6129, "rewards/accuracies": 0.0, "rewards/chosen": 0.1818396896123886, "rewards/margins": -0.08403651416301727, "rewards/rejected": 0.2658762037754059, "step": 1424 }, { "epoch": 0.23, "learning_rate": 9.80898009130428e-07, "logits/chosen": -0.25879693031311035, "logits/rejected": -0.2669544517993927, "logps/chosen": -4.194666385650635, "logps/rejected": -37.16359329223633, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.17054076492786407, "rewards/margins": -0.024095013737678528, "rewards/rejected": 0.1946357786655426, "step": 1425 }, { "epoch": 0.23, "learning_rate": 9.808620125700924e-07, "logits/chosen": -0.62633216381073, "logits/rejected": -0.6073732972145081, "logps/chosen": -80.20362854003906, "logps/rejected": -96.23582458496094, "loss": 0.6401, "rewards/accuracies": 1.0, "rewards/chosen": 0.2830100953578949, "rewards/margins": 0.018610358238220215, "rewards/rejected": 0.2643997371196747, "step": 1426 }, { "epoch": 0.23, "learning_rate": 9.808259827867416e-07, "logits/chosen": -0.11934356391429901, "logits/rejected": -0.11780858784914017, "logps/chosen": -4.452303886413574, "logps/rejected": -10.464850425720215, "loss": 0.8775, "rewards/accuracies": 1.0, "rewards/chosen": 0.1269546002149582, "rewards/margins": 0.1317809522151947, "rewards/rejected": -0.004826355259865522, "step": 1427 }, { "epoch": 0.23, "learning_rate": 9.807899197828653e-07, "logits/chosen": -1.116859793663025, "logits/rejected": -0.9753273129463196, "logps/chosen": -144.58740234375, "logps/rejected": -173.96774291992188, "loss": 1.0598, "rewards/accuracies": 0.0, "rewards/chosen": 0.0217437744140625, "rewards/margins": -1.7688461542129517, "rewards/rejected": 1.7905899286270142, "step": 1428 }, { "epoch": 0.23, "learning_rate": 9.807538235609547e-07, "logits/chosen": -0.6373052597045898, "logits/rejected": -0.6770974397659302, "logps/chosen": -87.27285766601562, "logps/rejected": -142.33372497558594, "loss": 0.7632, "rewards/accuracies": 1.0, "rewards/chosen": 1.5063308477401733, "rewards/margins": 0.11099851131439209, "rewards/rejected": 1.3953323364257812, "step": 1429 }, { "epoch": 0.23, "learning_rate": 9.807176941235038e-07, "logits/chosen": -0.3107633590698242, "logits/rejected": -0.2926346957683563, "logps/chosen": -77.61918640136719, "logps/rejected": -88.40461730957031, "loss": 1.3132, "rewards/accuracies": 1.0, "rewards/chosen": 0.7140304446220398, "rewards/margins": 0.43318480253219604, "rewards/rejected": 0.28084564208984375, "step": 1430 }, { "epoch": 0.23, "learning_rate": 9.806815314730088e-07, "logits/chosen": -0.39599061012268066, "logits/rejected": 0.44339972734451294, "logps/chosen": -65.96371459960938, "logps/rejected": -38.708030700683594, "loss": 0.5502, "rewards/accuracies": 1.0, "rewards/chosen": 1.5928863286972046, "rewards/margins": 1.3318027257919312, "rewards/rejected": 0.26108360290527344, "step": 1431 }, { "epoch": 0.23, "learning_rate": 9.806453356119682e-07, "logits/chosen": -0.4832843244075775, "logits/rejected": -0.39882686734199524, "logps/chosen": -82.7921371459961, "logps/rejected": -147.8019256591797, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 1.2806625366210938, "rewards/margins": 1.3774383068084717, "rewards/rejected": -0.09677582234144211, "step": 1432 }, { "epoch": 0.23, "learning_rate": 9.806091065428829e-07, "logits/chosen": -0.17789411544799805, "logits/rejected": -0.1966594159603119, "logps/chosen": -50.920555114746094, "logps/rejected": -34.83934020996094, "loss": 0.6916, "rewards/accuracies": 0.0, "rewards/chosen": 0.48014259338378906, "rewards/margins": -0.2806202173233032, "rewards/rejected": 0.7607628107070923, "step": 1433 }, { "epoch": 0.23, "learning_rate": 9.80572844268256e-07, "logits/chosen": -0.5119796395301819, "logits/rejected": -0.4919336140155792, "logps/chosen": -280.8130187988281, "logps/rejected": -67.50084686279297, "loss": 1.0462, "rewards/accuracies": 1.0, "rewards/chosen": 2.23191237449646, "rewards/margins": 0.7404488325119019, "rewards/rejected": 1.491463541984558, "step": 1434 }, { "epoch": 0.23, "learning_rate": 9.805365487905926e-07, "logits/chosen": -0.24586822092533112, "logits/rejected": -0.2453906536102295, "logps/chosen": -5.058225631713867, "logps/rejected": -8.402913093566895, "loss": 0.3624, "rewards/accuracies": 1.0, "rewards/chosen": 0.1386570930480957, "rewards/margins": 0.23187370598316193, "rewards/rejected": -0.09321661293506622, "step": 1435 }, { "epoch": 0.23, "learning_rate": 9.805002201124006e-07, "logits/chosen": -0.8751562237739563, "logits/rejected": -0.7648171186447144, "logps/chosen": -124.48018646240234, "logps/rejected": -33.06280517578125, "loss": 0.8178, "rewards/accuracies": 1.0, "rewards/chosen": 0.2892623841762543, "rewards/margins": 0.1092834323644638, "rewards/rejected": 0.17997895181179047, "step": 1436 }, { "epoch": 0.23, "learning_rate": 9.8046385823619e-07, "logits/chosen": -0.21566253900527954, "logits/rejected": -0.22781744599342346, "logps/chosen": -49.454105377197266, "logps/rejected": -78.2674789428711, "loss": 0.796, "rewards/accuracies": 0.0, "rewards/chosen": -0.1821727752685547, "rewards/margins": -0.1783725768327713, "rewards/rejected": -0.0038002014625817537, "step": 1437 }, { "epoch": 0.23, "learning_rate": 9.804274631644728e-07, "logits/chosen": -0.013235613703727722, "logits/rejected": -0.5789085030555725, "logps/chosen": -45.35822677612305, "logps/rejected": -73.84725189208984, "loss": 1.0425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9633854031562805, "rewards/margins": 0.6890125274658203, "rewards/rejected": 0.2743728756904602, "step": 1438 }, { "epoch": 0.23, "learning_rate": 9.80391034899764e-07, "logits/chosen": -0.6251644492149353, "logits/rejected": -0.5778806209564209, "logps/chosen": -242.5043182373047, "logps/rejected": -31.390783309936523, "loss": 0.8596, "rewards/accuracies": 0.0, "rewards/chosen": 0.21051636338233948, "rewards/margins": -0.4202161729335785, "rewards/rejected": 0.630732536315918, "step": 1439 }, { "epoch": 0.23, "learning_rate": 9.8035457344458e-07, "logits/chosen": -0.4828919470310211, "logits/rejected": -0.41896530985832214, "logps/chosen": -51.162315368652344, "logps/rejected": -36.804569244384766, "loss": 0.566, "rewards/accuracies": 1.0, "rewards/chosen": 1.0106743574142456, "rewards/margins": 0.021437525749206543, "rewards/rejected": 0.9892368316650391, "step": 1440 }, { "epoch": 0.23, "learning_rate": 9.803180788014402e-07, "logits/chosen": -0.5157266855239868, "logits/rejected": -0.4794618487358093, "logps/chosen": -53.25870132446289, "logps/rejected": -132.47434997558594, "loss": 1.2218, "rewards/accuracies": 0.0, "rewards/chosen": 2.157623052597046, "rewards/margins": -0.4436969757080078, "rewards/rejected": 2.6013200283050537, "step": 1441 }, { "epoch": 0.23, "learning_rate": 9.802815509728662e-07, "logits/chosen": -0.45245304703712463, "logits/rejected": -0.4551716148853302, "logps/chosen": -184.63414001464844, "logps/rejected": -123.79289245605469, "loss": 0.6198, "rewards/accuracies": 0.0, "rewards/chosen": 2.7251954078674316, "rewards/margins": -0.10378718376159668, "rewards/rejected": 2.8289825916290283, "step": 1442 }, { "epoch": 0.23, "learning_rate": 9.802449899613811e-07, "logits/chosen": -0.39725393056869507, "logits/rejected": -0.39725393056869507, "logps/chosen": -98.0006103515625, "logps/rejected": -98.0006103515625, "loss": 1.3878, "rewards/accuracies": 0.0, "rewards/chosen": 0.7830421328544617, "rewards/margins": 0.0, "rewards/rejected": 0.7830421328544617, "step": 1443 }, { "epoch": 0.23, "learning_rate": 9.802083957695114e-07, "logits/chosen": -0.45836973190307617, "logits/rejected": -0.44652822613716125, "logps/chosen": -127.13052368164062, "logps/rejected": -54.88080978393555, "loss": 0.6124, "rewards/accuracies": 1.0, "rewards/chosen": 2.6452300548553467, "rewards/margins": 2.2308878898620605, "rewards/rejected": 0.41434213519096375, "step": 1444 }, { "epoch": 0.23, "learning_rate": 9.801717683997856e-07, "logits/chosen": -0.5197967886924744, "logits/rejected": -0.4856727123260498, "logps/chosen": -82.83146667480469, "logps/rejected": -83.65879821777344, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 2.58329176902771, "rewards/margins": 1.267038106918335, "rewards/rejected": 1.316253662109375, "step": 1445 }, { "epoch": 0.23, "learning_rate": 9.801351078547337e-07, "logits/chosen": -0.6494700312614441, "logits/rejected": -0.6443199515342712, "logps/chosen": -14.802362442016602, "logps/rejected": -25.157855987548828, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 0.10791625827550888, "rewards/margins": 0.08149413764476776, "rewards/rejected": 0.02642211876809597, "step": 1446 }, { "epoch": 0.23, "learning_rate": 9.800984141368891e-07, "logits/chosen": -0.49214470386505127, "logits/rejected": -0.4610937833786011, "logps/chosen": -74.41638946533203, "logps/rejected": -40.5112419128418, "loss": 0.6132, "rewards/accuracies": 1.0, "rewards/chosen": 1.4743690490722656, "rewards/margins": 0.23792338371276855, "rewards/rejected": 1.236445665359497, "step": 1447 }, { "epoch": 0.24, "learning_rate": 9.80061687248787e-07, "logits/chosen": -0.7179824113845825, "logits/rejected": -0.6600512862205505, "logps/chosen": -226.22760009765625, "logps/rejected": -85.0826644897461, "loss": 1.1636, "rewards/accuracies": 0.0, "rewards/chosen": 0.24002991616725922, "rewards/margins": -0.6311805844306946, "rewards/rejected": 0.871210515499115, "step": 1448 }, { "epoch": 0.24, "learning_rate": 9.800249271929643e-07, "logits/chosen": -0.3608531951904297, "logits/rejected": -0.2914378345012665, "logps/chosen": -139.42886352539062, "logps/rejected": -91.39164733886719, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": 4.055890083312988, "rewards/margins": 2.7324142456054688, "rewards/rejected": 1.32347571849823, "step": 1449 }, { "epoch": 0.24, "learning_rate": 9.799881339719614e-07, "logits/chosen": -0.11403616517782211, "logits/rejected": -0.11403616517782211, "logps/chosen": -0.6145517230033875, "logps/rejected": -0.6145517230033875, "loss": 0.4996, "rewards/accuracies": 0.0, "rewards/chosen": 0.09103134274482727, "rewards/margins": 0.0, "rewards/rejected": 0.09103134274482727, "step": 1450 }, { "epoch": 0.24, "learning_rate": 9.7995130758832e-07, "logits/chosen": -0.5125530958175659, "logits/rejected": -0.5125530958175659, "logps/chosen": -116.7440414428711, "logps/rejected": -116.7440414428711, "loss": 0.4751, "rewards/accuracies": 0.0, "rewards/chosen": 1.689313530921936, "rewards/margins": 0.0, "rewards/rejected": 1.689313530921936, "step": 1451 }, { "epoch": 0.24, "learning_rate": 9.799144480445847e-07, "logits/chosen": -0.32254281640052795, "logits/rejected": -0.3637646734714508, "logps/chosen": -5.631381988525391, "logps/rejected": -51.86806106567383, "loss": 0.8056, "rewards/accuracies": 0.0, "rewards/chosen": 0.38988199830055237, "rewards/margins": -0.17500296235084534, "rewards/rejected": 0.5648849606513977, "step": 1452 }, { "epoch": 0.24, "learning_rate": 9.798775553433022e-07, "logits/chosen": -0.5558789968490601, "logits/rejected": -0.5755639672279358, "logps/chosen": -68.79376220703125, "logps/rejected": -99.58059692382812, "loss": 1.3541, "rewards/accuracies": 0.0, "rewards/chosen": 0.39447328448295593, "rewards/margins": -0.9353485107421875, "rewards/rejected": 1.3298218250274658, "step": 1453 }, { "epoch": 0.24, "learning_rate": 9.798406294870209e-07, "logits/chosen": -0.3470323085784912, "logits/rejected": -0.35329461097717285, "logps/chosen": -80.00010681152344, "logps/rejected": -61.52684783935547, "loss": 0.4928, "rewards/accuracies": 1.0, "rewards/chosen": 1.6750602722167969, "rewards/margins": 0.4385024309158325, "rewards/rejected": 1.2365578413009644, "step": 1454 }, { "epoch": 0.24, "learning_rate": 9.798036704782925e-07, "logits/chosen": -0.8978464007377625, "logits/rejected": -0.8626545667648315, "logps/chosen": -99.798095703125, "logps/rejected": -88.07760620117188, "loss": 1.7034, "rewards/accuracies": 0.0, "rewards/chosen": 0.8345245718955994, "rewards/margins": -1.0704452991485596, "rewards/rejected": 1.9049698114395142, "step": 1455 }, { "epoch": 0.24, "learning_rate": 9.797666783196706e-07, "logits/chosen": -0.6024524569511414, "logits/rejected": -0.47434288263320923, "logps/chosen": -131.72174072265625, "logps/rejected": -109.33473205566406, "loss": 0.6845, "rewards/accuracies": 0.0, "rewards/chosen": 0.20533142983913422, "rewards/margins": -0.6536300778388977, "rewards/rejected": 0.8589615225791931, "step": 1456 }, { "epoch": 0.24, "learning_rate": 9.797296530137107e-07, "logits/chosen": -0.5837112069129944, "logits/rejected": -0.558194637298584, "logps/chosen": -129.20095825195312, "logps/rejected": -132.71653747558594, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": 1.3296356201171875, "rewards/margins": 0.36899107694625854, "rewards/rejected": 0.960644543170929, "step": 1457 }, { "epoch": 0.24, "learning_rate": 9.796925945629709e-07, "logits/chosen": -0.3967236876487732, "logits/rejected": -0.33250555396080017, "logps/chosen": -125.6954574584961, "logps/rejected": -64.87765502929688, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 0.3449104428291321, "rewards/margins": 0.12402267754077911, "rewards/rejected": 0.22088776528835297, "step": 1458 }, { "epoch": 0.24, "learning_rate": 9.796555029700118e-07, "logits/chosen": -0.5090893507003784, "logits/rejected": -0.48448583483695984, "logps/chosen": -151.83624267578125, "logps/rejected": -62.037296295166016, "loss": 0.9524, "rewards/accuracies": 0.0, "rewards/chosen": 0.2863357663154602, "rewards/margins": -1.2998631000518799, "rewards/rejected": 1.5861988067626953, "step": 1459 }, { "epoch": 0.24, "learning_rate": 9.79618378237396e-07, "logits/chosen": -0.3796676993370056, "logits/rejected": -0.3781411647796631, "logps/chosen": -235.1437225341797, "logps/rejected": -56.127296447753906, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 1.7994660139083862, "rewards/margins": 0.8544136881828308, "rewards/rejected": 0.9450523257255554, "step": 1460 }, { "epoch": 0.24, "learning_rate": 9.795812203676885e-07, "logits/chosen": -0.717410683631897, "logits/rejected": -0.7762855291366577, "logps/chosen": -264.3314208984375, "logps/rejected": -162.7606201171875, "loss": 0.4362, "rewards/accuracies": 0.0, "rewards/chosen": 1.9452027082443237, "rewards/margins": -0.25885307788848877, "rewards/rejected": 2.2040557861328125, "step": 1461 }, { "epoch": 0.24, "learning_rate": 9.795440293634566e-07, "logits/chosen": -0.27505144476890564, "logits/rejected": -0.20642411708831787, "logps/chosen": -72.43748474121094, "logps/rejected": -7.903651237487793, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 1.5914710760116577, "rewards/margins": 1.1714556217193604, "rewards/rejected": 0.420015424489975, "step": 1462 }, { "epoch": 0.24, "learning_rate": 9.795068052272697e-07, "logits/chosen": -0.524653434753418, "logits/rejected": -0.4966968894004822, "logps/chosen": -145.6240997314453, "logps/rejected": -107.37091064453125, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": 2.962825059890747, "rewards/margins": 0.8018355369567871, "rewards/rejected": 2.16098952293396, "step": 1463 }, { "epoch": 0.24, "learning_rate": 9.794695479616995e-07, "logits/chosen": -0.45417919754981995, "logits/rejected": -0.32576829195022583, "logps/chosen": -132.61134338378906, "logps/rejected": -50.211551666259766, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 0.907733142375946, "rewards/margins": 0.7986060976982117, "rewards/rejected": 0.10912704467773438, "step": 1464 }, { "epoch": 0.24, "learning_rate": 9.794322575693206e-07, "logits/chosen": -0.5517804622650146, "logits/rejected": -0.5096648931503296, "logps/chosen": -112.93412780761719, "logps/rejected": -97.69316101074219, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 2.6969680786132812, "rewards/margins": 0.3618483543395996, "rewards/rejected": 2.3351197242736816, "step": 1465 }, { "epoch": 0.24, "learning_rate": 9.793949340527089e-07, "logits/chosen": -0.44703209400177, "logits/rejected": -0.4763895869255066, "logps/chosen": -131.31552124023438, "logps/rejected": -115.31639099121094, "loss": 0.5795, "rewards/accuracies": 1.0, "rewards/chosen": 1.5205490589141846, "rewards/margins": 0.10372161865234375, "rewards/rejected": 1.4168274402618408, "step": 1466 }, { "epoch": 0.24, "learning_rate": 9.793575774144435e-07, "logits/chosen": -0.45653846859931946, "logits/rejected": -0.4851658046245575, "logps/chosen": -96.19446563720703, "logps/rejected": -123.32257080078125, "loss": 0.831, "rewards/accuracies": 0.0, "rewards/chosen": 0.9130111932754517, "rewards/margins": -0.6562957763671875, "rewards/rejected": 1.5693069696426392, "step": 1467 }, { "epoch": 0.24, "learning_rate": 9.79320187657105e-07, "logits/chosen": -0.23563918471336365, "logits/rejected": -0.19891782104969025, "logps/chosen": -61.63865280151367, "logps/rejected": -73.78927612304688, "loss": 0.9039, "rewards/accuracies": 1.0, "rewards/chosen": 1.006285548210144, "rewards/margins": 0.508652925491333, "rewards/rejected": 0.49763259291648865, "step": 1468 }, { "epoch": 0.24, "learning_rate": 9.79282764783277e-07, "logits/chosen": -0.4698795676231384, "logits/rejected": -0.49452969431877136, "logps/chosen": -58.246971130371094, "logps/rejected": -58.16657638549805, "loss": 0.7796, "rewards/accuracies": 1.0, "rewards/chosen": 1.1151618957519531, "rewards/margins": 0.35172003507614136, "rewards/rejected": 0.7634418606758118, "step": 1469 }, { "epoch": 0.24, "learning_rate": 9.792453087955453e-07, "logits/chosen": -0.2533639073371887, "logits/rejected": -0.24238865077495575, "logps/chosen": -71.67630767822266, "logps/rejected": -59.235130310058594, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 0.5040443539619446, "rewards/margins": 0.08626976609230042, "rewards/rejected": 0.41777458786964417, "step": 1470 }, { "epoch": 0.24, "learning_rate": 9.79207819696497e-07, "logits/chosen": -0.3433392643928528, "logits/rejected": -0.37500256299972534, "logps/chosen": -73.5865707397461, "logps/rejected": -71.15499877929688, "loss": 0.5726, "rewards/accuracies": 0.0, "rewards/chosen": 1.4231407642364502, "rewards/margins": -0.6288344860076904, "rewards/rejected": 2.0519752502441406, "step": 1471 }, { "epoch": 0.24, "learning_rate": 9.791702974887228e-07, "logits/chosen": -0.2782151997089386, "logits/rejected": -0.33530542254447937, "logps/chosen": -20.582172393798828, "logps/rejected": -84.40902709960938, "loss": 1.0187, "rewards/accuracies": 0.0, "rewards/chosen": 0.72536700963974, "rewards/margins": -1.0259826183319092, "rewards/rejected": 1.751349687576294, "step": 1472 }, { "epoch": 0.24, "learning_rate": 9.79132742174815e-07, "logits/chosen": -0.5797999501228333, "logits/rejected": -0.43974143266677856, "logps/chosen": -139.5922393798828, "logps/rejected": -56.29396057128906, "loss": 1.3483, "rewards/accuracies": 0.0, "rewards/chosen": 0.13555756211280823, "rewards/margins": -0.8097518682479858, "rewards/rejected": 0.9453094601631165, "step": 1473 }, { "epoch": 0.24, "learning_rate": 9.790951537573683e-07, "logits/chosen": -0.23651763796806335, "logits/rejected": -0.23564666509628296, "logps/chosen": -6.2475361824035645, "logps/rejected": -5.428149700164795, "loss": 0.6871, "rewards/accuracies": 0.0, "rewards/chosen": 0.15749874711036682, "rewards/margins": -0.05372162163257599, "rewards/rejected": 0.2112203687429428, "step": 1474 }, { "epoch": 0.24, "learning_rate": 9.790575322389797e-07, "logits/chosen": -0.3320131003856659, "logits/rejected": -0.2999926209449768, "logps/chosen": -48.927303314208984, "logps/rejected": -93.67097473144531, "loss": 0.8207, "rewards/accuracies": 1.0, "rewards/chosen": 0.4705585539340973, "rewards/margins": 0.1674121916294098, "rewards/rejected": 0.3031463623046875, "step": 1475 }, { "epoch": 0.24, "learning_rate": 9.790198776222487e-07, "logits/chosen": -0.8458237051963806, "logits/rejected": -0.8175431489944458, "logps/chosen": -136.068603515625, "logps/rejected": -90.09618377685547, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 2.19431471824646, "rewards/margins": 0.4710777997970581, "rewards/rejected": 1.7232369184494019, "step": 1476 }, { "epoch": 0.24, "learning_rate": 9.789821899097766e-07, "logits/chosen": -0.7409854531288147, "logits/rejected": -0.7255493998527527, "logps/chosen": -82.06962585449219, "logps/rejected": -44.96182632446289, "loss": 1.3593, "rewards/accuracies": 1.0, "rewards/chosen": 2.6724274158477783, "rewards/margins": 0.949140191078186, "rewards/rejected": 1.7232872247695923, "step": 1477 }, { "epoch": 0.24, "learning_rate": 9.789444691041672e-07, "logits/chosen": -0.5955150127410889, "logits/rejected": -0.4563983082771301, "logps/chosen": -156.5496826171875, "logps/rejected": -97.7492904663086, "loss": 0.7506, "rewards/accuracies": 0.0, "rewards/chosen": 0.41900941729545593, "rewards/margins": -0.29950329661369324, "rewards/rejected": 0.7185127139091492, "step": 1478 }, { "epoch": 0.24, "learning_rate": 9.789067152080268e-07, "logits/chosen": -0.5364139676094055, "logits/rejected": -0.48837342858314514, "logps/chosen": -92.15901184082031, "logps/rejected": -61.06266784667969, "loss": 0.398, "rewards/accuracies": 1.0, "rewards/chosen": 2.4798943996429443, "rewards/margins": 1.1195054054260254, "rewards/rejected": 1.360388994216919, "step": 1479 }, { "epoch": 0.24, "learning_rate": 9.78868928223964e-07, "logits/chosen": -0.5188068747520447, "logits/rejected": -0.5560226440429688, "logps/chosen": -84.30258178710938, "logps/rejected": -46.366153717041016, "loss": 0.1845, "rewards/accuracies": 1.0, "rewards/chosen": 2.6921417713165283, "rewards/margins": 1.502545952796936, "rewards/rejected": 1.1895958185195923, "step": 1480 }, { "epoch": 0.24, "learning_rate": 9.788311081545893e-07, "logits/chosen": -0.39074811339378357, "logits/rejected": -0.39074811339378357, "logps/chosen": -61.69099807739258, "logps/rejected": -61.69099807739258, "loss": 0.4233, "rewards/accuracies": 0.0, "rewards/chosen": 1.0626506805419922, "rewards/margins": 0.0, "rewards/rejected": 1.0626506805419922, "step": 1481 }, { "epoch": 0.24, "learning_rate": 9.787932550025157e-07, "logits/chosen": -0.7935963869094849, "logits/rejected": -0.6834017038345337, "logps/chosen": -90.99574279785156, "logps/rejected": -68.29998016357422, "loss": 0.894, "rewards/accuracies": 0.0, "rewards/chosen": 0.42908477783203125, "rewards/margins": -0.24118274450302124, "rewards/rejected": 0.6702675223350525, "step": 1482 }, { "epoch": 0.24, "learning_rate": 9.787553687703584e-07, "logits/chosen": -0.27432936429977417, "logits/rejected": -0.26006683707237244, "logps/chosen": -53.150146484375, "logps/rejected": -47.5843391418457, "loss": 1.0422, "rewards/accuracies": 1.0, "rewards/chosen": 1.2819305658340454, "rewards/margins": 0.052335381507873535, "rewards/rejected": 1.2295951843261719, "step": 1483 }, { "epoch": 0.24, "learning_rate": 9.787174494607354e-07, "logits/chosen": -0.6040154695510864, "logits/rejected": -0.5602536201477051, "logps/chosen": -120.3697738647461, "logps/rejected": -142.71414184570312, "loss": 0.2604, "rewards/accuracies": 1.0, "rewards/chosen": 1.9759864807128906, "rewards/margins": 0.6210517883300781, "rewards/rejected": 1.3549346923828125, "step": 1484 }, { "epoch": 0.24, "learning_rate": 9.786794970762663e-07, "logits/chosen": -0.7029940485954285, "logits/rejected": -0.69713294506073, "logps/chosen": -115.50113677978516, "logps/rejected": -54.270565032958984, "loss": 1.271, "rewards/accuracies": 0.0, "rewards/chosen": -0.24643173813819885, "rewards/margins": -1.7200267314910889, "rewards/rejected": 1.4735950231552124, "step": 1485 }, { "epoch": 0.24, "learning_rate": 9.786415116195732e-07, "logits/chosen": -0.6642323732376099, "logits/rejected": -0.7595757842063904, "logps/chosen": -220.1798095703125, "logps/rejected": -66.57616424560547, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 2.342254638671875, "rewards/margins": 0.8787742853164673, "rewards/rejected": 1.4634803533554077, "step": 1486 }, { "epoch": 0.24, "learning_rate": 9.786034930932807e-07, "logits/chosen": -0.9045150876045227, "logits/rejected": -0.9207284450531006, "logps/chosen": -44.82798767089844, "logps/rejected": -115.7057876586914, "loss": 0.3317, "rewards/accuracies": 1.0, "rewards/chosen": 1.9685570001602173, "rewards/margins": 1.2803542613983154, "rewards/rejected": 0.6882026791572571, "step": 1487 }, { "epoch": 0.24, "learning_rate": 9.785654415000153e-07, "logits/chosen": -0.550280749797821, "logits/rejected": -0.47157999873161316, "logps/chosen": -49.559226989746094, "logps/rejected": -36.999107360839844, "loss": 0.8533, "rewards/accuracies": 1.0, "rewards/chosen": 1.2396965026855469, "rewards/margins": 0.6899757385253906, "rewards/rejected": 0.5497207641601562, "step": 1488 }, { "epoch": 0.24, "learning_rate": 9.785273568424062e-07, "logits/chosen": -0.2574659287929535, "logits/rejected": -0.2574659287929535, "logps/chosen": -47.76045608520508, "logps/rejected": -47.76045608520508, "loss": 0.5776, "rewards/accuracies": 0.0, "rewards/chosen": 1.0381786823272705, "rewards/margins": 0.0, "rewards/rejected": 1.0381786823272705, "step": 1489 }, { "epoch": 0.24, "learning_rate": 9.784892391230845e-07, "logits/chosen": -0.8283786773681641, "logits/rejected": -0.7075421214103699, "logps/chosen": -138.3074188232422, "logps/rejected": -141.81149291992188, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 3.256208896636963, "rewards/margins": 1.4238388538360596, "rewards/rejected": 1.8323700428009033, "step": 1490 }, { "epoch": 0.24, "learning_rate": 9.78451088344684e-07, "logits/chosen": -0.33090299367904663, "logits/rejected": -0.3055322766304016, "logps/chosen": -104.85224914550781, "logps/rejected": -165.08822631835938, "loss": 1.4201, "rewards/accuracies": 0.0, "rewards/chosen": 2.331526279449463, "rewards/margins": -1.0448029041290283, "rewards/rejected": 3.376329183578491, "step": 1491 }, { "epoch": 0.24, "learning_rate": 9.784129045098404e-07, "logits/chosen": -0.1378428190946579, "logits/rejected": -0.12231866270303726, "logps/chosen": -43.40513229370117, "logps/rejected": -46.85642623901367, "loss": 1.0466, "rewards/accuracies": 0.0, "rewards/chosen": 1.2168060541152954, "rewards/margins": -0.18592190742492676, "rewards/rejected": 1.4027279615402222, "step": 1492 }, { "epoch": 0.24, "learning_rate": 9.783746876211917e-07, "logits/chosen": -0.4010874032974243, "logits/rejected": -0.4010874032974243, "logps/chosen": -45.416526794433594, "logps/rejected": -45.416526794433594, "loss": 0.6218, "rewards/accuracies": 0.0, "rewards/chosen": 0.9007053375244141, "rewards/margins": 0.0, "rewards/rejected": 0.9007053375244141, "step": 1493 }, { "epoch": 0.24, "learning_rate": 9.783364376813787e-07, "logits/chosen": -0.441585898399353, "logits/rejected": -0.4667235016822815, "logps/chosen": -106.47343444824219, "logps/rejected": -156.31448364257812, "loss": 1.217, "rewards/accuracies": 0.0, "rewards/chosen": 1.2528114318847656, "rewards/margins": -1.9484641551971436, "rewards/rejected": 3.201275587081909, "step": 1494 }, { "epoch": 0.24, "learning_rate": 9.78298154693044e-07, "logits/chosen": -0.2650599777698517, "logits/rejected": -0.14141100645065308, "logps/chosen": -127.97843170166016, "logps/rejected": -60.64625549316406, "loss": 1.169, "rewards/accuracies": 1.0, "rewards/chosen": 2.601726531982422, "rewards/margins": 1.193926215171814, "rewards/rejected": 1.407800316810608, "step": 1495 }, { "epoch": 0.24, "learning_rate": 9.782598386588324e-07, "logits/chosen": -0.43469488620758057, "logits/rejected": -0.4186934232711792, "logps/chosen": -99.43505859375, "logps/rejected": -45.81074523925781, "loss": 0.4156, "rewards/accuracies": 1.0, "rewards/chosen": 1.424098253250122, "rewards/margins": 0.586979329586029, "rewards/rejected": 0.837118923664093, "step": 1496 }, { "epoch": 0.24, "learning_rate": 9.782214895813913e-07, "logits/chosen": -0.3835313320159912, "logits/rejected": -0.3835313320159912, "logps/chosen": -125.89595794677734, "logps/rejected": -125.89595794677734, "loss": 0.9069, "rewards/accuracies": 0.0, "rewards/chosen": 0.8815895318984985, "rewards/margins": 0.0, "rewards/rejected": 0.8815895318984985, "step": 1497 }, { "epoch": 0.24, "learning_rate": 9.781831074633703e-07, "logits/chosen": -0.22493326663970947, "logits/rejected": -0.18963921070098877, "logps/chosen": -46.23423767089844, "logps/rejected": -51.50922393798828, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.8784710168838501, "rewards/margins": 0.6335464715957642, "rewards/rejected": 0.24492454528808594, "step": 1498 }, { "epoch": 0.24, "learning_rate": 9.78144692307421e-07, "logits/chosen": -0.4704485535621643, "logits/rejected": -0.48313769698143005, "logps/chosen": -9.457500457763672, "logps/rejected": -29.21957778930664, "loss": 0.4688, "rewards/accuracies": 0.0, "rewards/chosen": 0.41626960039138794, "rewards/margins": -0.007589817047119141, "rewards/rejected": 0.4238594174385071, "step": 1499 }, { "epoch": 0.24, "learning_rate": 9.781062441161979e-07, "logits/chosen": -0.2428041249513626, "logits/rejected": -0.2375166118144989, "logps/chosen": -51.21078872680664, "logps/rejected": -46.21934509277344, "loss": 0.3602, "rewards/accuracies": 1.0, "rewards/chosen": 1.3574405908584595, "rewards/margins": 0.5884903073310852, "rewards/rejected": 0.7689502835273743, "step": 1500 }, { "epoch": 0.24, "learning_rate": 9.78067762892357e-07, "logits/chosen": -0.45661625266075134, "logits/rejected": -0.3983442783355713, "logps/chosen": -82.90550231933594, "logps/rejected": -11.123128890991211, "loss": 0.6424, "rewards/accuracies": 1.0, "rewards/chosen": 0.8449005484580994, "rewards/margins": 0.1364513635635376, "rewards/rejected": 0.7084491848945618, "step": 1501 }, { "epoch": 0.24, "learning_rate": 9.780292486385574e-07, "logits/chosen": -0.4828795790672302, "logits/rejected": -0.4496448338031769, "logps/chosen": -61.47906494140625, "logps/rejected": -71.89443969726562, "loss": 0.6811, "rewards/accuracies": 0.0, "rewards/chosen": 1.0285431146621704, "rewards/margins": -0.3073081970214844, "rewards/rejected": 1.3358513116836548, "step": 1502 }, { "epoch": 0.24, "learning_rate": 9.779907013574598e-07, "logits/chosen": -0.6824793219566345, "logits/rejected": -0.7132186889648438, "logps/chosen": -79.0577392578125, "logps/rejected": -167.5985870361328, "loss": 0.914, "rewards/accuracies": 0.0, "rewards/chosen": 0.9574524164199829, "rewards/margins": -1.3752366304397583, "rewards/rejected": 2.332689046859741, "step": 1503 }, { "epoch": 0.24, "learning_rate": 9.779521210517275e-07, "logits/chosen": -0.963604748249054, "logits/rejected": -0.9794360399246216, "logps/chosen": -125.05081176757812, "logps/rejected": -61.72600555419922, "loss": 0.7465, "rewards/accuracies": 0.0, "rewards/chosen": 0.763384997844696, "rewards/margins": -0.7188827395439148, "rewards/rejected": 1.4822677373886108, "step": 1504 }, { "epoch": 0.24, "learning_rate": 9.779135077240262e-07, "logits/chosen": -0.6494208574295044, "logits/rejected": -0.6417734622955322, "logps/chosen": -76.87213134765625, "logps/rejected": -58.72136688232422, "loss": 0.3934, "rewards/accuracies": 0.0, "rewards/chosen": 0.7069687247276306, "rewards/margins": -0.048183441162109375, "rewards/rejected": 0.75515216588974, "step": 1505 }, { "epoch": 0.24, "learning_rate": 9.778748613770234e-07, "logits/chosen": -0.33139553666114807, "logits/rejected": -0.34744709730148315, "logps/chosen": -17.432607650756836, "logps/rejected": -4.5770721435546875, "loss": 0.4558, "rewards/accuracies": 0.0, "rewards/chosen": 0.03806133195757866, "rewards/margins": -0.11953906714916229, "rewards/rejected": 0.15760040283203125, "step": 1506 }, { "epoch": 0.24, "learning_rate": 9.778361820133894e-07, "logits/chosen": -0.4457920491695404, "logits/rejected": -0.43164610862731934, "logps/chosen": -71.52336120605469, "logps/rejected": -139.0357208251953, "loss": 0.9944, "rewards/accuracies": 0.0, "rewards/chosen": 0.35168763995170593, "rewards/margins": -0.20688322186470032, "rewards/rejected": 0.5585708618164062, "step": 1507 }, { "epoch": 0.24, "learning_rate": 9.777974696357967e-07, "logits/chosen": -0.626103401184082, "logits/rejected": -0.6045825481414795, "logps/chosen": -98.34341430664062, "logps/rejected": -114.03460693359375, "loss": 0.9526, "rewards/accuracies": 0.0, "rewards/chosen": 0.24992676079273224, "rewards/margins": -0.4628540277481079, "rewards/rejected": 0.712780773639679, "step": 1508 }, { "epoch": 0.24, "learning_rate": 9.777587242469196e-07, "logits/chosen": -0.2009119987487793, "logits/rejected": -0.2009119987487793, "logps/chosen": -34.09600830078125, "logps/rejected": -34.09600830078125, "loss": 0.6446, "rewards/accuracies": 0.0, "rewards/chosen": 0.5712787508964539, "rewards/margins": 0.0, "rewards/rejected": 0.5712787508964539, "step": 1509 }, { "epoch": 0.25, "learning_rate": 9.777199458494354e-07, "logits/chosen": -0.31626924872398376, "logits/rejected": -0.29714396595954895, "logps/chosen": -147.04673767089844, "logps/rejected": -57.32610321044922, "loss": 0.3764, "rewards/accuracies": 1.0, "rewards/chosen": 1.594508409500122, "rewards/margins": 0.4183197021484375, "rewards/rejected": 1.1761887073516846, "step": 1510 }, { "epoch": 0.25, "learning_rate": 9.776811344460231e-07, "logits/chosen": -0.356449156999588, "logits/rejected": -0.39193427562713623, "logps/chosen": -34.4299430847168, "logps/rejected": -83.0125732421875, "loss": 0.7365, "rewards/accuracies": 0.0, "rewards/chosen": 1.2038639783859253, "rewards/margins": -0.8488537073135376, "rewards/rejected": 2.052717685699463, "step": 1511 }, { "epoch": 0.25, "learning_rate": 9.776422900393644e-07, "logits/chosen": -0.6631408929824829, "logits/rejected": -0.6212267279624939, "logps/chosen": -112.88387298583984, "logps/rejected": -122.15348815917969, "loss": 0.6481, "rewards/accuracies": 0.0, "rewards/chosen": 0.2813659608364105, "rewards/margins": -0.6243927478790283, "rewards/rejected": 0.9057586789131165, "step": 1512 }, { "epoch": 0.25, "learning_rate": 9.776034126321429e-07, "logits/chosen": -0.4643614590167999, "logits/rejected": -0.4273837208747864, "logps/chosen": -77.9986572265625, "logps/rejected": -30.64627456665039, "loss": 0.5935, "rewards/accuracies": 0.0, "rewards/chosen": 0.3519333004951477, "rewards/margins": -0.12574461102485657, "rewards/rejected": 0.4776779115200043, "step": 1513 }, { "epoch": 0.25, "learning_rate": 9.775645022270446e-07, "logits/chosen": -0.47062647342681885, "logits/rejected": -0.37922632694244385, "logps/chosen": -78.93849182128906, "logps/rejected": -51.778263092041016, "loss": 0.4634, "rewards/accuracies": 0.0, "rewards/chosen": 1.4069626331329346, "rewards/margins": -0.12520480155944824, "rewards/rejected": 1.5321674346923828, "step": 1514 }, { "epoch": 0.25, "learning_rate": 9.77525558826758e-07, "logits/chosen": -0.29650771617889404, "logits/rejected": -0.31331318616867065, "logps/chosen": -6.014074802398682, "logps/rejected": -2.762266159057617, "loss": 0.7564, "rewards/accuracies": 0.0, "rewards/chosen": 0.3168505132198334, "rewards/margins": -0.2111588418483734, "rewards/rejected": 0.5280093550682068, "step": 1515 }, { "epoch": 0.25, "learning_rate": 9.774865824339737e-07, "logits/chosen": -0.21746313571929932, "logits/rejected": -0.21746313571929932, "logps/chosen": -92.26873779296875, "logps/rejected": -92.26873779296875, "loss": 1.4599, "rewards/accuracies": 0.0, "rewards/chosen": 1.8561553955078125, "rewards/margins": 0.0, "rewards/rejected": 1.8561553955078125, "step": 1516 }, { "epoch": 0.25, "learning_rate": 9.774475730513847e-07, "logits/chosen": -0.6854605674743652, "logits/rejected": -0.7333911061286926, "logps/chosen": -127.459228515625, "logps/rejected": -125.89495849609375, "loss": 1.5976, "rewards/accuracies": 0.0, "rewards/chosen": 0.04712829738855362, "rewards/margins": -2.6154725551605225, "rewards/rejected": 2.6626007556915283, "step": 1517 }, { "epoch": 0.25, "learning_rate": 9.774085306816857e-07, "logits/chosen": -0.44247370958328247, "logits/rejected": -0.3963192403316498, "logps/chosen": -78.49817657470703, "logps/rejected": -17.83580780029297, "loss": 1.1285, "rewards/accuracies": 0.0, "rewards/chosen": 0.17785035073757172, "rewards/margins": -0.06265811622142792, "rewards/rejected": 0.24050846695899963, "step": 1518 }, { "epoch": 0.25, "learning_rate": 9.77369455327575e-07, "logits/chosen": -0.3872547149658203, "logits/rejected": -0.33817195892333984, "logps/chosen": -55.58305740356445, "logps/rejected": -51.881263732910156, "loss": 0.8358, "rewards/accuracies": 0.0, "rewards/chosen": 0.4281322658061981, "rewards/margins": -0.11853674054145813, "rewards/rejected": 0.5466690063476562, "step": 1519 }, { "epoch": 0.25, "learning_rate": 9.773303469917514e-07, "logits/chosen": -0.22770552337169647, "logits/rejected": -0.22770552337169647, "logps/chosen": -101.8804931640625, "logps/rejected": -101.8804931640625, "loss": 0.4849, "rewards/accuracies": 0.0, "rewards/chosen": 0.19342041015625, "rewards/margins": 0.0, "rewards/rejected": 0.19342041015625, "step": 1520 }, { "epoch": 0.25, "learning_rate": 9.772912056769175e-07, "logits/chosen": -0.6957395672798157, "logits/rejected": -0.6965112090110779, "logps/chosen": -73.8929443359375, "logps/rejected": -95.10118865966797, "loss": 1.5229, "rewards/accuracies": 1.0, "rewards/chosen": 1.6398757696151733, "rewards/margins": 0.21211159229278564, "rewards/rejected": 1.4277641773223877, "step": 1521 }, { "epoch": 0.25, "learning_rate": 9.772520313857775e-07, "logits/chosen": -0.6035584211349487, "logits/rejected": -0.6398181319236755, "logps/chosen": -177.66664123535156, "logps/rejected": -120.52739715576172, "loss": 0.3045, "rewards/accuracies": 1.0, "rewards/chosen": 1.7308365106582642, "rewards/margins": 0.46224141120910645, "rewards/rejected": 1.2685950994491577, "step": 1522 }, { "epoch": 0.25, "learning_rate": 9.77212824121038e-07, "logits/chosen": -0.9360856413841248, "logits/rejected": -0.92127925157547, "logps/chosen": -238.69692993164062, "logps/rejected": -150.67288208007812, "loss": 0.8804, "rewards/accuracies": 0.0, "rewards/chosen": 1.5891830921173096, "rewards/margins": -1.270124912261963, "rewards/rejected": 2.8593080043792725, "step": 1523 }, { "epoch": 0.25, "learning_rate": 9.771735838854077e-07, "logits/chosen": -0.6244069933891296, "logits/rejected": -0.5910523533821106, "logps/chosen": -78.84954833984375, "logps/rejected": -56.425315856933594, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 1.5299484729766846, "rewards/margins": 0.4328979253768921, "rewards/rejected": 1.0970505475997925, "step": 1524 }, { "epoch": 0.25, "learning_rate": 9.77134310681598e-07, "logits/chosen": 0.17675258219242096, "logits/rejected": 0.19406765699386597, "logps/chosen": -4.640444278717041, "logps/rejected": -6.030031681060791, "loss": 0.4186, "rewards/accuracies": 0.0, "rewards/chosen": 0.02326378785073757, "rewards/margins": -0.24780312180519104, "rewards/rejected": 0.27106690406799316, "step": 1525 }, { "epoch": 0.25, "learning_rate": 9.770950045123218e-07, "logits/chosen": 0.11484010517597198, "logits/rejected": 0.11484010517597198, "logps/chosen": -6.651671409606934, "logps/rejected": -6.651671409606934, "loss": 0.6393, "rewards/accuracies": 0.0, "rewards/chosen": 0.13522911071777344, "rewards/margins": 0.0, "rewards/rejected": 0.13522911071777344, "step": 1526 }, { "epoch": 0.25, "learning_rate": 9.770556653802953e-07, "logits/chosen": -0.3122521638870239, "logits/rejected": -0.3122521638870239, "logps/chosen": -104.8798599243164, "logps/rejected": -104.8798599243164, "loss": 0.7578, "rewards/accuracies": 0.0, "rewards/chosen": 1.1331154108047485, "rewards/margins": 0.0, "rewards/rejected": 1.1331154108047485, "step": 1527 }, { "epoch": 0.25, "learning_rate": 9.770162932882363e-07, "logits/chosen": -0.45019909739494324, "logits/rejected": -0.4146522879600525, "logps/chosen": -68.70742797851562, "logps/rejected": -121.78001403808594, "loss": 0.8574, "rewards/accuracies": 1.0, "rewards/chosen": 0.2680343687534332, "rewards/margins": 0.5535300970077515, "rewards/rejected": -0.2854957580566406, "step": 1528 }, { "epoch": 0.25, "learning_rate": 9.769768882388647e-07, "logits/chosen": -0.45273616909980774, "logits/rejected": -0.44173750281333923, "logps/chosen": -78.61544799804688, "logps/rejected": -40.61370086669922, "loss": 0.8928, "rewards/accuracies": 0.0, "rewards/chosen": 0.3175247311592102, "rewards/margins": -0.9730743765830994, "rewards/rejected": 1.2905991077423096, "step": 1529 }, { "epoch": 0.25, "learning_rate": 9.769374502349036e-07, "logits/chosen": -0.7201471328735352, "logits/rejected": -0.663252592086792, "logps/chosen": -136.6280059814453, "logps/rejected": -73.02494812011719, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.3420669734477997, "rewards/margins": 0.022509008646011353, "rewards/rejected": 0.31955796480178833, "step": 1530 }, { "epoch": 0.25, "learning_rate": 9.768979792790776e-07, "logits/chosen": -0.7096880078315735, "logits/rejected": -0.6549328565597534, "logps/chosen": -53.73686218261719, "logps/rejected": -116.17359161376953, "loss": 1.0827, "rewards/accuracies": 0.0, "rewards/chosen": 1.3280220031738281, "rewards/margins": -0.25384676456451416, "rewards/rejected": 1.5818687677383423, "step": 1531 }, { "epoch": 0.25, "learning_rate": 9.768584753741134e-07, "logits/chosen": -0.18489579856395721, "logits/rejected": -0.18489579856395721, "logps/chosen": -37.66487121582031, "logps/rejected": -37.66487121582031, "loss": 0.3941, "rewards/accuracies": 0.0, "rewards/chosen": 0.006900024600327015, "rewards/margins": 0.0, "rewards/rejected": 0.006900024600327015, "step": 1532 }, { "epoch": 0.25, "learning_rate": 9.768189385227409e-07, "logits/chosen": -0.5759722590446472, "logits/rejected": -0.5841638445854187, "logps/chosen": -100.1788330078125, "logps/rejected": -71.4183349609375, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 2.015911817550659, "rewards/margins": 1.5755844116210938, "rewards/rejected": 0.4403274655342102, "step": 1533 }, { "epoch": 0.25, "learning_rate": 9.767793687276911e-07, "logits/chosen": -0.6487188935279846, "logits/rejected": -0.5468915104866028, "logps/chosen": -85.99002838134766, "logps/rejected": -74.26309967041016, "loss": 0.7148, "rewards/accuracies": 0.0, "rewards/chosen": 1.2251518964767456, "rewards/margins": -0.08475720882415771, "rewards/rejected": 1.3099091053009033, "step": 1534 }, { "epoch": 0.25, "learning_rate": 9.767397659916986e-07, "logits/chosen": -0.13650940358638763, "logits/rejected": -0.1629430055618286, "logps/chosen": -62.465877532958984, "logps/rejected": -123.43598175048828, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": 1.0130817890167236, "rewards/margins": 1.6798657178878784, "rewards/rejected": -0.6667839288711548, "step": 1535 }, { "epoch": 0.25, "learning_rate": 9.76700130317499e-07, "logits/chosen": -0.4074555039405823, "logits/rejected": -0.47018685936927795, "logps/chosen": -176.23162841796875, "logps/rejected": -116.10546875, "loss": 1.0542, "rewards/accuracies": 1.0, "rewards/chosen": 2.4817168712615967, "rewards/margins": 0.4583768844604492, "rewards/rejected": 2.0233399868011475, "step": 1536 }, { "epoch": 0.25, "learning_rate": 9.76660461707831e-07, "logits/chosen": -0.45521974563598633, "logits/rejected": -0.33921390771865845, "logps/chosen": -140.94422912597656, "logps/rejected": -83.59017181396484, "loss": 0.8286, "rewards/accuracies": 0.0, "rewards/chosen": 0.7926956415176392, "rewards/margins": -0.7918983697891235, "rewards/rejected": 1.5845940113067627, "step": 1537 }, { "epoch": 0.25, "learning_rate": 9.766207601654355e-07, "logits/chosen": -0.20922985672950745, "logits/rejected": -0.16102108359336853, "logps/chosen": -171.1466064453125, "logps/rejected": -129.59165954589844, "loss": 0.638, "rewards/accuracies": 1.0, "rewards/chosen": 2.4299285411834717, "rewards/margins": 2.275836229324341, "rewards/rejected": 0.15409241616725922, "step": 1538 }, { "epoch": 0.25, "learning_rate": 9.76581025693055e-07, "logits/chosen": -0.5151697397232056, "logits/rejected": -0.5144970417022705, "logps/chosen": -55.410282135009766, "logps/rejected": -62.63957214355469, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": 1.710204005241394, "rewards/margins": 0.3845677375793457, "rewards/rejected": 1.3256362676620483, "step": 1539 }, { "epoch": 0.25, "learning_rate": 9.765412582934353e-07, "logits/chosen": -0.5781236290931702, "logits/rejected": -0.5459142327308655, "logps/chosen": -46.77259063720703, "logps/rejected": -27.085233688354492, "loss": 1.4955, "rewards/accuracies": 0.0, "rewards/chosen": 0.057233430445194244, "rewards/margins": -0.4828931987285614, "rewards/rejected": 0.540126621723175, "step": 1540 }, { "epoch": 0.25, "learning_rate": 9.765014579693237e-07, "logits/chosen": -0.5309715270996094, "logits/rejected": -0.205574631690979, "logps/chosen": -159.49664306640625, "logps/rejected": -240.23870849609375, "loss": 0.6895, "rewards/accuracies": 0.0, "rewards/chosen": 2.5841386318206787, "rewards/margins": -0.7034957408905029, "rewards/rejected": 3.2876343727111816, "step": 1541 }, { "epoch": 0.25, "learning_rate": 9.764616247234701e-07, "logits/chosen": -0.3914860486984253, "logits/rejected": -0.38865336775779724, "logps/chosen": -4.1370110511779785, "logps/rejected": -5.422585964202881, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.27589789032936096, "rewards/margins": 0.027959778904914856, "rewards/rejected": 0.2479381114244461, "step": 1542 }, { "epoch": 0.25, "learning_rate": 9.764217585586263e-07, "logits/chosen": -0.3632132112979889, "logits/rejected": -0.3650133013725281, "logps/chosen": -106.20649719238281, "logps/rejected": -80.33412170410156, "loss": 0.7798, "rewards/accuracies": 0.0, "rewards/chosen": -0.5335617065429688, "rewards/margins": -0.66973876953125, "rewards/rejected": 0.13617706298828125, "step": 1543 }, { "epoch": 0.25, "learning_rate": 9.763818594775473e-07, "logits/chosen": -0.41730281710624695, "logits/rejected": -0.29301348328590393, "logps/chosen": -211.15835571289062, "logps/rejected": -204.2930145263672, "loss": 1.6427, "rewards/accuracies": 0.0, "rewards/chosen": 1.8336257934570312, "rewards/margins": -1.0970795154571533, "rewards/rejected": 2.9307053089141846, "step": 1544 }, { "epoch": 0.25, "learning_rate": 9.763419274829892e-07, "logits/chosen": -0.23395615816116333, "logits/rejected": -0.20414675772190094, "logps/chosen": -53.5139274597168, "logps/rejected": -104.30661010742188, "loss": 1.1329, "rewards/accuracies": 0.0, "rewards/chosen": 0.7024604678153992, "rewards/margins": -1.9384651184082031, "rewards/rejected": 2.640925645828247, "step": 1545 }, { "epoch": 0.25, "learning_rate": 9.76301962577711e-07, "logits/chosen": -0.18769681453704834, "logits/rejected": -0.1938783973455429, "logps/chosen": -4.502137660980225, "logps/rejected": -3.6743969917297363, "loss": 0.6235, "rewards/accuracies": 0.0, "rewards/chosen": 0.10321507602930069, "rewards/margins": -0.12722283601760864, "rewards/rejected": 0.23043790459632874, "step": 1546 }, { "epoch": 0.25, "learning_rate": 9.76261964764474e-07, "logits/chosen": -0.2643645405769348, "logits/rejected": -0.27415159344673157, "logps/chosen": -84.28512573242188, "logps/rejected": -67.01165771484375, "loss": 0.6599, "rewards/accuracies": 0.0, "rewards/chosen": 0.7835022211074829, "rewards/margins": -0.3973572254180908, "rewards/rejected": 1.1808594465255737, "step": 1547 }, { "epoch": 0.25, "learning_rate": 9.762219340460418e-07, "logits/chosen": -0.3828331530094147, "logits/rejected": -0.37987202405929565, "logps/chosen": -63.4718132019043, "logps/rejected": -83.01774597167969, "loss": 0.8741, "rewards/accuracies": 0.0, "rewards/chosen": 1.2076740264892578, "rewards/margins": -0.19597280025482178, "rewards/rejected": 1.4036468267440796, "step": 1548 }, { "epoch": 0.25, "learning_rate": 9.7618187042518e-07, "logits/chosen": -1.515047550201416, "logits/rejected": -0.8233029246330261, "logps/chosen": -113.27454376220703, "logps/rejected": -65.55593872070312, "loss": 0.9595, "rewards/accuracies": 0.0, "rewards/chosen": 0.5929481387138367, "rewards/margins": -0.573236882686615, "rewards/rejected": 1.1661850214004517, "step": 1549 }, { "epoch": 0.25, "learning_rate": 9.761417739046565e-07, "logits/chosen": -0.30832475423812866, "logits/rejected": -0.2563970983028412, "logps/chosen": -60.98678207397461, "logps/rejected": -85.5921401977539, "loss": 0.4688, "rewards/accuracies": 1.0, "rewards/chosen": 1.3312962055206299, "rewards/margins": 0.6590633988380432, "rewards/rejected": 0.6722328066825867, "step": 1550 }, { "epoch": 0.25, "learning_rate": 9.761016444872418e-07, "logits/chosen": -0.41163185238838196, "logits/rejected": -0.4109170138835907, "logps/chosen": -61.563507080078125, "logps/rejected": -160.23641967773438, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 0.7750518918037415, "rewards/margins": 0.26848143339157104, "rewards/rejected": 0.5065704584121704, "step": 1551 }, { "epoch": 0.25, "learning_rate": 9.760614821757084e-07, "logits/chosen": -0.5076546669006348, "logits/rejected": -0.45859822630882263, "logps/chosen": -65.7236328125, "logps/rejected": -62.557064056396484, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 1.2888901233673096, "rewards/margins": 0.03581047058105469, "rewards/rejected": 1.2530796527862549, "step": 1552 }, { "epoch": 0.25, "learning_rate": 9.76021286972831e-07, "logits/chosen": -0.7819964289665222, "logits/rejected": -0.6561844348907471, "logps/chosen": -152.0945587158203, "logps/rejected": -158.66305541992188, "loss": 1.0093, "rewards/accuracies": 1.0, "rewards/chosen": 3.0409257411956787, "rewards/margins": 0.2882828712463379, "rewards/rejected": 2.752642869949341, "step": 1553 }, { "epoch": 0.25, "learning_rate": 9.75981058881387e-07, "logits/chosen": -0.5673351287841797, "logits/rejected": -0.4925590753555298, "logps/chosen": -104.81269073486328, "logps/rejected": -146.40286254882812, "loss": 0.3177, "rewards/accuracies": 1.0, "rewards/chosen": 2.2803475856781006, "rewards/margins": 1.2212319374084473, "rewards/rejected": 1.0591156482696533, "step": 1554 }, { "epoch": 0.25, "learning_rate": 9.759407979041557e-07, "logits/chosen": -0.18139822781085968, "logits/rejected": -0.18139822781085968, "logps/chosen": -3.3211357593536377, "logps/rejected": -3.3211357593536377, "loss": 0.6581, "rewards/accuracies": 0.0, "rewards/chosen": 0.09376294910907745, "rewards/margins": 0.0, "rewards/rejected": 0.09376294910907745, "step": 1555 }, { "epoch": 0.25, "learning_rate": 9.759005040439184e-07, "logits/chosen": -0.006277901120483875, "logits/rejected": -0.0015768917510285974, "logps/chosen": -92.37237548828125, "logps/rejected": -51.588253021240234, "loss": 0.5441, "rewards/accuracies": 0.0, "rewards/chosen": 0.9434967041015625, "rewards/margins": -0.08528256416320801, "rewards/rejected": 1.0287792682647705, "step": 1556 }, { "epoch": 0.25, "learning_rate": 9.758601773034594e-07, "logits/chosen": -0.3677695095539093, "logits/rejected": -0.3379448652267456, "logps/chosen": -69.49726104736328, "logps/rejected": -79.06939697265625, "loss": 0.871, "rewards/accuracies": 0.0, "rewards/chosen": 0.8283492922782898, "rewards/margins": -0.055898308753967285, "rewards/rejected": 0.8842476010322571, "step": 1557 }, { "epoch": 0.25, "learning_rate": 9.758198176855646e-07, "logits/chosen": -0.8137792348861694, "logits/rejected": -0.6195169687271118, "logps/chosen": -90.14152526855469, "logps/rejected": -98.28060913085938, "loss": 0.8719, "rewards/accuracies": 1.0, "rewards/chosen": 2.830249071121216, "rewards/margins": 1.2945419549942017, "rewards/rejected": 1.5357071161270142, "step": 1558 }, { "epoch": 0.25, "learning_rate": 9.75779425193023e-07, "logits/chosen": -0.6978669762611389, "logits/rejected": -0.6488758325576782, "logps/chosen": -95.86358642578125, "logps/rejected": -122.88294219970703, "loss": 0.2678, "rewards/accuracies": 1.0, "rewards/chosen": 2.863389730453491, "rewards/margins": 0.8178093433380127, "rewards/rejected": 2.0455803871154785, "step": 1559 }, { "epoch": 0.25, "learning_rate": 9.757389998286245e-07, "logits/chosen": -0.6092339158058167, "logits/rejected": -0.5380819439888, "logps/chosen": -70.25691986083984, "logps/rejected": -63.680458068847656, "loss": 0.6655, "rewards/accuracies": 0.0, "rewards/chosen": 0.22215043008327484, "rewards/margins": -0.44908446073532104, "rewards/rejected": 0.6712349057197571, "step": 1560 }, { "epoch": 0.25, "learning_rate": 9.75698541595163e-07, "logits/chosen": -0.45213380455970764, "logits/rejected": -0.4374169409275055, "logps/chosen": -84.8198013305664, "logps/rejected": -66.09342956542969, "loss": 0.7528, "rewards/accuracies": 1.0, "rewards/chosen": 0.6896354556083679, "rewards/margins": 0.1894690990447998, "rewards/rejected": 0.5001663565635681, "step": 1561 }, { "epoch": 0.25, "learning_rate": 9.756580504954333e-07, "logits/chosen": -0.47851046919822693, "logits/rejected": -0.45938655734062195, "logps/chosen": -51.944122314453125, "logps/rejected": -68.63111877441406, "loss": 0.77, "rewards/accuracies": 1.0, "rewards/chosen": 1.1805340051651, "rewards/margins": 0.3951045870780945, "rewards/rejected": 0.7854294180870056, "step": 1562 }, { "epoch": 0.25, "learning_rate": 9.75617526532233e-07, "logits/chosen": -0.5019930601119995, "logits/rejected": -0.4066154658794403, "logps/chosen": -63.163719177246094, "logps/rejected": -86.72930908203125, "loss": 1.2561, "rewards/accuracies": 0.0, "rewards/chosen": 0.7992798089981079, "rewards/margins": -1.4979225397109985, "rewards/rejected": 2.2972023487091064, "step": 1563 }, { "epoch": 0.25, "learning_rate": 9.755769697083618e-07, "logits/chosen": -0.6712477803230286, "logits/rejected": -0.6534830927848816, "logps/chosen": -54.039039611816406, "logps/rejected": -64.09947204589844, "loss": 0.5928, "rewards/accuracies": 0.0, "rewards/chosen": 0.4277504086494446, "rewards/margins": -0.09962844848632812, "rewards/rejected": 0.5273788571357727, "step": 1564 }, { "epoch": 0.25, "learning_rate": 9.75536380026622e-07, "logits/chosen": -0.559796929359436, "logits/rejected": -0.49527406692504883, "logps/chosen": -201.0487518310547, "logps/rejected": -129.85366821289062, "loss": 1.3556, "rewards/accuracies": 0.0, "rewards/chosen": 2.202558994293213, "rewards/margins": -2.1641173362731934, "rewards/rejected": 4.366676330566406, "step": 1565 }, { "epoch": 0.25, "learning_rate": 9.754957574898182e-07, "logits/chosen": -0.497792512178421, "logits/rejected": -0.5081384181976318, "logps/chosen": -56.47799301147461, "logps/rejected": -101.33277130126953, "loss": 1.1655, "rewards/accuracies": 0.0, "rewards/chosen": 1.5244678258895874, "rewards/margins": -1.4293538331985474, "rewards/rejected": 2.9538216590881348, "step": 1566 }, { "epoch": 0.25, "learning_rate": 9.754551021007565e-07, "logits/chosen": -0.6910521984100342, "logits/rejected": -1.1102925539016724, "logps/chosen": -109.78024291992188, "logps/rejected": -37.834869384765625, "loss": 0.8083, "rewards/accuracies": 1.0, "rewards/chosen": 0.985058605670929, "rewards/margins": 0.7987518310546875, "rewards/rejected": 0.18630675971508026, "step": 1567 }, { "epoch": 0.25, "learning_rate": 9.75414413862246e-07, "logits/chosen": -0.6021013855934143, "logits/rejected": -0.6583996415138245, "logps/chosen": -88.44029235839844, "logps/rejected": -146.9351806640625, "loss": 1.5837, "rewards/accuracies": 0.0, "rewards/chosen": 0.902178943157196, "rewards/margins": -1.1985869407653809, "rewards/rejected": 2.1007659435272217, "step": 1568 }, { "epoch": 0.25, "learning_rate": 9.753736927770982e-07, "logits/chosen": -0.5732123255729675, "logits/rejected": -0.5735555291175842, "logps/chosen": -182.98751831054688, "logps/rejected": -88.71024322509766, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 0.551409900188446, "rewards/margins": 0.3658859133720398, "rewards/rejected": 0.18552398681640625, "step": 1569 }, { "epoch": 0.25, "learning_rate": 9.753329388481259e-07, "logits/chosen": -0.764689028263092, "logits/rejected": -0.6740744709968567, "logps/chosen": -183.75497436523438, "logps/rejected": -132.5806884765625, "loss": 0.7764, "rewards/accuracies": 0.0, "rewards/chosen": 0.870758056640625, "rewards/margins": -1.0162323713302612, "rewards/rejected": 1.8869904279708862, "step": 1570 }, { "epoch": 0.25, "learning_rate": 9.752921520781453e-07, "logits/chosen": -0.6730920076370239, "logits/rejected": -0.6563119888305664, "logps/chosen": -139.84127807617188, "logps/rejected": -157.46121215820312, "loss": 0.5279, "rewards/accuracies": 0.0, "rewards/chosen": 0.759356677532196, "rewards/margins": -0.0021576285362243652, "rewards/rejected": 0.7615143060684204, "step": 1571 }, { "epoch": 0.26, "learning_rate": 9.752513324699742e-07, "logits/chosen": -0.40669843554496765, "logits/rejected": -0.39991122484207153, "logps/chosen": -2.5145668983459473, "logps/rejected": -17.52561378479004, "loss": 0.3854, "rewards/accuracies": 1.0, "rewards/chosen": 0.170098215341568, "rewards/margins": 0.17984648048877716, "rewards/rejected": -0.009748267941176891, "step": 1572 }, { "epoch": 0.26, "learning_rate": 9.75210480026433e-07, "logits/chosen": -0.23397737741470337, "logits/rejected": -0.1553153693675995, "logps/chosen": -93.980712890625, "logps/rejected": -18.66645622253418, "loss": 0.6158, "rewards/accuracies": 0.0, "rewards/chosen": -0.018708039075136185, "rewards/margins": -0.34449502825737, "rewards/rejected": 0.3257869780063629, "step": 1573 }, { "epoch": 0.26, "learning_rate": 9.751695947503442e-07, "logits/chosen": -0.2628459334373474, "logits/rejected": -0.2524999976158142, "logps/chosen": -53.865089416503906, "logps/rejected": -66.55081176757812, "loss": 1.0013, "rewards/accuracies": 0.0, "rewards/chosen": 0.33385545015335083, "rewards/margins": -0.41546404361724854, "rewards/rejected": 0.7493194937705994, "step": 1574 }, { "epoch": 0.26, "learning_rate": 9.751286766445322e-07, "logits/chosen": -0.6573001742362976, "logits/rejected": -0.6801421046257019, "logps/chosen": -71.13493347167969, "logps/rejected": -25.153615951538086, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 0.472360223531723, "rewards/margins": 0.38877999782562256, "rewards/rejected": 0.08358021080493927, "step": 1575 }, { "epoch": 0.26, "learning_rate": 9.750877257118247e-07, "logits/chosen": -0.5193739533424377, "logits/rejected": -0.4705520272254944, "logps/chosen": -74.83648681640625, "logps/rejected": -105.16567993164062, "loss": 0.558, "rewards/accuracies": 0.0, "rewards/chosen": 0.17880859971046448, "rewards/margins": -0.2700973451137543, "rewards/rejected": 0.44890594482421875, "step": 1576 }, { "epoch": 0.26, "learning_rate": 9.750467419550504e-07, "logits/chosen": -1.1187078952789307, "logits/rejected": -1.1255439519882202, "logps/chosen": -157.48654174804688, "logps/rejected": -48.83838653564453, "loss": 1.0881, "rewards/accuracies": 0.0, "rewards/chosen": 0.5414581298828125, "rewards/margins": -1.134107232093811, "rewards/rejected": 1.6755653619766235, "step": 1577 }, { "epoch": 0.26, "learning_rate": 9.750057253770411e-07, "logits/chosen": -0.42330875992774963, "logits/rejected": -0.42330875992774963, "logps/chosen": -23.16845703125, "logps/rejected": -23.16845703125, "loss": 0.3754, "rewards/accuracies": 0.0, "rewards/chosen": 1.1856567859649658, "rewards/margins": 0.0, "rewards/rejected": 1.1856567859649658, "step": 1578 }, { "epoch": 0.26, "learning_rate": 9.74964675980631e-07, "logits/chosen": -0.4641115069389343, "logits/rejected": -0.4641115069389343, "logps/chosen": -33.23194885253906, "logps/rejected": -33.23194885253906, "loss": 0.8811, "rewards/accuracies": 0.0, "rewards/chosen": 1.0141304731369019, "rewards/margins": 0.0, "rewards/rejected": 1.0141304731369019, "step": 1579 }, { "epoch": 0.26, "learning_rate": 9.749235937686558e-07, "logits/chosen": -0.316628634929657, "logits/rejected": -0.2478368580341339, "logps/chosen": -50.81137466430664, "logps/rejected": -83.0827865600586, "loss": 0.4267, "rewards/accuracies": 0.0, "rewards/chosen": 1.33977472782135, "rewards/margins": -0.12291145324707031, "rewards/rejected": 1.4626861810684204, "step": 1580 }, { "epoch": 0.26, "learning_rate": 9.74882478743954e-07, "logits/chosen": -0.7232614755630493, "logits/rejected": -0.7741268277168274, "logps/chosen": -116.93151092529297, "logps/rejected": -107.77496337890625, "loss": 1.8274, "rewards/accuracies": 0.0, "rewards/chosen": 0.2303978055715561, "rewards/margins": -2.908510684967041, "rewards/rejected": 3.1389083862304688, "step": 1581 }, { "epoch": 0.26, "learning_rate": 9.748413309093665e-07, "logits/chosen": -0.4356025457382202, "logits/rejected": -0.2830873727798462, "logps/chosen": -152.89212036132812, "logps/rejected": -62.688629150390625, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 2.6878631114959717, "rewards/margins": 0.7619979381561279, "rewards/rejected": 1.9258651733398438, "step": 1582 }, { "epoch": 0.26, "learning_rate": 9.74800150267736e-07, "logits/chosen": -0.5936979651451111, "logits/rejected": -0.6083076596260071, "logps/chosen": -47.06806182861328, "logps/rejected": -35.67849349975586, "loss": 0.797, "rewards/accuracies": 1.0, "rewards/chosen": 1.653036117553711, "rewards/margins": 0.3589862585067749, "rewards/rejected": 1.294049859046936, "step": 1583 }, { "epoch": 0.26, "learning_rate": 9.747589368219075e-07, "logits/chosen": -0.4916575253009796, "logits/rejected": -0.48794251680374146, "logps/chosen": -26.149410247802734, "logps/rejected": -37.115272521972656, "loss": 0.5238, "rewards/accuracies": 0.0, "rewards/chosen": 0.023389434441924095, "rewards/margins": -0.015480807051062584, "rewards/rejected": 0.03887024149298668, "step": 1584 }, { "epoch": 0.26, "learning_rate": 9.747176905747288e-07, "logits/chosen": -0.5192253589630127, "logits/rejected": -0.5125957131385803, "logps/chosen": -66.27587890625, "logps/rejected": -57.208648681640625, "loss": 0.6769, "rewards/accuracies": 0.0, "rewards/chosen": 1.6036865711212158, "rewards/margins": -0.21775352954864502, "rewards/rejected": 1.8214401006698608, "step": 1585 }, { "epoch": 0.26, "learning_rate": 9.746764115290494e-07, "logits/chosen": -0.22345130145549774, "logits/rejected": -0.221981480717659, "logps/chosen": -3.1813883781433105, "logps/rejected": -14.455034255981445, "loss": 0.767, "rewards/accuracies": 0.0, "rewards/chosen": 0.19724583625793457, "rewards/margins": -0.10597378015518188, "rewards/rejected": 0.30321961641311646, "step": 1586 }, { "epoch": 0.26, "learning_rate": 9.746350996877214e-07, "logits/chosen": -0.5281963348388672, "logits/rejected": -0.455442875623703, "logps/chosen": -98.68379211425781, "logps/rejected": -96.31092834472656, "loss": 0.9844, "rewards/accuracies": 1.0, "rewards/chosen": 1.7018768787384033, "rewards/margins": 0.03622746467590332, "rewards/rejected": 1.6656494140625, "step": 1587 }, { "epoch": 0.26, "learning_rate": 9.745937550535992e-07, "logits/chosen": -0.47097769379615784, "logits/rejected": -0.47174233198165894, "logps/chosen": -78.75831604003906, "logps/rejected": -71.9536361694336, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 1.7688149213790894, "rewards/margins": 0.2191758155822754, "rewards/rejected": 1.549639105796814, "step": 1588 }, { "epoch": 0.26, "learning_rate": 9.745523776295393e-07, "logits/chosen": -1.204740285873413, "logits/rejected": -1.253579020500183, "logps/chosen": -59.96159362792969, "logps/rejected": -33.15565872192383, "loss": 0.924, "rewards/accuracies": 1.0, "rewards/chosen": 1.310376763343811, "rewards/margins": 0.6653491854667664, "rewards/rejected": 0.6450275778770447, "step": 1589 }, { "epoch": 0.26, "learning_rate": 9.745109674184e-07, "logits/chosen": -0.44405484199523926, "logits/rejected": -0.4965492784976959, "logps/chosen": -70.67008972167969, "logps/rejected": -128.0760040283203, "loss": 0.3277, "rewards/accuracies": 1.0, "rewards/chosen": 1.868952989578247, "rewards/margins": 0.2811065912246704, "rewards/rejected": 1.5878463983535767, "step": 1590 }, { "epoch": 0.26, "learning_rate": 9.74469524423043e-07, "logits/chosen": -0.3558933138847351, "logits/rejected": -0.4029240310192108, "logps/chosen": -105.37228393554688, "logps/rejected": -64.41987609863281, "loss": 1.8503, "rewards/accuracies": 0.0, "rewards/chosen": 0.3932235836982727, "rewards/margins": -1.5427460670471191, "rewards/rejected": 1.935969591140747, "step": 1591 }, { "epoch": 0.26, "learning_rate": 9.744280486463313e-07, "logits/chosen": -0.5571030378341675, "logits/rejected": -0.5148847699165344, "logps/chosen": -66.70005798339844, "logps/rejected": -107.46568298339844, "loss": 0.2993, "rewards/accuracies": 1.0, "rewards/chosen": 0.6339462399482727, "rewards/margins": 0.4391922056674957, "rewards/rejected": 0.19475403428077698, "step": 1592 }, { "epoch": 0.26, "learning_rate": 9.743865400911304e-07, "logits/chosen": -0.342391699552536, "logits/rejected": -0.2849225401878357, "logps/chosen": -73.4927978515625, "logps/rejected": -81.47750091552734, "loss": 0.7031, "rewards/accuracies": 1.0, "rewards/chosen": 1.7214279174804688, "rewards/margins": 0.5417205095291138, "rewards/rejected": 1.179707407951355, "step": 1593 }, { "epoch": 0.26, "learning_rate": 9.74344998760308e-07, "logits/chosen": -0.6905626058578491, "logits/rejected": -0.6911499500274658, "logps/chosen": -134.5774688720703, "logps/rejected": -104.84654235839844, "loss": 0.4174, "rewards/accuracies": 1.0, "rewards/chosen": 0.4337020814418793, "rewards/margins": 0.13367155194282532, "rewards/rejected": 0.30003052949905396, "step": 1594 }, { "epoch": 0.26, "learning_rate": 9.74303424656735e-07, "logits/chosen": -0.4077971875667572, "logits/rejected": -0.3166925013065338, "logps/chosen": -45.82028579711914, "logps/rejected": -54.341651916503906, "loss": 0.7966, "rewards/accuracies": 0.0, "rewards/chosen": 0.9310612082481384, "rewards/margins": -0.4354396462440491, "rewards/rejected": 1.3665008544921875, "step": 1595 }, { "epoch": 0.26, "learning_rate": 9.74261817783283e-07, "logits/chosen": -0.2861189842224121, "logits/rejected": -0.253286749124527, "logps/chosen": -77.75668334960938, "logps/rejected": -38.83715057373047, "loss": 0.5202, "rewards/accuracies": 1.0, "rewards/chosen": 0.3430374264717102, "rewards/margins": 0.08106383681297302, "rewards/rejected": 0.2619735896587372, "step": 1596 }, { "epoch": 0.26, "learning_rate": 9.74220178142827e-07, "logits/chosen": -0.6133451461791992, "logits/rejected": -0.5534877181053162, "logps/chosen": -63.95663070678711, "logps/rejected": -39.49907302856445, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 1.6198261976242065, "rewards/margins": 0.2895618677139282, "rewards/rejected": 1.3302643299102783, "step": 1597 }, { "epoch": 0.26, "learning_rate": 9.741785057382436e-07, "logits/chosen": -0.5507533550262451, "logits/rejected": -0.5136104226112366, "logps/chosen": -70.6592788696289, "logps/rejected": -73.5941162109375, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/chosen": 1.4338287115097046, "rewards/margins": 0.35390615463256836, "rewards/rejected": 1.0799225568771362, "step": 1598 }, { "epoch": 0.26, "learning_rate": 9.741368005724124e-07, "logits/chosen": -0.38834741711616516, "logits/rejected": -0.29887282848358154, "logps/chosen": -110.27291870117188, "logps/rejected": -45.02895736694336, "loss": 0.9929, "rewards/accuracies": 0.0, "rewards/chosen": 0.1491134613752365, "rewards/margins": -0.48548853397369385, "rewards/rejected": 0.6346020102500916, "step": 1599 }, { "epoch": 0.26, "learning_rate": 9.740950626482144e-07, "logits/chosen": -0.47493892908096313, "logits/rejected": -0.47493892908096313, "logps/chosen": -63.71525955200195, "logps/rejected": -63.71525955200195, "loss": 0.3746, "rewards/accuracies": 0.0, "rewards/chosen": 0.7755405306816101, "rewards/margins": 0.0, "rewards/rejected": 0.7755405306816101, "step": 1600 }, { "epoch": 0.26, "learning_rate": 9.740532919685339e-07, "logits/chosen": -0.5987963676452637, "logits/rejected": -0.56800776720047, "logps/chosen": -160.57806396484375, "logps/rejected": -38.37412643432617, "loss": 0.5826, "rewards/accuracies": 1.0, "rewards/chosen": 2.0238754749298096, "rewards/margins": 1.643370509147644, "rewards/rejected": 0.3805049955844879, "step": 1601 }, { "epoch": 0.26, "learning_rate": 9.74011488536256e-07, "logits/chosen": -0.7248682975769043, "logits/rejected": -0.6675673127174377, "logps/chosen": -108.80401611328125, "logps/rejected": -94.1314926147461, "loss": 0.5478, "rewards/accuracies": 0.0, "rewards/chosen": 0.8915023803710938, "rewards/margins": -0.001889050006866455, "rewards/rejected": 0.8933914303779602, "step": 1602 }, { "epoch": 0.26, "learning_rate": 9.739696523542696e-07, "logits/chosen": -0.7136890888214111, "logits/rejected": -0.6738806366920471, "logps/chosen": -253.53073120117188, "logps/rejected": -74.50007629394531, "loss": 0.4331, "rewards/accuracies": 0.0, "rewards/chosen": 1.9806549549102783, "rewards/margins": -0.10995626449584961, "rewards/rejected": 2.090611219406128, "step": 1603 }, { "epoch": 0.26, "learning_rate": 9.739277834254649e-07, "logits/chosen": -0.9713284969329834, "logits/rejected": -0.9055855870246887, "logps/chosen": -117.62899780273438, "logps/rejected": -30.318069458007812, "loss": 0.4988, "rewards/accuracies": 1.0, "rewards/chosen": 0.39654541015625, "rewards/margins": 0.3504818081855774, "rewards/rejected": 0.0460636131465435, "step": 1604 }, { "epoch": 0.26, "learning_rate": 9.738858817527347e-07, "logits/chosen": -0.5023034811019897, "logits/rejected": -0.4429815113544464, "logps/chosen": -225.0129852294922, "logps/rejected": -129.01419067382812, "loss": 0.9086, "rewards/accuracies": 0.0, "rewards/chosen": 1.4668747186660767, "rewards/margins": -0.36957240104675293, "rewards/rejected": 1.8364471197128296, "step": 1605 }, { "epoch": 0.26, "learning_rate": 9.73843947338974e-07, "logits/chosen": -0.11764401197433472, "logits/rejected": -0.1213192492723465, "logps/chosen": -63.15666198730469, "logps/rejected": -92.74004364013672, "loss": 0.3806, "rewards/accuracies": 1.0, "rewards/chosen": 1.2093642950057983, "rewards/margins": 0.12241888046264648, "rewards/rejected": 1.0869454145431519, "step": 1606 }, { "epoch": 0.26, "learning_rate": 9.738019801870802e-07, "logits/chosen": -0.1857784390449524, "logits/rejected": -0.19742892682552338, "logps/chosen": -27.464597702026367, "logps/rejected": -21.674293518066406, "loss": 0.5025, "rewards/accuracies": 0.0, "rewards/chosen": -0.3144233822822571, "rewards/margins": -0.3651107847690582, "rewards/rejected": 0.050687409937381744, "step": 1607 }, { "epoch": 0.26, "learning_rate": 9.737599802999528e-07, "logits/chosen": -0.8070448637008667, "logits/rejected": -0.49881309270858765, "logps/chosen": -64.72879028320312, "logps/rejected": -85.7618408203125, "loss": 0.364, "rewards/accuracies": 1.0, "rewards/chosen": 1.461066484451294, "rewards/margins": 0.5299209952354431, "rewards/rejected": 0.9311454892158508, "step": 1608 }, { "epoch": 0.26, "learning_rate": 9.737179476804932e-07, "logits/chosen": -0.3839636743068695, "logits/rejected": -0.4182986617088318, "logps/chosen": -113.53547668457031, "logps/rejected": -64.50257873535156, "loss": 1.3063, "rewards/accuracies": 0.0, "rewards/chosen": -0.12711487710475922, "rewards/margins": -1.0739364624023438, "rewards/rejected": 0.9468216300010681, "step": 1609 }, { "epoch": 0.26, "learning_rate": 9.73675882331606e-07, "logits/chosen": -0.26528042554855347, "logits/rejected": -0.2257416993379593, "logps/chosen": -7.400271892547607, "logps/rejected": -18.5943603515625, "loss": 0.3729, "rewards/accuracies": 1.0, "rewards/chosen": 0.702674150466919, "rewards/margins": 0.3923738896846771, "rewards/rejected": 0.3103002607822418, "step": 1610 }, { "epoch": 0.26, "learning_rate": 9.736337842561972e-07, "logits/chosen": -0.6754454374313354, "logits/rejected": -0.6349582672119141, "logps/chosen": -253.61331176757812, "logps/rejected": -75.85834503173828, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 2.91471266746521, "rewards/margins": 1.4073487520217896, "rewards/rejected": 1.5073639154434204, "step": 1611 }, { "epoch": 0.26, "learning_rate": 9.735916534571756e-07, "logits/chosen": -0.4799545705318451, "logits/rejected": -0.395308256149292, "logps/chosen": -211.5025634765625, "logps/rejected": -175.47186279296875, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": 3.2022125720977783, "rewards/margins": 0.8005127906799316, "rewards/rejected": 2.4016997814178467, "step": 1612 }, { "epoch": 0.26, "learning_rate": 9.73549489937452e-07, "logits/chosen": -0.584354817867279, "logits/rejected": -0.5105617046356201, "logps/chosen": -107.95880889892578, "logps/rejected": -62.72564697265625, "loss": 1.1717, "rewards/accuracies": 0.0, "rewards/chosen": -0.041309356689453125, "rewards/margins": -0.915399968624115, "rewards/rejected": 0.8740906119346619, "step": 1613 }, { "epoch": 0.26, "learning_rate": 9.73507293699939e-07, "logits/chosen": -0.5071917176246643, "logits/rejected": -0.49728232622146606, "logps/chosen": -101.7081298828125, "logps/rejected": -56.491737365722656, "loss": 0.4976, "rewards/accuracies": 0.0, "rewards/chosen": 0.10347061604261398, "rewards/margins": -0.4804527163505554, "rewards/rejected": 0.58392333984375, "step": 1614 }, { "epoch": 0.26, "learning_rate": 9.73465064747553e-07, "logits/chosen": -0.4485330283641815, "logits/rejected": -0.3455526530742645, "logps/chosen": -131.17752075195312, "logps/rejected": -31.372943878173828, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 3.6071319580078125, "rewards/margins": 3.2662627696990967, "rewards/rejected": 0.34086915850639343, "step": 1615 }, { "epoch": 0.26, "learning_rate": 9.734228030832103e-07, "logits/chosen": -0.6773151159286499, "logits/rejected": -0.6583923101425171, "logps/chosen": -116.59263610839844, "logps/rejected": -142.91461181640625, "loss": 0.8681, "rewards/accuracies": 0.0, "rewards/chosen": 2.676670789718628, "rewards/margins": -1.2668321132659912, "rewards/rejected": 3.943502902984619, "step": 1616 }, { "epoch": 0.26, "learning_rate": 9.73380508709832e-07, "logits/chosen": -0.40418708324432373, "logits/rejected": -0.3249609172344208, "logps/chosen": -161.98831176757812, "logps/rejected": -30.37683868408203, "loss": 0.9302, "rewards/accuracies": 1.0, "rewards/chosen": 1.1541579961776733, "rewards/margins": 0.4218791723251343, "rewards/rejected": 0.7322788238525391, "step": 1617 }, { "epoch": 0.26, "learning_rate": 9.733381816303394e-07, "logits/chosen": -0.25315341353416443, "logits/rejected": -0.249391108751297, "logps/chosen": -13.629191398620605, "logps/rejected": -4.96142053604126, "loss": 0.6489, "rewards/accuracies": 0.0, "rewards/chosen": -0.14130783081054688, "rewards/margins": -0.26429423689842224, "rewards/rejected": 0.12298641353845596, "step": 1618 }, { "epoch": 0.26, "learning_rate": 9.732958218476573e-07, "logits/chosen": -0.740529477596283, "logits/rejected": -0.7553956508636475, "logps/chosen": -82.99052429199219, "logps/rejected": -80.11345672607422, "loss": 1.1591, "rewards/accuracies": 1.0, "rewards/chosen": 0.9903060793876648, "rewards/margins": 0.6366920471191406, "rewards/rejected": 0.35361406207084656, "step": 1619 }, { "epoch": 0.26, "learning_rate": 9.732534293647123e-07, "logits/chosen": -0.24145865440368652, "logits/rejected": -0.23623014986515045, "logps/chosen": -105.28581237792969, "logps/rejected": -99.32514190673828, "loss": 0.6559, "rewards/accuracies": 1.0, "rewards/chosen": 1.2524880170822144, "rewards/margins": 0.3733261823654175, "rewards/rejected": 0.8791618347167969, "step": 1620 }, { "epoch": 0.26, "learning_rate": 9.732110041844333e-07, "logits/chosen": -0.7882300615310669, "logits/rejected": -0.6891676187515259, "logps/chosen": -94.9182357788086, "logps/rejected": -126.76918029785156, "loss": 0.3826, "rewards/accuracies": 1.0, "rewards/chosen": 1.3941811323165894, "rewards/margins": 0.5610191822052002, "rewards/rejected": 0.8331619501113892, "step": 1621 }, { "epoch": 0.26, "learning_rate": 9.731685463097516e-07, "logits/chosen": -0.65240079164505, "logits/rejected": -0.6183503270149231, "logps/chosen": -107.48489379882812, "logps/rejected": -32.283573150634766, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 2.0028862953186035, "rewards/margins": 1.6212735176086426, "rewards/rejected": 0.38161277770996094, "step": 1622 }, { "epoch": 0.26, "learning_rate": 9.731260557436003e-07, "logits/chosen": -0.6937465667724609, "logits/rejected": -0.5653401613235474, "logps/chosen": -147.790283203125, "logps/rejected": -114.143310546875, "loss": 1.0151, "rewards/accuracies": 0.0, "rewards/chosen": -0.06262665241956711, "rewards/margins": -1.7609649896621704, "rewards/rejected": 1.6983383893966675, "step": 1623 }, { "epoch": 0.26, "learning_rate": 9.730835324889155e-07, "logits/chosen": -0.5943285822868347, "logits/rejected": -0.6419785022735596, "logps/chosen": -195.79647827148438, "logps/rejected": -143.09576416015625, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 2.334733724594116, "rewards/margins": -0.21941065788269043, "rewards/rejected": 2.5541443824768066, "step": 1624 }, { "epoch": 0.26, "learning_rate": 9.73040976548635e-07, "logits/chosen": -0.5454141497612, "logits/rejected": -0.585533857345581, "logps/chosen": -171.75506591796875, "logps/rejected": -145.8829345703125, "loss": 0.6831, "rewards/accuracies": 0.0, "rewards/chosen": 2.2141997814178467, "rewards/margins": -0.25893568992614746, "rewards/rejected": 2.473135471343994, "step": 1625 }, { "epoch": 0.26, "learning_rate": 9.729983879256986e-07, "logits/chosen": -0.011463510803878307, "logits/rejected": -0.00795905664563179, "logps/chosen": -2.9915072917938232, "logps/rejected": -21.64633560180664, "loss": 0.7131, "rewards/accuracies": 1.0, "rewards/chosen": 0.27054598927497864, "rewards/margins": 0.19768565893173218, "rewards/rejected": 0.07286033779382706, "step": 1626 }, { "epoch": 0.26, "learning_rate": 9.729557666230494e-07, "logits/chosen": -0.4839336574077606, "logits/rejected": -0.44213607907295227, "logps/chosen": -93.8359603881836, "logps/rejected": -106.68512725830078, "loss": 0.7514, "rewards/accuracies": 1.0, "rewards/chosen": 1.438153862953186, "rewards/margins": 0.1620795726776123, "rewards/rejected": 1.2760742902755737, "step": 1627 }, { "epoch": 0.26, "learning_rate": 9.72913112643632e-07, "logits/chosen": -0.3907599151134491, "logits/rejected": -0.4101773500442505, "logps/chosen": -60.993682861328125, "logps/rejected": -39.3288459777832, "loss": 0.7181, "rewards/accuracies": 0.0, "rewards/chosen": 0.5694168210029602, "rewards/margins": -0.07926446199417114, "rewards/rejected": 0.6486812829971313, "step": 1628 }, { "epoch": 0.26, "learning_rate": 9.72870425990393e-07, "logits/chosen": -0.44889214634895325, "logits/rejected": -0.44006800651550293, "logps/chosen": -51.54340362548828, "logps/rejected": -99.37268829345703, "loss": 1.0354, "rewards/accuracies": 0.0, "rewards/chosen": 0.6251053214073181, "rewards/margins": -0.43252938985824585, "rewards/rejected": 1.057634711265564, "step": 1629 }, { "epoch": 0.26, "learning_rate": 9.72827706666282e-07, "logits/chosen": -0.7861468195915222, "logits/rejected": -0.7566465735435486, "logps/chosen": -114.93453979492188, "logps/rejected": -75.862060546875, "loss": 0.8483, "rewards/accuracies": 0.0, "rewards/chosen": 1.2742691040039062, "rewards/margins": -1.070826768875122, "rewards/rejected": 2.3450958728790283, "step": 1630 }, { "epoch": 0.26, "learning_rate": 9.727849546742506e-07, "logits/chosen": -0.849306583404541, "logits/rejected": -0.6123315691947937, "logps/chosen": -171.68353271484375, "logps/rejected": -156.4169464111328, "loss": 0.496, "rewards/accuracies": 0.0, "rewards/chosen": 3.2860260009765625, "rewards/margins": -0.28632354736328125, "rewards/rejected": 3.5723495483398438, "step": 1631 }, { "epoch": 0.26, "learning_rate": 9.72742170017252e-07, "logits/chosen": -0.30084875226020813, "logits/rejected": -0.30084875226020813, "logps/chosen": -8.218497276306152, "logps/rejected": -8.218497276306152, "loss": 0.4611, "rewards/accuracies": 0.0, "rewards/chosen": 0.6666087508201599, "rewards/margins": 0.0, "rewards/rejected": 0.6666087508201599, "step": 1632 }, { "epoch": 0.27, "learning_rate": 9.72699352698243e-07, "logits/chosen": -0.6453444361686707, "logits/rejected": -0.5769411325454712, "logps/chosen": -48.42116928100586, "logps/rejected": -19.978742599487305, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 0.4084518551826477, "rewards/margins": 0.19969044625759125, "rewards/rejected": 0.20876140892505646, "step": 1633 }, { "epoch": 0.27, "learning_rate": 9.726565027201813e-07, "logits/chosen": -0.7169986367225647, "logits/rejected": -1.1630630493164062, "logps/chosen": -76.9155502319336, "logps/rejected": -37.26118469238281, "loss": 0.2944, "rewards/accuracies": 1.0, "rewards/chosen": 1.9660240411758423, "rewards/margins": 1.844111680984497, "rewards/rejected": 0.121912382543087, "step": 1634 }, { "epoch": 0.27, "learning_rate": 9.726136200860273e-07, "logits/chosen": -0.652566134929657, "logits/rejected": -0.6534156799316406, "logps/chosen": -109.30966186523438, "logps/rejected": -50.78422546386719, "loss": 0.8033, "rewards/accuracies": 0.0, "rewards/chosen": 0.1718185395002365, "rewards/margins": -1.0210884809494019, "rewards/rejected": 1.1929069757461548, "step": 1635 }, { "epoch": 0.27, "learning_rate": 9.725707047987443e-07, "logits/chosen": -0.5870254635810852, "logits/rejected": -0.5848901271820068, "logps/chosen": -179.01426696777344, "logps/rejected": -23.596752166748047, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": 1.6381698846817017, "rewards/margins": 1.4415842294692993, "rewards/rejected": 0.19658565521240234, "step": 1636 }, { "epoch": 0.27, "learning_rate": 9.72527756861297e-07, "logits/chosen": -0.3976791799068451, "logits/rejected": -0.474467933177948, "logps/chosen": -52.162933349609375, "logps/rejected": -123.76161193847656, "loss": 0.8773, "rewards/accuracies": 1.0, "rewards/chosen": 1.4551124572753906, "rewards/margins": 0.3130760192871094, "rewards/rejected": 1.1420364379882812, "step": 1637 }, { "epoch": 0.27, "learning_rate": 9.72484776276653e-07, "logits/chosen": -0.37285465002059937, "logits/rejected": -0.3417854607105255, "logps/chosen": -70.6929702758789, "logps/rejected": -66.3365478515625, "loss": 1.0111, "rewards/accuracies": 0.0, "rewards/chosen": 1.3137978315353394, "rewards/margins": -0.5150198936462402, "rewards/rejected": 1.8288177251815796, "step": 1638 }, { "epoch": 0.27, "learning_rate": 9.724417630477815e-07, "logits/chosen": -0.311684250831604, "logits/rejected": -0.311684250831604, "logps/chosen": -34.24437713623047, "logps/rejected": -34.24437713623047, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.2462947815656662, "rewards/margins": 0.0, "rewards/rejected": 0.2462947815656662, "step": 1639 }, { "epoch": 0.27, "learning_rate": 9.723987171776545e-07, "logits/chosen": -0.333977073431015, "logits/rejected": -0.3314549922943115, "logps/chosen": -1.035940170288086, "logps/rejected": -2.060046672821045, "loss": 0.3501, "rewards/accuracies": 1.0, "rewards/chosen": 0.14634600281715393, "rewards/margins": 0.05661831051111221, "rewards/rejected": 0.08972769230604172, "step": 1640 }, { "epoch": 0.27, "learning_rate": 9.72355638669246e-07, "logits/chosen": -0.7802851796150208, "logits/rejected": -0.7602705359458923, "logps/chosen": -150.1768341064453, "logps/rejected": -59.28217315673828, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 0.29345703125, "rewards/margins": 0.07485313713550568, "rewards/rejected": 0.21860389411449432, "step": 1641 }, { "epoch": 0.27, "learning_rate": 9.723125275255323e-07, "logits/chosen": -0.3858949542045593, "logits/rejected": -0.2897367477416992, "logps/chosen": -116.90846252441406, "logps/rejected": -29.134700775146484, "loss": 0.5864, "rewards/accuracies": 1.0, "rewards/chosen": 0.3274215757846832, "rewards/margins": 0.11730499565601349, "rewards/rejected": 0.21011658012866974, "step": 1642 }, { "epoch": 0.27, "learning_rate": 9.722693837494922e-07, "logits/chosen": -0.6084193587303162, "logits/rejected": -0.5900633931159973, "logps/chosen": -239.50338745117188, "logps/rejected": -113.3427734375, "loss": 0.477, "rewards/accuracies": 1.0, "rewards/chosen": 2.424612522125244, "rewards/margins": 2.2965073585510254, "rewards/rejected": 0.12810516357421875, "step": 1643 }, { "epoch": 0.27, "learning_rate": 9.722262073441062e-07, "logits/chosen": -0.3355882167816162, "logits/rejected": -0.3059086501598358, "logps/chosen": -75.41632843017578, "logps/rejected": -75.10749053955078, "loss": 0.9432, "rewards/accuracies": 0.0, "rewards/chosen": 1.4099464416503906, "rewards/margins": -0.16608738899230957, "rewards/rejected": 1.5760338306427002, "step": 1644 }, { "epoch": 0.27, "learning_rate": 9.721829983123575e-07, "logits/chosen": -0.4283771812915802, "logits/rejected": -0.44330018758773804, "logps/chosen": -41.09475326538086, "logps/rejected": -43.645103454589844, "loss": 1.1299, "rewards/accuracies": 0.0, "rewards/chosen": 1.571574091911316, "rewards/margins": -0.15168225765228271, "rewards/rejected": 1.7232563495635986, "step": 1645 }, { "epoch": 0.27, "learning_rate": 9.721397566572313e-07, "logits/chosen": -0.35100942850112915, "logits/rejected": 0.0014357188483700156, "logps/chosen": -33.00690460205078, "logps/rejected": -126.2538070678711, "loss": 1.2001, "rewards/accuracies": 0.0, "rewards/chosen": 0.9266101717948914, "rewards/margins": -1.6551105976104736, "rewards/rejected": 2.5817208290100098, "step": 1646 }, { "epoch": 0.27, "learning_rate": 9.720964823817157e-07, "logits/chosen": -0.5029385685920715, "logits/rejected": -0.49570730328559875, "logps/chosen": -129.6925506591797, "logps/rejected": -46.77407455444336, "loss": 1.3917, "rewards/accuracies": 0.0, "rewards/chosen": -0.09452972561120987, "rewards/margins": -1.1439590454101562, "rewards/rejected": 1.0494292974472046, "step": 1647 }, { "epoch": 0.27, "learning_rate": 9.720531754888e-07, "logits/chosen": -0.3164639174938202, "logits/rejected": -0.3049582540988922, "logps/chosen": -37.68054962158203, "logps/rejected": -2.0441536903381348, "loss": 1.2695, "rewards/accuracies": 0.0, "rewards/chosen": 0.050937652587890625, "rewards/margins": -0.1294809877872467, "rewards/rejected": 0.18041864037513733, "step": 1648 }, { "epoch": 0.27, "learning_rate": 9.720098359814763e-07, "logits/chosen": -0.3938594460487366, "logits/rejected": -0.2912538945674896, "logps/chosen": -80.83084106445312, "logps/rejected": -92.27020263671875, "loss": 0.3043, "rewards/accuracies": 1.0, "rewards/chosen": 1.6364548206329346, "rewards/margins": 1.3454506397247314, "rewards/rejected": 0.2910041809082031, "step": 1649 }, { "epoch": 0.27, "learning_rate": 9.719664638627394e-07, "logits/chosen": -0.676584780216217, "logits/rejected": -0.6036049723625183, "logps/chosen": -159.55810546875, "logps/rejected": -99.78865814208984, "loss": 0.3181, "rewards/accuracies": 1.0, "rewards/chosen": 2.172607421875, "rewards/margins": 0.4158836603164673, "rewards/rejected": 1.7567237615585327, "step": 1650 }, { "epoch": 0.27, "learning_rate": 9.719230591355857e-07, "logits/chosen": -0.32252272963523865, "logits/rejected": -0.32252272963523865, "logps/chosen": -62.749847412109375, "logps/rejected": -62.749847412109375, "loss": 0.8761, "rewards/accuracies": 0.0, "rewards/chosen": 1.5196014642715454, "rewards/margins": 0.0, "rewards/rejected": 1.5196014642715454, "step": 1651 }, { "epoch": 0.27, "learning_rate": 9.718796218030137e-07, "logits/chosen": -0.5645360946655273, "logits/rejected": -0.5251957774162292, "logps/chosen": -83.78993225097656, "logps/rejected": -56.511566162109375, "loss": 0.5139, "rewards/accuracies": 1.0, "rewards/chosen": 2.3258864879608154, "rewards/margins": 0.42583608627319336, "rewards/rejected": 1.900050401687622, "step": 1652 }, { "epoch": 0.27, "learning_rate": 9.718361518680249e-07, "logits/chosen": -0.8000502586364746, "logits/rejected": -0.6937903761863708, "logps/chosen": -210.91656494140625, "logps/rejected": -97.69635772705078, "loss": 0.7994, "rewards/accuracies": 1.0, "rewards/chosen": 0.8780059814453125, "rewards/margins": 0.37227553129196167, "rewards/rejected": 0.5057304501533508, "step": 1653 }, { "epoch": 0.27, "learning_rate": 9.717926493336226e-07, "logits/chosen": -0.4823444187641144, "logits/rejected": -0.43224936723709106, "logps/chosen": -69.54012298583984, "logps/rejected": -81.08683776855469, "loss": 0.5203, "rewards/accuracies": 0.0, "rewards/chosen": 1.2419929504394531, "rewards/margins": -0.2920891046524048, "rewards/rejected": 1.534082055091858, "step": 1654 }, { "epoch": 0.27, "learning_rate": 9.717491142028125e-07, "logits/chosen": -0.4737960696220398, "logits/rejected": -0.44977137446403503, "logps/chosen": -84.77013397216797, "logps/rejected": -77.13320922851562, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 0.22416305541992188, "rewards/margins": 0.1898506134748459, "rewards/rejected": 0.03431243821978569, "step": 1655 }, { "epoch": 0.27, "learning_rate": 9.717055464786021e-07, "logits/chosen": -0.09361208230257034, "logits/rejected": -0.035073909908533096, "logps/chosen": -81.79267883300781, "logps/rejected": -64.31246948242188, "loss": 0.2508, "rewards/accuracies": 1.0, "rewards/chosen": 1.535614013671875, "rewards/margins": 1.1940765380859375, "rewards/rejected": 0.3415374755859375, "step": 1656 }, { "epoch": 0.27, "learning_rate": 9.716619461640019e-07, "logits/chosen": -0.538797914981842, "logits/rejected": -0.5068156719207764, "logps/chosen": -62.91902542114258, "logps/rejected": -58.398643493652344, "loss": 1.713, "rewards/accuracies": 0.0, "rewards/chosen": 1.4491146802902222, "rewards/margins": -0.30588412284851074, "rewards/rejected": 1.754998803138733, "step": 1657 }, { "epoch": 0.27, "learning_rate": 9.716183132620241e-07, "logits/chosen": -0.5232024788856506, "logits/rejected": -0.49210691452026367, "logps/chosen": -153.59397888183594, "logps/rejected": -84.09420013427734, "loss": 0.7074, "rewards/accuracies": 1.0, "rewards/chosen": 2.6609420776367188, "rewards/margins": 0.9430313110351562, "rewards/rejected": 1.7179107666015625, "step": 1658 }, { "epoch": 0.27, "learning_rate": 9.715746477756835e-07, "logits/chosen": -0.4223056733608246, "logits/rejected": -0.4223056733608246, "logps/chosen": -102.79319763183594, "logps/rejected": -102.79319763183594, "loss": 1.1633, "rewards/accuracies": 0.0, "rewards/chosen": 0.6149368286132812, "rewards/margins": 0.0, "rewards/rejected": 0.6149368286132812, "step": 1659 }, { "epoch": 0.27, "learning_rate": 9.715309497079966e-07, "logits/chosen": -0.46540483832359314, "logits/rejected": -0.3611792027950287, "logps/chosen": -127.44945526123047, "logps/rejected": -69.61300659179688, "loss": 1.1208, "rewards/accuracies": 1.0, "rewards/chosen": 1.669165849685669, "rewards/margins": 0.16097486019134521, "rewards/rejected": 1.5081909894943237, "step": 1660 }, { "epoch": 0.27, "learning_rate": 9.714872190619827e-07, "logits/chosen": -0.4886403977870941, "logits/rejected": -0.4667297601699829, "logps/chosen": -33.97064208984375, "logps/rejected": -72.97987365722656, "loss": 0.9665, "rewards/accuracies": 1.0, "rewards/chosen": 1.0796695947647095, "rewards/margins": 0.2149227261543274, "rewards/rejected": 0.8647468686103821, "step": 1661 }, { "epoch": 0.27, "learning_rate": 9.714434558406634e-07, "logits/chosen": -0.5990015864372253, "logits/rejected": -0.597760796546936, "logps/chosen": -77.68507385253906, "logps/rejected": -63.19670486450195, "loss": 0.9339, "rewards/accuracies": 1.0, "rewards/chosen": 1.6866806745529175, "rewards/margins": 0.2864658832550049, "rewards/rejected": 1.4002147912979126, "step": 1662 }, { "epoch": 0.27, "learning_rate": 9.713996600470622e-07, "logits/chosen": -0.5874719619750977, "logits/rejected": -0.5617752075195312, "logps/chosen": -76.06361389160156, "logps/rejected": -71.45756530761719, "loss": 0.6564, "rewards/accuracies": 0.0, "rewards/chosen": 1.4007843732833862, "rewards/margins": -0.41561198234558105, "rewards/rejected": 1.8163963556289673, "step": 1663 }, { "epoch": 0.27, "learning_rate": 9.713558316842047e-07, "logits/chosen": -0.4967183470726013, "logits/rejected": -0.48861539363861084, "logps/chosen": -104.70128631591797, "logps/rejected": -73.71005249023438, "loss": 0.9408, "rewards/accuracies": 0.0, "rewards/chosen": 0.4744865596294403, "rewards/margins": -1.1809089183807373, "rewards/rejected": 1.6553955078125, "step": 1664 }, { "epoch": 0.27, "learning_rate": 9.713119707551192e-07, "logits/chosen": -0.847646951675415, "logits/rejected": -0.9249745607376099, "logps/chosen": -266.3624267578125, "logps/rejected": -274.7767028808594, "loss": 1.2545, "rewards/accuracies": 0.0, "rewards/chosen": 3.4142701625823975, "rewards/margins": -1.067962408065796, "rewards/rejected": 4.482232570648193, "step": 1665 }, { "epoch": 0.27, "learning_rate": 9.712680772628363e-07, "logits/chosen": -0.445751428604126, "logits/rejected": -0.362520694732666, "logps/chosen": -135.69998168945312, "logps/rejected": -60.555355072021484, "loss": 1.1531, "rewards/accuracies": 1.0, "rewards/chosen": 2.9792816638946533, "rewards/margins": 1.8865253925323486, "rewards/rejected": 1.0927562713623047, "step": 1666 }, { "epoch": 0.27, "learning_rate": 9.712241512103883e-07, "logits/chosen": -0.6336906552314758, "logits/rejected": -0.5790595412254333, "logps/chosen": -168.5624237060547, "logps/rejected": -132.66017150878906, "loss": 0.8928, "rewards/accuracies": 0.0, "rewards/chosen": 1.2109726667404175, "rewards/margins": -1.5396071672439575, "rewards/rejected": 2.750579833984375, "step": 1667 }, { "epoch": 0.27, "learning_rate": 9.711801926008105e-07, "logits/chosen": -0.5329746603965759, "logits/rejected": -0.512364387512207, "logps/chosen": -69.03866577148438, "logps/rejected": -54.60251235961914, "loss": 0.8141, "rewards/accuracies": 0.0, "rewards/chosen": 0.986310601234436, "rewards/margins": -0.9040133953094482, "rewards/rejected": 1.8903239965438843, "step": 1668 }, { "epoch": 0.27, "learning_rate": 9.711362014371393e-07, "logits/chosen": -0.8185533285140991, "logits/rejected": -0.805656909942627, "logps/chosen": -35.079620361328125, "logps/rejected": -22.53830337524414, "loss": 0.307, "rewards/accuracies": 1.0, "rewards/chosen": 1.4270210266113281, "rewards/margins": 1.230783462524414, "rewards/rejected": 0.19623756408691406, "step": 1669 }, { "epoch": 0.27, "learning_rate": 9.710921777224147e-07, "logits/chosen": -0.439692884683609, "logits/rejected": -0.3683042824268341, "logps/chosen": -62.04855728149414, "logps/rejected": -59.10798645019531, "loss": 0.3653, "rewards/accuracies": 1.0, "rewards/chosen": 1.2420765161514282, "rewards/margins": 0.2459850311279297, "rewards/rejected": 0.9960914850234985, "step": 1670 }, { "epoch": 0.27, "learning_rate": 9.710481214596785e-07, "logits/chosen": -0.7012636065483093, "logits/rejected": -0.7059773802757263, "logps/chosen": -68.40782928466797, "logps/rejected": -37.318931579589844, "loss": 0.8976, "rewards/accuracies": 0.0, "rewards/chosen": 0.29336243867874146, "rewards/margins": -1.1310820579528809, "rewards/rejected": 1.424444556236267, "step": 1671 }, { "epoch": 0.27, "learning_rate": 9.710040326519737e-07, "logits/chosen": -0.6769906282424927, "logits/rejected": -0.6921897530555725, "logps/chosen": -85.66390991210938, "logps/rejected": -66.47366333007812, "loss": 0.7797, "rewards/accuracies": 0.0, "rewards/chosen": -0.03880004957318306, "rewards/margins": -1.1603606939315796, "rewards/rejected": 1.1215606927871704, "step": 1672 }, { "epoch": 0.27, "learning_rate": 9.709599113023472e-07, "logits/chosen": -0.6452703475952148, "logits/rejected": -0.7281630635261536, "logps/chosen": -156.6732177734375, "logps/rejected": -57.709381103515625, "loss": 0.5904, "rewards/accuracies": 0.0, "rewards/chosen": 1.9820343255996704, "rewards/margins": -0.5056725740432739, "rewards/rejected": 2.4877068996429443, "step": 1673 }, { "epoch": 0.27, "learning_rate": 9.70915757413847e-07, "logits/chosen": -0.28470608592033386, "logits/rejected": -0.28470608592033386, "logps/chosen": -43.387760162353516, "logps/rejected": -43.387760162353516, "loss": 0.6117, "rewards/accuracies": 0.0, "rewards/chosen": 1.3042668104171753, "rewards/margins": 0.0, "rewards/rejected": 1.3042668104171753, "step": 1674 }, { "epoch": 0.27, "learning_rate": 9.708715709895237e-07, "logits/chosen": -0.39443349838256836, "logits/rejected": -0.39443349838256836, "logps/chosen": -4.276804447174072, "logps/rejected": -4.276804447174072, "loss": 0.5074, "rewards/accuracies": 0.0, "rewards/chosen": 0.13313089311122894, "rewards/margins": 0.0, "rewards/rejected": 0.13313089311122894, "step": 1675 }, { "epoch": 0.27, "learning_rate": 9.708273520324306e-07, "logits/chosen": -0.07424135506153107, "logits/rejected": -0.12471552938222885, "logps/chosen": -161.72268676757812, "logps/rejected": -63.75609588623047, "loss": 1.1014, "rewards/accuracies": 1.0, "rewards/chosen": 1.7609161138534546, "rewards/margins": 0.7300453186035156, "rewards/rejected": 1.030870795249939, "step": 1676 }, { "epoch": 0.27, "learning_rate": 9.707831005456222e-07, "logits/chosen": -0.23540352284908295, "logits/rejected": -0.2769850492477417, "logps/chosen": -123.85978698730469, "logps/rejected": -102.59697723388672, "loss": 0.7789, "rewards/accuracies": 0.0, "rewards/chosen": 2.217198133468628, "rewards/margins": -1.0911386013031006, "rewards/rejected": 3.3083367347717285, "step": 1677 }, { "epoch": 0.27, "learning_rate": 9.707388165321561e-07, "logits/chosen": -0.2915797233581543, "logits/rejected": -0.166018545627594, "logps/chosen": -86.14215087890625, "logps/rejected": -15.570046424865723, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": 1.2826645374298096, "rewards/margins": 0.5687453746795654, "rewards/rejected": 0.7139191627502441, "step": 1678 }, { "epoch": 0.27, "learning_rate": 9.706944999950921e-07, "logits/chosen": -0.3236142694950104, "logits/rejected": -0.3037042021751404, "logps/chosen": -99.40145874023438, "logps/rejected": -73.87540435791016, "loss": 1.4531, "rewards/accuracies": 0.0, "rewards/chosen": 1.4171355962753296, "rewards/margins": -0.16399621963500977, "rewards/rejected": 1.5811318159103394, "step": 1679 }, { "epoch": 0.27, "learning_rate": 9.70650150937492e-07, "logits/chosen": -0.7264478206634521, "logits/rejected": -0.7480053305625916, "logps/chosen": -137.63072204589844, "logps/rejected": -111.13604736328125, "loss": 0.8755, "rewards/accuracies": 0.0, "rewards/chosen": 0.8267532587051392, "rewards/margins": -1.3397048711776733, "rewards/rejected": 2.1664581298828125, "step": 1680 }, { "epoch": 0.27, "learning_rate": 9.706057693624197e-07, "logits/chosen": -0.39850813150405884, "logits/rejected": -0.30909496545791626, "logps/chosen": -59.87627410888672, "logps/rejected": -56.51863098144531, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 1.9297302961349487, "rewards/margins": 0.40904998779296875, "rewards/rejected": 1.52068030834198, "step": 1681 }, { "epoch": 0.27, "learning_rate": 9.705613552729415e-07, "logits/chosen": -0.6644245386123657, "logits/rejected": -0.6203324794769287, "logps/chosen": -100.51988220214844, "logps/rejected": -30.329256057739258, "loss": 0.5652, "rewards/accuracies": 1.0, "rewards/chosen": 0.2570388913154602, "rewards/margins": 0.2353849560022354, "rewards/rejected": 0.02165393903851509, "step": 1682 }, { "epoch": 0.27, "learning_rate": 9.705169086721264e-07, "logits/chosen": -0.39109158515930176, "logits/rejected": -0.38041359186172485, "logps/chosen": -76.07810974121094, "logps/rejected": -40.01004409790039, "loss": 0.7234, "rewards/accuracies": 0.0, "rewards/chosen": 0.6391403079032898, "rewards/margins": -0.6610886454582214, "rewards/rejected": 1.3002289533615112, "step": 1683 }, { "epoch": 0.27, "learning_rate": 9.704724295630447e-07, "logits/chosen": -0.06266613304615021, "logits/rejected": -0.10274789482355118, "logps/chosen": -75.166015625, "logps/rejected": -74.20582580566406, "loss": 0.7441, "rewards/accuracies": 0.0, "rewards/chosen": 1.8343040943145752, "rewards/margins": -0.21497726440429688, "rewards/rejected": 2.049281358718872, "step": 1684 }, { "epoch": 0.27, "learning_rate": 9.7042791794877e-07, "logits/chosen": -1.014467716217041, "logits/rejected": -1.0037074089050293, "logps/chosen": -93.2900161743164, "logps/rejected": -63.37720489501953, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 1.480249047279358, "rewards/margins": -0.4168449640274048, "rewards/rejected": 1.8970940113067627, "step": 1685 }, { "epoch": 0.27, "learning_rate": 9.703833738323772e-07, "logits/chosen": -0.3621465861797333, "logits/rejected": -0.349679172039032, "logps/chosen": -1.6840094327926636, "logps/rejected": -18.542551040649414, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.36845728754997253, "rewards/margins": 0.10578486323356628, "rewards/rejected": 0.26267242431640625, "step": 1686 }, { "epoch": 0.27, "learning_rate": 9.703387972169443e-07, "logits/chosen": -1.656541109085083, "logits/rejected": -1.7409968376159668, "logps/chosen": -57.818870544433594, "logps/rejected": -21.708377838134766, "loss": 0.4755, "rewards/accuracies": 1.0, "rewards/chosen": 1.0903167724609375, "rewards/margins": 0.671805739402771, "rewards/rejected": 0.4185110032558441, "step": 1687 }, { "epoch": 0.27, "learning_rate": 9.702941881055508e-07, "logits/chosen": -0.08835947513580322, "logits/rejected": -0.02949223481118679, "logps/chosen": -100.79901123046875, "logps/rejected": -60.053123474121094, "loss": 1.1232, "rewards/accuracies": 0.0, "rewards/chosen": 0.5596481561660767, "rewards/margins": -0.5644371509552002, "rewards/rejected": 1.1240853071212769, "step": 1688 }, { "epoch": 0.27, "learning_rate": 9.702495465012787e-07, "logits/chosen": -0.6603122353553772, "logits/rejected": -0.6478579640388489, "logps/chosen": -52.68473815917969, "logps/rejected": -56.655601501464844, "loss": 0.5364, "rewards/accuracies": 1.0, "rewards/chosen": 1.7131394147872925, "rewards/margins": 0.12291491031646729, "rewards/rejected": 1.5902245044708252, "step": 1689 }, { "epoch": 0.27, "learning_rate": 9.702048724072127e-07, "logits/chosen": -0.8529937863349915, "logits/rejected": -0.8109708428382874, "logps/chosen": -90.48452758789062, "logps/rejected": -23.80543327331543, "loss": 0.5639, "rewards/accuracies": 0.0, "rewards/chosen": -0.09209823608398438, "rewards/margins": -0.11792125552892685, "rewards/rejected": 0.025823021307587624, "step": 1690 }, { "epoch": 0.27, "learning_rate": 9.70160165826439e-07, "logits/chosen": -0.4584237337112427, "logits/rejected": -0.4423412084579468, "logps/chosen": -67.31578063964844, "logps/rejected": -61.398353576660156, "loss": 0.5583, "rewards/accuracies": 0.0, "rewards/chosen": 1.4224494695663452, "rewards/margins": -0.6177324056625366, "rewards/rejected": 2.040181875228882, "step": 1691 }, { "epoch": 0.27, "learning_rate": 9.701154267620468e-07, "logits/chosen": -0.9668098092079163, "logits/rejected": -0.9357534050941467, "logps/chosen": -133.8937530517578, "logps/rejected": -138.87655639648438, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 3.708195447921753, "rewards/margins": 1.9786604642868042, "rewards/rejected": 1.7295349836349487, "step": 1692 }, { "epoch": 0.27, "learning_rate": 9.700706552171267e-07, "logits/chosen": -0.42301735281944275, "logits/rejected": -0.30016592144966125, "logps/chosen": -125.47537231445312, "logps/rejected": -122.54119873046875, "loss": 1.1478, "rewards/accuracies": 0.0, "rewards/chosen": 2.3380188941955566, "rewards/margins": -0.747894287109375, "rewards/rejected": 3.0859131813049316, "step": 1693 }, { "epoch": 0.27, "learning_rate": 9.700258511947722e-07, "logits/chosen": -0.0029934009071439505, "logits/rejected": 0.011513140052556992, "logps/chosen": -114.58673095703125, "logps/rejected": -131.73046875, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": -0.01417465228587389, "rewards/margins": 0.050592806190252304, "rewards/rejected": -0.06476745754480362, "step": 1694 }, { "epoch": 0.28, "learning_rate": 9.699810146980788e-07, "logits/chosen": -0.061152033507823944, "logits/rejected": -0.0644938200712204, "logps/chosen": -2.372884750366211, "logps/rejected": -2.2186686992645264, "loss": 0.6459, "rewards/accuracies": 0.0, "rewards/chosen": 0.15830962359905243, "rewards/margins": -0.01124623417854309, "rewards/rejected": 0.16955585777759552, "step": 1695 }, { "epoch": 0.28, "learning_rate": 9.699361457301443e-07, "logits/chosen": -0.34184542298316956, "logits/rejected": -0.3074105679988861, "logps/chosen": -82.22315979003906, "logps/rejected": -124.0779800415039, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.7722381949424744, "rewards/margins": -0.5396323800086975, "rewards/rejected": 1.3118705749511719, "step": 1696 }, { "epoch": 0.28, "learning_rate": 9.698912442940685e-07, "logits/chosen": -0.4415183961391449, "logits/rejected": -0.4348437786102295, "logps/chosen": -32.436065673828125, "logps/rejected": -130.6617431640625, "loss": 0.7924, "rewards/accuracies": 0.0, "rewards/chosen": 1.8238296508789062, "rewards/margins": -0.5415329933166504, "rewards/rejected": 2.3653626441955566, "step": 1697 }, { "epoch": 0.28, "learning_rate": 9.698463103929541e-07, "logits/chosen": -0.34287500381469727, "logits/rejected": -0.363719642162323, "logps/chosen": -105.54136657714844, "logps/rejected": -104.81594848632812, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.8906700015068054, "rewards/margins": 0.1360068917274475, "rewards/rejected": 0.7546631097793579, "step": 1698 }, { "epoch": 0.28, "learning_rate": 9.698013440299054e-07, "logits/chosen": -0.5812764763832092, "logits/rejected": -0.5842459797859192, "logps/chosen": -64.33914947509766, "logps/rejected": -64.6305160522461, "loss": 0.5949, "rewards/accuracies": 0.0, "rewards/chosen": 0.621075451374054, "rewards/margins": -0.2226928472518921, "rewards/rejected": 0.843768298625946, "step": 1699 }, { "epoch": 0.28, "learning_rate": 9.697563452080291e-07, "logits/chosen": -0.5724259614944458, "logits/rejected": -0.6050719022750854, "logps/chosen": -93.98428344726562, "logps/rejected": -103.35191345214844, "loss": 0.5969, "rewards/accuracies": 0.0, "rewards/chosen": 1.0099884271621704, "rewards/margins": -0.16718602180480957, "rewards/rejected": 1.17717444896698, "step": 1700 }, { "epoch": 0.28, "learning_rate": 9.69711313930434e-07, "logits/chosen": -0.5382741093635559, "logits/rejected": -0.5570672154426575, "logps/chosen": -146.05953979492188, "logps/rejected": -83.36418914794922, "loss": 0.4169, "rewards/accuracies": 1.0, "rewards/chosen": 1.0806626081466675, "rewards/margins": 0.27302783727645874, "rewards/rejected": 0.8076347708702087, "step": 1701 }, { "epoch": 0.28, "learning_rate": 9.696662502002318e-07, "logits/chosen": -0.7833682298660278, "logits/rejected": -0.8071064352989197, "logps/chosen": -163.11151123046875, "logps/rejected": -174.36996459960938, "loss": 1.2374, "rewards/accuracies": 0.0, "rewards/chosen": 1.9529800415039062, "rewards/margins": -1.7761857509613037, "rewards/rejected": 3.72916579246521, "step": 1702 }, { "epoch": 0.28, "learning_rate": 9.696211540205358e-07, "logits/chosen": -0.4516300857067108, "logits/rejected": -0.42424917221069336, "logps/chosen": -134.9610595703125, "logps/rejected": -149.74545288085938, "loss": 0.9776, "rewards/accuracies": 0.0, "rewards/chosen": 0.6663650870323181, "rewards/margins": -1.6769194602966309, "rewards/rejected": 2.3432846069335938, "step": 1703 }, { "epoch": 0.28, "learning_rate": 9.695760253944613e-07, "logits/chosen": -0.6142175197601318, "logits/rejected": -0.6275830864906311, "logps/chosen": -217.25758361816406, "logps/rejected": -141.88803100585938, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 1.6891052722930908, "rewards/margins": 0.5507477521896362, "rewards/rejected": 1.1383575201034546, "step": 1704 }, { "epoch": 0.28, "learning_rate": 9.69530864325127e-07, "logits/chosen": -0.6126984357833862, "logits/rejected": -0.5707006454467773, "logps/chosen": -149.19163513183594, "logps/rejected": -182.07528686523438, "loss": 0.6268, "rewards/accuracies": 1.0, "rewards/chosen": 2.137408494949341, "rewards/margins": 0.4836304187774658, "rewards/rejected": 1.653778076171875, "step": 1705 }, { "epoch": 0.28, "learning_rate": 9.694856708156524e-07, "logits/chosen": -0.744992733001709, "logits/rejected": -0.7589332461357117, "logps/chosen": -47.40000915527344, "logps/rejected": -9.736969947814941, "loss": 0.5702, "rewards/accuracies": 1.0, "rewards/chosen": 1.8232383728027344, "rewards/margins": 1.399578332901001, "rewards/rejected": 0.423660010099411, "step": 1706 }, { "epoch": 0.28, "learning_rate": 9.694404448691606e-07, "logits/chosen": -0.32820504903793335, "logits/rejected": -0.31672054529190063, "logps/chosen": -2.7103161811828613, "logps/rejected": -6.255877494812012, "loss": 0.4283, "rewards/accuracies": 1.0, "rewards/chosen": 0.2562113404273987, "rewards/margins": 0.20891310274600983, "rewards/rejected": 0.04729824140667915, "step": 1707 }, { "epoch": 0.28, "learning_rate": 9.693951864887758e-07, "logits/chosen": -0.098832868039608, "logits/rejected": -0.10161376744508743, "logps/chosen": -7.746555328369141, "logps/rejected": -6.998704433441162, "loss": 0.7342, "rewards/accuracies": 0.0, "rewards/chosen": 0.004425621125847101, "rewards/margins": -0.026929190382361412, "rewards/rejected": 0.031354811042547226, "step": 1708 }, { "epoch": 0.28, "learning_rate": 9.69349895677625e-07, "logits/chosen": -0.01849798671901226, "logits/rejected": -0.01063557993620634, "logps/chosen": -12.360337257385254, "logps/rejected": -20.21475601196289, "loss": 0.776, "rewards/accuracies": 0.0, "rewards/chosen": 0.13827763497829437, "rewards/margins": -0.01667480170726776, "rewards/rejected": 0.15495243668556213, "step": 1709 }, { "epoch": 0.28, "learning_rate": 9.693045724388374e-07, "logits/chosen": -0.25473153591156006, "logits/rejected": -0.23928067088127136, "logps/chosen": -73.83695983886719, "logps/rejected": -61.55546951293945, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 1.7058944702148438, "rewards/margins": 0.7561839818954468, "rewards/rejected": 0.949710488319397, "step": 1710 }, { "epoch": 0.28, "learning_rate": 9.692592167755445e-07, "logits/chosen": -0.3256739377975464, "logits/rejected": -0.3256739377975464, "logps/chosen": -8.74869155883789, "logps/rejected": -8.74869155883789, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.1149606704711914, "rewards/margins": 0.0, "rewards/rejected": 0.1149606704711914, "step": 1711 }, { "epoch": 0.28, "learning_rate": 9.6921382869088e-07, "logits/chosen": -0.6507197022438049, "logits/rejected": -0.5718502402305603, "logps/chosen": -150.8812255859375, "logps/rejected": -48.39220428466797, "loss": 0.1904, "rewards/accuracies": 1.0, "rewards/chosen": 2.272482395172119, "rewards/margins": 1.4025897979736328, "rewards/rejected": 0.8698925375938416, "step": 1712 }, { "epoch": 0.28, "learning_rate": 9.691684081879796e-07, "logits/chosen": -1.3893804550170898, "logits/rejected": -1.3766589164733887, "logps/chosen": -56.77540969848633, "logps/rejected": -133.68824768066406, "loss": 0.9703, "rewards/accuracies": 0.0, "rewards/chosen": 0.40652772784233093, "rewards/margins": -1.6443893909454346, "rewards/rejected": 2.050917148590088, "step": 1713 }, { "epoch": 0.28, "learning_rate": 9.691229552699815e-07, "logits/chosen": -0.4212700426578522, "logits/rejected": -0.41045936942100525, "logps/chosen": -64.38557434082031, "logps/rejected": -58.478485107421875, "loss": 0.9702, "rewards/accuracies": 0.0, "rewards/chosen": 1.4368561506271362, "rewards/margins": -0.37977135181427, "rewards/rejected": 1.8166275024414062, "step": 1714 }, { "epoch": 0.28, "learning_rate": 9.69077469940026e-07, "logits/chosen": -0.5383166670799255, "logits/rejected": -0.4625088572502136, "logps/chosen": -56.8956413269043, "logps/rejected": -132.65774536132812, "loss": 0.5944, "rewards/accuracies": 0.0, "rewards/chosen": 1.5905224084854126, "rewards/margins": -0.15788614749908447, "rewards/rejected": 1.748408555984497, "step": 1715 }, { "epoch": 0.28, "learning_rate": 9.69031952201256e-07, "logits/chosen": -0.8208984732627869, "logits/rejected": -0.822675347328186, "logps/chosen": -112.23213195800781, "logps/rejected": -102.88575744628906, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": 0.9563224911689758, "rewards/margins": 0.004963696002960205, "rewards/rejected": 0.9513587951660156, "step": 1716 }, { "epoch": 0.28, "learning_rate": 9.68986402056816e-07, "logits/chosen": -0.37782248854637146, "logits/rejected": -0.3914211094379425, "logps/chosen": -93.27706146240234, "logps/rejected": -62.46222686767578, "loss": 0.3255, "rewards/accuracies": 1.0, "rewards/chosen": 3.155620574951172, "rewards/margins": 0.9543869495391846, "rewards/rejected": 2.2012336254119873, "step": 1717 }, { "epoch": 0.28, "learning_rate": 9.689408195098532e-07, "logits/chosen": -0.4132390022277832, "logits/rejected": -0.4315393567085266, "logps/chosen": -99.05268859863281, "logps/rejected": -56.02886199951172, "loss": 0.8811, "rewards/accuracies": 0.0, "rewards/chosen": -0.11105575412511826, "rewards/margins": -0.9121391177177429, "rewards/rejected": 0.8010833859443665, "step": 1718 }, { "epoch": 0.28, "learning_rate": 9.688952045635167e-07, "logits/chosen": -0.5712985992431641, "logits/rejected": -0.5179315805435181, "logps/chosen": -204.29510498046875, "logps/rejected": -53.75407409667969, "loss": 0.64, "rewards/accuracies": 1.0, "rewards/chosen": 1.8339356184005737, "rewards/margins": 0.5804405212402344, "rewards/rejected": 1.2534950971603394, "step": 1719 }, { "epoch": 0.28, "learning_rate": 9.688495572209585e-07, "logits/chosen": -0.43604329228401184, "logits/rejected": -0.47044554352760315, "logps/chosen": -88.40113830566406, "logps/rejected": -99.09410095214844, "loss": 0.8209, "rewards/accuracies": 0.0, "rewards/chosen": -0.3386490046977997, "rewards/margins": -1.132330298423767, "rewards/rejected": 0.7936813235282898, "step": 1720 }, { "epoch": 0.28, "learning_rate": 9.688038774853322e-07, "logits/chosen": -0.2797895669937134, "logits/rejected": -0.323633074760437, "logps/chosen": -26.58719253540039, "logps/rejected": -80.76933288574219, "loss": 0.5998, "rewards/accuracies": 0.0, "rewards/chosen": 0.43659305572509766, "rewards/margins": -0.32569485902786255, "rewards/rejected": 0.7622879147529602, "step": 1721 }, { "epoch": 0.28, "learning_rate": 9.687581653597939e-07, "logits/chosen": -0.4162140190601349, "logits/rejected": -0.42087432742118835, "logps/chosen": -76.45368957519531, "logps/rejected": -94.98544311523438, "loss": 0.7771, "rewards/accuracies": 1.0, "rewards/chosen": 0.45726320147514343, "rewards/margins": 0.5006920099258423, "rewards/rejected": -0.043428804725408554, "step": 1722 }, { "epoch": 0.28, "learning_rate": 9.687124208475017e-07, "logits/chosen": -0.7750296592712402, "logits/rejected": -0.5956529974937439, "logps/chosen": -103.76821899414062, "logps/rejected": -70.33192443847656, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 1.8573700189590454, "rewards/margins": 0.5555359125137329, "rewards/rejected": 1.3018341064453125, "step": 1723 }, { "epoch": 0.28, "learning_rate": 9.686666439516163e-07, "logits/chosen": -0.5107059478759766, "logits/rejected": -0.4952004849910736, "logps/chosen": -184.26498413085938, "logps/rejected": -83.05561828613281, "loss": 1.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.37115478515625, "rewards/margins": -1.1667908430099487, "rewards/rejected": 1.5379456281661987, "step": 1724 }, { "epoch": 0.28, "learning_rate": 9.686208346753005e-07, "logits/chosen": -0.7361781597137451, "logits/rejected": -0.7250219583511353, "logps/chosen": -71.59078216552734, "logps/rejected": -38.06034469604492, "loss": 0.3537, "rewards/accuracies": 1.0, "rewards/chosen": 1.2445083856582642, "rewards/margins": 0.6628185510635376, "rewards/rejected": 0.5816898345947266, "step": 1725 }, { "epoch": 0.28, "learning_rate": 9.68574993021719e-07, "logits/chosen": -0.6319441795349121, "logits/rejected": -0.6333389282226562, "logps/chosen": -89.06991577148438, "logps/rejected": -80.62258911132812, "loss": 0.831, "rewards/accuracies": 0.0, "rewards/chosen": 0.676715075969696, "rewards/margins": -1.0432755947113037, "rewards/rejected": 1.719990611076355, "step": 1726 }, { "epoch": 0.28, "learning_rate": 9.68529118994039e-07, "logits/chosen": -0.5131248235702515, "logits/rejected": -0.5131248235702515, "logps/chosen": -63.407859802246094, "logps/rejected": -63.407859802246094, "loss": 0.768, "rewards/accuracies": 0.0, "rewards/chosen": 2.6949684619903564, "rewards/margins": 0.0, "rewards/rejected": 2.6949684619903564, "step": 1727 }, { "epoch": 0.28, "learning_rate": 9.684832125954303e-07, "logits/chosen": -0.8408834934234619, "logits/rejected": -0.8383386135101318, "logps/chosen": -54.8654899597168, "logps/rejected": -127.33970642089844, "loss": 1.9236, "rewards/accuracies": 0.0, "rewards/chosen": 0.6733570098876953, "rewards/margins": -0.6630916595458984, "rewards/rejected": 1.3364486694335938, "step": 1728 }, { "epoch": 0.28, "learning_rate": 9.684372738290645e-07, "logits/chosen": -0.33743810653686523, "logits/rejected": -0.27527955174446106, "logps/chosen": -89.2599868774414, "logps/rejected": -60.27509307861328, "loss": 0.7995, "rewards/accuracies": 1.0, "rewards/chosen": 1.5040085315704346, "rewards/margins": 0.053614020347595215, "rewards/rejected": 1.4503945112228394, "step": 1729 }, { "epoch": 0.28, "learning_rate": 9.683913026981154e-07, "logits/chosen": -0.7636017799377441, "logits/rejected": -0.7793775796890259, "logps/chosen": -109.52495574951172, "logps/rejected": -73.23330688476562, "loss": 1.0282, "rewards/accuracies": 0.0, "rewards/chosen": 0.6172256469726562, "rewards/margins": -1.8213365077972412, "rewards/rejected": 2.4385621547698975, "step": 1730 }, { "epoch": 0.28, "learning_rate": 9.683452992057593e-07, "logits/chosen": -0.7666037678718567, "logits/rejected": -0.7072299122810364, "logps/chosen": -86.18429565429688, "logps/rejected": -105.52398681640625, "loss": 0.5476, "rewards/accuracies": 0.0, "rewards/chosen": 1.332654595375061, "rewards/margins": -0.5933548212051392, "rewards/rejected": 1.9260094165802002, "step": 1731 }, { "epoch": 0.28, "learning_rate": 9.682992633551743e-07, "logits/chosen": -0.5317108631134033, "logits/rejected": -0.5459613800048828, "logps/chosen": -27.468082427978516, "logps/rejected": -8.12397575378418, "loss": 0.4381, "rewards/accuracies": 0.0, "rewards/chosen": -0.028228759765625, "rewards/margins": -0.18794088065624237, "rewards/rejected": 0.15971212089061737, "step": 1732 }, { "epoch": 0.28, "learning_rate": 9.682531951495416e-07, "logits/chosen": -0.46004408597946167, "logits/rejected": -0.39876270294189453, "logps/chosen": -59.36050796508789, "logps/rejected": -52.01327896118164, "loss": 0.4203, "rewards/accuracies": 1.0, "rewards/chosen": 1.4171459674835205, "rewards/margins": 1.0135658979415894, "rewards/rejected": 0.40358009934425354, "step": 1733 }, { "epoch": 0.28, "learning_rate": 9.682070945920436e-07, "logits/chosen": -0.20106031000614166, "logits/rejected": -0.2013368457555771, "logps/chosen": -41.06346130371094, "logps/rejected": -54.080223083496094, "loss": 0.8229, "rewards/accuracies": 1.0, "rewards/chosen": 0.5327476859092712, "rewards/margins": 0.31265491247177124, "rewards/rejected": 0.2200927734375, "step": 1734 }, { "epoch": 0.28, "learning_rate": 9.681609616858657e-07, "logits/chosen": -0.5043142437934875, "logits/rejected": -0.5412325859069824, "logps/chosen": -53.17424392700195, "logps/rejected": -70.8585205078125, "loss": 0.9278, "rewards/accuracies": 0.0, "rewards/chosen": 1.5545475482940674, "rewards/margins": -0.6377360820770264, "rewards/rejected": 2.1922836303710938, "step": 1735 }, { "epoch": 0.28, "learning_rate": 9.681147964341952e-07, "logits/chosen": -0.6329213976860046, "logits/rejected": -0.5818919539451599, "logps/chosen": -111.458984375, "logps/rejected": -74.65052795410156, "loss": 0.4938, "rewards/accuracies": 0.0, "rewards/chosen": 1.7816482782363892, "rewards/margins": -0.2974196672439575, "rewards/rejected": 2.0790679454803467, "step": 1736 }, { "epoch": 0.28, "learning_rate": 9.680685988402212e-07, "logits/chosen": -0.7621629238128662, "logits/rejected": -0.7712106108665466, "logps/chosen": -281.63385009765625, "logps/rejected": -161.93228149414062, "loss": 1.964, "rewards/accuracies": 0.0, "rewards/chosen": 2.1765289306640625, "rewards/margins": -2.2274508476257324, "rewards/rejected": 4.403979778289795, "step": 1737 }, { "epoch": 0.28, "learning_rate": 9.680223689071362e-07, "logits/chosen": -0.8652686476707458, "logits/rejected": -0.7303887009620667, "logps/chosen": -96.1897201538086, "logps/rejected": -137.45166015625, "loss": 0.8919, "rewards/accuracies": 0.0, "rewards/chosen": 2.164952039718628, "rewards/margins": -0.6811738014221191, "rewards/rejected": 2.846125841140747, "step": 1738 }, { "epoch": 0.28, "learning_rate": 9.679761066381341e-07, "logits/chosen": -0.7947168350219727, "logits/rejected": -0.7717213034629822, "logps/chosen": -97.27796936035156, "logps/rejected": -137.64691162109375, "loss": 0.646, "rewards/accuracies": 0.0, "rewards/chosen": 1.0122276544570923, "rewards/margins": -0.4852027893066406, "rewards/rejected": 1.497430443763733, "step": 1739 }, { "epoch": 0.28, "learning_rate": 9.67929812036411e-07, "logits/chosen": -0.7099080085754395, "logits/rejected": -0.693474531173706, "logps/chosen": -148.9281005859375, "logps/rejected": -88.84619140625, "loss": 0.2474, "rewards/accuracies": 1.0, "rewards/chosen": 3.165152072906494, "rewards/margins": 0.7177093029022217, "rewards/rejected": 2.4474427700042725, "step": 1740 }, { "epoch": 0.28, "learning_rate": 9.678834851051653e-07, "logits/chosen": -0.307068407535553, "logits/rejected": -0.307068407535553, "logps/chosen": -46.9665641784668, "logps/rejected": -46.9665641784668, "loss": 0.6194, "rewards/accuracies": 0.0, "rewards/chosen": 0.22712936997413635, "rewards/margins": 0.0, "rewards/rejected": 0.22712936997413635, "step": 1741 }, { "epoch": 0.28, "learning_rate": 9.67837125847598e-07, "logits/chosen": -0.3355496823787689, "logits/rejected": -0.36783677339553833, "logps/chosen": -1.8790676593780518, "logps/rejected": -48.39552688598633, "loss": 0.984, "rewards/accuracies": 1.0, "rewards/chosen": 0.346427857875824, "rewards/margins": 0.058760762214660645, "rewards/rejected": 0.28766709566116333, "step": 1742 }, { "epoch": 0.28, "learning_rate": 9.677907342669123e-07, "logits/chosen": -0.6960082054138184, "logits/rejected": -0.596035897731781, "logps/chosen": -127.09326171875, "logps/rejected": -94.89686584472656, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 2.772966146469116, "rewards/margins": 1.4891984462738037, "rewards/rejected": 1.2837677001953125, "step": 1743 }, { "epoch": 0.28, "learning_rate": 9.677443103663128e-07, "logits/chosen": -0.5182369351387024, "logits/rejected": -0.44769468903541565, "logps/chosen": -146.85414123535156, "logps/rejected": -35.52263641357422, "loss": 0.1281, "rewards/accuracies": 1.0, "rewards/chosen": 2.4066498279571533, "rewards/margins": 1.7713489532470703, "rewards/rejected": 0.6353008151054382, "step": 1744 }, { "epoch": 0.28, "learning_rate": 9.676978541490074e-07, "logits/chosen": -0.282535582780838, "logits/rejected": -0.21087335050106049, "logps/chosen": -78.41720581054688, "logps/rejected": -14.967774391174316, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": 1.6910721063613892, "rewards/margins": 0.6664608716964722, "rewards/rejected": 1.024611234664917, "step": 1745 }, { "epoch": 0.28, "learning_rate": 9.676513656182057e-07, "logits/chosen": -0.44435206055641174, "logits/rejected": -0.35993456840515137, "logps/chosen": -56.09576416015625, "logps/rejected": -64.71892547607422, "loss": 0.4769, "rewards/accuracies": 1.0, "rewards/chosen": 1.095367431640625, "rewards/margins": 0.3359825015068054, "rewards/rejected": 0.7593849301338196, "step": 1746 }, { "epoch": 0.28, "learning_rate": 9.676048447771198e-07, "logits/chosen": -0.24736008048057556, "logits/rejected": -0.17651380598545074, "logps/chosen": -89.96141052246094, "logps/rejected": -59.892578125, "loss": 1.1672, "rewards/accuracies": 0.0, "rewards/chosen": -0.10054321587085724, "rewards/margins": -1.1872856616973877, "rewards/rejected": 1.0867424011230469, "step": 1747 }, { "epoch": 0.28, "learning_rate": 9.675582916289633e-07, "logits/chosen": -0.3360142111778259, "logits/rejected": -0.35384300351142883, "logps/chosen": -61.35423278808594, "logps/rejected": -81.31893157958984, "loss": 0.8456, "rewards/accuracies": 0.0, "rewards/chosen": 0.676770031452179, "rewards/margins": -0.404047429561615, "rewards/rejected": 1.080817461013794, "step": 1748 }, { "epoch": 0.28, "learning_rate": 9.675117061769532e-07, "logits/chosen": -0.4436734914779663, "logits/rejected": -0.469441682100296, "logps/chosen": -114.74850463867188, "logps/rejected": -135.23060607910156, "loss": 1.2435, "rewards/accuracies": 0.0, "rewards/chosen": 1.3280044794082642, "rewards/margins": -1.246801733970642, "rewards/rejected": 2.5748062133789062, "step": 1749 }, { "epoch": 0.28, "learning_rate": 9.674650884243074e-07, "logits/chosen": -0.6207736134529114, "logits/rejected": -0.5494433045387268, "logps/chosen": -96.63613891601562, "logps/rejected": -187.05201721191406, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 1.1499626636505127, "rewards/margins": 0.8701698780059814, "rewards/rejected": 0.27979278564453125, "step": 1750 }, { "epoch": 0.28, "learning_rate": 9.674184383742475e-07, "logits/chosen": -0.4474794864654541, "logits/rejected": -0.4474794864654541, "logps/chosen": -2.364588499069214, "logps/rejected": -2.364588499069214, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.25261327624320984, "rewards/margins": 0.0, "rewards/rejected": 0.25261327624320984, "step": 1751 }, { "epoch": 0.28, "learning_rate": 9.673717560299963e-07, "logits/chosen": -0.5981031656265259, "logits/rejected": -0.5510356426239014, "logps/chosen": -48.4598388671875, "logps/rejected": -194.81307983398438, "loss": 1.6554, "rewards/accuracies": 0.0, "rewards/chosen": 1.3778976202011108, "rewards/margins": -2.781533718109131, "rewards/rejected": 4.159431457519531, "step": 1752 }, { "epoch": 0.28, "learning_rate": 9.673250413947791e-07, "logits/chosen": -0.31054171919822693, "logits/rejected": -0.3207387626171112, "logps/chosen": -7.275275230407715, "logps/rejected": -2.6372621059417725, "loss": 0.6298, "rewards/accuracies": 0.0, "rewards/chosen": 0.1979164183139801, "rewards/margins": -0.0375288724899292, "rewards/rejected": 0.2354452908039093, "step": 1753 }, { "epoch": 0.28, "learning_rate": 9.672782944718233e-07, "logits/chosen": -0.6363195180892944, "logits/rejected": -0.5614436864852905, "logps/chosen": -33.08110427856445, "logps/rejected": -111.98001098632812, "loss": 0.6095, "rewards/accuracies": 1.0, "rewards/chosen": 1.834323525428772, "rewards/margins": 0.32346153259277344, "rewards/rejected": 1.5108619928359985, "step": 1754 }, { "epoch": 0.28, "learning_rate": 9.672315152643587e-07, "logits/chosen": -0.6968820095062256, "logits/rejected": -0.5439953207969666, "logps/chosen": -44.858367919921875, "logps/rejected": -179.77740478515625, "loss": 1.9021, "rewards/accuracies": 0.0, "rewards/chosen": 1.315351128578186, "rewards/margins": -3.5631160736083984, "rewards/rejected": 4.878467082977295, "step": 1755 }, { "epoch": 0.29, "learning_rate": 9.671847037756176e-07, "logits/chosen": -0.38766762614250183, "logits/rejected": -0.38766762614250183, "logps/chosen": -94.32369995117188, "logps/rejected": -94.32369995117188, "loss": 0.7134, "rewards/accuracies": 0.0, "rewards/chosen": 1.2357910871505737, "rewards/margins": 0.0, "rewards/rejected": 1.2357910871505737, "step": 1756 }, { "epoch": 0.29, "learning_rate": 9.671378600088338e-07, "logits/chosen": -0.5569347739219666, "logits/rejected": -0.542975902557373, "logps/chosen": -60.21218490600586, "logps/rejected": -96.45443725585938, "loss": 1.3285, "rewards/accuracies": 0.0, "rewards/chosen": 0.5323207974433899, "rewards/margins": -0.15936928987503052, "rewards/rejected": 0.6916900873184204, "step": 1757 }, { "epoch": 0.29, "learning_rate": 9.670909839672441e-07, "logits/chosen": -0.43490496277809143, "logits/rejected": -0.44596800208091736, "logps/chosen": -70.45170593261719, "logps/rejected": -146.42788696289062, "loss": 0.548, "rewards/accuracies": 1.0, "rewards/chosen": 0.5832252502441406, "rewards/margins": 0.6387168765068054, "rewards/rejected": -0.05549163743853569, "step": 1758 }, { "epoch": 0.29, "learning_rate": 9.67044075654087e-07, "logits/chosen": -0.5900685787200928, "logits/rejected": -0.5731212496757507, "logps/chosen": -38.449058532714844, "logps/rejected": -78.42427062988281, "loss": 1.1974, "rewards/accuracies": 0.0, "rewards/chosen": 1.6346641778945923, "rewards/margins": -0.006204962730407715, "rewards/rejected": 1.640869140625, "step": 1759 }, { "epoch": 0.29, "learning_rate": 9.669971350726035e-07, "logits/chosen": -0.3563269376754761, "logits/rejected": -0.2781960964202881, "logps/chosen": -91.65714263916016, "logps/rejected": -17.029512405395508, "loss": 1.1806, "rewards/accuracies": 1.0, "rewards/chosen": 0.6685249209403992, "rewards/margins": 0.529593825340271, "rewards/rejected": 0.13893108069896698, "step": 1760 }, { "epoch": 0.29, "learning_rate": 9.669501622260367e-07, "logits/chosen": -0.6376113295555115, "logits/rejected": -0.5522226691246033, "logps/chosen": -77.39915466308594, "logps/rejected": -68.4190673828125, "loss": 0.8711, "rewards/accuracies": 0.0, "rewards/chosen": 0.9435318112373352, "rewards/margins": -0.9311538338661194, "rewards/rejected": 1.8746856451034546, "step": 1761 }, { "epoch": 0.29, "learning_rate": 9.669031571176323e-07, "logits/chosen": -0.9801343679428101, "logits/rejected": -0.943116307258606, "logps/chosen": -124.5424575805664, "logps/rejected": -25.500675201416016, "loss": 0.3436, "rewards/accuracies": 1.0, "rewards/chosen": 2.051412343978882, "rewards/margins": 1.899248480796814, "rewards/rejected": 0.15216389298439026, "step": 1762 }, { "epoch": 0.29, "learning_rate": 9.668561197506374e-07, "logits/chosen": -1.1766330003738403, "logits/rejected": -1.1653568744659424, "logps/chosen": -124.31047058105469, "logps/rejected": -48.43781661987305, "loss": 0.4655, "rewards/accuracies": 1.0, "rewards/chosen": 4.707530498504639, "rewards/margins": 3.476198196411133, "rewards/rejected": 1.2313324213027954, "step": 1763 }, { "epoch": 0.29, "learning_rate": 9.66809050128302e-07, "logits/chosen": -0.6261560320854187, "logits/rejected": -0.5916199684143066, "logps/chosen": -152.490966796875, "logps/rejected": -75.19914245605469, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 2.061413526535034, "rewards/margins": 0.78559410572052, "rewards/rejected": 1.2758194208145142, "step": 1764 }, { "epoch": 0.29, "learning_rate": 9.667619482538783e-07, "logits/chosen": -0.13343356549739838, "logits/rejected": -0.13343356549739838, "logps/chosen": -31.26841926574707, "logps/rejected": -31.26841926574707, "loss": 0.7399, "rewards/accuracies": 0.0, "rewards/chosen": 0.959236741065979, "rewards/margins": 0.0, "rewards/rejected": 0.959236741065979, "step": 1765 }, { "epoch": 0.29, "learning_rate": 9.667148141306205e-07, "logits/chosen": -0.26742565631866455, "logits/rejected": -0.2649436891078949, "logps/chosen": -1.8768401145935059, "logps/rejected": -5.289096355438232, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.2469620704650879, "rewards/margins": -0.03729081153869629, "rewards/rejected": 0.2842528820037842, "step": 1766 }, { "epoch": 0.29, "learning_rate": 9.66667647761785e-07, "logits/chosen": -0.8405154943466187, "logits/rejected": -0.9180821180343628, "logps/chosen": -283.106689453125, "logps/rejected": -187.460205078125, "loss": 0.8926, "rewards/accuracies": 1.0, "rewards/chosen": 2.833972215652466, "rewards/margins": 0.2668945789337158, "rewards/rejected": 2.56707763671875, "step": 1767 }, { "epoch": 0.29, "learning_rate": 9.666204491506308e-07, "logits/chosen": -0.44221341609954834, "logits/rejected": -0.4723174571990967, "logps/chosen": -114.65475463867188, "logps/rejected": -83.7939453125, "loss": 0.6736, "rewards/accuracies": 0.0, "rewards/chosen": 1.583032250404358, "rewards/margins": -0.8733352422714233, "rewards/rejected": 2.4563674926757812, "step": 1768 }, { "epoch": 0.29, "learning_rate": 9.66573218300419e-07, "logits/chosen": -0.5665967464447021, "logits/rejected": -0.45934948325157166, "logps/chosen": -67.71952819824219, "logps/rejected": -114.33599090576172, "loss": 0.8425, "rewards/accuracies": 1.0, "rewards/chosen": 2.4088547229766846, "rewards/margins": 0.15607690811157227, "rewards/rejected": 2.2527778148651123, "step": 1769 }, { "epoch": 0.29, "learning_rate": 9.665259552144122e-07, "logits/chosen": -0.4393405616283417, "logits/rejected": -0.4385932683944702, "logps/chosen": -61.34362030029297, "logps/rejected": -94.34750366210938, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 1.4598182439804077, "rewards/margins": 1.0195746421813965, "rewards/rejected": 0.44024354219436646, "step": 1770 }, { "epoch": 0.29, "learning_rate": 9.664786598958762e-07, "logits/chosen": -0.6092082858085632, "logits/rejected": -0.23238912224769592, "logps/chosen": -158.75006103515625, "logps/rejected": -123.14927673339844, "loss": 0.7101, "rewards/accuracies": 1.0, "rewards/chosen": 2.21978759765625, "rewards/margins": 0.1418440341949463, "rewards/rejected": 2.0779435634613037, "step": 1771 }, { "epoch": 0.29, "learning_rate": 9.66431332348079e-07, "logits/chosen": -0.17967309057712555, "logits/rejected": -0.18944771587848663, "logps/chosen": -23.039812088012695, "logps/rejected": -26.88651466369629, "loss": 1.0134, "rewards/accuracies": 0.0, "rewards/chosen": -0.19635449349880219, "rewards/margins": -0.08461933583021164, "rewards/rejected": -0.11173515766859055, "step": 1772 }, { "epoch": 0.29, "learning_rate": 9.663839725742899e-07, "logits/chosen": -0.3884316384792328, "logits/rejected": -0.3884316384792328, "logps/chosen": -109.58345794677734, "logps/rejected": -109.58345794677734, "loss": 0.7323, "rewards/accuracies": 0.0, "rewards/chosen": 1.9980782270431519, "rewards/margins": 0.0, "rewards/rejected": 1.9980782270431519, "step": 1773 }, { "epoch": 0.29, "learning_rate": 9.663365805777814e-07, "logits/chosen": -0.5004698634147644, "logits/rejected": -0.5188855528831482, "logps/chosen": -99.69073486328125, "logps/rejected": -41.82701110839844, "loss": 0.6227, "rewards/accuracies": 1.0, "rewards/chosen": 0.43219301104545593, "rewards/margins": 0.3047359585762024, "rewards/rejected": 0.12745705246925354, "step": 1774 }, { "epoch": 0.29, "learning_rate": 9.662891563618277e-07, "logits/chosen": -0.38992688059806824, "logits/rejected": -0.4066542685031891, "logps/chosen": -75.0697021484375, "logps/rejected": -125.80155181884766, "loss": 1.604, "rewards/accuracies": 0.0, "rewards/chosen": 1.632849931716919, "rewards/margins": -0.40903162956237793, "rewards/rejected": 2.041881561279297, "step": 1775 }, { "epoch": 0.29, "learning_rate": 9.662416999297052e-07, "logits/chosen": -0.5732222199440002, "logits/rejected": -0.5460708141326904, "logps/chosen": -76.56419372558594, "logps/rejected": -115.5421371459961, "loss": 0.6409, "rewards/accuracies": 0.0, "rewards/chosen": 1.4464691877365112, "rewards/margins": -0.3874427080154419, "rewards/rejected": 1.8339118957519531, "step": 1776 }, { "epoch": 0.29, "learning_rate": 9.661942112846929e-07, "logits/chosen": -0.5721240639686584, "logits/rejected": -0.5996444225311279, "logps/chosen": -54.261802673339844, "logps/rejected": -39.81105041503906, "loss": 0.5567, "rewards/accuracies": 0.0, "rewards/chosen": 0.5462779998779297, "rewards/margins": -0.5307537317276001, "rewards/rejected": 1.0770317316055298, "step": 1777 }, { "epoch": 0.29, "learning_rate": 9.66146690430072e-07, "logits/chosen": -0.6176702976226807, "logits/rejected": -0.6283723711967468, "logps/chosen": -45.492069244384766, "logps/rejected": -46.32852554321289, "loss": 0.4372, "rewards/accuracies": 0.0, "rewards/chosen": 1.4121395349502563, "rewards/margins": -0.12044835090637207, "rewards/rejected": 1.5325878858566284, "step": 1778 }, { "epoch": 0.29, "learning_rate": 9.660991373691252e-07, "logits/chosen": -0.14584408700466156, "logits/rejected": -0.14584408700466156, "logps/chosen": -1.3292789459228516, "logps/rejected": -1.3292789459228516, "loss": 1.2142, "rewards/accuracies": 0.0, "rewards/chosen": 0.4306240975856781, "rewards/margins": 0.0, "rewards/rejected": 0.4306240975856781, "step": 1779 }, { "epoch": 0.29, "learning_rate": 9.660515521051384e-07, "logits/chosen": -0.5612158179283142, "logits/rejected": -0.446572870016098, "logps/chosen": -104.87295532226562, "logps/rejected": -18.455297470092773, "loss": 0.3223, "rewards/accuracies": 1.0, "rewards/chosen": 2.287912130355835, "rewards/margins": 2.101571559906006, "rewards/rejected": 0.18634052574634552, "step": 1780 }, { "epoch": 0.29, "learning_rate": 9.660039346413992e-07, "logits/chosen": -0.5196320414543152, "logits/rejected": -0.5746228098869324, "logps/chosen": -94.37767028808594, "logps/rejected": -144.46522521972656, "loss": 0.8815, "rewards/accuracies": 0.0, "rewards/chosen": 1.6268097162246704, "rewards/margins": -0.9826217889785767, "rewards/rejected": 2.609431505203247, "step": 1781 }, { "epoch": 0.29, "learning_rate": 9.659562849811974e-07, "logits/chosen": -0.7521563172340393, "logits/rejected": -0.6888428330421448, "logps/chosen": -97.73577880859375, "logps/rejected": -110.826171875, "loss": 2.4146, "rewards/accuracies": 0.0, "rewards/chosen": 1.0084892511367798, "rewards/margins": -1.9765311479568481, "rewards/rejected": 2.985020399093628, "step": 1782 }, { "epoch": 0.29, "learning_rate": 9.659086031278254e-07, "logits/chosen": -0.35926273465156555, "logits/rejected": -0.3864854574203491, "logps/chosen": -55.797889709472656, "logps/rejected": -76.73838806152344, "loss": 0.4257, "rewards/accuracies": 1.0, "rewards/chosen": 0.6156883239746094, "rewards/margins": 0.2973060607910156, "rewards/rejected": 0.31838226318359375, "step": 1783 }, { "epoch": 0.29, "learning_rate": 9.658608890845771e-07, "logits/chosen": -0.46272578835487366, "logits/rejected": -0.5363852381706238, "logps/chosen": -92.84898376464844, "logps/rejected": -124.8608627319336, "loss": 3.1054, "rewards/accuracies": 0.0, "rewards/chosen": 0.7437591552734375, "rewards/margins": -0.22590869665145874, "rewards/rejected": 0.9696678519248962, "step": 1784 }, { "epoch": 0.29, "learning_rate": 9.658131428547498e-07, "logits/chosen": -0.24610881507396698, "logits/rejected": -0.3171504735946655, "logps/chosen": -80.4622802734375, "logps/rejected": -75.64942932128906, "loss": 0.8263, "rewards/accuracies": 0.0, "rewards/chosen": 0.44942474365234375, "rewards/margins": -1.0740684270858765, "rewards/rejected": 1.5234931707382202, "step": 1785 }, { "epoch": 0.29, "learning_rate": 9.657653644416418e-07, "logits/chosen": -0.4537840485572815, "logits/rejected": -0.43863019347190857, "logps/chosen": -44.77143478393555, "logps/rejected": -76.24845886230469, "loss": 0.7332, "rewards/accuracies": 0.0, "rewards/chosen": 1.1115883588790894, "rewards/margins": -1.1842795610427856, "rewards/rejected": 2.295867919921875, "step": 1786 }, { "epoch": 0.29, "learning_rate": 9.65717553848554e-07, "logits/chosen": -0.77033931016922, "logits/rejected": -0.7032835483551025, "logps/chosen": -132.42422485351562, "logps/rejected": -61.87013626098633, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 0.9955520629882812, "rewards/margins": 0.04580193758010864, "rewards/rejected": 0.9497501254081726, "step": 1787 }, { "epoch": 0.29, "learning_rate": 9.6566971107879e-07, "logits/chosen": -0.4455025792121887, "logits/rejected": -0.40593698620796204, "logps/chosen": -127.0694351196289, "logps/rejected": -66.24320220947266, "loss": 0.94, "rewards/accuracies": 0.0, "rewards/chosen": 0.29955217242240906, "rewards/margins": -0.6571563482284546, "rewards/rejected": 0.956708550453186, "step": 1788 }, { "epoch": 0.29, "learning_rate": 9.65621836135655e-07, "logits/chosen": -0.7692303657531738, "logits/rejected": -0.6996967196464539, "logps/chosen": -39.664703369140625, "logps/rejected": -65.77237701416016, "loss": 0.7903, "rewards/accuracies": 1.0, "rewards/chosen": 1.4102531671524048, "rewards/margins": 0.3824737071990967, "rewards/rejected": 1.027779459953308, "step": 1789 }, { "epoch": 0.29, "learning_rate": 9.65573929022457e-07, "logits/chosen": -0.6437876224517822, "logits/rejected": -0.637093722820282, "logps/chosen": -318.85052490234375, "logps/rejected": -54.93293380737305, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 2.7960174083709717, "rewards/margins": 1.1851016283035278, "rewards/rejected": 1.6109157800674438, "step": 1790 }, { "epoch": 0.29, "learning_rate": 9.65525989742506e-07, "logits/chosen": -0.11804591864347458, "logits/rejected": -0.10495990514755249, "logps/chosen": -96.95893859863281, "logps/rejected": -55.826637268066406, "loss": 1.3458, "rewards/accuracies": 0.0, "rewards/chosen": 1.7667083740234375, "rewards/margins": -0.30071496963500977, "rewards/rejected": 2.0674233436584473, "step": 1791 }, { "epoch": 0.29, "learning_rate": 9.654780182991138e-07, "logits/chosen": -0.042583297938108444, "logits/rejected": -0.042583297938108444, "logps/chosen": -57.32335662841797, "logps/rejected": -57.32335662841797, "loss": 0.4546, "rewards/accuracies": 0.0, "rewards/chosen": 1.5712441205978394, "rewards/margins": 0.0, "rewards/rejected": 1.5712441205978394, "step": 1792 }, { "epoch": 0.29, "learning_rate": 9.65430014695595e-07, "logits/chosen": -0.21127045154571533, "logits/rejected": -0.016381438821554184, "logps/chosen": -63.57223129272461, "logps/rejected": -113.16583251953125, "loss": 1.1875, "rewards/accuracies": 0.0, "rewards/chosen": 1.8378559350967407, "rewards/margins": -0.45741093158721924, "rewards/rejected": 2.29526686668396, "step": 1793 }, { "epoch": 0.29, "learning_rate": 9.65381978935266e-07, "logits/chosen": -0.6736025810241699, "logits/rejected": -0.69432133436203, "logps/chosen": -115.00846862792969, "logps/rejected": -63.72567367553711, "loss": 1.614, "rewards/accuracies": 0.0, "rewards/chosen": 0.9971458315849304, "rewards/margins": -1.2325434684753418, "rewards/rejected": 2.229689359664917, "step": 1794 }, { "epoch": 0.29, "learning_rate": 9.653339110214458e-07, "logits/chosen": -0.6532445549964905, "logits/rejected": -0.587542712688446, "logps/chosen": -165.00511169433594, "logps/rejected": -23.896446228027344, "loss": 0.8654, "rewards/accuracies": 0.0, "rewards/chosen": 0.15972137451171875, "rewards/margins": -0.6600807309150696, "rewards/rejected": 0.8198021054267883, "step": 1795 }, { "epoch": 0.29, "learning_rate": 9.652858109574552e-07, "logits/chosen": -0.7826747298240662, "logits/rejected": -0.5873280167579651, "logps/chosen": -198.80584716796875, "logps/rejected": -78.24552917480469, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 3.339560031890869, "rewards/margins": 0.9912941455841064, "rewards/rejected": 2.3482658863067627, "step": 1796 }, { "epoch": 0.29, "learning_rate": 9.652376787466178e-07, "logits/chosen": -0.8217235207557678, "logits/rejected": -0.7888252139091492, "logps/chosen": -111.65347290039062, "logps/rejected": -112.26798248291016, "loss": 0.3137, "rewards/accuracies": 1.0, "rewards/chosen": 1.0460113286972046, "rewards/margins": 0.2576255202293396, "rewards/rejected": 0.788385808467865, "step": 1797 }, { "epoch": 0.29, "learning_rate": 9.65189514392259e-07, "logits/chosen": -0.33526715636253357, "logits/rejected": -0.33789780735969543, "logps/chosen": -88.04502868652344, "logps/rejected": -44.82978820800781, "loss": 0.9563, "rewards/accuracies": 0.0, "rewards/chosen": -0.2323562651872635, "rewards/margins": -1.664519190788269, "rewards/rejected": 1.432162880897522, "step": 1798 }, { "epoch": 0.29, "learning_rate": 9.651413178977064e-07, "logits/chosen": -0.3798331022262573, "logits/rejected": -0.00468726409599185, "logps/chosen": -40.26777648925781, "logps/rejected": -64.24549102783203, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 1.5225368738174438, "rewards/margins": 1.1368801593780518, "rewards/rejected": 0.3856567442417145, "step": 1799 }, { "epoch": 0.29, "learning_rate": 9.6509308926629e-07, "logits/chosen": -0.39441418647766113, "logits/rejected": -0.4033089280128479, "logps/chosen": -43.56193542480469, "logps/rejected": -62.81449890136719, "loss": 0.6261, "rewards/accuracies": 1.0, "rewards/chosen": 0.6057510375976562, "rewards/margins": 0.28909605741500854, "rewards/rejected": 0.3166549801826477, "step": 1800 }, { "epoch": 0.29, "learning_rate": 9.650448285013417e-07, "logits/chosen": -0.5453575849533081, "logits/rejected": -0.5594650506973267, "logps/chosen": -98.09841918945312, "logps/rejected": -78.50601196289062, "loss": 0.5346, "rewards/accuracies": 0.0, "rewards/chosen": 1.0115493535995483, "rewards/margins": -0.4606132507324219, "rewards/rejected": 1.4721626043319702, "step": 1801 }, { "epoch": 0.29, "learning_rate": 9.64996535606196e-07, "logits/chosen": -0.35876983404159546, "logits/rejected": -0.3566417694091797, "logps/chosen": -17.322738647460938, "logps/rejected": -51.14950180053711, "loss": 0.7345, "rewards/accuracies": 1.0, "rewards/chosen": 0.3827190399169922, "rewards/margins": 0.34913405776023865, "rewards/rejected": 0.03358497843146324, "step": 1802 }, { "epoch": 0.29, "learning_rate": 9.649482105841898e-07, "logits/chosen": -0.26507750153541565, "logits/rejected": -0.24928203225135803, "logps/chosen": -76.20181274414062, "logps/rejected": -44.39300537109375, "loss": 0.8615, "rewards/accuracies": 1.0, "rewards/chosen": 1.44810950756073, "rewards/margins": 0.38121461868286133, "rewards/rejected": 1.0668948888778687, "step": 1803 }, { "epoch": 0.29, "learning_rate": 9.648998534386615e-07, "logits/chosen": -0.865122377872467, "logits/rejected": -0.8373469710350037, "logps/chosen": -91.93382263183594, "logps/rejected": -135.88839721679688, "loss": 0.3385, "rewards/accuracies": 1.0, "rewards/chosen": 1.303688883781433, "rewards/margins": 0.8133416175842285, "rewards/rejected": 0.490347295999527, "step": 1804 }, { "epoch": 0.29, "learning_rate": 9.648514641729522e-07, "logits/chosen": -0.38523030281066895, "logits/rejected": -0.2596748173236847, "logps/chosen": -78.13751220703125, "logps/rejected": -19.622051239013672, "loss": 0.3126, "rewards/accuracies": 1.0, "rewards/chosen": 1.337316870689392, "rewards/margins": 1.0037013292312622, "rewards/rejected": 0.3336155116558075, "step": 1805 }, { "epoch": 0.29, "learning_rate": 9.64803042790405e-07, "logits/chosen": -0.7001051306724548, "logits/rejected": -0.7001051306724548, "logps/chosen": -133.20046997070312, "logps/rejected": -133.20046997070312, "loss": 0.791, "rewards/accuracies": 0.0, "rewards/chosen": 0.753466784954071, "rewards/margins": 0.0, "rewards/rejected": 0.753466784954071, "step": 1806 }, { "epoch": 0.29, "learning_rate": 9.647545892943657e-07, "logits/chosen": -0.5723860263824463, "logits/rejected": -0.5655289888381958, "logps/chosen": -54.862205505371094, "logps/rejected": -17.957197189331055, "loss": 0.9661, "rewards/accuracies": 1.0, "rewards/chosen": 1.2240676879882812, "rewards/margins": 0.5442607998847961, "rewards/rejected": 0.6798068881034851, "step": 1807 }, { "epoch": 0.29, "learning_rate": 9.647061036881821e-07, "logits/chosen": -0.24861307442188263, "logits/rejected": -0.2579134702682495, "logps/chosen": -123.17935180664062, "logps/rejected": -134.1771240234375, "loss": 1.3644, "rewards/accuracies": 0.0, "rewards/chosen": -0.2309166043996811, "rewards/margins": -0.2988945245742798, "rewards/rejected": 0.0679779052734375, "step": 1808 }, { "epoch": 0.29, "learning_rate": 9.646575859752035e-07, "logits/chosen": -0.6421144008636475, "logits/rejected": -0.6855486631393433, "logps/chosen": -51.461883544921875, "logps/rejected": -96.57164764404297, "loss": 0.8568, "rewards/accuracies": 0.0, "rewards/chosen": 0.7968978881835938, "rewards/margins": -1.0809539556503296, "rewards/rejected": 1.8778518438339233, "step": 1809 }, { "epoch": 0.29, "learning_rate": 9.646090361587827e-07, "logits/chosen": -0.5316223502159119, "logits/rejected": -0.5041146874427795, "logps/chosen": -75.13668823242188, "logps/rejected": -104.51860046386719, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 1.0754966735839844, "rewards/margins": 0.8862975835800171, "rewards/rejected": 0.1891990751028061, "step": 1810 }, { "epoch": 0.29, "learning_rate": 9.645604542422732e-07, "logits/chosen": -0.21964296698570251, "logits/rejected": -0.134617418050766, "logps/chosen": -60.94596862792969, "logps/rejected": -86.62264251708984, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 0.7538772821426392, "rewards/margins": 0.35541918873786926, "rewards/rejected": 0.3984580934047699, "step": 1811 }, { "epoch": 0.29, "learning_rate": 9.645118402290324e-07, "logits/chosen": -0.42028912901878357, "logits/rejected": -0.38013899326324463, "logps/chosen": -192.1126251220703, "logps/rejected": -60.941253662109375, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 3.8108322620391846, "rewards/margins": 1.6733016967773438, "rewards/rejected": 2.137530565261841, "step": 1812 }, { "epoch": 0.29, "learning_rate": 9.644631941224184e-07, "logits/chosen": -0.2508145570755005, "logits/rejected": -0.16377535462379456, "logps/chosen": -50.10313415527344, "logps/rejected": -34.92962646484375, "loss": 0.3852, "rewards/accuracies": 1.0, "rewards/chosen": 0.7978367209434509, "rewards/margins": 0.8992580771446228, "rewards/rejected": -0.10142135620117188, "step": 1813 }, { "epoch": 0.29, "learning_rate": 9.644145159257927e-07, "logits/chosen": -0.5812758803367615, "logits/rejected": -0.5812403559684753, "logps/chosen": -122.63322448730469, "logps/rejected": -57.785133361816406, "loss": 0.335, "rewards/accuracies": 1.0, "rewards/chosen": 2.8135788440704346, "rewards/margins": 0.21097946166992188, "rewards/rejected": 2.6025993824005127, "step": 1814 }, { "epoch": 0.29, "learning_rate": 9.643658056425183e-07, "logits/chosen": -0.4965992569923401, "logits/rejected": -0.5029363632202148, "logps/chosen": -90.20521545410156, "logps/rejected": -91.26181030273438, "loss": 0.6406, "rewards/accuracies": 0.0, "rewards/chosen": 1.0606659650802612, "rewards/margins": -0.39929497241973877, "rewards/rejected": 1.4599609375, "step": 1815 }, { "epoch": 0.29, "learning_rate": 9.643170632759606e-07, "logits/chosen": -0.6518141031265259, "logits/rejected": -0.5997182726860046, "logps/chosen": -98.58224487304688, "logps/rejected": -67.93598937988281, "loss": 1.2193, "rewards/accuracies": 0.0, "rewards/chosen": 0.7953552603721619, "rewards/margins": -1.1687729358673096, "rewards/rejected": 1.9641281366348267, "step": 1816 }, { "epoch": 0.29, "learning_rate": 9.64268288829487e-07, "logits/chosen": -0.39121565222740173, "logits/rejected": -0.3632439076900482, "logps/chosen": -32.29670715332031, "logps/rejected": -98.8349838256836, "loss": 0.6756, "rewards/accuracies": 0.0, "rewards/chosen": 0.6966690421104431, "rewards/margins": -0.17779994010925293, "rewards/rejected": 0.874468982219696, "step": 1817 }, { "epoch": 0.3, "learning_rate": 9.642194823064677e-07, "logits/chosen": -0.6692429184913635, "logits/rejected": -0.5139455199241638, "logps/chosen": -218.2486572265625, "logps/rejected": -19.51618766784668, "loss": 0.8526, "rewards/accuracies": 1.0, "rewards/chosen": 2.9384095668792725, "rewards/margins": 2.815419912338257, "rewards/rejected": 0.12298965454101562, "step": 1818 }, { "epoch": 0.3, "learning_rate": 9.641706437102748e-07, "logits/chosen": -0.5204666256904602, "logits/rejected": -0.5156437754631042, "logps/chosen": -25.28288459777832, "logps/rejected": -1.2820312976837158, "loss": 1.1523, "rewards/accuracies": 0.0, "rewards/chosen": -0.22235718369483948, "rewards/margins": -0.48851120471954346, "rewards/rejected": 0.266154021024704, "step": 1819 }, { "epoch": 0.3, "learning_rate": 9.641217730442824e-07, "logits/chosen": -0.6814224720001221, "logits/rejected": -0.5178859829902649, "logps/chosen": -239.0716552734375, "logps/rejected": -324.70965576171875, "loss": 2.2185, "rewards/accuracies": 0.0, "rewards/chosen": 1.365380883216858, "rewards/margins": -3.4579806327819824, "rewards/rejected": 4.823361396789551, "step": 1820 }, { "epoch": 0.3, "learning_rate": 9.640728703118668e-07, "logits/chosen": -0.4738021790981293, "logits/rejected": -0.47050100564956665, "logps/chosen": -6.615017414093018, "logps/rejected": -19.100053787231445, "loss": 1.4151, "rewards/accuracies": 1.0, "rewards/chosen": 0.17926974594593048, "rewards/margins": 0.3324350118637085, "rewards/rejected": -0.15316525101661682, "step": 1821 }, { "epoch": 0.3, "learning_rate": 9.640239355164073e-07, "logits/chosen": -0.2072477638721466, "logits/rejected": -0.17384472489356995, "logps/chosen": -50.99378967285156, "logps/rejected": -70.44575500488281, "loss": 0.4618, "rewards/accuracies": 1.0, "rewards/chosen": 1.2517330646514893, "rewards/margins": 0.3479672074317932, "rewards/rejected": 0.903765857219696, "step": 1822 }, { "epoch": 0.3, "learning_rate": 9.639749686612842e-07, "logits/chosen": -0.4166909456253052, "logits/rejected": -0.48701176047325134, "logps/chosen": -72.88750457763672, "logps/rejected": -153.3609619140625, "loss": 1.7129, "rewards/accuracies": 0.0, "rewards/chosen": 0.8645492792129517, "rewards/margins": -1.6779557466506958, "rewards/rejected": 2.5425050258636475, "step": 1823 }, { "epoch": 0.3, "learning_rate": 9.63925969749881e-07, "logits/chosen": -0.5751039981842041, "logits/rejected": -0.514002799987793, "logps/chosen": -138.983154296875, "logps/rejected": -67.94908142089844, "loss": 1.003, "rewards/accuracies": 1.0, "rewards/chosen": 2.9969894886016846, "rewards/margins": 2.088538408279419, "rewards/rejected": 0.9084510803222656, "step": 1824 }, { "epoch": 0.3, "learning_rate": 9.638769387855832e-07, "logits/chosen": -0.5569620132446289, "logits/rejected": -0.5082404613494873, "logps/chosen": -53.49402618408203, "logps/rejected": -59.87034606933594, "loss": 0.7248, "rewards/accuracies": 0.0, "rewards/chosen": 0.8821258544921875, "rewards/margins": -0.3137848377227783, "rewards/rejected": 1.1959106922149658, "step": 1825 }, { "epoch": 0.3, "learning_rate": 9.638278757717779e-07, "logits/chosen": -0.2953309714794159, "logits/rejected": -0.2907005250453949, "logps/chosen": -123.3516845703125, "logps/rejected": -146.29383850097656, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.3608306646347046, "rewards/margins": 0.17246854305267334, "rewards/rejected": 1.1883621215820312, "step": 1826 }, { "epoch": 0.3, "learning_rate": 9.637787807118553e-07, "logits/chosen": -0.32328304648399353, "logits/rejected": -0.3141739070415497, "logps/chosen": -78.08499145507812, "logps/rejected": -74.21198272705078, "loss": 1.1619, "rewards/accuracies": 0.0, "rewards/chosen": 1.5048248767852783, "rewards/margins": -0.33583831787109375, "rewards/rejected": 1.840663194656372, "step": 1827 }, { "epoch": 0.3, "learning_rate": 9.637296536092074e-07, "logits/chosen": -0.14718134701251984, "logits/rejected": 0.09794524312019348, "logps/chosen": -57.46781921386719, "logps/rejected": -123.49645233154297, "loss": 1.3285, "rewards/accuracies": 0.0, "rewards/chosen": 1.046656847000122, "rewards/margins": -1.9644584655761719, "rewards/rejected": 3.011115312576294, "step": 1828 }, { "epoch": 0.3, "learning_rate": 9.636804944672282e-07, "logits/chosen": -0.2256469428539276, "logits/rejected": -0.10098294913768768, "logps/chosen": -133.4879150390625, "logps/rejected": -81.48763275146484, "loss": 0.4003, "rewards/accuracies": 1.0, "rewards/chosen": 2.7836365699768066, "rewards/margins": 1.5618927478790283, "rewards/rejected": 1.2217438220977783, "step": 1829 }, { "epoch": 0.3, "learning_rate": 9.636313032893142e-07, "logits/chosen": -0.5869866013526917, "logits/rejected": -0.5869866013526917, "logps/chosen": -37.87635040283203, "logps/rejected": -37.87635040283203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.0376129150390625, "rewards/margins": 0.0, "rewards/rejected": 1.0376129150390625, "step": 1830 }, { "epoch": 0.3, "learning_rate": 9.635820800788638e-07, "logits/chosen": -0.4660891890525818, "logits/rejected": -0.3230683207511902, "logps/chosen": -81.39799499511719, "logps/rejected": -60.08574676513672, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": 1.6756683588027954, "rewards/margins": 0.3040870428085327, "rewards/rejected": 1.3715813159942627, "step": 1831 }, { "epoch": 0.3, "learning_rate": 9.635328248392784e-07, "logits/chosen": -0.17433106899261475, "logits/rejected": -0.18556848168373108, "logps/chosen": -60.56085205078125, "logps/rejected": -55.24895477294922, "loss": 1.0736, "rewards/accuracies": 0.0, "rewards/chosen": 0.7281364798545837, "rewards/margins": -1.2355430126190186, "rewards/rejected": 1.963679552078247, "step": 1832 }, { "epoch": 0.3, "learning_rate": 9.63483537573961e-07, "logits/chosen": -0.6728404760360718, "logits/rejected": -0.627115786075592, "logps/chosen": -98.41657257080078, "logps/rejected": -80.03123474121094, "loss": 0.447, "rewards/accuracies": 1.0, "rewards/chosen": 1.8196296691894531, "rewards/margins": 0.005098700523376465, "rewards/rejected": 1.8145309686660767, "step": 1833 }, { "epoch": 0.3, "learning_rate": 9.634342182863162e-07, "logits/chosen": -0.7963566780090332, "logits/rejected": -0.7077906727790833, "logps/chosen": -54.97502899169922, "logps/rejected": -65.28536224365234, "loss": 1.1033, "rewards/accuracies": 1.0, "rewards/chosen": 1.677350640296936, "rewards/margins": 0.038605451583862305, "rewards/rejected": 1.6387451887130737, "step": 1834 }, { "epoch": 0.3, "learning_rate": 9.633848669797523e-07, "logits/chosen": -0.4032575786113739, "logits/rejected": -0.38895222544670105, "logps/chosen": -75.11825561523438, "logps/rejected": -43.024627685546875, "loss": 1.0277, "rewards/accuracies": 0.0, "rewards/chosen": 0.645825207233429, "rewards/margins": -0.9563243985176086, "rewards/rejected": 1.6021496057510376, "step": 1835 }, { "epoch": 0.3, "learning_rate": 9.633354836576785e-07, "logits/chosen": -0.6104785799980164, "logits/rejected": -0.6104785799980164, "logps/chosen": -40.02928924560547, "logps/rejected": -40.02928924560547, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.9593750238418579, "rewards/margins": 0.0, "rewards/rejected": 0.9593750238418579, "step": 1836 }, { "epoch": 0.3, "learning_rate": 9.63286068323507e-07, "logits/chosen": -0.27033767104148865, "logits/rejected": -0.2278415709733963, "logps/chosen": -79.42796325683594, "logps/rejected": -72.02864074707031, "loss": 0.7744, "rewards/accuracies": 0.0, "rewards/chosen": 0.5458893179893494, "rewards/margins": -0.30242156982421875, "rewards/rejected": 0.8483108878135681, "step": 1837 }, { "epoch": 0.3, "learning_rate": 9.632366209806518e-07, "logits/chosen": -0.5478050708770752, "logits/rejected": -0.5356186032295227, "logps/chosen": -104.882568359375, "logps/rejected": -99.6761474609375, "loss": 0.895, "rewards/accuracies": 1.0, "rewards/chosen": 2.858067274093628, "rewards/margins": 0.8030638694763184, "rewards/rejected": 2.0550034046173096, "step": 1838 }, { "epoch": 0.3, "learning_rate": 9.631871416325293e-07, "logits/chosen": -0.3798847794532776, "logits/rejected": -0.4469505846500397, "logps/chosen": -90.05166625976562, "logps/rejected": -106.08778381347656, "loss": 0.991, "rewards/accuracies": 0.0, "rewards/chosen": 0.6802360415458679, "rewards/margins": -1.1880652904510498, "rewards/rejected": 1.8683013916015625, "step": 1839 }, { "epoch": 0.3, "learning_rate": 9.63137630282558e-07, "logits/chosen": -0.7056890726089478, "logits/rejected": -0.7089148163795471, "logps/chosen": -70.08847045898438, "logps/rejected": -155.18505859375, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 1.303486704826355, "rewards/margins": 1.5700974464416504, "rewards/rejected": -0.266610711812973, "step": 1840 }, { "epoch": 0.3, "learning_rate": 9.630880869341587e-07, "logits/chosen": -0.6803218126296997, "logits/rejected": -0.6757042407989502, "logps/chosen": -86.71920776367188, "logps/rejected": -127.23085021972656, "loss": 1.0566, "rewards/accuracies": 0.0, "rewards/chosen": 1.0065491199493408, "rewards/margins": -0.7425872087478638, "rewards/rejected": 1.7491363286972046, "step": 1841 }, { "epoch": 0.3, "learning_rate": 9.630385115907544e-07, "logits/chosen": -0.7065485119819641, "logits/rejected": -0.6480814814567566, "logps/chosen": -102.07771301269531, "logps/rejected": -89.38880157470703, "loss": 1.0013, "rewards/accuracies": 0.0, "rewards/chosen": 0.4179275631904602, "rewards/margins": -1.1068778038024902, "rewards/rejected": 1.5248054265975952, "step": 1842 }, { "epoch": 0.3, "learning_rate": 9.629889042557704e-07, "logits/chosen": -0.2828211486339569, "logits/rejected": -0.32800158858299255, "logps/chosen": -83.22544860839844, "logps/rejected": -108.39778137207031, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": 0.904169499874115, "rewards/margins": 0.6942497491836548, "rewards/rejected": 0.209919735789299, "step": 1843 }, { "epoch": 0.3, "learning_rate": 9.629392649326337e-07, "logits/chosen": -0.6213772892951965, "logits/rejected": -0.5713744163513184, "logps/chosen": -94.26029205322266, "logps/rejected": -44.99891662597656, "loss": 2.1473, "rewards/accuracies": 0.0, "rewards/chosen": 1.1599876880645752, "rewards/margins": -0.15442156791687012, "rewards/rejected": 1.3144092559814453, "step": 1844 }, { "epoch": 0.3, "learning_rate": 9.628895936247742e-07, "logits/chosen": -0.4153004288673401, "logits/rejected": -0.4153004288673401, "logps/chosen": -19.451736450195312, "logps/rejected": -19.451736450195312, "loss": 1.3264, "rewards/accuracies": 0.0, "rewards/chosen": 1.1995338201522827, "rewards/margins": 0.0, "rewards/rejected": 1.1995338201522827, "step": 1845 }, { "epoch": 0.3, "learning_rate": 9.628398903356239e-07, "logits/chosen": -0.6302679181098938, "logits/rejected": -0.514167308807373, "logps/chosen": -57.5745735168457, "logps/rejected": -31.813915252685547, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 1.5422862768173218, "rewards/margins": 0.637566328048706, "rewards/rejected": 0.9047199487686157, "step": 1846 }, { "epoch": 0.3, "learning_rate": 9.627901550686163e-07, "logits/chosen": -0.7353848218917847, "logits/rejected": -0.6761333346366882, "logps/chosen": -141.80059814453125, "logps/rejected": -123.93668365478516, "loss": 1.1445, "rewards/accuracies": 1.0, "rewards/chosen": 4.0016021728515625, "rewards/margins": 3.225557804107666, "rewards/rejected": 0.776044487953186, "step": 1847 }, { "epoch": 0.3, "learning_rate": 9.627403878271882e-07, "logits/chosen": -0.74085533618927, "logits/rejected": -0.6314619779586792, "logps/chosen": -91.39568328857422, "logps/rejected": -66.70681762695312, "loss": 1.1355, "rewards/accuracies": 0.0, "rewards/chosen": 0.31128770112991333, "rewards/margins": -1.516951084136963, "rewards/rejected": 1.8282387256622314, "step": 1848 }, { "epoch": 0.3, "learning_rate": 9.626905886147779e-07, "logits/chosen": -0.6299273371696472, "logits/rejected": -0.5840972065925598, "logps/chosen": -52.21613311767578, "logps/rejected": -47.55911636352539, "loss": 1.5856, "rewards/accuracies": 0.0, "rewards/chosen": 0.3875900208950043, "rewards/margins": -0.6241706609725952, "rewards/rejected": 1.0117607116699219, "step": 1849 }, { "epoch": 0.3, "learning_rate": 9.626407574348257e-07, "logits/chosen": -0.2013881355524063, "logits/rejected": -0.1862785667181015, "logps/chosen": -53.94867706298828, "logps/rejected": -97.01417541503906, "loss": 0.511, "rewards/accuracies": 0.0, "rewards/chosen": 1.435259222984314, "rewards/margins": -0.15481948852539062, "rewards/rejected": 1.5900787115097046, "step": 1850 }, { "epoch": 0.3, "learning_rate": 9.625908942907747e-07, "logits/chosen": -0.7346110343933105, "logits/rejected": -0.7235197424888611, "logps/chosen": -258.3079833984375, "logps/rejected": -99.06056213378906, "loss": 0.3117, "rewards/accuracies": 1.0, "rewards/chosen": 3.428088426589966, "rewards/margins": 0.5581085681915283, "rewards/rejected": 2.8699798583984375, "step": 1851 }, { "epoch": 0.3, "learning_rate": 9.6254099918607e-07, "logits/chosen": -0.47075846791267395, "logits/rejected": -0.47437626123428345, "logps/chosen": -64.64878845214844, "logps/rejected": -100.51591491699219, "loss": 1.1042, "rewards/accuracies": 0.0, "rewards/chosen": 0.15390396118164062, "rewards/margins": -0.3430275022983551, "rewards/rejected": 0.4969314634799957, "step": 1852 }, { "epoch": 0.3, "learning_rate": 9.624910721241588e-07, "logits/chosen": -0.37128230929374695, "logits/rejected": -0.36796852946281433, "logps/chosen": -21.92938804626465, "logps/rejected": -28.233549118041992, "loss": 1.0061, "rewards/accuracies": 0.0, "rewards/chosen": 0.04678993299603462, "rewards/margins": -0.36947765946388245, "rewards/rejected": 0.41626760363578796, "step": 1853 }, { "epoch": 0.3, "learning_rate": 9.624411131084908e-07, "logits/chosen": -0.8058673143386841, "logits/rejected": -0.7585764527320862, "logps/chosen": -102.1646957397461, "logps/rejected": -199.58087158203125, "loss": 2.2877, "rewards/accuracies": 0.0, "rewards/chosen": 1.8620017766952515, "rewards/margins": -3.2811317443847656, "rewards/rejected": 5.143133640289307, "step": 1854 }, { "epoch": 0.3, "learning_rate": 9.623911221425174e-07, "logits/chosen": -0.27431952953338623, "logits/rejected": -0.27431952953338623, "logps/chosen": -65.46418762207031, "logps/rejected": -65.46418762207031, "loss": 0.511, "rewards/accuracies": 0.0, "rewards/chosen": 0.24633483588695526, "rewards/margins": 0.0, "rewards/rejected": 0.24633483588695526, "step": 1855 }, { "epoch": 0.3, "learning_rate": 9.623410992296929e-07, "logits/chosen": -0.6235947608947754, "logits/rejected": -0.6101281046867371, "logps/chosen": -91.84649658203125, "logps/rejected": -60.265655517578125, "loss": 1.1523, "rewards/accuracies": 0.0, "rewards/chosen": 0.19964599609375, "rewards/margins": -1.2613327503204346, "rewards/rejected": 1.4609787464141846, "step": 1856 }, { "epoch": 0.3, "learning_rate": 9.62291044373473e-07, "logits/chosen": -0.5449185371398926, "logits/rejected": -0.4515722692012787, "logps/chosen": -116.42265319824219, "logps/rejected": -338.7391052246094, "loss": 1.2945, "rewards/accuracies": 0.0, "rewards/chosen": 1.820672631263733, "rewards/margins": -1.5890809297561646, "rewards/rejected": 3.4097535610198975, "step": 1857 }, { "epoch": 0.3, "learning_rate": 9.62240957577316e-07, "logits/chosen": -0.29961854219436646, "logits/rejected": -0.26915639638900757, "logps/chosen": -61.48270797729492, "logps/rejected": -58.86769104003906, "loss": 0.8847, "rewards/accuracies": 0.0, "rewards/chosen": 0.8080852627754211, "rewards/margins": -0.23841899633407593, "rewards/rejected": 1.046504259109497, "step": 1858 }, { "epoch": 0.3, "learning_rate": 9.62190838844683e-07, "logits/chosen": -0.45146748423576355, "logits/rejected": -0.45146748423576355, "logps/chosen": -30.74567413330078, "logps/rejected": -30.74567413330078, "loss": 0.3951, "rewards/accuracies": 0.0, "rewards/chosen": 1.015753149986267, "rewards/margins": 0.0, "rewards/rejected": 1.015753149986267, "step": 1859 }, { "epoch": 0.3, "learning_rate": 9.62140688179036e-07, "logits/chosen": -0.39160922169685364, "logits/rejected": -0.3867012560367584, "logps/chosen": -58.44831085205078, "logps/rejected": -5.251425743103027, "loss": 1.5132, "rewards/accuracies": 0.0, "rewards/chosen": -0.1368785947561264, "rewards/margins": -0.37517234683036804, "rewards/rejected": 0.23829375207424164, "step": 1860 }, { "epoch": 0.3, "learning_rate": 9.620905055838402e-07, "logits/chosen": -0.47113966941833496, "logits/rejected": -0.5174566507339478, "logps/chosen": -172.93801879882812, "logps/rejected": -112.21610260009766, "loss": 0.3322, "rewards/accuracies": 1.0, "rewards/chosen": 1.9226547479629517, "rewards/margins": 0.8101234436035156, "rewards/rejected": 1.112531304359436, "step": 1861 }, { "epoch": 0.3, "learning_rate": 9.62040291062563e-07, "logits/chosen": -0.5328531265258789, "logits/rejected": -0.5333726406097412, "logps/chosen": -2.731926918029785, "logps/rejected": -1.9708698987960815, "loss": 1.7709, "rewards/accuracies": 0.0, "rewards/chosen": 0.32861897349357605, "rewards/margins": -0.06247815489768982, "rewards/rejected": 0.39109712839126587, "step": 1862 }, { "epoch": 0.3, "learning_rate": 9.619900446186734e-07, "logits/chosen": -0.4643878638744354, "logits/rejected": -0.43763136863708496, "logps/chosen": -132.69544982910156, "logps/rejected": -43.05631637573242, "loss": 0.4385, "rewards/accuracies": 1.0, "rewards/chosen": 0.7463928461074829, "rewards/margins": 0.6329326629638672, "rewards/rejected": 0.11346016079187393, "step": 1863 }, { "epoch": 0.3, "learning_rate": 9.619397662556433e-07, "logits/chosen": -1.0326822996139526, "logits/rejected": -0.8174601197242737, "logps/chosen": -116.1693344116211, "logps/rejected": -99.64689636230469, "loss": 0.9491, "rewards/accuracies": 0.0, "rewards/chosen": 2.3099632263183594, "rewards/margins": -1.4297540187835693, "rewards/rejected": 3.7397172451019287, "step": 1864 }, { "epoch": 0.3, "learning_rate": 9.618894559769462e-07, "logits/chosen": -0.5957133173942566, "logits/rejected": -0.5231002569198608, "logps/chosen": -78.732666015625, "logps/rejected": -83.39502716064453, "loss": 0.4972, "rewards/accuracies": 0.0, "rewards/chosen": 1.182318091392517, "rewards/margins": -0.22049331665039062, "rewards/rejected": 1.4028114080429077, "step": 1865 }, { "epoch": 0.3, "learning_rate": 9.618391137860582e-07, "logits/chosen": -0.11659301072359085, "logits/rejected": -0.09671010822057724, "logps/chosen": -43.55418014526367, "logps/rejected": -49.995582580566406, "loss": 0.8216, "rewards/accuracies": 0.0, "rewards/chosen": 0.7099079489707947, "rewards/margins": -0.46108478307724, "rewards/rejected": 1.1709927320480347, "step": 1866 }, { "epoch": 0.3, "learning_rate": 9.617887396864573e-07, "logits/chosen": -0.4555852711200714, "logits/rejected": -0.4555852711200714, "logps/chosen": -62.249244689941406, "logps/rejected": -62.249244689941406, "loss": 0.6445, "rewards/accuracies": 0.0, "rewards/chosen": 2.643195390701294, "rewards/margins": 0.0, "rewards/rejected": 2.643195390701294, "step": 1867 }, { "epoch": 0.3, "learning_rate": 9.61738333681624e-07, "logits/chosen": -0.8066610097885132, "logits/rejected": -0.7182728052139282, "logps/chosen": -76.61355590820312, "logps/rejected": -22.668020248413086, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 3.4922873973846436, "rewards/margins": 3.2373762130737305, "rewards/rejected": 0.25491124391555786, "step": 1868 }, { "epoch": 0.3, "learning_rate": 9.616878957750407e-07, "logits/chosen": -0.48906731605529785, "logits/rejected": -0.3470480442047119, "logps/chosen": -169.71470642089844, "logps/rejected": -88.01683807373047, "loss": 0.353, "rewards/accuracies": 1.0, "rewards/chosen": 4.001155376434326, "rewards/margins": 1.9070069789886475, "rewards/rejected": 2.0941483974456787, "step": 1869 }, { "epoch": 0.3, "learning_rate": 9.616374259701925e-07, "logits/chosen": -0.3260138928890228, "logits/rejected": -0.3260138928890228, "logps/chosen": -44.027034759521484, "logps/rejected": -44.027034759521484, "loss": 0.3673, "rewards/accuracies": 0.0, "rewards/chosen": 2.045912504196167, "rewards/margins": 0.0, "rewards/rejected": 2.045912504196167, "step": 1870 }, { "epoch": 0.3, "learning_rate": 9.615869242705663e-07, "logits/chosen": -0.5456026196479797, "logits/rejected": -0.537469208240509, "logps/chosen": -95.54425048828125, "logps/rejected": -76.7726058959961, "loss": 0.2584, "rewards/accuracies": 1.0, "rewards/chosen": 0.5167709589004517, "rewards/margins": 0.629145085811615, "rewards/rejected": -0.11237411946058273, "step": 1871 }, { "epoch": 0.3, "learning_rate": 9.615363906796509e-07, "logits/chosen": -0.2636527121067047, "logits/rejected": -0.27048853039741516, "logps/chosen": -63.034629821777344, "logps/rejected": -102.2744369506836, "loss": 0.2975, "rewards/accuracies": 1.0, "rewards/chosen": 1.5668312311172485, "rewards/margins": 0.697430431842804, "rewards/rejected": 0.8694007992744446, "step": 1872 }, { "epoch": 0.3, "learning_rate": 9.614858252009384e-07, "logits/chosen": -0.2530997395515442, "logits/rejected": -0.2749357223510742, "logps/chosen": -89.41133117675781, "logps/rejected": -42.87178039550781, "loss": 1.0178, "rewards/accuracies": 0.0, "rewards/chosen": 0.11243820190429688, "rewards/margins": -1.658624291419983, "rewards/rejected": 1.7710624933242798, "step": 1873 }, { "epoch": 0.3, "learning_rate": 9.614352278379216e-07, "logits/chosen": -0.7570366859436035, "logits/rejected": -0.6404500603675842, "logps/chosen": -98.35230255126953, "logps/rejected": -85.00537872314453, "loss": 0.575, "rewards/accuracies": 0.0, "rewards/chosen": 1.3053123950958252, "rewards/margins": -0.18745267391204834, "rewards/rejected": 1.4927650690078735, "step": 1874 }, { "epoch": 0.3, "learning_rate": 9.613845985940969e-07, "logits/chosen": -0.6954625844955444, "logits/rejected": -1.1159250736236572, "logps/chosen": -91.5401840209961, "logps/rejected": -37.15263748168945, "loss": 0.4086, "rewards/accuracies": 1.0, "rewards/chosen": 0.5829262137413025, "rewards/margins": 0.37607842683792114, "rewards/rejected": 0.20684777200222015, "step": 1875 }, { "epoch": 0.3, "learning_rate": 9.61333937472962e-07, "logits/chosen": -0.22599723935127258, "logits/rejected": -0.14875158667564392, "logps/chosen": -81.41488647460938, "logps/rejected": -26.569530487060547, "loss": 1.484, "rewards/accuracies": 0.0, "rewards/chosen": 0.5926742553710938, "rewards/margins": -0.411594033241272, "rewards/rejected": 1.0042682886123657, "step": 1876 }, { "epoch": 0.3, "learning_rate": 9.612832444780175e-07, "logits/chosen": -0.41007888317108154, "logits/rejected": -0.3604786694049835, "logps/chosen": -63.842071533203125, "logps/rejected": -42.81480026245117, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": 1.5078262090682983, "rewards/margins": 0.8212695717811584, "rewards/rejected": 0.6865566372871399, "step": 1877 }, { "epoch": 0.3, "learning_rate": 9.612325196127654e-07, "logits/chosen": -0.43184494972229004, "logits/rejected": -0.395791232585907, "logps/chosen": -94.71184539794922, "logps/rejected": -58.875282287597656, "loss": 0.668, "rewards/accuracies": 0.0, "rewards/chosen": 1.256691813468933, "rewards/margins": -0.5501395463943481, "rewards/rejected": 1.8068313598632812, "step": 1878 }, { "epoch": 0.3, "learning_rate": 9.611817628807103e-07, "logits/chosen": -0.04502226412296295, "logits/rejected": -0.04576680809259415, "logps/chosen": -2.2594103813171387, "logps/rejected": -32.98278045654297, "loss": 0.6835, "rewards/accuracies": 0.0, "rewards/chosen": 0.15168647468090057, "rewards/margins": -0.050605401396751404, "rewards/rejected": 0.20229187607765198, "step": 1879 }, { "epoch": 0.31, "learning_rate": 9.61130974285359e-07, "logits/chosen": -0.6088447570800781, "logits/rejected": -0.6302792429924011, "logps/chosen": -88.54081726074219, "logps/rejected": -52.88446807861328, "loss": 0.377, "rewards/accuracies": 1.0, "rewards/chosen": 2.295780897140503, "rewards/margins": 0.7802596092224121, "rewards/rejected": 1.5155212879180908, "step": 1880 }, { "epoch": 0.31, "learning_rate": 9.610801538302207e-07, "logits/chosen": -0.5841541886329651, "logits/rejected": -0.4781188368797302, "logps/chosen": -113.14983367919922, "logps/rejected": -35.03185272216797, "loss": 0.5601, "rewards/accuracies": 1.0, "rewards/chosen": 0.9154464602470398, "rewards/margins": 0.8499862551689148, "rewards/rejected": 0.065460205078125, "step": 1881 }, { "epoch": 0.31, "learning_rate": 9.610293015188067e-07, "logits/chosen": -0.31499502062797546, "logits/rejected": -0.35456758737564087, "logps/chosen": -63.175079345703125, "logps/rejected": -59.0396728515625, "loss": 1.005, "rewards/accuracies": 0.0, "rewards/chosen": 0.395730584859848, "rewards/margins": -1.2483749389648438, "rewards/rejected": 1.6441055536270142, "step": 1882 }, { "epoch": 0.31, "learning_rate": 9.609784173546302e-07, "logits/chosen": -0.5584301352500916, "logits/rejected": -0.545765221118927, "logps/chosen": -56.20002746582031, "logps/rejected": -109.40081024169922, "loss": 1.1607, "rewards/accuracies": 0.0, "rewards/chosen": 0.8786941766738892, "rewards/margins": -1.4502311944961548, "rewards/rejected": 2.328925371170044, "step": 1883 }, { "epoch": 0.31, "learning_rate": 9.60927501341207e-07, "logits/chosen": -0.5099976658821106, "logits/rejected": -0.5243518948554993, "logps/chosen": -57.782806396484375, "logps/rejected": -55.727176666259766, "loss": 0.8293, "rewards/accuracies": 0.0, "rewards/chosen": 1.328944444656372, "rewards/margins": -0.22423970699310303, "rewards/rejected": 1.553184151649475, "step": 1884 }, { "epoch": 0.31, "learning_rate": 9.608765534820547e-07, "logits/chosen": -0.5781271457672119, "logits/rejected": -0.42662450671195984, "logps/chosen": -140.04373168945312, "logps/rejected": -109.5509033203125, "loss": 0.679, "rewards/accuracies": 0.0, "rewards/chosen": 1.4898834228515625, "rewards/margins": -0.44904327392578125, "rewards/rejected": 1.9389266967773438, "step": 1885 }, { "epoch": 0.31, "learning_rate": 9.608255737806932e-07, "logits/chosen": -0.8569148182868958, "logits/rejected": -0.7578107118606567, "logps/chosen": -177.97158813476562, "logps/rejected": -209.34249877929688, "loss": 1.2729, "rewards/accuracies": 0.0, "rewards/chosen": 3.719381809234619, "rewards/margins": -2.0626401901245117, "rewards/rejected": 5.782021999359131, "step": 1886 }, { "epoch": 0.31, "learning_rate": 9.60774562240645e-07, "logits/chosen": -0.39840611815452576, "logits/rejected": -0.43675366044044495, "logps/chosen": -69.8489990234375, "logps/rejected": -130.455078125, "loss": 0.7565, "rewards/accuracies": 1.0, "rewards/chosen": 0.2896560728549957, "rewards/margins": 0.5637878179550171, "rewards/rejected": -0.27413177490234375, "step": 1887 }, { "epoch": 0.31, "learning_rate": 9.607235188654349e-07, "logits/chosen": -0.4412376880645752, "logits/rejected": -0.3741144835948944, "logps/chosen": -137.91807556152344, "logps/rejected": -71.44622802734375, "loss": 0.888, "rewards/accuracies": 1.0, "rewards/chosen": 3.562277317047119, "rewards/margins": 2.4367516040802, "rewards/rejected": 1.125525712966919, "step": 1888 }, { "epoch": 0.31, "learning_rate": 9.606724436585885e-07, "logits/chosen": -0.4949439465999603, "logits/rejected": -0.4949439465999603, "logps/chosen": -52.56729507446289, "logps/rejected": -52.56729507446289, "loss": 0.4158, "rewards/accuracies": 0.0, "rewards/chosen": 1.2980221509933472, "rewards/margins": 0.0, "rewards/rejected": 1.2980221509933472, "step": 1889 }, { "epoch": 0.31, "learning_rate": 9.606213366236353e-07, "logits/chosen": -0.4282463490962982, "logits/rejected": -0.3230648934841156, "logps/chosen": -78.84693908691406, "logps/rejected": -70.15172576904297, "loss": 0.4406, "rewards/accuracies": 1.0, "rewards/chosen": 1.136932373046875, "rewards/margins": 0.30349576473236084, "rewards/rejected": 0.8334366083145142, "step": 1890 }, { "epoch": 0.31, "learning_rate": 9.605701977641063e-07, "logits/chosen": -0.5050025582313538, "logits/rejected": -0.5158449411392212, "logps/chosen": -88.16983032226562, "logps/rejected": -122.30838775634766, "loss": 0.3289, "rewards/accuracies": 1.0, "rewards/chosen": 3.714080810546875, "rewards/margins": 0.23163819313049316, "rewards/rejected": 3.482442617416382, "step": 1891 }, { "epoch": 0.31, "learning_rate": 9.605190270835346e-07, "logits/chosen": -0.6749734878540039, "logits/rejected": -0.443446546792984, "logps/chosen": -155.91331481933594, "logps/rejected": -75.60264587402344, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 2.6618974208831787, "rewards/margins": 1.6515450477600098, "rewards/rejected": 1.010352373123169, "step": 1892 }, { "epoch": 0.31, "learning_rate": 9.604678245854555e-07, "logits/chosen": -0.8623543381690979, "logits/rejected": -0.7333976030349731, "logps/chosen": -102.18170166015625, "logps/rejected": -59.12023162841797, "loss": 1.2164, "rewards/accuracies": 0.0, "rewards/chosen": 0.5142318606376648, "rewards/margins": -0.0727584958076477, "rewards/rejected": 0.5869903564453125, "step": 1893 }, { "epoch": 0.31, "learning_rate": 9.604165902734068e-07, "logits/chosen": -0.6697613596916199, "logits/rejected": -0.6655304431915283, "logps/chosen": -102.9799575805664, "logps/rejected": -80.19184112548828, "loss": 0.7336, "rewards/accuracies": 0.0, "rewards/chosen": -0.025260163471102715, "rewards/margins": -1.0907670259475708, "rewards/rejected": 1.0655068159103394, "step": 1894 }, { "epoch": 0.31, "learning_rate": 9.60365324150928e-07, "logits/chosen": -0.401533842086792, "logits/rejected": -0.42299142479896545, "logps/chosen": -136.7181396484375, "logps/rejected": -138.74819946289062, "loss": 1.922, "rewards/accuracies": 0.0, "rewards/chosen": 2.143322706222534, "rewards/margins": -2.8124663829803467, "rewards/rejected": 4.955789089202881, "step": 1895 }, { "epoch": 0.31, "learning_rate": 9.603140262215616e-07, "logits/chosen": -0.6787334084510803, "logits/rejected": -0.6574382781982422, "logps/chosen": -198.05368041992188, "logps/rejected": -182.38446044921875, "loss": 0.5278, "rewards/accuracies": 1.0, "rewards/chosen": 3.503103733062744, "rewards/margins": 0.4046783447265625, "rewards/rejected": 3.0984253883361816, "step": 1896 }, { "epoch": 0.31, "learning_rate": 9.602626964888514e-07, "logits/chosen": -0.321422815322876, "logits/rejected": -0.3415125012397766, "logps/chosen": -45.75225830078125, "logps/rejected": -51.572174072265625, "loss": 1.1172, "rewards/accuracies": 0.0, "rewards/chosen": 0.9980335235595703, "rewards/margins": -0.7948330640792847, "rewards/rejected": 1.792866587638855, "step": 1897 }, { "epoch": 0.31, "learning_rate": 9.602113349563438e-07, "logits/chosen": -0.3487258851528168, "logits/rejected": -0.3546931743621826, "logps/chosen": -43.227317810058594, "logps/rejected": -42.24138641357422, "loss": 0.9234, "rewards/accuracies": 0.0, "rewards/chosen": 0.7317013144493103, "rewards/margins": -0.48410147428512573, "rewards/rejected": 1.215802788734436, "step": 1898 }, { "epoch": 0.31, "learning_rate": 9.601599416275877e-07, "logits/chosen": -0.8440920114517212, "logits/rejected": -0.8549969792366028, "logps/chosen": -112.92807006835938, "logps/rejected": -114.27743530273438, "loss": 0.8268, "rewards/accuracies": 0.0, "rewards/chosen": 0.4800659120082855, "rewards/margins": -1.214300513267517, "rewards/rejected": 1.694366455078125, "step": 1899 }, { "epoch": 0.31, "learning_rate": 9.601085165061336e-07, "logits/chosen": -0.7914395332336426, "logits/rejected": -0.7661675810813904, "logps/chosen": -101.52288818359375, "logps/rejected": -98.24520111083984, "loss": 1.8236, "rewards/accuracies": 0.0, "rewards/chosen": 1.314570665359497, "rewards/margins": -0.8010215759277344, "rewards/rejected": 2.1155922412872314, "step": 1900 }, { "epoch": 0.31, "learning_rate": 9.600570595955346e-07, "logits/chosen": -0.4499278664588928, "logits/rejected": -0.45704489946365356, "logps/chosen": -60.11737823486328, "logps/rejected": -217.56759643554688, "loss": 1.3339, "rewards/accuracies": 0.0, "rewards/chosen": 1.5600547790527344, "rewards/margins": -1.5276985168457031, "rewards/rejected": 3.0877532958984375, "step": 1901 }, { "epoch": 0.31, "learning_rate": 9.60005570899346e-07, "logits/chosen": -0.42759963870048523, "logits/rejected": -0.35795891284942627, "logps/chosen": -253.26095581054688, "logps/rejected": -26.563472747802734, "loss": 0.1272, "rewards/accuracies": 1.0, "rewards/chosen": 2.473764181137085, "rewards/margins": 1.9921226501464844, "rewards/rejected": 0.48164159059524536, "step": 1902 }, { "epoch": 0.31, "learning_rate": 9.59954050421125e-07, "logits/chosen": -0.5703677535057068, "logits/rejected": -0.5027574300765991, "logps/chosen": -115.27793884277344, "logps/rejected": -91.89042663574219, "loss": 0.4423, "rewards/accuracies": 1.0, "rewards/chosen": 2.1031205654144287, "rewards/margins": 0.5269913673400879, "rewards/rejected": 1.5761291980743408, "step": 1903 }, { "epoch": 0.31, "learning_rate": 9.599024981644312e-07, "logits/chosen": -0.7217491865158081, "logits/rejected": -0.6047840118408203, "logps/chosen": -111.36512756347656, "logps/rejected": -30.541135787963867, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 0.647113025188446, "rewards/margins": 0.48033082485198975, "rewards/rejected": 0.1667821854352951, "step": 1904 }, { "epoch": 0.31, "learning_rate": 9.598509141328263e-07, "logits/chosen": -0.4720212519168854, "logits/rejected": -0.41340774297714233, "logps/chosen": -61.882301330566406, "logps/rejected": -136.0941925048828, "loss": 0.2007, "rewards/accuracies": 1.0, "rewards/chosen": 1.6946114301681519, "rewards/margins": 1.4661126136779785, "rewards/rejected": 0.22849884629249573, "step": 1905 }, { "epoch": 0.31, "learning_rate": 9.597992983298745e-07, "logits/chosen": -0.7165334224700928, "logits/rejected": -0.7261443734169006, "logps/chosen": -46.577484130859375, "logps/rejected": -55.48094177246094, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 1.6989914178848267, "rewards/margins": 0.5047409534454346, "rewards/rejected": 1.194250464439392, "step": 1906 }, { "epoch": 0.31, "learning_rate": 9.59747650759142e-07, "logits/chosen": -0.2862403392791748, "logits/rejected": -0.30125775933265686, "logps/chosen": -1.3655223846435547, "logps/rejected": -35.93319320678711, "loss": 0.5529, "rewards/accuracies": 1.0, "rewards/chosen": 0.2558419406414032, "rewards/margins": 0.0714600682258606, "rewards/rejected": 0.1843818724155426, "step": 1907 }, { "epoch": 0.31, "learning_rate": 9.596959714241968e-07, "logits/chosen": -0.19834327697753906, "logits/rejected": -0.16825039684772491, "logps/chosen": -106.95433044433594, "logps/rejected": -128.10421752929688, "loss": 0.4868, "rewards/accuracies": 1.0, "rewards/chosen": 1.1913292407989502, "rewards/margins": 0.6103203296661377, "rewards/rejected": 0.5810089111328125, "step": 1908 }, { "epoch": 0.31, "learning_rate": 9.596442603286097e-07, "logits/chosen": -0.37732192873954773, "logits/rejected": -0.4033614993095398, "logps/chosen": -175.00613403320312, "logps/rejected": -127.61515045166016, "loss": 0.7687, "rewards/accuracies": 0.0, "rewards/chosen": 3.1171233654022217, "rewards/margins": -0.9269554615020752, "rewards/rejected": 4.044078826904297, "step": 1909 }, { "epoch": 0.31, "learning_rate": 9.595925174759537e-07, "logits/chosen": -0.504828155040741, "logits/rejected": -0.4680788516998291, "logps/chosen": -81.35515594482422, "logps/rejected": -63.997459411621094, "loss": 1.2519, "rewards/accuracies": 0.0, "rewards/chosen": 1.749420166015625, "rewards/margins": -0.5278708934783936, "rewards/rejected": 2.2772910594940186, "step": 1910 }, { "epoch": 0.31, "learning_rate": 9.59540742869803e-07, "logits/chosen": -0.7422964572906494, "logits/rejected": -0.7623255252838135, "logps/chosen": -58.427921295166016, "logps/rejected": -63.097015380859375, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 1.297885537147522, "rewards/margins": 0.10414767265319824, "rewards/rejected": 1.1937378644943237, "step": 1911 }, { "epoch": 0.31, "learning_rate": 9.594889365137352e-07, "logits/chosen": -0.1524767279624939, "logits/rejected": -0.1505410075187683, "logps/chosen": -6.146256923675537, "logps/rejected": -3.8628504276275635, "loss": 0.4412, "rewards/accuracies": 0.0, "rewards/chosen": 0.23624996840953827, "rewards/margins": -0.18527905642986298, "rewards/rejected": 0.42152902483940125, "step": 1912 }, { "epoch": 0.31, "learning_rate": 9.5943709841133e-07, "logits/chosen": -0.3420032262802124, "logits/rejected": -0.3218640089035034, "logps/chosen": -37.998634338378906, "logps/rejected": -27.854318618774414, "loss": 0.61, "rewards/accuracies": 0.0, "rewards/chosen": -0.3696170747280121, "rewards/margins": -0.3935585021972656, "rewards/rejected": 0.023941421881318092, "step": 1913 }, { "epoch": 0.31, "learning_rate": 9.593852285661683e-07, "logits/chosen": -0.4608805179595947, "logits/rejected": -0.4683118462562561, "logps/chosen": -68.72827911376953, "logps/rejected": -199.05084228515625, "loss": 0.8268, "rewards/accuracies": 0.0, "rewards/chosen": 2.387322187423706, "rewards/margins": -0.6310708522796631, "rewards/rejected": 3.018393039703369, "step": 1914 }, { "epoch": 0.31, "learning_rate": 9.59333326981834e-07, "logits/chosen": -0.4531547427177429, "logits/rejected": -0.4258367419242859, "logps/chosen": -35.69667434692383, "logps/rejected": -79.07189178466797, "loss": 1.2928, "rewards/accuracies": 0.0, "rewards/chosen": 1.522691011428833, "rewards/margins": -0.8594355583190918, "rewards/rejected": 2.382126569747925, "step": 1915 }, { "epoch": 0.31, "learning_rate": 9.59281393661913e-07, "logits/chosen": -0.263700008392334, "logits/rejected": -0.1810101866722107, "logps/chosen": -44.6599006652832, "logps/rejected": -26.721403121948242, "loss": 0.5278, "rewards/accuracies": 1.0, "rewards/chosen": 1.3191403150558472, "rewards/margins": 1.1029974222183228, "rewards/rejected": 0.21614284813404083, "step": 1916 }, { "epoch": 0.31, "learning_rate": 9.592294286099938e-07, "logits/chosen": -0.3487570583820343, "logits/rejected": -0.4119241535663605, "logps/chosen": -64.17272186279297, "logps/rejected": -137.2548828125, "loss": 0.9996, "rewards/accuracies": 0.0, "rewards/chosen": 1.640313744544983, "rewards/margins": -1.6967681646347046, "rewards/rejected": 3.3370819091796875, "step": 1917 }, { "epoch": 0.31, "learning_rate": 9.59177431829666e-07, "logits/chosen": -0.8724730610847473, "logits/rejected": -0.7739624977111816, "logps/chosen": -52.54408264160156, "logps/rejected": -36.382991790771484, "loss": 0.282, "rewards/accuracies": 1.0, "rewards/chosen": 1.44133460521698, "rewards/margins": 1.3515820503234863, "rewards/rejected": 0.08975257724523544, "step": 1918 }, { "epoch": 0.31, "learning_rate": 9.591254033245227e-07, "logits/chosen": -0.554244339466095, "logits/rejected": -0.5249618291854858, "logps/chosen": -84.15984344482422, "logps/rejected": -98.72285461425781, "loss": 0.1892, "rewards/accuracies": 1.0, "rewards/chosen": 2.5451736450195312, "rewards/margins": 0.9795539379119873, "rewards/rejected": 1.565619707107544, "step": 1919 }, { "epoch": 0.31, "learning_rate": 9.590733430981582e-07, "logits/chosen": -0.28900784254074097, "logits/rejected": -0.18024636805057526, "logps/chosen": -149.88949584960938, "logps/rejected": -97.13410949707031, "loss": 1.4171, "rewards/accuracies": 1.0, "rewards/chosen": 4.318164348602295, "rewards/margins": 2.005906820297241, "rewards/rejected": 2.3122575283050537, "step": 1920 }, { "epoch": 0.31, "learning_rate": 9.590212511541693e-07, "logits/chosen": -0.2569423019886017, "logits/rejected": -0.2599388659000397, "logps/chosen": -5.4416327476501465, "logps/rejected": -6.6584153175354, "loss": 0.7199, "rewards/accuracies": 1.0, "rewards/chosen": 0.176275834441185, "rewards/margins": 0.16238099336624146, "rewards/rejected": 0.013894843868911266, "step": 1921 }, { "epoch": 0.31, "learning_rate": 9.589691274961555e-07, "logits/chosen": -0.34948328137397766, "logits/rejected": -0.18728423118591309, "logps/chosen": -73.23963928222656, "logps/rejected": -73.33065032958984, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 1.6657761335372925, "rewards/margins": -0.9542793035507202, "rewards/rejected": 2.6200554370880127, "step": 1922 }, { "epoch": 0.31, "learning_rate": 9.589169721277177e-07, "logits/chosen": -0.5955585241317749, "logits/rejected": -0.5524383783340454, "logps/chosen": -74.42598724365234, "logps/rejected": -124.07064056396484, "loss": 1.8262, "rewards/accuracies": 0.0, "rewards/chosen": 0.7129325866699219, "rewards/margins": -2.379742383956909, "rewards/rejected": 3.092674970626831, "step": 1923 }, { "epoch": 0.31, "learning_rate": 9.588647850524594e-07, "logits/chosen": -0.7040032744407654, "logits/rejected": -0.6090324521064758, "logps/chosen": -76.53290557861328, "logps/rejected": -14.8244047164917, "loss": 1.2833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8636177182197571, "rewards/margins": 0.11722421646118164, "rewards/rejected": 0.7463935017585754, "step": 1924 }, { "epoch": 0.31, "learning_rate": 9.588125662739862e-07, "logits/chosen": -0.48704037070274353, "logits/rejected": -0.47563111782073975, "logps/chosen": -53.100257873535156, "logps/rejected": -109.15304565429688, "loss": 0.4541, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764282941818237, "rewards/margins": 0.007093071937561035, "rewards/rejected": 1.0693352222442627, "step": 1925 }, { "epoch": 0.31, "learning_rate": 9.587603157959062e-07, "logits/chosen": -0.5362470746040344, "logits/rejected": -0.5630895495414734, "logps/chosen": -78.28551483154297, "logps/rejected": -101.70803833007812, "loss": 1.6555, "rewards/accuracies": 1.0, "rewards/chosen": 1.910408854484558, "rewards/margins": 1.3292855024337769, "rewards/rejected": 0.5811233520507812, "step": 1926 }, { "epoch": 0.31, "learning_rate": 9.587080336218292e-07, "logits/chosen": -0.6183530688285828, "logits/rejected": -0.6100847721099854, "logps/chosen": -60.71240234375, "logps/rejected": -102.44518280029297, "loss": 0.2318, "rewards/accuracies": 1.0, "rewards/chosen": 1.0809128284454346, "rewards/margins": 1.3688805103302002, "rewards/rejected": -0.2879676818847656, "step": 1927 }, { "epoch": 0.31, "learning_rate": 9.586557197553673e-07, "logits/chosen": -0.8408425450325012, "logits/rejected": -0.7798688411712646, "logps/chosen": -199.35531616210938, "logps/rejected": -35.522701263427734, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 0.6124786734580994, "rewards/margins": 0.3521648645401001, "rewards/rejected": 0.26031380891799927, "step": 1928 }, { "epoch": 0.31, "learning_rate": 9.58603374200135e-07, "logits/chosen": -0.11974028497934341, "logits/rejected": -0.11974028497934341, "logps/chosen": -4.280609130859375, "logps/rejected": -4.280609130859375, "loss": 0.9478, "rewards/accuracies": 0.0, "rewards/chosen": 0.03855771943926811, "rewards/margins": 0.0, "rewards/rejected": 0.03855771943926811, "step": 1929 }, { "epoch": 0.31, "learning_rate": 9.58550996959749e-07, "logits/chosen": -0.795540452003479, "logits/rejected": -0.7579590678215027, "logps/chosen": -115.69140625, "logps/rejected": -111.02631378173828, "loss": 1.4826, "rewards/accuracies": 0.0, "rewards/chosen": 0.408355712890625, "rewards/margins": -1.6562798023223877, "rewards/rejected": 2.0646355152130127, "step": 1930 }, { "epoch": 0.31, "learning_rate": 9.584985880378278e-07, "logits/chosen": -0.1700620949268341, "logits/rejected": -0.24287040531635284, "logps/chosen": -97.47439575195312, "logps/rejected": -54.610050201416016, "loss": 0.4643, "rewards/accuracies": 0.0, "rewards/chosen": 1.5777138471603394, "rewards/margins": -0.3225085735321045, "rewards/rejected": 1.9002224206924438, "step": 1931 }, { "epoch": 0.31, "learning_rate": 9.584461474379925e-07, "logits/chosen": -0.10183589160442352, "logits/rejected": -0.10183589160442352, "logps/chosen": -41.261627197265625, "logps/rejected": -41.261627197265625, "loss": 1.0884, "rewards/accuracies": 0.0, "rewards/chosen": 1.0504554510116577, "rewards/margins": 0.0, "rewards/rejected": 1.0504554510116577, "step": 1932 }, { "epoch": 0.31, "learning_rate": 9.583936751638666e-07, "logits/chosen": -0.5448720455169678, "logits/rejected": -0.5250089764595032, "logps/chosen": -147.12283325195312, "logps/rejected": -96.95176696777344, "loss": 0.7699, "rewards/accuracies": 1.0, "rewards/chosen": 2.645498752593994, "rewards/margins": 0.115997314453125, "rewards/rejected": 2.529501438140869, "step": 1933 }, { "epoch": 0.31, "learning_rate": 9.583411712190749e-07, "logits/chosen": -0.5204380750656128, "logits/rejected": -0.5428440570831299, "logps/chosen": -291.551025390625, "logps/rejected": -105.74983978271484, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 2.9511964321136475, "rewards/margins": 1.6908334493637085, "rewards/rejected": 1.260362982749939, "step": 1934 }, { "epoch": 0.31, "learning_rate": 9.58288635607245e-07, "logits/chosen": -0.23577378690242767, "logits/rejected": -0.2011273056268692, "logps/chosen": -61.553646087646484, "logps/rejected": -93.09249877929688, "loss": 0.8045, "rewards/accuracies": 0.0, "rewards/chosen": 1.3281300067901611, "rewards/margins": -1.174609661102295, "rewards/rejected": 2.502739667892456, "step": 1935 }, { "epoch": 0.31, "learning_rate": 9.582360683320068e-07, "logits/chosen": -0.5332710146903992, "logits/rejected": -0.43555471301078796, "logps/chosen": -92.96566772460938, "logps/rejected": -153.5332794189453, "loss": 1.0346, "rewards/accuracies": 0.0, "rewards/chosen": 1.7273170948028564, "rewards/margins": -1.3902168273925781, "rewards/rejected": 3.1175339221954346, "step": 1936 }, { "epoch": 0.31, "learning_rate": 9.581834693969923e-07, "logits/chosen": -0.2962997853755951, "logits/rejected": -0.3181843161582947, "logps/chosen": -128.41412353515625, "logps/rejected": -140.0158233642578, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9612320065498352, "rewards/margins": 0.9016219973564148, "rewards/rejected": 0.05960998684167862, "step": 1937 }, { "epoch": 0.31, "learning_rate": 9.581308388058354e-07, "logits/chosen": -0.5165637731552124, "logits/rejected": -0.5165637731552124, "logps/chosen": -29.84419822692871, "logps/rejected": -29.84419822692871, "loss": 1.2661, "rewards/accuracies": 0.0, "rewards/chosen": 0.39501437544822693, "rewards/margins": 0.0, "rewards/rejected": 0.39501437544822693, "step": 1938 }, { "epoch": 0.31, "learning_rate": 9.580781765621724e-07, "logits/chosen": -0.7589944005012512, "logits/rejected": -0.6256656050682068, "logps/chosen": -80.95274353027344, "logps/rejected": -85.05494689941406, "loss": 0.8839, "rewards/accuracies": 0.0, "rewards/chosen": 0.6227027773857117, "rewards/margins": -0.7561073899269104, "rewards/rejected": 1.378810167312622, "step": 1939 }, { "epoch": 0.31, "learning_rate": 9.580254826696417e-07, "logits/chosen": -0.5797024369239807, "logits/rejected": -0.5237851142883301, "logps/chosen": -51.78318405151367, "logps/rejected": -56.251136779785156, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 1.751169204711914, "rewards/margins": 0.06031453609466553, "rewards/rejected": 1.6908546686172485, "step": 1940 }, { "epoch": 0.32, "learning_rate": 9.57972757131884e-07, "logits/chosen": -0.46917256712913513, "logits/rejected": -0.3914475739002228, "logps/chosen": -72.98003387451172, "logps/rejected": -67.97330474853516, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 2.1491920948028564, "rewards/margins": 0.2067558765411377, "rewards/rejected": 1.9424362182617188, "step": 1941 }, { "epoch": 0.32, "learning_rate": 9.579199999525423e-07, "logits/chosen": -0.545120358467102, "logits/rejected": -0.5416971445083618, "logps/chosen": -20.98344612121582, "logps/rejected": -22.72818946838379, "loss": 0.9129, "rewards/accuracies": 1.0, "rewards/chosen": 0.11523780971765518, "rewards/margins": 0.14889660477638245, "rewards/rejected": -0.033658791333436966, "step": 1942 }, { "epoch": 0.32, "learning_rate": 9.578672111352613e-07, "logits/chosen": -0.5369117856025696, "logits/rejected": -0.46224889159202576, "logps/chosen": -83.15397644042969, "logps/rejected": -79.51002502441406, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 1.7895042896270752, "rewards/margins": 0.6442489624023438, "rewards/rejected": 1.1452553272247314, "step": 1943 }, { "epoch": 0.32, "learning_rate": 9.578143906836884e-07, "logits/chosen": -0.6774004697799683, "logits/rejected": -0.5771955251693726, "logps/chosen": -72.45106506347656, "logps/rejected": -44.18281936645508, "loss": 0.6569, "rewards/accuracies": 1.0, "rewards/chosen": 1.1533325910568237, "rewards/margins": 0.777834415435791, "rewards/rejected": 0.3754982054233551, "step": 1944 }, { "epoch": 0.32, "learning_rate": 9.577615386014732e-07, "logits/chosen": -0.5134309530258179, "logits/rejected": -0.4683777689933777, "logps/chosen": -39.9511604309082, "logps/rejected": -78.42033386230469, "loss": 0.8764, "rewards/accuracies": 1.0, "rewards/chosen": 1.6341781616210938, "rewards/margins": 0.5303809642791748, "rewards/rejected": 1.103797197341919, "step": 1945 }, { "epoch": 0.32, "learning_rate": 9.577086548922668e-07, "logits/chosen": -0.23536080121994019, "logits/rejected": -0.17889255285263062, "logps/chosen": -49.90726089477539, "logps/rejected": -14.680103302001953, "loss": 0.6109, "rewards/accuracies": 0.0, "rewards/chosen": 0.474957674741745, "rewards/margins": -0.3231521546840668, "rewards/rejected": 0.7981098294258118, "step": 1946 }, { "epoch": 0.32, "learning_rate": 9.576557395597236e-07, "logits/chosen": -0.6383376717567444, "logits/rejected": -0.6156271696090698, "logps/chosen": -69.08261108398438, "logps/rejected": -72.8558578491211, "loss": 0.5627, "rewards/accuracies": 0.0, "rewards/chosen": 0.8180397152900696, "rewards/margins": -0.2820731997489929, "rewards/rejected": 1.1001129150390625, "step": 1947 }, { "epoch": 0.32, "learning_rate": 9.57602792607499e-07, "logits/chosen": -0.2746785581111908, "logits/rejected": -0.30696383118629456, "logps/chosen": -50.89452362060547, "logps/rejected": -39.034393310546875, "loss": 0.6818, "rewards/accuracies": 0.0, "rewards/chosen": 0.5996391177177429, "rewards/margins": -0.09335100650787354, "rewards/rejected": 0.6929901242256165, "step": 1948 }, { "epoch": 0.32, "learning_rate": 9.57549814039251e-07, "logits/chosen": -0.7661125063896179, "logits/rejected": -0.7653001546859741, "logps/chosen": -317.69659423828125, "logps/rejected": -149.3045654296875, "loss": 0.8464, "rewards/accuracies": 0.0, "rewards/chosen": 3.13372802734375, "rewards/margins": -0.6955687999725342, "rewards/rejected": 3.829296827316284, "step": 1949 }, { "epoch": 0.32, "learning_rate": 9.574968038586406e-07, "logits/chosen": -0.5645548105239868, "logits/rejected": -0.5819300413131714, "logps/chosen": -59.90195083618164, "logps/rejected": -51.635459899902344, "loss": 0.2712, "rewards/accuracies": 1.0, "rewards/chosen": 1.7480480670928955, "rewards/margins": 0.38957107067108154, "rewards/rejected": 1.358476996421814, "step": 1950 }, { "epoch": 0.32, "learning_rate": 9.5744376206933e-07, "logits/chosen": -0.0009096321882680058, "logits/rejected": -0.0009818682447075844, "logps/chosen": -21.57933807373047, "logps/rejected": -61.17034149169922, "loss": 0.495, "rewards/accuracies": 0.0, "rewards/chosen": 0.2054603546857834, "rewards/margins": -0.1318313628435135, "rewards/rejected": 0.3372917175292969, "step": 1951 }, { "epoch": 0.32, "learning_rate": 9.573906886749834e-07, "logits/chosen": -0.7689287066459656, "logits/rejected": -0.6163550019264221, "logps/chosen": -111.23126220703125, "logps/rejected": -209.86602783203125, "loss": 1.2378, "rewards/accuracies": 0.0, "rewards/chosen": 3.2977845668792725, "rewards/margins": -0.011981010437011719, "rewards/rejected": 3.309765577316284, "step": 1952 }, { "epoch": 0.32, "learning_rate": 9.573375836792683e-07, "logits/chosen": -0.6421437859535217, "logits/rejected": -0.5347292423248291, "logps/chosen": -141.68667602539062, "logps/rejected": -75.42994689941406, "loss": 0.4105, "rewards/accuracies": 1.0, "rewards/chosen": 3.142712354660034, "rewards/margins": 0.6299560070037842, "rewards/rejected": 2.51275634765625, "step": 1953 }, { "epoch": 0.32, "learning_rate": 9.572844470858537e-07, "logits/chosen": -0.3468853235244751, "logits/rejected": -0.24687297642230988, "logps/chosen": -102.50786590576172, "logps/rejected": -31.43292999267578, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 1.4114822149276733, "rewards/margins": 1.3588107824325562, "rewards/rejected": 0.05267143249511719, "step": 1954 }, { "epoch": 0.32, "learning_rate": 9.572312788984105e-07, "logits/chosen": -0.34170934557914734, "logits/rejected": -0.2875176668167114, "logps/chosen": -139.2829132080078, "logps/rejected": -70.97877502441406, "loss": 0.7461, "rewards/accuracies": 0.0, "rewards/chosen": 0.08902740478515625, "rewards/margins": -0.7590522766113281, "rewards/rejected": 0.8480796813964844, "step": 1955 }, { "epoch": 0.32, "learning_rate": 9.571780791206121e-07, "logits/chosen": -0.7831887602806091, "logits/rejected": -0.6324914693832397, "logps/chosen": -210.04666137695312, "logps/rejected": -104.10382080078125, "loss": 1.141, "rewards/accuracies": 0.0, "rewards/chosen": 0.19333802163600922, "rewards/margins": -1.5549720525741577, "rewards/rejected": 1.7483100891113281, "step": 1956 }, { "epoch": 0.32, "learning_rate": 9.571248477561346e-07, "logits/chosen": -0.6835006475448608, "logits/rejected": -0.6840420365333557, "logps/chosen": -69.9583740234375, "logps/rejected": -127.14080810546875, "loss": 0.7694, "rewards/accuracies": 0.0, "rewards/chosen": 0.6166473627090454, "rewards/margins": -1.2199004888534546, "rewards/rejected": 1.8365478515625, "step": 1957 }, { "epoch": 0.32, "learning_rate": 9.570715848086555e-07, "logits/chosen": -0.8281421065330505, "logits/rejected": -0.8575924038887024, "logps/chosen": -209.03953552246094, "logps/rejected": -64.116943359375, "loss": 0.2529, "rewards/accuracies": 1.0, "rewards/chosen": 3.2268693447113037, "rewards/margins": 1.166203498840332, "rewards/rejected": 2.0606658458709717, "step": 1958 }, { "epoch": 0.32, "learning_rate": 9.570182902818544e-07, "logits/chosen": -0.3861093819141388, "logits/rejected": -0.30052536725997925, "logps/chosen": -84.78816223144531, "logps/rejected": -56.08661651611328, "loss": 1.2039, "rewards/accuracies": 1.0, "rewards/chosen": 1.7379013299942017, "rewards/margins": 1.0117645263671875, "rewards/rejected": 0.7261368036270142, "step": 1959 }, { "epoch": 0.32, "learning_rate": 9.569649641794141e-07, "logits/chosen": -0.3451656997203827, "logits/rejected": -0.18101097643375397, "logps/chosen": -89.05012512207031, "logps/rejected": -29.38842010498047, "loss": 0.7848, "rewards/accuracies": 0.0, "rewards/chosen": -0.038238525390625, "rewards/margins": -0.03308296203613281, "rewards/rejected": -0.0051555633544921875, "step": 1960 }, { "epoch": 0.32, "learning_rate": 9.569116065050185e-07, "logits/chosen": -0.4695764183998108, "logits/rejected": -0.47090646624565125, "logps/chosen": -148.24044799804688, "logps/rejected": -82.80245971679688, "loss": 0.5784, "rewards/accuracies": 0.0, "rewards/chosen": 2.929539442062378, "rewards/margins": -0.3524765968322754, "rewards/rejected": 3.2820160388946533, "step": 1961 }, { "epoch": 0.32, "learning_rate": 9.568582172623543e-07, "logits/chosen": -0.5769939422607422, "logits/rejected": -0.6199286580085754, "logps/chosen": -120.28643798828125, "logps/rejected": -90.49524688720703, "loss": 0.1414, "rewards/accuracies": 1.0, "rewards/chosen": 3.8071229457855225, "rewards/margins": 2.244098663330078, "rewards/rejected": 1.5630241632461548, "step": 1962 }, { "epoch": 0.32, "learning_rate": 9.5680479645511e-07, "logits/chosen": -0.45335355401039124, "logits/rejected": -0.4573650658130646, "logps/chosen": -155.28680419921875, "logps/rejected": -140.98965454101562, "loss": 1.1812, "rewards/accuracies": 1.0, "rewards/chosen": 1.3321853876113892, "rewards/margins": 0.6273483633995056, "rewards/rejected": 0.7048370242118835, "step": 1963 }, { "epoch": 0.32, "learning_rate": 9.567513440869767e-07, "logits/chosen": -0.46180349588394165, "logits/rejected": -0.42661425471305847, "logps/chosen": -61.618675231933594, "logps/rejected": -98.60188293457031, "loss": 0.864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0991172790527344, "rewards/margins": 1.252428412437439, "rewards/rejected": -0.15331116318702698, "step": 1964 }, { "epoch": 0.32, "learning_rate": 9.566978601616473e-07, "logits/chosen": -0.41008925437927246, "logits/rejected": -0.3686230480670929, "logps/chosen": -49.4334602355957, "logps/rejected": -45.7982177734375, "loss": 0.5027, "rewards/accuracies": 1.0, "rewards/chosen": 1.6037571430206299, "rewards/margins": 0.17027699947357178, "rewards/rejected": 1.433480143547058, "step": 1965 }, { "epoch": 0.32, "learning_rate": 9.56644344682817e-07, "logits/chosen": -0.44881707429885864, "logits/rejected": -0.13618594408035278, "logps/chosen": -166.03530883789062, "logps/rejected": -133.05587768554688, "loss": 0.4247, "rewards/accuracies": 0.0, "rewards/chosen": 2.578631639480591, "rewards/margins": -0.2669205665588379, "rewards/rejected": 2.8455522060394287, "step": 1966 }, { "epoch": 0.32, "learning_rate": 9.565907976541833e-07, "logits/chosen": -0.25455689430236816, "logits/rejected": -0.12895403802394867, "logps/chosen": -62.68065643310547, "logps/rejected": -54.04391860961914, "loss": 1.3153, "rewards/accuracies": 1.0, "rewards/chosen": 1.5630035400390625, "rewards/margins": 0.06297183036804199, "rewards/rejected": 1.5000317096710205, "step": 1967 }, { "epoch": 0.32, "learning_rate": 9.56537219079446e-07, "logits/chosen": -0.19383835792541504, "logits/rejected": -0.2176615446805954, "logps/chosen": -79.52656555175781, "logps/rejected": -73.4379653930664, "loss": 0.3678, "rewards/accuracies": 1.0, "rewards/chosen": 1.4915611743927002, "rewards/margins": 0.7972473502159119, "rewards/rejected": 0.6943138241767883, "step": 1968 }, { "epoch": 0.32, "learning_rate": 9.564836089623064e-07, "logits/chosen": -0.5502282977104187, "logits/rejected": -0.5368875861167908, "logps/chosen": -36.18980407714844, "logps/rejected": -37.10671615600586, "loss": 0.8274, "rewards/accuracies": 0.0, "rewards/chosen": 0.1897563934326172, "rewards/margins": -0.211285799741745, "rewards/rejected": 0.4010421931743622, "step": 1969 }, { "epoch": 0.32, "learning_rate": 9.56429967306469e-07, "logits/chosen": -0.5450348854064941, "logits/rejected": -0.5094304084777832, "logps/chosen": -147.09060668945312, "logps/rejected": -103.19861602783203, "loss": 0.5967, "rewards/accuracies": 0.0, "rewards/chosen": 0.7933487296104431, "rewards/margins": -0.6849426627159119, "rewards/rejected": 1.478291392326355, "step": 1970 }, { "epoch": 0.32, "learning_rate": 9.563762941156395e-07, "logits/chosen": -0.7914460301399231, "logits/rejected": -0.7819869518280029, "logps/chosen": -119.82237243652344, "logps/rejected": -230.2091522216797, "loss": 0.976, "rewards/accuracies": 0.0, "rewards/chosen": 0.6639084219932556, "rewards/margins": -1.753343105316162, "rewards/rejected": 2.4172515869140625, "step": 1971 }, { "epoch": 0.32, "learning_rate": 9.563225893935264e-07, "logits/chosen": -0.2660697102546692, "logits/rejected": -0.2547133266925812, "logps/chosen": -19.809326171875, "logps/rejected": -9.378641128540039, "loss": 1.0217, "rewards/accuracies": 1.0, "rewards/chosen": 0.5352191925048828, "rewards/margins": 0.40613746643066406, "rewards/rejected": 0.12908172607421875, "step": 1972 }, { "epoch": 0.32, "learning_rate": 9.562688531438398e-07, "logits/chosen": -0.3342893123626709, "logits/rejected": -0.3984718918800354, "logps/chosen": -79.99201965332031, "logps/rejected": -101.60220336914062, "loss": 0.5671, "rewards/accuracies": 0.0, "rewards/chosen": 0.20983505249023438, "rewards/margins": -0.5297042727470398, "rewards/rejected": 0.7395393252372742, "step": 1973 }, { "epoch": 0.32, "learning_rate": 9.56215085370293e-07, "logits/chosen": -0.8788049221038818, "logits/rejected": -0.8291428685188293, "logps/chosen": -123.33438110351562, "logps/rejected": -118.84677124023438, "loss": 1.2289, "rewards/accuracies": 0.0, "rewards/chosen": 0.9805435538291931, "rewards/margins": -2.1157639026641846, "rewards/rejected": 3.0963075160980225, "step": 1974 }, { "epoch": 0.32, "learning_rate": 9.561612860766006e-07, "logits/chosen": -0.7668665647506714, "logits/rejected": -0.7668665647506714, "logps/chosen": -103.67782592773438, "logps/rejected": -103.67782592773438, "loss": 0.6223, "rewards/accuracies": 0.0, "rewards/chosen": 1.657112956047058, "rewards/margins": 0.0, "rewards/rejected": 1.657112956047058, "step": 1975 }, { "epoch": 0.32, "learning_rate": 9.561074552664794e-07, "logits/chosen": -0.5547834634780884, "logits/rejected": -0.5043244957923889, "logps/chosen": -47.437870025634766, "logps/rejected": -72.19793701171875, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 1.7498668432235718, "rewards/margins": 0.7078959941864014, "rewards/rejected": 1.0419708490371704, "step": 1976 }, { "epoch": 0.32, "learning_rate": 9.560535929436487e-07, "logits/chosen": -0.004289090167731047, "logits/rejected": -0.0010679024271667004, "logps/chosen": -4.105475425720215, "logps/rejected": -3.7631752490997314, "loss": 1.6023, "rewards/accuracies": 0.0, "rewards/chosen": 0.41643354296684265, "rewards/margins": -0.10784748196601868, "rewards/rejected": 0.5242810249328613, "step": 1977 }, { "epoch": 0.32, "learning_rate": 9.559996991118303e-07, "logits/chosen": -0.42260944843292236, "logits/rejected": -0.3593675494194031, "logps/chosen": -75.85577392578125, "logps/rejected": -55.33977127075195, "loss": 0.7246, "rewards/accuracies": 1.0, "rewards/chosen": 1.2792656421661377, "rewards/margins": 0.23818552494049072, "rewards/rejected": 1.041080117225647, "step": 1978 }, { "epoch": 0.32, "learning_rate": 9.55945773774747e-07, "logits/chosen": -0.6401126384735107, "logits/rejected": -0.488337904214859, "logps/chosen": -207.95553588867188, "logps/rejected": -55.997806549072266, "loss": 0.5745, "rewards/accuracies": 0.0, "rewards/chosen": 1.0896027088165283, "rewards/margins": -0.294345498085022, "rewards/rejected": 1.3839482069015503, "step": 1979 }, { "epoch": 0.32, "learning_rate": 9.558918169361251e-07, "logits/chosen": -0.444324791431427, "logits/rejected": -0.3741748034954071, "logps/chosen": -120.65278625488281, "logps/rejected": -159.77291870117188, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 3.270982503890991, "rewards/margins": 0.3906693458557129, "rewards/rejected": 2.8803131580352783, "step": 1980 }, { "epoch": 0.32, "learning_rate": 9.558378285996924e-07, "logits/chosen": -0.5040221810340881, "logits/rejected": -0.3823948800563812, "logps/chosen": -113.90371704101562, "logps/rejected": -85.95677185058594, "loss": 0.3766, "rewards/accuracies": 1.0, "rewards/chosen": 0.9700302481651306, "rewards/margins": 0.34212726354599, "rewards/rejected": 0.6279029846191406, "step": 1981 }, { "epoch": 0.32, "learning_rate": 9.55783808769179e-07, "logits/chosen": -0.2847299575805664, "logits/rejected": -0.2894439995288849, "logps/chosen": -9.911155700683594, "logps/rejected": -13.189963340759277, "loss": 0.9984, "rewards/accuracies": 1.0, "rewards/chosen": 0.09483232349157333, "rewards/margins": 0.08210992813110352, "rewards/rejected": 0.012722397223114967, "step": 1982 }, { "epoch": 0.32, "learning_rate": 9.55729757448317e-07, "logits/chosen": -0.5087144374847412, "logits/rejected": -0.38650423288345337, "logps/chosen": -99.15876770019531, "logps/rejected": -61.73227310180664, "loss": 0.4833, "rewards/accuracies": 1.0, "rewards/chosen": 2.5532326698303223, "rewards/margins": 1.3447315692901611, "rewards/rejected": 1.2085011005401611, "step": 1983 }, { "epoch": 0.32, "learning_rate": 9.556756746408409e-07, "logits/chosen": -0.6973614692687988, "logits/rejected": -0.6661790013313293, "logps/chosen": -101.63793182373047, "logps/rejected": -81.78185272216797, "loss": 0.9969, "rewards/accuracies": 0.0, "rewards/chosen": -0.0004341125604696572, "rewards/margins": -1.1421234607696533, "rewards/rejected": 1.1416893005371094, "step": 1984 }, { "epoch": 0.32, "learning_rate": 9.556215603504873e-07, "logits/chosen": -0.3101425766944885, "logits/rejected": -0.24752558767795563, "logps/chosen": -37.65397644042969, "logps/rejected": -12.50006103515625, "loss": 1.0524, "rewards/accuracies": 1.0, "rewards/chosen": 1.2175769805908203, "rewards/margins": 0.2912864685058594, "rewards/rejected": 0.9262905120849609, "step": 1985 }, { "epoch": 0.32, "learning_rate": 9.55567414580995e-07, "logits/chosen": -0.7503710389137268, "logits/rejected": -0.7518116235733032, "logps/chosen": -101.27926635742188, "logps/rejected": -90.43296813964844, "loss": 2.0339, "rewards/accuracies": 1.0, "rewards/chosen": 2.5460357666015625, "rewards/margins": 0.5187027454376221, "rewards/rejected": 2.0273330211639404, "step": 1986 }, { "epoch": 0.32, "learning_rate": 9.55513237336105e-07, "logits/chosen": -0.3697892129421234, "logits/rejected": -0.3509412705898285, "logps/chosen": -152.16256713867188, "logps/rejected": -96.14710998535156, "loss": 0.3295, "rewards/accuracies": 1.0, "rewards/chosen": 2.410687208175659, "rewards/margins": 1.4705450534820557, "rewards/rejected": 0.9401420950889587, "step": 1987 }, { "epoch": 0.32, "learning_rate": 9.554590286195604e-07, "logits/chosen": -0.7589337229728699, "logits/rejected": -0.7559970021247864, "logps/chosen": -99.37158203125, "logps/rejected": -70.56869506835938, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 3.244036912918091, "rewards/margins": 1.8437042236328125, "rewards/rejected": 1.4003326892852783, "step": 1988 }, { "epoch": 0.32, "learning_rate": 9.554047884351064e-07, "logits/chosen": -1.019852876663208, "logits/rejected": -0.9336857199668884, "logps/chosen": -70.9728012084961, "logps/rejected": -66.3422622680664, "loss": 0.2822, "rewards/accuracies": 1.0, "rewards/chosen": 2.640432119369507, "rewards/margins": 0.81748366355896, "rewards/rejected": 1.8229484558105469, "step": 1989 }, { "epoch": 0.32, "learning_rate": 9.55350516786491e-07, "logits/chosen": -0.09749995172023773, "logits/rejected": -0.08274739980697632, "logps/chosen": -29.6508731842041, "logps/rejected": -2.3336141109466553, "loss": 0.4534, "rewards/accuracies": 0.0, "rewards/chosen": -0.02992858923971653, "rewards/margins": -0.3393907845020294, "rewards/rejected": 0.30946218967437744, "step": 1990 }, { "epoch": 0.32, "learning_rate": 9.55296213677463e-07, "logits/chosen": -0.6435051560401917, "logits/rejected": -0.5857639908790588, "logps/chosen": -94.23992919921875, "logps/rejected": -27.358436584472656, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 3.855581760406494, "rewards/margins": 3.283007860183716, "rewards/rejected": 0.5725738406181335, "step": 1991 }, { "epoch": 0.32, "learning_rate": 9.552418791117746e-07, "logits/chosen": -0.6813923120498657, "logits/rejected": -0.6029241681098938, "logps/chosen": -83.8589096069336, "logps/rejected": -100.51930236816406, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 2.076712131500244, "rewards/margins": 0.5008606910705566, "rewards/rejected": 1.5758514404296875, "step": 1992 }, { "epoch": 0.32, "learning_rate": 9.551875130931803e-07, "logits/chosen": -0.5556973814964294, "logits/rejected": -0.5167815089225769, "logps/chosen": -77.06948852539062, "logps/rejected": -22.030662536621094, "loss": 0.4693, "rewards/accuracies": 1.0, "rewards/chosen": 0.9450035095214844, "rewards/margins": 0.5490930080413818, "rewards/rejected": 0.39591047167778015, "step": 1993 }, { "epoch": 0.32, "learning_rate": 9.551331156254357e-07, "logits/chosen": -0.5534830093383789, "logits/rejected": -0.4486576020717621, "logps/chosen": -93.03843688964844, "logps/rejected": -79.16690063476562, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 3.2781968116760254, "rewards/margins": 0.6260216236114502, "rewards/rejected": 2.652175188064575, "step": 1994 }, { "epoch": 0.32, "learning_rate": 9.550786867122994e-07, "logits/chosen": -0.2049064338207245, "logits/rejected": -0.2049064338207245, "logps/chosen": -37.11834716796875, "logps/rejected": -37.11834716796875, "loss": 0.8032, "rewards/accuracies": 0.0, "rewards/chosen": 0.037444304674863815, "rewards/margins": 0.0, "rewards/rejected": 0.037444304674863815, "step": 1995 }, { "epoch": 0.32, "learning_rate": 9.550242263575317e-07, "logits/chosen": -0.4918075203895569, "logits/rejected": -0.4279654920101166, "logps/chosen": -78.62486267089844, "logps/rejected": -119.08277893066406, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 1.097406029701233, "rewards/margins": 0.4237152338027954, "rewards/rejected": 0.6736907958984375, "step": 1996 }, { "epoch": 0.32, "learning_rate": 9.549697345648955e-07, "logits/chosen": -0.49196556210517883, "logits/rejected": -0.3004618287086487, "logps/chosen": -75.36092376708984, "logps/rejected": -16.950359344482422, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 2.839963674545288, "rewards/margins": 2.537376642227173, "rewards/rejected": 0.3025871217250824, "step": 1997 }, { "epoch": 0.32, "learning_rate": 9.549152113381556e-07, "logits/chosen": -0.5055475234985352, "logits/rejected": -0.4849903881549835, "logps/chosen": -74.05940246582031, "logps/rejected": -76.8971176147461, "loss": 0.5416, "rewards/accuracies": 0.0, "rewards/chosen": 0.33783799409866333, "rewards/margins": -0.08069610595703125, "rewards/rejected": 0.4185341000556946, "step": 1998 }, { "epoch": 0.32, "learning_rate": 9.54860656681079e-07, "logits/chosen": -0.5478367805480957, "logits/rejected": -0.5180079340934753, "logps/chosen": -71.20845031738281, "logps/rejected": -92.64236450195312, "loss": 0.379, "rewards/accuracies": 1.0, "rewards/chosen": 2.3263916969299316, "rewards/margins": 0.7625527381896973, "rewards/rejected": 1.5638389587402344, "step": 1999 }, { "epoch": 0.32, "learning_rate": 9.548060705974352e-07, "logits/chosen": -0.7122210264205933, "logits/rejected": -0.731631875038147, "logps/chosen": -101.89585876464844, "logps/rejected": -111.76065063476562, "loss": 0.8057, "rewards/accuracies": 0.0, "rewards/chosen": 0.6156166195869446, "rewards/margins": -0.15405428409576416, "rewards/rejected": 0.7696709036827087, "step": 2000 }, { "epoch": 0.32, "learning_rate": 2.702702702702703e-09, "logits/chosen": -0.37260928750038147, "logits/rejected": -0.2882232069969177, "logps/chosen": -36.73725509643555, "logps/rejected": -103.63104248046875, "loss": 1.7121, "rewards/accuracies": 0.0, "rewards/chosen": 1.5490047931671143, "rewards/margins": -0.9067814350128174, "rewards/rejected": 2.4557862281799316, "step": 2001 }, { "epoch": 0.32, "learning_rate": 5.405405405405406e-09, "logits/chosen": -0.7094069719314575, "logits/rejected": -0.7125234603881836, "logps/chosen": -67.0871810913086, "logps/rejected": -35.51721954345703, "loss": 0.9072, "rewards/accuracies": 0.0, "rewards/chosen": 0.42542725801467896, "rewards/margins": -1.1791884899139404, "rewards/rejected": 1.6046158075332642, "step": 2002 }, { "epoch": 0.33, "learning_rate": 8.108108108108109e-09, "logits/chosen": -0.6759322285652161, "logits/rejected": -0.6730693578720093, "logps/chosen": -57.28179931640625, "logps/rejected": -77.80209350585938, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/chosen": 1.9241119623184204, "rewards/margins": 0.8248733282089233, "rewards/rejected": 1.099238634109497, "step": 2003 }, { "epoch": 0.33, "learning_rate": 1.0810810810810811e-08, "logits/chosen": -0.8054463863372803, "logits/rejected": -0.8054463863372803, "logps/chosen": -115.41925048828125, "logps/rejected": -115.41925048828125, "loss": 0.396, "rewards/accuracies": 0.0, "rewards/chosen": 0.3708763122558594, "rewards/margins": 0.0, "rewards/rejected": 0.3708763122558594, "step": 2004 }, { "epoch": 0.33, "learning_rate": 1.3513513513513514e-08, "logits/chosen": -0.6950713396072388, "logits/rejected": -0.6834873557090759, "logps/chosen": -82.674072265625, "logps/rejected": -63.560794830322266, "loss": 0.6838, "rewards/accuracies": 0.0, "rewards/chosen": 0.8899459838867188, "rewards/margins": -0.3748089075088501, "rewards/rejected": 1.2647548913955688, "step": 2005 }, { "epoch": 0.33, "learning_rate": 1.6216216216216218e-08, "logits/chosen": -0.5972280502319336, "logits/rejected": -0.6371584534645081, "logps/chosen": -114.12921905517578, "logps/rejected": -53.3710823059082, "loss": 0.5734, "rewards/accuracies": 1.0, "rewards/chosen": 2.72819447517395, "rewards/margins": 1.164744257926941, "rewards/rejected": 1.5634502172470093, "step": 2006 }, { "epoch": 0.33, "learning_rate": 1.891891891891892e-08, "logits/chosen": -0.37022316455841064, "logits/rejected": -0.36667582392692566, "logps/chosen": -19.8411808013916, "logps/rejected": -2.508099317550659, "loss": 0.3981, "rewards/accuracies": 0.0, "rewards/chosen": 0.22807864844799042, "rewards/margins": -0.07305987179279327, "rewards/rejected": 0.3011385202407837, "step": 2007 }, { "epoch": 0.33, "learning_rate": 2.1621621621621623e-08, "logits/chosen": -0.3946094214916229, "logits/rejected": -0.22746729850769043, "logps/chosen": -48.47248840332031, "logps/rejected": -87.40121459960938, "loss": 0.5098, "rewards/accuracies": 0.0, "rewards/chosen": 1.3265243768692017, "rewards/margins": -0.07172083854675293, "rewards/rejected": 1.3982452154159546, "step": 2008 }, { "epoch": 0.33, "learning_rate": 2.4324324324324324e-08, "logits/chosen": -0.25021398067474365, "logits/rejected": -0.30276304483413696, "logps/chosen": -79.98393249511719, "logps/rejected": -85.50216674804688, "loss": 0.7832, "rewards/accuracies": 0.0, "rewards/chosen": 1.3769546747207642, "rewards/margins": -0.22752606868743896, "rewards/rejected": 1.6044807434082031, "step": 2009 }, { "epoch": 0.33, "learning_rate": 2.7027027027027028e-08, "logits/chosen": -0.5523815751075745, "logits/rejected": -0.5333122611045837, "logps/chosen": -89.28509521484375, "logps/rejected": -162.7510528564453, "loss": 0.4821, "rewards/accuracies": 0.0, "rewards/chosen": 0.8526626825332642, "rewards/margins": -0.058917224407196045, "rewards/rejected": 0.9115799069404602, "step": 2010 }, { "epoch": 0.33, "learning_rate": 2.972972972972973e-08, "logits/chosen": -0.7025288343429565, "logits/rejected": -0.5926586389541626, "logps/chosen": -153.06866455078125, "logps/rejected": -15.425583839416504, "loss": 0.2579, "rewards/accuracies": 1.0, "rewards/chosen": 1.4811416864395142, "rewards/margins": 1.1786147356033325, "rewards/rejected": 0.30252695083618164, "step": 2011 }, { "epoch": 0.33, "learning_rate": 3.2432432432432436e-08, "logits/chosen": -0.5131896138191223, "logits/rejected": -0.44030433893203735, "logps/chosen": -119.34649658203125, "logps/rejected": -69.509521484375, "loss": 0.5064, "rewards/accuracies": 0.0, "rewards/chosen": 1.1795166730880737, "rewards/margins": -0.025347113609313965, "rewards/rejected": 1.2048637866973877, "step": 2012 }, { "epoch": 0.33, "learning_rate": 3.513513513513514e-08, "logits/chosen": -0.7397844791412354, "logits/rejected": -0.7424653768539429, "logps/chosen": -103.63862609863281, "logps/rejected": -97.35877990722656, "loss": 1.1454, "rewards/accuracies": 0.0, "rewards/chosen": 0.7280876040458679, "rewards/margins": -1.4605293273925781, "rewards/rejected": 2.188616991043091, "step": 2013 }, { "epoch": 0.33, "learning_rate": 3.783783783783784e-08, "logits/chosen": -0.40043455362319946, "logits/rejected": -0.40043455362319946, "logps/chosen": -1.1359349489212036, "logps/rejected": -1.1359349489212036, "loss": 0.5011, "rewards/accuracies": 0.0, "rewards/chosen": 0.10739845037460327, "rewards/margins": 0.0, "rewards/rejected": 0.10739845037460327, "step": 2014 }, { "epoch": 0.33, "learning_rate": 4.054054054054054e-08, "logits/chosen": -0.4081251621246338, "logits/rejected": -0.39436808228492737, "logps/chosen": -73.46331787109375, "logps/rejected": -45.995750427246094, "loss": 1.4136, "rewards/accuracies": 1.0, "rewards/chosen": 1.3189048767089844, "rewards/margins": 0.018113255500793457, "rewards/rejected": 1.300791621208191, "step": 2015 }, { "epoch": 0.33, "learning_rate": 4.3243243243243246e-08, "logits/chosen": -0.7945809364318848, "logits/rejected": -0.7871768474578857, "logps/chosen": -131.40570068359375, "logps/rejected": -84.26326751708984, "loss": 0.84, "rewards/accuracies": 1.0, "rewards/chosen": 1.215632677078247, "rewards/margins": 0.915442705154419, "rewards/rejected": 0.3001899719238281, "step": 2016 }, { "epoch": 0.33, "learning_rate": 4.5945945945945947e-08, "logits/chosen": -0.6670001745223999, "logits/rejected": -0.5951669812202454, "logps/chosen": -94.57952880859375, "logps/rejected": -62.87716293334961, "loss": 0.3999, "rewards/accuracies": 0.0, "rewards/chosen": 2.054852247238159, "rewards/margins": -0.03487896919250488, "rewards/rejected": 2.089731216430664, "step": 2017 }, { "epoch": 0.33, "learning_rate": 4.864864864864865e-08, "logits/chosen": -0.06650763005018234, "logits/rejected": -0.06794086843729019, "logps/chosen": -2.2232093811035156, "logps/rejected": -2.876570224761963, "loss": 0.8492, "rewards/accuracies": 0.0, "rewards/chosen": 0.26005059480667114, "rewards/margins": -0.04124131798744202, "rewards/rejected": 0.30129191279411316, "step": 2018 }, { "epoch": 0.33, "learning_rate": 5.1351351351351355e-08, "logits/chosen": -0.2556467354297638, "logits/rejected": -0.18863673508167267, "logps/chosen": -53.183372497558594, "logps/rejected": -24.30801773071289, "loss": 0.1866, "rewards/accuracies": 1.0, "rewards/chosen": 1.9606682062149048, "rewards/margins": 1.0991692543029785, "rewards/rejected": 0.861499011516571, "step": 2019 }, { "epoch": 0.33, "learning_rate": 5.4054054054054056e-08, "logits/chosen": -0.6161839365959167, "logits/rejected": -0.6282938718795776, "logps/chosen": -87.26971435546875, "logps/rejected": -18.49227523803711, "loss": 1.1663, "rewards/accuracies": 0.0, "rewards/chosen": 0.7557998895645142, "rewards/margins": -0.14610344171524048, "rewards/rejected": 0.9019033312797546, "step": 2020 }, { "epoch": 0.33, "learning_rate": 5.6756756756756756e-08, "logits/chosen": -0.6695593595504761, "logits/rejected": -0.5685866475105286, "logps/chosen": -104.62957763671875, "logps/rejected": -21.253690719604492, "loss": 1.911, "rewards/accuracies": 1.0, "rewards/chosen": 2.410409688949585, "rewards/margins": 2.09971022605896, "rewards/rejected": 0.310699462890625, "step": 2021 }, { "epoch": 0.33, "learning_rate": 5.945945945945946e-08, "logits/chosen": 0.16892708837985992, "logits/rejected": 0.16892708837985992, "logps/chosen": -51.8961181640625, "logps/rejected": -51.8961181640625, "loss": 1.1946, "rewards/accuracies": 0.0, "rewards/chosen": -0.09526138752698898, "rewards/margins": 0.0, "rewards/rejected": -0.09526138752698898, "step": 2022 }, { "epoch": 0.33, "learning_rate": 6.216216216216216e-08, "logits/chosen": -0.7109383344650269, "logits/rejected": -0.6878337264060974, "logps/chosen": -101.70941162109375, "logps/rejected": -53.04537582397461, "loss": 0.8434, "rewards/accuracies": 0.0, "rewards/chosen": -0.1164451614022255, "rewards/margins": -1.0649417638778687, "rewards/rejected": 0.9484966397285461, "step": 2023 }, { "epoch": 0.33, "learning_rate": 6.486486486486487e-08, "logits/chosen": -0.40386831760406494, "logits/rejected": -0.3978206217288971, "logps/chosen": -55.7398681640625, "logps/rejected": -79.21705627441406, "loss": 0.719, "rewards/accuracies": 0.0, "rewards/chosen": 1.8695114850997925, "rewards/margins": -0.1705695390701294, "rewards/rejected": 2.040081024169922, "step": 2024 }, { "epoch": 0.33, "learning_rate": 6.756756756756757e-08, "logits/chosen": -0.4841715097427368, "logits/rejected": -0.4714045524597168, "logps/chosen": -72.4833755493164, "logps/rejected": -163.6463623046875, "loss": 1.2448, "rewards/accuracies": 0.0, "rewards/chosen": 1.9103096723556519, "rewards/margins": -1.835492730140686, "rewards/rejected": 3.745802402496338, "step": 2025 }, { "epoch": 0.33, "learning_rate": 7.027027027027027e-08, "logits/chosen": -0.745278000831604, "logits/rejected": -0.6173157691955566, "logps/chosen": -156.12362670898438, "logps/rejected": -142.42730712890625, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 2.9111602306365967, "rewards/margins": 2.9470458030700684, "rewards/rejected": -0.03588562086224556, "step": 2026 }, { "epoch": 0.33, "learning_rate": 7.297297297297297e-08, "logits/chosen": -0.5357784628868103, "logits/rejected": -0.44873473048210144, "logps/chosen": -46.75214767456055, "logps/rejected": -20.47149658203125, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": 1.2867435216903687, "rewards/margins": 1.2637609243392944, "rewards/rejected": 0.02298259735107422, "step": 2027 }, { "epoch": 0.33, "learning_rate": 7.567567567567568e-08, "logits/chosen": -0.5793745517730713, "logits/rejected": -0.6078525185585022, "logps/chosen": -79.56889343261719, "logps/rejected": -80.45892333984375, "loss": 1.0906, "rewards/accuracies": 0.0, "rewards/chosen": 0.8221389651298523, "rewards/margins": -1.786736249923706, "rewards/rejected": 2.608875274658203, "step": 2028 }, { "epoch": 0.33, "learning_rate": 7.837837837837838e-08, "logits/chosen": -0.6959444880485535, "logits/rejected": -0.7246010303497314, "logps/chosen": -51.61113739013672, "logps/rejected": -74.92472839355469, "loss": 0.7682, "rewards/accuracies": 1.0, "rewards/chosen": 0.38999292254447937, "rewards/margins": 0.09895211458206177, "rewards/rejected": 0.2910408079624176, "step": 2029 }, { "epoch": 0.33, "learning_rate": 8.108108108108108e-08, "logits/chosen": -0.4849380850791931, "logits/rejected": -0.3218994736671448, "logps/chosen": -69.89143371582031, "logps/rejected": -70.8180923461914, "loss": 1.081, "rewards/accuracies": 1.0, "rewards/chosen": 1.123731255531311, "rewards/margins": 0.3487793207168579, "rewards/rejected": 0.7749519348144531, "step": 2030 }, { "epoch": 0.33, "learning_rate": 8.378378378378379e-08, "logits/chosen": -0.3807983100414276, "logits/rejected": -0.2429821640253067, "logps/chosen": -104.20154571533203, "logps/rejected": -120.10697937011719, "loss": 0.9437, "rewards/accuracies": 0.0, "rewards/chosen": 1.064954400062561, "rewards/margins": -1.371590495109558, "rewards/rejected": 2.436544895172119, "step": 2031 }, { "epoch": 0.33, "learning_rate": 8.648648648648649e-08, "logits/chosen": -0.44251009821891785, "logits/rejected": -0.3180239498615265, "logps/chosen": -50.57048034667969, "logps/rejected": -55.279762268066406, "loss": 1.205, "rewards/accuracies": 1.0, "rewards/chosen": 1.8425095081329346, "rewards/margins": 0.1851646900177002, "rewards/rejected": 1.6573448181152344, "step": 2032 }, { "epoch": 0.33, "learning_rate": 8.918918918918919e-08, "logits/chosen": -0.23727603256702423, "logits/rejected": -0.2782098352909088, "logps/chosen": -68.73775482177734, "logps/rejected": -60.49174880981445, "loss": 1.2268, "rewards/accuracies": 0.0, "rewards/chosen": 0.78143310546875, "rewards/margins": -0.343570351600647, "rewards/rejected": 1.125003457069397, "step": 2033 }, { "epoch": 0.33, "learning_rate": 9.189189189189189e-08, "logits/chosen": -0.47169485688209534, "logits/rejected": -0.32300475239753723, "logps/chosen": -118.59674835205078, "logps/rejected": -22.62067985534668, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 3.076396942138672, "rewards/margins": 2.714921474456787, "rewards/rejected": 0.36147555708885193, "step": 2034 }, { "epoch": 0.33, "learning_rate": 9.45945945945946e-08, "logits/chosen": -0.5107951164245605, "logits/rejected": -0.2993049919605255, "logps/chosen": -54.544761657714844, "logps/rejected": -70.09373474121094, "loss": 0.6865, "rewards/accuracies": 0.0, "rewards/chosen": 0.6940540671348572, "rewards/margins": -0.8803378939628601, "rewards/rejected": 1.5743919610977173, "step": 2035 }, { "epoch": 0.33, "learning_rate": 9.72972972972973e-08, "logits/chosen": -0.3845590353012085, "logits/rejected": -0.31841495633125305, "logps/chosen": -92.71344757080078, "logps/rejected": -114.28942108154297, "loss": 0.3846, "rewards/accuracies": 1.0, "rewards/chosen": 0.519366443157196, "rewards/margins": 0.07195204496383667, "rewards/rejected": 0.4474143981933594, "step": 2036 }, { "epoch": 0.33, "learning_rate": 1e-07, "logits/chosen": -0.8519560098648071, "logits/rejected": -0.8365830779075623, "logps/chosen": -112.3161849975586, "logps/rejected": -142.29884338378906, "loss": 1.6262, "rewards/accuracies": 0.0, "rewards/chosen": 0.6858375668525696, "rewards/margins": -2.4514000415802, "rewards/rejected": 3.137237548828125, "step": 2037 }, { "epoch": 0.33, "learning_rate": 1.0270270270270271e-07, "logits/chosen": -0.3036808371543884, "logits/rejected": -0.2952026128768921, "logps/chosen": -65.54125213623047, "logps/rejected": -110.25564575195312, "loss": 0.2964, "rewards/accuracies": 1.0, "rewards/chosen": 0.8499199151992798, "rewards/margins": 0.7021530270576477, "rewards/rejected": 0.1477668732404709, "step": 2038 }, { "epoch": 0.33, "learning_rate": 1.0540540540540541e-07, "logits/chosen": -0.6328423619270325, "logits/rejected": -0.5155761241912842, "logps/chosen": -74.40544128417969, "logps/rejected": -62.71959686279297, "loss": 0.2321, "rewards/accuracies": 1.0, "rewards/chosen": 3.0159897804260254, "rewards/margins": 1.1924583911895752, "rewards/rejected": 1.8235313892364502, "step": 2039 }, { "epoch": 0.33, "learning_rate": 1.0810810810810811e-07, "logits/chosen": -0.4136113226413727, "logits/rejected": -0.2985445261001587, "logps/chosen": -58.100955963134766, "logps/rejected": -17.43105697631836, "loss": 0.4297, "rewards/accuracies": 1.0, "rewards/chosen": 1.8957089185714722, "rewards/margins": 1.311145305633545, "rewards/rejected": 0.584563672542572, "step": 2040 }, { "epoch": 0.33, "learning_rate": 1.1081081081081081e-07, "logits/chosen": -0.3525519073009491, "logits/rejected": -0.40071672201156616, "logps/chosen": -51.58453369140625, "logps/rejected": -52.014652252197266, "loss": 0.5088, "rewards/accuracies": 1.0, "rewards/chosen": 1.5897293090820312, "rewards/margins": 0.3481639623641968, "rewards/rejected": 1.2415653467178345, "step": 2041 }, { "epoch": 0.33, "learning_rate": 1.1351351351351351e-07, "logits/chosen": -0.7932590842247009, "logits/rejected": -0.7947293519973755, "logps/chosen": -126.49179077148438, "logps/rejected": -110.78194427490234, "loss": 1.5048, "rewards/accuracies": 0.0, "rewards/chosen": 0.552838146686554, "rewards/margins": -2.085700273513794, "rewards/rejected": 2.638538360595703, "step": 2042 }, { "epoch": 0.33, "learning_rate": 1.1621621621621621e-07, "logits/chosen": -0.07071737200021744, "logits/rejected": -0.07071737200021744, "logps/chosen": -22.205970764160156, "logps/rejected": -22.205970764160156, "loss": 0.9425, "rewards/accuracies": 0.0, "rewards/chosen": 0.11699390411376953, "rewards/margins": 0.0, "rewards/rejected": 0.11699390411376953, "step": 2043 }, { "epoch": 0.33, "learning_rate": 1.1891891891891891e-07, "logits/chosen": -0.035101473331451416, "logits/rejected": 0.05514253303408623, "logps/chosen": -49.647674560546875, "logps/rejected": -57.776695251464844, "loss": 0.6958, "rewards/accuracies": 1.0, "rewards/chosen": 1.6666145324707031, "rewards/margins": 0.411601185798645, "rewards/rejected": 1.255013346672058, "step": 2044 }, { "epoch": 0.33, "learning_rate": 1.2162162162162163e-07, "logits/chosen": -0.8557162880897522, "logits/rejected": -0.8688034415245056, "logps/chosen": -50.4608039855957, "logps/rejected": -74.76761627197266, "loss": 0.9427, "rewards/accuracies": 1.0, "rewards/chosen": 0.0015529632801190019, "rewards/margins": 0.04403114318847656, "rewards/rejected": -0.042478181421756744, "step": 2045 }, { "epoch": 0.33, "learning_rate": 1.2432432432432432e-07, "logits/chosen": -0.6132864952087402, "logits/rejected": -0.6163443922996521, "logps/chosen": -76.11798858642578, "logps/rejected": -61.60945510864258, "loss": 0.4572, "rewards/accuracies": 1.0, "rewards/chosen": 1.8433891534805298, "rewards/margins": 0.28444933891296387, "rewards/rejected": 1.558939814567566, "step": 2046 }, { "epoch": 0.33, "learning_rate": 1.2702702702702703e-07, "logits/chosen": -0.4204026162624359, "logits/rejected": -0.40981611609458923, "logps/chosen": -58.771484375, "logps/rejected": -81.18952178955078, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 1.2749923467636108, "rewards/margins": 0.2707076072692871, "rewards/rejected": 1.0042847394943237, "step": 2047 }, { "epoch": 0.33, "learning_rate": 1.2972972972972974e-07, "logits/chosen": -0.5222378969192505, "logits/rejected": -0.5230295658111572, "logps/chosen": -89.22915649414062, "logps/rejected": -67.88148498535156, "loss": 2.149, "rewards/accuracies": 0.0, "rewards/chosen": 0.6649025082588196, "rewards/margins": -0.53328937292099, "rewards/rejected": 1.1981918811798096, "step": 2048 }, { "epoch": 0.33, "learning_rate": 1.3243243243243243e-07, "logits/chosen": -0.5854445099830627, "logits/rejected": -0.5070541501045227, "logps/chosen": -111.55198669433594, "logps/rejected": -126.95785522460938, "loss": 0.954, "rewards/accuracies": 0.0, "rewards/chosen": 2.1817734241485596, "rewards/margins": -1.530839443206787, "rewards/rejected": 3.7126128673553467, "step": 2049 }, { "epoch": 0.33, "learning_rate": 1.3513513513513515e-07, "logits/chosen": -0.4060167968273163, "logits/rejected": -0.4060167968273163, "logps/chosen": -95.40303039550781, "logps/rejected": -95.40303039550781, "loss": 1.1057, "rewards/accuracies": 0.0, "rewards/chosen": 0.8697220087051392, "rewards/margins": 0.0, "rewards/rejected": 0.8697220087051392, "step": 2050 }, { "epoch": 0.33, "learning_rate": 1.3783783783783783e-07, "logits/chosen": -0.7633802890777588, "logits/rejected": -0.664988100528717, "logps/chosen": -268.5889892578125, "logps/rejected": -118.34046936035156, "loss": 0.4567, "rewards/accuracies": 0.0, "rewards/chosen": 2.879901170730591, "rewards/margins": -0.3425629138946533, "rewards/rejected": 3.222464084625244, "step": 2051 }, { "epoch": 0.33, "learning_rate": 1.4054054054054055e-07, "logits/chosen": -0.39919883012771606, "logits/rejected": -0.4086095690727234, "logps/chosen": -2.686551094055176, "logps/rejected": -4.430522441864014, "loss": 0.7806, "rewards/accuracies": 1.0, "rewards/chosen": 0.27377042174339294, "rewards/margins": 0.004891037940979004, "rewards/rejected": 0.26887938380241394, "step": 2052 }, { "epoch": 0.33, "learning_rate": 1.4324324324324323e-07, "logits/chosen": -0.5000694990158081, "logits/rejected": -0.4735373258590698, "logps/chosen": -79.08506774902344, "logps/rejected": -37.87040328979492, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 1.9852676391601562, "rewards/margins": 0.7710673809051514, "rewards/rejected": 1.2142002582550049, "step": 2053 }, { "epoch": 0.33, "learning_rate": 1.4594594594594595e-07, "logits/chosen": -0.4315599203109741, "logits/rejected": -0.5387960076332092, "logps/chosen": -179.46109008789062, "logps/rejected": -57.65621566772461, "loss": 0.8975, "rewards/accuracies": 1.0, "rewards/chosen": 2.0425307750701904, "rewards/margins": 0.5699336528778076, "rewards/rejected": 1.4725971221923828, "step": 2054 }, { "epoch": 0.33, "learning_rate": 1.4864864864864866e-07, "logits/chosen": -0.31516799330711365, "logits/rejected": -0.1940845251083374, "logps/chosen": -93.16392517089844, "logps/rejected": -53.678802490234375, "loss": 0.3384, "rewards/accuracies": 1.0, "rewards/chosen": 1.3500092029571533, "rewards/margins": 0.8428447842597961, "rewards/rejected": 0.5071644186973572, "step": 2055 }, { "epoch": 0.33, "learning_rate": 1.5135135135135135e-07, "logits/chosen": -0.515066921710968, "logits/rejected": -0.4376254379749298, "logps/chosen": -38.96825408935547, "logps/rejected": -63.69441604614258, "loss": 0.3635, "rewards/accuracies": 1.0, "rewards/chosen": 1.0308319330215454, "rewards/margins": 0.47836267948150635, "rewards/rejected": 0.5524692535400391, "step": 2056 }, { "epoch": 0.33, "learning_rate": 1.5405405405405406e-07, "logits/chosen": -0.5618714094161987, "logits/rejected": -0.4478464424610138, "logps/chosen": -70.04454040527344, "logps/rejected": -58.91426086425781, "loss": 0.3227, "rewards/accuracies": 1.0, "rewards/chosen": 1.7362213134765625, "rewards/margins": 0.1062781810760498, "rewards/rejected": 1.6299431324005127, "step": 2057 }, { "epoch": 0.33, "learning_rate": 1.5675675675675675e-07, "logits/chosen": -0.11539799720048904, "logits/rejected": -0.17807158827781677, "logps/chosen": -84.80226135253906, "logps/rejected": -93.9679946899414, "loss": 0.8796, "rewards/accuracies": 0.0, "rewards/chosen": 0.3594978451728821, "rewards/margins": -1.0591614246368408, "rewards/rejected": 1.4186592102050781, "step": 2058 }, { "epoch": 0.33, "learning_rate": 1.5945945945945947e-07, "logits/chosen": -0.4364461600780487, "logits/rejected": -0.4574010372161865, "logps/chosen": -110.56755065917969, "logps/rejected": -57.81294631958008, "loss": 0.8229, "rewards/accuracies": 0.0, "rewards/chosen": 0.2866157591342926, "rewards/margins": -1.424540400505066, "rewards/rejected": 1.7111561298370361, "step": 2059 }, { "epoch": 0.33, "learning_rate": 1.6216216216216215e-07, "logits/chosen": -0.21880565583705902, "logits/rejected": -0.21880565583705902, "logps/chosen": -102.64314270019531, "logps/rejected": -102.64314270019531, "loss": 0.8441, "rewards/accuracies": 0.0, "rewards/chosen": -0.20289841294288635, "rewards/margins": 0.0, "rewards/rejected": -0.20289841294288635, "step": 2060 }, { "epoch": 0.33, "learning_rate": 1.6486486486486487e-07, "logits/chosen": -0.49467170238494873, "logits/rejected": -0.4970383048057556, "logps/chosen": -76.02372741699219, "logps/rejected": -83.04541778564453, "loss": 0.6244, "rewards/accuracies": 0.0, "rewards/chosen": 1.103127360343933, "rewards/margins": -0.6082320213317871, "rewards/rejected": 1.7113593816757202, "step": 2061 }, { "epoch": 0.33, "learning_rate": 1.6756756756756758e-07, "logits/chosen": -0.36150893568992615, "logits/rejected": -0.3695526719093323, "logps/chosen": -18.27265167236328, "logps/rejected": -59.375030517578125, "loss": 0.3781, "rewards/accuracies": 0.0, "rewards/chosen": 0.010984993539750576, "rewards/margins": -0.0761655792593956, "rewards/rejected": 0.08715057373046875, "step": 2062 }, { "epoch": 0.33, "learning_rate": 1.7027027027027027e-07, "logits/chosen": -0.38786569237709045, "logits/rejected": -0.3241172134876251, "logps/chosen": -178.71389770507812, "logps/rejected": -112.518798828125, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 2.8168792724609375, "rewards/margins": 1.918931484222412, "rewards/rejected": 0.8979477286338806, "step": 2063 }, { "epoch": 0.34, "learning_rate": 1.7297297297297298e-07, "logits/chosen": -0.5245824456214905, "logits/rejected": -0.44678863883018494, "logps/chosen": -64.96693420410156, "logps/rejected": -171.80343627929688, "loss": 1.0874, "rewards/accuracies": 0.0, "rewards/chosen": 1.3450897932052612, "rewards/margins": -2.014765739440918, "rewards/rejected": 3.3598556518554688, "step": 2064 }, { "epoch": 0.34, "learning_rate": 1.7567567567567567e-07, "logits/chosen": -0.3768271505832672, "logits/rejected": -0.34153103828430176, "logps/chosen": -150.87750244140625, "logps/rejected": -84.84649658203125, "loss": 0.9409, "rewards/accuracies": 0.0, "rewards/chosen": 0.6653381586074829, "rewards/margins": -1.2290176153182983, "rewards/rejected": 1.8943557739257812, "step": 2065 }, { "epoch": 0.34, "learning_rate": 1.7837837837837838e-07, "logits/chosen": -0.5428378582000732, "logits/rejected": -0.47758984565734863, "logps/chosen": -86.83586883544922, "logps/rejected": -77.62651062011719, "loss": 0.4967, "rewards/accuracies": 1.0, "rewards/chosen": 2.602501630783081, "rewards/margins": 1.3707572221755981, "rewards/rejected": 1.231744408607483, "step": 2066 }, { "epoch": 0.34, "learning_rate": 1.8108108108108107e-07, "logits/chosen": -0.43751776218414307, "logits/rejected": -0.472555935382843, "logps/chosen": -34.23125076293945, "logps/rejected": -76.50212097167969, "loss": 0.7537, "rewards/accuracies": 0.0, "rewards/chosen": 0.2871360778808594, "rewards/margins": -0.05288466811180115, "rewards/rejected": 0.3400207459926605, "step": 2067 }, { "epoch": 0.34, "learning_rate": 1.8378378378378379e-07, "logits/chosen": -0.9870815873146057, "logits/rejected": -0.9525981545448303, "logps/chosen": -78.24810028076172, "logps/rejected": -69.88269805908203, "loss": 1.1774, "rewards/accuracies": 0.0, "rewards/chosen": 0.2052208036184311, "rewards/margins": -0.7020416259765625, "rewards/rejected": 0.9072624444961548, "step": 2068 }, { "epoch": 0.34, "learning_rate": 1.864864864864865e-07, "logits/chosen": -0.41838932037353516, "logits/rejected": -0.32097697257995605, "logps/chosen": -86.78950500488281, "logps/rejected": -104.40106201171875, "loss": 1.3675, "rewards/accuracies": 0.0, "rewards/chosen": 0.36632004380226135, "rewards/margins": -2.2762885093688965, "rewards/rejected": 2.642608642578125, "step": 2069 }, { "epoch": 0.34, "learning_rate": 1.891891891891892e-07, "logits/chosen": -0.5866312980651855, "logits/rejected": -0.4222331643104553, "logps/chosen": -209.91336059570312, "logps/rejected": -26.514118194580078, "loss": 0.9102, "rewards/accuracies": 1.0, "rewards/chosen": 0.6344528198242188, "rewards/margins": 0.4944990277290344, "rewards/rejected": 0.13995380699634552, "step": 2070 }, { "epoch": 0.34, "learning_rate": 1.918918918918919e-07, "logits/chosen": -0.7850434184074402, "logits/rejected": -0.8048589825630188, "logps/chosen": -141.25350952148438, "logps/rejected": -74.68354797363281, "loss": 1.6213, "rewards/accuracies": 0.0, "rewards/chosen": -0.10486602783203125, "rewards/margins": -2.895122528076172, "rewards/rejected": 2.7902565002441406, "step": 2071 }, { "epoch": 0.34, "learning_rate": 1.945945945945946e-07, "logits/chosen": -0.5283366441726685, "logits/rejected": -0.5021358728408813, "logps/chosen": -138.172119140625, "logps/rejected": -134.6516876220703, "loss": 1.3409, "rewards/accuracies": 0.0, "rewards/chosen": 0.7082473635673523, "rewards/margins": -1.0946106910705566, "rewards/rejected": 1.8028579950332642, "step": 2072 }, { "epoch": 0.34, "learning_rate": 1.972972972972973e-07, "logits/chosen": -1.1567296981811523, "logits/rejected": -1.1046255826950073, "logps/chosen": -32.97633361816406, "logps/rejected": -103.1612548828125, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 1.6882027387619019, "rewards/margins": 0.6428787708282471, "rewards/rejected": 1.0453239679336548, "step": 2073 }, { "epoch": 0.34, "learning_rate": 2e-07, "logits/chosen": -0.15795527398586273, "logits/rejected": -0.21894557774066925, "logps/chosen": -83.0604248046875, "logps/rejected": -169.6582794189453, "loss": 1.4118, "rewards/accuracies": 0.0, "rewards/chosen": 0.6889892816543579, "rewards/margins": -2.5572938919067383, "rewards/rejected": 3.2462830543518066, "step": 2074 }, { "epoch": 0.34, "learning_rate": 2.027027027027027e-07, "logits/chosen": -0.25480571389198303, "logits/rejected": -0.3049890995025635, "logps/chosen": -86.36807250976562, "logps/rejected": -77.07135009765625, "loss": 0.4463, "rewards/accuracies": 1.0, "rewards/chosen": 1.6569503545761108, "rewards/margins": 0.5545897483825684, "rewards/rejected": 1.1023606061935425, "step": 2075 }, { "epoch": 0.34, "learning_rate": 2.0540540540540542e-07, "logits/chosen": -0.21459120512008667, "logits/rejected": -0.21459120512008667, "logps/chosen": -37.62116622924805, "logps/rejected": -37.62116622924805, "loss": 0.8987, "rewards/accuracies": 0.0, "rewards/chosen": 0.011270523071289062, "rewards/margins": 0.0, "rewards/rejected": 0.011270523071289062, "step": 2076 }, { "epoch": 0.34, "learning_rate": 2.081081081081081e-07, "logits/chosen": -0.5256888270378113, "logits/rejected": -0.5278785228729248, "logps/chosen": -13.586666107177734, "logps/rejected": -4.175704002380371, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.15412883460521698, "rewards/margins": -0.20864908397197723, "rewards/rejected": 0.3627779185771942, "step": 2077 }, { "epoch": 0.34, "learning_rate": 2.1081081081081082e-07, "logits/chosen": -0.33482393622398376, "logits/rejected": -0.29274430871009827, "logps/chosen": -76.78628540039062, "logps/rejected": -86.7144775390625, "loss": 0.5479, "rewards/accuracies": 1.0, "rewards/chosen": 0.9617156982421875, "rewards/margins": 0.2446182370185852, "rewards/rejected": 0.7170974612236023, "step": 2078 }, { "epoch": 0.34, "learning_rate": 2.135135135135135e-07, "logits/chosen": -0.6394751667976379, "logits/rejected": -0.5627712607383728, "logps/chosen": -73.00347900390625, "logps/rejected": -87.48075866699219, "loss": 1.164, "rewards/accuracies": 0.0, "rewards/chosen": 0.9444671869277954, "rewards/margins": -1.2879258394241333, "rewards/rejected": 2.2323930263519287, "step": 2079 }, { "epoch": 0.34, "learning_rate": 2.1621621621621622e-07, "logits/chosen": -0.6195282936096191, "logits/rejected": -1.0849559307098389, "logps/chosen": -109.33438110351562, "logps/rejected": -34.6919059753418, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 0.2847305238246918, "rewards/margins": 0.05933532118797302, "rewards/rejected": 0.22539520263671875, "step": 2080 }, { "epoch": 0.34, "learning_rate": 2.189189189189189e-07, "logits/chosen": -0.4409160315990448, "logits/rejected": -0.36093759536743164, "logps/chosen": -100.56845092773438, "logps/rejected": -169.4366455078125, "loss": 1.3672, "rewards/accuracies": 0.0, "rewards/chosen": 0.8044052124023438, "rewards/margins": -2.5783157348632812, "rewards/rejected": 3.382720947265625, "step": 2081 }, { "epoch": 0.34, "learning_rate": 2.2162162162162162e-07, "logits/chosen": -0.5751580595970154, "logits/rejected": -0.5117058753967285, "logps/chosen": -31.88165855407715, "logps/rejected": -75.52436065673828, "loss": 1.1255, "rewards/accuracies": 0.0, "rewards/chosen": 0.8801305890083313, "rewards/margins": -0.4824572205543518, "rewards/rejected": 1.362587809562683, "step": 2082 }, { "epoch": 0.34, "learning_rate": 2.2432432432432434e-07, "logits/chosen": -0.3115454912185669, "logits/rejected": -0.3115454912185669, "logps/chosen": -71.73295593261719, "logps/rejected": -71.73295593261719, "loss": 0.3576, "rewards/accuracies": 0.0, "rewards/chosen": 0.9422271847724915, "rewards/margins": 0.0, "rewards/rejected": 0.9422271847724915, "step": 2083 }, { "epoch": 0.34, "learning_rate": 2.2702702702702703e-07, "logits/chosen": -0.23384462296962738, "logits/rejected": -0.23384462296962738, "logps/chosen": -83.81204223632812, "logps/rejected": -83.81204223632812, "loss": 0.9154, "rewards/accuracies": 0.0, "rewards/chosen": 1.0303878784179688, "rewards/margins": 0.0, "rewards/rejected": 1.0303878784179688, "step": 2084 }, { "epoch": 0.34, "learning_rate": 2.2972972972972974e-07, "logits/chosen": -0.4776977002620697, "logits/rejected": -0.3536386787891388, "logps/chosen": -175.89645385742188, "logps/rejected": -30.640098571777344, "loss": 1.9272, "rewards/accuracies": 1.0, "rewards/chosen": 2.1332595348358154, "rewards/margins": 1.6118953227996826, "rewards/rejected": 0.5213642120361328, "step": 2085 }, { "epoch": 0.34, "learning_rate": 2.3243243243243243e-07, "logits/chosen": -0.3918249309062958, "logits/rejected": -0.3181737959384918, "logps/chosen": -57.26548767089844, "logps/rejected": -107.609375, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 1.831294298171997, "rewards/margins": 2.5592002868652344, "rewards/rejected": -0.7279060482978821, "step": 2086 }, { "epoch": 0.34, "learning_rate": 2.3513513513513514e-07, "logits/chosen": -0.40748098492622375, "logits/rejected": -0.3565293252468109, "logps/chosen": -70.5374526977539, "logps/rejected": -31.041704177856445, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 2.36540150642395, "rewards/margins": 2.189488172531128, "rewards/rejected": 0.17591343820095062, "step": 2087 }, { "epoch": 0.34, "learning_rate": 2.3783783783783783e-07, "logits/chosen": -0.530112087726593, "logits/rejected": -0.5384843349456787, "logps/chosen": -32.88288116455078, "logps/rejected": -75.06842803955078, "loss": 0.612, "rewards/accuracies": 0.0, "rewards/chosen": 1.2872051000595093, "rewards/margins": -0.5652890205383301, "rewards/rejected": 1.8524941205978394, "step": 2088 }, { "epoch": 0.34, "learning_rate": 2.4054054054054054e-07, "logits/chosen": -0.7681447863578796, "logits/rejected": -0.8044151067733765, "logps/chosen": -132.8482208251953, "logps/rejected": -59.4769172668457, "loss": 0.1891, "rewards/accuracies": 1.0, "rewards/chosen": 3.4742019176483154, "rewards/margins": 0.935279369354248, "rewards/rejected": 2.5389225482940674, "step": 2089 }, { "epoch": 0.34, "learning_rate": 2.4324324324324326e-07, "logits/chosen": -0.7545903921127319, "logits/rejected": -0.8444384932518005, "logps/chosen": -303.37420654296875, "logps/rejected": -173.98971557617188, "loss": 1.9582, "rewards/accuracies": 0.0, "rewards/chosen": 3.0795655250549316, "rewards/margins": -1.6669034957885742, "rewards/rejected": 4.746469020843506, "step": 2090 }, { "epoch": 0.34, "learning_rate": 2.4594594594594597e-07, "logits/chosen": -0.4266819655895233, "logits/rejected": -0.4180111289024353, "logps/chosen": -34.281982421875, "logps/rejected": -19.1921443939209, "loss": 0.457, "rewards/accuracies": 1.0, "rewards/chosen": 0.7617210745811462, "rewards/margins": 0.36888355016708374, "rewards/rejected": 0.3928375244140625, "step": 2091 }, { "epoch": 0.34, "learning_rate": 2.4864864864864863e-07, "logits/chosen": -0.21381574869155884, "logits/rejected": -0.1378152221441269, "logps/chosen": -66.9754867553711, "logps/rejected": -52.64897918701172, "loss": 0.6974, "rewards/accuracies": 1.0, "rewards/chosen": 1.776404619216919, "rewards/margins": 0.8961422443389893, "rewards/rejected": 0.8802623748779297, "step": 2092 }, { "epoch": 0.34, "learning_rate": 2.5135135135135135e-07, "logits/chosen": -0.6110963225364685, "logits/rejected": -0.5330227613449097, "logps/chosen": -58.98561096191406, "logps/rejected": -22.702373504638672, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 1.0214828252792358, "rewards/margins": 0.6880122423171997, "rewards/rejected": 0.33347055315971375, "step": 2093 }, { "epoch": 0.34, "learning_rate": 2.5405405405405406e-07, "logits/chosen": -0.5064904689788818, "logits/rejected": -0.4627798795700073, "logps/chosen": -48.717342376708984, "logps/rejected": -102.36174011230469, "loss": 0.8169, "rewards/accuracies": 0.0, "rewards/chosen": 1.8090511560440063, "rewards/margins": -0.5068341493606567, "rewards/rejected": 2.315885305404663, "step": 2094 }, { "epoch": 0.34, "learning_rate": 2.567567567567567e-07, "logits/chosen": -0.27577537298202515, "logits/rejected": -0.266548752784729, "logps/chosen": -55.46177291870117, "logps/rejected": -75.26455688476562, "loss": 0.5234, "rewards/accuracies": 0.0, "rewards/chosen": 1.0554523468017578, "rewards/margins": -0.07552909851074219, "rewards/rejected": 1.1309814453125, "step": 2095 }, { "epoch": 0.34, "learning_rate": 2.594594594594595e-07, "logits/chosen": -0.7177397608757019, "logits/rejected": -0.7225173115730286, "logps/chosen": -69.8622055053711, "logps/rejected": -154.9500274658203, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 1.3261131048202515, "rewards/margins": 1.5692206621170044, "rewards/rejected": -0.2431076020002365, "step": 2096 }, { "epoch": 0.34, "learning_rate": 2.6216216216216215e-07, "logits/chosen": -0.6032572984695435, "logits/rejected": -0.575149655342102, "logps/chosen": -120.93714904785156, "logps/rejected": -75.07658386230469, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 2.901280164718628, "rewards/margins": 2.0998427867889404, "rewards/rejected": 0.8014373779296875, "step": 2097 }, { "epoch": 0.34, "learning_rate": 2.6486486486486486e-07, "logits/chosen": -0.905024528503418, "logits/rejected": -0.8705285787582397, "logps/chosen": -129.5218048095703, "logps/rejected": -85.56600952148438, "loss": 0.608, "rewards/accuracies": 1.0, "rewards/chosen": 2.848994493484497, "rewards/margins": 0.6727402210235596, "rewards/rejected": 2.1762542724609375, "step": 2098 }, { "epoch": 0.34, "learning_rate": 2.675675675675675e-07, "logits/chosen": -0.5857173204421997, "logits/rejected": -0.5701577663421631, "logps/chosen": -65.3134765625, "logps/rejected": -38.46866226196289, "loss": 0.5364, "rewards/accuracies": 1.0, "rewards/chosen": 2.8084068298339844, "rewards/margins": 0.6934795379638672, "rewards/rejected": 2.114927291870117, "step": 2099 }, { "epoch": 0.34, "learning_rate": 2.702702702702703e-07, "logits/chosen": -0.4619256854057312, "logits/rejected": -0.5031762719154358, "logps/chosen": -45.477294921875, "logps/rejected": -160.77752685546875, "loss": 1.6728, "rewards/accuracies": 0.0, "rewards/chosen": 1.3238747119903564, "rewards/margins": -2.4426276683807373, "rewards/rejected": 3.7665023803710938, "step": 2100 }, { "epoch": 0.34, "learning_rate": 2.7297297297297295e-07, "logits/chosen": -0.22224345803260803, "logits/rejected": -0.22224345803260803, "logps/chosen": -39.6258544921875, "logps/rejected": -39.6258544921875, "loss": 1.2455, "rewards/accuracies": 0.0, "rewards/chosen": 0.44067153334617615, "rewards/margins": 0.0, "rewards/rejected": 0.44067153334617615, "step": 2101 }, { "epoch": 0.34, "learning_rate": 2.7567567567567567e-07, "logits/chosen": -0.7658488750457764, "logits/rejected": -0.7641267776489258, "logps/chosen": -67.47772216796875, "logps/rejected": -20.661890029907227, "loss": 0.7227, "rewards/accuracies": 0.0, "rewards/chosen": -0.15045319497585297, "rewards/margins": -0.7769724130630493, "rewards/rejected": 0.6265192031860352, "step": 2102 }, { "epoch": 0.34, "learning_rate": 2.7837837837837833e-07, "logits/chosen": -0.6671684980392456, "logits/rejected": -0.5664970278739929, "logps/chosen": -132.27902221679688, "logps/rejected": -103.71363830566406, "loss": 0.4934, "rewards/accuracies": 0.0, "rewards/chosen": 0.28730469942092896, "rewards/margins": -0.3447967767715454, "rewards/rejected": 0.6321014761924744, "step": 2103 }, { "epoch": 0.34, "learning_rate": 2.810810810810811e-07, "logits/chosen": -1.407422661781311, "logits/rejected": -1.393861174583435, "logps/chosen": -56.57591247558594, "logps/rejected": -131.94935607910156, "loss": 1.0292, "rewards/accuracies": 0.0, "rewards/chosen": 0.42647743225097656, "rewards/margins": -1.7983288764953613, "rewards/rejected": 2.224806308746338, "step": 2104 }, { "epoch": 0.34, "learning_rate": 2.8378378378378376e-07, "logits/chosen": -0.5509414672851562, "logits/rejected": -0.5496905446052551, "logps/chosen": -112.43663024902344, "logps/rejected": -122.34059143066406, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 2.9288527965545654, "rewards/margins": 2.3991453647613525, "rewards/rejected": 0.5297073721885681, "step": 2105 }, { "epoch": 0.34, "learning_rate": 2.8648648648648647e-07, "logits/chosen": -0.36417436599731445, "logits/rejected": -0.3720613121986389, "logps/chosen": -3.296194553375244, "logps/rejected": -39.647315979003906, "loss": 0.4356, "rewards/accuracies": 1.0, "rewards/chosen": 0.2768876254558563, "rewards/margins": 0.3081284761428833, "rewards/rejected": -0.03124084509909153, "step": 2106 }, { "epoch": 0.34, "learning_rate": 2.891891891891892e-07, "logits/chosen": -0.31907564401626587, "logits/rejected": -0.3164938986301422, "logps/chosen": -119.68548583984375, "logps/rejected": -135.78546142578125, "loss": 0.9167, "rewards/accuracies": 0.0, "rewards/chosen": 1.3516753911972046, "rewards/margins": -0.752121090888977, "rewards/rejected": 2.1037964820861816, "step": 2107 }, { "epoch": 0.34, "learning_rate": 2.918918918918919e-07, "logits/chosen": -0.513139009475708, "logits/rejected": -0.4412067234516144, "logps/chosen": -141.80752563476562, "logps/rejected": -95.17527770996094, "loss": 2.6171, "rewards/accuracies": 1.0, "rewards/chosen": 3.4458389282226562, "rewards/margins": 0.549835205078125, "rewards/rejected": 2.8960037231445312, "step": 2108 }, { "epoch": 0.34, "learning_rate": 2.9459459459459456e-07, "logits/chosen": -0.29349371790885925, "logits/rejected": -0.2944623529911041, "logps/chosen": -1.3217368125915527, "logps/rejected": -5.546994209289551, "loss": 0.4274, "rewards/accuracies": 1.0, "rewards/chosen": 0.1768854707479477, "rewards/margins": 0.004094123840332031, "rewards/rejected": 0.17279134690761566, "step": 2109 }, { "epoch": 0.34, "learning_rate": 2.972972972972973e-07, "logits/chosen": -0.57938152551651, "logits/rejected": -0.5586732029914856, "logps/chosen": -122.30465698242188, "logps/rejected": -111.0719985961914, "loss": 1.3683, "rewards/accuracies": 0.0, "rewards/chosen": 0.7033004760742188, "rewards/margins": -2.102275848388672, "rewards/rejected": 2.8055763244628906, "step": 2110 }, { "epoch": 0.34, "learning_rate": 3e-07, "logits/chosen": -0.60262131690979, "logits/rejected": -0.60262131690979, "logps/chosen": -72.64495849609375, "logps/rejected": -72.64495849609375, "loss": 1.0316, "rewards/accuracies": 0.0, "rewards/chosen": 1.4059876203536987, "rewards/margins": 0.0, "rewards/rejected": 1.4059876203536987, "step": 2111 }, { "epoch": 0.34, "learning_rate": 3.027027027027027e-07, "logits/chosen": -0.606593668460846, "logits/rejected": -0.5257943272590637, "logps/chosen": -86.20037841796875, "logps/rejected": -78.36763763427734, "loss": 0.6687, "rewards/accuracies": 0.0, "rewards/chosen": 1.1769554615020752, "rewards/margins": -0.5105079412460327, "rewards/rejected": 1.687463402748108, "step": 2112 }, { "epoch": 0.34, "learning_rate": 3.0540540540540536e-07, "logits/chosen": -0.8816842436790466, "logits/rejected": -0.878835916519165, "logps/chosen": -65.54835510253906, "logps/rejected": -20.9567928314209, "loss": 1.21, "rewards/accuracies": 1.0, "rewards/chosen": 1.3739356994628906, "rewards/margins": 1.131522536277771, "rewards/rejected": 0.24241314828395844, "step": 2113 }, { "epoch": 0.34, "learning_rate": 3.0810810810810813e-07, "logits/chosen": -0.4023173749446869, "logits/rejected": -0.3290365934371948, "logps/chosen": -39.826454162597656, "logps/rejected": -17.926347732543945, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.9319900870323181, "rewards/margins": 0.012237012386322021, "rewards/rejected": 0.9197530746459961, "step": 2114 }, { "epoch": 0.34, "learning_rate": 3.108108108108108e-07, "logits/chosen": 0.0029672724194824696, "logits/rejected": -0.6230562925338745, "logps/chosen": -14.889159202575684, "logps/rejected": -72.26271057128906, "loss": 1.3951, "rewards/accuracies": 0.0, "rewards/chosen": 0.7404770851135254, "rewards/margins": -1.0032912492752075, "rewards/rejected": 1.743768334388733, "step": 2115 }, { "epoch": 0.34, "learning_rate": 3.135135135135135e-07, "logits/chosen": -0.8997620344161987, "logits/rejected": -0.9156690835952759, "logps/chosen": -141.1888427734375, "logps/rejected": -42.83502197265625, "loss": 0.4227, "rewards/accuracies": 1.0, "rewards/chosen": 0.47012636065483093, "rewards/margins": 0.36802026629447937, "rewards/rejected": 0.10210609436035156, "step": 2116 }, { "epoch": 0.34, "learning_rate": 3.162162162162162e-07, "logits/chosen": -0.8598780035972595, "logits/rejected": -0.8443993926048279, "logps/chosen": -49.650665283203125, "logps/rejected": -54.46894836425781, "loss": 0.5954, "rewards/accuracies": 1.0, "rewards/chosen": 1.1932018995285034, "rewards/margins": 0.14836621284484863, "rewards/rejected": 1.0448356866836548, "step": 2117 }, { "epoch": 0.34, "learning_rate": 3.1891891891891893e-07, "logits/chosen": -0.7210777997970581, "logits/rejected": -0.6697502136230469, "logps/chosen": -65.39427185058594, "logps/rejected": -74.64293670654297, "loss": 1.491, "rewards/accuracies": 0.0, "rewards/chosen": 0.8207870721817017, "rewards/margins": -1.149742841720581, "rewards/rejected": 1.9705299139022827, "step": 2118 }, { "epoch": 0.34, "learning_rate": 3.216216216216216e-07, "logits/chosen": -0.8095804452896118, "logits/rejected": -0.8211345076560974, "logps/chosen": -87.32853698730469, "logps/rejected": -113.95744323730469, "loss": 0.4096, "rewards/accuracies": 1.0, "rewards/chosen": 0.9039276242256165, "rewards/margins": 0.503313422203064, "rewards/rejected": 0.4006141722202301, "step": 2119 }, { "epoch": 0.34, "learning_rate": 3.243243243243243e-07, "logits/chosen": -0.4072106182575226, "logits/rejected": -0.43325117230415344, "logps/chosen": -47.53099822998047, "logps/rejected": -42.22876739501953, "loss": 0.7836, "rewards/accuracies": 0.0, "rewards/chosen": 1.4767287969589233, "rewards/margins": -0.3341946601867676, "rewards/rejected": 1.810923457145691, "step": 2120 }, { "epoch": 0.34, "learning_rate": 3.27027027027027e-07, "logits/chosen": -0.5100024938583374, "logits/rejected": -0.4933726191520691, "logps/chosen": -200.23446655273438, "logps/rejected": -178.0823516845703, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 2.213391065597534, "rewards/margins": 2.044856309890747, "rewards/rejected": 0.16853486001491547, "step": 2121 }, { "epoch": 0.34, "learning_rate": 3.2972972972972973e-07, "logits/chosen": -0.8930361866950989, "logits/rejected": -0.8889463543891907, "logps/chosen": -97.29705810546875, "logps/rejected": -115.9339828491211, "loss": 1.602, "rewards/accuracies": 0.0, "rewards/chosen": 1.9620956182479858, "rewards/margins": -0.8591150045394897, "rewards/rejected": 2.8212106227874756, "step": 2122 }, { "epoch": 0.34, "learning_rate": 3.324324324324324e-07, "logits/chosen": -0.7545350193977356, "logits/rejected": -0.6707737445831299, "logps/chosen": -94.36369323730469, "logps/rejected": -67.99639892578125, "loss": 0.8532, "rewards/accuracies": 0.0, "rewards/chosen": 0.40174180269241333, "rewards/margins": -0.9663658738136292, "rewards/rejected": 1.3681076765060425, "step": 2123 }, { "epoch": 0.34, "learning_rate": 3.3513513513513516e-07, "logits/chosen": -0.4709581732749939, "logits/rejected": -0.4709581732749939, "logps/chosen": -104.78665924072266, "logps/rejected": -104.78665924072266, "loss": 0.7841, "rewards/accuracies": 0.0, "rewards/chosen": 1.002400279045105, "rewards/margins": 0.0, "rewards/rejected": 1.002400279045105, "step": 2124 }, { "epoch": 0.34, "learning_rate": 3.378378378378378e-07, "logits/chosen": -0.4606289565563202, "logits/rejected": -0.4472913146018982, "logps/chosen": -3.650759220123291, "logps/rejected": -10.806795120239258, "loss": 1.2119, "rewards/accuracies": 1.0, "rewards/chosen": 0.11255116760730743, "rewards/margins": 0.12492747604846954, "rewards/rejected": -0.01237630844116211, "step": 2125 }, { "epoch": 0.35, "learning_rate": 3.4054054054054054e-07, "logits/chosen": -0.4982917010784149, "logits/rejected": -0.42007899284362793, "logps/chosen": -41.07697296142578, "logps/rejected": -74.84559631347656, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 1.9199455976486206, "rewards/margins": 1.190885305404663, "rewards/rejected": 0.7290603518486023, "step": 2126 }, { "epoch": 0.35, "learning_rate": 3.432432432432432e-07, "logits/chosen": -0.5195062756538391, "logits/rejected": -0.5336887836456299, "logps/chosen": -99.69735717773438, "logps/rejected": -41.57301712036133, "loss": 0.4023, "rewards/accuracies": 1.0, "rewards/chosen": 0.43153077363967896, "rewards/margins": 0.27867430448532104, "rewards/rejected": 0.15285645425319672, "step": 2127 }, { "epoch": 0.35, "learning_rate": 3.4594594594594597e-07, "logits/chosen": -0.40789586305618286, "logits/rejected": -0.3599899411201477, "logps/chosen": -66.72909545898438, "logps/rejected": -82.58778381347656, "loss": 1.001, "rewards/accuracies": 0.0, "rewards/chosen": 1.2740601301193237, "rewards/margins": -0.22332453727722168, "rewards/rejected": 1.4973846673965454, "step": 2128 }, { "epoch": 0.35, "learning_rate": 3.4864864864864863e-07, "logits/chosen": -0.3354593813419342, "logits/rejected": -0.39856377243995667, "logps/chosen": -79.95133972167969, "logps/rejected": -101.18598937988281, "loss": 0.7983, "rewards/accuracies": 0.0, "rewards/chosen": 0.2139030545949936, "rewards/margins": -0.5672577023506165, "rewards/rejected": 0.7811607718467712, "step": 2129 }, { "epoch": 0.35, "learning_rate": 3.5135135135135134e-07, "logits/chosen": -0.5026493668556213, "logits/rejected": -0.5075084567070007, "logps/chosen": -174.83963012695312, "logps/rejected": -217.1357879638672, "loss": 0.3749, "rewards/accuracies": 1.0, "rewards/chosen": 5.568933010101318, "rewards/margins": 1.363603115081787, "rewards/rejected": 4.205329895019531, "step": 2130 }, { "epoch": 0.35, "learning_rate": 3.5405405405405406e-07, "logits/chosen": -0.8483948111534119, "logits/rejected": -0.7188287377357483, "logps/chosen": -269.05718994140625, "logps/rejected": -158.0476837158203, "loss": 2.8883, "rewards/accuracies": 0.0, "rewards/chosen": 1.210150122642517, "rewards/margins": -3.9708189964294434, "rewards/rejected": 5.18096923828125, "step": 2131 }, { "epoch": 0.35, "learning_rate": 3.5675675675675677e-07, "logits/chosen": -0.512380838394165, "logits/rejected": -0.4989902079105377, "logps/chosen": -64.57154083251953, "logps/rejected": -85.65404510498047, "loss": 1.6909, "rewards/accuracies": 1.0, "rewards/chosen": 2.1551263332366943, "rewards/margins": 0.41124284267425537, "rewards/rejected": 1.743883490562439, "step": 2132 }, { "epoch": 0.35, "learning_rate": 3.5945945945945943e-07, "logits/chosen": -0.5707583427429199, "logits/rejected": -0.4322517216205597, "logps/chosen": -135.56240844726562, "logps/rejected": -97.17918395996094, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": 3.101214647293091, "rewards/margins": 1.4720871448516846, "rewards/rejected": 1.6291275024414062, "step": 2133 }, { "epoch": 0.35, "learning_rate": 3.6216216216216214e-07, "logits/chosen": -0.5827527642250061, "logits/rejected": -0.5848903059959412, "logps/chosen": -41.62854766845703, "logps/rejected": -36.187320709228516, "loss": 0.5527, "rewards/accuracies": 0.0, "rewards/chosen": 0.5945881009101868, "rewards/margins": -0.35291290283203125, "rewards/rejected": 0.947501003742218, "step": 2134 }, { "epoch": 0.35, "learning_rate": 3.6486486486486486e-07, "logits/chosen": -0.6887966990470886, "logits/rejected": -0.6479496359825134, "logps/chosen": -56.6466064453125, "logps/rejected": -50.956321716308594, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 2.0885109901428223, "rewards/margins": 0.7345261573791504, "rewards/rejected": 1.3539848327636719, "step": 2135 }, { "epoch": 0.35, "learning_rate": 3.6756756756756757e-07, "logits/chosen": -0.2532139718532562, "logits/rejected": -0.22924433648586273, "logps/chosen": -40.955570220947266, "logps/rejected": -9.368001937866211, "loss": 1.4052, "rewards/accuracies": 0.0, "rewards/chosen": 0.05186271667480469, "rewards/margins": -0.29736843705177307, "rewards/rejected": 0.34923115372657776, "step": 2136 }, { "epoch": 0.35, "learning_rate": 3.7027027027027023e-07, "logits/chosen": -0.4545397162437439, "logits/rejected": -0.41204771399497986, "logps/chosen": -68.97218322753906, "logps/rejected": -82.40409088134766, "loss": 0.8883, "rewards/accuracies": 0.0, "rewards/chosen": 1.5376434326171875, "rewards/margins": -0.32169950008392334, "rewards/rejected": 1.8593429327011108, "step": 2137 }, { "epoch": 0.35, "learning_rate": 3.72972972972973e-07, "logits/chosen": -0.7334949970245361, "logits/rejected": -0.6883483529090881, "logps/chosen": -69.63099670410156, "logps/rejected": -73.0766372680664, "loss": 0.4149, "rewards/accuracies": 1.0, "rewards/chosen": 1.0469802618026733, "rewards/margins": 0.08920055627822876, "rewards/rejected": 0.9577797055244446, "step": 2138 }, { "epoch": 0.35, "learning_rate": 3.7567567567567566e-07, "logits/chosen": -0.10799092799425125, "logits/rejected": -0.11077418923377991, "logps/chosen": -4.084143161773682, "logps/rejected": -2.7448244094848633, "loss": 0.5895, "rewards/accuracies": 0.0, "rewards/chosen": 0.2512974441051483, "rewards/margins": -0.04026639461517334, "rewards/rejected": 0.29156383872032166, "step": 2139 }, { "epoch": 0.35, "learning_rate": 3.783783783783784e-07, "logits/chosen": -0.3278270959854126, "logits/rejected": -0.30182477831840515, "logps/chosen": -44.660057067871094, "logps/rejected": -52.29362106323242, "loss": 0.6358, "rewards/accuracies": 1.0, "rewards/chosen": 0.7851974368095398, "rewards/margins": 0.23745304346084595, "rewards/rejected": 0.5477443933486938, "step": 2140 }, { "epoch": 0.35, "learning_rate": 3.8108108108108104e-07, "logits/chosen": -0.31569910049438477, "logits/rejected": -0.30502551794052124, "logps/chosen": -94.23563385009766, "logps/rejected": -137.90296936035156, "loss": 1.1004, "rewards/accuracies": 0.0, "rewards/chosen": 2.594207763671875, "rewards/margins": -1.6089234352111816, "rewards/rejected": 4.203131198883057, "step": 2141 }, { "epoch": 0.35, "learning_rate": 3.837837837837838e-07, "logits/chosen": -0.42355114221572876, "logits/rejected": -0.3779412806034088, "logps/chosen": -85.13922882080078, "logps/rejected": -139.74380493164062, "loss": 0.4549, "rewards/accuracies": 0.0, "rewards/chosen": 0.08735275268554688, "rewards/margins": -0.08567428588867188, "rewards/rejected": 0.17302703857421875, "step": 2142 }, { "epoch": 0.35, "learning_rate": 3.8648648648648646e-07, "logits/chosen": -0.26655906438827515, "logits/rejected": -0.0927795022726059, "logps/chosen": -80.32222747802734, "logps/rejected": -30.22260284423828, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 1.8716987371444702, "rewards/margins": 1.729256272315979, "rewards/rejected": 0.1424425095319748, "step": 2143 }, { "epoch": 0.35, "learning_rate": 3.891891891891892e-07, "logits/chosen": -0.5186541080474854, "logits/rejected": -0.39086756110191345, "logps/chosen": -73.3723373413086, "logps/rejected": -95.618896484375, "loss": 3.3484, "rewards/accuracies": 0.0, "rewards/chosen": 0.7752998471260071, "rewards/margins": -1.4082679748535156, "rewards/rejected": 2.183567762374878, "step": 2144 }, { "epoch": 0.35, "learning_rate": 3.918918918918919e-07, "logits/chosen": -0.4451928436756134, "logits/rejected": -0.4277082085609436, "logps/chosen": -56.49940872192383, "logps/rejected": -56.205867767333984, "loss": 0.2667, "rewards/accuracies": 1.0, "rewards/chosen": 2.2034435272216797, "rewards/margins": 1.3957343101501465, "rewards/rejected": 0.8077091574668884, "step": 2145 }, { "epoch": 0.35, "learning_rate": 3.945945945945946e-07, "logits/chosen": -0.3031560182571411, "logits/rejected": -0.19653911888599396, "logps/chosen": -88.74258422851562, "logps/rejected": -69.8315200805664, "loss": 0.6503, "rewards/accuracies": 1.0, "rewards/chosen": 2.642399549484253, "rewards/margins": 1.2188483476638794, "rewards/rejected": 1.4235512018203735, "step": 2146 }, { "epoch": 0.35, "learning_rate": 3.9729729729729727e-07, "logits/chosen": -0.36053600907325745, "logits/rejected": -0.4241037368774414, "logps/chosen": -83.17250061035156, "logps/rejected": -51.086151123046875, "loss": 2.2157, "rewards/accuracies": 0.0, "rewards/chosen": 0.7437477111816406, "rewards/margins": -0.966992974281311, "rewards/rejected": 1.7107406854629517, "step": 2147 }, { "epoch": 0.35, "learning_rate": 4e-07, "logits/chosen": -0.6865269541740417, "logits/rejected": -0.69866544008255, "logps/chosen": -60.990478515625, "logps/rejected": -81.26504516601562, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 2.083786725997925, "rewards/margins": 0.5920401811599731, "rewards/rejected": 1.4917465448379517, "step": 2148 }, { "epoch": 0.35, "learning_rate": 4.027027027027027e-07, "logits/chosen": -0.10725314915180206, "logits/rejected": -0.15069186687469482, "logps/chosen": -25.539859771728516, "logps/rejected": -49.41919708251953, "loss": 0.5494, "rewards/accuracies": 1.0, "rewards/chosen": 0.24405498802661896, "rewards/margins": 0.11764582991600037, "rewards/rejected": 0.1264091581106186, "step": 2149 }, { "epoch": 0.35, "learning_rate": 4.054054054054054e-07, "logits/chosen": -0.4213802218437195, "logits/rejected": -0.4226512908935547, "logps/chosen": -3.929889440536499, "logps/rejected": -5.168259620666504, "loss": 1.5055, "rewards/accuracies": 1.0, "rewards/chosen": 0.2966100573539734, "rewards/margins": 0.023239314556121826, "rewards/rejected": 0.27337074279785156, "step": 2150 }, { "epoch": 0.35, "learning_rate": 4.0810810810810807e-07, "logits/chosen": -0.5704920291900635, "logits/rejected": -0.6185474395751953, "logps/chosen": -106.74931335449219, "logps/rejected": -92.34242248535156, "loss": 1.7365, "rewards/accuracies": 0.0, "rewards/chosen": 0.48453980684280396, "rewards/margins": -0.022158801555633545, "rewards/rejected": 0.5066986083984375, "step": 2151 }, { "epoch": 0.35, "learning_rate": 4.1081081081081084e-07, "logits/chosen": -0.35126224160194397, "logits/rejected": -0.3108502924442291, "logps/chosen": -83.17985534667969, "logps/rejected": -75.4355239868164, "loss": 0.3751, "rewards/accuracies": 1.0, "rewards/chosen": 0.2219093292951584, "rewards/margins": 0.2746597230434418, "rewards/rejected": -0.052750397473573685, "step": 2152 }, { "epoch": 0.35, "learning_rate": 4.135135135135135e-07, "logits/chosen": -0.5754314064979553, "logits/rejected": -0.5356216430664062, "logps/chosen": -59.537052154541016, "logps/rejected": -95.84889221191406, "loss": 0.6842, "rewards/accuracies": 0.0, "rewards/chosen": 2.090916156768799, "rewards/margins": -0.9584574699401855, "rewards/rejected": 3.0493736267089844, "step": 2153 }, { "epoch": 0.35, "learning_rate": 4.162162162162162e-07, "logits/chosen": -0.5428247451782227, "logits/rejected": -0.5263684988021851, "logps/chosen": -132.94203186035156, "logps/rejected": -68.70899963378906, "loss": 0.2879, "rewards/accuracies": 1.0, "rewards/chosen": 4.064894199371338, "rewards/margins": 1.7647979259490967, "rewards/rejected": 2.300096273422241, "step": 2154 }, { "epoch": 0.35, "learning_rate": 4.189189189189189e-07, "logits/chosen": -0.8344422578811646, "logits/rejected": -0.8978497982025146, "logps/chosen": -224.50054931640625, "logps/rejected": -66.19876098632812, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 2.2015488147735596, "rewards/margins": 0.050464630126953125, "rewards/rejected": 2.1510841846466064, "step": 2155 }, { "epoch": 0.35, "learning_rate": 4.2162162162162164e-07, "logits/chosen": -0.8982471227645874, "logits/rejected": -0.8867499828338623, "logps/chosen": -38.53271484375, "logps/rejected": -168.42611694335938, "loss": 1.4033, "rewards/accuracies": 0.0, "rewards/chosen": 1.5496758222579956, "rewards/margins": -2.440502166748047, "rewards/rejected": 3.990177869796753, "step": 2156 }, { "epoch": 0.35, "learning_rate": 4.243243243243243e-07, "logits/chosen": -0.496015340089798, "logits/rejected": -0.398540198802948, "logps/chosen": -123.65937042236328, "logps/rejected": -58.232025146484375, "loss": 0.8479, "rewards/accuracies": 0.0, "rewards/chosen": 0.7416221499443054, "rewards/margins": -0.9711357951164246, "rewards/rejected": 1.71275794506073, "step": 2157 }, { "epoch": 0.35, "learning_rate": 4.27027027027027e-07, "logits/chosen": -0.28595614433288574, "logits/rejected": -0.28417450189590454, "logps/chosen": -29.76348876953125, "logps/rejected": -43.67072677612305, "loss": 0.6478, "rewards/accuracies": 0.0, "rewards/chosen": 0.37885019183158875, "rewards/margins": -0.028058230876922607, "rewards/rejected": 0.40690842270851135, "step": 2158 }, { "epoch": 0.35, "learning_rate": 4.2972972972972973e-07, "logits/chosen": -0.5546735525131226, "logits/rejected": -0.21074055135250092, "logps/chosen": -111.30125427246094, "logps/rejected": -108.41715240478516, "loss": 0.7589, "rewards/accuracies": 0.0, "rewards/chosen": 2.838975667953491, "rewards/margins": -0.4820899963378906, "rewards/rejected": 3.321065664291382, "step": 2159 }, { "epoch": 0.35, "learning_rate": 4.3243243243243244e-07, "logits/chosen": -0.6835565567016602, "logits/rejected": -0.5470778942108154, "logps/chosen": -53.04884338378906, "logps/rejected": -66.55030822753906, "loss": 0.7313, "rewards/accuracies": 0.0, "rewards/chosen": 1.4423065185546875, "rewards/margins": -0.00622403621673584, "rewards/rejected": 1.4485305547714233, "step": 2160 }, { "epoch": 0.35, "learning_rate": 4.351351351351351e-07, "logits/chosen": -0.41955873370170593, "logits/rejected": -0.4055470824241638, "logps/chosen": -143.3220672607422, "logps/rejected": -59.43154525756836, "loss": 1.2829, "rewards/accuracies": 0.0, "rewards/chosen": 0.19544677436351776, "rewards/margins": -0.3993030786514282, "rewards/rejected": 0.5947498679161072, "step": 2161 }, { "epoch": 0.35, "learning_rate": 4.378378378378378e-07, "logits/chosen": -0.2935418486595154, "logits/rejected": -0.31860315799713135, "logps/chosen": -70.87696075439453, "logps/rejected": -81.18253326416016, "loss": 0.5225, "rewards/accuracies": 1.0, "rewards/chosen": 0.7263779044151306, "rewards/margins": 0.5664604306221008, "rewards/rejected": 0.1599174588918686, "step": 2162 }, { "epoch": 0.35, "learning_rate": 4.4054054054054053e-07, "logits/chosen": -0.25802284479141235, "logits/rejected": -0.21198531985282898, "logps/chosen": -82.14349365234375, "logps/rejected": -78.23993682861328, "loss": 0.4127, "rewards/accuracies": 0.0, "rewards/chosen": 1.8759613037109375, "rewards/margins": -0.09353184700012207, "rewards/rejected": 1.9694931507110596, "step": 2163 }, { "epoch": 0.35, "learning_rate": 4.4324324324324325e-07, "logits/chosen": -0.6338291168212891, "logits/rejected": -0.660693883895874, "logps/chosen": -109.5542221069336, "logps/rejected": -77.26728820800781, "loss": 0.9671, "rewards/accuracies": 0.0, "rewards/chosen": 0.5923469662666321, "rewards/margins": -1.4318628311157227, "rewards/rejected": 2.02420973777771, "step": 2164 }, { "epoch": 0.35, "learning_rate": 4.459459459459459e-07, "logits/chosen": -0.5185290575027466, "logits/rejected": -0.47882646322250366, "logps/chosen": -126.2143325805664, "logps/rejected": -74.93888854980469, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 3.3777427673339844, "rewards/margins": 0.8509178161621094, "rewards/rejected": 2.526824951171875, "step": 2165 }, { "epoch": 0.35, "learning_rate": 4.486486486486487e-07, "logits/chosen": -0.3740265965461731, "logits/rejected": -0.3434125781059265, "logps/chosen": -71.23106384277344, "logps/rejected": -81.58892822265625, "loss": 0.6488, "rewards/accuracies": 1.0, "rewards/chosen": 1.813201904296875, "rewards/margins": 1.014044165611267, "rewards/rejected": 0.7991577386856079, "step": 2166 }, { "epoch": 0.35, "learning_rate": 4.5135135135135134e-07, "logits/chosen": -0.39749518036842346, "logits/rejected": -0.2875962257385254, "logps/chosen": -115.42091369628906, "logps/rejected": -152.63436889648438, "loss": 0.4415, "rewards/accuracies": 0.0, "rewards/chosen": 2.6889755725860596, "rewards/margins": -0.1425766944885254, "rewards/rejected": 2.831552267074585, "step": 2167 }, { "epoch": 0.35, "learning_rate": 4.5405405405405405e-07, "logits/chosen": -0.5342963337898254, "logits/rejected": -0.4956776797771454, "logps/chosen": -90.45204162597656, "logps/rejected": -14.861011505126953, "loss": 1.6289, "rewards/accuracies": 0.0, "rewards/chosen": -0.3791153132915497, "rewards/margins": -0.7729179859161377, "rewards/rejected": 0.3938026428222656, "step": 2168 }, { "epoch": 0.35, "learning_rate": 4.567567567567567e-07, "logits/chosen": -0.4366784989833832, "logits/rejected": -0.499798059463501, "logps/chosen": -98.23945617675781, "logps/rejected": -150.46316528320312, "loss": 1.9631, "rewards/accuracies": 0.0, "rewards/chosen": 2.5381362438201904, "rewards/margins": -0.9170854091644287, "rewards/rejected": 3.455221652984619, "step": 2169 }, { "epoch": 0.35, "learning_rate": 4.594594594594595e-07, "logits/chosen": -0.20994731783866882, "logits/rejected": -0.2074291706085205, "logps/chosen": -60.394386291503906, "logps/rejected": -70.17640686035156, "loss": 1.0485, "rewards/accuracies": 1.0, "rewards/chosen": 0.6609253287315369, "rewards/margins": 0.5484550595283508, "rewards/rejected": 0.11247024685144424, "step": 2170 }, { "epoch": 0.35, "learning_rate": 4.6216216216216214e-07, "logits/chosen": -0.18756167590618134, "logits/rejected": -0.07993786036968231, "logps/chosen": -73.82443237304688, "logps/rejected": -28.671836853027344, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 1.0288498401641846, "rewards/margins": 1.0198503732681274, "rewards/rejected": 0.008999443612992764, "step": 2171 }, { "epoch": 0.35, "learning_rate": 4.6486486486486485e-07, "logits/chosen": -0.7413047552108765, "logits/rejected": -0.6942827105522156, "logps/chosen": -253.3379364013672, "logps/rejected": -104.80034637451172, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 2.1160600185394287, "rewards/margins": 2.0273385047912598, "rewards/rejected": 0.08872146904468536, "step": 2172 }, { "epoch": 0.35, "learning_rate": 4.6756756756756757e-07, "logits/chosen": -0.3191736340522766, "logits/rejected": -0.21672406792640686, "logps/chosen": -62.941917419433594, "logps/rejected": -99.80104064941406, "loss": 1.196, "rewards/accuracies": 1.0, "rewards/chosen": 1.5286461114883423, "rewards/margins": 0.6515068411827087, "rewards/rejected": 0.8771392703056335, "step": 2173 }, { "epoch": 0.35, "learning_rate": 4.702702702702703e-07, "logits/chosen": -0.27289173007011414, "logits/rejected": -0.06955968588590622, "logps/chosen": -54.13648986816406, "logps/rejected": -159.2003631591797, "loss": 1.6568, "rewards/accuracies": 0.0, "rewards/chosen": 1.526452660560608, "rewards/margins": -2.3831419944763184, "rewards/rejected": 3.909594774246216, "step": 2174 }, { "epoch": 0.35, "learning_rate": 4.7297297297297294e-07, "logits/chosen": -0.4407849907875061, "logits/rejected": -0.46241825819015503, "logps/chosen": -79.27301025390625, "logps/rejected": -37.00206756591797, "loss": 0.7732, "rewards/accuracies": 0.0, "rewards/chosen": 0.9001785516738892, "rewards/margins": -0.9533913135528564, "rewards/rejected": 1.8535698652267456, "step": 2175 }, { "epoch": 0.35, "learning_rate": 4.7567567567567566e-07, "logits/chosen": -0.8041391372680664, "logits/rejected": -0.14364422857761383, "logps/chosen": -90.97981262207031, "logps/rejected": -102.42393493652344, "loss": 0.1493, "rewards/accuracies": 1.0, "rewards/chosen": 3.358529806137085, "rewards/margins": 1.1873841285705566, "rewards/rejected": 2.1711456775665283, "step": 2176 }, { "epoch": 0.35, "learning_rate": 4.783783783783784e-07, "logits/chosen": -0.4198273718357086, "logits/rejected": -0.4958454370498657, "logps/chosen": -50.4986572265625, "logps/rejected": -122.12676239013672, "loss": 3.1057, "rewards/accuracies": 1.0, "rewards/chosen": 1.6215400695800781, "rewards/margins": 0.3160187005996704, "rewards/rejected": 1.3055213689804077, "step": 2177 }, { "epoch": 0.35, "learning_rate": 4.810810810810811e-07, "logits/chosen": -0.45585301518440247, "logits/rejected": -0.4679739773273468, "logps/chosen": -129.26019287109375, "logps/rejected": -97.84583282470703, "loss": 0.363, "rewards/accuracies": 1.0, "rewards/chosen": 3.7124602794647217, "rewards/margins": 1.2315101623535156, "rewards/rejected": 2.480950117111206, "step": 2178 }, { "epoch": 0.35, "learning_rate": 4.837837837837838e-07, "logits/chosen": -0.19035686552524567, "logits/rejected": -0.20156799256801605, "logps/chosen": -84.01806640625, "logps/rejected": -106.45915222167969, "loss": 0.5846, "rewards/accuracies": 0.0, "rewards/chosen": 1.120941162109375, "rewards/margins": -0.7439231872558594, "rewards/rejected": 1.8648643493652344, "step": 2179 }, { "epoch": 0.35, "learning_rate": 4.864864864864865e-07, "logits/chosen": -0.4457489252090454, "logits/rejected": -0.43190720677375793, "logps/chosen": -57.85032272338867, "logps/rejected": -94.48915100097656, "loss": 0.4828, "rewards/accuracies": 0.0, "rewards/chosen": 0.23600998520851135, "rewards/margins": -0.14262467622756958, "rewards/rejected": 0.37863466143608093, "step": 2180 }, { "epoch": 0.35, "learning_rate": 4.891891891891891e-07, "logits/chosen": 0.009007860906422138, "logits/rejected": -0.02879509888589382, "logps/chosen": -91.83883666992188, "logps/rejected": -73.68150329589844, "loss": 0.8516, "rewards/accuracies": 0.0, "rewards/chosen": 0.6700599789619446, "rewards/margins": -0.5903655886650085, "rewards/rejected": 1.2604255676269531, "step": 2181 }, { "epoch": 0.35, "learning_rate": 4.918918918918919e-07, "logits/chosen": -0.3703330457210541, "logits/rejected": -0.3703330457210541, "logps/chosen": -79.58348083496094, "logps/rejected": -79.58348083496094, "loss": 0.4886, "rewards/accuracies": 0.0, "rewards/chosen": 2.4350945949554443, "rewards/margins": 0.0, "rewards/rejected": 2.4350945949554443, "step": 2182 }, { "epoch": 0.35, "learning_rate": 4.945945945945945e-07, "logits/chosen": -0.6236835718154907, "logits/rejected": -0.09183456003665924, "logps/chosen": -85.68640899658203, "logps/rejected": -64.9882583618164, "loss": 0.3176, "rewards/accuracies": 1.0, "rewards/chosen": 1.8665717840194702, "rewards/margins": 0.2838706970214844, "rewards/rejected": 1.5827010869979858, "step": 2183 }, { "epoch": 0.35, "learning_rate": 4.972972972972973e-07, "logits/chosen": -0.5549763441085815, "logits/rejected": -0.3789466619491577, "logps/chosen": -73.89368438720703, "logps/rejected": -30.629688262939453, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 1.5853523015975952, "rewards/margins": 1.3268001079559326, "rewards/rejected": 0.2585521638393402, "step": 2184 }, { "epoch": 0.35, "learning_rate": 5e-07, "logits/chosen": -0.5130411982536316, "logits/rejected": -0.42051541805267334, "logps/chosen": -46.749210357666016, "logps/rejected": -76.88398742675781, "loss": 0.7375, "rewards/accuracies": 1.0, "rewards/chosen": 1.6159924268722534, "rewards/margins": 0.19410669803619385, "rewards/rejected": 1.4218857288360596, "step": 2185 }, { "epoch": 0.35, "learning_rate": 5.027027027027027e-07, "logits/chosen": -0.5678090453147888, "logits/rejected": -0.5685379505157471, "logps/chosen": -135.89743041992188, "logps/rejected": -42.889225006103516, "loss": 0.8786, "rewards/accuracies": 0.0, "rewards/chosen": -0.17560577392578125, "rewards/margins": -0.4034916162490845, "rewards/rejected": 0.22788582742214203, "step": 2186 }, { "epoch": 0.35, "learning_rate": 5.054054054054053e-07, "logits/chosen": -0.7194467782974243, "logits/rejected": -0.7194395065307617, "logps/chosen": -132.9296875, "logps/rejected": -103.73695373535156, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 0.598480224609375, "rewards/margins": 0.18749085068702698, "rewards/rejected": 0.410989373922348, "step": 2187 }, { "epoch": 0.36, "learning_rate": 5.081081081081081e-07, "logits/chosen": -0.218809112906456, "logits/rejected": -0.2930816411972046, "logps/chosen": -232.8078155517578, "logps/rejected": -145.40060424804688, "loss": 0.653, "rewards/accuracies": 0.0, "rewards/chosen": 1.8071731328964233, "rewards/margins": -0.6251999139785767, "rewards/rejected": 2.432373046875, "step": 2188 }, { "epoch": 0.36, "learning_rate": 5.108108108108108e-07, "logits/chosen": -0.4813421368598938, "logits/rejected": -0.43581321835517883, "logps/chosen": -159.9041290283203, "logps/rejected": -107.5104751586914, "loss": 0.1868, "rewards/accuracies": 1.0, "rewards/chosen": 3.4613327980041504, "rewards/margins": 0.9672324657440186, "rewards/rejected": 2.494100332260132, "step": 2189 }, { "epoch": 0.36, "learning_rate": 5.135135135135134e-07, "logits/chosen": -0.5335087776184082, "logits/rejected": -0.5008985996246338, "logps/chosen": -58.36848831176758, "logps/rejected": -97.9521484375, "loss": 0.5161, "rewards/accuracies": 0.0, "rewards/chosen": 1.6213444471359253, "rewards/margins": -0.566064715385437, "rewards/rejected": 2.1874091625213623, "step": 2190 }, { "epoch": 0.36, "learning_rate": 5.162162162162162e-07, "logits/chosen": -0.6182664632797241, "logits/rejected": -0.5828692317008972, "logps/chosen": -47.23626708984375, "logps/rejected": -189.26699829101562, "loss": 2.2019, "rewards/accuracies": 0.0, "rewards/chosen": 1.5002548694610596, "rewards/margins": -3.21378493309021, "rewards/rejected": 4.7140398025512695, "step": 2191 }, { "epoch": 0.36, "learning_rate": 5.18918918918919e-07, "logits/chosen": -0.5331897735595703, "logits/rejected": -0.5605032444000244, "logps/chosen": -122.672119140625, "logps/rejected": -132.3572998046875, "loss": 0.5083, "rewards/accuracies": 1.0, "rewards/chosen": 0.7477569580078125, "rewards/margins": 0.2394302487373352, "rewards/rejected": 0.5083267092704773, "step": 2192 }, { "epoch": 0.36, "learning_rate": 5.216216216216216e-07, "logits/chosen": -0.48053935170173645, "logits/rejected": -0.4344552159309387, "logps/chosen": -227.2435302734375, "logps/rejected": -85.90962219238281, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 2.705575704574585, "rewards/margins": 0.33988499641418457, "rewards/rejected": 2.3656907081604004, "step": 2193 }, { "epoch": 0.36, "learning_rate": 5.243243243243243e-07, "logits/chosen": -0.5711706876754761, "logits/rejected": -0.5367836356163025, "logps/chosen": -91.29940795898438, "logps/rejected": -173.72763061523438, "loss": 0.0964, "rewards/accuracies": 1.0, "rewards/chosen": 3.4354920387268066, "rewards/margins": 1.6875778436660767, "rewards/rejected": 1.74791419506073, "step": 2194 }, { "epoch": 0.36, "learning_rate": 5.270270270270269e-07, "logits/chosen": -0.7306760549545288, "logits/rejected": -0.6922553181648254, "logps/chosen": -103.81627655029297, "logps/rejected": -102.42733764648438, "loss": 0.6299, "rewards/accuracies": 0.0, "rewards/chosen": 1.2333595752716064, "rewards/margins": -0.8077201843261719, "rewards/rejected": 2.0410797595977783, "step": 2195 }, { "epoch": 0.36, "learning_rate": 5.297297297297297e-07, "logits/chosen": -0.5067434310913086, "logits/rejected": -0.5379117131233215, "logps/chosen": -97.17808532714844, "logps/rejected": -57.35884094238281, "loss": 0.7944, "rewards/accuracies": 0.0, "rewards/chosen": 0.11162109673023224, "rewards/margins": -1.1790412664413452, "rewards/rejected": 1.290662407875061, "step": 2196 }, { "epoch": 0.36, "learning_rate": 5.324324324324324e-07, "logits/chosen": -0.590520441532135, "logits/rejected": -0.596881091594696, "logps/chosen": -143.9635467529297, "logps/rejected": -131.5481414794922, "loss": 1.0043, "rewards/accuracies": 0.0, "rewards/chosen": 2.681379795074463, "rewards/margins": -1.4183807373046875, "rewards/rejected": 4.09976053237915, "step": 2197 }, { "epoch": 0.36, "learning_rate": 5.35135135135135e-07, "logits/chosen": -0.3733547627925873, "logits/rejected": -0.35267174243927, "logps/chosen": -157.31884765625, "logps/rejected": -130.004150390625, "loss": 2.249, "rewards/accuracies": 1.0, "rewards/chosen": 2.411367893218994, "rewards/margins": 1.0113251209259033, "rewards/rejected": 1.4000427722930908, "step": 2198 }, { "epoch": 0.36, "learning_rate": 5.378378378378378e-07, "logits/chosen": -0.727671205997467, "logits/rejected": -0.7792339324951172, "logps/chosen": -81.23085021972656, "logps/rejected": -89.97348022460938, "loss": 0.4814, "rewards/accuracies": 0.0, "rewards/chosen": 2.706272840499878, "rewards/margins": -0.06639409065246582, "rewards/rejected": 2.7726669311523438, "step": 2199 }, { "epoch": 0.36, "learning_rate": 5.405405405405406e-07, "logits/chosen": -0.29513871669769287, "logits/rejected": -0.30271971225738525, "logps/chosen": -4.207644939422607, "logps/rejected": -37.64068603515625, "loss": 0.6564, "rewards/accuracies": 1.0, "rewards/chosen": 0.16924290359020233, "rewards/margins": 0.022316396236419678, "rewards/rejected": 0.14692650735378265, "step": 2200 }, { "epoch": 0.36, "learning_rate": 5.432432432432432e-07, "logits/chosen": -0.400267630815506, "logits/rejected": -0.400267630815506, "logps/chosen": -73.78633117675781, "logps/rejected": -73.78633117675781, "loss": 0.4105, "rewards/accuracies": 0.0, "rewards/chosen": 2.0743682384490967, "rewards/margins": 0.0, "rewards/rejected": 2.0743682384490967, "step": 2201 }, { "epoch": 0.36, "learning_rate": 5.459459459459459e-07, "logits/chosen": -0.4696676731109619, "logits/rejected": -0.5478312373161316, "logps/chosen": -112.98370361328125, "logps/rejected": -87.946044921875, "loss": 1.0573, "rewards/accuracies": 0.0, "rewards/chosen": 0.9503723382949829, "rewards/margins": -0.6444565057754517, "rewards/rejected": 1.5948288440704346, "step": 2202 }, { "epoch": 0.36, "learning_rate": 5.486486486486486e-07, "logits/chosen": -0.2651711702346802, "logits/rejected": -0.2651711702346802, "logps/chosen": -29.334693908691406, "logps/rejected": -29.334693908691406, "loss": 0.4019, "rewards/accuracies": 0.0, "rewards/chosen": 1.301522135734558, "rewards/margins": 0.0, "rewards/rejected": 1.301522135734558, "step": 2203 }, { "epoch": 0.36, "learning_rate": 5.513513513513513e-07, "logits/chosen": -0.9738913178443909, "logits/rejected": -1.0123355388641357, "logps/chosen": -122.08543395996094, "logps/rejected": -253.07400512695312, "loss": 1.0508, "rewards/accuracies": 0.0, "rewards/chosen": 0.8210281729698181, "rewards/margins": -0.3254348635673523, "rewards/rejected": 1.1464630365371704, "step": 2204 }, { "epoch": 0.36, "learning_rate": 5.54054054054054e-07, "logits/chosen": -0.8522574305534363, "logits/rejected": -0.8358944654464722, "logps/chosen": -151.73886108398438, "logps/rejected": -108.1144790649414, "loss": 0.368, "rewards/accuracies": 1.0, "rewards/chosen": 0.862652599811554, "rewards/margins": 0.5687859058380127, "rewards/rejected": 0.29386672377586365, "step": 2205 }, { "epoch": 0.36, "learning_rate": 5.567567567567567e-07, "logits/chosen": -0.7890209555625916, "logits/rejected": -0.7670561075210571, "logps/chosen": -94.08331298828125, "logps/rejected": -67.086669921875, "loss": 0.4921, "rewards/accuracies": 0.0, "rewards/chosen": 1.084703803062439, "rewards/margins": -0.4244370460510254, "rewards/rejected": 1.5091408491134644, "step": 2206 }, { "epoch": 0.36, "learning_rate": 5.594594594594594e-07, "logits/chosen": -0.3891123831272125, "logits/rejected": -0.41028398275375366, "logps/chosen": -116.54798126220703, "logps/rejected": -189.66073608398438, "loss": 0.952, "rewards/accuracies": 0.0, "rewards/chosen": 2.2915215492248535, "rewards/margins": -1.1252937316894531, "rewards/rejected": 3.4168152809143066, "step": 2207 }, { "epoch": 0.36, "learning_rate": 5.621621621621622e-07, "logits/chosen": -0.7814469337463379, "logits/rejected": -0.7903851866722107, "logps/chosen": -99.15293884277344, "logps/rejected": -89.89266967773438, "loss": 1.1645, "rewards/accuracies": 1.0, "rewards/chosen": 3.772631883621216, "rewards/margins": 1.9287995100021362, "rewards/rejected": 1.8438323736190796, "step": 2208 }, { "epoch": 0.36, "learning_rate": 5.648648648648648e-07, "logits/chosen": -0.6191549897193909, "logits/rejected": -0.5932613611221313, "logps/chosen": -80.70841979980469, "logps/rejected": -62.486209869384766, "loss": 0.6296, "rewards/accuracies": 1.0, "rewards/chosen": 2.016737461090088, "rewards/margins": 0.267917275428772, "rewards/rejected": 1.748820185661316, "step": 2209 }, { "epoch": 0.36, "learning_rate": 5.675675675675675e-07, "logits/chosen": -0.5009771585464478, "logits/rejected": -0.43425148725509644, "logps/chosen": -156.1354217529297, "logps/rejected": -74.98237609863281, "loss": 1.1081, "rewards/accuracies": 0.0, "rewards/chosen": -0.49295350909233093, "rewards/margins": -1.3550255298614502, "rewards/rejected": 0.8620719909667969, "step": 2210 }, { "epoch": 0.36, "learning_rate": 5.702702702702702e-07, "logits/chosen": -0.5816836953163147, "logits/rejected": -0.5420420169830322, "logps/chosen": -141.77711486816406, "logps/rejected": -136.50775146484375, "loss": 2.0549, "rewards/accuracies": 0.0, "rewards/chosen": 2.0655746459960938, "rewards/margins": -2.211805820465088, "rewards/rejected": 4.277380466461182, "step": 2211 }, { "epoch": 0.36, "learning_rate": 5.729729729729729e-07, "logits/chosen": -0.659098744392395, "logits/rejected": -1.0688343048095703, "logps/chosen": -111.70785522460938, "logps/rejected": -37.43682098388672, "loss": 0.4555, "rewards/accuracies": 1.0, "rewards/chosen": 0.5389404296875, "rewards/margins": 0.4174766540527344, "rewards/rejected": 0.12146377563476562, "step": 2212 }, { "epoch": 0.36, "learning_rate": 5.756756756756757e-07, "logits/chosen": -0.2228303700685501, "logits/rejected": -0.23341336846351624, "logps/chosen": -29.497066497802734, "logps/rejected": -71.74976348876953, "loss": 0.6381, "rewards/accuracies": 0.0, "rewards/chosen": 0.34590911865234375, "rewards/margins": -0.17366564273834229, "rewards/rejected": 0.519574761390686, "step": 2213 }, { "epoch": 0.36, "learning_rate": 5.783783783783784e-07, "logits/chosen": -0.7368628978729248, "logits/rejected": -0.7771813869476318, "logps/chosen": -141.6856231689453, "logps/rejected": -62.55714797973633, "loss": 1.4924, "rewards/accuracies": 0.0, "rewards/chosen": 0.5145096182823181, "rewards/margins": -1.4567980766296387, "rewards/rejected": 1.9713077545166016, "step": 2214 }, { "epoch": 0.36, "learning_rate": 5.81081081081081e-07, "logits/chosen": -0.3850262761116028, "logits/rejected": -0.329418420791626, "logps/chosen": -96.46529388427734, "logps/rejected": -112.01148223876953, "loss": 0.8674, "rewards/accuracies": 1.0, "rewards/chosen": 1.1124649047851562, "rewards/margins": 0.5419692993164062, "rewards/rejected": 0.57049560546875, "step": 2215 }, { "epoch": 0.36, "learning_rate": 5.837837837837838e-07, "logits/chosen": -0.4658806622028351, "logits/rejected": -0.32661062479019165, "logps/chosen": -124.41962432861328, "logps/rejected": -95.48467254638672, "loss": 1.8398, "rewards/accuracies": 0.0, "rewards/chosen": 0.18572159111499786, "rewards/margins": -3.5799529552459717, "rewards/rejected": 3.765674591064453, "step": 2216 }, { "epoch": 0.36, "learning_rate": 5.864864864864865e-07, "logits/chosen": -0.33614876866340637, "logits/rejected": -0.3308427035808563, "logps/chosen": -3.5979716777801514, "logps/rejected": -8.206172943115234, "loss": 0.5238, "rewards/accuracies": 0.0, "rewards/chosen": 0.2995084524154663, "rewards/margins": -0.030670851469039917, "rewards/rejected": 0.3301793038845062, "step": 2217 }, { "epoch": 0.36, "learning_rate": 5.891891891891891e-07, "logits/chosen": -0.6400794982910156, "logits/rejected": -0.629692792892456, "logps/chosen": -188.1693115234375, "logps/rejected": -104.38520050048828, "loss": 0.2205, "rewards/accuracies": 1.0, "rewards/chosen": 3.4334990978240967, "rewards/margins": 1.1759307384490967, "rewards/rejected": 2.257568359375, "step": 2218 }, { "epoch": 0.36, "learning_rate": 5.918918918918918e-07, "logits/chosen": -0.12606307864189148, "logits/rejected": -0.12606307864189148, "logps/chosen": -110.34852600097656, "logps/rejected": -110.34852600097656, "loss": 0.3792, "rewards/accuracies": 0.0, "rewards/chosen": 0.3286605775356293, "rewards/margins": 0.0, "rewards/rejected": 0.3286605775356293, "step": 2219 }, { "epoch": 0.36, "learning_rate": 5.945945945945947e-07, "logits/chosen": -0.3759145736694336, "logits/rejected": -0.35592591762542725, "logps/chosen": -24.90598487854004, "logps/rejected": -5.0648112297058105, "loss": 1.0124, "rewards/accuracies": 0.0, "rewards/chosen": -0.1637403517961502, "rewards/margins": -0.39406871795654297, "rewards/rejected": 0.23032836616039276, "step": 2220 }, { "epoch": 0.36, "learning_rate": 5.972972972972973e-07, "logits/chosen": -0.2824896275997162, "logits/rejected": -0.23891115188598633, "logps/chosen": -74.68785095214844, "logps/rejected": -54.86075973510742, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 2.5850884914398193, "rewards/margins": 1.4989994764328003, "rewards/rejected": 1.086089015007019, "step": 2221 }, { "epoch": 0.36, "learning_rate": 6e-07, "logits/chosen": -0.44935426115989685, "logits/rejected": -0.400255024433136, "logps/chosen": -110.47136688232422, "logps/rejected": -77.33094787597656, "loss": 1.1124, "rewards/accuracies": 0.0, "rewards/chosen": 0.3373054563999176, "rewards/margins": -0.7044075727462769, "rewards/rejected": 1.041712999343872, "step": 2222 }, { "epoch": 0.36, "learning_rate": 6.027027027027026e-07, "logits/chosen": -0.7625378966331482, "logits/rejected": -0.7259291410446167, "logps/chosen": -150.62359619140625, "logps/rejected": -62.60855484008789, "loss": 0.0693, "rewards/accuracies": 1.0, "rewards/chosen": 4.210789680480957, "rewards/margins": 2.1200413703918457, "rewards/rejected": 2.0907483100891113, "step": 2223 }, { "epoch": 0.36, "learning_rate": 6.054054054054054e-07, "logits/chosen": -0.1412983238697052, "logits/rejected": -0.1412983238697052, "logps/chosen": -0.4614506959915161, "logps/rejected": -0.4614506959915161, "loss": 0.4358, "rewards/accuracies": 0.0, "rewards/chosen": 0.10634144395589828, "rewards/margins": 0.0, "rewards/rejected": 0.10634144395589828, "step": 2224 }, { "epoch": 0.36, "learning_rate": 6.081081081081081e-07, "logits/chosen": -0.21359539031982422, "logits/rejected": -0.20888929069042206, "logps/chosen": -20.594663619995117, "logps/rejected": -40.310054779052734, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 0.3395406901836395, "rewards/margins": 0.23571377992630005, "rewards/rejected": 0.10382690280675888, "step": 2225 }, { "epoch": 0.36, "learning_rate": 6.108108108108107e-07, "logits/chosen": -0.5517796874046326, "logits/rejected": -0.549530029296875, "logps/chosen": -20.9345703125, "logps/rejected": -22.567407608032227, "loss": 1.0206, "rewards/accuracies": 1.0, "rewards/chosen": 0.12012539058923721, "rewards/margins": 0.137705996632576, "rewards/rejected": -0.017580604180693626, "step": 2226 }, { "epoch": 0.36, "learning_rate": 6.135135135135134e-07, "logits/chosen": -0.1894611269235611, "logits/rejected": -0.1694878339767456, "logps/chosen": -100.68209075927734, "logps/rejected": -90.32896423339844, "loss": 0.4914, "rewards/accuracies": 0.0, "rewards/chosen": 0.367898553609848, "rewards/margins": -0.2958511412143707, "rewards/rejected": 0.6637496948242188, "step": 2227 }, { "epoch": 0.36, "learning_rate": 6.162162162162163e-07, "logits/chosen": -0.8169233202934265, "logits/rejected": -0.811479389667511, "logps/chosen": -159.1812286376953, "logps/rejected": -77.5208511352539, "loss": 0.1482, "rewards/accuracies": 1.0, "rewards/chosen": 2.717097520828247, "rewards/margins": 1.4689171314239502, "rewards/rejected": 1.2481803894042969, "step": 2228 }, { "epoch": 0.36, "learning_rate": 6.189189189189189e-07, "logits/chosen": -0.5358737707138062, "logits/rejected": -0.4878224730491638, "logps/chosen": -59.268218994140625, "logps/rejected": -41.43863296508789, "loss": 0.1521, "rewards/accuracies": 1.0, "rewards/chosen": 2.5691139698028564, "rewards/margins": 1.180010199546814, "rewards/rejected": 1.3891037702560425, "step": 2229 }, { "epoch": 0.36, "learning_rate": 6.216216216216216e-07, "logits/chosen": -0.7950950860977173, "logits/rejected": -0.7117176055908203, "logps/chosen": -79.00365447998047, "logps/rejected": -42.09553146362305, "loss": 1.0183, "rewards/accuracies": 0.0, "rewards/chosen": 0.7479515075683594, "rewards/margins": -0.575473427772522, "rewards/rejected": 1.3234249353408813, "step": 2230 }, { "epoch": 0.36, "learning_rate": 6.243243243243243e-07, "logits/chosen": -0.5264948010444641, "logits/rejected": -0.47745072841644287, "logps/chosen": -134.607421875, "logps/rejected": -30.294893264770508, "loss": 0.1771, "rewards/accuracies": 1.0, "rewards/chosen": 2.703125, "rewards/margins": 1.7023210525512695, "rewards/rejected": 1.0008039474487305, "step": 2231 }, { "epoch": 0.36, "learning_rate": 6.27027027027027e-07, "logits/chosen": -0.5882291197776794, "logits/rejected": -0.5477176308631897, "logps/chosen": -137.37857055664062, "logps/rejected": -94.03492736816406, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 3.700082540512085, "rewards/margins": 1.2463388442993164, "rewards/rejected": 2.4537436962127686, "step": 2232 }, { "epoch": 0.36, "learning_rate": 6.297297297297297e-07, "logits/chosen": -0.6876538991928101, "logits/rejected": -0.5761052370071411, "logps/chosen": -136.47317504882812, "logps/rejected": -140.30264282226562, "loss": 0.5825, "rewards/accuracies": 0.0, "rewards/chosen": 3.995526075363159, "rewards/margins": -0.0660402774810791, "rewards/rejected": 4.061566352844238, "step": 2233 }, { "epoch": 0.36, "learning_rate": 6.324324324324324e-07, "logits/chosen": -0.5634015202522278, "logits/rejected": -0.579166054725647, "logps/chosen": -41.19657897949219, "logps/rejected": -130.71902465820312, "loss": 0.9258, "rewards/accuracies": 1.0, "rewards/chosen": 0.5296154022216797, "rewards/margins": 0.21702995896339417, "rewards/rejected": 0.3125854432582855, "step": 2234 }, { "epoch": 0.36, "learning_rate": 6.35135135135135e-07, "logits/chosen": -0.46079570055007935, "logits/rejected": -0.3754708170890808, "logps/chosen": -111.921630859375, "logps/rejected": -98.19268798828125, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 1.7960388660430908, "rewards/margins": 1.2379975318908691, "rewards/rejected": 0.5580413937568665, "step": 2235 }, { "epoch": 0.36, "learning_rate": 6.378378378378379e-07, "logits/chosen": -0.243368998169899, "logits/rejected": -0.1606096625328064, "logps/chosen": -48.52909851074219, "logps/rejected": -34.772605895996094, "loss": 1.1069, "rewards/accuracies": 1.0, "rewards/chosen": 1.8097137212753296, "rewards/margins": 0.24315261840820312, "rewards/rejected": 1.5665611028671265, "step": 2236 }, { "epoch": 0.36, "learning_rate": 6.405405405405405e-07, "logits/chosen": -0.2882453203201294, "logits/rejected": -0.25236934423446655, "logps/chosen": -65.38898468017578, "logps/rejected": -78.75468444824219, "loss": 0.8227, "rewards/accuracies": 0.0, "rewards/chosen": 1.7625938653945923, "rewards/margins": -0.21469950675964355, "rewards/rejected": 1.9772933721542358, "step": 2237 }, { "epoch": 0.36, "learning_rate": 6.432432432432432e-07, "logits/chosen": -0.6537531018257141, "logits/rejected": -0.624577522277832, "logps/chosen": -140.067626953125, "logps/rejected": -74.93856048583984, "loss": 1.236, "rewards/accuracies": 0.0, "rewards/chosen": -0.02530822716653347, "rewards/margins": -1.7784919738769531, "rewards/rejected": 1.7531837224960327, "step": 2238 }, { "epoch": 0.36, "learning_rate": 6.459459459459459e-07, "logits/chosen": -0.6643588542938232, "logits/rejected": -0.552036702632904, "logps/chosen": -100.667236328125, "logps/rejected": -83.07039642333984, "loss": 0.9252, "rewards/accuracies": 0.0, "rewards/chosen": 0.6354614496231079, "rewards/margins": -0.3013450503349304, "rewards/rejected": 0.9368064999580383, "step": 2239 }, { "epoch": 0.36, "learning_rate": 6.486486486486486e-07, "logits/chosen": -0.5016212463378906, "logits/rejected": -0.45266252756118774, "logps/chosen": -49.72368621826172, "logps/rejected": -93.40552520751953, "loss": 0.8423, "rewards/accuracies": 0.0, "rewards/chosen": 1.71526038646698, "rewards/margins": -0.875780463218689, "rewards/rejected": 2.591040849685669, "step": 2240 }, { "epoch": 0.36, "learning_rate": 6.513513513513513e-07, "logits/chosen": -0.4239346981048584, "logits/rejected": -0.33170607686042786, "logps/chosen": -89.53902435302734, "logps/rejected": -50.21046447753906, "loss": 0.6218, "rewards/accuracies": 0.0, "rewards/chosen": -0.15555191040039062, "rewards/margins": -0.43943482637405396, "rewards/rejected": 0.28388291597366333, "step": 2241 }, { "epoch": 0.36, "learning_rate": 6.54054054054054e-07, "logits/chosen": -1.0785690546035767, "logits/rejected": -1.0885066986083984, "logps/chosen": -79.8589859008789, "logps/rejected": -35.22509002685547, "loss": 0.3262, "rewards/accuracies": 1.0, "rewards/chosen": 1.950721025466919, "rewards/margins": 1.6049575805664062, "rewards/rejected": 0.3457634150981903, "step": 2242 }, { "epoch": 0.36, "learning_rate": 6.567567567567566e-07, "logits/chosen": -0.4209129810333252, "logits/rejected": -0.424108624458313, "logps/chosen": -100.9858627319336, "logps/rejected": -47.72055435180664, "loss": 1.021, "rewards/accuracies": 0.0, "rewards/chosen": 0.4115310609340668, "rewards/margins": -1.112256646156311, "rewards/rejected": 1.5237877368927002, "step": 2243 }, { "epoch": 0.36, "learning_rate": 6.594594594594595e-07, "logits/chosen": -0.4518023729324341, "logits/rejected": -0.48046737909317017, "logps/chosen": -56.93736267089844, "logps/rejected": -87.30290222167969, "loss": 0.7495, "rewards/accuracies": 0.0, "rewards/chosen": 0.10209961235523224, "rewards/margins": -0.10595551133155823, "rewards/rejected": 0.20805512368679047, "step": 2244 }, { "epoch": 0.36, "learning_rate": 6.621621621621622e-07, "logits/chosen": -0.6082068681716919, "logits/rejected": -0.6082068681716919, "logps/chosen": -51.009552001953125, "logps/rejected": -51.009552001953125, "loss": 0.509, "rewards/accuracies": 0.0, "rewards/chosen": 2.2537803649902344, "rewards/margins": 0.0, "rewards/rejected": 2.2537803649902344, "step": 2245 }, { "epoch": 0.36, "learning_rate": 6.648648648648648e-07, "logits/chosen": -0.2621532678604126, "logits/rejected": -0.24568825960159302, "logps/chosen": -94.0628433227539, "logps/rejected": -97.76658630371094, "loss": 1.0525, "rewards/accuracies": 1.0, "rewards/chosen": 0.9280906915664673, "rewards/margins": 0.02028733491897583, "rewards/rejected": 0.9078033566474915, "step": 2246 }, { "epoch": 0.36, "learning_rate": 6.675675675675675e-07, "logits/chosen": -0.9863182902336121, "logits/rejected": -0.8158172369003296, "logps/chosen": -201.6885986328125, "logps/rejected": -100.59742736816406, "loss": 0.4067, "rewards/accuracies": 0.0, "rewards/chosen": 3.714529514312744, "rewards/margins": -0.04021000862121582, "rewards/rejected": 3.75473952293396, "step": 2247 }, { "epoch": 0.36, "learning_rate": 6.702702702702703e-07, "logits/chosen": -0.20137806236743927, "logits/rejected": -0.20546410977840424, "logps/chosen": -9.265668869018555, "logps/rejected": -21.1968936920166, "loss": 0.8247, "rewards/accuracies": 0.0, "rewards/chosen": -0.08988199383020401, "rewards/margins": -0.21451863646507263, "rewards/rejected": 0.12463665008544922, "step": 2248 }, { "epoch": 0.37, "learning_rate": 6.729729729729729e-07, "logits/chosen": -0.3657453954219818, "logits/rejected": -0.3557415306568146, "logps/chosen": -150.3458251953125, "logps/rejected": -185.52883911132812, "loss": 0.4947, "rewards/accuracies": 0.0, "rewards/chosen": 3.256579637527466, "rewards/margins": -0.3350646495819092, "rewards/rejected": 3.591644287109375, "step": 2249 }, { "epoch": 0.37, "learning_rate": 6.756756756756756e-07, "logits/chosen": -0.41512587666511536, "logits/rejected": -0.3829752504825592, "logps/chosen": -103.3506851196289, "logps/rejected": -72.46723175048828, "loss": 0.6199, "rewards/accuracies": 1.0, "rewards/chosen": 4.454204559326172, "rewards/margins": 2.8914780616760254, "rewards/rejected": 1.562726616859436, "step": 2250 }, { "epoch": 0.37, "learning_rate": 6.783783783783783e-07, "logits/chosen": -0.22589227557182312, "logits/rejected": -0.1449282169342041, "logps/chosen": -67.24797058105469, "logps/rejected": -75.03020477294922, "loss": 0.48, "rewards/accuracies": 1.0, "rewards/chosen": 0.7991447448730469, "rewards/margins": 0.7736099362373352, "rewards/rejected": 0.025534821674227715, "step": 2251 }, { "epoch": 0.37, "learning_rate": 6.810810810810811e-07, "logits/chosen": -0.4397050738334656, "logits/rejected": -0.46435463428497314, "logps/chosen": -77.95272827148438, "logps/rejected": -161.3625030517578, "loss": 0.8758, "rewards/accuracies": 0.0, "rewards/chosen": 1.5353806018829346, "rewards/margins": -0.7368957996368408, "rewards/rejected": 2.2722764015197754, "step": 2252 }, { "epoch": 0.37, "learning_rate": 6.837837837837838e-07, "logits/chosen": -0.7108121514320374, "logits/rejected": -0.6696009039878845, "logps/chosen": -62.85237503051758, "logps/rejected": -94.7480239868164, "loss": 0.615, "rewards/accuracies": 0.0, "rewards/chosen": 1.415183663368225, "rewards/margins": -0.29699206352233887, "rewards/rejected": 1.712175726890564, "step": 2253 }, { "epoch": 0.37, "learning_rate": 6.864864864864864e-07, "logits/chosen": -0.690198540687561, "logits/rejected": -0.47488880157470703, "logps/chosen": -174.44630432128906, "logps/rejected": -53.584312438964844, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 4.592665195465088, "rewards/margins": 1.0275490283966064, "rewards/rejected": 3.5651161670684814, "step": 2254 }, { "epoch": 0.37, "learning_rate": 6.891891891891891e-07, "logits/chosen": -0.2936997711658478, "logits/rejected": -0.26545047760009766, "logps/chosen": -44.51640701293945, "logps/rejected": -19.694988250732422, "loss": 1.1743, "rewards/accuracies": 0.0, "rewards/chosen": 0.16365623474121094, "rewards/margins": -0.005013853311538696, "rewards/rejected": 0.16867008805274963, "step": 2255 }, { "epoch": 0.37, "learning_rate": 6.918918918918919e-07, "logits/chosen": -0.5204583406448364, "logits/rejected": -0.5031228065490723, "logps/chosen": -49.419010162353516, "logps/rejected": -32.330535888671875, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 1.418189287185669, "rewards/margins": -0.6471772193908691, "rewards/rejected": 2.065366506576538, "step": 2256 }, { "epoch": 0.37, "learning_rate": 6.945945945945945e-07, "logits/chosen": -0.563084602355957, "logits/rejected": -0.533106803894043, "logps/chosen": -58.56404113769531, "logps/rejected": -126.12002563476562, "loss": 1.2578, "rewards/accuracies": 1.0, "rewards/chosen": 1.0397186279296875, "rewards/margins": 1.1561416387557983, "rewards/rejected": -0.11642303317785263, "step": 2257 }, { "epoch": 0.37, "learning_rate": 6.972972972972973e-07, "logits/chosen": -0.6760677695274353, "logits/rejected": -0.6315847635269165, "logps/chosen": -82.01365661621094, "logps/rejected": -62.064701080322266, "loss": 1.6476, "rewards/accuracies": 1.0, "rewards/chosen": 1.9071991443634033, "rewards/margins": 1.5499317646026611, "rewards/rejected": 0.3572673797607422, "step": 2258 }, { "epoch": 0.37, "learning_rate": 7e-07, "logits/chosen": -0.7631151080131531, "logits/rejected": -0.8262404203414917, "logps/chosen": -115.71836853027344, "logps/rejected": -102.24626159667969, "loss": 2.5616, "rewards/accuracies": 0.0, "rewards/chosen": 0.35171204805374146, "rewards/margins": -3.34006667137146, "rewards/rejected": 3.6917786598205566, "step": 2259 }, { "epoch": 0.37, "learning_rate": 7.027027027027027e-07, "logits/chosen": -0.7611721158027649, "logits/rejected": -0.6119667887687683, "logps/chosen": -119.52710723876953, "logps/rejected": -35.96532440185547, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": 3.345522403717041, "rewards/margins": 1.8139622211456299, "rewards/rejected": 1.5315601825714111, "step": 2260 }, { "epoch": 0.37, "learning_rate": 7.054054054054054e-07, "logits/chosen": -0.8510820865631104, "logits/rejected": -0.7916209697723389, "logps/chosen": -88.5776596069336, "logps/rejected": -60.683624267578125, "loss": 0.8697, "rewards/accuracies": 1.0, "rewards/chosen": 1.273566484451294, "rewards/margins": 0.16949236392974854, "rewards/rejected": 1.1040741205215454, "step": 2261 }, { "epoch": 0.37, "learning_rate": 7.081081081081081e-07, "logits/chosen": -0.7983943819999695, "logits/rejected": -0.7720741629600525, "logps/chosen": -75.52900695800781, "logps/rejected": -187.8837890625, "loss": 2.0604, "rewards/accuracies": 0.0, "rewards/chosen": 0.4883110225200653, "rewards/margins": -2.6880714893341064, "rewards/rejected": 3.176382541656494, "step": 2262 }, { "epoch": 0.37, "learning_rate": 7.108108108108107e-07, "logits/chosen": -0.5307034850120544, "logits/rejected": -0.5326235890388489, "logps/chosen": -133.7303466796875, "logps/rejected": -144.37625122070312, "loss": 0.5192, "rewards/accuracies": 0.0, "rewards/chosen": 2.9297943115234375, "rewards/margins": -0.427154541015625, "rewards/rejected": 3.3569488525390625, "step": 2263 }, { "epoch": 0.37, "learning_rate": 7.135135135135135e-07, "logits/chosen": -0.5278887748718262, "logits/rejected": -0.5278887748718262, "logps/chosen": -39.228858947753906, "logps/rejected": -39.228858947753906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.6631874442100525, "rewards/margins": 0.0, "rewards/rejected": 0.6631874442100525, "step": 2264 }, { "epoch": 0.37, "learning_rate": 7.162162162162161e-07, "logits/chosen": -0.648796796798706, "logits/rejected": -0.6754841208457947, "logps/chosen": -85.5849838256836, "logps/rejected": -156.18679809570312, "loss": 1.9813, "rewards/accuracies": 0.0, "rewards/chosen": 0.4534095823764801, "rewards/margins": -3.0796713829040527, "rewards/rejected": 3.5330810546875, "step": 2265 }, { "epoch": 0.37, "learning_rate": 7.189189189189189e-07, "logits/chosen": -0.6149728894233704, "logits/rejected": -0.604219377040863, "logps/chosen": -129.94772338867188, "logps/rejected": -155.06858825683594, "loss": 0.2964, "rewards/accuracies": 1.0, "rewards/chosen": 4.311166286468506, "rewards/margins": 0.40631842613220215, "rewards/rejected": 3.9048478603363037, "step": 2266 }, { "epoch": 0.37, "learning_rate": 7.216216216216216e-07, "logits/chosen": -0.51175856590271, "logits/rejected": -0.51175856590271, "logps/chosen": -70.71818542480469, "logps/rejected": -70.71818542480469, "loss": 1.7826, "rewards/accuracies": 0.0, "rewards/chosen": 0.5365447998046875, "rewards/margins": 0.0, "rewards/rejected": 0.5365447998046875, "step": 2267 }, { "epoch": 0.37, "learning_rate": 7.243243243243243e-07, "logits/chosen": -0.8254315853118896, "logits/rejected": -0.8292708992958069, "logps/chosen": -48.19972229003906, "logps/rejected": -40.88510513305664, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.2767006158828735, "rewards/margins": 0.0020335912704467773, "rewards/rejected": 1.2746670246124268, "step": 2268 }, { "epoch": 0.37, "learning_rate": 7.27027027027027e-07, "logits/chosen": -0.3939618766307831, "logits/rejected": -0.2423282414674759, "logps/chosen": -90.11600494384766, "logps/rejected": -103.72291564941406, "loss": 0.8517, "rewards/accuracies": 0.0, "rewards/chosen": 2.4312310218811035, "rewards/margins": -0.32256078720092773, "rewards/rejected": 2.7537918090820312, "step": 2269 }, { "epoch": 0.37, "learning_rate": 7.297297297297297e-07, "logits/chosen": -0.3999376595020294, "logits/rejected": -0.31365466117858887, "logps/chosen": -95.79456329345703, "logps/rejected": -61.80335998535156, "loss": 0.9759, "rewards/accuracies": 0.0, "rewards/chosen": 1.3308228254318237, "rewards/margins": -0.14208602905273438, "rewards/rejected": 1.472908854484558, "step": 2270 }, { "epoch": 0.37, "learning_rate": 7.324324324324323e-07, "logits/chosen": -0.24847428500652313, "logits/rejected": -0.24847428500652313, "logps/chosen": -67.73552703857422, "logps/rejected": -67.73552703857422, "loss": 0.8735, "rewards/accuracies": 0.0, "rewards/chosen": 1.528743028640747, "rewards/margins": 0.0, "rewards/rejected": 1.528743028640747, "step": 2271 }, { "epoch": 0.37, "learning_rate": 7.351351351351351e-07, "logits/chosen": -0.6813787221908569, "logits/rejected": -0.5504580736160278, "logps/chosen": -224.71554565429688, "logps/rejected": -54.89336013793945, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": 3.7898895740509033, "rewards/margins": 2.3136417865753174, "rewards/rejected": 1.476247787475586, "step": 2272 }, { "epoch": 0.37, "learning_rate": 7.378378378378379e-07, "logits/chosen": -0.691271185874939, "logits/rejected": -0.6675417423248291, "logps/chosen": -102.50733184814453, "logps/rejected": -143.55343627929688, "loss": 0.8864, "rewards/accuracies": 0.0, "rewards/chosen": 0.94301837682724, "rewards/margins": -1.2588675022125244, "rewards/rejected": 2.201885938644409, "step": 2273 }, { "epoch": 0.37, "learning_rate": 7.405405405405405e-07, "logits/chosen": -0.37543919682502747, "logits/rejected": -0.2317550629377365, "logps/chosen": -90.74073791503906, "logps/rejected": -104.24488830566406, "loss": 0.9917, "rewards/accuracies": 1.0, "rewards/chosen": 1.4280548095703125, "rewards/margins": 0.041606903076171875, "rewards/rejected": 1.3864479064941406, "step": 2274 }, { "epoch": 0.37, "learning_rate": 7.432432432432432e-07, "logits/chosen": -0.4576973617076874, "logits/rejected": -0.5013885498046875, "logps/chosen": -36.70549392700195, "logps/rejected": -52.95794677734375, "loss": 0.7555, "rewards/accuracies": 0.0, "rewards/chosen": 1.1989459991455078, "rewards/margins": -1.1380078792572021, "rewards/rejected": 2.33695387840271, "step": 2275 }, { "epoch": 0.37, "learning_rate": 7.45945945945946e-07, "logits/chosen": -0.752943217754364, "logits/rejected": -0.7201883792877197, "logps/chosen": -56.70930480957031, "logps/rejected": -11.692708969116211, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 1.2456337213516235, "rewards/margins": 0.6368152499198914, "rewards/rejected": 0.6088184714317322, "step": 2276 }, { "epoch": 0.37, "learning_rate": 7.486486486486486e-07, "logits/chosen": -0.41769662499427795, "logits/rejected": -0.45332425832748413, "logps/chosen": -84.1392822265625, "logps/rejected": -118.39344787597656, "loss": 1.0254, "rewards/accuracies": 0.0, "rewards/chosen": 1.339910864830017, "rewards/margins": -0.31324315071105957, "rewards/rejected": 1.6531540155410767, "step": 2277 }, { "epoch": 0.37, "learning_rate": 7.513513513513513e-07, "logits/chosen": -0.7593998908996582, "logits/rejected": -0.759899377822876, "logps/chosen": -86.78003692626953, "logps/rejected": -92.82141876220703, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 1.6889396905899048, "rewards/margins": 0.3600738048553467, "rewards/rejected": 1.328865885734558, "step": 2278 }, { "epoch": 0.37, "learning_rate": 7.540540540540539e-07, "logits/chosen": -0.5957744121551514, "logits/rejected": -0.38917556405067444, "logps/chosen": -169.15586853027344, "logps/rejected": -92.94557189941406, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": 3.734135389328003, "rewards/margins": 2.500797986984253, "rewards/rejected": 1.23333740234375, "step": 2279 }, { "epoch": 0.37, "learning_rate": 7.567567567567568e-07, "logits/chosen": -0.19638627767562866, "logits/rejected": -0.19638627767562866, "logps/chosen": -21.09461212158203, "logps/rejected": -21.09461212158203, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.12737254798412323, "rewards/margins": 0.0, "rewards/rejected": 0.12737254798412323, "step": 2280 }, { "epoch": 0.37, "learning_rate": 7.594594594594595e-07, "logits/chosen": -0.6448432803153992, "logits/rejected": -0.6368750929832458, "logps/chosen": -87.98416137695312, "logps/rejected": -40.91659927368164, "loss": 1.1861, "rewards/accuracies": 1.0, "rewards/chosen": 1.2063064575195312, "rewards/margins": 1.0341793298721313, "rewards/rejected": 0.1721271574497223, "step": 2281 }, { "epoch": 0.37, "learning_rate": 7.621621621621621e-07, "logits/chosen": -0.44892406463623047, "logits/rejected": -0.44892406463623047, "logps/chosen": -16.312599182128906, "logps/rejected": -16.312599182128906, "loss": 0.6347, "rewards/accuracies": 0.0, "rewards/chosen": 0.28655359148979187, "rewards/margins": 0.0, "rewards/rejected": 0.28655359148979187, "step": 2282 }, { "epoch": 0.37, "learning_rate": 7.648648648648648e-07, "logits/chosen": -0.6318801045417786, "logits/rejected": -0.501460075378418, "logps/chosen": -100.11405944824219, "logps/rejected": -32.6263542175293, "loss": 0.1851, "rewards/accuracies": 1.0, "rewards/chosen": 3.124559164047241, "rewards/margins": 2.889474630355835, "rewards/rejected": 0.23508453369140625, "step": 2283 }, { "epoch": 0.37, "learning_rate": 7.675675675675676e-07, "logits/chosen": -0.5599803328514099, "logits/rejected": -0.5892466306686401, "logps/chosen": -72.09712219238281, "logps/rejected": -110.91473388671875, "loss": 0.8973, "rewards/accuracies": 0.0, "rewards/chosen": 1.2312911748886108, "rewards/margins": -1.27959144115448, "rewards/rejected": 2.510882616043091, "step": 2284 }, { "epoch": 0.37, "learning_rate": 7.702702702702702e-07, "logits/chosen": -0.44776061177253723, "logits/rejected": -0.4473162889480591, "logps/chosen": -77.71829223632812, "logps/rejected": -118.5373306274414, "loss": 0.5304, "rewards/accuracies": 0.0, "rewards/chosen": 3.162590742111206, "rewards/margins": -0.28567981719970703, "rewards/rejected": 3.448270559310913, "step": 2285 }, { "epoch": 0.37, "learning_rate": 7.729729729729729e-07, "logits/chosen": -0.7313956022262573, "logits/rejected": -0.8112574219703674, "logps/chosen": -224.60296630859375, "logps/rejected": -189.0287628173828, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 2.178253173828125, "rewards/margins": 0.46984100341796875, "rewards/rejected": 1.7084121704101562, "step": 2286 }, { "epoch": 0.37, "learning_rate": 7.756756756756756e-07, "logits/chosen": -0.9103565812110901, "logits/rejected": -0.8068420886993408, "logps/chosen": -101.92405700683594, "logps/rejected": -176.78872680664062, "loss": 0.8149, "rewards/accuracies": 1.0, "rewards/chosen": 3.9273223876953125, "rewards/margins": 0.011322021484375, "rewards/rejected": 3.9160003662109375, "step": 2287 }, { "epoch": 0.37, "learning_rate": 7.783783783783784e-07, "logits/chosen": -0.6683869957923889, "logits/rejected": -0.5888423323631287, "logps/chosen": -53.247100830078125, "logps/rejected": -61.554725646972656, "loss": 0.4053, "rewards/accuracies": 0.0, "rewards/chosen": 0.8816871643066406, "rewards/margins": -0.07863008975982666, "rewards/rejected": 0.9603172540664673, "step": 2288 }, { "epoch": 0.37, "learning_rate": 7.810810810810811e-07, "logits/chosen": -0.3665440082550049, "logits/rejected": -0.33358800411224365, "logps/chosen": -111.53568267822266, "logps/rejected": -111.87462615966797, "loss": 0.5964, "rewards/accuracies": 0.0, "rewards/chosen": 0.184803768992424, "rewards/margins": -0.7496665716171265, "rewards/rejected": 0.9344703555107117, "step": 2289 }, { "epoch": 0.37, "learning_rate": 7.837837837837838e-07, "logits/chosen": -0.16175779700279236, "logits/rejected": -0.1299322098493576, "logps/chosen": -53.265968322753906, "logps/rejected": -17.74261474609375, "loss": 0.3482, "rewards/accuracies": 1.0, "rewards/chosen": 1.0214073657989502, "rewards/margins": 0.6860802173614502, "rewards/rejected": 0.3353271484375, "step": 2290 }, { "epoch": 0.37, "learning_rate": 7.864864864864864e-07, "logits/chosen": -0.38605737686157227, "logits/rejected": -0.2128995656967163, "logps/chosen": -167.19003295898438, "logps/rejected": -88.32518768310547, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": 2.591177463531494, "rewards/margins": 0.8309547901153564, "rewards/rejected": 1.7602226734161377, "step": 2291 }, { "epoch": 0.37, "learning_rate": 7.891891891891892e-07, "logits/chosen": -0.5214920043945312, "logits/rejected": -0.5342315435409546, "logps/chosen": -89.76756286621094, "logps/rejected": -108.95663452148438, "loss": 1.5386, "rewards/accuracies": 0.0, "rewards/chosen": 0.24897919595241547, "rewards/margins": -2.2682831287384033, "rewards/rejected": 2.5172622203826904, "step": 2292 }, { "epoch": 0.37, "learning_rate": 7.918918918918918e-07, "logits/chosen": -0.47931385040283203, "logits/rejected": -0.4713512063026428, "logps/chosen": -68.42208862304688, "logps/rejected": -115.53945922851562, "loss": 1.0636, "rewards/accuracies": 0.0, "rewards/chosen": 0.5147331357002258, "rewards/margins": -0.28154677152633667, "rewards/rejected": 0.7962799072265625, "step": 2293 }, { "epoch": 0.37, "learning_rate": 7.945945945945945e-07, "logits/chosen": -0.6799190640449524, "logits/rejected": -0.671176552772522, "logps/chosen": -50.684574127197266, "logps/rejected": -55.41895294189453, "loss": 0.6392, "rewards/accuracies": 1.0, "rewards/chosen": 1.9131557941436768, "rewards/margins": 0.1992664337158203, "rewards/rejected": 1.7138893604278564, "step": 2294 }, { "epoch": 0.37, "learning_rate": 7.972972972972972e-07, "logits/chosen": -0.4295284152030945, "logits/rejected": -0.46319231390953064, "logps/chosen": -107.1137466430664, "logps/rejected": -89.61665344238281, "loss": 0.9694, "rewards/accuracies": 0.0, "rewards/chosen": 0.6157188415527344, "rewards/margins": -1.065209984779358, "rewards/rejected": 1.6809288263320923, "step": 2295 }, { "epoch": 0.37, "learning_rate": 8e-07, "logits/chosen": -0.5271212458610535, "logits/rejected": -0.5063544511795044, "logps/chosen": -71.67218780517578, "logps/rejected": -86.6873779296875, "loss": 1.3331, "rewards/accuracies": 0.0, "rewards/chosen": 0.6120269894599915, "rewards/margins": -1.2520675659179688, "rewards/rejected": 1.864094614982605, "step": 2296 }, { "epoch": 0.37, "learning_rate": 8.027027027027027e-07, "logits/chosen": -0.7460078001022339, "logits/rejected": -0.495119571685791, "logps/chosen": -107.78353118896484, "logps/rejected": -183.57562255859375, "loss": 0.9989, "rewards/accuracies": 0.0, "rewards/chosen": 1.4311363697052002, "rewards/margins": -1.2616188526153564, "rewards/rejected": 2.6927552223205566, "step": 2297 }, { "epoch": 0.37, "learning_rate": 8.054054054054054e-07, "logits/chosen": -0.35122251510620117, "logits/rejected": -0.33654963970184326, "logps/chosen": -37.10763168334961, "logps/rejected": -109.46951293945312, "loss": 2.5611, "rewards/accuracies": 0.0, "rewards/chosen": -0.04220543056726456, "rewards/margins": -0.6287006735801697, "rewards/rejected": 0.5864952206611633, "step": 2298 }, { "epoch": 0.37, "learning_rate": 8.08108108108108e-07, "logits/chosen": -0.58549964427948, "logits/rejected": -0.5804513096809387, "logps/chosen": -4.831625938415527, "logps/rejected": -16.459861755371094, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.37321996688842773, "rewards/margins": 0.0794537365436554, "rewards/rejected": 0.29376623034477234, "step": 2299 }, { "epoch": 0.37, "learning_rate": 8.108108108108108e-07, "logits/chosen": -0.6013640761375427, "logits/rejected": -0.4635302424430847, "logps/chosen": -75.16122436523438, "logps/rejected": -10.047098159790039, "loss": 0.5448, "rewards/accuracies": 1.0, "rewards/chosen": 1.8600921630859375, "rewards/margins": 1.359785556793213, "rewards/rejected": 0.5003066062927246, "step": 2300 }, { "epoch": 0.37, "learning_rate": 8.135135135135135e-07, "logits/chosen": -0.6868063807487488, "logits/rejected": -1.0528459548950195, "logps/chosen": -115.80709838867188, "logps/rejected": -36.799957275390625, "loss": 0.3798, "rewards/accuracies": 1.0, "rewards/chosen": 0.5089210867881775, "rewards/margins": 0.36212772130966187, "rewards/rejected": 0.14679336547851562, "step": 2301 }, { "epoch": 0.37, "learning_rate": 8.162162162162161e-07, "logits/chosen": -0.40855491161346436, "logits/rejected": -0.36858752369880676, "logps/chosen": -44.57244873046875, "logps/rejected": -51.603843688964844, "loss": 0.5532, "rewards/accuracies": 1.0, "rewards/chosen": 1.490533471107483, "rewards/margins": 0.9545631408691406, "rewards/rejected": 0.5359703302383423, "step": 2302 }, { "epoch": 0.37, "learning_rate": 8.189189189189189e-07, "logits/chosen": -0.8472437262535095, "logits/rejected": -0.8552764058113098, "logps/chosen": -148.364501953125, "logps/rejected": -83.25772094726562, "loss": 1.292, "rewards/accuracies": 0.0, "rewards/chosen": 0.6797348260879517, "rewards/margins": -1.83222496509552, "rewards/rejected": 2.5119597911834717, "step": 2303 }, { "epoch": 0.37, "learning_rate": 8.216216216216217e-07, "logits/chosen": -0.5455209612846375, "logits/rejected": -0.6424177289009094, "logps/chosen": -68.11906433105469, "logps/rejected": -170.09637451171875, "loss": 1.3594, "rewards/accuracies": 0.0, "rewards/chosen": 0.655939519405365, "rewards/margins": -2.259030818939209, "rewards/rejected": 2.9149703979492188, "step": 2304 }, { "epoch": 0.37, "learning_rate": 8.243243243243243e-07, "logits/chosen": -0.6036635637283325, "logits/rejected": -0.5052569508552551, "logps/chosen": -59.437137603759766, "logps/rejected": -69.40980529785156, "loss": 0.3467, "rewards/accuracies": 1.0, "rewards/chosen": 2.321889877319336, "rewards/margins": 0.8659770488739014, "rewards/rejected": 1.4559128284454346, "step": 2305 }, { "epoch": 0.37, "learning_rate": 8.27027027027027e-07, "logits/chosen": -0.8754952549934387, "logits/rejected": -0.8959340453147888, "logps/chosen": -113.24971008300781, "logps/rejected": -90.44630432128906, "loss": 0.756, "rewards/accuracies": 0.0, "rewards/chosen": 1.2317367792129517, "rewards/margins": -0.6691062450408936, "rewards/rejected": 1.9008430242538452, "step": 2306 }, { "epoch": 0.37, "learning_rate": 8.297297297297296e-07, "logits/chosen": -0.766835629940033, "logits/rejected": -0.7332937121391296, "logps/chosen": -73.51830291748047, "logps/rejected": -52.880516052246094, "loss": 0.8506, "rewards/accuracies": 0.0, "rewards/chosen": 1.100878119468689, "rewards/margins": -0.03112494945526123, "rewards/rejected": 1.1320030689239502, "step": 2307 }, { "epoch": 0.37, "learning_rate": 8.324324324324324e-07, "logits/chosen": -0.5705403089523315, "logits/rejected": -0.6418525576591492, "logps/chosen": -165.97579956054688, "logps/rejected": -106.51762390136719, "loss": 1.4804, "rewards/accuracies": 0.0, "rewards/chosen": 3.256579637527466, "rewards/margins": -1.0407545566558838, "rewards/rejected": 4.29733419418335, "step": 2308 }, { "epoch": 0.37, "learning_rate": 8.351351351351351e-07, "logits/chosen": -0.7093808650970459, "logits/rejected": -0.7248870730400085, "logps/chosen": -194.10946655273438, "logps/rejected": -34.53525924682617, "loss": 0.134, "rewards/accuracies": 1.0, "rewards/chosen": 4.300975322723389, "rewards/margins": 4.095051288604736, "rewards/rejected": 0.20592384040355682, "step": 2309 }, { "epoch": 0.37, "learning_rate": 8.378378378378377e-07, "logits/chosen": -0.520706057548523, "logits/rejected": -0.520706057548523, "logps/chosen": -94.17559814453125, "logps/rejected": -94.17559814453125, "loss": 0.3766, "rewards/accuracies": 0.0, "rewards/chosen": 0.956927478313446, "rewards/margins": 0.0, "rewards/rejected": 0.956927478313446, "step": 2310 }, { "epoch": 0.38, "learning_rate": 8.405405405405405e-07, "logits/chosen": -0.5797439217567444, "logits/rejected": -0.6409834623336792, "logps/chosen": -99.7515869140625, "logps/rejected": -57.80138397216797, "loss": 0.637, "rewards/accuracies": 0.0, "rewards/chosen": 1.5487030744552612, "rewards/margins": -0.7548857927322388, "rewards/rejected": 2.3035888671875, "step": 2311 }, { "epoch": 0.38, "learning_rate": 8.432432432432433e-07, "logits/chosen": -0.17140986025333405, "logits/rejected": -0.17140986025333405, "logps/chosen": -123.00250244140625, "logps/rejected": -123.00250244140625, "loss": 0.735, "rewards/accuracies": 0.0, "rewards/chosen": 1.1795624494552612, "rewards/margins": 0.0, "rewards/rejected": 1.1795624494552612, "step": 2312 }, { "epoch": 0.38, "learning_rate": 8.459459459459459e-07, "logits/chosen": -0.3734242022037506, "logits/rejected": -0.3734242022037506, "logps/chosen": -71.8282470703125, "logps/rejected": -71.8282470703125, "loss": 0.589, "rewards/accuracies": 0.0, "rewards/chosen": 0.189117431640625, "rewards/margins": 0.0, "rewards/rejected": 0.189117431640625, "step": 2313 }, { "epoch": 0.38, "learning_rate": 8.486486486486486e-07, "logits/chosen": -0.5015697479248047, "logits/rejected": -0.4834578037261963, "logps/chosen": -68.92568969726562, "logps/rejected": -94.94432067871094, "loss": 0.6568, "rewards/accuracies": 0.0, "rewards/chosen": 1.514312744140625, "rewards/margins": -0.2415459156036377, "rewards/rejected": 1.7558586597442627, "step": 2314 }, { "epoch": 0.38, "learning_rate": 8.513513513513513e-07, "logits/chosen": -0.6894134283065796, "logits/rejected": -0.5963969230651855, "logps/chosen": -124.25252532958984, "logps/rejected": -59.61198806762695, "loss": 0.7121, "rewards/accuracies": 1.0, "rewards/chosen": 0.5352973937988281, "rewards/margins": 0.2424217164516449, "rewards/rejected": 0.2928756773471832, "step": 2315 }, { "epoch": 0.38, "learning_rate": 8.54054054054054e-07, "logits/chosen": -0.3604210317134857, "logits/rejected": -0.3000790774822235, "logps/chosen": -82.63446044921875, "logps/rejected": -80.15568542480469, "loss": 0.3321, "rewards/accuracies": 1.0, "rewards/chosen": 1.838021159172058, "rewards/margins": 0.7269554138183594, "rewards/rejected": 1.1110657453536987, "step": 2316 }, { "epoch": 0.38, "learning_rate": 8.567567567567567e-07, "logits/chosen": -0.7420095801353455, "logits/rejected": -0.811941385269165, "logps/chosen": -157.31016540527344, "logps/rejected": -56.59748077392578, "loss": 0.6959, "rewards/accuracies": 1.0, "rewards/chosen": 3.1691482067108154, "rewards/margins": 3.167790651321411, "rewards/rejected": 0.0013576507335528731, "step": 2317 }, { "epoch": 0.38, "learning_rate": 8.594594594594595e-07, "logits/chosen": -0.7676670551300049, "logits/rejected": -0.7479544281959534, "logps/chosen": -24.928844451904297, "logps/rejected": -57.130027770996094, "loss": 0.4711, "rewards/accuracies": 0.0, "rewards/chosen": 1.345845103263855, "rewards/margins": -0.0700995922088623, "rewards/rejected": 1.4159446954727173, "step": 2318 }, { "epoch": 0.38, "learning_rate": 8.621621621621621e-07, "logits/chosen": -0.6948001384735107, "logits/rejected": -0.6502681374549866, "logps/chosen": -84.4904556274414, "logps/rejected": -73.36249542236328, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 2.9811439514160156, "rewards/margins": 0.37610626220703125, "rewards/rejected": 2.6050376892089844, "step": 2319 }, { "epoch": 0.38, "learning_rate": 8.648648648648649e-07, "logits/chosen": -0.3142896890640259, "logits/rejected": -0.2955203652381897, "logps/chosen": -5.516109466552734, "logps/rejected": -33.346893310546875, "loss": 0.7547, "rewards/accuracies": 0.0, "rewards/chosen": 0.36507532000541687, "rewards/margins": -0.23263856768608093, "rewards/rejected": 0.5977138876914978, "step": 2320 }, { "epoch": 0.38, "learning_rate": 8.675675675675675e-07, "logits/chosen": -0.42687204480171204, "logits/rejected": -0.3555420935153961, "logps/chosen": -72.5299301147461, "logps/rejected": -90.29856872558594, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 1.406884789466858, "rewards/margins": -0.9936424493789673, "rewards/rejected": 2.400527238845825, "step": 2321 }, { "epoch": 0.38, "learning_rate": 8.702702702702702e-07, "logits/chosen": -0.5294039845466614, "logits/rejected": -0.5445606708526611, "logps/chosen": -10.073688507080078, "logps/rejected": -3.5335702896118164, "loss": 0.458, "rewards/accuracies": 0.0, "rewards/chosen": 0.03380623087286949, "rewards/margins": -0.2567466199398041, "rewards/rejected": 0.29055285453796387, "step": 2322 }, { "epoch": 0.38, "learning_rate": 8.729729729729729e-07, "logits/chosen": -0.038852933794260025, "logits/rejected": -0.04201665148139, "logps/chosen": -13.816936492919922, "logps/rejected": -70.76959228515625, "loss": 1.9328, "rewards/accuracies": 1.0, "rewards/chosen": 0.1260833740234375, "rewards/margins": 0.17067794501781464, "rewards/rejected": -0.044594574719667435, "step": 2323 }, { "epoch": 0.38, "learning_rate": 8.756756756756756e-07, "logits/chosen": -0.6203933358192444, "logits/rejected": -0.6068282127380371, "logps/chosen": -97.28914642333984, "logps/rejected": -56.10588836669922, "loss": 1.0273, "rewards/accuracies": 1.0, "rewards/chosen": 1.5402649641036987, "rewards/margins": 1.3465657234191895, "rewards/rejected": 0.19369927048683167, "step": 2324 }, { "epoch": 0.38, "learning_rate": 8.783783783783784e-07, "logits/chosen": -0.2682301998138428, "logits/rejected": -0.24086478352546692, "logps/chosen": -51.5416145324707, "logps/rejected": -99.4212646484375, "loss": 2.0891, "rewards/accuracies": 0.0, "rewards/chosen": 0.8996917605400085, "rewards/margins": -2.2297682762145996, "rewards/rejected": 3.129460096359253, "step": 2325 }, { "epoch": 0.38, "learning_rate": 8.810810810810811e-07, "logits/chosen": -0.4592939615249634, "logits/rejected": -0.4592939615249634, "logps/chosen": -37.58959197998047, "logps/rejected": -37.58959197998047, "loss": 0.6212, "rewards/accuracies": 0.0, "rewards/chosen": 0.1024932861328125, "rewards/margins": 0.0, "rewards/rejected": 0.1024932861328125, "step": 2326 }, { "epoch": 0.38, "learning_rate": 8.837837837837837e-07, "logits/chosen": -0.6279320120811462, "logits/rejected": -0.6652045249938965, "logps/chosen": -70.89982604980469, "logps/rejected": -165.3992462158203, "loss": 1.8634, "rewards/accuracies": 0.0, "rewards/chosen": 0.6640731692314148, "rewards/margins": -3.4597856998443604, "rewards/rejected": 4.12385892868042, "step": 2327 }, { "epoch": 0.38, "learning_rate": 8.864864864864865e-07, "logits/chosen": -0.4749011993408203, "logits/rejected": -0.43924400210380554, "logps/chosen": -30.65346908569336, "logps/rejected": -57.89527130126953, "loss": 0.3873, "rewards/accuracies": 0.0, "rewards/chosen": 1.2569828033447266, "rewards/margins": -0.08935511112213135, "rewards/rejected": 1.346337914466858, "step": 2328 }, { "epoch": 0.38, "learning_rate": 8.891891891891892e-07, "logits/chosen": -1.0048139095306396, "logits/rejected": -1.045081377029419, "logps/chosen": -98.14729309082031, "logps/rejected": -149.93389892578125, "loss": 1.5182, "rewards/accuracies": 0.0, "rewards/chosen": 0.5394569635391235, "rewards/margins": -2.0947561264038086, "rewards/rejected": 2.6342132091522217, "step": 2329 }, { "epoch": 0.38, "learning_rate": 8.918918918918918e-07, "logits/chosen": -0.858940064907074, "logits/rejected": -0.8519359230995178, "logps/chosen": -74.2562255859375, "logps/rejected": -85.74285888671875, "loss": 0.5988, "rewards/accuracies": 0.0, "rewards/chosen": 1.243493676185608, "rewards/margins": -0.4007911682128906, "rewards/rejected": 1.6442848443984985, "step": 2330 }, { "epoch": 0.38, "learning_rate": 8.945945945945945e-07, "logits/chosen": -0.022961877286434174, "logits/rejected": -0.019256196916103363, "logps/chosen": -2.6542787551879883, "logps/rejected": -2.4110934734344482, "loss": 1.1721, "rewards/accuracies": 0.0, "rewards/chosen": 0.16503916680812836, "rewards/margins": -0.019013702869415283, "rewards/rejected": 0.18405286967754364, "step": 2331 }, { "epoch": 0.38, "learning_rate": 8.972972972972974e-07, "logits/chosen": -0.7713443636894226, "logits/rejected": -0.6615530848503113, "logps/chosen": -115.54122924804688, "logps/rejected": -105.05328369140625, "loss": 1.2175, "rewards/accuracies": 0.0, "rewards/chosen": 2.8238542079925537, "rewards/margins": -1.0247023105621338, "rewards/rejected": 3.8485565185546875, "step": 2332 }, { "epoch": 0.38, "learning_rate": 9e-07, "logits/chosen": -0.7243732810020447, "logits/rejected": -0.6722401976585388, "logps/chosen": -52.61410903930664, "logps/rejected": -116.69501495361328, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 1.356161117553711, "rewards/margins": 0.22633707523345947, "rewards/rejected": 1.1298240423202515, "step": 2333 }, { "epoch": 0.38, "learning_rate": 9.027027027027027e-07, "logits/chosen": -0.7385777831077576, "logits/rejected": -0.5399090647697449, "logps/chosen": -116.92146301269531, "logps/rejected": -81.3668212890625, "loss": 0.5417, "rewards/accuracies": 1.0, "rewards/chosen": 3.9078567028045654, "rewards/margins": 2.0025086402893066, "rewards/rejected": 1.9053481817245483, "step": 2334 }, { "epoch": 0.38, "learning_rate": 9.054054054054053e-07, "logits/chosen": -0.3033198118209839, "logits/rejected": -0.25049394369125366, "logps/chosen": -71.56803894042969, "logps/rejected": -10.118253707885742, "loss": 0.2664, "rewards/accuracies": 1.0, "rewards/chosen": 2.0984885692596436, "rewards/margins": 1.1361322402954102, "rewards/rejected": 0.9623563885688782, "step": 2335 }, { "epoch": 0.38, "learning_rate": 9.081081081081081e-07, "logits/chosen": -0.36063867807388306, "logits/rejected": -0.33840805292129517, "logps/chosen": -43.802310943603516, "logps/rejected": -86.66036224365234, "loss": 0.5596, "rewards/accuracies": 0.0, "rewards/chosen": 1.4798946380615234, "rewards/margins": -0.34418225288391113, "rewards/rejected": 1.8240768909454346, "step": 2336 }, { "epoch": 0.38, "learning_rate": 9.108108108108108e-07, "logits/chosen": -0.7575306296348572, "logits/rejected": -0.598028302192688, "logps/chosen": -127.395263671875, "logps/rejected": -92.26617431640625, "loss": 1.5842, "rewards/accuracies": 1.0, "rewards/chosen": 4.32051420211792, "rewards/margins": 2.357748508453369, "rewards/rejected": 1.9627655744552612, "step": 2337 }, { "epoch": 0.38, "learning_rate": 9.135135135135134e-07, "logits/chosen": -0.624918520450592, "logits/rejected": -0.5620084404945374, "logps/chosen": -27.458559036254883, "logps/rejected": -60.6998176574707, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 1.178735613822937, "rewards/margins": 0.10461866855621338, "rewards/rejected": 1.0741169452667236, "step": 2338 }, { "epoch": 0.38, "learning_rate": 9.162162162162161e-07, "logits/chosen": -0.19280152022838593, "logits/rejected": -0.19280152022838593, "logps/chosen": -49.1894645690918, "logps/rejected": -49.1894645690918, "loss": 0.8101, "rewards/accuracies": 0.0, "rewards/chosen": 0.3719802796840668, "rewards/margins": 0.0, "rewards/rejected": 0.3719802796840668, "step": 2339 }, { "epoch": 0.38, "learning_rate": 9.18918918918919e-07, "logits/chosen": -0.7848700284957886, "logits/rejected": -0.7868464589118958, "logps/chosen": -45.370296478271484, "logps/rejected": -9.220483779907227, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 2.026209592819214, "rewards/margins": 1.550900936126709, "rewards/rejected": 0.4753086268901825, "step": 2340 }, { "epoch": 0.38, "learning_rate": 9.216216216216216e-07, "logits/chosen": -0.19495442509651184, "logits/rejected": -0.21969951689243317, "logps/chosen": -40.43879699707031, "logps/rejected": -28.519548416137695, "loss": 1.1282, "rewards/accuracies": 0.0, "rewards/chosen": 0.5496074557304382, "rewards/margins": -0.39698469638824463, "rewards/rejected": 0.9465921521186829, "step": 2341 }, { "epoch": 0.38, "learning_rate": 9.243243243243243e-07, "logits/chosen": -0.7515964508056641, "logits/rejected": -0.8172535300254822, "logps/chosen": -90.79188537597656, "logps/rejected": -119.3942642211914, "loss": 1.0616, "rewards/accuracies": 0.0, "rewards/chosen": 0.01004714984446764, "rewards/margins": -1.4917281866073608, "rewards/rejected": 1.5017753839492798, "step": 2342 }, { "epoch": 0.38, "learning_rate": 9.27027027027027e-07, "logits/chosen": -0.8570552468299866, "logits/rejected": -0.8212958574295044, "logps/chosen": -110.32398986816406, "logps/rejected": -66.52778625488281, "loss": 1.054, "rewards/accuracies": 0.0, "rewards/chosen": 0.049755096435546875, "rewards/margins": -1.8133331537246704, "rewards/rejected": 1.8630882501602173, "step": 2343 }, { "epoch": 0.38, "learning_rate": 9.297297297297297e-07, "logits/chosen": -0.7898871898651123, "logits/rejected": -0.6966389417648315, "logps/chosen": -150.79147338867188, "logps/rejected": -173.20181274414062, "loss": 0.9745, "rewards/accuracies": 0.0, "rewards/chosen": 3.281332492828369, "rewards/margins": -0.9725737571716309, "rewards/rejected": 4.25390625, "step": 2344 }, { "epoch": 0.38, "learning_rate": 9.324324324324324e-07, "logits/chosen": -0.17644597589969635, "logits/rejected": -0.1867983341217041, "logps/chosen": -99.46647644042969, "logps/rejected": -44.58573913574219, "loss": 1.2581, "rewards/accuracies": 0.0, "rewards/chosen": -0.5643768310546875, "rewards/margins": -2.181056499481201, "rewards/rejected": 1.6166797876358032, "step": 2345 }, { "epoch": 0.38, "learning_rate": 9.351351351351351e-07, "logits/chosen": -0.5958213210105896, "logits/rejected": -0.5856271386146545, "logps/chosen": -90.25096130371094, "logps/rejected": -74.26032257080078, "loss": 0.5845, "rewards/accuracies": 1.0, "rewards/chosen": 2.1549699306488037, "rewards/margins": 0.7075409889221191, "rewards/rejected": 1.4474289417266846, "step": 2346 }, { "epoch": 0.38, "learning_rate": 9.378378378378377e-07, "logits/chosen": -0.14834506809711456, "logits/rejected": -0.09151439368724823, "logps/chosen": -50.35099411010742, "logps/rejected": -44.06970977783203, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.75360906124115, "rewards/margins": 0.7435016632080078, "rewards/rejected": 1.010107398033142, "step": 2347 }, { "epoch": 0.38, "learning_rate": 9.405405405405406e-07, "logits/chosen": -0.6036661863327026, "logits/rejected": -0.49872255325317383, "logps/chosen": -149.5592041015625, "logps/rejected": -145.9371337890625, "loss": 0.9108, "rewards/accuracies": 0.0, "rewards/chosen": 1.9509613513946533, "rewards/margins": -1.2552413940429688, "rewards/rejected": 3.206202745437622, "step": 2348 }, { "epoch": 0.38, "learning_rate": 9.432432432432433e-07, "logits/chosen": -0.6712337136268616, "logits/rejected": -0.6414594650268555, "logps/chosen": -95.19084167480469, "logps/rejected": -106.33501434326172, "loss": 1.0638, "rewards/accuracies": 0.0, "rewards/chosen": 0.5631629824638367, "rewards/margins": -1.6129302978515625, "rewards/rejected": 2.176093339920044, "step": 2349 }, { "epoch": 0.38, "learning_rate": 9.459459459459459e-07, "logits/chosen": -0.5180460214614868, "logits/rejected": -0.42764487862586975, "logps/chosen": -139.05514526367188, "logps/rejected": -130.094482421875, "loss": 0.3765, "rewards/accuracies": 1.0, "rewards/chosen": 3.2277145385742188, "rewards/margins": 1.2121384143829346, "rewards/rejected": 2.015576124191284, "step": 2350 }, { "epoch": 0.38, "learning_rate": 9.486486486486486e-07, "logits/chosen": -0.8809716701507568, "logits/rejected": -0.834597110748291, "logps/chosen": -105.97915649414062, "logps/rejected": -24.87722396850586, "loss": 0.6234, "rewards/accuracies": 0.0, "rewards/chosen": -0.0691375732421875, "rewards/margins": -0.3683443069458008, "rewards/rejected": 0.2992067337036133, "step": 2351 }, { "epoch": 0.38, "learning_rate": 9.513513513513513e-07, "logits/chosen": -0.5412200689315796, "logits/rejected": -0.4288998246192932, "logps/chosen": -135.88975524902344, "logps/rejected": -67.54417419433594, "loss": 0.6008, "rewards/accuracies": 1.0, "rewards/chosen": 3.784166097640991, "rewards/margins": 1.6446259021759033, "rewards/rejected": 2.139540195465088, "step": 2352 }, { "epoch": 0.38, "learning_rate": 9.540540540540541e-07, "logits/chosen": -0.5420427322387695, "logits/rejected": -0.428891122341156, "logps/chosen": -51.11800003051758, "logps/rejected": -18.57135581970215, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 1.034239649772644, "rewards/margins": 0.638839602470398, "rewards/rejected": 0.3954000473022461, "step": 2353 }, { "epoch": 0.38, "learning_rate": 9.567567567567567e-07, "logits/chosen": -0.4380979537963867, "logits/rejected": -0.44862180948257446, "logps/chosen": -66.40956115722656, "logps/rejected": -149.269775390625, "loss": 1.0449, "rewards/accuracies": 0.0, "rewards/chosen": 1.0384796857833862, "rewards/margins": -1.3824204206466675, "rewards/rejected": 2.4209001064300537, "step": 2354 }, { "epoch": 0.38, "learning_rate": 9.594594594594594e-07, "logits/chosen": -0.46958109736442566, "logits/rejected": -0.46046149730682373, "logps/chosen": -54.766876220703125, "logps/rejected": -18.48275375366211, "loss": 0.3886, "rewards/accuracies": 1.0, "rewards/chosen": 0.896274983882904, "rewards/margins": 0.5757436752319336, "rewards/rejected": 0.32053127884864807, "step": 2355 }, { "epoch": 0.38, "learning_rate": 9.621621621621622e-07, "logits/chosen": -0.7662899494171143, "logits/rejected": -0.7660046815872192, "logps/chosen": -133.89283752441406, "logps/rejected": -70.59007263183594, "loss": 0.2853, "rewards/accuracies": 1.0, "rewards/chosen": 2.643254041671753, "rewards/margins": 1.0221351385116577, "rewards/rejected": 1.6211189031600952, "step": 2356 }, { "epoch": 0.38, "learning_rate": 9.648648648648648e-07, "logits/chosen": -0.8024416565895081, "logits/rejected": -0.7391530275344849, "logps/chosen": -66.33233642578125, "logps/rejected": -80.73772430419922, "loss": 1.2031, "rewards/accuracies": 1.0, "rewards/chosen": 1.196192979812622, "rewards/margins": 0.21943742036819458, "rewards/rejected": 0.9767555594444275, "step": 2357 }, { "epoch": 0.38, "learning_rate": 9.675675675675676e-07, "logits/chosen": -0.6353784203529358, "logits/rejected": -0.5969871878623962, "logps/chosen": -98.85951232910156, "logps/rejected": -140.2417755126953, "loss": 1.4207, "rewards/accuracies": 1.0, "rewards/chosen": 2.363050937652588, "rewards/margins": 0.45056772232055664, "rewards/rejected": 1.9124832153320312, "step": 2358 }, { "epoch": 0.38, "learning_rate": 9.702702702702702e-07, "logits/chosen": -1.2214964628219604, "logits/rejected": -1.2696415185928345, "logps/chosen": -57.315547943115234, "logps/rejected": -31.976285934448242, "loss": 0.6269, "rewards/accuracies": 1.0, "rewards/chosen": 1.5749813318252563, "rewards/margins": 0.812016487121582, "rewards/rejected": 0.7629648447036743, "step": 2359 }, { "epoch": 0.38, "learning_rate": 9.72972972972973e-07, "logits/chosen": -0.5578097105026245, "logits/rejected": -0.5423098206520081, "logps/chosen": -96.0120620727539, "logps/rejected": -125.93563079833984, "loss": 0.2633, "rewards/accuracies": 1.0, "rewards/chosen": 3.017148733139038, "rewards/margins": 1.1119126081466675, "rewards/rejected": 1.9052361249923706, "step": 2360 }, { "epoch": 0.38, "learning_rate": 9.756756756756756e-07, "logits/chosen": -0.38205084204673767, "logits/rejected": -0.39519283175468445, "logps/chosen": -136.54196166992188, "logps/rejected": -147.57177734375, "loss": 0.9706, "rewards/accuracies": 0.0, "rewards/chosen": 0.3444976806640625, "rewards/margins": -1.594824194908142, "rewards/rejected": 1.9393218755722046, "step": 2361 }, { "epoch": 0.38, "learning_rate": 9.783783783783782e-07, "logits/chosen": -0.3146612048149109, "logits/rejected": -0.32228511571884155, "logps/chosen": -41.27995300292969, "logps/rejected": -65.87592315673828, "loss": 0.6528, "rewards/accuracies": 0.0, "rewards/chosen": 0.358245849609375, "rewards/margins": -0.21254962682724, "rewards/rejected": 0.570795476436615, "step": 2362 }, { "epoch": 0.38, "learning_rate": 9.81081081081081e-07, "logits/chosen": -0.6617265343666077, "logits/rejected": -0.5398650765419006, "logps/chosen": -176.21966552734375, "logps/rejected": -44.61707305908203, "loss": 0.3916, "rewards/accuracies": 1.0, "rewards/chosen": 3.6315276622772217, "rewards/margins": 2.4614434242248535, "rewards/rejected": 1.1700843572616577, "step": 2363 }, { "epoch": 0.38, "learning_rate": 9.837837837837839e-07, "logits/chosen": -0.5965594053268433, "logits/rejected": -0.5979018807411194, "logps/chosen": -266.20074462890625, "logps/rejected": -159.69883728027344, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 3.7208924293518066, "rewards/margins": 3.0577898025512695, "rewards/rejected": 0.6631027460098267, "step": 2364 }, { "epoch": 0.38, "learning_rate": 9.864864864864865e-07, "logits/chosen": -0.86667400598526, "logits/rejected": -0.8438736200332642, "logps/chosen": -134.69198608398438, "logps/rejected": -112.86859893798828, "loss": 0.9162, "rewards/accuracies": 0.0, "rewards/chosen": 0.7822128534317017, "rewards/margins": -0.18417125940322876, "rewards/rejected": 0.9663841128349304, "step": 2365 }, { "epoch": 0.38, "learning_rate": 9.89189189189189e-07, "logits/chosen": -0.608285129070282, "logits/rejected": -0.5454548001289368, "logps/chosen": -70.88147735595703, "logps/rejected": -15.46975326538086, "loss": 0.3243, "rewards/accuracies": 1.0, "rewards/chosen": 0.8856193423271179, "rewards/margins": 0.480625718832016, "rewards/rejected": 0.40499362349510193, "step": 2366 }, { "epoch": 0.38, "learning_rate": 9.91891891891892e-07, "logits/chosen": -0.39520785212516785, "logits/rejected": -0.3223007917404175, "logps/chosen": -123.67171478271484, "logps/rejected": -77.51045227050781, "loss": 0.2767, "rewards/accuracies": 1.0, "rewards/chosen": 3.1895883083343506, "rewards/margins": 0.49814462661743164, "rewards/rejected": 2.691443681716919, "step": 2367 }, { "epoch": 0.38, "learning_rate": 9.945945945945945e-07, "logits/chosen": -0.0048606921918690205, "logits/rejected": -0.007486138492822647, "logps/chosen": -21.3873291015625, "logps/rejected": -60.64338684082031, "loss": 0.6032, "rewards/accuracies": 0.0, "rewards/chosen": 0.22466126084327698, "rewards/margins": -0.16532593965530396, "rewards/rejected": 0.38998720049858093, "step": 2368 }, { "epoch": 0.38, "learning_rate": 9.972972972972973e-07, "logits/chosen": -0.4132329821586609, "logits/rejected": -0.4525275230407715, "logps/chosen": -147.03366088867188, "logps/rejected": -85.31260681152344, "loss": 2.0617, "rewards/accuracies": 0.0, "rewards/chosen": -0.42460939288139343, "rewards/margins": -2.8265533447265625, "rewards/rejected": 2.4019439220428467, "step": 2369 }, { "epoch": 0.38, "learning_rate": 1e-06, "logits/chosen": -0.3190271258354187, "logits/rejected": -0.3157671391963959, "logps/chosen": -61.267356872558594, "logps/rejected": -64.98711395263672, "loss": 0.7601, "rewards/accuracies": 0.0, "rewards/chosen": 0.4180641174316406, "rewards/margins": -0.2074676752090454, "rewards/rejected": 0.625531792640686, "step": 2370 }, { "epoch": 0.38, "learning_rate": 9.999999827273655e-07, "logits/chosen": -0.4910513162612915, "logits/rejected": -0.4689909517765045, "logps/chosen": -87.26461791992188, "logps/rejected": -102.55989074707031, "loss": 1.6172, "rewards/accuracies": 0.0, "rewards/chosen": 1.0186347961425781, "rewards/margins": -2.8665473461151123, "rewards/rejected": 3.8851821422576904, "step": 2371 }, { "epoch": 0.39, "learning_rate": 9.999999309094632e-07, "logits/chosen": -0.42437395453453064, "logits/rejected": -0.42456698417663574, "logps/chosen": -31.66739845275879, "logps/rejected": -21.94768524169922, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 0.22641430795192719, "rewards/margins": 0.18370801210403442, "rewards/rejected": 0.04270629957318306, "step": 2372 }, { "epoch": 0.39, "learning_rate": 9.999998445462969e-07, "logits/chosen": -0.33225515484809875, "logits/rejected": -0.2521074712276459, "logps/chosen": -60.53701400756836, "logps/rejected": -55.76013946533203, "loss": 0.5748, "rewards/accuracies": 0.0, "rewards/chosen": 1.667392373085022, "rewards/margins": -0.018690824508666992, "rewards/rejected": 1.686083197593689, "step": 2373 }, { "epoch": 0.39, "learning_rate": 9.99999723637872e-07, "logits/chosen": -0.6260935664176941, "logits/rejected": -0.4741728901863098, "logps/chosen": -258.6835021972656, "logps/rejected": -217.6944580078125, "loss": 0.6447, "rewards/accuracies": 1.0, "rewards/chosen": 3.4747560024261475, "rewards/margins": 0.7597870826721191, "rewards/rejected": 2.7149689197540283, "step": 2374 }, { "epoch": 0.39, "learning_rate": 9.999995681841978e-07, "logits/chosen": -0.19213052093982697, "logits/rejected": -0.19213052093982697, "logps/chosen": -2.1776859760284424, "logps/rejected": -2.1776859760284424, "loss": 0.5855, "rewards/accuracies": 0.0, "rewards/chosen": 0.17062537372112274, "rewards/margins": 0.0, "rewards/rejected": 0.17062537372112274, "step": 2375 }, { "epoch": 0.39, "learning_rate": 9.99999378185284e-07, "logits/chosen": -0.4549001455307007, "logits/rejected": -0.3316177725791931, "logps/chosen": -121.09686279296875, "logps/rejected": -118.09705352783203, "loss": 1.2062, "rewards/accuracies": 0.0, "rewards/chosen": 2.775869846343994, "rewards/margins": -0.7544577121734619, "rewards/rejected": 3.530327558517456, "step": 2376 }, { "epoch": 0.39, "learning_rate": 9.999991536411446e-07, "logits/chosen": -0.5146035552024841, "logits/rejected": -0.49370917677879333, "logps/chosen": -81.80101776123047, "logps/rejected": -76.03821563720703, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 1.7757591009140015, "rewards/margins": -0.6994034051895142, "rewards/rejected": 2.4751625061035156, "step": 2377 }, { "epoch": 0.39, "learning_rate": 9.999988945517943e-07, "logits/chosen": -0.8917023539543152, "logits/rejected": -0.7382323145866394, "logps/chosen": -89.69116973876953, "logps/rejected": -72.89441680908203, "loss": 0.8717, "rewards/accuracies": 0.0, "rewards/chosen": 0.219970703125, "rewards/margins": -0.6197037100791931, "rewards/rejected": 0.8396744132041931, "step": 2378 }, { "epoch": 0.39, "learning_rate": 9.999986009172517e-07, "logits/chosen": -0.479870468378067, "logits/rejected": -0.4198821783065796, "logps/chosen": -149.1041259765625, "logps/rejected": -118.78058624267578, "loss": 0.6831, "rewards/accuracies": 0.0, "rewards/chosen": 3.056065320968628, "rewards/margins": -1.0218055248260498, "rewards/rejected": 4.077870845794678, "step": 2379 }, { "epoch": 0.39, "learning_rate": 9.999982727375366e-07, "logits/chosen": -0.22791258990764618, "logits/rejected": -0.09328591078519821, "logps/chosen": -63.15087890625, "logps/rejected": -40.994117736816406, "loss": 0.4669, "rewards/accuracies": 1.0, "rewards/chosen": 1.6639976501464844, "rewards/margins": 1.0954341888427734, "rewards/rejected": 0.5685634613037109, "step": 2380 }, { "epoch": 0.39, "learning_rate": 9.99997910012672e-07, "logits/chosen": -0.739089846611023, "logits/rejected": -0.6650968790054321, "logps/chosen": -133.4567108154297, "logps/rejected": -55.55064010620117, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 0.34803009033203125, "rewards/margins": 0.07286873459815979, "rewards/rejected": 0.27516135573387146, "step": 2381 }, { "epoch": 0.39, "learning_rate": 9.99997512742683e-07, "logits/chosen": -0.4221104383468628, "logits/rejected": -0.35311269760131836, "logps/chosen": -82.09719848632812, "logps/rejected": -34.139102935791016, "loss": 0.6101, "rewards/accuracies": 1.0, "rewards/chosen": 1.8643112182617188, "rewards/margins": 1.1944332122802734, "rewards/rejected": 0.6698780059814453, "step": 2382 }, { "epoch": 0.39, "learning_rate": 9.999970809275967e-07, "logits/chosen": -0.5301336646080017, "logits/rejected": -0.4764765202999115, "logps/chosen": -63.11251449584961, "logps/rejected": -105.90943908691406, "loss": 0.5223, "rewards/accuracies": 1.0, "rewards/chosen": 2.7931995391845703, "rewards/margins": 0.4547305107116699, "rewards/rejected": 2.3384690284729004, "step": 2383 }, { "epoch": 0.39, "learning_rate": 9.999966145674432e-07, "logits/chosen": -0.3352201282978058, "logits/rejected": -0.3240603804588318, "logps/chosen": -36.366424560546875, "logps/rejected": -58.754302978515625, "loss": 0.9621, "rewards/accuracies": 0.0, "rewards/chosen": 1.0035518407821655, "rewards/margins": -1.0069423913955688, "rewards/rejected": 2.0104942321777344, "step": 2384 }, { "epoch": 0.39, "learning_rate": 9.999961136622546e-07, "logits/chosen": -0.4774555563926697, "logits/rejected": -0.46019622683525085, "logps/chosen": -43.67325973510742, "logps/rejected": -73.25898742675781, "loss": 1.3678, "rewards/accuracies": 0.0, "rewards/chosen": 1.2214058637619019, "rewards/margins": -1.3734091520309448, "rewards/rejected": 2.5948150157928467, "step": 2385 }, { "epoch": 0.39, "learning_rate": 9.999955782120655e-07, "logits/chosen": -0.797650158405304, "logits/rejected": -0.7804821729660034, "logps/chosen": -116.56675720214844, "logps/rejected": -96.44000244140625, "loss": 0.7527, "rewards/accuracies": 1.0, "rewards/chosen": 0.49486085772514343, "rewards/margins": 0.11345672607421875, "rewards/rejected": 0.3814041316509247, "step": 2386 }, { "epoch": 0.39, "learning_rate": 9.999950082169131e-07, "logits/chosen": -0.44078171253204346, "logits/rejected": -0.4240018427371979, "logps/chosen": -4.83785343170166, "logps/rejected": -44.68052291870117, "loss": 0.5526, "rewards/accuracies": 1.0, "rewards/chosen": 0.43650588393211365, "rewards/margins": 0.3316352665424347, "rewards/rejected": 0.10487060993909836, "step": 2387 }, { "epoch": 0.39, "learning_rate": 9.999944036768366e-07, "logits/chosen": -0.18977750837802887, "logits/rejected": -0.1548568159341812, "logps/chosen": -77.02095031738281, "logps/rejected": -36.05315399169922, "loss": 0.4962, "rewards/accuracies": 1.0, "rewards/chosen": 1.305474877357483, "rewards/margins": 0.2564026117324829, "rewards/rejected": 1.049072265625, "step": 2388 }, { "epoch": 0.39, "learning_rate": 9.999937645918776e-07, "logits/chosen": -0.3899175822734833, "logits/rejected": -0.36363154649734497, "logps/chosen": -79.93820190429688, "logps/rejected": -55.951873779296875, "loss": 0.2856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8637077212333679, "rewards/margins": 0.45488089323043823, "rewards/rejected": 0.4088268280029297, "step": 2389 }, { "epoch": 0.39, "learning_rate": 9.999930909620807e-07, "logits/chosen": -0.7903302907943726, "logits/rejected": -0.8042922616004944, "logps/chosen": -59.13238525390625, "logps/rejected": -106.83770751953125, "loss": 0.9394, "rewards/accuracies": 1.0, "rewards/chosen": 0.6971489191055298, "rewards/margins": 0.09027785062789917, "rewards/rejected": 0.6068710684776306, "step": 2390 }, { "epoch": 0.39, "learning_rate": 9.99992382787492e-07, "logits/chosen": -0.5453923940658569, "logits/rejected": -0.5154291391372681, "logps/chosen": -69.6675033569336, "logps/rejected": -87.36907196044922, "loss": 0.6267, "rewards/accuracies": 0.0, "rewards/chosen": 2.201551914215088, "rewards/margins": -0.5200722217559814, "rewards/rejected": 2.7216241359710693, "step": 2391 }, { "epoch": 0.39, "learning_rate": 9.999916400681607e-07, "logits/chosen": -0.6133474707603455, "logits/rejected": -0.5655716061592102, "logps/chosen": -31.586780548095703, "logps/rejected": -27.499122619628906, "loss": 0.8121, "rewards/accuracies": 0.0, "rewards/chosen": 1.5654804706573486, "rewards/margins": -0.03650820255279541, "rewards/rejected": 1.601988673210144, "step": 2392 }, { "epoch": 0.39, "learning_rate": 9.99990862804138e-07, "logits/chosen": -0.7237473726272583, "logits/rejected": -0.7833001017570496, "logps/chosen": -168.94358825683594, "logps/rejected": -112.7398681640625, "loss": 1.326, "rewards/accuracies": 0.0, "rewards/chosen": 0.02066192589700222, "rewards/margins": -1.9186935424804688, "rewards/rejected": 1.939355492591858, "step": 2393 }, { "epoch": 0.39, "learning_rate": 9.999900509954777e-07, "logits/chosen": -0.6126722693443298, "logits/rejected": -0.5762118697166443, "logps/chosen": -109.69944763183594, "logps/rejected": -198.94969177246094, "loss": 0.269, "rewards/accuracies": 1.0, "rewards/chosen": 3.579014539718628, "rewards/margins": 0.5666625499725342, "rewards/rejected": 3.0123519897460938, "step": 2394 }, { "epoch": 0.39, "learning_rate": 9.999892046422358e-07, "logits/chosen": -0.4902513027191162, "logits/rejected": -0.435598760843277, "logps/chosen": -212.43031311035156, "logps/rejected": -88.2989501953125, "loss": 1.2377, "rewards/accuracies": 1.0, "rewards/chosen": 4.41946268081665, "rewards/margins": 2.052851915359497, "rewards/rejected": 2.3666107654571533, "step": 2395 }, { "epoch": 0.39, "learning_rate": 9.999883237444709e-07, "logits/chosen": -0.4302949905395508, "logits/rejected": -0.2885052263736725, "logps/chosen": -117.54776000976562, "logps/rejected": -91.38540649414062, "loss": 0.7532, "rewards/accuracies": 1.0, "rewards/chosen": 3.023874044418335, "rewards/margins": 1.9465844631195068, "rewards/rejected": 1.0772895812988281, "step": 2396 }, { "epoch": 0.39, "learning_rate": 9.999874083022436e-07, "logits/chosen": -0.4971282184123993, "logits/rejected": -0.45242929458618164, "logps/chosen": -53.508148193359375, "logps/rejected": -61.406925201416016, "loss": 1.0514, "rewards/accuracies": 1.0, "rewards/chosen": 1.2983261346817017, "rewards/margins": 1.1143947839736938, "rewards/rejected": 0.1839313507080078, "step": 2397 }, { "epoch": 0.39, "learning_rate": 9.999864583156174e-07, "logits/chosen": -0.8535424470901489, "logits/rejected": -0.790632426738739, "logps/chosen": -106.66752624511719, "logps/rejected": -61.162715911865234, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 1.351575493812561, "rewards/margins": 0.06264841556549072, "rewards/rejected": 1.2889270782470703, "step": 2398 }, { "epoch": 0.39, "learning_rate": 9.999854737846577e-07, "logits/chosen": -0.46789708733558655, "logits/rejected": -0.45844876766204834, "logps/chosen": -115.61495208740234, "logps/rejected": -85.8976821899414, "loss": 0.639, "rewards/accuracies": 1.0, "rewards/chosen": 0.07311706990003586, "rewards/margins": 0.1592399626970291, "rewards/rejected": -0.08612289279699326, "step": 2399 }, { "epoch": 0.39, "learning_rate": 9.99984454709433e-07, "logits/chosen": -0.6942926049232483, "logits/rejected": -0.7086004614830017, "logps/chosen": -92.44109344482422, "logps/rejected": -140.44708251953125, "loss": 0.4828, "rewards/accuracies": 0.0, "rewards/chosen": 0.7265213131904602, "rewards/margins": -0.1417739987373352, "rewards/rejected": 0.8682953119277954, "step": 2400 }, { "epoch": 0.39, "learning_rate": 9.999834010900131e-07, "logits/chosen": -0.7916703224182129, "logits/rejected": -1.0109554529190063, "logps/chosen": -195.64688110351562, "logps/rejected": -121.16546630859375, "loss": 0.6723, "rewards/accuracies": 0.0, "rewards/chosen": 1.688684105873108, "rewards/margins": -0.9786804914474487, "rewards/rejected": 2.6673645973205566, "step": 2401 }, { "epoch": 0.39, "learning_rate": 9.99982312926471e-07, "logits/chosen": -0.4615482985973358, "logits/rejected": -0.42285776138305664, "logps/chosen": -112.11483001708984, "logps/rejected": -57.51605224609375, "loss": 0.5607, "rewards/accuracies": 1.0, "rewards/chosen": 1.664202094078064, "rewards/margins": 0.6265525817871094, "rewards/rejected": 1.0376495122909546, "step": 2402 }, { "epoch": 0.39, "learning_rate": 9.999811902188821e-07, "logits/chosen": -0.34461310505867004, "logits/rejected": -0.34461310505867004, "logps/chosen": -1.1671745777130127, "logps/rejected": -1.1671745777130127, "loss": 0.719, "rewards/accuracies": 0.0, "rewards/chosen": 0.23237808048725128, "rewards/margins": 0.0, "rewards/rejected": 0.23237808048725128, "step": 2403 }, { "epoch": 0.39, "learning_rate": 9.99980032967324e-07, "logits/chosen": -0.6698144674301147, "logits/rejected": -0.7074242234230042, "logps/chosen": -58.94842529296875, "logps/rejected": -113.60115051269531, "loss": 1.5311, "rewards/accuracies": 0.0, "rewards/chosen": 1.6900055408477783, "rewards/margins": -1.3187484741210938, "rewards/rejected": 3.008754014968872, "step": 2404 }, { "epoch": 0.39, "learning_rate": 9.999788411718763e-07, "logits/chosen": -0.2850167751312256, "logits/rejected": -0.2665771543979645, "logps/chosen": -85.42915344238281, "logps/rejected": -42.42524337768555, "loss": 0.4635, "rewards/accuracies": 1.0, "rewards/chosen": 0.7142112851142883, "rewards/margins": 0.6749446988105774, "rewards/rejected": 0.03926658630371094, "step": 2405 }, { "epoch": 0.39, "learning_rate": 9.999776148326214e-07, "logits/chosen": -0.2984486520290375, "logits/rejected": -0.2894696891307831, "logps/chosen": -2.2989611625671387, "logps/rejected": -40.47218322753906, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 0.2814476490020752, "rewards/margins": 0.3177921772003174, "rewards/rejected": -0.03634452819824219, "step": 2406 }, { "epoch": 0.39, "learning_rate": 9.999763539496443e-07, "logits/chosen": -0.4388507306575775, "logits/rejected": -0.30049368739128113, "logps/chosen": -215.59817504882812, "logps/rejected": -15.062694549560547, "loss": 0.3491, "rewards/accuracies": 1.0, "rewards/chosen": 0.9702972769737244, "rewards/margins": 0.09199374914169312, "rewards/rejected": 0.8783035278320312, "step": 2407 }, { "epoch": 0.39, "learning_rate": 9.99975058523032e-07, "logits/chosen": -0.032212644815444946, "logits/rejected": -0.021521231159567833, "logps/chosen": -1.0522308349609375, "logps/rejected": -10.38807487487793, "loss": 1.3092, "rewards/accuracies": 1.0, "rewards/chosen": 0.1656372994184494, "rewards/margins": 0.35194531083106995, "rewards/rejected": -0.18630801141262054, "step": 2408 }, { "epoch": 0.39, "learning_rate": 9.999737285528738e-07, "logits/chosen": -0.5115240812301636, "logits/rejected": -0.45971018075942993, "logps/chosen": -77.48078155517578, "logps/rejected": -28.90581512451172, "loss": 0.8523, "rewards/accuracies": 0.0, "rewards/chosen": 0.4037208557128906, "rewards/margins": -0.2480030059814453, "rewards/rejected": 0.6517238616943359, "step": 2409 }, { "epoch": 0.39, "learning_rate": 9.99972364039262e-07, "logits/chosen": -0.4660942256450653, "logits/rejected": -0.4660942256450653, "logps/chosen": -69.85752868652344, "logps/rejected": -69.85752868652344, "loss": 0.4764, "rewards/accuracies": 0.0, "rewards/chosen": 0.9956375360488892, "rewards/margins": 0.0, "rewards/rejected": 0.9956375360488892, "step": 2410 }, { "epoch": 0.39, "learning_rate": 9.999709649822902e-07, "logits/chosen": -0.5960611701011658, "logits/rejected": -0.5564433336257935, "logps/chosen": -66.21334075927734, "logps/rejected": -68.80595397949219, "loss": 0.3419, "rewards/accuracies": 1.0, "rewards/chosen": 2.3934991359710693, "rewards/margins": 0.8778733015060425, "rewards/rejected": 1.5156258344650269, "step": 2411 }, { "epoch": 0.39, "learning_rate": 9.999695313820558e-07, "logits/chosen": -0.7682622075080872, "logits/rejected": -0.714447557926178, "logps/chosen": -127.26537322998047, "logps/rejected": -96.28030395507812, "loss": 0.4653, "rewards/accuracies": 0.0, "rewards/chosen": 2.536161184310913, "rewards/margins": -0.22249817848205566, "rewards/rejected": 2.7586593627929688, "step": 2412 }, { "epoch": 0.39, "learning_rate": 9.999680632386575e-07, "logits/chosen": -0.3507741093635559, "logits/rejected": -0.3507741093635559, "logps/chosen": -105.11590576171875, "logps/rejected": -105.11590576171875, "loss": 0.6605, "rewards/accuracies": 0.0, "rewards/chosen": 0.5986068844795227, "rewards/margins": 0.0, "rewards/rejected": 0.5986068844795227, "step": 2413 }, { "epoch": 0.39, "learning_rate": 9.999665605521969e-07, "logits/chosen": -0.21178291738033295, "logits/rejected": -0.22897744178771973, "logps/chosen": -12.65173625946045, "logps/rejected": -5.296464443206787, "loss": 0.5353, "rewards/accuracies": 0.0, "rewards/chosen": -0.05744342878460884, "rewards/margins": -0.31239593029022217, "rewards/rejected": 0.25495249032974243, "step": 2414 }, { "epoch": 0.39, "learning_rate": 9.999650233227774e-07, "logits/chosen": -0.3030538558959961, "logits/rejected": -0.360833078622818, "logps/chosen": -97.8997802734375, "logps/rejected": -80.68851470947266, "loss": 1.2735, "rewards/accuracies": 0.0, "rewards/chosen": -0.402099609375, "rewards/margins": -1.1737877130508423, "rewards/rejected": 0.7716881036758423, "step": 2415 }, { "epoch": 0.39, "learning_rate": 9.999634515505058e-07, "logits/chosen": -0.5447731018066406, "logits/rejected": -0.39791178703308105, "logps/chosen": -64.4197998046875, "logps/rejected": -25.75630760192871, "loss": 0.4182, "rewards/accuracies": 1.0, "rewards/chosen": 1.6717430353164673, "rewards/margins": 1.3144022226333618, "rewards/rejected": 0.35734081268310547, "step": 2416 }, { "epoch": 0.39, "learning_rate": 9.999618452354901e-07, "logits/chosen": -0.08950679749250412, "logits/rejected": -0.053241804242134094, "logps/chosen": -52.006980895996094, "logps/rejected": -42.46729278564453, "loss": 0.6329, "rewards/accuracies": 1.0, "rewards/chosen": 1.3608070611953735, "rewards/margins": 0.6731373071670532, "rewards/rejected": 0.6876697540283203, "step": 2417 }, { "epoch": 0.39, "learning_rate": 9.99960204377842e-07, "logits/chosen": -0.5819914937019348, "logits/rejected": -0.540753960609436, "logps/chosen": -127.4713134765625, "logps/rejected": -53.90605163574219, "loss": 0.981, "rewards/accuracies": 0.0, "rewards/chosen": 0.48682403564453125, "rewards/margins": -0.9496887922286987, "rewards/rejected": 1.43651282787323, "step": 2418 }, { "epoch": 0.39, "learning_rate": 9.999585289776739e-07, "logits/chosen": -0.6782158017158508, "logits/rejected": -0.5424344539642334, "logps/chosen": -78.96604919433594, "logps/rejected": -109.77800750732422, "loss": 0.8824, "rewards/accuracies": 0.0, "rewards/chosen": 2.0320146083831787, "rewards/margins": -0.49355530738830566, "rewards/rejected": 2.5255699157714844, "step": 2419 }, { "epoch": 0.39, "learning_rate": 9.999568190351024e-07, "logits/chosen": -0.42818737030029297, "logits/rejected": -0.32164061069488525, "logps/chosen": -45.924171447753906, "logps/rejected": -22.592010498046875, "loss": 1.2223, "rewards/accuracies": 1.0, "rewards/chosen": 0.8229514956474304, "rewards/margins": 0.03151547908782959, "rewards/rejected": 0.7914360165596008, "step": 2420 }, { "epoch": 0.39, "learning_rate": 9.999550745502453e-07, "logits/chosen": -0.6412630677223206, "logits/rejected": -0.6249030828475952, "logps/chosen": -60.560062408447266, "logps/rejected": -57.98882293701172, "loss": 0.4318, "rewards/accuracies": 0.0, "rewards/chosen": 1.299367904663086, "rewards/margins": -0.06228446960449219, "rewards/rejected": 1.3616523742675781, "step": 2421 }, { "epoch": 0.39, "learning_rate": 9.999532955232233e-07, "logits/chosen": -0.601450502872467, "logits/rejected": -0.5366339087486267, "logps/chosen": -168.4073486328125, "logps/rejected": -58.57294464111328, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 3.2655060291290283, "rewards/margins": 1.6826362609863281, "rewards/rejected": 1.5828697681427002, "step": 2422 }, { "epoch": 0.39, "learning_rate": 9.99951481954159e-07, "logits/chosen": -0.43177589774131775, "logits/rejected": -0.324556827545166, "logps/chosen": -109.29885864257812, "logps/rejected": -43.46625518798828, "loss": 0.5354, "rewards/accuracies": 0.0, "rewards/chosen": 0.24651947617530823, "rewards/margins": -0.5443527698516846, "rewards/rejected": 0.7908722162246704, "step": 2423 }, { "epoch": 0.39, "learning_rate": 9.999496338431782e-07, "logits/chosen": -0.705070972442627, "logits/rejected": -0.6878210306167603, "logps/chosen": -108.9577865600586, "logps/rejected": -48.92597961425781, "loss": 0.7978, "rewards/accuracies": 0.0, "rewards/chosen": -0.5613242983818054, "rewards/margins": -0.4905944764614105, "rewards/rejected": -0.0707298293709755, "step": 2424 }, { "epoch": 0.39, "learning_rate": 9.999477511904078e-07, "logits/chosen": -0.44453543424606323, "logits/rejected": -0.44967401027679443, "logps/chosen": -75.8900375366211, "logps/rejected": -94.98744201660156, "loss": 1.0307, "rewards/accuracies": 1.0, "rewards/chosen": 0.5136284232139587, "rewards/margins": 0.5572571158409119, "rewards/rejected": -0.043628692626953125, "step": 2425 }, { "epoch": 0.39, "learning_rate": 9.999458339959785e-07, "logits/chosen": -0.5104966163635254, "logits/rejected": -0.48773613572120667, "logps/chosen": -102.96854400634766, "logps/rejected": -132.88491821289062, "loss": 0.5882, "rewards/accuracies": 0.0, "rewards/chosen": 1.7055625915527344, "rewards/margins": -0.5769784450531006, "rewards/rejected": 2.282541036605835, "step": 2426 }, { "epoch": 0.39, "learning_rate": 9.999438822600227e-07, "logits/chosen": -0.6864263415336609, "logits/rejected": -0.6670066118240356, "logps/chosen": -114.89532470703125, "logps/rejected": -162.1072235107422, "loss": 1.2981, "rewards/accuracies": 0.0, "rewards/chosen": 3.0924713611602783, "rewards/margins": -2.300884962081909, "rewards/rejected": 5.3933563232421875, "step": 2427 }, { "epoch": 0.39, "learning_rate": 9.99941895982675e-07, "logits/chosen": -0.7030194401741028, "logits/rejected": -0.7038823962211609, "logps/chosen": -97.1072006225586, "logps/rejected": -61.421730041503906, "loss": 1.4309, "rewards/accuracies": 0.0, "rewards/chosen": 0.16729584336280823, "rewards/margins": -1.6341590881347656, "rewards/rejected": 1.8014549016952515, "step": 2428 }, { "epoch": 0.39, "learning_rate": 9.99939875164073e-07, "logits/chosen": -0.33768609166145325, "logits/rejected": -0.13887472450733185, "logps/chosen": -79.51953125, "logps/rejected": -72.92352294921875, "loss": 1.2988, "rewards/accuracies": 0.0, "rewards/chosen": 1.8249801397323608, "rewards/margins": -2.2639412879943848, "rewards/rejected": 4.088921546936035, "step": 2429 }, { "epoch": 0.39, "learning_rate": 9.99937819804356e-07, "logits/chosen": -0.7867834568023682, "logits/rejected": -0.7996450662612915, "logps/chosen": -35.31903839111328, "logps/rejected": -62.88524627685547, "loss": 1.5247, "rewards/accuracies": 1.0, "rewards/chosen": 0.304006963968277, "rewards/margins": 0.3501396179199219, "rewards/rejected": -0.046132661402225494, "step": 2430 }, { "epoch": 0.39, "learning_rate": 9.99935729903666e-07, "logits/chosen": -0.8210475444793701, "logits/rejected": -0.7844679355621338, "logps/chosen": -109.50094604492188, "logps/rejected": -106.66177368164062, "loss": 1.1265, "rewards/accuracies": 0.0, "rewards/chosen": 0.5625961422920227, "rewards/margins": -1.0344359874725342, "rewards/rejected": 1.5970321893692017, "step": 2431 }, { "epoch": 0.39, "learning_rate": 9.999336054621477e-07, "logits/chosen": -0.6055026054382324, "logits/rejected": -0.44275686144828796, "logps/chosen": -154.11770629882812, "logps/rejected": -81.25192260742188, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": 4.6142425537109375, "rewards/margins": 2.986401319503784, "rewards/rejected": 1.6278412342071533, "step": 2432 }, { "epoch": 0.39, "learning_rate": 9.999314464799476e-07, "logits/chosen": -0.5081048011779785, "logits/rejected": -0.5125271677970886, "logps/chosen": -94.35063171386719, "logps/rejected": -95.40174865722656, "loss": 0.6205, "rewards/accuracies": 1.0, "rewards/chosen": 1.5346893072128296, "rewards/margins": 0.31552886962890625, "rewards/rejected": 1.2191604375839233, "step": 2433 }, { "epoch": 0.4, "learning_rate": 9.99929252957215e-07, "logits/chosen": -0.7572914958000183, "logits/rejected": -1.155014157295227, "logps/chosen": -157.70950317382812, "logps/rejected": -39.425418853759766, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 0.20424500107765198, "rewards/margins": 0.04720039665699005, "rewards/rejected": 0.15704460442066193, "step": 2434 }, { "epoch": 0.4, "learning_rate": 9.999270248941012e-07, "logits/chosen": -0.5105937719345093, "logits/rejected": -0.49587419629096985, "logps/chosen": -182.4434814453125, "logps/rejected": -169.15988159179688, "loss": 2.1372, "rewards/accuracies": 0.0, "rewards/chosen": 2.91007399559021, "rewards/margins": -1.5367295742034912, "rewards/rejected": 4.446803569793701, "step": 2435 }, { "epoch": 0.4, "learning_rate": 9.999247622907606e-07, "logits/chosen": -0.21292930841445923, "logits/rejected": -0.21221351623535156, "logps/chosen": -3.7886149883270264, "logps/rejected": -3.173379898071289, "loss": 0.4708, "rewards/accuracies": 0.0, "rewards/chosen": 0.35225266218185425, "rewards/margins": -0.08159142732620239, "rewards/rejected": 0.43384408950805664, "step": 2436 }, { "epoch": 0.4, "learning_rate": 9.99922465147349e-07, "logits/chosen": -0.6465116739273071, "logits/rejected": -0.6183395981788635, "logps/chosen": -217.64178466796875, "logps/rejected": -66.98434448242188, "loss": 0.266, "rewards/accuracies": 1.0, "rewards/chosen": 3.4887192249298096, "rewards/margins": 1.632086157798767, "rewards/rejected": 1.8566330671310425, "step": 2437 }, { "epoch": 0.4, "learning_rate": 9.999201334640254e-07, "logits/chosen": -0.5402056574821472, "logits/rejected": -0.7392902374267578, "logps/chosen": -37.54216003417969, "logps/rejected": -72.29743957519531, "loss": 0.8551, "rewards/accuracies": 0.0, "rewards/chosen": 1.0204994678497314, "rewards/margins": -0.387675404548645, "rewards/rejected": 1.4081748723983765, "step": 2438 }, { "epoch": 0.4, "learning_rate": 9.999177672409511e-07, "logits/chosen": -0.828455924987793, "logits/rejected": -0.8504675626754761, "logps/chosen": -88.53575134277344, "logps/rejected": -40.687374114990234, "loss": 1.4162, "rewards/accuracies": 0.0, "rewards/chosen": 0.701416015625, "rewards/margins": -1.2735531330108643, "rewards/rejected": 1.9749691486358643, "step": 2439 }, { "epoch": 0.4, "learning_rate": 9.999153664782892e-07, "logits/chosen": -0.1120237186551094, "logits/rejected": -0.14425788819789886, "logps/chosen": -87.07794189453125, "logps/rejected": -72.23921203613281, "loss": 1.2873, "rewards/accuracies": 0.0, "rewards/chosen": 0.4032424986362457, "rewards/margins": -2.0474600791931152, "rewards/rejected": 2.450702667236328, "step": 2440 }, { "epoch": 0.4, "learning_rate": 9.999129311762055e-07, "logits/chosen": -0.3298894166946411, "logits/rejected": -0.3460659086704254, "logps/chosen": -120.59129333496094, "logps/rejected": -60.477787017822266, "loss": 0.7922, "rewards/accuracies": 0.0, "rewards/chosen": 0.8880035281181335, "rewards/margins": -1.2333652973175049, "rewards/rejected": 2.121368885040283, "step": 2441 }, { "epoch": 0.4, "learning_rate": 9.999104613348689e-07, "logits/chosen": -0.8022715449333191, "logits/rejected": -0.8030374646186829, "logps/chosen": -104.1175537109375, "logps/rejected": -157.41958618164062, "loss": 0.4345, "rewards/accuracies": 0.0, "rewards/chosen": 0.747302234172821, "rewards/margins": -0.205078125, "rewards/rejected": 0.952380359172821, "step": 2442 }, { "epoch": 0.4, "learning_rate": 9.999079569544492e-07, "logits/chosen": -0.6722105741500854, "logits/rejected": -0.6722105741500854, "logps/chosen": -70.51878356933594, "logps/rejected": -70.51878356933594, "loss": 0.9286, "rewards/accuracies": 0.0, "rewards/chosen": 0.6473503112792969, "rewards/margins": 0.0, "rewards/rejected": 0.6473503112792969, "step": 2443 }, { "epoch": 0.4, "learning_rate": 9.999054180351202e-07, "logits/chosen": -0.06190645694732666, "logits/rejected": -0.017073269933462143, "logps/chosen": -77.66444396972656, "logps/rejected": -73.65364837646484, "loss": 0.5529, "rewards/accuracies": 1.0, "rewards/chosen": 1.8345428705215454, "rewards/margins": 0.5858207941055298, "rewards/rejected": 1.2487220764160156, "step": 2444 }, { "epoch": 0.4, "learning_rate": 9.999028445770568e-07, "logits/chosen": -0.5530145168304443, "logits/rejected": -0.4931400418281555, "logps/chosen": -72.65870666503906, "logps/rejected": -81.07771301269531, "loss": 0.7259, "rewards/accuracies": 0.0, "rewards/chosen": 0.9303329586982727, "rewards/margins": -0.39862746000289917, "rewards/rejected": 1.3289604187011719, "step": 2445 }, { "epoch": 0.4, "learning_rate": 9.99900236580437e-07, "logits/chosen": -0.1728658229112625, "logits/rejected": -0.1602553427219391, "logps/chosen": -50.9152946472168, "logps/rejected": -121.03985595703125, "loss": 0.1648, "rewards/accuracies": 1.0, "rewards/chosen": 1.1655892133712769, "rewards/margins": 1.5412087440490723, "rewards/rejected": -0.375619500875473, "step": 2446 }, { "epoch": 0.4, "learning_rate": 9.998975940454409e-07, "logits/chosen": -0.437431663274765, "logits/rejected": -0.437431663274765, "logps/chosen": -50.98429870605469, "logps/rejected": -50.98429870605469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.4191230535507202, "rewards/margins": 0.0, "rewards/rejected": 1.4191230535507202, "step": 2447 }, { "epoch": 0.4, "learning_rate": 9.998949169722512e-07, "logits/chosen": -0.4241325557231903, "logits/rejected": -0.34997260570526123, "logps/chosen": -71.9297103881836, "logps/rejected": -20.536901473999023, "loss": 0.57, "rewards/accuracies": 1.0, "rewards/chosen": 1.4350112676620483, "rewards/margins": 1.1365962028503418, "rewards/rejected": 0.29841500520706177, "step": 2448 }, { "epoch": 0.4, "learning_rate": 9.998922053610528e-07, "logits/chosen": -0.868776798248291, "logits/rejected": -0.8877236247062683, "logps/chosen": -93.81421661376953, "logps/rejected": -85.86154174804688, "loss": 1.3996, "rewards/accuracies": 0.0, "rewards/chosen": 1.440528154373169, "rewards/margins": -0.5024253129959106, "rewards/rejected": 1.9429534673690796, "step": 2449 }, { "epoch": 0.4, "learning_rate": 9.998894592120328e-07, "logits/chosen": -0.27127745747566223, "logits/rejected": -0.27037882804870605, "logps/chosen": -7.439587593078613, "logps/rejected": -1.9506503343582153, "loss": 0.5684, "rewards/accuracies": 0.0, "rewards/chosen": 0.2712215483188629, "rewards/margins": -0.21061211824417114, "rewards/rejected": 0.48183366656303406, "step": 2450 }, { "epoch": 0.4, "learning_rate": 9.998866785253814e-07, "logits/chosen": -0.37836095690727234, "logits/rejected": -0.34989288449287415, "logps/chosen": -72.80764770507812, "logps/rejected": -92.3729248046875, "loss": 0.207, "rewards/accuracies": 1.0, "rewards/chosen": 1.8517792224884033, "rewards/margins": 1.1023337841033936, "rewards/rejected": 0.749445378780365, "step": 2451 }, { "epoch": 0.4, "learning_rate": 9.998838633012904e-07, "logits/chosen": -0.8108649253845215, "logits/rejected": -0.7245452404022217, "logps/chosen": -58.651451110839844, "logps/rejected": -100.34190368652344, "loss": 1.8722, "rewards/accuracies": 0.0, "rewards/chosen": 0.765289306640625, "rewards/margins": -1.3472344875335693, "rewards/rejected": 2.1125237941741943, "step": 2452 }, { "epoch": 0.4, "learning_rate": 9.998810135399545e-07, "logits/chosen": -0.6883143782615662, "logits/rejected": -0.562526524066925, "logps/chosen": -131.01402282714844, "logps/rejected": -197.968017578125, "loss": 2.2589, "rewards/accuracies": 0.0, "rewards/chosen": 1.5577255487442017, "rewards/margins": -4.43679666519165, "rewards/rejected": 5.9945220947265625, "step": 2453 }, { "epoch": 0.4, "learning_rate": 9.998781292415703e-07, "logits/chosen": -0.5432900190353394, "logits/rejected": -0.48790284991264343, "logps/chosen": -78.13367462158203, "logps/rejected": -20.002853393554688, "loss": 0.6409, "rewards/accuracies": 0.0, "rewards/chosen": 0.06667480617761612, "rewards/margins": -0.17242583632469177, "rewards/rejected": 0.2391006499528885, "step": 2454 }, { "epoch": 0.4, "learning_rate": 9.998752104063374e-07, "logits/chosen": -0.2787163555622101, "logits/rejected": -0.2798367738723755, "logps/chosen": -118.9269790649414, "logps/rejected": -53.30326843261719, "loss": 0.9156, "rewards/accuracies": 0.0, "rewards/chosen": 1.161263346672058, "rewards/margins": -0.4265235662460327, "rewards/rejected": 1.5877869129180908, "step": 2455 }, { "epoch": 0.4, "learning_rate": 9.998722570344573e-07, "logits/chosen": -0.7666317820549011, "logits/rejected": -0.658199667930603, "logps/chosen": -199.90567016601562, "logps/rejected": -67.39219665527344, "loss": 0.3123, "rewards/accuracies": 1.0, "rewards/chosen": 3.174755811691284, "rewards/margins": 2.5303258895874023, "rewards/rejected": 0.6444298028945923, "step": 2456 }, { "epoch": 0.4, "learning_rate": 9.998692691261342e-07, "logits/chosen": -0.8196697235107422, "logits/rejected": -0.8196697235107422, "logps/chosen": -54.75886917114258, "logps/rejected": -54.75886917114258, "loss": 0.9367, "rewards/accuracies": 0.0, "rewards/chosen": 1.964049220085144, "rewards/margins": 0.0, "rewards/rejected": 1.964049220085144, "step": 2457 }, { "epoch": 0.4, "learning_rate": 9.99866246681574e-07, "logits/chosen": -0.4827984571456909, "logits/rejected": -0.4827984571456909, "logps/chosen": -59.77373504638672, "logps/rejected": -59.77373504638672, "loss": 0.755, "rewards/accuracies": 0.0, "rewards/chosen": 2.8907463550567627, "rewards/margins": 0.0, "rewards/rejected": 2.8907463550567627, "step": 2458 }, { "epoch": 0.4, "learning_rate": 9.998631897009864e-07, "logits/chosen": -0.219567209482193, "logits/rejected": -0.219567209482193, "logps/chosen": -22.357328414916992, "logps/rejected": -22.357328414916992, "loss": 0.3544, "rewards/accuracies": 0.0, "rewards/chosen": 0.8338155746459961, "rewards/margins": 0.0, "rewards/rejected": 0.8338155746459961, "step": 2459 }, { "epoch": 0.4, "learning_rate": 9.998600981845819e-07, "logits/chosen": -0.5986575484275818, "logits/rejected": -0.5311863422393799, "logps/chosen": -48.03550338745117, "logps/rejected": -104.2333755493164, "loss": 1.1532, "rewards/accuracies": 0.0, "rewards/chosen": 1.3542728424072266, "rewards/margins": -0.1713581085205078, "rewards/rejected": 1.5256309509277344, "step": 2460 }, { "epoch": 0.4, "learning_rate": 9.998569721325746e-07, "logits/chosen": -0.5695517063140869, "logits/rejected": -0.4878287613391876, "logps/chosen": -135.74754333496094, "logps/rejected": -108.90424346923828, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": 3.0132827758789062, "rewards/margins": 0.8204612731933594, "rewards/rejected": 2.192821502685547, "step": 2461 }, { "epoch": 0.4, "learning_rate": 9.998538115451798e-07, "logits/chosen": -0.6698579788208008, "logits/rejected": -0.6327229142189026, "logps/chosen": -61.76777648925781, "logps/rejected": -79.38682556152344, "loss": 0.7055, "rewards/accuracies": 0.0, "rewards/chosen": 0.5910278558731079, "rewards/margins": -1.0983054637908936, "rewards/rejected": 1.6893333196640015, "step": 2462 }, { "epoch": 0.4, "learning_rate": 9.998506164226167e-07, "logits/chosen": -0.6622394323348999, "logits/rejected": -0.6142304539680481, "logps/chosen": -103.27037811279297, "logps/rejected": -98.86677551269531, "loss": 0.8255, "rewards/accuracies": 0.0, "rewards/chosen": 2.0795669555664062, "rewards/margins": -0.27391886711120605, "rewards/rejected": 2.3534858226776123, "step": 2463 }, { "epoch": 0.4, "learning_rate": 9.998473867651053e-07, "logits/chosen": -0.5618029832839966, "logits/rejected": -0.5333448648452759, "logps/chosen": -46.86659622192383, "logps/rejected": -21.752262115478516, "loss": 1.4146, "rewards/accuracies": 1.0, "rewards/chosen": 0.5152149200439453, "rewards/margins": 0.3958992063999176, "rewards/rejected": 0.1193157210946083, "step": 2464 }, { "epoch": 0.4, "learning_rate": 9.99844122572869e-07, "logits/chosen": -0.36826398968696594, "logits/rejected": -0.2877666652202606, "logps/chosen": -92.24639892578125, "logps/rejected": -22.50690269470215, "loss": 1.446, "rewards/accuracies": 1.0, "rewards/chosen": 0.5296371579170227, "rewards/margins": 0.14309978485107422, "rewards/rejected": 0.3865373730659485, "step": 2465 }, { "epoch": 0.4, "learning_rate": 9.998408238461337e-07, "logits/chosen": -0.4623836874961853, "logits/rejected": -0.4270251393318176, "logps/chosen": -54.42992401123047, "logps/rejected": -93.42521667480469, "loss": 2.1617, "rewards/accuracies": 1.0, "rewards/chosen": 0.8057525753974915, "rewards/margins": 0.3395332396030426, "rewards/rejected": 0.46621933579444885, "step": 2466 }, { "epoch": 0.4, "learning_rate": 9.99837490585127e-07, "logits/chosen": -0.42992645502090454, "logits/rejected": -0.3153875172138214, "logps/chosen": -71.69771575927734, "logps/rejected": -68.25675964355469, "loss": 0.7834, "rewards/accuracies": 1.0, "rewards/chosen": 1.4467995166778564, "rewards/margins": 0.3302849531173706, "rewards/rejected": 1.1165145635604858, "step": 2467 }, { "epoch": 0.4, "learning_rate": 9.998341227900791e-07, "logits/chosen": -0.7161805629730225, "logits/rejected": -0.6486898064613342, "logps/chosen": -54.59137725830078, "logps/rejected": -66.80497741699219, "loss": 0.9626, "rewards/accuracies": 1.0, "rewards/chosen": 0.6477615237236023, "rewards/margins": 0.20881041884422302, "rewards/rejected": 0.4389511048793793, "step": 2468 }, { "epoch": 0.4, "learning_rate": 9.998307204612228e-07, "logits/chosen": -0.6230853796005249, "logits/rejected": -0.29229751229286194, "logps/chosen": -250.4435577392578, "logps/rejected": -83.86921691894531, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 2.894667148590088, "rewards/margins": 0.46584320068359375, "rewards/rejected": 2.428823947906494, "step": 2469 }, { "epoch": 0.4, "learning_rate": 9.998272835987931e-07, "logits/chosen": -0.5158162117004395, "logits/rejected": -0.5136814117431641, "logps/chosen": -5.718175888061523, "logps/rejected": -23.2012939453125, "loss": 0.9607, "rewards/accuracies": 0.0, "rewards/chosen": 0.2376812994480133, "rewards/margins": -0.0168398916721344, "rewards/rejected": 0.2545211911201477, "step": 2470 }, { "epoch": 0.4, "learning_rate": 9.998238122030276e-07, "logits/chosen": -0.7118552923202515, "logits/rejected": -0.6769542098045349, "logps/chosen": -146.45803833007812, "logps/rejected": -40.264610290527344, "loss": 1.0947, "rewards/accuracies": 0.0, "rewards/chosen": 0.360952764749527, "rewards/margins": -1.2782471179962158, "rewards/rejected": 1.6391998529434204, "step": 2471 }, { "epoch": 0.4, "learning_rate": 9.998203062741658e-07, "logits/chosen": -0.12818697094917297, "logits/rejected": -0.0894668772816658, "logps/chosen": -157.42825317382812, "logps/rejected": -129.07083129882812, "loss": 1.0897, "rewards/accuracies": 1.0, "rewards/chosen": 3.4475371837615967, "rewards/margins": 0.09222245216369629, "rewards/rejected": 3.3553147315979004, "step": 2472 }, { "epoch": 0.4, "learning_rate": 9.998167658124506e-07, "logits/chosen": -0.6150591969490051, "logits/rejected": -0.6428895592689514, "logps/chosen": -13.000605583190918, "logps/rejected": -66.08013916015625, "loss": 0.3455, "rewards/accuracies": 1.0, "rewards/chosen": 0.3210562765598297, "rewards/margins": 0.3220946490764618, "rewards/rejected": -0.0010383606422692537, "step": 2473 }, { "epoch": 0.4, "learning_rate": 9.99813190818126e-07, "logits/chosen": -0.27099618315696716, "logits/rejected": -0.2631925046443939, "logps/chosen": -13.723787307739258, "logps/rejected": -5.816322326660156, "loss": 0.5833, "rewards/accuracies": 0.0, "rewards/chosen": 0.24788112938404083, "rewards/margins": -0.11873503029346466, "rewards/rejected": 0.3666161596775055, "step": 2474 }, { "epoch": 0.4, "learning_rate": 9.998095812914391e-07, "logits/chosen": -0.2394641935825348, "logits/rejected": -0.258326917886734, "logps/chosen": -3.389922857284546, "logps/rejected": -3.5482521057128906, "loss": 0.4011, "rewards/accuracies": 0.0, "rewards/chosen": 0.11836846172809601, "rewards/margins": -0.04126623272895813, "rewards/rejected": 0.15963469445705414, "step": 2475 }, { "epoch": 0.4, "learning_rate": 9.998059372326395e-07, "logits/chosen": -1.0566911697387695, "logits/rejected": -1.038604974746704, "logps/chosen": -132.955810546875, "logps/rejected": -69.36227416992188, "loss": 0.7658, "rewards/accuracies": 0.0, "rewards/chosen": 0.5479339957237244, "rewards/margins": -1.2092010974884033, "rewards/rejected": 1.757135033607483, "step": 2476 }, { "epoch": 0.4, "learning_rate": 9.998022586419788e-07, "logits/chosen": -0.6218046545982361, "logits/rejected": -0.6128737926483154, "logps/chosen": -90.24169158935547, "logps/rejected": -101.00164794921875, "loss": 0.6325, "rewards/accuracies": 1.0, "rewards/chosen": 1.114148736000061, "rewards/margins": 0.8546119928359985, "rewards/rejected": 0.2595367431640625, "step": 2477 }, { "epoch": 0.4, "learning_rate": 9.997985455197113e-07, "logits/chosen": -0.8083136081695557, "logits/rejected": -0.8421815633773804, "logps/chosen": -224.10923767089844, "logps/rejected": -142.84860229492188, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 3.321589708328247, "rewards/margins": -0.9916398525238037, "rewards/rejected": 4.313229560852051, "step": 2478 }, { "epoch": 0.4, "learning_rate": 9.997947978660933e-07, "logits/chosen": -0.9572561383247375, "logits/rejected": -0.919899046421051, "logps/chosen": -273.6340637207031, "logps/rejected": -152.9926300048828, "loss": 1.0878, "rewards/accuracies": 0.0, "rewards/chosen": 3.694894552230835, "rewards/margins": -1.7307417392730713, "rewards/rejected": 5.425636291503906, "step": 2479 }, { "epoch": 0.4, "learning_rate": 9.997910156813839e-07, "logits/chosen": -0.9950443506240845, "logits/rejected": -0.9469634294509888, "logps/chosen": -136.07925415039062, "logps/rejected": -86.51467895507812, "loss": 1.0395, "rewards/accuracies": 0.0, "rewards/chosen": 0.8017776608467102, "rewards/margins": -1.7337327003479004, "rewards/rejected": 2.535510301589966, "step": 2480 }, { "epoch": 0.4, "learning_rate": 9.997871989658444e-07, "logits/chosen": -0.46787187457084656, "logits/rejected": -0.43795517086982727, "logps/chosen": -114.04878997802734, "logps/rejected": -71.9876480102539, "loss": 1.1075, "rewards/accuracies": 0.0, "rewards/chosen": 1.3177627325057983, "rewards/margins": -0.5361946821212769, "rewards/rejected": 1.8539574146270752, "step": 2481 }, { "epoch": 0.4, "learning_rate": 9.997833477197385e-07, "logits/chosen": -0.3969053626060486, "logits/rejected": -0.4092806875705719, "logps/chosen": -108.80033874511719, "logps/rejected": -62.113037109375, "loss": 1.1953, "rewards/accuracies": 0.0, "rewards/chosen": 0.43844375014305115, "rewards/margins": -0.7531654834747314, "rewards/rejected": 1.191609263420105, "step": 2482 }, { "epoch": 0.4, "learning_rate": 9.997794619433322e-07, "logits/chosen": -0.7233766317367554, "logits/rejected": -0.6812764406204224, "logps/chosen": -146.42840576171875, "logps/rejected": -110.43914031982422, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 0.42969971895217896, "rewards/margins": 0.4013633728027344, "rewards/rejected": 0.028336334973573685, "step": 2483 }, { "epoch": 0.4, "learning_rate": 9.99775541636894e-07, "logits/chosen": -0.9054561853408813, "logits/rejected": -0.9124044179916382, "logps/chosen": -108.3037109375, "logps/rejected": -20.54182243347168, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": 3.4681670665740967, "rewards/margins": 3.1030972003936768, "rewards/rejected": 0.36506977677345276, "step": 2484 }, { "epoch": 0.4, "learning_rate": 9.997715868006952e-07, "logits/chosen": -0.5397246479988098, "logits/rejected": -0.5804736614227295, "logps/chosen": -86.66997528076172, "logps/rejected": -85.12567138671875, "loss": 1.1163, "rewards/accuracies": 0.0, "rewards/chosen": 1.9970992803573608, "rewards/margins": -0.8440459966659546, "rewards/rejected": 2.8411452770233154, "step": 2485 }, { "epoch": 0.4, "learning_rate": 9.997675974350082e-07, "logits/chosen": -0.555403470993042, "logits/rejected": -0.617423415184021, "logps/chosen": -92.57962799072266, "logps/rejected": -139.68319702148438, "loss": 1.5336, "rewards/accuracies": 0.0, "rewards/chosen": 1.8066139221191406, "rewards/margins": -1.2810204029083252, "rewards/rejected": 3.087634325027466, "step": 2486 }, { "epoch": 0.4, "learning_rate": 9.997635735401091e-07, "logits/chosen": -0.08908844739198685, "logits/rejected": -0.04880562052130699, "logps/chosen": -103.57463073730469, "logps/rejected": -46.160789489746094, "loss": 1.0466, "rewards/accuracies": 0.0, "rewards/chosen": 0.8003601431846619, "rewards/margins": -0.23198622465133667, "rewards/rejected": 1.0323463678359985, "step": 2487 }, { "epoch": 0.4, "learning_rate": 9.99759515116276e-07, "logits/chosen": -0.2468324899673462, "logits/rejected": -0.2468324899673462, "logps/chosen": -49.691802978515625, "logps/rejected": -49.691802978515625, "loss": 1.1075, "rewards/accuracies": 0.0, "rewards/chosen": 0.7645401358604431, "rewards/margins": 0.0, "rewards/rejected": 0.7645401358604431, "step": 2488 }, { "epoch": 0.4, "learning_rate": 9.99755422163789e-07, "logits/chosen": -0.48954159021377563, "logits/rejected": -0.4637683928012848, "logps/chosen": -67.61267852783203, "logps/rejected": -8.792508125305176, "loss": 0.8608, "rewards/accuracies": 0.0, "rewards/chosen": 0.06469803303480148, "rewards/margins": -0.11196241527795792, "rewards/rejected": 0.1766604483127594, "step": 2489 }, { "epoch": 0.4, "learning_rate": 9.997512946829312e-07, "logits/chosen": -0.5543050765991211, "logits/rejected": -0.4811520278453827, "logps/chosen": -63.3771858215332, "logps/rejected": -23.000951766967773, "loss": 0.7724, "rewards/accuracies": 0.0, "rewards/chosen": -0.05017585679888725, "rewards/margins": -0.5485498309135437, "rewards/rejected": 0.49837398529052734, "step": 2490 }, { "epoch": 0.4, "learning_rate": 9.997471326739877e-07, "logits/chosen": -0.6201146841049194, "logits/rejected": -0.6156560778617859, "logps/chosen": -144.5955352783203, "logps/rejected": -111.98898315429688, "loss": 1.1224, "rewards/accuracies": 0.0, "rewards/chosen": -0.7752487063407898, "rewards/margins": -0.8674255013465881, "rewards/rejected": 0.09217681735754013, "step": 2491 }, { "epoch": 0.4, "learning_rate": 9.997429361372458e-07, "logits/chosen": -0.5977287888526917, "logits/rejected": -0.709860622882843, "logps/chosen": -153.97781372070312, "logps/rejected": -117.509765625, "loss": 1.3227, "rewards/accuracies": 0.0, "rewards/chosen": 0.5230728387832642, "rewards/margins": -2.3731751441955566, "rewards/rejected": 2.8962478637695312, "step": 2492 }, { "epoch": 0.4, "learning_rate": 9.997387050729956e-07, "logits/chosen": -0.7399746179580688, "logits/rejected": -0.6704525947570801, "logps/chosen": -60.295440673828125, "logps/rejected": -72.93154907226562, "loss": 0.9248, "rewards/accuracies": 0.0, "rewards/chosen": 1.6235703229904175, "rewards/margins": -1.4681426286697388, "rewards/rejected": 3.0917129516601562, "step": 2493 }, { "epoch": 0.4, "learning_rate": 9.997344394815298e-07, "logits/chosen": -0.15552057325839996, "logits/rejected": -0.10910677164793015, "logps/chosen": -49.314048767089844, "logps/rejected": -70.43589782714844, "loss": 2.1394, "rewards/accuracies": 0.0, "rewards/chosen": 0.6801460385322571, "rewards/margins": -1.5472595691680908, "rewards/rejected": 2.227405548095703, "step": 2494 }, { "epoch": 0.4, "learning_rate": 9.997301393631426e-07, "logits/chosen": -0.44860756397247314, "logits/rejected": -0.446091890335083, "logps/chosen": -199.8402557373047, "logps/rejected": -135.45343017578125, "loss": 0.7096, "rewards/accuracies": 0.0, "rewards/chosen": 3.884822130203247, "rewards/margins": -0.3346407413482666, "rewards/rejected": 4.219462871551514, "step": 2495 }, { "epoch": 0.41, "learning_rate": 9.997258047181312e-07, "logits/chosen": -1.0499852895736694, "logits/rejected": -1.0136533975601196, "logps/chosen": -69.12457275390625, "logps/rejected": -44.88941192626953, "loss": 0.9192, "rewards/accuracies": 0.0, "rewards/chosen": 1.9676445722579956, "rewards/margins": -1.2132774591445923, "rewards/rejected": 3.180922031402588, "step": 2496 }, { "epoch": 0.41, "learning_rate": 9.997214355467952e-07, "logits/chosen": -0.8727954030036926, "logits/rejected": -0.7831642031669617, "logps/chosen": -163.67910766601562, "logps/rejected": -107.42872619628906, "loss": 0.2917, "rewards/accuracies": 1.0, "rewards/chosen": 3.4104676246643066, "rewards/margins": 0.3030686378479004, "rewards/rejected": 3.1073989868164062, "step": 2497 }, { "epoch": 0.41, "learning_rate": 9.997170318494362e-07, "logits/chosen": -0.6807976961135864, "logits/rejected": -0.6493552923202515, "logps/chosen": -115.77335357666016, "logps/rejected": -117.43798828125, "loss": 0.9072, "rewards/accuracies": 1.0, "rewards/chosen": 2.6014244556427, "rewards/margins": 0.08667683601379395, "rewards/rejected": 2.5147476196289062, "step": 2498 }, { "epoch": 0.41, "learning_rate": 9.997125936263588e-07, "logits/chosen": -0.3289891183376312, "logits/rejected": -0.318146675825119, "logps/chosen": -111.35267639160156, "logps/rejected": -169.18251037597656, "loss": 1.6562, "rewards/accuracies": 0.0, "rewards/chosen": 2.471661329269409, "rewards/margins": -2.866267442703247, "rewards/rejected": 5.337928771972656, "step": 2499 }, { "epoch": 0.41, "learning_rate": 9.997081208778696e-07, "logits/chosen": -0.41993844509124756, "logits/rejected": -0.4143385887145996, "logps/chosen": -129.2747802734375, "logps/rejected": -42.887725830078125, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.5942276120185852, "rewards/margins": 0.04866635799407959, "rewards/rejected": 0.5455612540245056, "step": 2500 }, { "epoch": 0.41, "learning_rate": 9.997036136042773e-07, "logits/chosen": -0.22942431271076202, "logits/rejected": -0.2448979914188385, "logps/chosen": -106.08409118652344, "logps/rejected": -48.69706726074219, "loss": 1.0041, "rewards/accuracies": 0.0, "rewards/chosen": 0.46696779131889343, "rewards/margins": -1.083696722984314, "rewards/rejected": 1.5506645441055298, "step": 2501 }, { "epoch": 0.41, "learning_rate": 9.996990718058937e-07, "logits/chosen": -0.41820523142814636, "logits/rejected": -0.41820523142814636, "logps/chosen": -1.7719184160232544, "logps/rejected": -1.7719184160232544, "loss": 0.548, "rewards/accuracies": 0.0, "rewards/chosen": 0.2388087809085846, "rewards/margins": 0.0, "rewards/rejected": 0.2388087809085846, "step": 2502 }, { "epoch": 0.41, "learning_rate": 9.996944954830324e-07, "logits/chosen": -0.24164873361587524, "logits/rejected": -0.23737527430057526, "logps/chosen": -5.318913459777832, "logps/rejected": -9.516068458557129, "loss": 0.8532, "rewards/accuracies": 1.0, "rewards/chosen": 0.25258809328079224, "rewards/margins": 0.2431730329990387, "rewards/rejected": 0.009415054693818092, "step": 2503 }, { "epoch": 0.41, "learning_rate": 9.996898846360098e-07, "logits/chosen": -0.8590677380561829, "logits/rejected": -0.6139355897903442, "logps/chosen": -110.17271423339844, "logps/rejected": -21.11516571044922, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 4.265285015106201, "rewards/margins": 3.951775312423706, "rewards/rejected": 0.3135097622871399, "step": 2504 }, { "epoch": 0.41, "learning_rate": 9.99685239265144e-07, "logits/chosen": -0.5972546339035034, "logits/rejected": -0.5361295342445374, "logps/chosen": -59.287864685058594, "logps/rejected": -47.60103988647461, "loss": 0.7123, "rewards/accuracies": 1.0, "rewards/chosen": 0.3851455748081207, "rewards/margins": 0.07315826416015625, "rewards/rejected": 0.3119873106479645, "step": 2505 }, { "epoch": 0.41, "learning_rate": 9.996805593707565e-07, "logits/chosen": -0.3450838625431061, "logits/rejected": -0.3620058000087738, "logps/chosen": -92.35440826416016, "logps/rejected": -90.50144958496094, "loss": 1.2019, "rewards/accuracies": 0.0, "rewards/chosen": 0.2714393734931946, "rewards/margins": -0.07090833783149719, "rewards/rejected": 0.3423477113246918, "step": 2506 }, { "epoch": 0.41, "learning_rate": 9.996758449531702e-07, "logits/chosen": -0.7519896030426025, "logits/rejected": -0.7266903519630432, "logps/chosen": -81.84395599365234, "logps/rejected": -122.8351058959961, "loss": 1.1768, "rewards/accuracies": 0.0, "rewards/chosen": 0.7321648001670837, "rewards/margins": -1.5533232688903809, "rewards/rejected": 2.2854881286621094, "step": 2507 }, { "epoch": 0.41, "learning_rate": 9.99671096012711e-07, "logits/chosen": -0.6216642260551453, "logits/rejected": -0.6216642260551453, "logps/chosen": -36.53257751464844, "logps/rejected": -36.53257751464844, "loss": 0.7706, "rewards/accuracies": 0.0, "rewards/chosen": 1.1719902753829956, "rewards/margins": 0.0, "rewards/rejected": 1.1719902753829956, "step": 2508 }, { "epoch": 0.41, "learning_rate": 9.99666312549707e-07, "logits/chosen": -0.11325464397668839, "logits/rejected": -0.10497705638408661, "logps/chosen": -2.3559811115264893, "logps/rejected": -12.733393669128418, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.18996083736419678, "rewards/margins": 0.018204465508461, "rewards/rejected": 0.17175637185573578, "step": 2509 }, { "epoch": 0.41, "learning_rate": 9.996614945644886e-07, "logits/chosen": -0.32378166913986206, "logits/rejected": -0.36241647601127625, "logps/chosen": -85.63697814941406, "logps/rejected": -74.85554504394531, "loss": 1.3033, "rewards/accuracies": 0.0, "rewards/chosen": 1.1801636219024658, "rewards/margins": -0.863196611404419, "rewards/rejected": 2.0433602333068848, "step": 2510 }, { "epoch": 0.41, "learning_rate": 9.996566420573888e-07, "logits/chosen": -0.12186224013566971, "logits/rejected": -0.12186224013566971, "logps/chosen": -57.57508087158203, "logps/rejected": -57.57508087158203, "loss": 0.4295, "rewards/accuracies": 0.0, "rewards/chosen": 0.7483406066894531, "rewards/margins": 0.0, "rewards/rejected": 0.7483406066894531, "step": 2511 }, { "epoch": 0.41, "learning_rate": 9.996517550287432e-07, "logits/chosen": -0.5947438478469849, "logits/rejected": -0.6917694807052612, "logps/chosen": -112.77032470703125, "logps/rejected": -133.88363647460938, "loss": 1.3118, "rewards/accuracies": 0.0, "rewards/chosen": 1.5860992670059204, "rewards/margins": -2.437971591949463, "rewards/rejected": 4.024070739746094, "step": 2512 }, { "epoch": 0.41, "learning_rate": 9.996468334788886e-07, "logits/chosen": -0.524929404258728, "logits/rejected": -0.5481234192848206, "logps/chosen": -46.142093658447266, "logps/rejected": -66.05598449707031, "loss": 0.3986, "rewards/accuracies": 1.0, "rewards/chosen": 1.1177936792373657, "rewards/margins": 0.13030433654785156, "rewards/rejected": 0.9874893426895142, "step": 2513 }, { "epoch": 0.41, "learning_rate": 9.996418774081656e-07, "logits/chosen": -0.507135808467865, "logits/rejected": -0.5401145815849304, "logps/chosen": -115.05126953125, "logps/rejected": -129.22808837890625, "loss": 2.1692, "rewards/accuracies": 0.0, "rewards/chosen": 0.4778884947299957, "rewards/margins": -1.4325271844863892, "rewards/rejected": 1.9104156494140625, "step": 2514 }, { "epoch": 0.41, "learning_rate": 9.996368868169168e-07, "logits/chosen": -0.46376362442970276, "logits/rejected": -0.5106489658355713, "logps/chosen": -28.75508689880371, "logps/rejected": -49.36415100097656, "loss": 1.1637, "rewards/accuracies": 0.0, "rewards/chosen": 1.17831552028656, "rewards/margins": -1.1506198644638062, "rewards/rejected": 2.328935384750366, "step": 2515 }, { "epoch": 0.41, "learning_rate": 9.996318617054863e-07, "logits/chosen": -0.6514666080474854, "logits/rejected": -0.7248687148094177, "logps/chosen": -87.2753677368164, "logps/rejected": -142.4436798095703, "loss": 1.5552, "rewards/accuracies": 0.0, "rewards/chosen": 1.0186713933944702, "rewards/margins": -1.5312446355819702, "rewards/rejected": 2.5499160289764404, "step": 2516 }, { "epoch": 0.41, "learning_rate": 9.99626802074222e-07, "logits/chosen": -0.11489446461200714, "logits/rejected": -0.1148085743188858, "logps/chosen": -5.207259178161621, "logps/rejected": -8.104574203491211, "loss": 1.1903, "rewards/accuracies": 1.0, "rewards/chosen": 0.15335659682750702, "rewards/margins": 0.2207791805267334, "rewards/rejected": -0.06742258369922638, "step": 2517 }, { "epoch": 0.41, "learning_rate": 9.996217079234732e-07, "logits/chosen": -0.45454758405685425, "logits/rejected": -0.34533238410949707, "logps/chosen": -56.738487243652344, "logps/rejected": -52.62498092651367, "loss": 0.6167, "rewards/accuracies": 1.0, "rewards/chosen": 2.24350905418396, "rewards/margins": 0.33346378803253174, "rewards/rejected": 1.9100452661514282, "step": 2518 }, { "epoch": 0.41, "learning_rate": 9.996165792535917e-07, "logits/chosen": -0.4892992079257965, "logits/rejected": -0.4294036328792572, "logps/chosen": -87.40226745605469, "logps/rejected": -55.58162307739258, "loss": 0.798, "rewards/accuracies": 0.0, "rewards/chosen": 0.6169463992118835, "rewards/margins": -0.38305479288101196, "rewards/rejected": 1.0000011920928955, "step": 2519 }, { "epoch": 0.41, "learning_rate": 9.996114160649323e-07, "logits/chosen": -0.539084255695343, "logits/rejected": -0.5681896209716797, "logps/chosen": -80.5226058959961, "logps/rejected": -61.05561447143555, "loss": 0.9496, "rewards/accuracies": 0.0, "rewards/chosen": 1.560773491859436, "rewards/margins": -0.16853678226470947, "rewards/rejected": 1.7293102741241455, "step": 2520 }, { "epoch": 0.41, "learning_rate": 9.996062183578511e-07, "logits/chosen": -0.3197757601737976, "logits/rejected": -0.3094295263290405, "logps/chosen": -25.663942337036133, "logps/rejected": -11.18736743927002, "loss": 1.0716, "rewards/accuracies": 1.0, "rewards/chosen": 0.3519231975078583, "rewards/margins": 0.008094310760498047, "rewards/rejected": 0.34382888674736023, "step": 2521 }, { "epoch": 0.41, "learning_rate": 9.996009861327075e-07, "logits/chosen": -0.758029043674469, "logits/rejected": -0.7929846048355103, "logps/chosen": -58.90851974487305, "logps/rejected": -75.75670623779297, "loss": 0.6235, "rewards/accuracies": 1.0, "rewards/chosen": 1.70941960811615, "rewards/margins": 0.1660480499267578, "rewards/rejected": 1.543371558189392, "step": 2522 }, { "epoch": 0.41, "learning_rate": 9.995957193898632e-07, "logits/chosen": -0.8328400254249573, "logits/rejected": -0.741298496723175, "logps/chosen": -229.72142028808594, "logps/rejected": -94.318359375, "loss": 0.1555, "rewards/accuracies": 1.0, "rewards/chosen": 4.538069248199463, "rewards/margins": 1.0598008632659912, "rewards/rejected": 3.4782683849334717, "step": 2523 }, { "epoch": 0.41, "learning_rate": 9.99590418129682e-07, "logits/chosen": -0.7228428721427917, "logits/rejected": -0.8228554725646973, "logps/chosen": -156.86996459960938, "logps/rejected": -183.31704711914062, "loss": 1.8216, "rewards/accuracies": 0.0, "rewards/chosen": 3.3890349864959717, "rewards/margins": -1.0450165271759033, "rewards/rejected": 4.434051513671875, "step": 2524 }, { "epoch": 0.41, "learning_rate": 9.995850823525298e-07, "logits/chosen": -0.3923063576221466, "logits/rejected": -0.351136714220047, "logps/chosen": -56.179931640625, "logps/rejected": -56.917503356933594, "loss": 0.4141, "rewards/accuracies": 0.0, "rewards/chosen": 1.084210991859436, "rewards/margins": -0.04057157039642334, "rewards/rejected": 1.1247825622558594, "step": 2525 }, { "epoch": 0.41, "learning_rate": 9.995797120587756e-07, "logits/chosen": -0.08223845809698105, "logits/rejected": -0.08223845809698105, "logps/chosen": -55.48987579345703, "logps/rejected": -55.48987579345703, "loss": 0.8713, "rewards/accuracies": 0.0, "rewards/chosen": 1.7545921802520752, "rewards/margins": 0.0, "rewards/rejected": 1.7545921802520752, "step": 2526 }, { "epoch": 0.41, "learning_rate": 9.995743072487905e-07, "logits/chosen": -0.6374706029891968, "logits/rejected": -0.6669068932533264, "logps/chosen": -236.71368408203125, "logps/rejected": -175.09259033203125, "loss": 0.736, "rewards/accuracies": 0.0, "rewards/chosen": 3.2841858863830566, "rewards/margins": -0.5716567039489746, "rewards/rejected": 3.8558425903320312, "step": 2527 }, { "epoch": 0.41, "learning_rate": 9.995688679229476e-07, "logits/chosen": -0.28279128670692444, "logits/rejected": -0.08066060394048691, "logps/chosen": -155.50588989257812, "logps/rejected": -77.70811462402344, "loss": 0.1481, "rewards/accuracies": 1.0, "rewards/chosen": 3.8713836669921875, "rewards/margins": 1.1664068698883057, "rewards/rejected": 2.704976797103882, "step": 2528 }, { "epoch": 0.41, "learning_rate": 9.995633940816232e-07, "logits/chosen": -0.9127110838890076, "logits/rejected": -0.8946967720985413, "logps/chosen": -147.8148193359375, "logps/rejected": -112.80265808105469, "loss": 1.0759, "rewards/accuracies": 0.0, "rewards/chosen": 2.0657012462615967, "rewards/margins": -1.7949113845825195, "rewards/rejected": 3.860612630844116, "step": 2529 }, { "epoch": 0.41, "learning_rate": 9.99557885725195e-07, "logits/chosen": -1.2279834747314453, "logits/rejected": -1.205397367477417, "logps/chosen": -99.97361755371094, "logps/rejected": -69.94595336914062, "loss": 0.6488, "rewards/accuracies": 0.0, "rewards/chosen": 1.2160987854003906, "rewards/margins": -0.7355507612228394, "rewards/rejected": 1.95164954662323, "step": 2530 }, { "epoch": 0.41, "learning_rate": 9.995523428540437e-07, "logits/chosen": -0.5780540704727173, "logits/rejected": -0.523472785949707, "logps/chosen": -73.96989440917969, "logps/rejected": -104.06678009033203, "loss": 0.7718, "rewards/accuracies": 0.0, "rewards/chosen": 0.2654678523540497, "rewards/margins": -0.29332807660102844, "rewards/rejected": 0.5587959289550781, "step": 2531 }, { "epoch": 0.41, "learning_rate": 9.995467654685524e-07, "logits/chosen": -0.17450422048568726, "logits/rejected": -0.17450422048568726, "logps/chosen": -73.22537994384766, "logps/rejected": -73.22537994384766, "loss": 0.5963, "rewards/accuracies": 0.0, "rewards/chosen": 0.2146041840314865, "rewards/margins": 0.0, "rewards/rejected": 0.2146041840314865, "step": 2532 }, { "epoch": 0.41, "learning_rate": 9.995411535691062e-07, "logits/chosen": -0.3519587814807892, "logits/rejected": -0.3528701066970825, "logps/chosen": -3.3158481121063232, "logps/rejected": -5.748859882354736, "loss": 1.128, "rewards/accuracies": 0.0, "rewards/chosen": 0.15019504725933075, "rewards/margins": -0.03778122365474701, "rewards/rejected": 0.18797627091407776, "step": 2533 }, { "epoch": 0.41, "learning_rate": 9.995355071560932e-07, "logits/chosen": -0.4823487401008606, "logits/rejected": -0.4042738676071167, "logps/chosen": -62.1952018737793, "logps/rejected": -83.90083312988281, "loss": 1.1815, "rewards/accuracies": 0.0, "rewards/chosen": 1.6071643829345703, "rewards/margins": -0.7297465801239014, "rewards/rejected": 2.3369109630584717, "step": 2534 }, { "epoch": 0.41, "learning_rate": 9.995298262299033e-07, "logits/chosen": -0.5483163595199585, "logits/rejected": -0.35807105898857117, "logps/chosen": -132.85598754882812, "logps/rejected": -57.79667663574219, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": 3.418322801589966, "rewards/margins": 2.4426651000976562, "rewards/rejected": 0.9756576418876648, "step": 2535 }, { "epoch": 0.41, "learning_rate": 9.99524110790929e-07, "logits/chosen": -0.6577491760253906, "logits/rejected": -0.630466878414154, "logps/chosen": -129.51898193359375, "logps/rejected": -21.046344757080078, "loss": 0.1989, "rewards/accuracies": 1.0, "rewards/chosen": 1.9926636219024658, "rewards/margins": 1.6198185682296753, "rewards/rejected": 0.3728450834751129, "step": 2536 }, { "epoch": 0.41, "learning_rate": 9.99518360839565e-07, "logits/chosen": -0.6040841341018677, "logits/rejected": -0.4464294910430908, "logps/chosen": -57.9830207824707, "logps/rejected": -66.28614807128906, "loss": 0.5074, "rewards/accuracies": 0.0, "rewards/chosen": 1.485184907913208, "rewards/margins": -0.19404864311218262, "rewards/rejected": 1.6792335510253906, "step": 2537 }, { "epoch": 0.41, "learning_rate": 9.995125763762088e-07, "logits/chosen": -0.6261929273605347, "logits/rejected": -0.6283764243125916, "logps/chosen": -5.556430816650391, "logps/rejected": -13.017926216125488, "loss": 0.3317, "rewards/accuracies": 1.0, "rewards/chosen": 0.5749756097793579, "rewards/margins": 0.25257161259651184, "rewards/rejected": 0.32240399718284607, "step": 2538 }, { "epoch": 0.41, "learning_rate": 9.9950675740126e-07, "logits/chosen": -0.44591012597084045, "logits/rejected": -0.4495488107204437, "logps/chosen": -16.803264617919922, "logps/rejected": -19.610368728637695, "loss": 0.7861, "rewards/accuracies": 0.0, "rewards/chosen": -0.16412782669067383, "rewards/margins": -0.08523569256067276, "rewards/rejected": -0.07889213413000107, "step": 2539 }, { "epoch": 0.41, "learning_rate": 9.995009039151208e-07, "logits/chosen": -0.5335753560066223, "logits/rejected": -0.5323305726051331, "logps/chosen": -21.594871520996094, "logps/rejected": -0.6138608455657959, "loss": 0.7154, "rewards/accuracies": 0.0, "rewards/chosen": -0.13090267777442932, "rewards/margins": -0.2545931339263916, "rewards/rejected": 0.12369046360254288, "step": 2540 }, { "epoch": 0.41, "learning_rate": 9.994950159181953e-07, "logits/chosen": -0.539027214050293, "logits/rejected": -0.4670673906803131, "logps/chosen": -52.30208206176758, "logps/rejected": -70.66506958007812, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": 2.933488130569458, "rewards/margins": 0.8467891216278076, "rewards/rejected": 2.0866990089416504, "step": 2541 }, { "epoch": 0.41, "learning_rate": 9.994890934108907e-07, "logits/chosen": -0.26434653997421265, "logits/rejected": -0.31398195028305054, "logps/chosen": -41.40382766723633, "logps/rejected": -99.73975372314453, "loss": 0.5596, "rewards/accuracies": 0.0, "rewards/chosen": 1.4370533227920532, "rewards/margins": -0.1800251007080078, "rewards/rejected": 1.617078423500061, "step": 2542 }, { "epoch": 0.41, "learning_rate": 9.994831363936155e-07, "logits/chosen": -0.46367719769477844, "logits/rejected": -0.42475613951683044, "logps/chosen": -183.88424682617188, "logps/rejected": -57.57614517211914, "loss": 0.092, "rewards/accuracies": 1.0, "rewards/chosen": 4.633670330047607, "rewards/margins": 2.1596288681030273, "rewards/rejected": 2.47404146194458, "step": 2543 }, { "epoch": 0.41, "learning_rate": 9.994771448667822e-07, "logits/chosen": -0.6656687259674072, "logits/rejected": -0.7074028849601746, "logps/chosen": -124.50394439697266, "logps/rejected": -95.01176452636719, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": 3.8595986366271973, "rewards/margins": 2.7260582447052, "rewards/rejected": 1.133540391921997, "step": 2544 }, { "epoch": 0.41, "learning_rate": 9.99471118830804e-07, "logits/chosen": -0.782101035118103, "logits/rejected": -0.6429508924484253, "logps/chosen": -83.90777587890625, "logps/rejected": -222.2361602783203, "loss": 2.195, "rewards/accuracies": 0.0, "rewards/chosen": 1.6935561895370483, "rewards/margins": -4.094418525695801, "rewards/rejected": 5.787974834442139, "step": 2545 }, { "epoch": 0.41, "learning_rate": 9.994650582860977e-07, "logits/chosen": -0.35207599401474, "logits/rejected": -0.3876688778400421, "logps/chosen": -248.66973876953125, "logps/rejected": -119.08830261230469, "loss": 1.9825, "rewards/accuracies": 0.0, "rewards/chosen": 2.3244569301605225, "rewards/margins": -2.5651471614837646, "rewards/rejected": 4.889604091644287, "step": 2546 }, { "epoch": 0.41, "learning_rate": 9.994589632330817e-07, "logits/chosen": -0.4666321873664856, "logits/rejected": -0.4552655816078186, "logps/chosen": -32.915035247802734, "logps/rejected": -154.95718383789062, "loss": 1.6476, "rewards/accuracies": 0.0, "rewards/chosen": 2.041243076324463, "rewards/margins": -2.097079277038574, "rewards/rejected": 4.138322353363037, "step": 2547 }, { "epoch": 0.41, "learning_rate": 9.994528336721774e-07, "logits/chosen": -0.4374087452888489, "logits/rejected": -0.24248270690441132, "logps/chosen": -233.80673217773438, "logps/rejected": -52.60896301269531, "loss": 0.2552, "rewards/accuracies": 1.0, "rewards/chosen": 0.5824036002159119, "rewards/margins": 0.5882874131202698, "rewards/rejected": -0.0058837891556322575, "step": 2548 }, { "epoch": 0.41, "learning_rate": 9.994466696038082e-07, "logits/chosen": -0.7174285054206848, "logits/rejected": -0.6046744585037231, "logps/chosen": -151.10293579101562, "logps/rejected": -92.4581298828125, "loss": 0.4291, "rewards/accuracies": 1.0, "rewards/chosen": 3.6797759532928467, "rewards/margins": 0.8941977024078369, "rewards/rejected": 2.7855782508850098, "step": 2549 }, { "epoch": 0.41, "learning_rate": 9.994404710283998e-07, "logits/chosen": -0.8555503487586975, "logits/rejected": -0.5214145183563232, "logps/chosen": -63.06209182739258, "logps/rejected": -83.79234313964844, "loss": 0.9549, "rewards/accuracies": 1.0, "rewards/chosen": 1.6277363300323486, "rewards/margins": 0.4996410608291626, "rewards/rejected": 1.128095269203186, "step": 2550 }, { "epoch": 0.41, "learning_rate": 9.994342379463805e-07, "logits/chosen": -0.7027648687362671, "logits/rejected": -0.6887409687042236, "logps/chosen": -81.18640899658203, "logps/rejected": -10.922380447387695, "loss": 0.9692, "rewards/accuracies": 0.0, "rewards/chosen": 0.4960624873638153, "rewards/margins": -0.659734845161438, "rewards/rejected": 1.1557973623275757, "step": 2551 }, { "epoch": 0.41, "learning_rate": 9.994279703581814e-07, "logits/chosen": -0.30834683775901794, "logits/rejected": -0.3408644497394562, "logps/chosen": -33.447898864746094, "logps/rejected": -96.15480041503906, "loss": 0.6445, "rewards/accuracies": 1.0, "rewards/chosen": 0.42809373140335083, "rewards/margins": 0.19136963784694672, "rewards/rejected": 0.2367240935564041, "step": 2552 }, { "epoch": 0.41, "learning_rate": 9.994216682642348e-07, "logits/chosen": -0.1286814957857132, "logits/rejected": -0.13705216348171234, "logps/chosen": -5.492484092712402, "logps/rejected": -26.00104331970215, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": 0.2876843512058258, "rewards/margins": 0.31601402163505554, "rewards/rejected": -0.02832965925335884, "step": 2553 }, { "epoch": 0.41, "learning_rate": 9.994153316649767e-07, "logits/chosen": -0.21534475684165955, "logits/rejected": -0.15752188861370087, "logps/chosen": -84.70993041992188, "logps/rejected": -52.58439636230469, "loss": 1.0293, "rewards/accuracies": 0.0, "rewards/chosen": 0.3881545960903168, "rewards/margins": -1.4162429571151733, "rewards/rejected": 1.8043975830078125, "step": 2554 }, { "epoch": 0.41, "learning_rate": 9.994089605608447e-07, "logits/chosen": -0.3806747794151306, "logits/rejected": -0.17554830014705658, "logps/chosen": -46.07904815673828, "logps/rejected": -25.838350296020508, "loss": 0.7305, "rewards/accuracies": 1.0, "rewards/chosen": 1.3424278497695923, "rewards/margins": 1.0651741027832031, "rewards/rejected": 0.2772537171840668, "step": 2555 }, { "epoch": 0.41, "learning_rate": 9.99402554952279e-07, "logits/chosen": -0.4843880236148834, "logits/rejected": -0.3887914717197418, "logps/chosen": -123.2712173461914, "logps/rejected": -66.72966766357422, "loss": 0.2444, "rewards/accuracies": 1.0, "rewards/chosen": 3.8205132484436035, "rewards/margins": 2.5452332496643066, "rewards/rejected": 1.2752799987792969, "step": 2556 }, { "epoch": 0.42, "learning_rate": 9.99396114839722e-07, "logits/chosen": -0.5235478281974792, "logits/rejected": -0.4953707158565521, "logps/chosen": -91.93084716796875, "logps/rejected": -69.72966766357422, "loss": 1.1778, "rewards/accuracies": 0.0, "rewards/chosen": 0.9254196286201477, "rewards/margins": -0.3548896908760071, "rewards/rejected": 1.2803093194961548, "step": 2557 }, { "epoch": 0.42, "learning_rate": 9.993896402236188e-07, "logits/chosen": -0.44640833139419556, "logits/rejected": -0.4490627348423004, "logps/chosen": -65.54219055175781, "logps/rejected": -104.22562408447266, "loss": 0.8316, "rewards/accuracies": 0.0, "rewards/chosen": 1.505696177482605, "rewards/margins": -1.1808418035507202, "rewards/rejected": 2.686537981033325, "step": 2558 }, { "epoch": 0.42, "learning_rate": 9.99383131104417e-07, "logits/chosen": -0.7954968214035034, "logits/rejected": -0.7833707928657532, "logps/chosen": -43.79652404785156, "logps/rejected": -75.34776306152344, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 1.472730278968811, "rewards/margins": 0.2684967517852783, "rewards/rejected": 1.2042335271835327, "step": 2559 }, { "epoch": 0.42, "learning_rate": 9.993765874825659e-07, "logits/chosen": -0.4775193929672241, "logits/rejected": -0.37370097637176514, "logps/chosen": -139.97064208984375, "logps/rejected": -116.08512878417969, "loss": 1.3323, "rewards/accuracies": 0.0, "rewards/chosen": 1.059051513671875, "rewards/margins": -2.551387071609497, "rewards/rejected": 3.610438585281372, "step": 2560 }, { "epoch": 0.42, "learning_rate": 9.993700093585176e-07, "logits/chosen": -0.8597632646560669, "logits/rejected": -0.7480381727218628, "logps/chosen": -112.19459533691406, "logps/rejected": -31.490440368652344, "loss": 0.4832, "rewards/accuracies": 0.0, "rewards/chosen": 0.03104095533490181, "rewards/margins": -0.06626205146312714, "rewards/rejected": 0.09730301052331924, "step": 2561 }, { "epoch": 0.42, "learning_rate": 9.993633967327268e-07, "logits/chosen": -0.7638891935348511, "logits/rejected": -0.7972148656845093, "logps/chosen": -115.47180938720703, "logps/rejected": -111.5315170288086, "loss": 1.3077, "rewards/accuracies": 1.0, "rewards/chosen": 4.594192028045654, "rewards/margins": 0.46495819091796875, "rewards/rejected": 4.1292338371276855, "step": 2562 }, { "epoch": 0.42, "learning_rate": 9.993567496056504e-07, "logits/chosen": -0.1692936271429062, "logits/rejected": -0.1708698719739914, "logps/chosen": -6.127778053283691, "logps/rejected": -3.684683322906494, "loss": 0.4599, "rewards/accuracies": 0.0, "rewards/chosen": 0.23809786140918732, "rewards/margins": -0.20124782621860504, "rewards/rejected": 0.43934568762779236, "step": 2563 }, { "epoch": 0.42, "learning_rate": 9.993500679777476e-07, "logits/chosen": -0.4752958118915558, "logits/rejected": -0.4246622323989868, "logps/chosen": -91.3975830078125, "logps/rejected": -116.27637481689453, "loss": 1.0115, "rewards/accuracies": 0.0, "rewards/chosen": 2.158214569091797, "rewards/margins": -1.6183044910430908, "rewards/rejected": 3.7765190601348877, "step": 2564 }, { "epoch": 0.42, "learning_rate": 9.993433518494797e-07, "logits/chosen": -0.29432857036590576, "logits/rejected": -0.29432857036590576, "logps/chosen": -61.541316986083984, "logps/rejected": -61.541316986083984, "loss": 0.3716, "rewards/accuracies": 0.0, "rewards/chosen": -0.12028274685144424, "rewards/margins": 0.0, "rewards/rejected": -0.12028274685144424, "step": 2565 }, { "epoch": 0.42, "learning_rate": 9.993366012213113e-07, "logits/chosen": -0.22999277710914612, "logits/rejected": -0.22337305545806885, "logps/chosen": -16.62569808959961, "logps/rejected": -3.3142619132995605, "loss": 1.1207, "rewards/accuracies": 0.0, "rewards/chosen": 0.10175342857837677, "rewards/margins": -0.1488969475030899, "rewards/rejected": 0.2506503760814667, "step": 2566 }, { "epoch": 0.42, "learning_rate": 9.993298160937084e-07, "logits/chosen": -0.6226707696914673, "logits/rejected": -0.6077061295509338, "logps/chosen": -65.22886657714844, "logps/rejected": -38.13195037841797, "loss": 0.4415, "rewards/accuracies": 1.0, "rewards/chosen": 1.600062608718872, "rewards/margins": 0.1307300329208374, "rewards/rejected": 1.4693325757980347, "step": 2567 }, { "epoch": 0.42, "learning_rate": 9.9932299646714e-07, "logits/chosen": -0.6287553310394287, "logits/rejected": -0.5099707245826721, "logps/chosen": -115.09684753417969, "logps/rejected": -151.35287475585938, "loss": 1.1347, "rewards/accuracies": 0.0, "rewards/chosen": 1.9921127557754517, "rewards/margins": -1.9474838972091675, "rewards/rejected": 3.939596652984619, "step": 2568 }, { "epoch": 0.42, "learning_rate": 9.993161423420772e-07, "logits/chosen": -0.734265148639679, "logits/rejected": -0.7244272828102112, "logps/chosen": -68.86961364746094, "logps/rejected": -51.018646240234375, "loss": 0.5412, "rewards/accuracies": 1.0, "rewards/chosen": 0.6024703979492188, "rewards/margins": 0.04889488220214844, "rewards/rejected": 0.5535755157470703, "step": 2569 }, { "epoch": 0.42, "learning_rate": 9.993092537189934e-07, "logits/chosen": -0.374436616897583, "logits/rejected": -0.374436616897583, "logps/chosen": -100.7231674194336, "logps/rejected": -100.7231674194336, "loss": 1.8153, "rewards/accuracies": 0.0, "rewards/chosen": 0.15837326645851135, "rewards/margins": 0.0, "rewards/rejected": 0.15837326645851135, "step": 2570 }, { "epoch": 0.42, "learning_rate": 9.993023305983647e-07, "logits/chosen": -0.6210753321647644, "logits/rejected": -0.6025398373603821, "logps/chosen": -59.99849319458008, "logps/rejected": -88.63801574707031, "loss": 0.737, "rewards/accuracies": 1.0, "rewards/chosen": 0.2988346219062805, "rewards/margins": 0.5674915313720703, "rewards/rejected": -0.2686569392681122, "step": 2571 }, { "epoch": 0.42, "learning_rate": 9.992953729806694e-07, "logits/chosen": -0.5579719543457031, "logits/rejected": -0.5212980508804321, "logps/chosen": -53.276451110839844, "logps/rejected": -64.6108169555664, "loss": 0.3947, "rewards/accuracies": 1.0, "rewards/chosen": 1.6881256103515625, "rewards/margins": 0.9372299313545227, "rewards/rejected": 0.7508956789970398, "step": 2572 }, { "epoch": 0.42, "learning_rate": 9.992883808663884e-07, "logits/chosen": -0.5987624526023865, "logits/rejected": -0.5435345768928528, "logps/chosen": -196.38897705078125, "logps/rejected": -102.46349334716797, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 4.122097969055176, "rewards/margins": 0.45123839378356934, "rewards/rejected": 3.6708595752716064, "step": 2573 }, { "epoch": 0.42, "learning_rate": 9.992813542560045e-07, "logits/chosen": -0.8548972010612488, "logits/rejected": -1.1886178255081177, "logps/chosen": -105.8587646484375, "logps/rejected": -34.53573226928711, "loss": 0.4332, "rewards/accuracies": 1.0, "rewards/chosen": 1.286798119544983, "rewards/margins": 1.0151218175888062, "rewards/rejected": 0.27167627215385437, "step": 2574 }, { "epoch": 0.42, "learning_rate": 9.992742931500031e-07, "logits/chosen": -0.5148677229881287, "logits/rejected": -0.5148677229881287, "logps/chosen": -67.43756866455078, "logps/rejected": -67.43756866455078, "loss": 0.5474, "rewards/accuracies": 0.0, "rewards/chosen": 2.06193470954895, "rewards/margins": 0.0, "rewards/rejected": 2.06193470954895, "step": 2575 }, { "epoch": 0.42, "learning_rate": 9.992671975488725e-07, "logits/chosen": -0.065755695104599, "logits/rejected": -0.065755695104599, "logps/chosen": -6.369819641113281, "logps/rejected": -6.369819641113281, "loss": 0.7378, "rewards/accuracies": 0.0, "rewards/chosen": 0.6040857434272766, "rewards/margins": 0.0, "rewards/rejected": 0.6040857434272766, "step": 2576 }, { "epoch": 0.42, "learning_rate": 9.992600674531025e-07, "logits/chosen": -0.7845364212989807, "logits/rejected": -0.7795016169548035, "logps/chosen": -68.68583679199219, "logps/rejected": -63.195682525634766, "loss": 0.9093, "rewards/accuracies": 0.0, "rewards/chosen": 1.4835418462753296, "rewards/margins": -0.8235195875167847, "rewards/rejected": 2.3070614337921143, "step": 2577 }, { "epoch": 0.42, "learning_rate": 9.992529028631858e-07, "logits/chosen": -0.4988722503185272, "logits/rejected": -0.470460444688797, "logps/chosen": -44.76908874511719, "logps/rejected": -65.66232299804688, "loss": 0.7959, "rewards/accuracies": 1.0, "rewards/chosen": 1.9750007390975952, "rewards/margins": 0.955528974533081, "rewards/rejected": 1.0194717645645142, "step": 2578 }, { "epoch": 0.42, "learning_rate": 9.992457037796176e-07, "logits/chosen": -0.8386649489402771, "logits/rejected": -0.82477205991745, "logps/chosen": -114.45404052734375, "logps/rejected": -69.67312622070312, "loss": 0.5289, "rewards/accuracies": 0.0, "rewards/chosen": 0.6553635001182556, "rewards/margins": -0.593554675579071, "rewards/rejected": 1.2489181756973267, "step": 2579 }, { "epoch": 0.42, "learning_rate": 9.99238470202895e-07, "logits/chosen": -0.5544690489768982, "logits/rejected": -0.5180509686470032, "logps/chosen": -36.007652282714844, "logps/rejected": -72.15928649902344, "loss": 0.6015, "rewards/accuracies": 0.0, "rewards/chosen": 1.6489852666854858, "rewards/margins": -0.05953836441040039, "rewards/rejected": 1.7085236310958862, "step": 2580 }, { "epoch": 0.42, "learning_rate": 9.99231202133518e-07, "logits/chosen": -0.379322350025177, "logits/rejected": -0.39742082357406616, "logps/chosen": -54.60323715209961, "logps/rejected": -68.70604705810547, "loss": 0.8444, "rewards/accuracies": 0.0, "rewards/chosen": -0.020282363519072533, "rewards/margins": -0.27501487731933594, "rewards/rejected": 0.25473251938819885, "step": 2581 }, { "epoch": 0.42, "learning_rate": 9.99223899571989e-07, "logits/chosen": -0.43590283393859863, "logits/rejected": -0.42047661542892456, "logps/chosen": -43.46632766723633, "logps/rejected": -82.47540283203125, "loss": 1.0103, "rewards/accuracies": 1.0, "rewards/chosen": 1.3316341638565063, "rewards/margins": 0.7712627649307251, "rewards/rejected": 0.5603713989257812, "step": 2582 }, { "epoch": 0.42, "learning_rate": 9.99216562518812e-07, "logits/chosen": -1.2212315797805786, "logits/rejected": -1.178968071937561, "logps/chosen": -67.34893798828125, "logps/rejected": -73.48626708984375, "loss": 1.2312, "rewards/accuracies": 0.0, "rewards/chosen": 1.8225929737091064, "rewards/margins": -1.2690269947052002, "rewards/rejected": 3.0916199684143066, "step": 2583 }, { "epoch": 0.42, "learning_rate": 9.992091909744942e-07, "logits/chosen": -0.4632224440574646, "logits/rejected": -0.46380814909935, "logps/chosen": -106.29463195800781, "logps/rejected": -72.79486083984375, "loss": 0.2881, "rewards/accuracies": 1.0, "rewards/chosen": 1.9460076093673706, "rewards/margins": 0.7736153602600098, "rewards/rejected": 1.1723922491073608, "step": 2584 }, { "epoch": 0.42, "learning_rate": 9.992017849395448e-07, "logits/chosen": -0.9459056854248047, "logits/rejected": -1.042617678642273, "logps/chosen": -212.765380859375, "logps/rejected": -101.69578552246094, "loss": 0.3633, "rewards/accuracies": 1.0, "rewards/chosen": 3.3663010597229004, "rewards/margins": 0.60552978515625, "rewards/rejected": 2.7607712745666504, "step": 2585 }, { "epoch": 0.42, "learning_rate": 9.991943444144756e-07, "logits/chosen": -0.1906842440366745, "logits/rejected": -0.19285570085048676, "logps/chosen": -102.4156494140625, "logps/rejected": -130.41934204101562, "loss": 0.7509, "rewards/accuracies": 1.0, "rewards/chosen": 0.2690780758857727, "rewards/margins": 0.6287063360214233, "rewards/rejected": -0.359628289937973, "step": 2586 }, { "epoch": 0.42, "learning_rate": 9.991868693998006e-07, "logits/chosen": -0.5148348212242126, "logits/rejected": -0.33978354930877686, "logps/chosen": -188.1842041015625, "logps/rejected": -49.40131378173828, "loss": 0.0957, "rewards/accuracies": 1.0, "rewards/chosen": 2.752197265625, "rewards/margins": 1.9015758037567139, "rewards/rejected": 0.8506214022636414, "step": 2587 }, { "epoch": 0.42, "learning_rate": 9.991793598960362e-07, "logits/chosen": -0.5465707182884216, "logits/rejected": -0.5554489493370056, "logps/chosen": -42.785682678222656, "logps/rejected": -11.941193580627441, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.9912250638008118, "rewards/margins": 0.11783343553543091, "rewards/rejected": 0.8733916282653809, "step": 2588 }, { "epoch": 0.42, "learning_rate": 9.991718159037014e-07, "logits/chosen": -0.789864718914032, "logits/rejected": -0.45162320137023926, "logps/chosen": -159.3814697265625, "logps/rejected": -102.69828796386719, "loss": 1.2905, "rewards/accuracies": 0.0, "rewards/chosen": 0.19194336235523224, "rewards/margins": -1.4679107666015625, "rewards/rejected": 1.6598541736602783, "step": 2589 }, { "epoch": 0.42, "learning_rate": 9.991642374233175e-07, "logits/chosen": -0.6804338693618774, "logits/rejected": -0.5299579501152039, "logps/chosen": -174.7103729248047, "logps/rejected": -31.70220375061035, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 3.1264541149139404, "rewards/margins": 2.338303804397583, "rewards/rejected": 0.7881502509117126, "step": 2590 }, { "epoch": 0.42, "learning_rate": 9.991566244554078e-07, "logits/chosen": -0.6349913477897644, "logits/rejected": -0.5967226624488831, "logps/chosen": -43.39291763305664, "logps/rejected": -99.41468811035156, "loss": 1.5143, "rewards/accuracies": 0.0, "rewards/chosen": 2.2378032207489014, "rewards/margins": -0.046744346618652344, "rewards/rejected": 2.2845475673675537, "step": 2591 }, { "epoch": 0.42, "learning_rate": 9.991489770004985e-07, "logits/chosen": -0.8263808488845825, "logits/rejected": -0.7722753286361694, "logps/chosen": -70.46766662597656, "logps/rejected": -52.740089416503906, "loss": 1.358, "rewards/accuracies": 0.0, "rewards/chosen": 0.9349762201309204, "rewards/margins": -1.529739499092102, "rewards/rejected": 2.4647157192230225, "step": 2592 }, { "epoch": 0.42, "learning_rate": 9.991412950591177e-07, "logits/chosen": -0.37500065565109253, "logits/rejected": -0.30236971378326416, "logps/chosen": -65.98284149169922, "logps/rejected": -56.98653030395508, "loss": 0.7509, "rewards/accuracies": 1.0, "rewards/chosen": 1.7074226140975952, "rewards/margins": 1.0264942646026611, "rewards/rejected": 0.6809284090995789, "step": 2593 }, { "epoch": 0.42, "learning_rate": 9.991335786317963e-07, "logits/chosen": -0.4839961528778076, "logits/rejected": -0.5030642151832581, "logps/chosen": -117.43840026855469, "logps/rejected": -70.28010559082031, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": 1.9379256963729858, "rewards/margins": 1.2799103260040283, "rewards/rejected": 0.6580154299736023, "step": 2594 }, { "epoch": 0.42, "learning_rate": 9.991258277190675e-07, "logits/chosen": -0.7808011770248413, "logits/rejected": -0.7881169319152832, "logps/chosen": -65.66015625, "logps/rejected": -88.21884155273438, "loss": 0.4361, "rewards/accuracies": 0.0, "rewards/chosen": 2.3031692504882812, "rewards/margins": -0.28271937370300293, "rewards/rejected": 2.585888624191284, "step": 2595 }, { "epoch": 0.42, "learning_rate": 9.99118042321467e-07, "logits/chosen": -0.4755018651485443, "logits/rejected": -0.48536771535873413, "logps/chosen": -128.31130981445312, "logps/rejected": -38.61772918701172, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 3.428785800933838, "rewards/margins": 1.7732815742492676, "rewards/rejected": 1.6555042266845703, "step": 2596 }, { "epoch": 0.42, "learning_rate": 9.99110222439532e-07, "logits/chosen": -0.3809138536453247, "logits/rejected": -0.44511550664901733, "logps/chosen": -114.2287826538086, "logps/rejected": -110.6429443359375, "loss": 2.5622, "rewards/accuracies": 0.0, "rewards/chosen": 0.23895645141601562, "rewards/margins": -1.4291832447052002, "rewards/rejected": 1.6681396961212158, "step": 2597 }, { "epoch": 0.42, "learning_rate": 9.991023680738037e-07, "logits/chosen": -0.8131583333015442, "logits/rejected": -0.8289511203765869, "logps/chosen": -73.60502624511719, "logps/rejected": -126.13018798828125, "loss": 0.8606, "rewards/accuracies": 1.0, "rewards/chosen": 0.7142982482910156, "rewards/margins": 0.3267371952533722, "rewards/rejected": 0.38756105303764343, "step": 2598 }, { "epoch": 0.42, "learning_rate": 9.990944792248242e-07, "logits/chosen": -0.2315753847360611, "logits/rejected": -0.2504402697086334, "logps/chosen": -67.65113067626953, "logps/rejected": -110.21733856201172, "loss": 1.181, "rewards/accuracies": 0.0, "rewards/chosen": 0.6607750058174133, "rewards/margins": -0.5410469174385071, "rewards/rejected": 1.2018219232559204, "step": 2599 }, { "epoch": 0.42, "learning_rate": 9.990865558931386e-07, "logits/chosen": -0.8025053143501282, "logits/rejected": -0.7991980910301208, "logps/chosen": -98.66958618164062, "logps/rejected": -83.16424560546875, "loss": 1.4102, "rewards/accuracies": 0.0, "rewards/chosen": 1.1448516845703125, "rewards/margins": -1.2215447425842285, "rewards/rejected": 2.366396427154541, "step": 2600 }, { "epoch": 0.42, "learning_rate": 9.990785980792943e-07, "logits/chosen": -0.7354070544242859, "logits/rejected": -0.7651417851448059, "logps/chosen": -119.84954833984375, "logps/rejected": -187.41720581054688, "loss": 1.8954, "rewards/accuracies": 0.0, "rewards/chosen": 0.3111587464809418, "rewards/margins": -3.667071580886841, "rewards/rejected": 3.9782302379608154, "step": 2601 }, { "epoch": 0.42, "learning_rate": 9.990706057838414e-07, "logits/chosen": -0.6384161710739136, "logits/rejected": -0.7220571041107178, "logps/chosen": -132.19891357421875, "logps/rejected": -147.33143615722656, "loss": 0.5944, "rewards/accuracies": 0.0, "rewards/chosen": 3.5670623779296875, "rewards/margins": -0.49049997329711914, "rewards/rejected": 4.057562351226807, "step": 2602 }, { "epoch": 0.42, "learning_rate": 9.99062579007332e-07, "logits/chosen": -0.25329527258872986, "logits/rejected": -0.6256651282310486, "logps/chosen": -117.02437591552734, "logps/rejected": -78.46914672851562, "loss": 0.7097, "rewards/accuracies": 0.0, "rewards/chosen": 1.1789039373397827, "rewards/margins": -0.9053345918655396, "rewards/rejected": 2.0842385292053223, "step": 2603 }, { "epoch": 0.42, "learning_rate": 9.990545177503202e-07, "logits/chosen": -0.4576411247253418, "logits/rejected": -0.44113513827323914, "logps/chosen": -181.72369384765625, "logps/rejected": -127.3072509765625, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 3.0583877563476562, "rewards/margins": 2.5680313110351562, "rewards/rejected": 0.4903564453125, "step": 2604 }, { "epoch": 0.42, "learning_rate": 9.990464220133636e-07, "logits/chosen": -0.6097323894500732, "logits/rejected": -0.5597934722900391, "logps/chosen": -99.7298583984375, "logps/rejected": -99.56490325927734, "loss": 0.4142, "rewards/accuracies": 0.0, "rewards/chosen": 3.0094680786132812, "rewards/margins": -0.12691283226013184, "rewards/rejected": 3.136380910873413, "step": 2605 }, { "epoch": 0.42, "learning_rate": 9.990382917970211e-07, "logits/chosen": -0.8176049590110779, "logits/rejected": -0.9085478782653809, "logps/chosen": -201.77053833007812, "logps/rejected": -87.63981628417969, "loss": 0.298, "rewards/accuracies": 1.0, "rewards/chosen": 3.1839141845703125, "rewards/margins": 0.6937651634216309, "rewards/rejected": 2.4901490211486816, "step": 2606 }, { "epoch": 0.42, "learning_rate": 9.990301271018547e-07, "logits/chosen": -0.3183809518814087, "logits/rejected": -0.35289403796195984, "logps/chosen": -46.82342529296875, "logps/rejected": -117.03764343261719, "loss": 1.37, "rewards/accuracies": 1.0, "rewards/chosen": 0.6419906616210938, "rewards/margins": 1.1499404907226562, "rewards/rejected": -0.5079498291015625, "step": 2607 }, { "epoch": 0.42, "learning_rate": 9.990219279284283e-07, "logits/chosen": -0.6343491673469543, "logits/rejected": -0.44614291191101074, "logps/chosen": -156.68475341796875, "logps/rejected": -111.96041870117188, "loss": 0.7438, "rewards/accuracies": 1.0, "rewards/chosen": 4.205548286437988, "rewards/margins": 1.5664079189300537, "rewards/rejected": 2.6391403675079346, "step": 2608 }, { "epoch": 0.42, "learning_rate": 9.990136942773084e-07, "logits/chosen": -0.7319549322128296, "logits/rejected": -0.7516615390777588, "logps/chosen": -131.3248291015625, "logps/rejected": -92.4065933227539, "loss": 1.1874, "rewards/accuracies": 1.0, "rewards/chosen": 2.4253647327423096, "rewards/margins": 1.046240210533142, "rewards/rejected": 1.3791245222091675, "step": 2609 }, { "epoch": 0.42, "learning_rate": 9.990054261490641e-07, "logits/chosen": -0.6657261848449707, "logits/rejected": -0.6411471962928772, "logps/chosen": -68.57083892822266, "logps/rejected": -71.80506896972656, "loss": 0.7969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8692169189453125, "rewards/margins": -0.33597493171691895, "rewards/rejected": 1.2051918506622314, "step": 2610 }, { "epoch": 0.42, "learning_rate": 9.989971235442665e-07, "logits/chosen": -0.6378761529922485, "logits/rejected": -0.5356360673904419, "logps/chosen": -134.10629272460938, "logps/rejected": -59.012901306152344, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 2.4798614978790283, "rewards/margins": 0.7499946355819702, "rewards/rejected": 1.729866862297058, "step": 2611 }, { "epoch": 0.42, "learning_rate": 9.98988786463489e-07, "logits/chosen": -0.3229212164878845, "logits/rejected": -0.2986471652984619, "logps/chosen": -59.27396011352539, "logps/rejected": -68.44210052490234, "loss": 0.4046, "rewards/accuracies": 1.0, "rewards/chosen": 1.470408320426941, "rewards/margins": 0.08553361892700195, "rewards/rejected": 1.384874701499939, "step": 2612 }, { "epoch": 0.42, "learning_rate": 9.98980414907308e-07, "logits/chosen": -0.4725915193557739, "logits/rejected": -0.5250390768051147, "logps/chosen": -106.76812744140625, "logps/rejected": -62.512115478515625, "loss": 1.3423, "rewards/accuracies": 0.0, "rewards/chosen": 0.5099090933799744, "rewards/margins": -1.5607712268829346, "rewards/rejected": 2.0706803798675537, "step": 2613 }, { "epoch": 0.42, "learning_rate": 9.98972008876302e-07, "logits/chosen": -0.4851262867450714, "logits/rejected": -0.48415595293045044, "logps/chosen": -71.23651123046875, "logps/rejected": -137.7827606201172, "loss": 1.9066, "rewards/accuracies": 0.0, "rewards/chosen": 1.2690681219100952, "rewards/margins": -2.0173044204711914, "rewards/rejected": 3.286372423171997, "step": 2614 }, { "epoch": 0.42, "learning_rate": 9.98963568371051e-07, "logits/chosen": -0.45729756355285645, "logits/rejected": -0.4474465250968933, "logps/chosen": -95.12712097167969, "logps/rejected": -58.690879821777344, "loss": 0.5623, "rewards/accuracies": 0.0, "rewards/chosen": 1.2581863403320312, "rewards/margins": -0.36598432064056396, "rewards/rejected": 1.6241706609725952, "step": 2615 }, { "epoch": 0.42, "learning_rate": 9.98955093392139e-07, "logits/chosen": -0.7335230112075806, "logits/rejected": -0.7335230112075806, "logps/chosen": -15.847020149230957, "logps/rejected": -15.847020149230957, "loss": 0.7557, "rewards/accuracies": 0.0, "rewards/chosen": 1.078304409980774, "rewards/margins": 0.0, "rewards/rejected": 1.078304409980774, "step": 2616 }, { "epoch": 0.42, "learning_rate": 9.98946583940151e-07, "logits/chosen": -0.7344738245010376, "logits/rejected": -0.693510115146637, "logps/chosen": -56.762939453125, "logps/rejected": -45.48318099975586, "loss": 0.3106, "rewards/accuracies": 1.0, "rewards/chosen": 1.599542260169983, "rewards/margins": 0.29982030391693115, "rewards/rejected": 1.2997219562530518, "step": 2617 }, { "epoch": 0.42, "learning_rate": 9.98938040015675e-07, "logits/chosen": -0.6598594188690186, "logits/rejected": -0.6465256810188293, "logps/chosen": -65.40553283691406, "logps/rejected": -71.148193359375, "loss": 0.8129, "rewards/accuracies": 0.0, "rewards/chosen": 1.681739091873169, "rewards/margins": -0.4465477466583252, "rewards/rejected": 2.128286838531494, "step": 2618 }, { "epoch": 0.43, "learning_rate": 9.989294616193017e-07, "logits/chosen": -0.7870691418647766, "logits/rejected": -0.7652279138565063, "logps/chosen": -76.85652923583984, "logps/rejected": -76.72346496582031, "loss": 0.6743, "rewards/accuracies": 0.0, "rewards/chosen": 0.5391609072685242, "rewards/margins": -0.6575309634208679, "rewards/rejected": 1.196691870689392, "step": 2619 }, { "epoch": 0.43, "learning_rate": 9.989208487516235e-07, "logits/chosen": -0.7379829287528992, "logits/rejected": -0.6913102865219116, "logps/chosen": -73.53617858886719, "logps/rejected": -86.19236755371094, "loss": 0.2341, "rewards/accuracies": 1.0, "rewards/chosen": 2.126347303390503, "rewards/margins": 1.9044135808944702, "rewards/rejected": 0.2219337522983551, "step": 2620 }, { "epoch": 0.43, "learning_rate": 9.989122014132354e-07, "logits/chosen": -0.3199014961719513, "logits/rejected": -0.32215726375579834, "logps/chosen": -72.71297454833984, "logps/rejected": -73.12935638427734, "loss": 0.3598, "rewards/accuracies": 1.0, "rewards/chosen": 1.9029121398925781, "rewards/margins": 0.35940396785736084, "rewards/rejected": 1.5435081720352173, "step": 2621 }, { "epoch": 0.43, "learning_rate": 9.989035196047348e-07, "logits/chosen": -1.1622437238693237, "logits/rejected": -1.1442151069641113, "logps/chosen": -75.25141143798828, "logps/rejected": -126.89592742919922, "loss": 0.8526, "rewards/accuracies": 0.0, "rewards/chosen": 0.6278129816055298, "rewards/margins": -1.4724441766738892, "rewards/rejected": 2.100257158279419, "step": 2622 }, { "epoch": 0.43, "learning_rate": 9.98894803326722e-07, "logits/chosen": -0.2820816934108734, "logits/rejected": -0.24128739535808563, "logps/chosen": -99.25145721435547, "logps/rejected": -98.49542236328125, "loss": 1.4189, "rewards/accuracies": 0.0, "rewards/chosen": 0.33110275864601135, "rewards/margins": -1.6907097101211548, "rewards/rejected": 2.0218124389648438, "step": 2623 }, { "epoch": 0.43, "learning_rate": 9.988860525797986e-07, "logits/chosen": -0.3295294940471649, "logits/rejected": -0.19457192718982697, "logps/chosen": -81.751220703125, "logps/rejected": -14.363117218017578, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": 1.7217575311660767, "rewards/margins": 0.887145459651947, "rewards/rejected": 0.8346120715141296, "step": 2624 }, { "epoch": 0.43, "learning_rate": 9.988772673645696e-07, "logits/chosen": -0.6840325593948364, "logits/rejected": -0.6271430850028992, "logps/chosen": -136.03353881835938, "logps/rejected": -111.18829345703125, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 4.471411228179932, "rewards/margins": 0.21877431869506836, "rewards/rejected": 4.252636909484863, "step": 2625 }, { "epoch": 0.43, "learning_rate": 9.988684476816418e-07, "logits/chosen": -0.5501924753189087, "logits/rejected": -0.5479127764701843, "logps/chosen": -73.98844909667969, "logps/rejected": -76.81047821044922, "loss": 0.5573, "rewards/accuracies": 0.0, "rewards/chosen": 2.3722915649414062, "rewards/margins": -0.0947563648223877, "rewards/rejected": 2.467047929763794, "step": 2626 }, { "epoch": 0.43, "learning_rate": 9.988595935316247e-07, "logits/chosen": -0.3582318127155304, "logits/rejected": -0.33616623282432556, "logps/chosen": -142.521240234375, "logps/rejected": -82.19363403320312, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": 2.8412415981292725, "rewards/margins": 1.168337345123291, "rewards/rejected": 1.6729042530059814, "step": 2627 }, { "epoch": 0.43, "learning_rate": 9.988507049151297e-07, "logits/chosen": -0.6770076751708984, "logits/rejected": -0.6205756664276123, "logps/chosen": -71.94493103027344, "logps/rejected": -147.901611328125, "loss": 0.7425, "rewards/accuracies": 0.0, "rewards/chosen": 2.4784576892852783, "rewards/margins": -1.024998426437378, "rewards/rejected": 3.5034561157226562, "step": 2628 }, { "epoch": 0.43, "learning_rate": 9.988417818327714e-07, "logits/chosen": -0.5911967754364014, "logits/rejected": -0.5798677206039429, "logps/chosen": -58.513648986816406, "logps/rejected": -94.03932189941406, "loss": 0.5486, "rewards/accuracies": 0.0, "rewards/chosen": 0.7021743655204773, "rewards/margins": -0.2310272455215454, "rewards/rejected": 0.9332016110420227, "step": 2629 }, { "epoch": 0.43, "learning_rate": 9.98832824285166e-07, "logits/chosen": -0.5364320278167725, "logits/rejected": -0.5523207783699036, "logps/chosen": -71.0212631225586, "logps/rejected": -75.30572509765625, "loss": 0.4045, "rewards/accuracies": 0.0, "rewards/chosen": 1.330269694328308, "rewards/margins": -0.10709142684936523, "rewards/rejected": 1.4373611211776733, "step": 2630 }, { "epoch": 0.43, "learning_rate": 9.988238322729324e-07, "logits/chosen": -0.49217140674591064, "logits/rejected": -0.35492998361587524, "logps/chosen": -141.6973419189453, "logps/rejected": -120.95552062988281, "loss": 0.5521, "rewards/accuracies": 0.0, "rewards/chosen": 3.496901035308838, "rewards/margins": -0.5526914596557617, "rewards/rejected": 4.0495924949646, "step": 2631 }, { "epoch": 0.43, "learning_rate": 9.988148057966918e-07, "logits/chosen": -0.5834798812866211, "logits/rejected": -0.33253228664398193, "logps/chosen": -168.59762573242188, "logps/rejected": -197.4835205078125, "loss": 0.2919, "rewards/accuracies": 1.0, "rewards/chosen": 2.72782301902771, "rewards/margins": 0.3861207962036133, "rewards/rejected": 2.3417022228240967, "step": 2632 }, { "epoch": 0.43, "learning_rate": 9.988057448570681e-07, "logits/chosen": -0.41227301955223083, "logits/rejected": -0.3605026304721832, "logps/chosen": -80.94830322265625, "logps/rejected": -115.31973266601562, "loss": 0.6834, "rewards/accuracies": 0.0, "rewards/chosen": 1.401441216468811, "rewards/margins": -0.09711074829101562, "rewards/rejected": 1.4985519647598267, "step": 2633 }, { "epoch": 0.43, "learning_rate": 9.987966494546872e-07, "logits/chosen": -0.4487585127353668, "logits/rejected": -0.47468462586402893, "logps/chosen": -13.320131301879883, "logps/rejected": -2.9668002128601074, "loss": 0.6916, "rewards/accuracies": 0.0, "rewards/chosen": -0.2187504768371582, "rewards/margins": -0.4087221622467041, "rewards/rejected": 0.1899716854095459, "step": 2634 }, { "epoch": 0.43, "learning_rate": 9.987875195901774e-07, "logits/chosen": -0.22574400901794434, "logits/rejected": -0.22574400901794434, "logps/chosen": -89.15951538085938, "logps/rejected": -89.15951538085938, "loss": 0.5519, "rewards/accuracies": 0.0, "rewards/chosen": 0.9338165521621704, "rewards/margins": 0.0, "rewards/rejected": 0.9338165521621704, "step": 2635 }, { "epoch": 0.43, "learning_rate": 9.987783552641697e-07, "logits/chosen": -0.4505075216293335, "logits/rejected": -0.4033508896827698, "logps/chosen": -108.28654479980469, "logps/rejected": -84.91620635986328, "loss": 0.5881, "rewards/accuracies": 1.0, "rewards/chosen": 0.7033325433731079, "rewards/margins": 0.7036598324775696, "rewards/rejected": -0.0003273010370321572, "step": 2636 }, { "epoch": 0.43, "learning_rate": 9.98769156477297e-07, "logits/chosen": -0.8541598320007324, "logits/rejected": -0.8361564874649048, "logps/chosen": -202.82095336914062, "logps/rejected": -177.64439392089844, "loss": 1.2401, "rewards/accuracies": 0.0, "rewards/chosen": 3.595738172531128, "rewards/margins": -2.175981283187866, "rewards/rejected": 5.771719455718994, "step": 2637 }, { "epoch": 0.43, "learning_rate": 9.98759923230195e-07, "logits/chosen": -0.4177999496459961, "logits/rejected": -0.39414069056510925, "logps/chosen": -48.85673141479492, "logps/rejected": -77.82588195800781, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 0.9118824005126953, "rewards/margins": 0.8477908968925476, "rewards/rejected": 0.06409149616956711, "step": 2638 }, { "epoch": 0.43, "learning_rate": 9.987506555235016e-07, "logits/chosen": -0.21990403532981873, "logits/rejected": -0.21666838228702545, "logps/chosen": -68.95602416992188, "logps/rejected": -55.664215087890625, "loss": 0.719, "rewards/accuracies": 1.0, "rewards/chosen": 1.0155349969863892, "rewards/margins": 0.12323379516601562, "rewards/rejected": 0.8923012018203735, "step": 2639 }, { "epoch": 0.43, "learning_rate": 9.987413533578573e-07, "logits/chosen": -0.03781717270612717, "logits/rejected": -0.04905165359377861, "logps/chosen": -12.167259216308594, "logps/rejected": -58.996334075927734, "loss": 0.718, "rewards/accuracies": 1.0, "rewards/chosen": -0.23782530426979065, "rewards/margins": 0.2638108432292938, "rewards/rejected": -0.5016361474990845, "step": 2640 }, { "epoch": 0.43, "learning_rate": 9.987320167339044e-07, "logits/chosen": -1.0740704536437988, "logits/rejected": -1.0451377630233765, "logps/chosen": -125.37953186035156, "logps/rejected": -35.58058166503906, "loss": 0.2888, "rewards/accuracies": 1.0, "rewards/chosen": 1.273951768875122, "rewards/margins": 1.1697224378585815, "rewards/rejected": 0.10422935336828232, "step": 2641 }, { "epoch": 0.43, "learning_rate": 9.987226456522882e-07, "logits/chosen": -0.6285374760627747, "logits/rejected": -0.621247410774231, "logps/chosen": -109.15142059326172, "logps/rejected": -163.0658721923828, "loss": 1.7266, "rewards/accuracies": 0.0, "rewards/chosen": 0.2392372190952301, "rewards/margins": -2.64167857170105, "rewards/rejected": 2.880915880203247, "step": 2642 }, { "epoch": 0.43, "learning_rate": 9.987132401136562e-07, "logits/chosen": -0.5357853770256042, "logits/rejected": -0.5038934946060181, "logps/chosen": -134.3325653076172, "logps/rejected": -160.76553344726562, "loss": 2.4156, "rewards/accuracies": 0.0, "rewards/chosen": 0.165385439991951, "rewards/margins": -4.121984958648682, "rewards/rejected": 4.287370204925537, "step": 2643 }, { "epoch": 0.43, "learning_rate": 9.987038001186584e-07, "logits/chosen": -0.6323849558830261, "logits/rejected": -0.6186243295669556, "logps/chosen": -94.66357421875, "logps/rejected": -75.85142517089844, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 2.174031972885132, "rewards/margins": 1.3475762605667114, "rewards/rejected": 0.8264557123184204, "step": 2644 }, { "epoch": 0.43, "learning_rate": 9.986943256679464e-07, "logits/chosen": -0.542005181312561, "logits/rejected": -0.5730342268943787, "logps/chosen": -112.7937240600586, "logps/rejected": -191.2746124267578, "loss": 1.826, "rewards/accuracies": 0.0, "rewards/chosen": 1.3861550092697144, "rewards/margins": -2.9274024963378906, "rewards/rejected": 4.3135576248168945, "step": 2645 }, { "epoch": 0.43, "learning_rate": 9.986848167621753e-07, "logits/chosen": -0.23037199676036835, "logits/rejected": -0.5991467237472534, "logps/chosen": -85.203369140625, "logps/rejected": -100.3464126586914, "loss": 0.7486, "rewards/accuracies": 1.0, "rewards/chosen": 1.1496673822402954, "rewards/margins": 1.7311515808105469, "rewards/rejected": -0.5814842581748962, "step": 2646 }, { "epoch": 0.43, "learning_rate": 9.98675273402002e-07, "logits/chosen": -0.25491344928741455, "logits/rejected": -0.3469032347202301, "logps/chosen": -100.88728332519531, "logps/rejected": -112.81985473632812, "loss": 0.8671, "rewards/accuracies": 0.0, "rewards/chosen": 1.6374725103378296, "rewards/margins": -0.7657104730606079, "rewards/rejected": 2.4031829833984375, "step": 2647 }, { "epoch": 0.43, "learning_rate": 9.98665695588086e-07, "logits/chosen": -0.6567522883415222, "logits/rejected": -0.6077821254730225, "logps/chosen": -91.87080383300781, "logps/rejected": -27.359182357788086, "loss": 0.2982, "rewards/accuracies": 1.0, "rewards/chosen": 0.8431564569473267, "rewards/margins": 0.6323118209838867, "rewards/rejected": 0.21084462106227875, "step": 2648 }, { "epoch": 0.43, "learning_rate": 9.986560833210887e-07, "logits/chosen": -0.45292171835899353, "logits/rejected": -0.3530038297176361, "logps/chosen": -40.63390350341797, "logps/rejected": -24.224742889404297, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 1.0252822637557983, "rewards/margins": 0.761942982673645, "rewards/rejected": 0.26333925127983093, "step": 2649 }, { "epoch": 0.43, "learning_rate": 9.986464366016743e-07, "logits/chosen": -0.5622467398643494, "logits/rejected": -0.662625253200531, "logps/chosen": -78.35012817382812, "logps/rejected": -107.0582275390625, "loss": 0.5224, "rewards/accuracies": 0.0, "rewards/chosen": 0.15263672173023224, "rewards/margins": -0.5324966907501221, "rewards/rejected": 0.6851333975791931, "step": 2650 }, { "epoch": 0.43, "learning_rate": 9.986367554305095e-07, "logits/chosen": -0.15530847012996674, "logits/rejected": -0.152888685464859, "logps/chosen": -2.714414596557617, "logps/rejected": -1.6092883348464966, "loss": 1.8796, "rewards/accuracies": 0.0, "rewards/chosen": 0.27487656474113464, "rewards/margins": -0.12243494391441345, "rewards/rejected": 0.3973115086555481, "step": 2651 }, { "epoch": 0.43, "learning_rate": 9.986270398082628e-07, "logits/chosen": -0.08897271752357483, "logits/rejected": -0.09778273105621338, "logps/chosen": -8.920157432556152, "logps/rejected": -2.7520265579223633, "loss": 0.9528, "rewards/accuracies": 0.0, "rewards/chosen": 0.06744623184204102, "rewards/margins": -0.20435991883277893, "rewards/rejected": 0.27180615067481995, "step": 2652 }, { "epoch": 0.43, "learning_rate": 9.98617289735606e-07, "logits/chosen": -0.8522661924362183, "logits/rejected": -0.8943927884101868, "logps/chosen": -240.98626708984375, "logps/rejected": -87.36576843261719, "loss": 0.3942, "rewards/accuracies": 0.0, "rewards/chosen": 1.8525298833847046, "rewards/margins": -0.06158602237701416, "rewards/rejected": 1.9141159057617188, "step": 2653 }, { "epoch": 0.43, "learning_rate": 9.986075052132122e-07, "logits/chosen": -0.15099094808101654, "logits/rejected": -0.21969343721866608, "logps/chosen": -52.64778137207031, "logps/rejected": -110.99464416503906, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": 0.8708149194717407, "rewards/margins": 0.03461802005767822, "rewards/rejected": 0.8361968994140625, "step": 2654 }, { "epoch": 0.43, "learning_rate": 9.985976862417577e-07, "logits/chosen": -0.5144612789154053, "logits/rejected": -0.5587118864059448, "logps/chosen": -92.27009582519531, "logps/rejected": -133.38987731933594, "loss": 2.0488, "rewards/accuracies": 0.0, "rewards/chosen": 0.4887069761753082, "rewards/margins": -2.2287840843200684, "rewards/rejected": 2.7174911499023438, "step": 2655 }, { "epoch": 0.43, "learning_rate": 9.985878328219211e-07, "logits/chosen": -0.7859712839126587, "logits/rejected": -0.6886611580848694, "logps/chosen": -170.2618408203125, "logps/rejected": -150.22511291503906, "loss": 1.4487, "rewards/accuracies": 0.0, "rewards/chosen": 2.6306748390197754, "rewards/margins": -2.6898317337036133, "rewards/rejected": 5.320506572723389, "step": 2656 }, { "epoch": 0.43, "learning_rate": 9.985779449543828e-07, "logits/chosen": -0.4967496693134308, "logits/rejected": -0.48972809314727783, "logps/chosen": -49.62542724609375, "logps/rejected": -94.149658203125, "loss": 0.6891, "rewards/accuracies": 0.0, "rewards/chosen": 0.8169029355049133, "rewards/margins": -0.7630348801612854, "rewards/rejected": 1.5799378156661987, "step": 2657 }, { "epoch": 0.43, "learning_rate": 9.98568022639826e-07, "logits/chosen": -0.504452109336853, "logits/rejected": -0.5050809383392334, "logps/chosen": -99.40530395507812, "logps/rejected": -142.16513061523438, "loss": 0.6168, "rewards/accuracies": 1.0, "rewards/chosen": 0.6046386957168579, "rewards/margins": 0.03394317626953125, "rewards/rejected": 0.5706955194473267, "step": 2658 }, { "epoch": 0.43, "learning_rate": 9.985580658789363e-07, "logits/chosen": -0.4853259027004242, "logits/rejected": -0.4600215256214142, "logps/chosen": -2.7405097484588623, "logps/rejected": -47.751792907714844, "loss": 0.6399, "rewards/accuracies": 1.0, "rewards/chosen": 0.4387024939060211, "rewards/margins": 0.43527841567993164, "rewards/rejected": 0.003424072405323386, "step": 2659 }, { "epoch": 0.43, "learning_rate": 9.985480746724018e-07, "logits/chosen": -1.630645990371704, "logits/rejected": -1.7561415433883667, "logps/chosen": -185.2017822265625, "logps/rejected": -132.43658447265625, "loss": 0.5801, "rewards/accuracies": 0.0, "rewards/chosen": 3.117706298828125, "rewards/margins": -0.24300241470336914, "rewards/rejected": 3.360708713531494, "step": 2660 }, { "epoch": 0.43, "learning_rate": 9.985380490209125e-07, "logits/chosen": -0.6112028956413269, "logits/rejected": -0.5958553552627563, "logps/chosen": -86.99376678466797, "logps/rejected": -80.23797607421875, "loss": 0.9118, "rewards/accuracies": 0.0, "rewards/chosen": 0.9220863580703735, "rewards/margins": -1.4390298128128052, "rewards/rejected": 2.3611161708831787, "step": 2661 }, { "epoch": 0.43, "learning_rate": 9.985279889251615e-07, "logits/chosen": -0.8760953545570374, "logits/rejected": -0.9473161697387695, "logps/chosen": -165.8836669921875, "logps/rejected": -73.55574035644531, "loss": 0.4117, "rewards/accuracies": 1.0, "rewards/chosen": 2.5037903785705566, "rewards/margins": 0.7334091663360596, "rewards/rejected": 1.770381212234497, "step": 2662 }, { "epoch": 0.43, "learning_rate": 9.985178943858432e-07, "logits/chosen": -0.9016454815864563, "logits/rejected": -0.8602729439735413, "logps/chosen": -84.71882629394531, "logps/rejected": -79.90705871582031, "loss": 0.9849, "rewards/accuracies": 0.0, "rewards/chosen": 0.8801116943359375, "rewards/margins": -0.99516761302948, "rewards/rejected": 1.8752793073654175, "step": 2663 }, { "epoch": 0.43, "learning_rate": 9.985077654036556e-07, "logits/chosen": -0.8582484722137451, "logits/rejected": -0.8107568621635437, "logps/chosen": -40.64982223510742, "logps/rejected": -65.40807342529297, "loss": 0.1326, "rewards/accuracies": 1.0, "rewards/chosen": 2.9797139167785645, "rewards/margins": 1.2714444398880005, "rewards/rejected": 1.708269476890564, "step": 2664 }, { "epoch": 0.43, "learning_rate": 9.984976019792983e-07, "logits/chosen": -0.5008910298347473, "logits/rejected": -0.5665101408958435, "logps/chosen": -74.6834487915039, "logps/rejected": -178.7454376220703, "loss": 2.5873, "rewards/accuracies": 0.0, "rewards/chosen": 2.751070499420166, "rewards/margins": -1.2039635181427002, "rewards/rejected": 3.955034017562866, "step": 2665 }, { "epoch": 0.43, "learning_rate": 9.984874041134737e-07, "logits/chosen": -0.21624813973903656, "logits/rejected": -0.2524571418762207, "logps/chosen": -79.11139678955078, "logps/rejected": -87.84075164794922, "loss": 1.9804, "rewards/accuracies": 0.0, "rewards/chosen": 0.0632835403084755, "rewards/margins": -2.5342698097229004, "rewards/rejected": 2.597553253173828, "step": 2666 }, { "epoch": 0.43, "learning_rate": 9.98477171806886e-07, "logits/chosen": -0.9005701541900635, "logits/rejected": -0.846820056438446, "logps/chosen": -294.256103515625, "logps/rejected": -194.39532470703125, "loss": 2.0351, "rewards/accuracies": 0.0, "rewards/chosen": 1.5450226068496704, "rewards/margins": -3.5347232818603516, "rewards/rejected": 5.079745769500732, "step": 2667 }, { "epoch": 0.43, "learning_rate": 9.984669050602424e-07, "logits/chosen": -0.7603654265403748, "logits/rejected": -0.7883183360099792, "logps/chosen": -169.4427490234375, "logps/rejected": -176.89230346679688, "loss": 1.4073, "rewards/accuracies": 0.0, "rewards/chosen": 2.2893218994140625, "rewards/margins": -2.6935577392578125, "rewards/rejected": 4.982879638671875, "step": 2668 }, { "epoch": 0.43, "learning_rate": 9.984566038742524e-07, "logits/chosen": -0.36391937732696533, "logits/rejected": -0.36391937732696533, "logps/chosen": -64.71468353271484, "logps/rejected": -64.71468353271484, "loss": 0.6323, "rewards/accuracies": 0.0, "rewards/chosen": 1.869105577468872, "rewards/margins": 0.0, "rewards/rejected": 1.869105577468872, "step": 2669 }, { "epoch": 0.43, "learning_rate": 9.984462682496273e-07, "logits/chosen": -0.7614708542823792, "logits/rejected": -0.7649321556091309, "logps/chosen": -95.12516784667969, "logps/rejected": -128.09414672851562, "loss": 2.1526, "rewards/accuracies": 0.0, "rewards/chosen": 1.859063744544983, "rewards/margins": -3.0283493995666504, "rewards/rejected": 4.887413024902344, "step": 2670 }, { "epoch": 0.43, "learning_rate": 9.984358981870814e-07, "logits/chosen": -0.30218780040740967, "logits/rejected": -0.31442880630493164, "logps/chosen": -65.93157958984375, "logps/rejected": -50.181556701660156, "loss": 0.5801, "rewards/accuracies": 1.0, "rewards/chosen": 1.3402847051620483, "rewards/margins": 0.5189460515975952, "rewards/rejected": 0.8213386535644531, "step": 2671 }, { "epoch": 0.43, "learning_rate": 9.984254936873313e-07, "logits/chosen": -0.2594582736492157, "logits/rejected": -0.31392940878868103, "logps/chosen": -66.23361206054688, "logps/rejected": -109.14271545410156, "loss": 0.9071, "rewards/accuracies": 1.0, "rewards/chosen": 1.0882492065429688, "rewards/margins": 0.1768539547920227, "rewards/rejected": 0.911395251750946, "step": 2672 }, { "epoch": 0.43, "learning_rate": 9.984150547510957e-07, "logits/chosen": -0.6717382073402405, "logits/rejected": -0.6072781682014465, "logps/chosen": -74.45767211914062, "logps/rejected": -84.8330078125, "loss": 0.4143, "rewards/accuracies": 0.0, "rewards/chosen": 1.417720079421997, "rewards/margins": -0.1333099603652954, "rewards/rejected": 1.5510300397872925, "step": 2673 }, { "epoch": 0.43, "learning_rate": 9.984045813790958e-07, "logits/chosen": -0.5535687804222107, "logits/rejected": -0.4539238214492798, "logps/chosen": -149.90467834472656, "logps/rejected": -82.32047271728516, "loss": 1.094, "rewards/accuracies": 0.0, "rewards/chosen": 1.293768286705017, "rewards/margins": -1.240592360496521, "rewards/rejected": 2.534360647201538, "step": 2674 }, { "epoch": 0.43, "learning_rate": 9.983940735720554e-07, "logits/chosen": -0.6563800573348999, "logits/rejected": -0.5717179179191589, "logps/chosen": -78.90312194824219, "logps/rejected": -70.51911163330078, "loss": 0.6698, "rewards/accuracies": 0.0, "rewards/chosen": 1.5995819568634033, "rewards/margins": -0.8134245872497559, "rewards/rejected": 2.413006544113159, "step": 2675 }, { "epoch": 0.43, "learning_rate": 9.983835313307e-07, "logits/chosen": -0.46045276522636414, "logits/rejected": -0.4433949291706085, "logps/chosen": -51.06689453125, "logps/rejected": -43.90873718261719, "loss": 0.9906, "rewards/accuracies": 0.0, "rewards/chosen": 1.1518844366073608, "rewards/margins": -0.692314624786377, "rewards/rejected": 1.8441990613937378, "step": 2676 }, { "epoch": 0.43, "learning_rate": 9.983729546557587e-07, "logits/chosen": -0.693166196346283, "logits/rejected": -0.738197922706604, "logps/chosen": -168.83363342285156, "logps/rejected": -100.14558410644531, "loss": 0.2356, "rewards/accuracies": 1.0, "rewards/chosen": 3.2502458095550537, "rewards/margins": 0.8364014625549316, "rewards/rejected": 2.413844347000122, "step": 2677 }, { "epoch": 0.43, "learning_rate": 9.983623435479618e-07, "logits/chosen": -0.3975292444229126, "logits/rejected": -0.37307730317115784, "logps/chosen": -81.77175903320312, "logps/rejected": -103.2803955078125, "loss": 0.238, "rewards/accuracies": 1.0, "rewards/chosen": 0.5652633905410767, "rewards/margins": 0.6316536068916321, "rewards/rejected": -0.06639023125171661, "step": 2678 }, { "epoch": 0.43, "learning_rate": 9.983516980080425e-07, "logits/chosen": -0.6708635091781616, "logits/rejected": -0.5425524711608887, "logps/chosen": -144.0740509033203, "logps/rejected": -88.93167877197266, "loss": 0.3791, "rewards/accuracies": 1.0, "rewards/chosen": 2.8006789684295654, "rewards/margins": 0.5560767650604248, "rewards/rejected": 2.2446022033691406, "step": 2679 }, { "epoch": 0.43, "learning_rate": 9.983410180367364e-07, "logits/chosen": -0.7230483889579773, "logits/rejected": -0.6655462980270386, "logps/chosen": -69.48341369628906, "logps/rejected": -26.71965789794922, "loss": 0.1939, "rewards/accuracies": 1.0, "rewards/chosen": 1.6431457996368408, "rewards/margins": 1.2230734825134277, "rewards/rejected": 0.42007237672805786, "step": 2680 }, { "epoch": 0.44, "learning_rate": 9.98330303634781e-07, "logits/chosen": -0.6983199715614319, "logits/rejected": -0.7503339648246765, "logps/chosen": -207.46371459960938, "logps/rejected": -129.33619689941406, "loss": 0.6065, "rewards/accuracies": 0.0, "rewards/chosen": 3.563342332839966, "rewards/margins": -0.48192906379699707, "rewards/rejected": 4.045271396636963, "step": 2681 }, { "epoch": 0.44, "learning_rate": 9.983195548029172e-07, "logits/chosen": -0.427751362323761, "logits/rejected": -0.8527939915657043, "logps/chosen": -92.83390808105469, "logps/rejected": -26.5555362701416, "loss": 0.5541, "rewards/accuracies": 1.0, "rewards/chosen": 0.6892364621162415, "rewards/margins": 0.19068527221679688, "rewards/rejected": 0.4985511898994446, "step": 2682 }, { "epoch": 0.44, "learning_rate": 9.983087715418872e-07, "logits/chosen": -0.5852149128913879, "logits/rejected": -0.5418909788131714, "logps/chosen": -73.60468292236328, "logps/rejected": -56.643699645996094, "loss": 0.2691, "rewards/accuracies": 1.0, "rewards/chosen": 1.8964966535568237, "rewards/margins": 0.38541722297668457, "rewards/rejected": 1.5110794305801392, "step": 2683 }, { "epoch": 0.44, "learning_rate": 9.98297953852436e-07, "logits/chosen": -0.6661871671676636, "logits/rejected": -0.5934963226318359, "logps/chosen": -47.59051513671875, "logps/rejected": -79.18185424804688, "loss": 0.6372, "rewards/accuracies": 0.0, "rewards/chosen": 1.7420014142990112, "rewards/margins": -0.5080641508102417, "rewards/rejected": 2.250065565109253, "step": 2684 }, { "epoch": 0.44, "learning_rate": 9.982871017353114e-07, "logits/chosen": -1.0282418727874756, "logits/rejected": -1.04795241355896, "logps/chosen": -124.97645568847656, "logps/rejected": -58.27471160888672, "loss": 0.895, "rewards/accuracies": 0.0, "rewards/chosen": 0.7708206176757812, "rewards/margins": -1.0565766096115112, "rewards/rejected": 1.8273972272872925, "step": 2685 }, { "epoch": 0.44, "learning_rate": 9.982762151912626e-07, "logits/chosen": -0.5129645466804504, "logits/rejected": -0.5564367771148682, "logps/chosen": -354.7059326171875, "logps/rejected": -86.1902847290039, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": 3.313525438308716, "rewards/margins": 1.6589034795761108, "rewards/rejected": 1.654621958732605, "step": 2686 }, { "epoch": 0.44, "learning_rate": 9.982652942210423e-07, "logits/chosen": -0.8181094527244568, "logits/rejected": -0.8242320418357849, "logps/chosen": -103.00934600830078, "logps/rejected": -53.60737228393555, "loss": 0.6854, "rewards/accuracies": 0.0, "rewards/chosen": 1.2314003705978394, "rewards/margins": -0.9298907518386841, "rewards/rejected": 2.1612911224365234, "step": 2687 }, { "epoch": 0.44, "learning_rate": 9.982543388254046e-07, "logits/chosen": -0.37838757038116455, "logits/rejected": -0.3839930295944214, "logps/chosen": -74.56344604492188, "logps/rejected": -23.775096893310547, "loss": 2.2511, "rewards/accuracies": 0.0, "rewards/chosen": 0.4960365295410156, "rewards/margins": -0.5922459363937378, "rewards/rejected": 1.0882824659347534, "step": 2688 }, { "epoch": 0.44, "learning_rate": 9.982433490051068e-07, "logits/chosen": -0.7888851165771484, "logits/rejected": -0.6491042375564575, "logps/chosen": -198.00363159179688, "logps/rejected": -69.85189819335938, "loss": 0.2327, "rewards/accuracies": 1.0, "rewards/chosen": 3.7305755615234375, "rewards/margins": 2.177475690841675, "rewards/rejected": 1.5530998706817627, "step": 2689 }, { "epoch": 0.44, "learning_rate": 9.982323247609079e-07, "logits/chosen": -0.9321643710136414, "logits/rejected": -0.974216639995575, "logps/chosen": -137.89024353027344, "logps/rejected": -120.96192169189453, "loss": 1.0805, "rewards/accuracies": 0.0, "rewards/chosen": 1.5576965808868408, "rewards/margins": -1.887880802154541, "rewards/rejected": 3.445577383041382, "step": 2690 }, { "epoch": 0.44, "learning_rate": 9.982212660935697e-07, "logits/chosen": -0.7087596654891968, "logits/rejected": -0.5692223906517029, "logps/chosen": -110.57815551757812, "logps/rejected": -187.23098754882812, "loss": 0.5675, "rewards/accuracies": 0.0, "rewards/chosen": 3.077099561691284, "rewards/margins": -0.11180734634399414, "rewards/rejected": 3.1889069080352783, "step": 2691 }, { "epoch": 0.44, "learning_rate": 9.982101730038563e-07, "logits/chosen": -0.5018445253372192, "logits/rejected": -0.5465265512466431, "logps/chosen": -193.7006072998047, "logps/rejected": -127.81854248046875, "loss": 0.7495, "rewards/accuracies": 0.0, "rewards/chosen": 4.30343770980835, "rewards/margins": -1.1605334281921387, "rewards/rejected": 5.463971138000488, "step": 2692 }, { "epoch": 0.44, "learning_rate": 9.98199045492534e-07, "logits/chosen": -0.4683037996292114, "logits/rejected": -0.46349942684173584, "logps/chosen": -79.60602569580078, "logps/rejected": -136.5032501220703, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": 0.4796432554721832, "rewards/margins": 0.4623779356479645, "rewards/rejected": 0.01726531982421875, "step": 2693 }, { "epoch": 0.44, "learning_rate": 9.981878835603716e-07, "logits/chosen": -0.4458882510662079, "logits/rejected": -0.47051283717155457, "logps/chosen": -59.5908317565918, "logps/rejected": -38.58885192871094, "loss": 0.579, "rewards/accuracies": 0.0, "rewards/chosen": 0.7097019553184509, "rewards/margins": -0.012978732585906982, "rewards/rejected": 0.7226806879043579, "step": 2694 }, { "epoch": 0.44, "learning_rate": 9.981766872081402e-07, "logits/chosen": -0.655951976776123, "logits/rejected": -0.6472917199134827, "logps/chosen": -112.01968383789062, "logps/rejected": -65.63592529296875, "loss": 1.4275, "rewards/accuracies": 0.0, "rewards/chosen": 0.3047470152378082, "rewards/margins": -1.8618470430374146, "rewards/rejected": 2.1665940284729004, "step": 2695 }, { "epoch": 0.44, "learning_rate": 9.981654564366138e-07, "logits/chosen": -0.6584237813949585, "logits/rejected": -0.6387476921081543, "logps/chosen": -51.412593841552734, "logps/rejected": -17.922693252563477, "loss": 1.8436, "rewards/accuracies": 1.0, "rewards/chosen": 0.413900762796402, "rewards/margins": 0.031840890645980835, "rewards/rejected": 0.38205987215042114, "step": 2696 }, { "epoch": 0.44, "learning_rate": 9.981541912465679e-07, "logits/chosen": -0.7446324825286865, "logits/rejected": -0.7374980449676514, "logps/chosen": -99.02775573730469, "logps/rejected": -48.24085998535156, "loss": 0.9358, "rewards/accuracies": 0.0, "rewards/chosen": 1.0143226385116577, "rewards/margins": -0.8920586109161377, "rewards/rejected": 1.9063812494277954, "step": 2697 }, { "epoch": 0.44, "learning_rate": 9.98142891638781e-07, "logits/chosen": -0.5569829940795898, "logits/rejected": -0.3738691806793213, "logps/chosen": -62.12318420410156, "logps/rejected": -117.65751647949219, "loss": 1.1817, "rewards/accuracies": 0.0, "rewards/chosen": 1.0374053716659546, "rewards/margins": -1.1959642171859741, "rewards/rejected": 2.2333695888519287, "step": 2698 }, { "epoch": 0.44, "learning_rate": 9.98131557614034e-07, "logits/chosen": -0.46114808320999146, "logits/rejected": -0.2746710479259491, "logps/chosen": -176.16998291015625, "logps/rejected": -88.95944213867188, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 4.351355075836182, "rewards/margins": 1.8800911903381348, "rewards/rejected": 2.471263885498047, "step": 2699 }, { "epoch": 0.44, "learning_rate": 9.981201891731093e-07, "logits/chosen": -0.8368194103240967, "logits/rejected": -0.7372141480445862, "logps/chosen": -189.02447509765625, "logps/rejected": -85.88259887695312, "loss": 0.7323, "rewards/accuracies": 1.0, "rewards/chosen": 3.3865983486175537, "rewards/margins": 1.0250718593597412, "rewards/rejected": 2.3615264892578125, "step": 2700 }, { "epoch": 0.44, "learning_rate": 9.98108786316793e-07, "logits/chosen": -0.6396124362945557, "logits/rejected": -0.6317434310913086, "logps/chosen": -40.373085021972656, "logps/rejected": -64.26402282714844, "loss": 0.6259, "rewards/accuracies": 0.0, "rewards/chosen": 1.8019722700119019, "rewards/margins": -0.5619651079177856, "rewards/rejected": 2.3639373779296875, "step": 2701 }, { "epoch": 0.44, "learning_rate": 9.980973490458728e-07, "logits/chosen": -1.0477337837219238, "logits/rejected": -0.9114242792129517, "logps/chosen": -124.85763549804688, "logps/rejected": -24.903745651245117, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 3.7717742919921875, "rewards/margins": 3.5608839988708496, "rewards/rejected": 0.21089020371437073, "step": 2702 }, { "epoch": 0.44, "learning_rate": 9.980858773611388e-07, "logits/chosen": -0.7462550401687622, "logits/rejected": -0.7514265179634094, "logps/chosen": -73.51327514648438, "logps/rejected": -124.1106185913086, "loss": 1.6126, "rewards/accuracies": 0.0, "rewards/chosen": 1.3779305219650269, "rewards/margins": -2.395463466644287, "rewards/rejected": 3.7733941078186035, "step": 2703 }, { "epoch": 0.44, "learning_rate": 9.980743712633833e-07, "logits/chosen": -0.7576592564582825, "logits/rejected": -0.7547343373298645, "logps/chosen": -94.94591522216797, "logps/rejected": -140.87432861328125, "loss": 2.1512, "rewards/accuracies": 0.0, "rewards/chosen": 0.8160255551338196, "rewards/margins": -2.7379753589630127, "rewards/rejected": 3.5540008544921875, "step": 2704 }, { "epoch": 0.44, "learning_rate": 9.980628307534018e-07, "logits/chosen": -0.9478960037231445, "logits/rejected": -0.8786839246749878, "logps/chosen": -170.68588256835938, "logps/rejected": -78.87786865234375, "loss": 0.9743, "rewards/accuracies": 0.0, "rewards/chosen": -0.340423583984375, "rewards/margins": -1.6119903326034546, "rewards/rejected": 1.2715667486190796, "step": 2705 }, { "epoch": 0.44, "learning_rate": 9.980512558319915e-07, "logits/chosen": -0.4924616515636444, "logits/rejected": -0.5008504390716553, "logps/chosen": -99.32054901123047, "logps/rejected": -84.4481201171875, "loss": 0.86, "rewards/accuracies": 0.0, "rewards/chosen": 0.7600242495536804, "rewards/margins": -0.5987434983253479, "rewards/rejected": 1.3587677478790283, "step": 2706 }, { "epoch": 0.44, "learning_rate": 9.98039646499952e-07, "logits/chosen": -0.670119047164917, "logits/rejected": -0.6924880146980286, "logps/chosen": -104.51773071289062, "logps/rejected": -61.31006622314453, "loss": 1.0745, "rewards/accuracies": 0.0, "rewards/chosen": 1.3139526844024658, "rewards/margins": -0.42984163761138916, "rewards/rejected": 1.743794322013855, "step": 2707 }, { "epoch": 0.44, "learning_rate": 9.980280027580851e-07, "logits/chosen": -0.431011825799942, "logits/rejected": -0.44253450632095337, "logps/chosen": -94.41860961914062, "logps/rejected": -160.4296417236328, "loss": 0.1659, "rewards/accuracies": 1.0, "rewards/chosen": 1.7555900812149048, "rewards/margins": 1.2212212085723877, "rewards/rejected": 0.5343689322471619, "step": 2708 }, { "epoch": 0.44, "learning_rate": 9.98016324607196e-07, "logits/chosen": -0.6987573504447937, "logits/rejected": -0.6563907265663147, "logps/chosen": -158.30088806152344, "logps/rejected": -155.792236328125, "loss": 1.0193, "rewards/accuracies": 0.0, "rewards/chosen": 3.6790878772735596, "rewards/margins": -1.744380235671997, "rewards/rejected": 5.423468112945557, "step": 2709 }, { "epoch": 0.44, "learning_rate": 9.98004612048091e-07, "logits/chosen": -0.6313338279724121, "logits/rejected": -0.6366584300994873, "logps/chosen": -92.1397933959961, "logps/rejected": -69.81786346435547, "loss": 0.7182, "rewards/accuracies": 0.0, "rewards/chosen": 1.7568542957305908, "rewards/margins": -0.44034814834594727, "rewards/rejected": 2.197202444076538, "step": 2710 }, { "epoch": 0.44, "learning_rate": 9.979928650815795e-07, "logits/chosen": -0.2283165603876114, "logits/rejected": -0.23055662214756012, "logps/chosen": -0.60408616065979, "logps/rejected": -32.2591552734375, "loss": 0.9941, "rewards/accuracies": 1.0, "rewards/chosen": 0.19438278675079346, "rewards/margins": 0.4355878233909607, "rewards/rejected": -0.24120502173900604, "step": 2711 }, { "epoch": 0.44, "learning_rate": 9.97981083708473e-07, "logits/chosen": -0.8142127394676208, "logits/rejected": -0.7729659676551819, "logps/chosen": -41.00579071044922, "logps/rejected": -87.60565948486328, "loss": 1.0455, "rewards/accuracies": 0.0, "rewards/chosen": 2.3620057106018066, "rewards/margins": -1.2214789390563965, "rewards/rejected": 3.583484649658203, "step": 2712 }, { "epoch": 0.44, "learning_rate": 9.979692679295854e-07, "logits/chosen": -0.20900313556194305, "logits/rejected": -0.2434251308441162, "logps/chosen": -32.31600570678711, "logps/rejected": -56.60358428955078, "loss": 1.2333, "rewards/accuracies": 0.0, "rewards/chosen": 1.1405476331710815, "rewards/margins": -0.09656941890716553, "rewards/rejected": 1.237117052078247, "step": 2713 }, { "epoch": 0.44, "learning_rate": 9.979574177457335e-07, "logits/chosen": -0.6738712787628174, "logits/rejected": -0.6037434935569763, "logps/chosen": -79.12981414794922, "logps/rejected": -73.55546569824219, "loss": 1.0287, "rewards/accuracies": 1.0, "rewards/chosen": 1.4084144830703735, "rewards/margins": 0.6889305114746094, "rewards/rejected": 0.7194839715957642, "step": 2714 }, { "epoch": 0.44, "learning_rate": 9.979455331577359e-07, "logits/chosen": -0.529865026473999, "logits/rejected": -0.5344702005386353, "logps/chosen": -65.68595886230469, "logps/rejected": -52.798095703125, "loss": 1.9248, "rewards/accuracies": 1.0, "rewards/chosen": 1.1834945678710938, "rewards/margins": 0.034148335456848145, "rewards/rejected": 1.1493462324142456, "step": 2715 }, { "epoch": 0.44, "learning_rate": 9.979336141664131e-07, "logits/chosen": -0.38730642199516296, "logits/rejected": -0.36994901299476624, "logps/chosen": -45.150150299072266, "logps/rejected": -43.828575134277344, "loss": 0.6476, "rewards/accuracies": 1.0, "rewards/chosen": 2.3989155292510986, "rewards/margins": 0.6706790924072266, "rewards/rejected": 1.728236436843872, "step": 2716 }, { "epoch": 0.44, "learning_rate": 9.979216607725894e-07, "logits/chosen": -0.46543067693710327, "logits/rejected": -0.45679494738578796, "logps/chosen": -62.88319396972656, "logps/rejected": -39.98751449584961, "loss": 1.0826, "rewards/accuracies": 0.0, "rewards/chosen": 1.0968780517578125, "rewards/margins": -0.2778843641281128, "rewards/rejected": 1.3747624158859253, "step": 2717 }, { "epoch": 0.44, "learning_rate": 9.979096729770901e-07, "logits/chosen": -0.3718258738517761, "logits/rejected": -0.3881430923938751, "logps/chosen": -64.97486877441406, "logps/rejected": -38.772640228271484, "loss": 0.9279, "rewards/accuracies": 0.0, "rewards/chosen": 0.764495849609375, "rewards/margins": -0.7876361608505249, "rewards/rejected": 1.5521320104599, "step": 2718 }, { "epoch": 0.44, "learning_rate": 9.978976507807437e-07, "logits/chosen": -0.5807380676269531, "logits/rejected": -0.6387456059455872, "logps/chosen": -127.25932312011719, "logps/rejected": -150.25601196289062, "loss": 1.3407, "rewards/accuracies": 0.0, "rewards/chosen": 1.0686553716659546, "rewards/margins": -1.6204499006271362, "rewards/rejected": 2.689105272293091, "step": 2719 }, { "epoch": 0.44, "learning_rate": 9.97885594184381e-07, "logits/chosen": -0.5940430164337158, "logits/rejected": -0.5386526584625244, "logps/chosen": -84.66584014892578, "logps/rejected": -75.200927734375, "loss": 1.7275, "rewards/accuracies": 0.0, "rewards/chosen": 1.1350288391113281, "rewards/margins": -1.9625968933105469, "rewards/rejected": 3.097625732421875, "step": 2720 }, { "epoch": 0.44, "learning_rate": 9.978735031888345e-07, "logits/chosen": -0.4858212471008301, "logits/rejected": -0.45100700855255127, "logps/chosen": -120.75730895996094, "logps/rejected": -95.14315795898438, "loss": 1.0872, "rewards/accuracies": 0.0, "rewards/chosen": 0.9364578127861023, "rewards/margins": -1.7039597034454346, "rewards/rejected": 2.6404175758361816, "step": 2721 }, { "epoch": 0.44, "learning_rate": 9.9786137779494e-07, "logits/chosen": -0.4237370491027832, "logits/rejected": -0.2916133403778076, "logps/chosen": -75.11878204345703, "logps/rejected": -46.637939453125, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 2.7027580738067627, "rewards/margins": 0.6787827014923096, "rewards/rejected": 2.023975372314453, "step": 2722 }, { "epoch": 0.44, "learning_rate": 9.97849218003535e-07, "logits/chosen": -0.40517136454582214, "logits/rejected": -0.40072745084762573, "logps/chosen": -66.48612213134766, "logps/rejected": -51.723262786865234, "loss": 0.5277, "rewards/accuracies": 1.0, "rewards/chosen": -0.13385163247585297, "rewards/margins": 0.008400335907936096, "rewards/rejected": -0.14225196838378906, "step": 2723 }, { "epoch": 0.44, "learning_rate": 9.9783702381546e-07, "logits/chosen": -0.10044440627098083, "logits/rejected": -0.10044440627098083, "logps/chosen": -1.2661595344543457, "logps/rejected": -1.2661595344543457, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.20031273365020752, "rewards/margins": 0.0, "rewards/rejected": 0.20031273365020752, "step": 2724 }, { "epoch": 0.44, "learning_rate": 9.978247952315568e-07, "logits/chosen": -0.29739341139793396, "logits/rejected": -0.29739341139793396, "logps/chosen": -20.433156967163086, "logps/rejected": -20.433156967163086, "loss": 0.5102, "rewards/accuracies": 0.0, "rewards/chosen": 0.9921627044677734, "rewards/margins": 0.0, "rewards/rejected": 0.9921627044677734, "step": 2725 }, { "epoch": 0.44, "learning_rate": 9.97812532252671e-07, "logits/chosen": -0.6075548529624939, "logits/rejected": -0.5539716482162476, "logps/chosen": -183.62429809570312, "logps/rejected": -147.594970703125, "loss": 0.8767, "rewards/accuracies": 0.0, "rewards/chosen": 4.368217468261719, "rewards/margins": -1.2975554466247559, "rewards/rejected": 5.665772914886475, "step": 2726 }, { "epoch": 0.44, "learning_rate": 9.978002348796494e-07, "logits/chosen": -0.5747293829917908, "logits/rejected": -0.5198538303375244, "logps/chosen": -159.9027099609375, "logps/rejected": -189.65216064453125, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 3.7241408824920654, "rewards/margins": 1.6429929733276367, "rewards/rejected": 2.0811479091644287, "step": 2727 }, { "epoch": 0.44, "learning_rate": 9.97787903113342e-07, "logits/chosen": -0.5843726992607117, "logits/rejected": -0.5799471139907837, "logps/chosen": -44.463836669921875, "logps/rejected": -74.47740173339844, "loss": 0.4586, "rewards/accuracies": 0.0, "rewards/chosen": 0.6760544180870056, "rewards/margins": -0.2144683599472046, "rewards/rejected": 0.8905227780342102, "step": 2728 }, { "epoch": 0.44, "learning_rate": 9.977755369546006e-07, "logits/chosen": -0.6541706323623657, "logits/rejected": -0.6594878435134888, "logps/chosen": -61.84622573852539, "logps/rejected": -67.53450012207031, "loss": 1.4754, "rewards/accuracies": 0.0, "rewards/chosen": 0.5365871787071228, "rewards/margins": -0.2176334261894226, "rewards/rejected": 0.7542206048965454, "step": 2729 }, { "epoch": 0.44, "learning_rate": 9.977631364042794e-07, "logits/chosen": -0.7299497723579407, "logits/rejected": -0.6803300380706787, "logps/chosen": -142.22341918945312, "logps/rejected": -88.46749877929688, "loss": 0.7097, "rewards/accuracies": 0.0, "rewards/chosen": 0.43947601318359375, "rewards/margins": -1.0245659351348877, "rewards/rejected": 1.4640419483184814, "step": 2730 }, { "epoch": 0.44, "learning_rate": 9.977507014632355e-07, "logits/chosen": -0.8274788856506348, "logits/rejected": -0.8493097424507141, "logps/chosen": -191.81484985351562, "logps/rejected": -226.28965759277344, "loss": 0.9178, "rewards/accuracies": 0.0, "rewards/chosen": 3.76767897605896, "rewards/margins": -1.3404157161712646, "rewards/rejected": 5.108094692230225, "step": 2731 }, { "epoch": 0.44, "learning_rate": 9.977382321323277e-07, "logits/chosen": -0.28051379323005676, "logits/rejected": -0.2727262079715729, "logps/chosen": -42.381011962890625, "logps/rejected": -49.74830627441406, "loss": 1.1194, "rewards/accuracies": 1.0, "rewards/chosen": 0.8684318661689758, "rewards/margins": 0.4348388612270355, "rewards/rejected": 0.4335930049419403, "step": 2732 }, { "epoch": 0.44, "learning_rate": 9.97725728412418e-07, "logits/chosen": -0.4203905463218689, "logits/rejected": -0.4203905463218689, "logps/chosen": -18.423723220825195, "logps/rejected": -18.423723220825195, "loss": 1.6793, "rewards/accuracies": 0.0, "rewards/chosen": 0.1381082534790039, "rewards/margins": 0.0, "rewards/rejected": 0.1381082534790039, "step": 2733 }, { "epoch": 0.44, "learning_rate": 9.977131903043698e-07, "logits/chosen": -0.44032081961631775, "logits/rejected": -0.44253015518188477, "logps/chosen": -76.38494110107422, "logps/rejected": -107.37899017333984, "loss": 0.8295, "rewards/accuracies": 0.0, "rewards/chosen": -0.460671991109848, "rewards/margins": -1.1337928771972656, "rewards/rejected": 0.67312091588974, "step": 2734 }, { "epoch": 0.44, "learning_rate": 9.977006178090497e-07, "logits/chosen": 0.1640542894601822, "logits/rejected": 0.16597238183021545, "logps/chosen": -8.462032318115234, "logps/rejected": -6.826390743255615, "loss": 0.7762, "rewards/accuracies": 0.0, "rewards/chosen": 0.05176201090216637, "rewards/margins": -0.3084689676761627, "rewards/rejected": 0.3602309823036194, "step": 2735 }, { "epoch": 0.44, "learning_rate": 9.97688010927326e-07, "logits/chosen": -0.9365183115005493, "logits/rejected": -0.8688716888427734, "logps/chosen": -63.20648193359375, "logps/rejected": -23.601125717163086, "loss": 0.6258, "rewards/accuracies": 1.0, "rewards/chosen": 0.6067191958427429, "rewards/margins": 0.27024686336517334, "rewards/rejected": 0.3364723324775696, "step": 2736 }, { "epoch": 0.44, "learning_rate": 9.9767536966007e-07, "logits/chosen": -0.2266564667224884, "logits/rejected": -0.26867207884788513, "logps/chosen": -63.06376647949219, "logps/rejected": -42.20481872558594, "loss": 1.8878, "rewards/accuracies": 0.0, "rewards/chosen": 1.6579887866973877, "rewards/margins": -0.3947429656982422, "rewards/rejected": 2.05273175239563, "step": 2737 }, { "epoch": 0.44, "learning_rate": 9.97662694008155e-07, "logits/chosen": -0.529740035533905, "logits/rejected": -0.4729485511779785, "logps/chosen": -70.53448486328125, "logps/rejected": -35.648338317871094, "loss": 1.6211, "rewards/accuracies": 1.0, "rewards/chosen": 1.4986892938613892, "rewards/margins": 1.3110321760177612, "rewards/rejected": 0.1876571625471115, "step": 2738 }, { "epoch": 0.44, "learning_rate": 9.976499839724569e-07, "logits/chosen": -0.2617044150829315, "logits/rejected": -0.28224292397499084, "logps/chosen": -73.07028198242188, "logps/rejected": -101.0020751953125, "loss": 1.1168, "rewards/accuracies": 0.0, "rewards/chosen": 1.2542991638183594, "rewards/margins": -1.4308266639709473, "rewards/rejected": 2.6851258277893066, "step": 2739 }, { "epoch": 0.44, "learning_rate": 9.976372395538535e-07, "logits/chosen": -0.3669183850288391, "logits/rejected": -0.3479066789150238, "logps/chosen": -76.05943298339844, "logps/rejected": -80.48721313476562, "loss": 0.8647, "rewards/accuracies": 1.0, "rewards/chosen": 1.3311508893966675, "rewards/margins": 0.27429962158203125, "rewards/rejected": 1.0568512678146362, "step": 2740 }, { "epoch": 0.44, "learning_rate": 9.976244607532257e-07, "logits/chosen": -0.5947386622428894, "logits/rejected": -0.5617809295654297, "logps/chosen": -131.965087890625, "logps/rejected": -65.98001861572266, "loss": 0.6253, "rewards/accuracies": 1.0, "rewards/chosen": 1.090692162513733, "rewards/margins": 0.0013167858123779297, "rewards/rejected": 1.089375376701355, "step": 2741 }, { "epoch": 0.45, "learning_rate": 9.976116475714563e-07, "logits/chosen": -0.3146728277206421, "logits/rejected": -0.3226277828216553, "logps/chosen": -80.839599609375, "logps/rejected": -128.14512634277344, "loss": 1.457, "rewards/accuracies": 0.0, "rewards/chosen": 1.9631980657577515, "rewards/margins": -0.33552777767181396, "rewards/rejected": 2.2987258434295654, "step": 2742 }, { "epoch": 0.45, "learning_rate": 9.9759880000943e-07, "logits/chosen": -0.7125966548919678, "logits/rejected": -0.7265301942825317, "logps/chosen": -43.339054107666016, "logps/rejected": -30.102859497070312, "loss": 0.8806, "rewards/accuracies": 0.0, "rewards/chosen": 0.095189668238163, "rewards/margins": -0.48891374468803406, "rewards/rejected": 0.5841034054756165, "step": 2743 }, { "epoch": 0.45, "learning_rate": 9.975859180680355e-07, "logits/chosen": -0.46213918924331665, "logits/rejected": 0.40910616517066956, "logps/chosen": -63.35987854003906, "logps/rejected": -36.81129455566406, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": 1.8532699346542358, "rewards/margins": 1.4025126695632935, "rewards/rejected": 0.45075723528862, "step": 2744 }, { "epoch": 0.45, "learning_rate": 9.97573001748162e-07, "logits/chosen": -0.9518270492553711, "logits/rejected": -0.8482077121734619, "logps/chosen": -125.47714233398438, "logps/rejected": -93.95729064941406, "loss": 0.6154, "rewards/accuracies": 0.0, "rewards/chosen": 0.3336853086948395, "rewards/margins": -0.7241775989532471, "rewards/rejected": 1.0578628778457642, "step": 2745 }, { "epoch": 0.45, "learning_rate": 9.975600510507024e-07, "logits/chosen": -0.6749109029769897, "logits/rejected": -0.8465386629104614, "logps/chosen": -97.16212463378906, "logps/rejected": -142.49755859375, "loss": 0.3203, "rewards/accuracies": 1.0, "rewards/chosen": 2.5939881801605225, "rewards/margins": 0.33004164695739746, "rewards/rejected": 2.263946533203125, "step": 2746 }, { "epoch": 0.45, "learning_rate": 9.97547065976551e-07, "logits/chosen": -0.5052008628845215, "logits/rejected": -0.5270017981529236, "logps/chosen": -164.516845703125, "logps/rejected": -85.83304595947266, "loss": 0.8856, "rewards/accuracies": 0.0, "rewards/chosen": 1.8303604125976562, "rewards/margins": -1.537590742111206, "rewards/rejected": 3.3679511547088623, "step": 2747 }, { "epoch": 0.45, "learning_rate": 9.975340465266053e-07, "logits/chosen": -0.350665420293808, "logits/rejected": -0.24628475308418274, "logps/chosen": -61.399879455566406, "logps/rejected": -19.424291610717773, "loss": 0.2719, "rewards/accuracies": 1.0, "rewards/chosen": 1.25688636302948, "rewards/margins": 0.9945896863937378, "rewards/rejected": 0.2622966766357422, "step": 2748 }, { "epoch": 0.45, "learning_rate": 9.975209927017646e-07, "logits/chosen": -0.7521430850028992, "logits/rejected": -0.6634393930435181, "logps/chosen": -170.10113525390625, "logps/rejected": -100.0634765625, "loss": 0.7605, "rewards/accuracies": 0.0, "rewards/chosen": 0.4776245057582855, "rewards/margins": -1.0920013189315796, "rewards/rejected": 1.5696258544921875, "step": 2749 }, { "epoch": 0.45, "learning_rate": 9.97507904502931e-07, "logits/chosen": -0.5337458848953247, "logits/rejected": -0.5069241523742676, "logps/chosen": -50.36722183227539, "logps/rejected": -91.56549072265625, "loss": 0.5676, "rewards/accuracies": 1.0, "rewards/chosen": 2.493781805038452, "rewards/margins": 1.135846495628357, "rewards/rejected": 1.3579353094100952, "step": 2750 }, { "epoch": 0.45, "learning_rate": 9.974947819310084e-07, "logits/chosen": -0.5073699951171875, "logits/rejected": -0.5073699951171875, "logps/chosen": -26.673873901367188, "logps/rejected": -26.673873901367188, "loss": 0.3521, "rewards/accuracies": 0.0, "rewards/chosen": 1.2081127166748047, "rewards/margins": 0.0, "rewards/rejected": 1.2081127166748047, "step": 2751 }, { "epoch": 0.45, "learning_rate": 9.97481624986904e-07, "logits/chosen": -0.23693695664405823, "logits/rejected": -0.21767958998680115, "logps/chosen": -81.4033432006836, "logps/rejected": -115.09127807617188, "loss": 1.163, "rewards/accuracies": 0.0, "rewards/chosen": 1.613484263420105, "rewards/margins": -1.9583204984664917, "rewards/rejected": 3.5718047618865967, "step": 2752 }, { "epoch": 0.45, "learning_rate": 9.974684336715264e-07, "logits/chosen": -0.4094651937484741, "logits/rejected": -0.413438618183136, "logps/chosen": -81.62940979003906, "logps/rejected": -53.87852096557617, "loss": 0.8915, "rewards/accuracies": 0.0, "rewards/chosen": 1.0228134393692017, "rewards/margins": -1.3222004175186157, "rewards/rejected": 2.3450138568878174, "step": 2753 }, { "epoch": 0.45, "learning_rate": 9.974552079857871e-07, "logits/chosen": -0.17246639728546143, "logits/rejected": -0.17246639728546143, "logps/chosen": -19.9323673248291, "logps/rejected": -19.9323673248291, "loss": 0.4495, "rewards/accuracies": 0.0, "rewards/chosen": 0.05035839229822159, "rewards/margins": 0.0, "rewards/rejected": 0.05035839229822159, "step": 2754 }, { "epoch": 0.45, "learning_rate": 9.974419479306e-07, "logits/chosen": -0.32149481773376465, "logits/rejected": -0.2724834382534027, "logps/chosen": -50.733314514160156, "logps/rejected": -102.92529296875, "loss": 0.5665, "rewards/accuracies": 1.0, "rewards/chosen": 1.6038635969161987, "rewards/margins": 0.5933235883712769, "rewards/rejected": 1.0105400085449219, "step": 2755 }, { "epoch": 0.45, "learning_rate": 9.97428653506881e-07, "logits/chosen": -0.7306541800498962, "logits/rejected": -0.6982225179672241, "logps/chosen": -96.67788696289062, "logps/rejected": -128.59170532226562, "loss": 1.8905, "rewards/accuracies": 0.0, "rewards/chosen": 3.56640625, "rewards/margins": -1.5363402366638184, "rewards/rejected": 5.102746486663818, "step": 2756 }, { "epoch": 0.45, "learning_rate": 9.974153247155487e-07, "logits/chosen": -0.49117040634155273, "logits/rejected": -0.45545196533203125, "logps/chosen": -100.459716796875, "logps/rejected": -68.26005554199219, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 2.4912993907928467, "rewards/margins": 0.34538722038269043, "rewards/rejected": 2.1459121704101562, "step": 2757 }, { "epoch": 0.45, "learning_rate": 9.974019615575243e-07, "logits/chosen": -0.4532606601715088, "logits/rejected": -0.4532606601715088, "logps/chosen": -95.0459213256836, "logps/rejected": -95.0459213256836, "loss": 0.4117, "rewards/accuracies": 0.0, "rewards/chosen": 1.5559097528457642, "rewards/margins": 0.0, "rewards/rejected": 1.5559097528457642, "step": 2758 }, { "epoch": 0.45, "learning_rate": 9.973885640337307e-07, "logits/chosen": -0.4748102128505707, "logits/rejected": -0.4583100378513336, "logps/chosen": -72.5621337890625, "logps/rejected": -112.5919189453125, "loss": 1.2654, "rewards/accuracies": 0.0, "rewards/chosen": 1.0961135625839233, "rewards/margins": -1.3677674531936646, "rewards/rejected": 2.463881015777588, "step": 2759 }, { "epoch": 0.45, "learning_rate": 9.973751321450935e-07, "logits/chosen": -0.6012923717498779, "logits/rejected": -0.4598844349384308, "logps/chosen": -37.81144332885742, "logps/rejected": -72.78169250488281, "loss": 0.9024, "rewards/accuracies": 0.0, "rewards/chosen": 1.7359058856964111, "rewards/margins": -1.0631434917449951, "rewards/rejected": 2.7990493774414062, "step": 2760 }, { "epoch": 0.45, "learning_rate": 9.973616658925412e-07, "logits/chosen": -0.9660739898681641, "logits/rejected": -0.9553161859512329, "logps/chosen": -82.60901641845703, "logps/rejected": -38.534332275390625, "loss": 0.2646, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589531183242798, "rewards/margins": 0.7409858703613281, "rewards/rejected": 0.11796722561120987, "step": 2761 }, { "epoch": 0.45, "learning_rate": 9.973481652770038e-07, "logits/chosen": -0.7620826959609985, "logits/rejected": -0.8463171720504761, "logps/chosen": -295.69696044921875, "logps/rejected": -139.69711303710938, "loss": 1.8903, "rewards/accuracies": 0.0, "rewards/chosen": 2.531494140625, "rewards/margins": -3.349395751953125, "rewards/rejected": 5.880889892578125, "step": 2762 }, { "epoch": 0.45, "learning_rate": 9.973346302994139e-07, "logits/chosen": -0.6083037257194519, "logits/rejected": -0.6131683588027954, "logps/chosen": -66.68328857421875, "logps/rejected": -136.68731689453125, "loss": 1.3748, "rewards/accuracies": 0.0, "rewards/chosen": 0.6451507806777954, "rewards/margins": -2.361123561859131, "rewards/rejected": 3.006274461746216, "step": 2763 }, { "epoch": 0.45, "learning_rate": 9.97321060960707e-07, "logits/chosen": -0.6112712621688843, "logits/rejected": -0.21797646582126617, "logps/chosen": -123.88890838623047, "logps/rejected": -58.40677261352539, "loss": 0.0819, "rewards/accuracies": 1.0, "rewards/chosen": 4.343447208404541, "rewards/margins": 2.589505434036255, "rewards/rejected": 1.7539417743682861, "step": 2764 }, { "epoch": 0.45, "learning_rate": 9.973074572618204e-07, "logits/chosen": -0.3802834749221802, "logits/rejected": -0.39401543140411377, "logps/chosen": -59.36220932006836, "logps/rejected": -70.48258972167969, "loss": 0.9802, "rewards/accuracies": 1.0, "rewards/chosen": 0.4778057038784027, "rewards/margins": 0.29035454988479614, "rewards/rejected": 0.18745116889476776, "step": 2765 }, { "epoch": 0.45, "learning_rate": 9.972938192036944e-07, "logits/chosen": -0.4923504889011383, "logits/rejected": -0.48510393500328064, "logps/chosen": -76.70445251464844, "logps/rejected": -30.305667877197266, "loss": 0.147, "rewards/accuracies": 1.0, "rewards/chosen": 1.4201287031173706, "rewards/margins": 1.2115013599395752, "rewards/rejected": 0.20862732827663422, "step": 2766 }, { "epoch": 0.45, "learning_rate": 9.972801467872705e-07, "logits/chosen": -0.9804417490959167, "logits/rejected": -0.9168195128440857, "logps/chosen": -110.90325164794922, "logps/rejected": -115.95205688476562, "loss": 0.739, "rewards/accuracies": 0.0, "rewards/chosen": 1.1680138111114502, "rewards/margins": -0.8660423755645752, "rewards/rejected": 2.0340561866760254, "step": 2767 }, { "epoch": 0.45, "learning_rate": 9.97266440013494e-07, "logits/chosen": -0.8307451605796814, "logits/rejected": -0.7060705423355103, "logps/chosen": -92.55058288574219, "logps/rejected": -117.2344741821289, "loss": 1.2086, "rewards/accuracies": 0.0, "rewards/chosen": 2.5162408351898193, "rewards/margins": -0.2703230381011963, "rewards/rejected": 2.7865638732910156, "step": 2768 }, { "epoch": 0.45, "learning_rate": 9.972526988833117e-07, "logits/chosen": -0.5315032005310059, "logits/rejected": -0.5476739406585693, "logps/chosen": -7.38773250579834, "logps/rejected": -18.489490509033203, "loss": 0.5678, "rewards/accuracies": 0.0, "rewards/chosen": 0.3597889840602875, "rewards/margins": -0.11495277285575867, "rewards/rejected": 0.47474175691604614, "step": 2769 }, { "epoch": 0.45, "learning_rate": 9.972389233976729e-07, "logits/chosen": -0.993533194065094, "logits/rejected": -0.9608875513076782, "logps/chosen": -87.71975708007812, "logps/rejected": -19.195941925048828, "loss": 0.2794, "rewards/accuracies": 1.0, "rewards/chosen": 0.9598922729492188, "rewards/margins": 0.6988555788993835, "rewards/rejected": 0.2610366940498352, "step": 2770 }, { "epoch": 0.45, "learning_rate": 9.972251135575293e-07, "logits/chosen": -0.6664839386940002, "logits/rejected": -0.6502822637557983, "logps/chosen": -80.98863983154297, "logps/rejected": -60.69286346435547, "loss": 0.2641, "rewards/accuracies": 1.0, "rewards/chosen": 2.8042962551116943, "rewards/margins": 1.0470200777053833, "rewards/rejected": 1.757276177406311, "step": 2771 }, { "epoch": 0.45, "learning_rate": 9.972112693638352e-07, "logits/chosen": -0.5903957486152649, "logits/rejected": -0.41358405351638794, "logps/chosen": -83.04814147949219, "logps/rejected": -25.583513259887695, "loss": 0.4223, "rewards/accuracies": 1.0, "rewards/chosen": 2.4742584228515625, "rewards/margins": 1.9198936223983765, "rewards/rejected": 0.554364800453186, "step": 2772 }, { "epoch": 0.45, "learning_rate": 9.971973908175471e-07, "logits/chosen": -0.5055336952209473, "logits/rejected": -0.5055336952209473, "logps/chosen": -19.458330154418945, "logps/rejected": -19.458330154418945, "loss": 0.3768, "rewards/accuracies": 0.0, "rewards/chosen": 1.427747130393982, "rewards/margins": 0.0, "rewards/rejected": 1.427747130393982, "step": 2773 }, { "epoch": 0.45, "learning_rate": 9.971834779196237e-07, "logits/chosen": -0.4414650797843933, "logits/rejected": -0.4649164378643036, "logps/chosen": -101.6256332397461, "logps/rejected": -107.72998809814453, "loss": 1.2164, "rewards/accuracies": 0.0, "rewards/chosen": 2.3429360389709473, "rewards/margins": -1.426457166671753, "rewards/rejected": 3.7693932056427, "step": 2774 }, { "epoch": 0.45, "learning_rate": 9.971695306710267e-07, "logits/chosen": -0.9546013474464417, "logits/rejected": -0.960035502910614, "logps/chosen": -98.79627990722656, "logps/rejected": -76.37370300292969, "loss": 1.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.3340499997138977, "rewards/margins": -0.06699523329734802, "rewards/rejected": 0.4010452330112457, "step": 2775 }, { "epoch": 0.45, "learning_rate": 9.97155549072719e-07, "logits/chosen": -0.5651718974113464, "logits/rejected": -0.5084046721458435, "logps/chosen": -93.31173706054688, "logps/rejected": -52.57875061035156, "loss": 0.8732, "rewards/accuracies": 0.0, "rewards/chosen": 0.404754638671875, "rewards/margins": -1.113525390625, "rewards/rejected": 1.518280029296875, "step": 2776 }, { "epoch": 0.45, "learning_rate": 9.971415331256672e-07, "logits/chosen": -0.4740394949913025, "logits/rejected": -0.46582844853401184, "logps/chosen": -58.764251708984375, "logps/rejected": -87.14785766601562, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 1.2974258661270142, "rewards/margins": 1.087300181388855, "rewards/rejected": 0.21012572944164276, "step": 2777 }, { "epoch": 0.45, "learning_rate": 9.971274828308393e-07, "logits/chosen": 0.0017340072663500905, "logits/rejected": 0.0017340072663500905, "logps/chosen": -10.898746490478516, "logps/rejected": -10.898746490478516, "loss": 0.8495, "rewards/accuracies": 0.0, "rewards/chosen": 0.4463333189487457, "rewards/margins": 0.0, "rewards/rejected": 0.4463333189487457, "step": 2778 }, { "epoch": 0.45, "learning_rate": 9.971133981892065e-07, "logits/chosen": -0.365793377161026, "logits/rejected": -0.25116443634033203, "logps/chosen": -96.47700500488281, "logps/rejected": -45.89850997924805, "loss": 0.4122, "rewards/accuracies": 0.0, "rewards/chosen": 1.476100206375122, "rewards/margins": -0.08057665824890137, "rewards/rejected": 1.5566768646240234, "step": 2779 }, { "epoch": 0.45, "learning_rate": 9.970992792017412e-07, "logits/chosen": -0.6761587262153625, "logits/rejected": -0.7035754919052124, "logps/chosen": -111.3717269897461, "logps/rejected": -53.71491622924805, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 2.437284231185913, "rewards/margins": 0.6572003364562988, "rewards/rejected": 1.7800838947296143, "step": 2780 }, { "epoch": 0.45, "learning_rate": 9.970851258694197e-07, "logits/chosen": -0.668318510055542, "logits/rejected": -0.6119921803474426, "logps/chosen": -98.47064208984375, "logps/rejected": -167.74197387695312, "loss": 0.4083, "rewards/accuracies": 0.0, "rewards/chosen": 0.46091386675834656, "rewards/margins": -0.2037406861782074, "rewards/rejected": 0.664654552936554, "step": 2781 }, { "epoch": 0.45, "learning_rate": 9.970709381932192e-07, "logits/chosen": -0.8378563523292542, "logits/rejected": -0.8421419262886047, "logps/chosen": -115.90245819091797, "logps/rejected": -153.91001892089844, "loss": 1.0153, "rewards/accuracies": 0.0, "rewards/chosen": 2.6198203563690186, "rewards/margins": -1.6154320240020752, "rewards/rejected": 4.235252380371094, "step": 2782 }, { "epoch": 0.45, "learning_rate": 9.970567161741204e-07, "logits/chosen": -0.3509829044342041, "logits/rejected": -0.3509829044342041, "logps/chosen": -0.5993773937225342, "logps/rejected": -0.5993773937225342, "loss": 0.834, "rewards/accuracies": 0.0, "rewards/chosen": 0.15111422538757324, "rewards/margins": 0.0, "rewards/rejected": 0.15111422538757324, "step": 2783 }, { "epoch": 0.45, "learning_rate": 9.970424598131056e-07, "logits/chosen": 0.007277801167219877, "logits/rejected": 0.0033893384970724583, "logps/chosen": -2.074949026107788, "logps/rejected": -24.612417221069336, "loss": 0.7785, "rewards/accuracies": 1.0, "rewards/chosen": 0.2190452367067337, "rewards/margins": 0.25707870721817017, "rewards/rejected": -0.038033485412597656, "step": 2784 }, { "epoch": 0.45, "learning_rate": 9.970281691111597e-07, "logits/chosen": -0.9508386850357056, "logits/rejected": -0.9971612691879272, "logps/chosen": -62.80793380737305, "logps/rejected": -70.24574279785156, "loss": 1.2532, "rewards/accuracies": 0.0, "rewards/chosen": 1.0249149799346924, "rewards/margins": -0.4846874475479126, "rewards/rejected": 1.509602427482605, "step": 2785 }, { "epoch": 0.45, "learning_rate": 9.970138440692705e-07, "logits/chosen": -0.6860896944999695, "logits/rejected": -0.6706076860427856, "logps/chosen": -75.23759460449219, "logps/rejected": -129.6507568359375, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": 2.367206573486328, "rewards/margins": 2.0340797901153564, "rewards/rejected": 0.33312684297561646, "step": 2786 }, { "epoch": 0.45, "learning_rate": 9.969994846884273e-07, "logits/chosen": -0.3960302174091339, "logits/rejected": -0.4177751839160919, "logps/chosen": -104.93065643310547, "logps/rejected": -102.10932922363281, "loss": 0.9191, "rewards/accuracies": 0.0, "rewards/chosen": 0.9517410397529602, "rewards/margins": -0.07358402013778687, "rewards/rejected": 1.025325059890747, "step": 2787 }, { "epoch": 0.45, "learning_rate": 9.969850909696224e-07, "logits/chosen": -0.9452205896377563, "logits/rejected": -0.9044455289840698, "logps/chosen": -66.07939910888672, "logps/rejected": -112.82978820800781, "loss": 2.5734, "rewards/accuracies": 0.0, "rewards/chosen": 2.170703887939453, "rewards/margins": -2.434497833251953, "rewards/rejected": 4.605201721191406, "step": 2788 }, { "epoch": 0.45, "learning_rate": 9.969706629138503e-07, "logits/chosen": -1.0567436218261719, "logits/rejected": -1.042405366897583, "logps/chosen": -108.1236572265625, "logps/rejected": -62.288291931152344, "loss": 0.7539, "rewards/accuracies": 0.0, "rewards/chosen": 0.7839828729629517, "rewards/margins": -1.2225807905197144, "rewards/rejected": 2.006563663482666, "step": 2789 }, { "epoch": 0.45, "learning_rate": 9.969562005221078e-07, "logits/chosen": -0.3003205358982086, "logits/rejected": -0.1731029450893402, "logps/chosen": -69.66650390625, "logps/rejected": -78.78456115722656, "loss": 1.2141, "rewards/accuracies": 0.0, "rewards/chosen": 1.1505661010742188, "rewards/margins": -0.14121556282043457, "rewards/rejected": 1.2917816638946533, "step": 2790 }, { "epoch": 0.45, "learning_rate": 9.96941703795394e-07, "logits/chosen": -0.7471804022789001, "logits/rejected": -0.7408641576766968, "logps/chosen": -59.6536865234375, "logps/rejected": -53.98600387573242, "loss": 0.7393, "rewards/accuracies": 1.0, "rewards/chosen": 1.4542007446289062, "rewards/margins": 0.5467754006385803, "rewards/rejected": 0.9074253439903259, "step": 2791 }, { "epoch": 0.45, "learning_rate": 9.969271727347107e-07, "logits/chosen": -0.4900730848312378, "logits/rejected": -0.48395076394081116, "logps/chosen": -7.675404071807861, "logps/rejected": -11.688956260681152, "loss": 0.3283, "rewards/accuracies": 1.0, "rewards/chosen": 0.09069342911243439, "rewards/margins": 0.12254376709461212, "rewards/rejected": -0.031850337982177734, "step": 2792 }, { "epoch": 0.45, "learning_rate": 9.969126073410617e-07, "logits/chosen": -0.9179167747497559, "logits/rejected": -0.9102720022201538, "logps/chosen": -102.34677124023438, "logps/rejected": -88.68769836425781, "loss": 2.1293, "rewards/accuracies": 0.0, "rewards/chosen": 0.8785873651504517, "rewards/margins": -1.3732277154922485, "rewards/rejected": 2.2518150806427, "step": 2793 }, { "epoch": 0.45, "learning_rate": 9.96898007615453e-07, "logits/chosen": -1.102908730506897, "logits/rejected": -0.9621865153312683, "logps/chosen": -83.93500518798828, "logps/rejected": -104.5610122680664, "loss": 0.1458, "rewards/accuracies": 1.0, "rewards/chosen": 4.684670448303223, "rewards/margins": 1.8395051956176758, "rewards/rejected": 2.845165252685547, "step": 2794 }, { "epoch": 0.45, "learning_rate": 9.968833735588942e-07, "logits/chosen": -0.14746177196502686, "logits/rejected": -0.12435253709554672, "logps/chosen": -20.786596298217773, "logps/rejected": -0.7626105546951294, "loss": 1.3583, "rewards/accuracies": 0.0, "rewards/chosen": 0.19833946228027344, "rewards/margins": -0.2857285439968109, "rewards/rejected": 0.48406800627708435, "step": 2795 }, { "epoch": 0.45, "learning_rate": 9.968687051723956e-07, "logits/chosen": -0.5033896565437317, "logits/rejected": -0.44471967220306396, "logps/chosen": -30.8253173828125, "logps/rejected": -76.23667907714844, "loss": 1.4237, "rewards/accuracies": 0.0, "rewards/chosen": 0.8900157809257507, "rewards/margins": -0.810771644115448, "rewards/rejected": 1.7007874250411987, "step": 2796 }, { "epoch": 0.45, "learning_rate": 9.968540024569708e-07, "logits/chosen": -0.6522697806358337, "logits/rejected": -0.6701099872589111, "logps/chosen": -126.52640533447266, "logps/rejected": -62.21731948852539, "loss": 1.8754, "rewards/accuracies": 0.0, "rewards/chosen": -0.240275576710701, "rewards/margins": -2.5471906661987305, "rewards/rejected": 2.306915044784546, "step": 2797 }, { "epoch": 0.45, "learning_rate": 9.96839265413636e-07, "logits/chosen": -0.28467005491256714, "logits/rejected": -0.28467005491256714, "logps/chosen": -62.252586364746094, "logps/rejected": -62.252586364746094, "loss": 0.9296, "rewards/accuracies": 0.0, "rewards/chosen": 0.7838752865791321, "rewards/margins": 0.0, "rewards/rejected": 0.7838752865791321, "step": 2798 }, { "epoch": 0.45, "learning_rate": 9.968244940434088e-07, "logits/chosen": -0.31067386269569397, "logits/rejected": -0.2801799476146698, "logps/chosen": -93.81636047363281, "logps/rejected": -85.18898010253906, "loss": 0.5696, "rewards/accuracies": 1.0, "rewards/chosen": 3.81103515625, "rewards/margins": 2.5153608322143555, "rewards/rejected": 1.295674204826355, "step": 2799 }, { "epoch": 0.45, "learning_rate": 9.968096883473103e-07, "logits/chosen": -1.0819857120513916, "logits/rejected": -1.0438941717147827, "logps/chosen": -140.9322967529297, "logps/rejected": -40.83612823486328, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 0.3394272029399872, "rewards/margins": 0.20294992625713348, "rewards/rejected": 0.1364772766828537, "step": 2800 }, { "epoch": 0.45, "learning_rate": 9.96794848326363e-07, "logits/chosen": -0.5345103740692139, "logits/rejected": -0.5345103740692139, "logps/chosen": -57.7136116027832, "logps/rejected": -57.7136116027832, "loss": 0.4674, "rewards/accuracies": 0.0, "rewards/chosen": 1.4937069416046143, "rewards/margins": 0.0, "rewards/rejected": 1.4937069416046143, "step": 2801 }, { "epoch": 0.45, "learning_rate": 9.967799739815924e-07, "logits/chosen": -0.8403136730194092, "logits/rejected": -0.7717719078063965, "logps/chosen": -62.28253936767578, "logps/rejected": -56.357261657714844, "loss": 0.2348, "rewards/accuracies": 1.0, "rewards/chosen": 1.3712364435195923, "rewards/margins": 0.7120777368545532, "rewards/rejected": 0.6591587066650391, "step": 2802 }, { "epoch": 0.45, "learning_rate": 9.967650653140262e-07, "logits/chosen": -0.5827478766441345, "logits/rejected": -0.5827478766441345, "logps/chosen": -67.99664306640625, "logps/rejected": -67.99664306640625, "loss": 0.3776, "rewards/accuracies": 0.0, "rewards/chosen": 2.352759599685669, "rewards/margins": 0.0, "rewards/rejected": 2.352759599685669, "step": 2803 }, { "epoch": 0.46, "learning_rate": 9.967501223246945e-07, "logits/chosen": -0.7630183696746826, "logits/rejected": -0.659562349319458, "logps/chosen": -182.49658203125, "logps/rejected": -76.52875518798828, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 3.0477066040039062, "rewards/margins": 0.5102837085723877, "rewards/rejected": 2.5374228954315186, "step": 2804 }, { "epoch": 0.46, "learning_rate": 9.967351450146296e-07, "logits/chosen": -0.5010514855384827, "logits/rejected": -0.28991228342056274, "logps/chosen": -116.56410217285156, "logps/rejected": -90.32879638671875, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 4.1032609939575195, "rewards/margins": 2.128047466278076, "rewards/rejected": 1.975213646888733, "step": 2805 }, { "epoch": 0.46, "learning_rate": 9.967201333848664e-07, "logits/chosen": -0.4783041775226593, "logits/rejected": -0.4783041775226593, "logps/chosen": -51.79425048828125, "logps/rejected": -51.79425048828125, "loss": 0.6016, "rewards/accuracies": 0.0, "rewards/chosen": 1.532537817955017, "rewards/margins": 0.0, "rewards/rejected": 1.532537817955017, "step": 2806 }, { "epoch": 0.46, "learning_rate": 9.967050874364418e-07, "logits/chosen": -0.8618499040603638, "logits/rejected": -0.8496244549751282, "logps/chosen": -92.9129638671875, "logps/rejected": -116.02557373046875, "loss": 1.2747, "rewards/accuracies": 0.0, "rewards/chosen": 2.777575731277466, "rewards/margins": -0.6902174949645996, "rewards/rejected": 3.4677932262420654, "step": 2807 }, { "epoch": 0.46, "learning_rate": 9.966900071703957e-07, "logits/chosen": -0.7503726482391357, "logits/rejected": -0.7248380184173584, "logps/chosen": -85.51229858398438, "logps/rejected": -85.5873031616211, "loss": 1.3458, "rewards/accuracies": 0.0, "rewards/chosen": 0.38519975543022156, "rewards/margins": -0.5285232067108154, "rewards/rejected": 0.9137229919433594, "step": 2808 }, { "epoch": 0.46, "learning_rate": 9.966748925877696e-07, "logits/chosen": -1.1766698360443115, "logits/rejected": -1.1691267490386963, "logps/chosen": -68.17766571044922, "logps/rejected": -34.35507583618164, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 2.0286521911621094, "rewards/margins": 1.694962739944458, "rewards/rejected": 0.33368951082229614, "step": 2809 }, { "epoch": 0.46, "learning_rate": 9.966597436896082e-07, "logits/chosen": -0.6126782894134521, "logits/rejected": -0.7069172263145447, "logps/chosen": -303.38970947265625, "logps/rejected": -233.03118896484375, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 3.1634156703948975, "rewards/margins": -0.9585816860198975, "rewards/rejected": 4.121997356414795, "step": 2810 }, { "epoch": 0.46, "learning_rate": 9.96644560476958e-07, "logits/chosen": -0.4596509039402008, "logits/rejected": -0.4596509039402008, "logps/chosen": -10.438791275024414, "logps/rejected": -10.438791275024414, "loss": 1.4089, "rewards/accuracies": 0.0, "rewards/chosen": 0.8326982855796814, "rewards/margins": 0.0, "rewards/rejected": 0.8326982855796814, "step": 2811 }, { "epoch": 0.46, "learning_rate": 9.966293429508678e-07, "logits/chosen": -0.7122529745101929, "logits/rejected": -0.7303513884544373, "logps/chosen": -58.19902801513672, "logps/rejected": -41.19921112060547, "loss": 1.7978, "rewards/accuracies": 0.0, "rewards/chosen": 1.1018173694610596, "rewards/margins": -0.2296050786972046, "rewards/rejected": 1.3314224481582642, "step": 2812 }, { "epoch": 0.46, "learning_rate": 9.966140911123893e-07, "logits/chosen": -0.5226424932479858, "logits/rejected": -0.5226424932479858, "logps/chosen": -57.418861389160156, "logps/rejected": -57.418861389160156, "loss": 0.5402, "rewards/accuracies": 0.0, "rewards/chosen": 1.2788078784942627, "rewards/margins": 0.0, "rewards/rejected": 1.2788078784942627, "step": 2813 }, { "epoch": 0.46, "learning_rate": 9.96598804962576e-07, "logits/chosen": -0.4224169850349426, "logits/rejected": -0.29432711005210876, "logps/chosen": -64.42704010009766, "logps/rejected": -57.48780059814453, "loss": 0.6126, "rewards/accuracies": 1.0, "rewards/chosen": 1.5346771478652954, "rewards/margins": 0.3530212640762329, "rewards/rejected": 1.1816558837890625, "step": 2814 }, { "epoch": 0.46, "learning_rate": 9.965834845024842e-07, "logits/chosen": -0.4908519387245178, "logits/rejected": -0.48645761609077454, "logps/chosen": -61.011329650878906, "logps/rejected": -86.941162109375, "loss": 1.203, "rewards/accuracies": 1.0, "rewards/chosen": 1.2327117919921875, "rewards/margins": 1.220868706703186, "rewards/rejected": 0.011843109503388405, "step": 2815 }, { "epoch": 0.46, "learning_rate": 9.965681297331725e-07, "logits/chosen": -0.6768249273300171, "logits/rejected": -0.6913802623748779, "logps/chosen": -41.86834716796875, "logps/rejected": -43.1072883605957, "loss": 0.9269, "rewards/accuracies": 0.0, "rewards/chosen": 1.774511694908142, "rewards/margins": -0.08019983768463135, "rewards/rejected": 1.8547115325927734, "step": 2816 }, { "epoch": 0.46, "learning_rate": 9.965527406557013e-07, "logits/chosen": -0.834205150604248, "logits/rejected": -0.8558136820793152, "logps/chosen": -202.81024169921875, "logps/rejected": -52.358848571777344, "loss": 0.5763, "rewards/accuracies": 1.0, "rewards/chosen": 2.0778045654296875, "rewards/margins": 0.9222530126571655, "rewards/rejected": 1.155551552772522, "step": 2817 }, { "epoch": 0.46, "learning_rate": 9.965373172711343e-07, "logits/chosen": -0.12721943855285645, "logits/rejected": -0.1840745061635971, "logps/chosen": -54.62092971801758, "logps/rejected": -69.79454040527344, "loss": 1.0861, "rewards/accuracies": 0.0, "rewards/chosen": 0.535906970500946, "rewards/margins": -1.027156114578247, "rewards/rejected": 1.5630630254745483, "step": 2818 }, { "epoch": 0.46, "learning_rate": 9.96521859580537e-07, "logits/chosen": -0.6620593070983887, "logits/rejected": -0.39715445041656494, "logps/chosen": -426.658447265625, "logps/rejected": -120.1943130493164, "loss": 0.2828, "rewards/accuracies": 1.0, "rewards/chosen": 1.9346283674240112, "rewards/margins": 0.5928382873535156, "rewards/rejected": 1.3417900800704956, "step": 2819 }, { "epoch": 0.46, "learning_rate": 9.965063675849773e-07, "logits/chosen": -0.5548496246337891, "logits/rejected": -0.5578559041023254, "logps/chosen": -121.65263366699219, "logps/rejected": -91.48454284667969, "loss": 0.4954, "rewards/accuracies": 0.0, "rewards/chosen": 0.3471206724643707, "rewards/margins": -0.3577285706996918, "rewards/rejected": 0.7048492431640625, "step": 2820 }, { "epoch": 0.46, "learning_rate": 9.964908412855255e-07, "logits/chosen": -0.6287947297096252, "logits/rejected": -0.5800250172615051, "logps/chosen": -162.37379455566406, "logps/rejected": -75.810791015625, "loss": 0.3938, "rewards/accuracies": 1.0, "rewards/chosen": 3.254892110824585, "rewards/margins": 1.5103691816329956, "rewards/rejected": 1.7445229291915894, "step": 2821 }, { "epoch": 0.46, "learning_rate": 9.964752806832543e-07, "logits/chosen": -0.7667965888977051, "logits/rejected": -0.7187288403511047, "logps/chosen": -82.87654113769531, "logps/rejected": -118.70243072509766, "loss": 0.2435, "rewards/accuracies": 1.0, "rewards/chosen": 2.145732879638672, "rewards/margins": 1.0487273931503296, "rewards/rejected": 1.0970054864883423, "step": 2822 }, { "epoch": 0.46, "learning_rate": 9.96459685779239e-07, "logits/chosen": -0.28817641735076904, "logits/rejected": -0.390939325094223, "logps/chosen": -79.4244384765625, "logps/rejected": -71.54327392578125, "loss": 1.3533, "rewards/accuracies": 0.0, "rewards/chosen": 0.5532089471817017, "rewards/margins": -1.3808997869491577, "rewards/rejected": 1.9341087341308594, "step": 2823 }, { "epoch": 0.46, "learning_rate": 9.964440565745573e-07, "logits/chosen": -0.7422308325767517, "logits/rejected": -0.7855453491210938, "logps/chosen": -44.8055534362793, "logps/rejected": -55.714759826660156, "loss": 0.5508, "rewards/accuracies": 1.0, "rewards/chosen": 1.6614567041397095, "rewards/margins": 0.13106191158294678, "rewards/rejected": 1.5303947925567627, "step": 2824 }, { "epoch": 0.46, "learning_rate": 9.964283930702884e-07, "logits/chosen": -0.0877782329916954, "logits/rejected": -0.029494471848011017, "logps/chosen": -38.610755920410156, "logps/rejected": -49.60559844970703, "loss": 0.7618, "rewards/accuracies": 1.0, "rewards/chosen": 1.7194846868515015, "rewards/margins": 0.26277005672454834, "rewards/rejected": 1.4567146301269531, "step": 2825 }, { "epoch": 0.46, "learning_rate": 9.964126952675147e-07, "logits/chosen": -0.6711336970329285, "logits/rejected": -0.6814155578613281, "logps/chosen": -108.95097351074219, "logps/rejected": -47.836673736572266, "loss": 2.3336, "rewards/accuracies": 0.0, "rewards/chosen": 0.2076873779296875, "rewards/margins": -1.2799748182296753, "rewards/rejected": 1.4876621961593628, "step": 2826 }, { "epoch": 0.46, "learning_rate": 9.96396963167321e-07, "logits/chosen": -0.8838158845901489, "logits/rejected": -0.8619096279144287, "logps/chosen": -62.662940979003906, "logps/rejected": -35.82862854003906, "loss": 0.9021, "rewards/accuracies": 1.0, "rewards/chosen": 0.6109504699707031, "rewards/margins": 0.4234786927700043, "rewards/rejected": 0.18747177720069885, "step": 2827 }, { "epoch": 0.46, "learning_rate": 9.96381196770794e-07, "logits/chosen": -0.2634252607822418, "logits/rejected": -0.2589022219181061, "logps/chosen": -57.8393440246582, "logps/rejected": -42.24560546875, "loss": 0.7158, "rewards/accuracies": 1.0, "rewards/chosen": 2.1497509479522705, "rewards/margins": 0.0034885406494140625, "rewards/rejected": 2.1462624073028564, "step": 2828 }, { "epoch": 0.46, "learning_rate": 9.963653960790232e-07, "logits/chosen": -0.7113769054412842, "logits/rejected": -0.6847732663154602, "logps/chosen": -61.455726623535156, "logps/rejected": -66.38223266601562, "loss": 0.155, "rewards/accuracies": 1.0, "rewards/chosen": 2.2792251110076904, "rewards/margins": 1.283421277999878, "rewards/rejected": 0.9958038330078125, "step": 2829 }, { "epoch": 0.46, "learning_rate": 9.963495610931001e-07, "logits/chosen": -0.7019805312156677, "logits/rejected": -0.7210304737091064, "logps/chosen": -112.50293731689453, "logps/rejected": -36.9651985168457, "loss": 1.4759, "rewards/accuracies": 0.0, "rewards/chosen": 1.0131462812423706, "rewards/margins": -0.48353147506713867, "rewards/rejected": 1.4966777563095093, "step": 2830 }, { "epoch": 0.46, "learning_rate": 9.96333691814119e-07, "logits/chosen": -0.36829110980033875, "logits/rejected": -0.3125939965248108, "logps/chosen": -112.36082458496094, "logps/rejected": -21.408340454101562, "loss": 0.3092, "rewards/accuracies": 1.0, "rewards/chosen": 1.248931884765625, "rewards/margins": 0.978899359703064, "rewards/rejected": 0.27003249526023865, "step": 2831 }, { "epoch": 0.46, "learning_rate": 9.96317788243176e-07, "logits/chosen": -0.29926684498786926, "logits/rejected": -0.2031754106283188, "logps/chosen": -111.05517578125, "logps/rejected": -23.950660705566406, "loss": 2.0235, "rewards/accuracies": 1.0, "rewards/chosen": 1.3775742053985596, "rewards/margins": 0.5200744867324829, "rewards/rejected": 0.8574997186660767, "step": 2832 }, { "epoch": 0.46, "learning_rate": 9.963018503813698e-07, "logits/chosen": -0.5714156031608582, "logits/rejected": -0.5857564210891724, "logps/chosen": -114.88980102539062, "logps/rejected": -115.4732894897461, "loss": 0.5468, "rewards/accuracies": 0.0, "rewards/chosen": 1.7423431873321533, "rewards/margins": -0.2928321361541748, "rewards/rejected": 2.035175323486328, "step": 2833 }, { "epoch": 0.46, "learning_rate": 9.962858782298023e-07, "logits/chosen": -0.6006699204444885, "logits/rejected": -0.567409098148346, "logps/chosen": -103.27996063232422, "logps/rejected": -131.1409912109375, "loss": 0.3227, "rewards/accuracies": 1.0, "rewards/chosen": 1.4262069463729858, "rewards/margins": 0.3057159185409546, "rewards/rejected": 1.1204910278320312, "step": 2834 }, { "epoch": 0.46, "learning_rate": 9.962698717895761e-07, "logits/chosen": -0.27387937903404236, "logits/rejected": -0.27387937903404236, "logps/chosen": -1.190713882446289, "logps/rejected": -1.190713882446289, "loss": 0.7149, "rewards/accuracies": 0.0, "rewards/chosen": 0.1506357491016388, "rewards/margins": 0.0, "rewards/rejected": 0.1506357491016388, "step": 2835 }, { "epoch": 0.46, "learning_rate": 9.962538310617976e-07, "logits/chosen": -0.36585813760757446, "logits/rejected": -0.35100916028022766, "logps/chosen": -70.00981903076172, "logps/rejected": -59.814056396484375, "loss": 1.1149, "rewards/accuracies": 0.0, "rewards/chosen": 0.5115295648574829, "rewards/margins": -1.2945572137832642, "rewards/rejected": 1.806086778640747, "step": 2836 }, { "epoch": 0.46, "learning_rate": 9.962377560475751e-07, "logits/chosen": -0.337313175201416, "logits/rejected": -0.2669571042060852, "logps/chosen": -71.81011962890625, "logps/rejected": -264.3656311035156, "loss": 1.5728, "rewards/accuracies": 0.0, "rewards/chosen": 1.5355110168457031, "rewards/margins": -2.132408857345581, "rewards/rejected": 3.667919874191284, "step": 2837 }, { "epoch": 0.46, "learning_rate": 9.96221646748019e-07, "logits/chosen": -0.3214304447174072, "logits/rejected": -0.34881591796875, "logps/chosen": -67.7279052734375, "logps/rejected": -54.677955627441406, "loss": 1.1371, "rewards/accuracies": 0.0, "rewards/chosen": 0.8520073294639587, "rewards/margins": -1.3507368564605713, "rewards/rejected": 2.202744245529175, "step": 2838 }, { "epoch": 0.46, "learning_rate": 9.962055031642425e-07, "logits/chosen": -0.6826164126396179, "logits/rejected": -0.5635214447975159, "logps/chosen": -206.2989044189453, "logps/rejected": -92.41407775878906, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": 3.965681552886963, "rewards/margins": 0.5710678100585938, "rewards/rejected": 3.394613742828369, "step": 2839 }, { "epoch": 0.46, "learning_rate": 9.961893252973609e-07, "logits/chosen": -0.6478317379951477, "logits/rejected": -0.6911817789077759, "logps/chosen": -160.1124725341797, "logps/rejected": -122.06044006347656, "loss": 0.8932, "rewards/accuracies": 1.0, "rewards/chosen": 2.4258408546447754, "rewards/margins": 1.7586305141448975, "rewards/rejected": 0.6672104001045227, "step": 2840 }, { "epoch": 0.46, "learning_rate": 9.96173113148492e-07, "logits/chosen": -0.428083211183548, "logits/rejected": -0.4646521508693695, "logps/chosen": -32.318321228027344, "logps/rejected": -76.90071105957031, "loss": 1.3788, "rewards/accuracies": 0.0, "rewards/chosen": 1.415026068687439, "rewards/margins": -1.2488778829574585, "rewards/rejected": 2.6639039516448975, "step": 2841 }, { "epoch": 0.46, "learning_rate": 9.961568667187554e-07, "logits/chosen": -0.5960243940353394, "logits/rejected": -0.5715738534927368, "logps/chosen": -141.03411865234375, "logps/rejected": -88.96778106689453, "loss": 0.7873, "rewards/accuracies": 0.0, "rewards/chosen": 0.8497146964073181, "rewards/margins": -0.6564720273017883, "rewards/rejected": 1.5061867237091064, "step": 2842 }, { "epoch": 0.46, "learning_rate": 9.961405860092743e-07, "logits/chosen": -0.2769201099872589, "logits/rejected": -0.2086598426103592, "logps/chosen": -140.25613403320312, "logps/rejected": -46.01540756225586, "loss": 0.7643, "rewards/accuracies": 1.0, "rewards/chosen": 2.508859395980835, "rewards/margins": 1.145958423614502, "rewards/rejected": 1.362900972366333, "step": 2843 }, { "epoch": 0.46, "learning_rate": 9.961242710211732e-07, "logits/chosen": -0.2712642550468445, "logits/rejected": -0.26405012607574463, "logps/chosen": -92.47348022460938, "logps/rejected": -90.84397888183594, "loss": 1.5431, "rewards/accuracies": 0.0, "rewards/chosen": 1.4020423889160156, "rewards/margins": -0.46020352840423584, "rewards/rejected": 1.8622459173202515, "step": 2844 }, { "epoch": 0.46, "learning_rate": 9.961079217555791e-07, "logits/chosen": -0.6073368191719055, "logits/rejected": -0.5480813384056091, "logps/chosen": -68.92564392089844, "logps/rejected": -64.33956909179688, "loss": 0.5002, "rewards/accuracies": 0.0, "rewards/chosen": 2.4559555053710938, "rewards/margins": -0.11555171012878418, "rewards/rejected": 2.571507215499878, "step": 2845 }, { "epoch": 0.46, "learning_rate": 9.960915382136222e-07, "logits/chosen": -0.6778991222381592, "logits/rejected": -0.6010990142822266, "logps/chosen": -158.34864807128906, "logps/rejected": -84.1069107055664, "loss": 0.4401, "rewards/accuracies": 1.0, "rewards/chosen": 2.9309709072113037, "rewards/margins": 0.543212890625, "rewards/rejected": 2.3877580165863037, "step": 2846 }, { "epoch": 0.46, "learning_rate": 9.96075120396434e-07, "logits/chosen": -0.27320361137390137, "logits/rejected": -0.27320361137390137, "logps/chosen": -1.0303419828414917, "logps/rejected": -1.0303419828414917, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.26816508173942566, "rewards/margins": 0.0, "rewards/rejected": 0.26816508173942566, "step": 2847 }, { "epoch": 0.46, "learning_rate": 9.960586683051486e-07, "logits/chosen": -0.5080718994140625, "logits/rejected": -0.6480616331100464, "logps/chosen": -79.46052551269531, "logps/rejected": -137.77687072753906, "loss": 1.0244, "rewards/accuracies": 0.0, "rewards/chosen": 1.2349731922149658, "rewards/margins": -1.7479872703552246, "rewards/rejected": 2.9829604625701904, "step": 2848 }, { "epoch": 0.46, "learning_rate": 9.960421819409033e-07, "logits/chosen": -0.699832558631897, "logits/rejected": -0.6192751526832581, "logps/chosen": -61.554893493652344, "logps/rejected": -56.83766174316406, "loss": 1.0189, "rewards/accuracies": 0.0, "rewards/chosen": 0.6162247061729431, "rewards/margins": -0.4322158694267273, "rewards/rejected": 1.0484405755996704, "step": 2849 }, { "epoch": 0.46, "learning_rate": 9.960256613048367e-07, "logits/chosen": -0.24823668599128723, "logits/rejected": -0.266628623008728, "logps/chosen": -10.20042610168457, "logps/rejected": -3.1702170372009277, "loss": 0.691, "rewards/accuracies": 0.0, "rewards/chosen": -0.035308171063661575, "rewards/margins": -0.37100183963775635, "rewards/rejected": 0.3356936573982239, "step": 2850 }, { "epoch": 0.46, "learning_rate": 9.960091063980903e-07, "logits/chosen": -0.5136076211929321, "logits/rejected": -0.5211334824562073, "logps/chosen": -100.70576477050781, "logps/rejected": -94.73616027832031, "loss": 0.9313, "rewards/accuracies": 0.0, "rewards/chosen": 0.6535354852676392, "rewards/margins": -0.9010680913925171, "rewards/rejected": 1.5546035766601562, "step": 2851 }, { "epoch": 0.46, "learning_rate": 9.959925172218078e-07, "logits/chosen": -0.49636387825012207, "logits/rejected": -0.41407427191734314, "logps/chosen": -127.5214614868164, "logps/rejected": -151.33651733398438, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 4.090935707092285, "rewards/margins": 4.086734771728516, "rewards/rejected": 0.004200744908303022, "step": 2852 }, { "epoch": 0.46, "learning_rate": 9.959758937771356e-07, "logits/chosen": -0.6203690767288208, "logits/rejected": -0.5962037444114685, "logps/chosen": -52.53204345703125, "logps/rejected": -93.37054443359375, "loss": 0.4865, "rewards/accuracies": 1.0, "rewards/chosen": 1.0757766962051392, "rewards/margins": 0.7128204107284546, "rewards/rejected": 0.3629562556743622, "step": 2853 }, { "epoch": 0.46, "learning_rate": 9.959592360652222e-07, "logits/chosen": -0.5510792136192322, "logits/rejected": -0.5502473711967468, "logps/chosen": -116.22831726074219, "logps/rejected": -127.97317504882812, "loss": 0.4406, "rewards/accuracies": 1.0, "rewards/chosen": 1.677699327468872, "rewards/margins": 0.9875274896621704, "rewards/rejected": 0.6901718378067017, "step": 2854 }, { "epoch": 0.46, "learning_rate": 9.959425440872184e-07, "logits/chosen": -0.896264910697937, "logits/rejected": -0.8591595888137817, "logps/chosen": -90.75440979003906, "logps/rejected": -71.52449035644531, "loss": 0.9411, "rewards/accuracies": 0.0, "rewards/chosen": 1.5586594343185425, "rewards/margins": -0.8706725835800171, "rewards/rejected": 2.4293320178985596, "step": 2855 }, { "epoch": 0.46, "learning_rate": 9.959258178442774e-07, "logits/chosen": -0.5771440863609314, "logits/rejected": -0.4790460467338562, "logps/chosen": -163.06105041503906, "logps/rejected": -158.81564331054688, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 4.209140300750732, "rewards/margins": 1.69571852684021, "rewards/rejected": 2.5134217739105225, "step": 2856 }, { "epoch": 0.46, "learning_rate": 9.95909057337555e-07, "logits/chosen": -0.40603575110435486, "logits/rejected": -0.36188045144081116, "logps/chosen": -78.86628723144531, "logps/rejected": -118.30115509033203, "loss": 0.8839, "rewards/accuracies": 0.0, "rewards/chosen": 1.1079254150390625, "rewards/margins": -0.7816276550292969, "rewards/rejected": 1.8895530700683594, "step": 2857 }, { "epoch": 0.46, "learning_rate": 9.958922625682087e-07, "logits/chosen": -0.9102029800415039, "logits/rejected": -0.777830958366394, "logps/chosen": -152.0514373779297, "logps/rejected": -303.2801208496094, "loss": 1.1115, "rewards/accuracies": 0.0, "rewards/chosen": 0.9807754755020142, "rewards/margins": -0.7325149774551392, "rewards/rejected": 1.7132904529571533, "step": 2858 }, { "epoch": 0.46, "learning_rate": 9.958754335373996e-07, "logits/chosen": -0.6976878046989441, "logits/rejected": -0.6267940402030945, "logps/chosen": -145.6217803955078, "logps/rejected": -64.53352355957031, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 5.018003940582275, "rewards/margins": 3.8441879749298096, "rewards/rejected": 1.1738159656524658, "step": 2859 }, { "epoch": 0.46, "learning_rate": 9.9585857024629e-07, "logits/chosen": -0.5323889255523682, "logits/rejected": -0.5442507266998291, "logps/chosen": -41.22641372680664, "logps/rejected": -123.2959213256836, "loss": 0.7802, "rewards/accuracies": 0.0, "rewards/chosen": 0.9028869867324829, "rewards/margins": -0.11009907722473145, "rewards/rejected": 1.0129860639572144, "step": 2860 }, { "epoch": 0.46, "learning_rate": 9.958416726960451e-07, "logits/chosen": -0.5650113821029663, "logits/rejected": -0.5650113821029663, "logps/chosen": -84.62055206298828, "logps/rejected": -84.62055206298828, "loss": 0.4754, "rewards/accuracies": 0.0, "rewards/chosen": 2.0237388610839844, "rewards/margins": 0.0, "rewards/rejected": 2.0237388610839844, "step": 2861 }, { "epoch": 0.46, "learning_rate": 9.95824740887832e-07, "logits/chosen": -0.35242900252342224, "logits/rejected": -0.4028477966785431, "logps/chosen": -49.0285530090332, "logps/rejected": -104.05757141113281, "loss": 0.8671, "rewards/accuracies": 1.0, "rewards/chosen": 0.158122256398201, "rewards/margins": 0.5514938235282898, "rewards/rejected": -0.39337158203125, "step": 2862 }, { "epoch": 0.46, "learning_rate": 9.95807774822821e-07, "logits/chosen": -0.40658503770828247, "logits/rejected": -0.289701908826828, "logps/chosen": -60.0975341796875, "logps/rejected": -86.98690032958984, "loss": 0.2731, "rewards/accuracies": 1.0, "rewards/chosen": 2.9140243530273438, "rewards/margins": 0.9951759576797485, "rewards/rejected": 1.9188483953475952, "step": 2863 }, { "epoch": 0.46, "learning_rate": 9.957907745021843e-07, "logits/chosen": -0.6233574748039246, "logits/rejected": -0.6318773627281189, "logps/chosen": -74.61865997314453, "logps/rejected": -107.26203918457031, "loss": 0.7458, "rewards/accuracies": 0.0, "rewards/chosen": 1.4311035871505737, "rewards/margins": -1.1653517484664917, "rewards/rejected": 2.5964553356170654, "step": 2864 }, { "epoch": 0.47, "learning_rate": 9.957737399270962e-07, "logits/chosen": -0.34212759137153625, "logits/rejected": -0.23890960216522217, "logps/chosen": -47.953147888183594, "logps/rejected": -36.71925354003906, "loss": 0.3771, "rewards/accuracies": 1.0, "rewards/chosen": 1.2114723920822144, "rewards/margins": 0.24790924787521362, "rewards/rejected": 0.9635631442070007, "step": 2865 }, { "epoch": 0.47, "learning_rate": 9.957566710987337e-07, "logits/chosen": -0.47560787200927734, "logits/rejected": -0.2473870813846588, "logps/chosen": -26.064695358276367, "logps/rejected": -109.83831787109375, "loss": 1.8709, "rewards/accuracies": 0.0, "rewards/chosen": 0.8197927474975586, "rewards/margins": -2.574988842010498, "rewards/rejected": 3.3947815895080566, "step": 2866 }, { "epoch": 0.47, "learning_rate": 9.95739568018276e-07, "logits/chosen": -0.6084083318710327, "logits/rejected": -0.5599124431610107, "logps/chosen": -65.17388916015625, "logps/rejected": -29.420896530151367, "loss": 0.9867, "rewards/accuracies": 1.0, "rewards/chosen": 0.6450225710868835, "rewards/margins": 0.15417802333831787, "rewards/rejected": 0.4908445477485657, "step": 2867 }, { "epoch": 0.47, "learning_rate": 9.957224306869053e-07, "logits/chosen": -0.5211718082427979, "logits/rejected": -0.31162339448928833, "logps/chosen": -84.07144165039062, "logps/rejected": -161.28504943847656, "loss": 1.8096, "rewards/accuracies": 0.0, "rewards/chosen": 0.8704147338867188, "rewards/margins": -3.297604560852051, "rewards/rejected": 4.1680192947387695, "step": 2868 }, { "epoch": 0.47, "learning_rate": 9.957052591058048e-07, "logits/chosen": -0.8106447458267212, "logits/rejected": -0.7542531490325928, "logps/chosen": -71.15205383300781, "logps/rejected": -50.943572998046875, "loss": 0.9201, "rewards/accuracies": 0.0, "rewards/chosen": 1.335994005203247, "rewards/margins": -0.13713300228118896, "rewards/rejected": 1.473127007484436, "step": 2869 }, { "epoch": 0.47, "learning_rate": 9.956880532761614e-07, "logits/chosen": -0.9244967103004456, "logits/rejected": -0.7872169613838196, "logps/chosen": -99.9551010131836, "logps/rejected": -57.7180290222168, "loss": 0.4417, "rewards/accuracies": 1.0, "rewards/chosen": 0.7368919253349304, "rewards/margins": 0.009681284427642822, "rewards/rejected": 0.7272106409072876, "step": 2870 }, { "epoch": 0.47, "learning_rate": 9.956708131991639e-07, "logits/chosen": -0.5669144988059998, "logits/rejected": -0.49837005138397217, "logps/chosen": -137.697021484375, "logps/rejected": -165.06951904296875, "loss": 1.6864, "rewards/accuracies": 0.0, "rewards/chosen": 0.354177862405777, "rewards/margins": -3.1622893810272217, "rewards/rejected": 3.516467332839966, "step": 2871 }, { "epoch": 0.47, "learning_rate": 9.95653538876003e-07, "logits/chosen": -0.6810048818588257, "logits/rejected": -0.7532714009284973, "logps/chosen": -70.18235778808594, "logps/rejected": -33.56419372558594, "loss": 2.2771, "rewards/accuracies": 1.0, "rewards/chosen": 1.6618140935897827, "rewards/margins": 0.7737300395965576, "rewards/rejected": 0.8880840539932251, "step": 2872 }, { "epoch": 0.47, "learning_rate": 9.956362303078727e-07, "logits/chosen": -0.872728168964386, "logits/rejected": -0.7350476384162903, "logps/chosen": -135.3574676513672, "logps/rejected": -60.67680740356445, "loss": 0.8223, "rewards/accuracies": 0.0, "rewards/chosen": 1.6096848249435425, "rewards/margins": -0.8318089246749878, "rewards/rejected": 2.4414937496185303, "step": 2873 }, { "epoch": 0.47, "learning_rate": 9.956188874959686e-07, "logits/chosen": -0.4137645661830902, "logits/rejected": -0.39815911650657654, "logps/chosen": -31.537382125854492, "logps/rejected": -22.901264190673828, "loss": 0.4624, "rewards/accuracies": 0.0, "rewards/chosen": 0.48392391204833984, "rewards/margins": -0.1557537317276001, "rewards/rejected": 0.6396776437759399, "step": 2874 }, { "epoch": 0.47, "learning_rate": 9.95601510441489e-07, "logits/chosen": -0.6178340911865234, "logits/rejected": -0.5418006777763367, "logps/chosen": -93.1656265258789, "logps/rejected": -63.10383605957031, "loss": 0.7305, "rewards/accuracies": 0.0, "rewards/chosen": 0.4106796383857727, "rewards/margins": -0.7937553524971008, "rewards/rejected": 1.2044349908828735, "step": 2875 }, { "epoch": 0.47, "learning_rate": 9.955840991456343e-07, "logits/chosen": -0.5531359910964966, "logits/rejected": -0.5208151340484619, "logps/chosen": -75.26951599121094, "logps/rejected": -63.92810821533203, "loss": 0.665, "rewards/accuracies": 0.0, "rewards/chosen": 1.4068100452423096, "rewards/margins": -0.2486487627029419, "rewards/rejected": 1.6554588079452515, "step": 2876 }, { "epoch": 0.47, "learning_rate": 9.955666536096078e-07, "logits/chosen": -0.7219623923301697, "logits/rejected": -0.7481297850608826, "logps/chosen": -76.34556579589844, "logps/rejected": -47.15574264526367, "loss": 0.8389, "rewards/accuracies": 0.0, "rewards/chosen": 0.9506515860557556, "rewards/margins": -0.8054623007774353, "rewards/rejected": 1.756113886833191, "step": 2877 }, { "epoch": 0.47, "learning_rate": 9.955491738346146e-07, "logits/chosen": -0.5186125636100769, "logits/rejected": -0.49308764934539795, "logps/chosen": -91.65267944335938, "logps/rejected": -63.39877700805664, "loss": 0.2547, "rewards/accuracies": 1.0, "rewards/chosen": 2.3876609802246094, "rewards/margins": 0.529513955116272, "rewards/rejected": 1.8581470251083374, "step": 2878 }, { "epoch": 0.47, "learning_rate": 9.955316598218623e-07, "logits/chosen": -0.1186763197183609, "logits/rejected": -0.15155532956123352, "logps/chosen": -7.546755790710449, "logps/rejected": -60.08936309814453, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 0.31942063570022583, "rewards/margins": 0.05106925964355469, "rewards/rejected": 0.26835137605667114, "step": 2879 }, { "epoch": 0.47, "learning_rate": 9.955141115725611e-07, "logits/chosen": -0.35156890749931335, "logits/rejected": -0.35156890749931335, "logps/chosen": -93.87875366210938, "logps/rejected": -93.87875366210938, "loss": 0.3674, "rewards/accuracies": 0.0, "rewards/chosen": 0.6073349118232727, "rewards/margins": 0.0, "rewards/rejected": 0.6073349118232727, "step": 2880 }, { "epoch": 0.47, "learning_rate": 9.954965290879236e-07, "logits/chosen": -0.693820595741272, "logits/rejected": -0.666877806186676, "logps/chosen": -93.64132690429688, "logps/rejected": -77.5880126953125, "loss": 0.6927, "rewards/accuracies": 0.0, "rewards/chosen": 2.89445424079895, "rewards/margins": -0.10694265365600586, "rewards/rejected": 3.001396894454956, "step": 2881 }, { "epoch": 0.47, "learning_rate": 9.95478912369164e-07, "logits/chosen": -0.43202468752861023, "logits/rejected": -0.3873465657234192, "logps/chosen": -108.0660171508789, "logps/rejected": -50.88875961303711, "loss": 0.5693, "rewards/accuracies": 0.0, "rewards/chosen": 0.2631324827671051, "rewards/margins": -0.6717888116836548, "rewards/rejected": 0.9349212646484375, "step": 2882 }, { "epoch": 0.47, "learning_rate": 9.954612614175002e-07, "logits/chosen": -0.40298178791999817, "logits/rejected": -0.27348509430885315, "logps/chosen": -82.78981018066406, "logps/rejected": -141.997802734375, "loss": 0.5318, "rewards/accuracies": 0.0, "rewards/chosen": 3.1514413356781006, "rewards/margins": -0.46636271476745605, "rewards/rejected": 3.6178040504455566, "step": 2883 }, { "epoch": 0.47, "learning_rate": 9.954435762341512e-07, "logits/chosen": -0.4994693398475647, "logits/rejected": -0.3291146159172058, "logps/chosen": -68.02374267578125, "logps/rejected": -135.352294921875, "loss": 0.1945, "rewards/accuracies": 1.0, "rewards/chosen": 2.5343430042266846, "rewards/margins": 0.7952133417129517, "rewards/rejected": 1.739129662513733, "step": 2884 }, { "epoch": 0.47, "learning_rate": 9.95425856820339e-07, "logits/chosen": -0.49447911977767944, "logits/rejected": -0.5030028820037842, "logps/chosen": -150.01458740234375, "logps/rejected": -139.15042114257812, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": 1.8594070672988892, "rewards/margins": 0.9706466794013977, "rewards/rejected": 0.8887603878974915, "step": 2885 }, { "epoch": 0.47, "learning_rate": 9.954081031772877e-07, "logits/chosen": -0.35416147112846375, "logits/rejected": -0.3131404519081116, "logps/chosen": -156.96713256835938, "logps/rejected": -53.10441207885742, "loss": 0.7775, "rewards/accuracies": 0.0, "rewards/chosen": 0.8601791262626648, "rewards/margins": -0.9126636385917664, "rewards/rejected": 1.7728427648544312, "step": 2886 }, { "epoch": 0.47, "learning_rate": 9.953903153062242e-07, "logits/chosen": -0.19277100265026093, "logits/rejected": -0.19663932919502258, "logps/chosen": -101.58824157714844, "logps/rejected": -75.55928039550781, "loss": 1.1325, "rewards/accuracies": 0.0, "rewards/chosen": 1.7968612909317017, "rewards/margins": -0.10709381103515625, "rewards/rejected": 1.903955101966858, "step": 2887 }, { "epoch": 0.47, "learning_rate": 9.953724932083774e-07, "logits/chosen": -1.0581519603729248, "logits/rejected": -0.890807032585144, "logps/chosen": -177.89529418945312, "logps/rejected": -138.0615692138672, "loss": 1.1779, "rewards/accuracies": 1.0, "rewards/chosen": 3.679089307785034, "rewards/margins": 0.1374983787536621, "rewards/rejected": 3.541590929031372, "step": 2888 }, { "epoch": 0.47, "learning_rate": 9.953546368849786e-07, "logits/chosen": -0.30915355682373047, "logits/rejected": -0.30915355682373047, "logps/chosen": -56.15437698364258, "logps/rejected": -56.15437698364258, "loss": 0.4324, "rewards/accuracies": 0.0, "rewards/chosen": 0.4275745451450348, "rewards/margins": 0.0, "rewards/rejected": 0.4275745451450348, "step": 2889 }, { "epoch": 0.47, "learning_rate": 9.953367463372613e-07, "logits/chosen": -0.5744604468345642, "logits/rejected": -0.46384090185165405, "logps/chosen": -120.41065216064453, "logps/rejected": -164.31126403808594, "loss": 1.6151, "rewards/accuracies": 0.0, "rewards/chosen": 1.7135292291641235, "rewards/margins": -2.6532082557678223, "rewards/rejected": 4.366737365722656, "step": 2890 }, { "epoch": 0.47, "learning_rate": 9.953188215664618e-07, "logits/chosen": -0.12422477453947067, "logits/rejected": -0.12164253741502762, "logps/chosen": -60.46403503417969, "logps/rejected": -48.89958953857422, "loss": 1.1718, "rewards/accuracies": 0.0, "rewards/chosen": 1.8839157819747925, "rewards/margins": -0.08024740219116211, "rewards/rejected": 1.9641631841659546, "step": 2891 }, { "epoch": 0.47, "learning_rate": 9.953008625738184e-07, "logits/chosen": -0.30918973684310913, "logits/rejected": -0.22951474785804749, "logps/chosen": -45.49146270751953, "logps/rejected": -57.4279899597168, "loss": 1.0512, "rewards/accuracies": 1.0, "rewards/chosen": 1.5634788274765015, "rewards/margins": 0.07230865955352783, "rewards/rejected": 1.4911701679229736, "step": 2892 }, { "epoch": 0.47, "learning_rate": 9.952828693605722e-07, "logits/chosen": -0.7969359755516052, "logits/rejected": -0.8204647302627563, "logps/chosen": -113.56256103515625, "logps/rejected": -51.60628890991211, "loss": 0.9956, "rewards/accuracies": 0.0, "rewards/chosen": 1.0424400568008423, "rewards/margins": -0.9791179895401001, "rewards/rejected": 2.0215580463409424, "step": 2893 }, { "epoch": 0.47, "learning_rate": 9.952648419279662e-07, "logits/chosen": -0.477149099111557, "logits/rejected": -0.36303913593292236, "logps/chosen": -197.66427612304688, "logps/rejected": -186.2202606201172, "loss": 0.3739, "rewards/accuracies": 1.0, "rewards/chosen": 3.7041382789611816, "rewards/margins": 0.20052027702331543, "rewards/rejected": 3.503618001937866, "step": 2894 }, { "epoch": 0.47, "learning_rate": 9.952467802772454e-07, "logits/chosen": -0.36461469531059265, "logits/rejected": -0.37728825211524963, "logps/chosen": -5.543136119842529, "logps/rejected": -1.9824862480163574, "loss": 0.5106, "rewards/accuracies": 0.0, "rewards/chosen": 0.053113412111997604, "rewards/margins": -0.22646360099315643, "rewards/rejected": 0.27957701683044434, "step": 2895 }, { "epoch": 0.47, "learning_rate": 9.952286844096587e-07, "logits/chosen": -0.6553357243537903, "logits/rejected": -0.6734371781349182, "logps/chosen": -82.57506561279297, "logps/rejected": -108.072265625, "loss": 0.4336, "rewards/accuracies": 0.0, "rewards/chosen": 2.554347276687622, "rewards/margins": -0.1601548194885254, "rewards/rejected": 2.7145020961761475, "step": 2896 }, { "epoch": 0.47, "learning_rate": 9.952105543264556e-07, "logits/chosen": -0.7378377318382263, "logits/rejected": -0.6569139957427979, "logps/chosen": -151.35617065429688, "logps/rejected": -182.43406677246094, "loss": 2.1841, "rewards/accuracies": 0.0, "rewards/chosen": 2.3795242309570312, "rewards/margins": -4.113713264465332, "rewards/rejected": 6.493237495422363, "step": 2897 }, { "epoch": 0.47, "learning_rate": 9.951923900288888e-07, "logits/chosen": -0.878517746925354, "logits/rejected": -0.5286769866943359, "logps/chosen": -308.1625061035156, "logps/rejected": -150.16912841796875, "loss": 1.9024, "rewards/accuracies": 1.0, "rewards/chosen": 3.845367431640625, "rewards/margins": 0.495758056640625, "rewards/rejected": 3.349609375, "step": 2898 }, { "epoch": 0.47, "learning_rate": 9.951741915182134e-07, "logits/chosen": -0.6709372401237488, "logits/rejected": -0.7421875, "logps/chosen": -74.81297302246094, "logps/rejected": -99.77825927734375, "loss": 0.9224, "rewards/accuracies": 0.0, "rewards/chosen": 0.6328476071357727, "rewards/margins": -1.6054673194885254, "rewards/rejected": 2.2383148670196533, "step": 2899 }, { "epoch": 0.47, "learning_rate": 9.951559587956868e-07, "logits/chosen": -0.5285942554473877, "logits/rejected": -0.3210533559322357, "logps/chosen": -127.20360565185547, "logps/rejected": -37.080875396728516, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 4.495944976806641, "rewards/margins": 4.430492401123047, "rewards/rejected": 0.06545257568359375, "step": 2900 }, { "epoch": 0.47, "learning_rate": 9.951376918625686e-07, "logits/chosen": -0.33567726612091064, "logits/rejected": -0.2800384759902954, "logps/chosen": -60.407020568847656, "logps/rejected": -71.0562744140625, "loss": 0.2955, "rewards/accuracies": 1.0, "rewards/chosen": 1.1294487714767456, "rewards/margins": 0.35851597785949707, "rewards/rejected": 0.7709327936172485, "step": 2901 }, { "epoch": 0.47, "learning_rate": 9.951193907201212e-07, "logits/chosen": -0.3174974322319031, "logits/rejected": -0.3174974322319031, "logps/chosen": -65.2566909790039, "logps/rejected": -65.2566909790039, "loss": 1.7927, "rewards/accuracies": 0.0, "rewards/chosen": 0.5675522089004517, "rewards/margins": 0.0, "rewards/rejected": 0.5675522089004517, "step": 2902 }, { "epoch": 0.47, "learning_rate": 9.951010553696085e-07, "logits/chosen": -0.13025899231433868, "logits/rejected": -0.13025899231433868, "logps/chosen": -6.795228481292725, "logps/rejected": -6.795228481292725, "loss": 0.3487, "rewards/accuracies": 0.0, "rewards/chosen": 0.6650059223175049, "rewards/margins": 0.0, "rewards/rejected": 0.6650059223175049, "step": 2903 }, { "epoch": 0.47, "learning_rate": 9.950826858122976e-07, "logits/chosen": -0.5321914553642273, "logits/rejected": -0.5185302495956421, "logps/chosen": -186.39959716796875, "logps/rejected": -90.53125762939453, "loss": 0.5982, "rewards/accuracies": 1.0, "rewards/chosen": 3.6657745838165283, "rewards/margins": 0.9388213157653809, "rewards/rejected": 2.7269532680511475, "step": 2904 }, { "epoch": 0.47, "learning_rate": 9.950642820494577e-07, "logits/chosen": -0.6056792140007019, "logits/rejected": -0.6056792140007019, "logps/chosen": -61.60835266113281, "logps/rejected": -61.60835266113281, "loss": 0.8328, "rewards/accuracies": 0.0, "rewards/chosen": 1.3127167224884033, "rewards/margins": 0.0, "rewards/rejected": 1.3127167224884033, "step": 2905 }, { "epoch": 0.47, "learning_rate": 9.9504584408236e-07, "logits/chosen": -0.18393947184085846, "logits/rejected": -0.1862122118473053, "logps/chosen": -55.22438430786133, "logps/rejected": -48.75239562988281, "loss": 1.4638, "rewards/accuracies": 0.0, "rewards/chosen": 0.7900684475898743, "rewards/margins": -1.0466639995574951, "rewards/rejected": 1.8367325067520142, "step": 2906 }, { "epoch": 0.47, "learning_rate": 9.950273719122791e-07, "logits/chosen": -0.32038575410842896, "logits/rejected": -0.2272544801235199, "logps/chosen": -83.3658218383789, "logps/rejected": -62.385128021240234, "loss": 1.0526, "rewards/accuracies": 0.0, "rewards/chosen": 0.8672820925712585, "rewards/margins": -0.7429600358009338, "rewards/rejected": 1.6102421283721924, "step": 2907 }, { "epoch": 0.47, "learning_rate": 9.950088655404905e-07, "logits/chosen": -0.46880167722702026, "logits/rejected": -0.4891137480735779, "logps/chosen": -75.72967529296875, "logps/rejected": -60.095802307128906, "loss": 1.0317, "rewards/accuracies": 0.0, "rewards/chosen": 1.9391769170761108, "rewards/margins": -0.40617454051971436, "rewards/rejected": 2.345351457595825, "step": 2908 }, { "epoch": 0.47, "learning_rate": 9.949903249682733e-07, "logits/chosen": -0.4285677373409271, "logits/rejected": -0.5127227306365967, "logps/chosen": -105.96113586425781, "logps/rejected": -142.1949920654297, "loss": 1.4034, "rewards/accuracies": 0.0, "rewards/chosen": 0.9495742917060852, "rewards/margins": -2.6625123023986816, "rewards/rejected": 3.612086534500122, "step": 2909 }, { "epoch": 0.47, "learning_rate": 9.949717501969079e-07, "logits/chosen": -0.6334408521652222, "logits/rejected": -0.6412826180458069, "logps/chosen": -110.60415649414062, "logps/rejected": -160.52293395996094, "loss": 2.268, "rewards/accuracies": 0.0, "rewards/chosen": 2.4298508167266846, "rewards/margins": -4.049646377563477, "rewards/rejected": 6.479496955871582, "step": 2910 }, { "epoch": 0.47, "learning_rate": 9.949531412276784e-07, "logits/chosen": -0.38791635632514954, "logits/rejected": -0.3827723264694214, "logps/chosen": -0.44393137097358704, "logps/rejected": -38.67815017700195, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 0.11515489965677261, "rewards/margins": 0.18831202387809753, "rewards/rejected": -0.07315712422132492, "step": 2911 }, { "epoch": 0.47, "learning_rate": 9.949344980618698e-07, "logits/chosen": -0.2597724199295044, "logits/rejected": -0.3216930031776428, "logps/chosen": -85.18959045410156, "logps/rejected": -94.09809875488281, "loss": 0.6453, "rewards/accuracies": 1.0, "rewards/chosen": 0.4717422425746918, "rewards/margins": 0.2789062261581421, "rewards/rejected": 0.1928360015153885, "step": 2912 }, { "epoch": 0.47, "learning_rate": 9.949158207007709e-07, "logits/chosen": -0.6609838008880615, "logits/rejected": -0.6489438414573669, "logps/chosen": -65.43480682373047, "logps/rejected": -72.2928466796875, "loss": 0.5406, "rewards/accuracies": 1.0, "rewards/chosen": 1.5626229047775269, "rewards/margins": 0.8731346726417542, "rewards/rejected": 0.6894882321357727, "step": 2913 }, { "epoch": 0.47, "learning_rate": 9.948971091456714e-07, "logits/chosen": -0.4968526065349579, "logits/rejected": -0.5296271443367004, "logps/chosen": -79.97239685058594, "logps/rejected": -74.61839294433594, "loss": 1.3858, "rewards/accuracies": 0.0, "rewards/chosen": 1.1309250593185425, "rewards/margins": -2.4068603515625, "rewards/rejected": 3.537785291671753, "step": 2914 }, { "epoch": 0.47, "learning_rate": 9.948783633978647e-07, "logits/chosen": -1.0508227348327637, "logits/rejected": -1.0955256223678589, "logps/chosen": -91.50688934326172, "logps/rejected": -84.21320343017578, "loss": 1.6226, "rewards/accuracies": 0.0, "rewards/chosen": 1.6107696294784546, "rewards/margins": -0.06529927253723145, "rewards/rejected": 1.676068902015686, "step": 2915 }, { "epoch": 0.47, "learning_rate": 9.948595834586455e-07, "logits/chosen": -0.38405081629753113, "logits/rejected": -0.38474586606025696, "logps/chosen": -2.8984971046447754, "logps/rejected": -1.9704594612121582, "loss": 0.3832, "rewards/accuracies": 0.0, "rewards/chosen": 0.30270177125930786, "rewards/margins": -0.12739014625549316, "rewards/rejected": 0.430091917514801, "step": 2916 }, { "epoch": 0.47, "learning_rate": 9.948407693293116e-07, "logits/chosen": -0.597317636013031, "logits/rejected": -0.6369041204452515, "logps/chosen": -237.99404907226562, "logps/rejected": -160.36373901367188, "loss": 0.8399, "rewards/accuracies": 0.0, "rewards/chosen": 2.6924774646759033, "rewards/margins": -1.437307596206665, "rewards/rejected": 4.129785060882568, "step": 2917 }, { "epoch": 0.47, "learning_rate": 9.948219210111626e-07, "logits/chosen": -0.8281136155128479, "logits/rejected": -0.8402858972549438, "logps/chosen": -157.672607421875, "logps/rejected": -43.79720687866211, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": 3.0468368530273438, "rewards/margins": 2.83050274848938, "rewards/rejected": 0.21633414924144745, "step": 2918 }, { "epoch": 0.47, "learning_rate": 9.94803038505501e-07, "logits/chosen": -0.6889570355415344, "logits/rejected": -0.6291460394859314, "logps/chosen": -146.55258178710938, "logps/rejected": -102.03111267089844, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 4.537370204925537, "rewards/margins": 1.7719647884368896, "rewards/rejected": 2.7654054164886475, "step": 2919 }, { "epoch": 0.47, "learning_rate": 9.947841218136314e-07, "logits/chosen": -0.7094278335571289, "logits/rejected": -0.7513046264648438, "logps/chosen": -459.38916015625, "logps/rejected": -163.12168884277344, "loss": 2.5064, "rewards/accuracies": 0.0, "rewards/chosen": 1.859649658203125, "rewards/margins": -2.854306221008301, "rewards/rejected": 4.713955879211426, "step": 2920 }, { "epoch": 0.47, "learning_rate": 9.947651709368604e-07, "logits/chosen": -0.8601273894309998, "logits/rejected": -0.8492349982261658, "logps/chosen": -59.48920440673828, "logps/rejected": -19.583396911621094, "loss": 0.8339, "rewards/accuracies": 1.0, "rewards/chosen": 2.017582654953003, "rewards/margins": 1.518470287322998, "rewards/rejected": 0.4991123378276825, "step": 2921 }, { "epoch": 0.47, "learning_rate": 9.947461858764977e-07, "logits/chosen": -0.7874711751937866, "logits/rejected": -0.8506717681884766, "logps/chosen": -75.6734619140625, "logps/rejected": -118.54798889160156, "loss": 1.2895, "rewards/accuracies": 0.0, "rewards/chosen": 3.018545627593994, "rewards/margins": -2.4334778785705566, "rewards/rejected": 5.452023506164551, "step": 2922 }, { "epoch": 0.47, "learning_rate": 9.94727166633855e-07, "logits/chosen": -0.7778906226158142, "logits/rejected": -0.7473767399787903, "logps/chosen": -138.35000610351562, "logps/rejected": -133.16090393066406, "loss": 1.4594, "rewards/accuracies": 0.0, "rewards/chosen": 0.462777704000473, "rewards/margins": -2.1203463077545166, "rewards/rejected": 2.5831239223480225, "step": 2923 }, { "epoch": 0.47, "learning_rate": 9.947081132102462e-07, "logits/chosen": -0.5254727602005005, "logits/rejected": -0.6153224110603333, "logps/chosen": -27.980745315551758, "logps/rejected": -61.832008361816406, "loss": 0.7529, "rewards/accuracies": 0.0, "rewards/chosen": 1.025944709777832, "rewards/margins": -1.2404837608337402, "rewards/rejected": 2.2664284706115723, "step": 2924 }, { "epoch": 0.47, "learning_rate": 9.946890256069877e-07, "logits/chosen": -0.770031750202179, "logits/rejected": -0.6862360239028931, "logps/chosen": -288.21600341796875, "logps/rejected": -76.92877197265625, "loss": 0.424, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589996695518494, "rewards/margins": 0.6902039051055908, "rewards/rejected": 0.16879577934741974, "step": 2925 }, { "epoch": 0.47, "learning_rate": 9.946699038253984e-07, "logits/chosen": -0.8539654016494751, "logits/rejected": -0.8520697355270386, "logps/chosen": -148.44407653808594, "logps/rejected": -118.83427429199219, "loss": 0.7528, "rewards/accuracies": 0.0, "rewards/chosen": 3.9703521728515625, "rewards/margins": -0.8088622093200684, "rewards/rejected": 4.779214382171631, "step": 2926 }, { "epoch": 0.48, "learning_rate": 9.946507478667995e-07, "logits/chosen": -0.591375470161438, "logits/rejected": -0.510473370552063, "logps/chosen": -99.91614532470703, "logps/rejected": -43.08284378051758, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 1.7211250066757202, "rewards/margins": 1.1365292072296143, "rewards/rejected": 0.5845958590507507, "step": 2927 }, { "epoch": 0.48, "learning_rate": 9.946315577325139e-07, "logits/chosen": -0.5134342908859253, "logits/rejected": -0.5229383111000061, "logps/chosen": -8.508853912353516, "logps/rejected": -28.21076202392578, "loss": 0.7064, "rewards/accuracies": 0.0, "rewards/chosen": 0.5111342668533325, "rewards/margins": -0.013606727123260498, "rewards/rejected": 0.524740993976593, "step": 2928 }, { "epoch": 0.48, "learning_rate": 9.946123334238683e-07, "logits/chosen": -0.6401327848434448, "logits/rejected": -0.5489002466201782, "logps/chosen": -111.34599304199219, "logps/rejected": -17.19615364074707, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.7733826041221619, "rewards/margins": 0.5490677356719971, "rewards/rejected": 0.224314883351326, "step": 2929 }, { "epoch": 0.48, "learning_rate": 9.945930749421902e-07, "logits/chosen": -0.12034045904874802, "logits/rejected": -0.14597615599632263, "logps/chosen": -131.89007568359375, "logps/rejected": -70.61479187011719, "loss": 0.5417, "rewards/accuracies": 0.0, "rewards/chosen": 1.493719458580017, "rewards/margins": -0.3124115467071533, "rewards/rejected": 1.8061310052871704, "step": 2930 }, { "epoch": 0.48, "learning_rate": 9.945737822888109e-07, "logits/chosen": -0.6835803985595703, "logits/rejected": -0.6890215277671814, "logps/chosen": -122.0331039428711, "logps/rejected": -103.05118560791016, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 3.113144636154175, "rewards/margins": 0.6362838745117188, "rewards/rejected": 2.476860761642456, "step": 2931 }, { "epoch": 0.48, "learning_rate": 9.945544554650626e-07, "logits/chosen": -0.8035863041877747, "logits/rejected": -0.7890841960906982, "logps/chosen": -153.349853515625, "logps/rejected": -125.22763061523438, "loss": 0.4034, "rewards/accuracies": 0.0, "rewards/chosen": 3.6202332973480225, "rewards/margins": -0.05322408676147461, "rewards/rejected": 3.673457384109497, "step": 2932 }, { "epoch": 0.48, "learning_rate": 9.945350944722812e-07, "logits/chosen": -0.4173395037651062, "logits/rejected": -0.41198813915252686, "logps/chosen": -3.418551445007324, "logps/rejected": -17.55101776123047, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 0.5922877192497253, "rewards/margins": 0.24869754910469055, "rewards/rejected": 0.3435901701450348, "step": 2933 }, { "epoch": 0.48, "learning_rate": 9.945156993118041e-07, "logits/chosen": -0.5303557515144348, "logits/rejected": -0.6281641125679016, "logps/chosen": -64.65135955810547, "logps/rejected": -108.53231811523438, "loss": 2.0867, "rewards/accuracies": 0.0, "rewards/chosen": 0.9417320489883423, "rewards/margins": -3.494431495666504, "rewards/rejected": 4.436163425445557, "step": 2934 }, { "epoch": 0.48, "learning_rate": 9.94496269984971e-07, "logits/chosen": -0.9318986535072327, "logits/rejected": -0.8990588188171387, "logps/chosen": -70.88853454589844, "logps/rejected": -54.222389221191406, "loss": 0.6606, "rewards/accuracies": 0.0, "rewards/chosen": 0.84224933385849, "rewards/margins": -0.21455997228622437, "rewards/rejected": 1.0568093061447144, "step": 2935 }, { "epoch": 0.48, "learning_rate": 9.94476806493125e-07, "logits/chosen": -0.669077455997467, "logits/rejected": -0.5999363660812378, "logps/chosen": -51.52488708496094, "logps/rejected": -78.85173034667969, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 3.509085178375244, "rewards/margins": 1.4664475917816162, "rewards/rejected": 2.042637586593628, "step": 2936 }, { "epoch": 0.48, "learning_rate": 9.944573088376102e-07, "logits/chosen": -0.8900279998779297, "logits/rejected": -0.9851970076560974, "logps/chosen": -278.097900390625, "logps/rejected": -97.14053344726562, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 3.3594300746917725, "rewards/margins": 0.2852203845977783, "rewards/rejected": 3.074209690093994, "step": 2937 }, { "epoch": 0.48, "learning_rate": 9.94437777019774e-07, "logits/chosen": -0.820679247379303, "logits/rejected": -0.7371850609779358, "logps/chosen": -102.03872680664062, "logps/rejected": -78.29581451416016, "loss": 0.9025, "rewards/accuracies": 0.0, "rewards/chosen": 1.3588745594024658, "rewards/margins": -1.4594199657440186, "rewards/rejected": 2.8182945251464844, "step": 2938 }, { "epoch": 0.48, "learning_rate": 9.94418211040966e-07, "logits/chosen": -0.9631774425506592, "logits/rejected": -0.5889485478401184, "logps/chosen": -218.5303497314453, "logps/rejected": -19.992122650146484, "loss": 1.2346, "rewards/accuracies": 1.0, "rewards/chosen": 3.8160934448242188, "rewards/margins": 3.43188214302063, "rewards/rejected": 0.38421136140823364, "step": 2939 }, { "epoch": 0.48, "learning_rate": 9.943986109025377e-07, "logits/chosen": -0.4195179343223572, "logits/rejected": -0.3094036877155304, "logps/chosen": -114.18240356445312, "logps/rejected": -52.8026123046875, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": 3.2525970935821533, "rewards/margins": 1.7454102039337158, "rewards/rejected": 1.5071868896484375, "step": 2940 }, { "epoch": 0.48, "learning_rate": 9.943789766058434e-07, "logits/chosen": -0.4570200741291046, "logits/rejected": -0.4277969300746918, "logps/chosen": -87.9581069946289, "logps/rejected": -98.00955963134766, "loss": 0.4149, "rewards/accuracies": 1.0, "rewards/chosen": 1.8693153858184814, "rewards/margins": 0.42838823795318604, "rewards/rejected": 1.4409271478652954, "step": 2941 }, { "epoch": 0.48, "learning_rate": 9.943593081522397e-07, "logits/chosen": -1.080075979232788, "logits/rejected": -1.0533936023712158, "logps/chosen": -139.47738647460938, "logps/rejected": -122.70654296875, "loss": 1.0992, "rewards/accuracies": 0.0, "rewards/chosen": 3.419207811355591, "rewards/margins": -1.9785430431365967, "rewards/rejected": 5.3977508544921875, "step": 2942 }, { "epoch": 0.48, "learning_rate": 9.943396055430855e-07, "logits/chosen": -0.5103302001953125, "logits/rejected": -0.5947111248970032, "logps/chosen": -129.81881713867188, "logps/rejected": -76.838134765625, "loss": 1.2773, "rewards/accuracies": 1.0, "rewards/chosen": 1.9325439929962158, "rewards/margins": 0.26244819164276123, "rewards/rejected": 1.6700958013534546, "step": 2943 }, { "epoch": 0.48, "learning_rate": 9.943198687797421e-07, "logits/chosen": -0.6867994070053101, "logits/rejected": -0.6619365811347961, "logps/chosen": -98.68560791015625, "logps/rejected": -34.347129821777344, "loss": 1.2004, "rewards/accuracies": 1.0, "rewards/chosen": 0.37098464369773865, "rewards/margins": 0.4390644133090973, "rewards/rejected": -0.06807976216077805, "step": 2944 }, { "epoch": 0.48, "learning_rate": 9.94300097863573e-07, "logits/chosen": -0.5899682641029358, "logits/rejected": -0.5804574489593506, "logps/chosen": -105.38546752929688, "logps/rejected": -122.20777130126953, "loss": 1.3999, "rewards/accuracies": 0.0, "rewards/chosen": 1.4857971668243408, "rewards/margins": -2.389885663986206, "rewards/rejected": 3.875682830810547, "step": 2945 }, { "epoch": 0.48, "learning_rate": 9.942802927959442e-07, "logits/chosen": -0.40867188572883606, "logits/rejected": -0.3147394061088562, "logps/chosen": -201.08889770507812, "logps/rejected": -111.11309814453125, "loss": 0.0983, "rewards/accuracies": 1.0, "rewards/chosen": 4.515683174133301, "rewards/margins": 1.737335443496704, "rewards/rejected": 2.7783477306365967, "step": 2946 }, { "epoch": 0.48, "learning_rate": 9.942604535782242e-07, "logits/chosen": -0.6012988090515137, "logits/rejected": -0.5342717170715332, "logps/chosen": -78.20098876953125, "logps/rejected": -64.2550048828125, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 0.5398941040039062, "rewards/margins": -1.127227783203125, "rewards/rejected": 1.6671218872070312, "step": 2947 }, { "epoch": 0.48, "learning_rate": 9.942405802117834e-07, "logits/chosen": -0.7935632467269897, "logits/rejected": -0.8929818272590637, "logps/chosen": -162.40475463867188, "logps/rejected": -148.3514404296875, "loss": 2.0193, "rewards/accuracies": 0.0, "rewards/chosen": 2.9795989990234375, "rewards/margins": -3.938699245452881, "rewards/rejected": 6.918298244476318, "step": 2948 }, { "epoch": 0.48, "learning_rate": 9.942206726979954e-07, "logits/chosen": -0.5154487490653992, "logits/rejected": -0.5470767021179199, "logps/chosen": -67.2938232421875, "logps/rejected": -88.07876586914062, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 2.701671600341797, "rewards/margins": 0.17745351791381836, "rewards/rejected": 2.5242180824279785, "step": 2949 }, { "epoch": 0.48, "learning_rate": 9.94200731038235e-07, "logits/chosen": -0.47360092401504517, "logits/rejected": -0.4498388171195984, "logps/chosen": -106.15619659423828, "logps/rejected": -116.47711181640625, "loss": 0.9887, "rewards/accuracies": 0.0, "rewards/chosen": 0.7028846740722656, "rewards/margins": -0.798352837562561, "rewards/rejected": 1.5012375116348267, "step": 2950 }, { "epoch": 0.48, "learning_rate": 9.941807552338803e-07, "logits/chosen": -0.7269917726516724, "logits/rejected": -0.7301092743873596, "logps/chosen": -61.787567138671875, "logps/rejected": -53.51201629638672, "loss": 1.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.7357582449913025, "rewards/margins": 0.008840978145599365, "rewards/rejected": 0.7269172668457031, "step": 2951 }, { "epoch": 0.48, "learning_rate": 9.941607452863115e-07, "logits/chosen": -0.6391805410385132, "logits/rejected": -0.6000693440437317, "logps/chosen": -44.43193817138672, "logps/rejected": -8.087835311889648, "loss": 0.2744, "rewards/accuracies": 1.0, "rewards/chosen": 2.119921922683716, "rewards/margins": 1.3508195877075195, "rewards/rejected": 0.7691022753715515, "step": 2952 }, { "epoch": 0.48, "learning_rate": 9.94140701196911e-07, "logits/chosen": -0.6459001302719116, "logits/rejected": -0.6118149161338806, "logps/chosen": -171.9904022216797, "logps/rejected": -93.46685791015625, "loss": 0.8096, "rewards/accuracies": 0.0, "rewards/chosen": 3.309138536453247, "rewards/margins": -0.2357635498046875, "rewards/rejected": 3.5449020862579346, "step": 2953 }, { "epoch": 0.48, "learning_rate": 9.941206229670634e-07, "logits/chosen": -0.5155675411224365, "logits/rejected": -0.3597438633441925, "logps/chosen": -46.36488342285156, "logps/rejected": -47.89376449584961, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": 1.950995683670044, "rewards/margins": 1.0860202312469482, "rewards/rejected": 0.8649753928184509, "step": 2954 }, { "epoch": 0.48, "learning_rate": 9.941005105981563e-07, "logits/chosen": -0.5453778505325317, "logits/rejected": -0.5568332076072693, "logps/chosen": -167.050048828125, "logps/rejected": -63.80773162841797, "loss": 0.8301, "rewards/accuracies": 0.0, "rewards/chosen": 2.064230442047119, "rewards/margins": -0.3793609142303467, "rewards/rejected": 2.443591356277466, "step": 2955 }, { "epoch": 0.48, "learning_rate": 9.940803640915792e-07, "logits/chosen": -0.8158294558525085, "logits/rejected": -0.8158294558525085, "logps/chosen": -101.73348999023438, "logps/rejected": -101.73348999023438, "loss": 0.5016, "rewards/accuracies": 0.0, "rewards/chosen": 1.8515465259552002, "rewards/margins": 0.0, "rewards/rejected": 1.8515465259552002, "step": 2956 }, { "epoch": 0.48, "learning_rate": 9.94060183448724e-07, "logits/chosen": -0.49578341841697693, "logits/rejected": -0.6277617812156677, "logps/chosen": -89.85862731933594, "logps/rejected": -137.90789794921875, "loss": 1.6966, "rewards/accuracies": 0.0, "rewards/chosen": 2.2924294471740723, "rewards/margins": -3.287747859954834, "rewards/rejected": 5.580177307128906, "step": 2957 }, { "epoch": 0.48, "learning_rate": 9.940399686709848e-07, "logits/chosen": -0.6654040813446045, "logits/rejected": -0.6372874975204468, "logps/chosen": -173.4296875, "logps/rejected": -69.04852294921875, "loss": 1.6411, "rewards/accuracies": 0.0, "rewards/chosen": -0.42928314208984375, "rewards/margins": -2.606335401535034, "rewards/rejected": 2.1770522594451904, "step": 2958 }, { "epoch": 0.48, "learning_rate": 9.940197197597587e-07, "logits/chosen": -0.44047921895980835, "logits/rejected": -0.29582855105400085, "logps/chosen": -125.50257110595703, "logps/rejected": -43.89400100708008, "loss": 0.2805, "rewards/accuracies": 1.0, "rewards/chosen": 2.878225088119507, "rewards/margins": 1.6534733772277832, "rewards/rejected": 1.2247517108917236, "step": 2959 }, { "epoch": 0.48, "learning_rate": 9.939994367164442e-07, "logits/chosen": -0.49695131182670593, "logits/rejected": -0.20523011684417725, "logps/chosen": -189.88742065429688, "logps/rejected": -39.0744514465332, "loss": 0.5123, "rewards/accuracies": 1.0, "rewards/chosen": 3.6638290882110596, "rewards/margins": 2.8859119415283203, "rewards/rejected": 0.7779170870780945, "step": 2960 }, { "epoch": 0.48, "learning_rate": 9.93979119542443e-07, "logits/chosen": -0.6661288142204285, "logits/rejected": -0.6145179867744446, "logps/chosen": -75.88639068603516, "logps/rejected": -139.599609375, "loss": 0.2014, "rewards/accuracies": 1.0, "rewards/chosen": 0.906964898109436, "rewards/margins": 1.2577217817306519, "rewards/rejected": -0.35075685381889343, "step": 2961 }, { "epoch": 0.48, "learning_rate": 9.939587682391586e-07, "logits/chosen": -0.7648383975028992, "logits/rejected": -0.732273280620575, "logps/chosen": -90.85765075683594, "logps/rejected": -54.265201568603516, "loss": 0.4676, "rewards/accuracies": 0.0, "rewards/chosen": 0.5498863458633423, "rewards/margins": -0.3100101351737976, "rewards/rejected": 0.8598964810371399, "step": 2962 }, { "epoch": 0.48, "learning_rate": 9.939383828079972e-07, "logits/chosen": -0.8924486637115479, "logits/rejected": -0.8074846863746643, "logps/chosen": -125.45626831054688, "logps/rejected": -125.269287109375, "loss": 1.5295, "rewards/accuracies": 0.0, "rewards/chosen": 1.084588646888733, "rewards/margins": -2.625906467437744, "rewards/rejected": 3.7104949951171875, "step": 2963 }, { "epoch": 0.48, "learning_rate": 9.939179632503672e-07, "logits/chosen": -0.48499447107315063, "logits/rejected": -0.4783490300178528, "logps/chosen": -1.1586744785308838, "logps/rejected": -9.115331649780273, "loss": 1.5852, "rewards/accuracies": 1.0, "rewards/chosen": 0.332903653383255, "rewards/margins": 0.31232327222824097, "rewards/rejected": 0.020580386742949486, "step": 2964 }, { "epoch": 0.48, "learning_rate": 9.938975095676797e-07, "logits/chosen": -1.1914767026901245, "logits/rejected": -0.5112650394439697, "logps/chosen": -45.055572509765625, "logps/rejected": -108.16232299804688, "loss": 0.7494, "rewards/accuracies": 0.0, "rewards/chosen": 1.7026710510253906, "rewards/margins": -0.8198814392089844, "rewards/rejected": 2.522552490234375, "step": 2965 }, { "epoch": 0.48, "learning_rate": 9.938770217613472e-07, "logits/chosen": -0.46655523777008057, "logits/rejected": -0.3138435482978821, "logps/chosen": -107.07299041748047, "logps/rejected": -53.91946029663086, "loss": 0.943, "rewards/accuracies": 1.0, "rewards/chosen": 3.3198554515838623, "rewards/margins": 1.8052805662155151, "rewards/rejected": 1.5145748853683472, "step": 2966 }, { "epoch": 0.48, "learning_rate": 9.938564998327858e-07, "logits/chosen": -0.6024346351623535, "logits/rejected": -0.5996483564376831, "logps/chosen": -80.57470703125, "logps/rejected": -42.332942962646484, "loss": 1.1044, "rewards/accuracies": 1.0, "rewards/chosen": 1.6490280628204346, "rewards/margins": 0.02729499340057373, "rewards/rejected": 1.6217330694198608, "step": 2967 }, { "epoch": 0.48, "learning_rate": 9.938359437834133e-07, "logits/chosen": -0.6520940661430359, "logits/rejected": -0.6588115096092224, "logps/chosen": -87.02310943603516, "logps/rejected": -61.97080993652344, "loss": 1.2242, "rewards/accuracies": 0.0, "rewards/chosen": -0.09992599487304688, "rewards/margins": -1.6730674505233765, "rewards/rejected": 1.5731414556503296, "step": 2968 }, { "epoch": 0.48, "learning_rate": 9.938153536146497e-07, "logits/chosen": -0.7067999243736267, "logits/rejected": -0.7241968512535095, "logps/chosen": -59.422386169433594, "logps/rejected": -62.14284896850586, "loss": 0.5667, "rewards/accuracies": 0.0, "rewards/chosen": 1.4179633855819702, "rewards/margins": -0.6991389989852905, "rewards/rejected": 2.1171023845672607, "step": 2969 }, { "epoch": 0.48, "learning_rate": 9.937947293279175e-07, "logits/chosen": -0.6117912530899048, "logits/rejected": -0.5853352546691895, "logps/chosen": -75.07435607910156, "logps/rejected": -7.8524980545043945, "loss": 0.211, "rewards/accuracies": 1.0, "rewards/chosen": 1.6778777837753296, "rewards/margins": 1.2170933485031128, "rewards/rejected": 0.4607844352722168, "step": 2970 }, { "epoch": 0.48, "learning_rate": 9.937740709246422e-07, "logits/chosen": -0.7737697958946228, "logits/rejected": -0.7335131764411926, "logps/chosen": -109.19351959228516, "logps/rejected": -61.82661437988281, "loss": 2.6197, "rewards/accuracies": 0.0, "rewards/chosen": 0.6476104855537415, "rewards/margins": -1.3414978981018066, "rewards/rejected": 1.9891083240509033, "step": 2971 }, { "epoch": 0.48, "learning_rate": 9.937533784062505e-07, "logits/chosen": -0.41020819544792175, "logits/rejected": -0.3947089910507202, "logps/chosen": -37.32844543457031, "logps/rejected": -73.88233947753906, "loss": 0.9017, "rewards/accuracies": 0.0, "rewards/chosen": 0.5005699396133423, "rewards/margins": -0.49347609281539917, "rewards/rejected": 0.9940460324287415, "step": 2972 }, { "epoch": 0.48, "learning_rate": 9.937326517741723e-07, "logits/chosen": -0.37949633598327637, "logits/rejected": -0.37949633598327637, "logps/chosen": -47.933082580566406, "logps/rejected": -47.933082580566406, "loss": 0.7179, "rewards/accuracies": 0.0, "rewards/chosen": 0.8421783447265625, "rewards/margins": 0.0, "rewards/rejected": 0.8421783447265625, "step": 2973 }, { "epoch": 0.48, "learning_rate": 9.937118910298396e-07, "logits/chosen": -0.2943184971809387, "logits/rejected": -0.21330726146697998, "logps/chosen": -59.703556060791016, "logps/rejected": -67.53083038330078, "loss": 0.953, "rewards/accuracies": 1.0, "rewards/chosen": 1.7704006433486938, "rewards/margins": 0.5138728618621826, "rewards/rejected": 1.2565277814865112, "step": 2974 }, { "epoch": 0.48, "learning_rate": 9.93691096174687e-07, "logits/chosen": -0.4503413438796997, "logits/rejected": -0.4530230760574341, "logps/chosen": -90.82402038574219, "logps/rejected": -56.83341979980469, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 1.4792518615722656, "rewards/margins": 0.15192186832427979, "rewards/rejected": 1.3273299932479858, "step": 2975 }, { "epoch": 0.48, "learning_rate": 9.936702672101507e-07, "logits/chosen": -0.4955371022224426, "logits/rejected": -0.6352756023406982, "logps/chosen": -171.07080078125, "logps/rejected": -154.3634490966797, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 4.401564121246338, "rewards/margins": 0.5001678466796875, "rewards/rejected": 3.9013962745666504, "step": 2976 }, { "epoch": 0.48, "learning_rate": 9.936494041376701e-07, "logits/chosen": -0.4483017325401306, "logits/rejected": -0.4728632867336273, "logps/chosen": -64.29660034179688, "logps/rejected": -98.23954772949219, "loss": 0.5518, "rewards/accuracies": 0.0, "rewards/chosen": 1.0623871088027954, "rewards/margins": -0.47056424617767334, "rewards/rejected": 1.5329513549804688, "step": 2977 }, { "epoch": 0.48, "learning_rate": 9.936285069586869e-07, "logits/chosen": -0.7632102370262146, "logits/rejected": -0.7365078926086426, "logps/chosen": -72.89756774902344, "logps/rejected": -72.87269592285156, "loss": 0.8798, "rewards/accuracies": 0.0, "rewards/chosen": 0.7556824088096619, "rewards/margins": -0.850048840045929, "rewards/rejected": 1.6057312488555908, "step": 2978 }, { "epoch": 0.48, "learning_rate": 9.936075756746443e-07, "logits/chosen": -0.6275107860565186, "logits/rejected": -0.6208173632621765, "logps/chosen": -34.569480895996094, "logps/rejected": -58.70667266845703, "loss": 1.6418, "rewards/accuracies": 1.0, "rewards/chosen": 2.5512497425079346, "rewards/margins": 0.3982582092285156, "rewards/rejected": 2.152991533279419, "step": 2979 }, { "epoch": 0.48, "learning_rate": 9.93586610286989e-07, "logits/chosen": -0.5453325510025024, "logits/rejected": -0.5371764302253723, "logps/chosen": -2.0206220149993896, "logps/rejected": -8.958627700805664, "loss": 1.804, "rewards/accuracies": 1.0, "rewards/chosen": 0.2283119410276413, "rewards/margins": 0.115155428647995, "rewards/rejected": 0.1131565123796463, "step": 2980 }, { "epoch": 0.48, "learning_rate": 9.935656107971693e-07, "logits/chosen": -0.3876601457595825, "logits/rejected": -0.3795154094696045, "logps/chosen": -50.81470489501953, "logps/rejected": -19.32697868347168, "loss": 0.5829, "rewards/accuracies": 1.0, "rewards/chosen": 1.6182937622070312, "rewards/margins": 1.0113019943237305, "rewards/rejected": 0.6069917678833008, "step": 2981 }, { "epoch": 0.48, "learning_rate": 9.93544577206636e-07, "logits/chosen": -0.8574343919754028, "logits/rejected": -0.8574343919754028, "logps/chosen": -158.79986572265625, "logps/rejected": -158.79986572265625, "loss": 1.0477, "rewards/accuracies": 0.0, "rewards/chosen": 1.8468536138534546, "rewards/margins": 0.0, "rewards/rejected": 1.8468536138534546, "step": 2982 }, { "epoch": 0.48, "learning_rate": 9.935235095168423e-07, "logits/chosen": -0.6131929755210876, "logits/rejected": -1.0329173803329468, "logps/chosen": -99.68981170654297, "logps/rejected": -36.440704345703125, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 1.4947632551193237, "rewards/margins": 1.2483704090118408, "rewards/rejected": 0.24639283120632172, "step": 2983 }, { "epoch": 0.48, "learning_rate": 9.93502407729244e-07, "logits/chosen": -0.5164697170257568, "logits/rejected": -0.40535497665405273, "logps/chosen": -61.316123962402344, "logps/rejected": -25.59197235107422, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 1.417262315750122, "rewards/margins": 1.2306666374206543, "rewards/rejected": 0.18659572303295135, "step": 2984 }, { "epoch": 0.48, "learning_rate": 9.934812718452987e-07, "logits/chosen": -0.5030472278594971, "logits/rejected": -0.4704843759536743, "logps/chosen": -58.05772399902344, "logps/rejected": -35.006614685058594, "loss": 0.469, "rewards/accuracies": 0.0, "rewards/chosen": 1.6853684186935425, "rewards/margins": -0.23961782455444336, "rewards/rejected": 1.9249862432479858, "step": 2985 }, { "epoch": 0.48, "learning_rate": 9.93460101866467e-07, "logits/chosen": -0.1816861480474472, "logits/rejected": -0.1445506513118744, "logps/chosen": -49.54982376098633, "logps/rejected": -58.9053955078125, "loss": 0.9122, "rewards/accuracies": 0.0, "rewards/chosen": 1.6118968725204468, "rewards/margins": -0.21927154064178467, "rewards/rejected": 1.8311684131622314, "step": 2986 }, { "epoch": 0.48, "learning_rate": 9.934388977942114e-07, "logits/chosen": -0.6394618153572083, "logits/rejected": -0.6491986513137817, "logps/chosen": -49.465702056884766, "logps/rejected": -51.42433166503906, "loss": 0.6297, "rewards/accuracies": 1.0, "rewards/chosen": 1.6005833148956299, "rewards/margins": 0.10120809078216553, "rewards/rejected": 1.4993752241134644, "step": 2987 }, { "epoch": 0.48, "learning_rate": 9.93417659629997e-07, "logits/chosen": -0.507786750793457, "logits/rejected": -0.5275881290435791, "logps/chosen": -57.357383728027344, "logps/rejected": -206.88229370117188, "loss": 1.557, "rewards/accuracies": 0.0, "rewards/chosen": 1.8360542058944702, "rewards/margins": -2.3202295303344727, "rewards/rejected": 4.156283855438232, "step": 2988 }, { "epoch": 0.49, "learning_rate": 9.933963873752909e-07, "logits/chosen": -0.6093360781669617, "logits/rejected": -0.6337075233459473, "logps/chosen": -62.004478454589844, "logps/rejected": -57.670467376708984, "loss": 0.6481, "rewards/accuracies": 0.0, "rewards/chosen": 1.2577972412109375, "rewards/margins": -0.4894230365753174, "rewards/rejected": 1.7472202777862549, "step": 2989 }, { "epoch": 0.49, "learning_rate": 9.93375081031563e-07, "logits/chosen": -0.359580934047699, "logits/rejected": -0.2320956587791443, "logps/chosen": -102.71064758300781, "logps/rejected": -156.4622344970703, "loss": 1.0268, "rewards/accuracies": 0.0, "rewards/chosen": 2.428082227706909, "rewards/margins": -1.4125046730041504, "rewards/rejected": 3.8405869007110596, "step": 2990 }, { "epoch": 0.49, "learning_rate": 9.933537406002857e-07, "logits/chosen": -0.502912700176239, "logits/rejected": -0.43104302883148193, "logps/chosen": -90.03776550292969, "logps/rejected": -118.58789825439453, "loss": 0.4232, "rewards/accuracies": 0.0, "rewards/chosen": 2.1809394359588623, "rewards/margins": -0.15767979621887207, "rewards/rejected": 2.3386192321777344, "step": 2991 }, { "epoch": 0.49, "learning_rate": 9.933323660829328e-07, "logits/chosen": -0.675149142742157, "logits/rejected": -0.6533757448196411, "logps/chosen": -79.16631317138672, "logps/rejected": -141.70663452148438, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 0.6959525942802429, "rewards/margins": 1.0062644481658936, "rewards/rejected": -0.310311883687973, "step": 2992 }, { "epoch": 0.49, "learning_rate": 9.933109574809812e-07, "logits/chosen": -0.526729166507721, "logits/rejected": -0.5391553640365601, "logps/chosen": -120.26358032226562, "logps/rejected": -114.851318359375, "loss": 1.2471, "rewards/accuracies": 0.0, "rewards/chosen": 3.735238790512085, "rewards/margins": -0.16722869873046875, "rewards/rejected": 3.9024674892425537, "step": 2993 }, { "epoch": 0.49, "learning_rate": 9.932895147959104e-07, "logits/chosen": -0.7327879071235657, "logits/rejected": -0.7039821743965149, "logps/chosen": -88.33226013183594, "logps/rejected": -84.89468383789062, "loss": 0.356, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864799380302429, "rewards/margins": 0.6051170229911804, "rewards/rejected": 0.3813629150390625, "step": 2994 }, { "epoch": 0.49, "learning_rate": 9.932680380292019e-07, "logits/chosen": -0.5135159492492676, "logits/rejected": -0.566120982170105, "logps/chosen": -91.94224548339844, "logps/rejected": -72.84510803222656, "loss": 1.5292, "rewards/accuracies": 0.0, "rewards/chosen": -0.21113891899585724, "rewards/margins": -2.191211700439453, "rewards/rejected": 1.9800728559494019, "step": 2995 }, { "epoch": 0.49, "learning_rate": 9.932465271823389e-07, "logits/chosen": -0.5289689302444458, "logits/rejected": -0.456659197807312, "logps/chosen": -51.97382354736328, "logps/rejected": -70.5664291381836, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.5605186223983765, "rewards/margins": 0.6599181890487671, "rewards/rejected": 0.9006004333496094, "step": 2996 }, { "epoch": 0.49, "learning_rate": 9.932249822568084e-07, "logits/chosen": -0.9030478596687317, "logits/rejected": -0.6720403432846069, "logps/chosen": -100.09951782226562, "logps/rejected": -91.58062744140625, "loss": 2.1111, "rewards/accuracies": 0.0, "rewards/chosen": 1.2737244367599487, "rewards/margins": -2.79345703125, "rewards/rejected": 4.067181587219238, "step": 2997 }, { "epoch": 0.49, "learning_rate": 9.932034032540983e-07, "logits/chosen": -0.5455143451690674, "logits/rejected": -0.5128642320632935, "logps/chosen": -24.22187614440918, "logps/rejected": -47.84765625, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 0.43550702929496765, "rewards/margins": 0.5292608737945557, "rewards/rejected": -0.09375381469726562, "step": 2998 }, { "epoch": 0.49, "learning_rate": 9.931817901756997e-07, "logits/chosen": -0.5796052813529968, "logits/rejected": -0.559055745601654, "logps/chosen": -74.52532196044922, "logps/rejected": -62.1107177734375, "loss": 0.3131, "rewards/accuracies": 1.0, "rewards/chosen": 2.561717987060547, "rewards/margins": 0.8849478960037231, "rewards/rejected": 1.6767700910568237, "step": 2999 }, { "epoch": 0.49, "learning_rate": 9.931601430231062e-07, "logits/chosen": -0.33070728182792664, "logits/rejected": -0.2578829526901245, "logps/chosen": -69.81551361083984, "logps/rejected": -26.63591957092285, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 2.441035509109497, "rewards/margins": 1.388857126235962, "rewards/rejected": 1.0521783828735352, "step": 3000 }, { "epoch": 0.49, "learning_rate": 9.931384617978129e-07, "logits/chosen": -1.139762282371521, "logits/rejected": -1.0813544988632202, "logps/chosen": -141.63714599609375, "logps/rejected": -68.22248840332031, "loss": 0.1348, "rewards/accuracies": 1.0, "rewards/chosen": 5.054800510406494, "rewards/margins": 2.732504367828369, "rewards/rejected": 2.322296142578125, "step": 3001 }, { "epoch": 0.49, "learning_rate": 9.931167465013182e-07, "logits/chosen": -0.387690007686615, "logits/rejected": -0.40319496393203735, "logps/chosen": -61.8475456237793, "logps/rejected": -50.07592010498047, "loss": 1.0705, "rewards/accuracies": 0.0, "rewards/chosen": 1.6054028272628784, "rewards/margins": -0.7707279920578003, "rewards/rejected": 2.3761308193206787, "step": 3002 }, { "epoch": 0.49, "learning_rate": 9.930949971351221e-07, "logits/chosen": -0.30627936124801636, "logits/rejected": -0.30304351449012756, "logps/chosen": -1.4560432434082031, "logps/rejected": -4.77388334274292, "loss": 0.377, "rewards/accuracies": 0.0, "rewards/chosen": 0.28904175758361816, "rewards/margins": -0.046732425689697266, "rewards/rejected": 0.33577418327331543, "step": 3003 }, { "epoch": 0.49, "learning_rate": 9.930732137007275e-07, "logits/chosen": -0.3375340402126312, "logits/rejected": -0.3375340402126312, "logps/chosen": -16.710983276367188, "logps/rejected": -16.710983276367188, "loss": 0.8451, "rewards/accuracies": 0.0, "rewards/chosen": 0.10985984653234482, "rewards/margins": 0.0, "rewards/rejected": 0.10985984653234482, "step": 3004 }, { "epoch": 0.49, "learning_rate": 9.930513961996393e-07, "logits/chosen": -0.4633542001247406, "logits/rejected": -0.4085800349712372, "logps/chosen": -78.86654663085938, "logps/rejected": -137.8452911376953, "loss": 0.8555, "rewards/accuracies": 0.0, "rewards/chosen": 0.7430412173271179, "rewards/margins": -0.1711266040802002, "rewards/rejected": 0.9141678214073181, "step": 3005 }, { "epoch": 0.49, "learning_rate": 9.930295446333647e-07, "logits/chosen": -0.740915060043335, "logits/rejected": -0.6392861008644104, "logps/chosen": -149.6530303955078, "logps/rejected": -229.19241333007812, "loss": 0.4769, "rewards/accuracies": 0.0, "rewards/chosen": 3.483290195465088, "rewards/margins": -0.42722320556640625, "rewards/rejected": 3.910513401031494, "step": 3006 }, { "epoch": 0.49, "learning_rate": 9.93007659003414e-07, "logits/chosen": -0.3798580765724182, "logits/rejected": -0.1578236073255539, "logps/chosen": -138.6660919189453, "logps/rejected": -58.588600158691406, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 4.481889247894287, "rewards/margins": 2.393122673034668, "rewards/rejected": 2.088766574859619, "step": 3007 }, { "epoch": 0.49, "learning_rate": 9.92985739311299e-07, "logits/chosen": -0.6328847408294678, "logits/rejected": -0.1744803786277771, "logps/chosen": -110.04603576660156, "logps/rejected": -96.41183471679688, "loss": 0.5787, "rewards/accuracies": 0.0, "rewards/chosen": 1.2060364484786987, "rewards/margins": -0.6549315452575684, "rewards/rejected": 1.860967993736267, "step": 3008 }, { "epoch": 0.49, "learning_rate": 9.929637855585336e-07, "logits/chosen": -0.3683845102787018, "logits/rejected": -0.3683845102787018, "logps/chosen": -101.89781951904297, "logps/rejected": -101.89781951904297, "loss": 0.46, "rewards/accuracies": 0.0, "rewards/chosen": 0.6281486749649048, "rewards/margins": 0.0, "rewards/rejected": 0.6281486749649048, "step": 3009 }, { "epoch": 0.49, "learning_rate": 9.929417977466354e-07, "logits/chosen": -1.0032979249954224, "logits/rejected": -0.9380519390106201, "logps/chosen": -112.14088439941406, "logps/rejected": -45.84285354614258, "loss": 0.3752, "rewards/accuracies": 0.0, "rewards/chosen": 0.7480179071426392, "rewards/margins": -0.024159252643585205, "rewards/rejected": 0.7721771597862244, "step": 3010 }, { "epoch": 0.49, "learning_rate": 9.929197758771233e-07, "logits/chosen": -0.5387061238288879, "logits/rejected": -0.3527833819389343, "logps/chosen": -55.83943557739258, "logps/rejected": -49.4267578125, "loss": 0.661, "rewards/accuracies": 0.0, "rewards/chosen": 1.3094723224639893, "rewards/margins": -0.21559488773345947, "rewards/rejected": 1.5250672101974487, "step": 3011 }, { "epoch": 0.49, "learning_rate": 9.928977199515184e-07, "logits/chosen": -0.6051996350288391, "logits/rejected": -0.5376366972923279, "logps/chosen": -60.58317184448242, "logps/rejected": -68.83216857910156, "loss": 0.4701, "rewards/accuracies": 1.0, "rewards/chosen": 2.2136662006378174, "rewards/margins": 0.9140576124191284, "rewards/rejected": 1.299608588218689, "step": 3012 }, { "epoch": 0.49, "learning_rate": 9.928756299713453e-07, "logits/chosen": -0.6134124398231506, "logits/rejected": -0.6092967987060547, "logps/chosen": -86.68695831298828, "logps/rejected": -48.455814361572266, "loss": 0.5746, "rewards/accuracies": 1.0, "rewards/chosen": 1.494299292564392, "rewards/margins": 0.5923537611961365, "rewards/rejected": 0.9019455313682556, "step": 3013 }, { "epoch": 0.49, "learning_rate": 9.928535059381297e-07, "logits/chosen": -0.9954026937484741, "logits/rejected": -0.9840042591094971, "logps/chosen": -120.32609558105469, "logps/rejected": -150.15023803710938, "loss": 0.6491, "rewards/accuracies": 0.0, "rewards/chosen": 0.5001358389854431, "rewards/margins": -0.910418689250946, "rewards/rejected": 1.4105545282363892, "step": 3014 }, { "epoch": 0.49, "learning_rate": 9.928313478534002e-07, "logits/chosen": -0.37692636251449585, "logits/rejected": -0.3475205898284912, "logps/chosen": -52.0627555847168, "logps/rejected": -17.5496768951416, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 1.7842625379562378, "rewards/margins": 1.441584587097168, "rewards/rejected": 0.34267789125442505, "step": 3015 }, { "epoch": 0.49, "learning_rate": 9.928091557186877e-07, "logits/chosen": -0.5514901876449585, "logits/rejected": -0.4625172019004822, "logps/chosen": -144.47059631347656, "logps/rejected": -62.03804397583008, "loss": 0.2097, "rewards/accuracies": 1.0, "rewards/chosen": 3.627528429031372, "rewards/margins": 0.6714787483215332, "rewards/rejected": 2.956049680709839, "step": 3016 }, { "epoch": 0.49, "learning_rate": 9.927869295355257e-07, "logits/chosen": -0.5819169878959656, "logits/rejected": -0.6538808345794678, "logps/chosen": -73.56219482421875, "logps/rejected": -135.9867401123047, "loss": 0.7995, "rewards/accuracies": 0.0, "rewards/chosen": 2.575122117996216, "rewards/margins": -1.2973418235778809, "rewards/rejected": 3.8724639415740967, "step": 3017 }, { "epoch": 0.49, "learning_rate": 9.927646693054495e-07, "logits/chosen": -0.4411912262439728, "logits/rejected": -0.006012193392962217, "logps/chosen": -149.35427856445312, "logps/rejected": -39.079856872558594, "loss": 0.1409, "rewards/accuracies": 1.0, "rewards/chosen": 5.0308837890625, "rewards/margins": 4.270933151245117, "rewards/rejected": 0.7599506378173828, "step": 3018 }, { "epoch": 0.49, "learning_rate": 9.927423750299974e-07, "logits/chosen": -0.6660200953483582, "logits/rejected": -0.6730122566223145, "logps/chosen": -76.3685302734375, "logps/rejected": -68.65862274169922, "loss": 0.5532, "rewards/accuracies": 0.0, "rewards/chosen": 1.9185837507247925, "rewards/margins": -0.14612281322479248, "rewards/rejected": 2.064706563949585, "step": 3019 }, { "epoch": 0.49, "learning_rate": 9.927200467107095e-07, "logits/chosen": -0.2751881778240204, "logits/rejected": -0.2687317132949829, "logps/chosen": -12.274545669555664, "logps/rejected": -8.48935317993164, "loss": 0.7946, "rewards/accuracies": 1.0, "rewards/chosen": 0.9318622946739197, "rewards/margins": 0.033272743225097656, "rewards/rejected": 0.898589551448822, "step": 3020 }, { "epoch": 0.49, "learning_rate": 9.926976843491285e-07, "logits/chosen": -0.2740081250667572, "logits/rejected": -0.0524221770465374, "logps/chosen": -59.26389694213867, "logps/rejected": -108.4224624633789, "loss": 0.6871, "rewards/accuracies": 0.0, "rewards/chosen": 2.2686893939971924, "rewards/margins": -0.5009143352508545, "rewards/rejected": 2.769603729248047, "step": 3021 }, { "epoch": 0.49, "learning_rate": 9.926752879467995e-07, "logits/chosen": -0.4892735183238983, "logits/rejected": -0.7492787837982178, "logps/chosen": -87.66305541992188, "logps/rejected": -63.10673522949219, "loss": 0.5763, "rewards/accuracies": 1.0, "rewards/chosen": 2.215022325515747, "rewards/margins": 0.3076660633087158, "rewards/rejected": 1.9073562622070312, "step": 3022 }, { "epoch": 0.49, "learning_rate": 9.926528575052698e-07, "logits/chosen": -0.47570866346359253, "logits/rejected": -0.38249287009239197, "logps/chosen": -74.07255554199219, "logps/rejected": -87.00181579589844, "loss": 0.2813, "rewards/accuracies": 1.0, "rewards/chosen": 2.1028192043304443, "rewards/margins": 0.7055901288986206, "rewards/rejected": 1.3972290754318237, "step": 3023 }, { "epoch": 0.49, "learning_rate": 9.92630393026089e-07, "logits/chosen": -0.7850649952888489, "logits/rejected": -0.8520060777664185, "logps/chosen": -94.0230712890625, "logps/rejected": -79.79447174072266, "loss": 0.9931, "rewards/accuracies": 0.0, "rewards/chosen": 1.354274034500122, "rewards/margins": -0.46717143058776855, "rewards/rejected": 1.8214454650878906, "step": 3024 }, { "epoch": 0.49, "learning_rate": 9.926078945108097e-07, "logits/chosen": -0.43282365798950195, "logits/rejected": -0.40051722526550293, "logps/chosen": -252.01593017578125, "logps/rejected": -108.07310485839844, "loss": 0.1104, "rewards/accuracies": 1.0, "rewards/chosen": 1.864691138267517, "rewards/margins": 1.9519027471542358, "rewards/rejected": -0.08721160888671875, "step": 3025 }, { "epoch": 0.49, "learning_rate": 9.925853619609857e-07, "logits/chosen": -0.8656134605407715, "logits/rejected": -0.8880469799041748, "logps/chosen": -54.91535949707031, "logps/rejected": -87.7491683959961, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.6102478504180908, "rewards/margins": 0.029300689697265625, "rewards/rejected": 1.5809471607208252, "step": 3026 }, { "epoch": 0.49, "learning_rate": 9.925627953781742e-07, "logits/chosen": -0.5102165341377258, "logits/rejected": -0.4384918212890625, "logps/chosen": -68.87120819091797, "logps/rejected": -22.133089065551758, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 1.2351654767990112, "rewards/margins": 0.896741509437561, "rewards/rejected": 0.3384239375591278, "step": 3027 }, { "epoch": 0.49, "learning_rate": 9.925401947639343e-07, "logits/chosen": -0.8990594744682312, "logits/rejected": -0.885931670665741, "logps/chosen": -77.20077514648438, "logps/rejected": -18.97445297241211, "loss": 0.701, "rewards/accuracies": 1.0, "rewards/chosen": 0.4546409547328949, "rewards/margins": 0.3036922216415405, "rewards/rejected": 0.15094871819019318, "step": 3028 }, { "epoch": 0.49, "learning_rate": 9.925175601198272e-07, "logits/chosen": -0.351325124502182, "logits/rejected": -0.3478061854839325, "logps/chosen": -9.389267921447754, "logps/rejected": -19.554304122924805, "loss": 0.7167, "rewards/accuracies": 0.0, "rewards/chosen": 0.2136271446943283, "rewards/margins": -0.09220419824123383, "rewards/rejected": 0.30583134293556213, "step": 3029 }, { "epoch": 0.49, "learning_rate": 9.924948914474172e-07, "logits/chosen": -0.5402959585189819, "logits/rejected": -0.5094411969184875, "logps/chosen": -58.88042449951172, "logps/rejected": -105.67485809326172, "loss": 1.0568, "rewards/accuracies": 0.0, "rewards/chosen": 2.2201759815216064, "rewards/margins": -1.1991393566131592, "rewards/rejected": 3.4193153381347656, "step": 3030 }, { "epoch": 0.49, "learning_rate": 9.9247218874827e-07, "logits/chosen": -0.247004896402359, "logits/rejected": -0.22411131858825684, "logps/chosen": -76.24652099609375, "logps/rejected": -75.76109313964844, "loss": 1.6848, "rewards/accuracies": 0.0, "rewards/chosen": 1.2773040533065796, "rewards/margins": -1.1189416646957397, "rewards/rejected": 2.3962457180023193, "step": 3031 }, { "epoch": 0.49, "learning_rate": 9.924494520239546e-07, "logits/chosen": -0.7907826900482178, "logits/rejected": -0.6748138070106506, "logps/chosen": -112.76063537597656, "logps/rejected": -99.08195495605469, "loss": 0.9551, "rewards/accuracies": 0.0, "rewards/chosen": 1.6240463256835938, "rewards/margins": -1.1072471141815186, "rewards/rejected": 2.7312934398651123, "step": 3032 }, { "epoch": 0.49, "learning_rate": 9.924266812760414e-07, "logits/chosen": -1.032900333404541, "logits/rejected": -0.952337920665741, "logps/chosen": -82.68389892578125, "logps/rejected": -21.927961349487305, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 4.69618558883667, "rewards/margins": 4.340335845947266, "rewards/rejected": 0.35584965348243713, "step": 3033 }, { "epoch": 0.49, "learning_rate": 9.92403876506104e-07, "logits/chosen": -0.31350958347320557, "logits/rejected": -0.05860349163413048, "logps/chosen": -63.24962615966797, "logps/rejected": -46.461883544921875, "loss": 0.2835, "rewards/accuracies": 1.0, "rewards/chosen": 1.7937248945236206, "rewards/margins": 0.9619663953781128, "rewards/rejected": 0.8317584991455078, "step": 3034 }, { "epoch": 0.49, "learning_rate": 9.92381037715718e-07, "logits/chosen": -0.7861009240150452, "logits/rejected": -1.0949289798736572, "logps/chosen": -95.59178161621094, "logps/rejected": -51.17344665527344, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 0.685107409954071, "rewards/margins": 0.07470625638961792, "rewards/rejected": 0.6104011535644531, "step": 3035 }, { "epoch": 0.49, "learning_rate": 9.92358164906461e-07, "logits/chosen": -0.47872892022132874, "logits/rejected": -0.3636578619480133, "logps/chosen": -62.065982818603516, "logps/rejected": -54.98204803466797, "loss": 0.9315, "rewards/accuracies": 0.0, "rewards/chosen": 1.226243257522583, "rewards/margins": -0.6898975372314453, "rewards/rejected": 1.9161407947540283, "step": 3036 }, { "epoch": 0.49, "learning_rate": 9.923352580799138e-07, "logits/chosen": -0.47549450397491455, "logits/rejected": -0.3681102693080902, "logps/chosen": -63.22734069824219, "logps/rejected": -49.620174407958984, "loss": 0.2119, "rewards/accuracies": 1.0, "rewards/chosen": 1.9827079772949219, "rewards/margins": 0.8033256530761719, "rewards/rejected": 1.17938232421875, "step": 3037 }, { "epoch": 0.49, "learning_rate": 9.923123172376587e-07, "logits/chosen": -0.6431024074554443, "logits/rejected": -0.5868386030197144, "logps/chosen": -75.59355163574219, "logps/rejected": -182.24034118652344, "loss": 0.5531, "rewards/accuracies": 1.0, "rewards/chosen": 0.8757705688476562, "rewards/margins": 0.2283996343612671, "rewards/rejected": 0.6473709344863892, "step": 3038 }, { "epoch": 0.49, "learning_rate": 9.922893423812808e-07, "logits/chosen": -0.7042097449302673, "logits/rejected": -0.8819311857223511, "logps/chosen": -52.51353454589844, "logps/rejected": -95.6641845703125, "loss": 2.4499, "rewards/accuracies": 0.0, "rewards/chosen": 2.001072645187378, "rewards/margins": -2.912945508956909, "rewards/rejected": 4.914018154144287, "step": 3039 }, { "epoch": 0.49, "learning_rate": 9.922663335123672e-07, "logits/chosen": -0.4709072411060333, "logits/rejected": -0.2334400862455368, "logps/chosen": -161.60440063476562, "logps/rejected": -127.58453369140625, "loss": 0.9483, "rewards/accuracies": 1.0, "rewards/chosen": 4.7314453125, "rewards/margins": 1.1575348377227783, "rewards/rejected": 3.5739104747772217, "step": 3040 }, { "epoch": 0.49, "learning_rate": 9.922432906325082e-07, "logits/chosen": -0.05037480220198631, "logits/rejected": -0.24585507810115814, "logps/chosen": -93.1824951171875, "logps/rejected": -64.98222351074219, "loss": 0.7354, "rewards/accuracies": 1.0, "rewards/chosen": 1.8359100818634033, "rewards/margins": 0.36579978466033936, "rewards/rejected": 1.470110297203064, "step": 3041 }, { "epoch": 0.49, "learning_rate": 9.922202137432953e-07, "logits/chosen": -0.6720080971717834, "logits/rejected": -0.7645739912986755, "logps/chosen": -124.92135620117188, "logps/rejected": -134.58909606933594, "loss": 2.0235, "rewards/accuracies": 0.0, "rewards/chosen": 0.9283844232559204, "rewards/margins": -2.4003725051879883, "rewards/rejected": 3.328756809234619, "step": 3042 }, { "epoch": 0.49, "learning_rate": 9.92197102846323e-07, "logits/chosen": -0.5409352779388428, "logits/rejected": -0.5409352779388428, "logps/chosen": -20.96253776550293, "logps/rejected": -20.96253776550293, "loss": 0.7704, "rewards/accuracies": 0.0, "rewards/chosen": 0.09352626651525497, "rewards/margins": 0.0, "rewards/rejected": 0.09352626651525497, "step": 3043 }, { "epoch": 0.49, "learning_rate": 9.921739579431882e-07, "logits/chosen": -0.298576682806015, "logits/rejected": -0.31213995814323425, "logps/chosen": -83.30815124511719, "logps/rejected": -49.485076904296875, "loss": 1.1715, "rewards/accuracies": 0.0, "rewards/chosen": 0.08451461791992188, "rewards/margins": -2.0591447353363037, "rewards/rejected": 2.1436593532562256, "step": 3044 }, { "epoch": 0.49, "learning_rate": 9.9215077903549e-07, "logits/chosen": -0.4579547643661499, "logits/rejected": -0.4974406063556671, "logps/chosen": -105.37996673583984, "logps/rejected": -76.02377319335938, "loss": 1.2278, "rewards/accuracies": 0.0, "rewards/chosen": 0.6598884463310242, "rewards/margins": -1.4223434925079346, "rewards/rejected": 2.0822319984436035, "step": 3045 }, { "epoch": 0.49, "learning_rate": 9.921275661248294e-07, "logits/chosen": -0.31158173084259033, "logits/rejected": -0.2058849185705185, "logps/chosen": -184.99017333984375, "logps/rejected": -41.816261291503906, "loss": 1.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.8259963989257812, "rewards/margins": 0.587646484375, "rewards/rejected": 0.23834991455078125, "step": 3046 }, { "epoch": 0.49, "learning_rate": 9.92104319212811e-07, "logits/chosen": -0.41179943084716797, "logits/rejected": -0.4013872444629669, "logps/chosen": -82.50930786132812, "logps/rejected": -136.9879608154297, "loss": 0.1948, "rewards/accuracies": 1.0, "rewards/chosen": 1.2958900928497314, "rewards/margins": 0.9107017517089844, "rewards/rejected": 0.3851883113384247, "step": 3047 }, { "epoch": 0.49, "learning_rate": 9.9208103830104e-07, "logits/chosen": -0.36318135261535645, "logits/rejected": -0.4425203204154968, "logps/chosen": -71.16981506347656, "logps/rejected": -74.60171508789062, "loss": 0.3655, "rewards/accuracies": 1.0, "rewards/chosen": 1.8387550115585327, "rewards/margins": 0.4176429510116577, "rewards/rejected": 1.421112060546875, "step": 3048 }, { "epoch": 0.49, "learning_rate": 9.920577233911256e-07, "logits/chosen": -0.4150574207305908, "logits/rejected": -0.34797969460487366, "logps/chosen": -95.65620422363281, "logps/rejected": -52.83653259277344, "loss": 1.6078, "rewards/accuracies": 0.0, "rewards/chosen": 1.5032867193222046, "rewards/margins": -0.9407669305801392, "rewards/rejected": 2.4440536499023438, "step": 3049 }, { "epoch": 0.5, "learning_rate": 9.920343744846783e-07, "logits/chosen": -0.4375777542591095, "logits/rejected": -0.5018752813339233, "logps/chosen": -102.21664428710938, "logps/rejected": -98.94746398925781, "loss": 0.9367, "rewards/accuracies": 0.0, "rewards/chosen": 4.0660719871521, "rewards/margins": -0.4044070243835449, "rewards/rejected": 4.4704790115356445, "step": 3050 }, { "epoch": 0.5, "learning_rate": 9.920109915833117e-07, "logits/chosen": -0.8589287996292114, "logits/rejected": -0.8055229783058167, "logps/chosen": -351.100830078125, "logps/rejected": -115.82968139648438, "loss": 0.0857, "rewards/accuracies": 1.0, "rewards/chosen": 3.132824659347534, "rewards/margins": 2.5836341381073, "rewards/rejected": 0.5491905212402344, "step": 3051 }, { "epoch": 0.5, "learning_rate": 9.919875746886408e-07, "logits/chosen": -0.20385749638080597, "logits/rejected": -0.20385749638080597, "logps/chosen": -5.991353988647461, "logps/rejected": -5.991353988647461, "loss": 0.7947, "rewards/accuracies": 0.0, "rewards/chosen": 0.1926746368408203, "rewards/margins": 0.0, "rewards/rejected": 0.1926746368408203, "step": 3052 }, { "epoch": 0.5, "learning_rate": 9.919641238022837e-07, "logits/chosen": -0.9546785950660706, "logits/rejected": -0.9180130958557129, "logps/chosen": -96.96846008300781, "logps/rejected": -84.13506317138672, "loss": 1.3823, "rewards/accuracies": 0.0, "rewards/chosen": 1.117488145828247, "rewards/margins": -1.1817359924316406, "rewards/rejected": 2.2992241382598877, "step": 3053 }, { "epoch": 0.5, "learning_rate": 9.919406389258606e-07, "logits/chosen": -0.7425364851951599, "logits/rejected": -0.7256855964660645, "logps/chosen": -71.78453063964844, "logps/rejected": -80.45731353759766, "loss": 0.5325, "rewards/accuracies": 1.0, "rewards/chosen": 0.5288192629814148, "rewards/margins": 0.4745582342147827, "rewards/rejected": 0.054261017590761185, "step": 3054 }, { "epoch": 0.5, "learning_rate": 9.919171200609945e-07, "logits/chosen": -0.4409453570842743, "logits/rejected": -0.4942333698272705, "logps/chosen": -78.28913116455078, "logps/rejected": -90.59344482421875, "loss": 0.3995, "rewards/accuracies": 0.0, "rewards/chosen": 0.4199790954589844, "rewards/margins": -0.08083724975585938, "rewards/rejected": 0.5008163452148438, "step": 3055 }, { "epoch": 0.5, "learning_rate": 9.918935672093095e-07, "logits/chosen": -0.5774600505828857, "logits/rejected": -0.5858314037322998, "logps/chosen": -65.23319244384766, "logps/rejected": -85.3659896850586, "loss": 0.4425, "rewards/accuracies": 1.0, "rewards/chosen": 0.6600349545478821, "rewards/margins": 0.2660476863384247, "rewards/rejected": 0.3939872682094574, "step": 3056 }, { "epoch": 0.5, "learning_rate": 9.918699803724335e-07, "logits/chosen": -0.839272141456604, "logits/rejected": -0.7761252522468567, "logps/chosen": -34.01781463623047, "logps/rejected": -94.88672637939453, "loss": 0.3846, "rewards/accuracies": 0.0, "rewards/chosen": 1.4991066455841064, "rewards/margins": -0.046844482421875, "rewards/rejected": 1.5459511280059814, "step": 3057 }, { "epoch": 0.5, "learning_rate": 9.918463595519962e-07, "logits/chosen": -0.5924018025398254, "logits/rejected": -0.5968722105026245, "logps/chosen": -70.29254150390625, "logps/rejected": -122.66456604003906, "loss": 0.7568, "rewards/accuracies": 1.0, "rewards/chosen": 1.0053482055664062, "rewards/margins": 0.8886497616767883, "rewards/rejected": 0.11669845879077911, "step": 3058 }, { "epoch": 0.5, "learning_rate": 9.91822704749629e-07, "logits/chosen": -0.5110361576080322, "logits/rejected": -0.504034698009491, "logps/chosen": -93.6182861328125, "logps/rejected": -53.447715759277344, "loss": 0.6031, "rewards/accuracies": 0.0, "rewards/chosen": 0.41415712237358093, "rewards/margins": -0.0553973913192749, "rewards/rejected": 0.46955451369285583, "step": 3059 }, { "epoch": 0.5, "learning_rate": 9.917990159669668e-07, "logits/chosen": -0.5190058946609497, "logits/rejected": -0.47964057326316833, "logps/chosen": -59.616207122802734, "logps/rejected": -97.98725891113281, "loss": 1.0187, "rewards/accuracies": 1.0, "rewards/chosen": 1.2993640899658203, "rewards/margins": 1.391212821006775, "rewards/rejected": -0.09184875339269638, "step": 3060 }, { "epoch": 0.5, "learning_rate": 9.91775293205646e-07, "logits/chosen": -0.5187125205993652, "logits/rejected": -0.4682437777519226, "logps/chosen": -83.91250610351562, "logps/rejected": -90.93228149414062, "loss": 0.2712, "rewards/accuracies": 1.0, "rewards/chosen": 1.7491546869277954, "rewards/margins": 1.1081939935684204, "rewards/rejected": 0.640960693359375, "step": 3061 }, { "epoch": 0.5, "learning_rate": 9.917515364673057e-07, "logits/chosen": -0.46840938925743103, "logits/rejected": -0.24880380928516388, "logps/chosen": -134.64862060546875, "logps/rejected": -68.73176574707031, "loss": 1.0439, "rewards/accuracies": 1.0, "rewards/chosen": 3.452227830886841, "rewards/margins": 1.5846779346466064, "rewards/rejected": 1.8675498962402344, "step": 3062 }, { "epoch": 0.5, "learning_rate": 9.917277457535871e-07, "logits/chosen": -1.040688395500183, "logits/rejected": -1.0708539485931396, "logps/chosen": -127.22948455810547, "logps/rejected": -78.58882141113281, "loss": 0.7124, "rewards/accuracies": 0.0, "rewards/chosen": 0.6154960989952087, "rewards/margins": -1.095761775970459, "rewards/rejected": 1.7112579345703125, "step": 3063 }, { "epoch": 0.5, "learning_rate": 9.91703921066134e-07, "logits/chosen": -0.36970019340515137, "logits/rejected": -0.2674705386161804, "logps/chosen": -162.03578186035156, "logps/rejected": -78.78333282470703, "loss": 0.2873, "rewards/accuracies": 1.0, "rewards/chosen": 4.021191596984863, "rewards/margins": 1.4643075466156006, "rewards/rejected": 2.5568840503692627, "step": 3064 }, { "epoch": 0.5, "learning_rate": 9.916800624065926e-07, "logits/chosen": -0.8782859444618225, "logits/rejected": -0.7708548307418823, "logps/chosen": -124.83853149414062, "logps/rejected": -82.04385375976562, "loss": 0.8862, "rewards/accuracies": 1.0, "rewards/chosen": 4.496771335601807, "rewards/margins": 1.1196808815002441, "rewards/rejected": 3.3770904541015625, "step": 3065 }, { "epoch": 0.5, "learning_rate": 9.916561697766112e-07, "logits/chosen": -0.7683801054954529, "logits/rejected": -0.7888852953910828, "logps/chosen": -214.27719116210938, "logps/rejected": -55.812530517578125, "loss": 0.3933, "rewards/accuracies": 0.0, "rewards/chosen": 1.9713226556777954, "rewards/margins": -0.14143073558807373, "rewards/rejected": 2.112753391265869, "step": 3066 }, { "epoch": 0.5, "learning_rate": 9.916322431778406e-07, "logits/chosen": -0.5488560199737549, "logits/rejected": -0.5202950239181519, "logps/chosen": -100.83425903320312, "logps/rejected": -107.42721557617188, "loss": 0.9573, "rewards/accuracies": 1.0, "rewards/chosen": 1.4243453741073608, "rewards/margins": 1.008154273033142, "rewards/rejected": 0.41619110107421875, "step": 3067 }, { "epoch": 0.5, "learning_rate": 9.916082826119338e-07, "logits/chosen": -0.7178934216499329, "logits/rejected": -0.6785790324211121, "logps/chosen": -64.1854248046875, "logps/rejected": -58.64377975463867, "loss": 1.8356, "rewards/accuracies": 0.0, "rewards/chosen": 0.7075034976005554, "rewards/margins": -2.1769344806671143, "rewards/rejected": 2.8844380378723145, "step": 3068 }, { "epoch": 0.5, "learning_rate": 9.915842880805464e-07, "logits/chosen": -0.6587724089622498, "logits/rejected": -0.6556135416030884, "logps/chosen": -51.87128829956055, "logps/rejected": -75.60581970214844, "loss": 0.5124, "rewards/accuracies": 0.0, "rewards/chosen": 1.7018451690673828, "rewards/margins": -0.4866445064544678, "rewards/rejected": 2.1884896755218506, "step": 3069 }, { "epoch": 0.5, "learning_rate": 9.91560259585336e-07, "logits/chosen": -0.6534502506256104, "logits/rejected": -0.6534502506256104, "logps/chosen": -72.14729309082031, "logps/rejected": -72.14729309082031, "loss": 0.6598, "rewards/accuracies": 0.0, "rewards/chosen": 1.3491653203964233, "rewards/margins": 0.0, "rewards/rejected": 1.3491653203964233, "step": 3070 }, { "epoch": 0.5, "learning_rate": 9.91536197127963e-07, "logits/chosen": -0.5980044007301331, "logits/rejected": -0.5617388486862183, "logps/chosen": -62.45965576171875, "logps/rejected": -35.581119537353516, "loss": 0.1341, "rewards/accuracies": 1.0, "rewards/chosen": 2.6708106994628906, "rewards/margins": 1.488425374031067, "rewards/rejected": 1.1823853254318237, "step": 3071 }, { "epoch": 0.5, "learning_rate": 9.915121007100898e-07, "logits/chosen": -0.40725380182266235, "logits/rejected": -0.35809779167175293, "logps/chosen": -100.12944793701172, "logps/rejected": -64.3033676147461, "loss": 0.6119, "rewards/accuracies": 0.0, "rewards/chosen": 0.3139183223247528, "rewards/margins": -0.866162896156311, "rewards/rejected": 1.1800812482833862, "step": 3072 }, { "epoch": 0.5, "learning_rate": 9.914879703333809e-07, "logits/chosen": -0.4170888364315033, "logits/rejected": -0.41215696930885315, "logps/chosen": -21.814430236816406, "logps/rejected": -27.42041778564453, "loss": 1.9038, "rewards/accuracies": 0.0, "rewards/chosen": 0.05828571319580078, "rewards/margins": -0.4392950236797333, "rewards/rejected": 0.49758073687553406, "step": 3073 }, { "epoch": 0.5, "learning_rate": 9.914638059995038e-07, "logits/chosen": -0.666176438331604, "logits/rejected": -0.6345881819725037, "logps/chosen": -151.27383422851562, "logps/rejected": -139.25746154785156, "loss": 1.0097, "rewards/accuracies": 0.0, "rewards/chosen": 4.157330513000488, "rewards/margins": -0.7806210517883301, "rewards/rejected": 4.937951564788818, "step": 3074 }, { "epoch": 0.5, "learning_rate": 9.914396077101282e-07, "logits/chosen": -0.3040538430213928, "logits/rejected": -0.21778658032417297, "logps/chosen": -92.95260620117188, "logps/rejected": -18.306865692138672, "loss": 0.4834, "rewards/accuracies": 0.0, "rewards/chosen": 0.08410263061523438, "rewards/margins": -0.2776434123516083, "rewards/rejected": 0.36174604296684265, "step": 3075 }, { "epoch": 0.5, "learning_rate": 9.914153754669253e-07, "logits/chosen": -0.5119956135749817, "logits/rejected": -0.5119956135749817, "logps/chosen": -73.44725036621094, "logps/rejected": -73.44725036621094, "loss": 1.5214, "rewards/accuracies": 0.0, "rewards/chosen": 1.7355835437774658, "rewards/margins": 0.0, "rewards/rejected": 1.7355835437774658, "step": 3076 }, { "epoch": 0.5, "learning_rate": 9.913911092715702e-07, "logits/chosen": -0.5586248636245728, "logits/rejected": -0.5798370838165283, "logps/chosen": -147.4866943359375, "logps/rejected": -62.136924743652344, "loss": 0.9644, "rewards/accuracies": 0.0, "rewards/chosen": 0.3498992919921875, "rewards/margins": -1.5134849548339844, "rewards/rejected": 1.8633842468261719, "step": 3077 }, { "epoch": 0.5, "learning_rate": 9.91366809125739e-07, "logits/chosen": -0.7126038074493408, "logits/rejected": -0.7256168127059937, "logps/chosen": -132.394287109375, "logps/rejected": -62.458831787109375, "loss": 0.797, "rewards/accuracies": 0.0, "rewards/chosen": 0.5076553225517273, "rewards/margins": -1.1073670387268066, "rewards/rejected": 1.6150223016738892, "step": 3078 }, { "epoch": 0.5, "learning_rate": 9.913424750311106e-07, "logits/chosen": -0.4870666265487671, "logits/rejected": -0.42425188422203064, "logps/chosen": -28.084196090698242, "logps/rejected": -79.83744812011719, "loss": 0.514, "rewards/accuracies": 0.0, "rewards/chosen": 0.7620599865913391, "rewards/margins": -0.38718849420547485, "rewards/rejected": 1.149248480796814, "step": 3079 }, { "epoch": 0.5, "learning_rate": 9.913181069893662e-07, "logits/chosen": -0.5435547828674316, "logits/rejected": -0.6784083247184753, "logps/chosen": -115.67022705078125, "logps/rejected": -182.0921630859375, "loss": 2.0014, "rewards/accuracies": 0.0, "rewards/chosen": 1.8725662231445312, "rewards/margins": -3.975327968597412, "rewards/rejected": 5.847894191741943, "step": 3080 }, { "epoch": 0.5, "learning_rate": 9.912937050021894e-07, "logits/chosen": -0.544440746307373, "logits/rejected": -0.544440746307373, "logps/chosen": -68.8366928100586, "logps/rejected": -68.8366928100586, "loss": 0.6293, "rewards/accuracies": 0.0, "rewards/chosen": 2.16899037361145, "rewards/margins": 0.0, "rewards/rejected": 2.16899037361145, "step": 3081 }, { "epoch": 0.5, "learning_rate": 9.912692690712665e-07, "logits/chosen": -0.6972863078117371, "logits/rejected": -0.6434838175773621, "logps/chosen": -122.447021484375, "logps/rejected": -47.971221923828125, "loss": 0.2691, "rewards/accuracies": 1.0, "rewards/chosen": 3.4181931018829346, "rewards/margins": 0.9601807594299316, "rewards/rejected": 2.458012342453003, "step": 3082 }, { "epoch": 0.5, "learning_rate": 9.912447991982855e-07, "logits/chosen": -0.5792906284332275, "logits/rejected": -0.5976599454879761, "logps/chosen": -25.77028465270996, "logps/rejected": -55.94093704223633, "loss": 0.9904, "rewards/accuracies": 0.0, "rewards/chosen": 1.0903971195220947, "rewards/margins": -0.1053391695022583, "rewards/rejected": 1.195736289024353, "step": 3083 }, { "epoch": 0.5, "learning_rate": 9.912202953849368e-07, "logits/chosen": -0.8257481455802917, "logits/rejected": -0.7557092905044556, "logps/chosen": -64.61842346191406, "logps/rejected": -41.11896514892578, "loss": 1.2506, "rewards/accuracies": 1.0, "rewards/chosen": 1.0918701887130737, "rewards/margins": 0.9595917463302612, "rewards/rejected": 0.1322784423828125, "step": 3084 }, { "epoch": 0.5, "learning_rate": 9.91195757632914e-07, "logits/chosen": -0.5037267208099365, "logits/rejected": -0.3740140199661255, "logps/chosen": -70.40491485595703, "logps/rejected": -16.60431671142578, "loss": 0.5182, "rewards/accuracies": 1.0, "rewards/chosen": 1.9022384881973267, "rewards/margins": 1.6823005676269531, "rewards/rejected": 0.21993790566921234, "step": 3085 }, { "epoch": 0.5, "learning_rate": 9.91171185943912e-07, "logits/chosen": -0.7040837407112122, "logits/rejected": -0.7040837407112122, "logps/chosen": -0.8066434860229492, "logps/rejected": -0.8066434860229492, "loss": 0.6704, "rewards/accuracies": 0.0, "rewards/chosen": 0.2954931855201721, "rewards/margins": 0.0, "rewards/rejected": 0.2954931855201721, "step": 3086 }, { "epoch": 0.5, "learning_rate": 9.911465803196284e-07, "logits/chosen": -0.5474700331687927, "logits/rejected": -0.12916409969329834, "logps/chosen": -65.64450073242188, "logps/rejected": -58.367454528808594, "loss": 0.5625, "rewards/accuracies": 0.0, "rewards/chosen": 0.6925552487373352, "rewards/margins": -0.4525901675224304, "rewards/rejected": 1.1451454162597656, "step": 3087 }, { "epoch": 0.5, "learning_rate": 9.911219407617635e-07, "logits/chosen": -0.2319779098033905, "logits/rejected": -0.195601224899292, "logps/chosen": -104.25482177734375, "logps/rejected": -125.29683685302734, "loss": 0.9774, "rewards/accuracies": 1.0, "rewards/chosen": 1.461280107498169, "rewards/margins": 0.5995331406593323, "rewards/rejected": 0.8617469668388367, "step": 3088 }, { "epoch": 0.5, "learning_rate": 9.910972672720196e-07, "logits/chosen": -0.5516930222511292, "logits/rejected": -0.4237588047981262, "logps/chosen": -49.69236755371094, "logps/rejected": -22.441219329833984, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 3.4887542724609375, "rewards/margins": 3.1939172744750977, "rewards/rejected": 0.29483699798583984, "step": 3089 }, { "epoch": 0.5, "learning_rate": 9.910725598521012e-07, "logits/chosen": -0.30989816784858704, "logits/rejected": -0.24193331599235535, "logps/chosen": -143.77825927734375, "logps/rejected": -41.407814025878906, "loss": 1.426, "rewards/accuracies": 1.0, "rewards/chosen": 1.4634277820587158, "rewards/margins": 0.3548729419708252, "rewards/rejected": 1.1085548400878906, "step": 3090 }, { "epoch": 0.5, "learning_rate": 9.910478185037156e-07, "logits/chosen": -0.4307968318462372, "logits/rejected": -0.4027039706707001, "logps/chosen": -67.23880004882812, "logps/rejected": -31.162307739257812, "loss": 0.44, "rewards/accuracies": 1.0, "rewards/chosen": 0.767047107219696, "rewards/margins": 0.20696407556533813, "rewards/rejected": 0.5600830316543579, "step": 3091 }, { "epoch": 0.5, "learning_rate": 9.91023043228572e-07, "logits/chosen": -0.2508231997489929, "logits/rejected": -0.2620014250278473, "logps/chosen": -59.21343231201172, "logps/rejected": -93.82608032226562, "loss": 0.2107, "rewards/accuracies": 1.0, "rewards/chosen": 0.873529851436615, "rewards/margins": 1.1043518781661987, "rewards/rejected": -0.23082199692726135, "step": 3092 }, { "epoch": 0.5, "learning_rate": 9.909982340283822e-07, "logits/chosen": -0.6695868372917175, "logits/rejected": -0.7293344736099243, "logps/chosen": -89.77706909179688, "logps/rejected": -67.03961944580078, "loss": 0.4849, "rewards/accuracies": 1.0, "rewards/chosen": 1.8607529401779175, "rewards/margins": 0.1342560052871704, "rewards/rejected": 1.726496934890747, "step": 3093 }, { "epoch": 0.5, "learning_rate": 9.909733909048605e-07, "logits/chosen": -0.5231501460075378, "logits/rejected": -0.5040568709373474, "logps/chosen": -77.49238586425781, "logps/rejected": -38.74285125732422, "loss": 0.8415, "rewards/accuracies": 0.0, "rewards/chosen": 0.429830938577652, "rewards/margins": -1.0478531122207642, "rewards/rejected": 1.4776840209960938, "step": 3094 }, { "epoch": 0.5, "learning_rate": 9.909485138597228e-07, "logits/chosen": -0.622203528881073, "logits/rejected": -0.6222527027130127, "logps/chosen": -141.02618408203125, "logps/rejected": -14.046154022216797, "loss": 0.1078, "rewards/accuracies": 1.0, "rewards/chosen": 2.96537184715271, "rewards/margins": 2.504859209060669, "rewards/rejected": 0.46051254868507385, "step": 3095 }, { "epoch": 0.5, "learning_rate": 9.909236028946885e-07, "logits/chosen": -0.09023477137088776, "logits/rejected": -0.07727272808551788, "logps/chosen": -4.632661819458008, "logps/rejected": -32.00995635986328, "loss": 0.6405, "rewards/accuracies": 1.0, "rewards/chosen": 0.21155916154384613, "rewards/margins": 0.2273552566766739, "rewards/rejected": -0.01579608954489231, "step": 3096 }, { "epoch": 0.5, "learning_rate": 9.908986580114783e-07, "logits/chosen": -0.8084508180618286, "logits/rejected": -0.6801995038986206, "logps/chosen": -100.77995300292969, "logps/rejected": -63.36461639404297, "loss": 0.1496, "rewards/accuracies": 1.0, "rewards/chosen": 3.3438127040863037, "rewards/margins": 1.1807985305786133, "rewards/rejected": 2.1630141735076904, "step": 3097 }, { "epoch": 0.5, "learning_rate": 9.908736792118157e-07, "logits/chosen": -0.237110897898674, "logits/rejected": -0.14910796284675598, "logps/chosen": -57.243160247802734, "logps/rejected": -36.25895309448242, "loss": 1.0351, "rewards/accuracies": 1.0, "rewards/chosen": 0.7629909515380859, "rewards/margins": 0.36118772625923157, "rewards/rejected": 0.40180322527885437, "step": 3098 }, { "epoch": 0.5, "learning_rate": 9.908486664974265e-07, "logits/chosen": -0.5655091404914856, "logits/rejected": -0.4970782399177551, "logps/chosen": -82.4587173461914, "logps/rejected": -18.307544708251953, "loss": 0.7836, "rewards/accuracies": 1.0, "rewards/chosen": 1.3772865533828735, "rewards/margins": 1.1623293161392212, "rewards/rejected": 0.21495723724365234, "step": 3099 }, { "epoch": 0.5, "learning_rate": 9.90823619870039e-07, "logits/chosen": -0.26193493604660034, "logits/rejected": -0.2603461444377899, "logps/chosen": -52.19558334350586, "logps/rejected": -93.7779541015625, "loss": 0.6765, "rewards/accuracies": 0.0, "rewards/chosen": 1.610568642616272, "rewards/margins": -0.3031322956085205, "rewards/rejected": 1.9137009382247925, "step": 3100 }, { "epoch": 0.5, "learning_rate": 9.907985393313836e-07, "logits/chosen": -0.19139668345451355, "logits/rejected": -0.19139668345451355, "logps/chosen": -46.91206359863281, "logps/rejected": -46.91206359863281, "loss": 0.7329, "rewards/accuracies": 0.0, "rewards/chosen": 0.8389191031455994, "rewards/margins": 0.0, "rewards/rejected": 0.8389191031455994, "step": 3101 }, { "epoch": 0.5, "learning_rate": 9.907734248831928e-07, "logits/chosen": -0.4720628261566162, "logits/rejected": -0.3944898843765259, "logps/chosen": -60.92810821533203, "logps/rejected": -50.814720153808594, "loss": 0.2388, "rewards/accuracies": 1.0, "rewards/chosen": 1.9607781171798706, "rewards/margins": 0.5044525861740112, "rewards/rejected": 1.4563255310058594, "step": 3102 }, { "epoch": 0.5, "learning_rate": 9.907482765272025e-07, "logits/chosen": -0.3811551034450531, "logits/rejected": -0.29949480295181274, "logps/chosen": -68.96345520019531, "logps/rejected": -80.5438232421875, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 2.301013231277466, "rewards/margins": 0.1080780029296875, "rewards/rejected": 2.1929352283477783, "step": 3103 }, { "epoch": 0.5, "learning_rate": 9.907230942651497e-07, "logits/chosen": -0.7440533638000488, "logits/rejected": -0.6770711541175842, "logps/chosen": -115.65167236328125, "logps/rejected": -128.1962890625, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 4.063336372375488, "rewards/margins": 0.3751709461212158, "rewards/rejected": 3.6881654262542725, "step": 3104 }, { "epoch": 0.5, "learning_rate": 9.906978780987742e-07, "logits/chosen": -0.3167763948440552, "logits/rejected": -0.3167763948440552, "logps/chosen": -16.10365104675293, "logps/rejected": -16.10365104675293, "loss": 0.7914, "rewards/accuracies": 0.0, "rewards/chosen": 0.7825773358345032, "rewards/margins": 0.0, "rewards/rejected": 0.7825773358345032, "step": 3105 }, { "epoch": 0.5, "learning_rate": 9.906726280298184e-07, "logits/chosen": -0.42231249809265137, "logits/rejected": -0.368782103061676, "logps/chosen": -33.156368255615234, "logps/rejected": -17.25858497619629, "loss": 0.271, "rewards/accuracies": 1.0, "rewards/chosen": 0.9418758749961853, "rewards/margins": 0.44011425971984863, "rewards/rejected": 0.5017616152763367, "step": 3106 }, { "epoch": 0.5, "learning_rate": 9.90647344060027e-07, "logits/chosen": -0.719487190246582, "logits/rejected": -0.6479291915893555, "logps/chosen": -70.26609802246094, "logps/rejected": -79.17637634277344, "loss": 1.3655, "rewards/accuracies": 0.0, "rewards/chosen": 0.8901848196983337, "rewards/margins": -1.6323120594024658, "rewards/rejected": 2.5224969387054443, "step": 3107 }, { "epoch": 0.5, "learning_rate": 9.906220261911465e-07, "logits/chosen": -0.4528590738773346, "logits/rejected": -0.405971884727478, "logps/chosen": -126.88178253173828, "logps/rejected": -62.206180572509766, "loss": 0.5813, "rewards/accuracies": 0.0, "rewards/chosen": 0.6043114066123962, "rewards/margins": -0.45409125089645386, "rewards/rejected": 1.05840265750885, "step": 3108 }, { "epoch": 0.5, "learning_rate": 9.90596674424926e-07, "logits/chosen": -0.9342431426048279, "logits/rejected": -0.9583768248558044, "logps/chosen": -227.38214111328125, "logps/rejected": -150.145751953125, "loss": 1.502, "rewards/accuracies": 0.0, "rewards/chosen": 2.696728467941284, "rewards/margins": -2.713153123855591, "rewards/rejected": 5.409881591796875, "step": 3109 }, { "epoch": 0.5, "learning_rate": 9.90571288763118e-07, "logits/chosen": -0.6596828103065491, "logits/rejected": -0.6053899526596069, "logps/chosen": -107.05754089355469, "logps/rejected": -86.73150634765625, "loss": 0.5301, "rewards/accuracies": 0.0, "rewards/chosen": 0.5640167593955994, "rewards/margins": -0.20515286922454834, "rewards/rejected": 0.7691696286201477, "step": 3110 }, { "epoch": 0.5, "learning_rate": 9.905458692074754e-07, "logits/chosen": -0.901104748249054, "logits/rejected": -1.0367172956466675, "logps/chosen": -328.38458251953125, "logps/rejected": -122.8781967163086, "loss": 0.999, "rewards/accuracies": 0.0, "rewards/chosen": 3.4000306129455566, "rewards/margins": -1.8289403915405273, "rewards/rejected": 5.228971004486084, "step": 3111 }, { "epoch": 0.51, "learning_rate": 9.905204157597548e-07, "logits/chosen": -0.5284866690635681, "logits/rejected": -0.5092833042144775, "logps/chosen": -91.0452880859375, "logps/rejected": -99.52339172363281, "loss": 2.3538, "rewards/accuracies": 0.0, "rewards/chosen": 0.2627914547920227, "rewards/margins": -2.436367988586426, "rewards/rejected": 2.6991593837738037, "step": 3112 }, { "epoch": 0.51, "learning_rate": 9.904949284217147e-07, "logits/chosen": -0.2748895585536957, "logits/rejected": -0.2748895585536957, "logps/chosen": -81.9776840209961, "logps/rejected": -81.9776840209961, "loss": 0.4036, "rewards/accuracies": 0.0, "rewards/chosen": 0.8736168146133423, "rewards/margins": 0.0, "rewards/rejected": 0.8736168146133423, "step": 3113 }, { "epoch": 0.51, "learning_rate": 9.904694071951165e-07, "logits/chosen": -0.44184672832489014, "logits/rejected": -0.3822753131389618, "logps/chosen": -32.351959228515625, "logps/rejected": -7.811795234680176, "loss": 0.8566, "rewards/accuracies": 1.0, "rewards/chosen": 0.5273681879043579, "rewards/margins": 0.3597748875617981, "rewards/rejected": 0.16759328544139862, "step": 3114 }, { "epoch": 0.51, "learning_rate": 9.90443852081723e-07, "logits/chosen": -0.5392826199531555, "logits/rejected": -0.5458217263221741, "logps/chosen": -26.733245849609375, "logps/rejected": -16.601707458496094, "loss": 0.7635, "rewards/accuracies": 0.0, "rewards/chosen": 0.7171783447265625, "rewards/margins": -0.014404714107513428, "rewards/rejected": 0.7315830588340759, "step": 3115 }, { "epoch": 0.51, "learning_rate": 9.904182630832997e-07, "logits/chosen": -0.5038672685623169, "logits/rejected": -0.4734765291213989, "logps/chosen": -76.89486694335938, "logps/rejected": -52.14581298828125, "loss": 0.5386, "rewards/accuracies": 0.0, "rewards/chosen": 1.6382683515548706, "rewards/margins": -0.34408485889434814, "rewards/rejected": 1.9823532104492188, "step": 3116 }, { "epoch": 0.51, "learning_rate": 9.90392640201615e-07, "logits/chosen": -0.4402211308479309, "logits/rejected": -0.454365074634552, "logps/chosen": -1.3939086198806763, "logps/rejected": -29.98224449157715, "loss": 0.6282, "rewards/accuracies": 1.0, "rewards/chosen": 0.452362060546875, "rewards/margins": 0.3344934582710266, "rewards/rejected": 0.11786861717700958, "step": 3117 }, { "epoch": 0.51, "learning_rate": 9.903669834384391e-07, "logits/chosen": -1.0268855094909668, "logits/rejected": -0.9579540491104126, "logps/chosen": -216.34716796875, "logps/rejected": -139.31309509277344, "loss": 1.6795, "rewards/accuracies": 0.0, "rewards/chosen": 2.4232728481292725, "rewards/margins": -2.110499620437622, "rewards/rejected": 4.5337724685668945, "step": 3118 }, { "epoch": 0.51, "learning_rate": 9.903412927955445e-07, "logits/chosen": -0.8377077579498291, "logits/rejected": -0.7999848127365112, "logps/chosen": -137.04100036621094, "logps/rejected": -219.11697387695312, "loss": 2.6735, "rewards/accuracies": 0.0, "rewards/chosen": 2.141862630844116, "rewards/margins": -4.2732744216918945, "rewards/rejected": 6.415136814117432, "step": 3119 }, { "epoch": 0.51, "learning_rate": 9.903155682747061e-07, "logits/chosen": -0.31587275862693787, "logits/rejected": -0.3262055814266205, "logps/chosen": -67.36614990234375, "logps/rejected": -61.25020217895508, "loss": 0.5799, "rewards/accuracies": 1.0, "rewards/chosen": 1.3832229375839233, "rewards/margins": 0.06724274158477783, "rewards/rejected": 1.3159801959991455, "step": 3120 }, { "epoch": 0.51, "learning_rate": 9.902898098777015e-07, "logits/chosen": -0.4263341724872589, "logits/rejected": -0.3467833995819092, "logps/chosen": -73.10612487792969, "logps/rejected": -67.67811584472656, "loss": 0.2808, "rewards/accuracies": 1.0, "rewards/chosen": 2.0033066272735596, "rewards/margins": 0.3661460876464844, "rewards/rejected": 1.6371605396270752, "step": 3121 }, { "epoch": 0.51, "learning_rate": 9.9026401760631e-07, "logits/chosen": -0.2287701666355133, "logits/rejected": -0.2287701666355133, "logps/chosen": -57.870941162109375, "logps/rejected": -57.870941162109375, "loss": 1.0296, "rewards/accuracies": 0.0, "rewards/chosen": 0.14149971306324005, "rewards/margins": 0.0, "rewards/rejected": 0.14149971306324005, "step": 3122 }, { "epoch": 0.51, "learning_rate": 9.902381914623142e-07, "logits/chosen": 0.006667356472462416, "logits/rejected": 0.0012856947723776102, "logps/chosen": -3.3732399940490723, "logps/rejected": -1.814875841140747, "loss": 0.5416, "rewards/accuracies": 0.0, "rewards/chosen": 0.18985258042812347, "rewards/margins": -0.08894120156764984, "rewards/rejected": 0.2787937819957733, "step": 3123 }, { "epoch": 0.51, "learning_rate": 9.902123314474977e-07, "logits/chosen": -0.535966694355011, "logits/rejected": -0.49661627411842346, "logps/chosen": -117.23731231689453, "logps/rejected": -141.66943359375, "loss": 3.0722, "rewards/accuracies": 0.0, "rewards/chosen": 0.791851818561554, "rewards/margins": -4.110083103179932, "rewards/rejected": 4.90193510055542, "step": 3124 }, { "epoch": 0.51, "learning_rate": 9.901864375636476e-07, "logits/chosen": -0.3820100426673889, "logits/rejected": -0.3743229806423187, "logps/chosen": -2.0314035415649414, "logps/rejected": -3.272144317626953, "loss": 0.6544, "rewards/accuracies": 0.0, "rewards/chosen": 0.3613024353981018, "rewards/margins": -0.1911216378211975, "rewards/rejected": 0.5524240732192993, "step": 3125 }, { "epoch": 0.51, "learning_rate": 9.901605098125526e-07, "logits/chosen": -0.17192445695400238, "logits/rejected": -0.2687627077102661, "logps/chosen": -91.07952117919922, "logps/rejected": -101.19853210449219, "loss": 1.4062, "rewards/accuracies": 0.0, "rewards/chosen": 0.13464049994945526, "rewards/margins": -2.6833696365356445, "rewards/rejected": 2.818010091781616, "step": 3126 }, { "epoch": 0.51, "learning_rate": 9.901345481960047e-07, "logits/chosen": -0.7314831018447876, "logits/rejected": -0.7693647742271423, "logps/chosen": -247.57691955566406, "logps/rejected": -79.74278259277344, "loss": 0.4226, "rewards/accuracies": 1.0, "rewards/chosen": 2.9164841175079346, "rewards/margins": 0.9936851263046265, "rewards/rejected": 1.922798991203308, "step": 3127 }, { "epoch": 0.51, "learning_rate": 9.90108552715797e-07, "logits/chosen": -0.8036279082298279, "logits/rejected": -0.7388154864311218, "logps/chosen": -115.70112609863281, "logps/rejected": -61.24108123779297, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 4.3525285720825195, "rewards/margins": 3.0527491569519043, "rewards/rejected": 1.2997795343399048, "step": 3128 }, { "epoch": 0.51, "learning_rate": 9.90082523373726e-07, "logits/chosen": -0.9553076028823853, "logits/rejected": -0.9524415135383606, "logps/chosen": -114.59295654296875, "logps/rejected": -107.45674896240234, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": 2.0569000244140625, "rewards/margins": 1.159887671470642, "rewards/rejected": 0.8970123529434204, "step": 3129 }, { "epoch": 0.51, "learning_rate": 9.900564601715897e-07, "logits/chosen": -0.3083263337612152, "logits/rejected": -0.08323580771684647, "logps/chosen": -83.65303039550781, "logps/rejected": -11.858699798583984, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 1.9247254133224487, "rewards/margins": 1.2437949180603027, "rewards/rejected": 0.6809305548667908, "step": 3130 }, { "epoch": 0.51, "learning_rate": 9.900303631111887e-07, "logits/chosen": -0.5566051006317139, "logits/rejected": -0.5566051006317139, "logps/chosen": -82.29512023925781, "logps/rejected": -82.29512023925781, "loss": 0.8188, "rewards/accuracies": 0.0, "rewards/chosen": 2.420428514480591, "rewards/margins": 0.0, "rewards/rejected": 2.420428514480591, "step": 3131 }, { "epoch": 0.51, "learning_rate": 9.900042321943267e-07, "logits/chosen": -0.8468790650367737, "logits/rejected": -0.7993499636650085, "logps/chosen": -91.79712677001953, "logps/rejected": -89.77085876464844, "loss": 0.4862, "rewards/accuracies": 1.0, "rewards/chosen": 0.3051033020019531, "rewards/margins": 0.13299179077148438, "rewards/rejected": 0.17211151123046875, "step": 3132 }, { "epoch": 0.51, "learning_rate": 9.899780674228086e-07, "logits/chosen": -0.6086291074752808, "logits/rejected": -0.41094282269477844, "logps/chosen": -159.10317993164062, "logps/rejected": -73.1553726196289, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": 4.931144714355469, "rewards/margins": 3.138167381286621, "rewards/rejected": 1.792977213859558, "step": 3133 }, { "epoch": 0.51, "learning_rate": 9.899518687984422e-07, "logits/chosen": -0.5629552602767944, "logits/rejected": -0.5629552602767944, "logps/chosen": -36.64613342285156, "logps/rejected": -36.64613342285156, "loss": 1.3488, "rewards/accuracies": 0.0, "rewards/chosen": 1.7179450988769531, "rewards/margins": 0.0, "rewards/rejected": 1.7179450988769531, "step": 3134 }, { "epoch": 0.51, "learning_rate": 9.899256363230377e-07, "logits/chosen": -0.8771803379058838, "logits/rejected": -0.8099738359451294, "logps/chosen": -113.3621826171875, "logps/rejected": -113.41004943847656, "loss": 1.0612, "rewards/accuracies": 0.0, "rewards/chosen": 0.011903381906449795, "rewards/margins": -1.8237273693084717, "rewards/rejected": 1.8356307744979858, "step": 3135 }, { "epoch": 0.51, "learning_rate": 9.898993699984075e-07, "logits/chosen": -0.3711150288581848, "logits/rejected": -0.278200626373291, "logps/chosen": -51.333717346191406, "logps/rejected": -14.812628746032715, "loss": 0.7628, "rewards/accuracies": 1.0, "rewards/chosen": 1.8967971801757812, "rewards/margins": 1.6054507493972778, "rewards/rejected": 0.2913464605808258, "step": 3136 }, { "epoch": 0.51, "learning_rate": 9.89873069826366e-07, "logits/chosen": -0.2689783573150635, "logits/rejected": -0.14069615304470062, "logps/chosen": -66.33243560791016, "logps/rejected": -67.01145935058594, "loss": 0.4702, "rewards/accuracies": 1.0, "rewards/chosen": 1.8458213806152344, "rewards/margins": 1.2715027332305908, "rewards/rejected": 0.5743187069892883, "step": 3137 }, { "epoch": 0.51, "learning_rate": 9.898467358087308e-07, "logits/chosen": -0.8326697945594788, "logits/rejected": -0.7853051424026489, "logps/chosen": -98.41920471191406, "logps/rejected": -83.06007385253906, "loss": 0.4864, "rewards/accuracies": 0.0, "rewards/chosen": 0.8511077761650085, "rewards/margins": -0.18644720315933228, "rewards/rejected": 1.0375549793243408, "step": 3138 }, { "epoch": 0.51, "learning_rate": 9.898203679473213e-07, "logits/chosen": 0.05093180760741234, "logits/rejected": 0.051123496145009995, "logps/chosen": -4.602631568908691, "logps/rejected": -12.273592948913574, "loss": 0.448, "rewards/accuracies": 1.0, "rewards/chosen": 0.18596945703029633, "rewards/margins": 0.35512715578079224, "rewards/rejected": -0.1691576987504959, "step": 3139 }, { "epoch": 0.51, "learning_rate": 9.89793966243959e-07, "logits/chosen": -0.5821565985679626, "logits/rejected": -0.48868826031684875, "logps/chosen": -66.5993423461914, "logps/rejected": -33.728782653808594, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": 2.289299726486206, "rewards/margins": 2.0300185680389404, "rewards/rejected": 0.2592811584472656, "step": 3140 }, { "epoch": 0.51, "learning_rate": 9.897675307004679e-07, "logits/chosen": -0.2903258204460144, "logits/rejected": -0.2903258204460144, "logps/chosen": -1.9211034774780273, "logps/rejected": -1.9211034774780273, "loss": 0.7133, "rewards/accuracies": 0.0, "rewards/chosen": 0.24875740706920624, "rewards/margins": 0.0, "rewards/rejected": 0.24875740706920624, "step": 3141 }, { "epoch": 0.51, "learning_rate": 9.897410613186748e-07, "logits/chosen": -0.0894625261425972, "logits/rejected": -0.08893099427223206, "logps/chosen": -3.2853736877441406, "logps/rejected": -1.5549647808074951, "loss": 1.5753, "rewards/accuracies": 0.0, "rewards/chosen": 0.03933222219347954, "rewards/margins": -0.3302454650402069, "rewards/rejected": 0.36957767605781555, "step": 3142 }, { "epoch": 0.51, "learning_rate": 9.897145581004085e-07, "logits/chosen": -0.5039379596710205, "logits/rejected": -0.49761709570884705, "logps/chosen": -137.2988739013672, "logps/rejected": -115.01631164550781, "loss": 0.5594, "rewards/accuracies": 0.0, "rewards/chosen": 2.8052918910980225, "rewards/margins": -0.7127425670623779, "rewards/rejected": 3.5180344581604004, "step": 3143 }, { "epoch": 0.51, "learning_rate": 9.896880210474996e-07, "logits/chosen": -0.3886784613132477, "logits/rejected": -0.5054831504821777, "logps/chosen": -89.01455688476562, "logps/rejected": -132.90689086914062, "loss": 1.22, "rewards/accuracies": 0.0, "rewards/chosen": 2.3365814685821533, "rewards/margins": -2.2589385509490967, "rewards/rejected": 4.59552001953125, "step": 3144 }, { "epoch": 0.51, "learning_rate": 9.896614501617822e-07, "logits/chosen": -0.8532106876373291, "logits/rejected": -0.5712807774543762, "logps/chosen": -127.69805908203125, "logps/rejected": -81.45683288574219, "loss": 0.1032, "rewards/accuracies": 1.0, "rewards/chosen": 4.340284824371338, "rewards/margins": 1.6834237575531006, "rewards/rejected": 2.6568610668182373, "step": 3145 }, { "epoch": 0.51, "learning_rate": 9.896348454450918e-07, "logits/chosen": -0.5677018165588379, "logits/rejected": -0.5574713945388794, "logps/chosen": -131.47564697265625, "logps/rejected": -145.75323486328125, "loss": 2.3686, "rewards/accuracies": 0.0, "rewards/chosen": 0.3795761168003082, "rewards/margins": -4.650193691253662, "rewards/rejected": 5.0297698974609375, "step": 3146 }, { "epoch": 0.51, "learning_rate": 9.896082068992664e-07, "logits/chosen": -0.6747379899024963, "logits/rejected": -0.6739933490753174, "logps/chosen": -71.85841369628906, "logps/rejected": -12.358233451843262, "loss": 0.6326, "rewards/accuracies": 1.0, "rewards/chosen": 0.6625480651855469, "rewards/margins": 0.06724327802658081, "rewards/rejected": 0.5953047871589661, "step": 3147 }, { "epoch": 0.51, "learning_rate": 9.895815345261467e-07, "logits/chosen": -0.6305682063102722, "logits/rejected": -0.5793203711509705, "logps/chosen": -61.89815139770508, "logps/rejected": -55.93978500366211, "loss": 0.48, "rewards/accuracies": 1.0, "rewards/chosen": 2.39288592338562, "rewards/margins": 0.2842109203338623, "rewards/rejected": 2.108675003051758, "step": 3148 }, { "epoch": 0.51, "learning_rate": 9.895548283275754e-07, "logits/chosen": -1.3069572448730469, "logits/rejected": -1.2567946910858154, "logps/chosen": -123.09689331054688, "logps/rejected": -103.78041076660156, "loss": 0.4013, "rewards/accuracies": 1.0, "rewards/chosen": 4.177694797515869, "rewards/margins": 2.349667549133301, "rewards/rejected": 1.828027367591858, "step": 3149 }, { "epoch": 0.51, "learning_rate": 9.895280883053976e-07, "logits/chosen": -0.988126277923584, "logits/rejected": -1.0110925436019897, "logps/chosen": -147.1043701171875, "logps/rejected": -186.82449340820312, "loss": 3.2025, "rewards/accuracies": 0.0, "rewards/chosen": 2.2259202003479004, "rewards/margins": -6.386048793792725, "rewards/rejected": 8.611968994140625, "step": 3150 }, { "epoch": 0.51, "learning_rate": 9.89501314461461e-07, "logits/chosen": -0.7760403156280518, "logits/rejected": -0.7556450366973877, "logps/chosen": -57.4193115234375, "logps/rejected": -139.6207733154297, "loss": 1.7011, "rewards/accuracies": 0.0, "rewards/chosen": 0.8398048281669617, "rewards/margins": -2.307990312576294, "rewards/rejected": 3.1477952003479004, "step": 3151 }, { "epoch": 0.51, "learning_rate": 9.894745067976152e-07, "logits/chosen": -0.8804354667663574, "logits/rejected": -0.8078016638755798, "logps/chosen": -105.62195587158203, "logps/rejected": -83.03214263916016, "loss": 1.0245, "rewards/accuracies": 0.0, "rewards/chosen": 0.5649299621582031, "rewards/margins": -1.3217620849609375, "rewards/rejected": 1.8866920471191406, "step": 3152 }, { "epoch": 0.51, "learning_rate": 9.894476653157125e-07, "logits/chosen": -0.667959988117218, "logits/rejected": -0.615104079246521, "logps/chosen": -119.10009765625, "logps/rejected": -116.66685485839844, "loss": 0.424, "rewards/accuracies": 1.0, "rewards/chosen": 2.7952346801757812, "rewards/margins": 0.15154409408569336, "rewards/rejected": 2.643690586090088, "step": 3153 }, { "epoch": 0.51, "learning_rate": 9.894207900176073e-07, "logits/chosen": -0.3083169162273407, "logits/rejected": -0.22745738923549652, "logps/chosen": -89.42860412597656, "logps/rejected": -57.213809967041016, "loss": 1.044, "rewards/accuracies": 0.0, "rewards/chosen": -0.04726257547736168, "rewards/margins": -1.4018818140029907, "rewards/rejected": 1.3546192646026611, "step": 3154 }, { "epoch": 0.51, "learning_rate": 9.893938809051564e-07, "logits/chosen": -0.2972991466522217, "logits/rejected": -0.3028661608695984, "logps/chosen": -9.237602233886719, "logps/rejected": -5.564952850341797, "loss": 0.5628, "rewards/accuracies": 0.0, "rewards/chosen": 0.09575500339269638, "rewards/margins": -0.1801632046699524, "rewards/rejected": 0.27591821551322937, "step": 3155 }, { "epoch": 0.51, "learning_rate": 9.89366937980219e-07, "logits/chosen": -0.6508758664131165, "logits/rejected": -0.5428649187088013, "logps/chosen": -184.79830932617188, "logps/rejected": -49.65996170043945, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 4.580389499664307, "rewards/margins": 3.380377769470215, "rewards/rejected": 1.2000118494033813, "step": 3156 }, { "epoch": 0.51, "learning_rate": 9.893399612446566e-07, "logits/chosen": -0.37960466742515564, "logits/rejected": -0.39467695355415344, "logps/chosen": -49.599082946777344, "logps/rejected": -66.32971954345703, "loss": 0.8039, "rewards/accuracies": 1.0, "rewards/chosen": 0.3388996124267578, "rewards/margins": 0.496634304523468, "rewards/rejected": -0.157734677195549, "step": 3157 }, { "epoch": 0.51, "learning_rate": 9.893129507003334e-07, "logits/chosen": -0.6032512784004211, "logits/rejected": -0.6507905125617981, "logps/chosen": -102.27652740478516, "logps/rejected": -71.93408203125, "loss": 1.6109, "rewards/accuracies": 0.0, "rewards/chosen": 1.1204025745391846, "rewards/margins": -0.4571791887283325, "rewards/rejected": 1.577581763267517, "step": 3158 }, { "epoch": 0.51, "learning_rate": 9.892859063491147e-07, "logits/chosen": -0.707699716091156, "logits/rejected": -0.6846294403076172, "logps/chosen": -70.48800659179688, "logps/rejected": -154.46339416503906, "loss": 1.9521, "rewards/accuracies": 0.0, "rewards/chosen": 0.9320755004882812, "rewards/margins": -3.6194000244140625, "rewards/rejected": 4.551475524902344, "step": 3159 }, { "epoch": 0.51, "learning_rate": 9.892588281928698e-07, "logits/chosen": -0.7475637197494507, "logits/rejected": -0.737585723400116, "logps/chosen": -89.72544860839844, "logps/rejected": -170.63742065429688, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 1.1165603399276733, "rewards/margins": 0.3029189705848694, "rewards/rejected": 0.813641369342804, "step": 3160 }, { "epoch": 0.51, "learning_rate": 9.892317162334694e-07, "logits/chosen": -0.6850019097328186, "logits/rejected": -0.6705484986305237, "logps/chosen": -120.0591049194336, "logps/rejected": -86.54885864257812, "loss": 0.7603, "rewards/accuracies": 0.0, "rewards/chosen": 1.0211433172225952, "rewards/margins": -1.1321052312850952, "rewards/rejected": 2.1532485485076904, "step": 3161 }, { "epoch": 0.51, "learning_rate": 9.892045704727863e-07, "logits/chosen": -0.5385881066322327, "logits/rejected": -0.5380269289016724, "logps/chosen": -6.809768199920654, "logps/rejected": -19.92412757873535, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 0.1597946733236313, "rewards/margins": 0.395367294549942, "rewards/rejected": -0.23557262122631073, "step": 3162 }, { "epoch": 0.51, "learning_rate": 9.891773909126963e-07, "logits/chosen": -0.3849990963935852, "logits/rejected": -0.3715430200099945, "logps/chosen": -74.85100555419922, "logps/rejected": -92.20390319824219, "loss": 0.8779, "rewards/accuracies": 0.0, "rewards/chosen": 1.837104082107544, "rewards/margins": -1.5487313270568848, "rewards/rejected": 3.3858354091644287, "step": 3163 }, { "epoch": 0.51, "learning_rate": 9.891501775550774e-07, "logits/chosen": -0.6535590887069702, "logits/rejected": -0.7120189070701599, "logps/chosen": -238.66392517089844, "logps/rejected": -95.60415649414062, "loss": 0.6695, "rewards/accuracies": 0.0, "rewards/chosen": 3.2774674892425537, "rewards/margins": -0.7796065807342529, "rewards/rejected": 4.057074069976807, "step": 3164 }, { "epoch": 0.51, "learning_rate": 9.891229304018097e-07, "logits/chosen": -0.4724816083908081, "logits/rejected": -0.5173720121383667, "logps/chosen": -94.36276245117188, "logps/rejected": -181.60252380371094, "loss": 0.8214, "rewards/accuracies": 0.0, "rewards/chosen": 1.052703857421875, "rewards/margins": -0.7902663946151733, "rewards/rejected": 1.8429702520370483, "step": 3165 }, { "epoch": 0.51, "learning_rate": 9.890956494547754e-07, "logits/chosen": -0.5761911273002625, "logits/rejected": -0.5341790914535522, "logps/chosen": -128.55052185058594, "logps/rejected": -51.27301025390625, "loss": 0.3846, "rewards/accuracies": 1.0, "rewards/chosen": 2.934718370437622, "rewards/margins": 0.9066834449768066, "rewards/rejected": 2.0280349254608154, "step": 3166 }, { "epoch": 0.51, "learning_rate": 9.890683347158596e-07, "logits/chosen": -0.33095091581344604, "logits/rejected": -0.33083611726760864, "logps/chosen": -25.161073684692383, "logps/rejected": -21.923301696777344, "loss": 0.8302, "rewards/accuracies": 0.0, "rewards/chosen": -0.29163646697998047, "rewards/margins": -0.7844894528388977, "rewards/rejected": 0.49285298585891724, "step": 3167 }, { "epoch": 0.51, "learning_rate": 9.890409861869495e-07, "logits/chosen": -0.5024011731147766, "logits/rejected": -0.4660196304321289, "logps/chosen": -79.40815734863281, "logps/rejected": -65.97674560546875, "loss": 0.5807, "rewards/accuracies": 1.0, "rewards/chosen": 2.2843093872070312, "rewards/margins": 0.5492942333221436, "rewards/rejected": 1.7350151538848877, "step": 3168 }, { "epoch": 0.51, "learning_rate": 9.89013603869935e-07, "logits/chosen": -0.8084933757781982, "logits/rejected": -0.7036662697792053, "logps/chosen": -45.62877655029297, "logps/rejected": -25.057586669921875, "loss": 0.196, "rewards/accuracies": 1.0, "rewards/chosen": 1.3442124128341675, "rewards/margins": 1.4359123706817627, "rewards/rejected": -0.091699980199337, "step": 3169 }, { "epoch": 0.51, "learning_rate": 9.889861877667069e-07, "logits/chosen": -0.5659952759742737, "logits/rejected": -0.5304456949234009, "logps/chosen": -104.59902954101562, "logps/rejected": -83.2874984741211, "loss": 0.4501, "rewards/accuracies": 0.0, "rewards/chosen": 2.0328080654144287, "rewards/margins": -0.1608259677886963, "rewards/rejected": 2.193634033203125, "step": 3170 }, { "epoch": 0.51, "learning_rate": 9.889587378791604e-07, "logits/chosen": -0.8195487260818481, "logits/rejected": -0.6048615574836731, "logps/chosen": -49.967140197753906, "logps/rejected": -83.96237182617188, "loss": 0.8812, "rewards/accuracies": 0.0, "rewards/chosen": 2.433339834213257, "rewards/margins": -0.7739737033843994, "rewards/rejected": 3.2073135375976562, "step": 3171 }, { "epoch": 0.51, "learning_rate": 9.889312542091916e-07, "logits/chosen": -0.5137369632720947, "logits/rejected": -0.5327484011650085, "logps/chosen": -11.578170776367188, "logps/rejected": -3.057452440261841, "loss": 0.4997, "rewards/accuracies": 0.0, "rewards/chosen": 0.22909851372241974, "rewards/margins": -0.20254363119602203, "rewards/rejected": 0.4316421449184418, "step": 3172 }, { "epoch": 0.52, "learning_rate": 9.889037367586997e-07, "logits/chosen": -0.7685391902923584, "logits/rejected": -0.7896968722343445, "logps/chosen": -104.01139831542969, "logps/rejected": -67.00386047363281, "loss": 0.8774, "rewards/accuracies": 0.0, "rewards/chosen": 0.4228103756904602, "rewards/margins": -0.15988314151763916, "rewards/rejected": 0.5826935172080994, "step": 3173 }, { "epoch": 0.52, "learning_rate": 9.888761855295852e-07, "logits/chosen": -0.49073663353919983, "logits/rejected": -0.47630971670150757, "logps/chosen": -74.50564575195312, "logps/rejected": -118.8852310180664, "loss": 2.0858, "rewards/accuracies": 0.0, "rewards/chosen": 1.0645004510879517, "rewards/margins": -4.144184112548828, "rewards/rejected": 5.20868444442749, "step": 3174 }, { "epoch": 0.52, "learning_rate": 9.888486005237523e-07, "logits/chosen": -0.41397911310195923, "logits/rejected": -0.3567827343940735, "logps/chosen": -58.99794006347656, "logps/rejected": -60.40840148925781, "loss": 0.4288, "rewards/accuracies": 1.0, "rewards/chosen": 1.6672486066818237, "rewards/margins": 0.0547943115234375, "rewards/rejected": 1.6124542951583862, "step": 3175 }, { "epoch": 0.52, "learning_rate": 9.888209817431068e-07, "logits/chosen": -0.11132347583770752, "logits/rejected": -0.1165294200181961, "logps/chosen": -2.9338459968566895, "logps/rejected": -17.40150260925293, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.2516293227672577, "rewards/margins": 0.06364390254020691, "rewards/rejected": 0.18798542022705078, "step": 3176 }, { "epoch": 0.52, "learning_rate": 9.887933291895565e-07, "logits/chosen": -0.9853588938713074, "logits/rejected": -0.8492376208305359, "logps/chosen": -99.77377319335938, "logps/rejected": -94.27275085449219, "loss": 0.3772, "rewards/accuracies": 1.0, "rewards/chosen": 4.086468696594238, "rewards/margins": 1.40342116355896, "rewards/rejected": 2.6830475330352783, "step": 3177 }, { "epoch": 0.52, "learning_rate": 9.887656428650121e-07, "logits/chosen": -0.8464022278785706, "logits/rejected": -0.7511299848556519, "logps/chosen": -136.91937255859375, "logps/rejected": -20.180160522460938, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 3.3483474254608154, "rewards/margins": 2.719456911087036, "rewards/rejected": 0.6288904547691345, "step": 3178 }, { "epoch": 0.52, "learning_rate": 9.887379227713868e-07, "logits/chosen": -0.8309953808784485, "logits/rejected": -0.7823314666748047, "logps/chosen": -48.550682067871094, "logps/rejected": -90.91056060791016, "loss": 0.4794, "rewards/accuracies": 0.0, "rewards/chosen": 1.452612280845642, "rewards/margins": -0.42151570320129395, "rewards/rejected": 1.874127984046936, "step": 3179 }, { "epoch": 0.52, "learning_rate": 9.887101689105953e-07, "logits/chosen": -0.5938155651092529, "logits/rejected": -0.5701199769973755, "logps/chosen": -98.08659362792969, "logps/rejected": -55.176387786865234, "loss": 1.0013, "rewards/accuracies": 0.0, "rewards/chosen": 0.8235244750976562, "rewards/margins": -0.8006794452667236, "rewards/rejected": 1.6242039203643799, "step": 3180 }, { "epoch": 0.52, "learning_rate": 9.886823812845555e-07, "logits/chosen": -0.9701393842697144, "logits/rejected": -0.8846325874328613, "logps/chosen": -123.17494201660156, "logps/rejected": -64.12835693359375, "loss": 0.477, "rewards/accuracies": 1.0, "rewards/chosen": 5.0509934425354, "rewards/margins": 3.258460283279419, "rewards/rejected": 1.7925331592559814, "step": 3181 }, { "epoch": 0.52, "learning_rate": 9.88654559895187e-07, "logits/chosen": -0.6721072793006897, "logits/rejected": -0.6028297543525696, "logps/chosen": -242.21734619140625, "logps/rejected": -113.70303344726562, "loss": 1.1973, "rewards/accuracies": 0.0, "rewards/chosen": 2.827139377593994, "rewards/margins": -1.6478166580200195, "rewards/rejected": 4.474956035614014, "step": 3182 }, { "epoch": 0.52, "learning_rate": 9.886267047444122e-07, "logits/chosen": -0.8071768879890442, "logits/rejected": -0.7702550888061523, "logps/chosen": -49.48186492919922, "logps/rejected": -114.39105224609375, "loss": 0.343, "rewards/accuracies": 1.0, "rewards/chosen": 1.9812119007110596, "rewards/margins": 0.05268561840057373, "rewards/rejected": 1.9285262823104858, "step": 3183 }, { "epoch": 0.52, "learning_rate": 9.885988158341553e-07, "logits/chosen": -0.6159024834632874, "logits/rejected": -0.44738420844078064, "logps/chosen": -152.16294860839844, "logps/rejected": -135.53306579589844, "loss": 1.2698, "rewards/accuracies": 0.0, "rewards/chosen": 3.0858705043792725, "rewards/margins": -1.7846543788909912, "rewards/rejected": 4.870524883270264, "step": 3184 }, { "epoch": 0.52, "learning_rate": 9.885708931663436e-07, "logits/chosen": -0.3105472922325134, "logits/rejected": -0.3105472922325134, "logps/chosen": -4.067188739776611, "logps/rejected": -4.067188739776611, "loss": 0.7937, "rewards/accuracies": 0.0, "rewards/chosen": 0.30065783858299255, "rewards/margins": 0.0, "rewards/rejected": 0.30065783858299255, "step": 3185 }, { "epoch": 0.52, "learning_rate": 9.885429367429062e-07, "logits/chosen": -0.5271167755126953, "logits/rejected": -0.4831357002258301, "logps/chosen": -63.742469787597656, "logps/rejected": -63.39392852783203, "loss": 0.5526, "rewards/accuracies": 0.0, "rewards/chosen": 1.234534502029419, "rewards/margins": -0.17773890495300293, "rewards/rejected": 1.4122734069824219, "step": 3186 }, { "epoch": 0.52, "learning_rate": 9.885149465657741e-07, "logits/chosen": -0.4386323094367981, "logits/rejected": -0.40716129541397095, "logps/chosen": -65.1926040649414, "logps/rejected": -103.97319030761719, "loss": 0.8251, "rewards/accuracies": 0.0, "rewards/chosen": 0.8989036679267883, "rewards/margins": -0.90732342004776, "rewards/rejected": 1.8062270879745483, "step": 3187 }, { "epoch": 0.52, "learning_rate": 9.884869226368819e-07, "logits/chosen": -0.7400597333908081, "logits/rejected": -0.6802715063095093, "logps/chosen": -75.78944396972656, "logps/rejected": -17.30245018005371, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 0.9135482907295227, "rewards/margins": 0.5922390222549438, "rewards/rejected": 0.32130929827690125, "step": 3188 }, { "epoch": 0.52, "learning_rate": 9.884588649581654e-07, "logits/chosen": -0.6768020391464233, "logits/rejected": -0.6145744323730469, "logps/chosen": -86.15724182128906, "logps/rejected": -51.97971725463867, "loss": 0.0705, "rewards/accuracies": 1.0, "rewards/chosen": 2.716616153717041, "rewards/margins": 2.289558172225952, "rewards/rejected": 0.42705804109573364, "step": 3189 }, { "epoch": 0.52, "learning_rate": 9.884307735315632e-07, "logits/chosen": -0.39997124671936035, "logits/rejected": -0.3430781066417694, "logps/chosen": -46.389583587646484, "logps/rejected": -91.98095703125, "loss": 1.0215, "rewards/accuracies": 0.0, "rewards/chosen": 1.8344624042510986, "rewards/margins": -0.26851534843444824, "rewards/rejected": 2.102977752685547, "step": 3190 }, { "epoch": 0.52, "learning_rate": 9.88402648359016e-07, "logits/chosen": -0.5919556617736816, "logits/rejected": -0.5831751227378845, "logps/chosen": -65.1581039428711, "logps/rejected": -73.39915466308594, "loss": 0.6271, "rewards/accuracies": 0.0, "rewards/chosen": 1.2546623945236206, "rewards/margins": -0.8948303461074829, "rewards/rejected": 2.1494927406311035, "step": 3191 }, { "epoch": 0.52, "learning_rate": 9.88374489442467e-07, "logits/chosen": -0.6581343412399292, "logits/rejected": -0.5790500044822693, "logps/chosen": -98.49066162109375, "logps/rejected": -71.81713104248047, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 3.4620513916015625, "rewards/margins": 1.930548071861267, "rewards/rejected": 1.5315033197402954, "step": 3192 }, { "epoch": 0.52, "learning_rate": 9.88346296783862e-07, "logits/chosen": -0.4080219566822052, "logits/rejected": -0.5751200318336487, "logps/chosen": -33.869956970214844, "logps/rejected": -40.07408905029297, "loss": 2.4286, "rewards/accuracies": 0.0, "rewards/chosen": 1.5036338567733765, "rewards/margins": -0.9458924531936646, "rewards/rejected": 2.449526309967041, "step": 3193 }, { "epoch": 0.52, "learning_rate": 9.883180703851487e-07, "logits/chosen": -0.918884813785553, "logits/rejected": -0.8346569538116455, "logps/chosen": -142.73040771484375, "logps/rejected": -111.85318756103516, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 7.019557476043701, "rewards/margins": 4.520920753479004, "rewards/rejected": 2.4986367225646973, "step": 3194 }, { "epoch": 0.52, "learning_rate": 9.882898102482772e-07, "logits/chosen": -0.4436381459236145, "logits/rejected": -0.35426783561706543, "logps/chosen": -107.23531341552734, "logps/rejected": -94.81519317626953, "loss": 0.2835, "rewards/accuracies": 1.0, "rewards/chosen": 4.78345251083374, "rewards/margins": 1.0073041915893555, "rewards/rejected": 3.7761483192443848, "step": 3195 }, { "epoch": 0.52, "learning_rate": 9.882615163751999e-07, "logits/chosen": -0.5349027514457703, "logits/rejected": -0.533268928527832, "logps/chosen": -88.26661682128906, "logps/rejected": -134.57809448242188, "loss": 0.8528, "rewards/accuracies": 1.0, "rewards/chosen": 0.20151519775390625, "rewards/margins": 1.1461701393127441, "rewards/rejected": -0.9446548819541931, "step": 3196 }, { "epoch": 0.52, "learning_rate": 9.88233188767872e-07, "logits/chosen": -0.5148573517799377, "logits/rejected": -0.4190814197063446, "logps/chosen": -95.56571197509766, "logps/rejected": -35.748016357421875, "loss": 0.0273, "rewards/accuracies": 1.0, "rewards/chosen": 3.939441680908203, "rewards/margins": 3.2527663707733154, "rewards/rejected": 0.6866752505302429, "step": 3197 }, { "epoch": 0.52, "learning_rate": 9.882048274282505e-07, "logits/chosen": -0.4920014441013336, "logits/rejected": -0.5069195628166199, "logps/chosen": -46.485713958740234, "logps/rejected": -55.614158630371094, "loss": 0.7379, "rewards/accuracies": 1.0, "rewards/chosen": 0.750863254070282, "rewards/margins": 0.19802814722061157, "rewards/rejected": 0.5528351068496704, "step": 3198 }, { "epoch": 0.52, "learning_rate": 9.881764323582947e-07, "logits/chosen": -0.1118585392832756, "logits/rejected": -0.1152561828494072, "logps/chosen": -3.669320583343506, "logps/rejected": -1.993849515914917, "loss": 0.4203, "rewards/accuracies": 0.0, "rewards/chosen": -0.02309732511639595, "rewards/margins": -0.1001424789428711, "rewards/rejected": 0.07704515755176544, "step": 3199 }, { "epoch": 0.52, "learning_rate": 9.881480035599666e-07, "logits/chosen": -1.0665385723114014, "logits/rejected": -0.9876596927642822, "logps/chosen": -83.87869262695312, "logps/rejected": -34.9337158203125, "loss": 1.1339, "rewards/accuracies": 0.0, "rewards/chosen": 0.4767822325229645, "rewards/margins": -0.18558958172798157, "rewards/rejected": 0.662371814250946, "step": 3200 }, { "epoch": 0.52, "learning_rate": 9.881195410352304e-07, "logits/chosen": -0.33215874433517456, "logits/rejected": -0.2910401225090027, "logps/chosen": -58.90872573852539, "logps/rejected": -21.745325088500977, "loss": 0.3676, "rewards/accuracies": 0.0, "rewards/chosen": 0.19453774392604828, "rewards/margins": -0.07311783730983734, "rewards/rejected": 0.2676555812358856, "step": 3201 }, { "epoch": 0.52, "learning_rate": 9.880910447860527e-07, "logits/chosen": -0.8461915254592896, "logits/rejected": -0.8272604942321777, "logps/chosen": -79.9474105834961, "logps/rejected": -96.18215942382812, "loss": 1.4208, "rewards/accuracies": 0.0, "rewards/chosen": 0.33775100111961365, "rewards/margins": -2.341115713119507, "rewards/rejected": 2.6788666248321533, "step": 3202 }, { "epoch": 0.52, "learning_rate": 9.88062514814402e-07, "logits/chosen": -0.6579921841621399, "logits/rejected": -0.6307887434959412, "logps/chosen": -62.936363220214844, "logps/rejected": -55.18544006347656, "loss": 0.8352, "rewards/accuracies": 0.0, "rewards/chosen": 0.8366249203681946, "rewards/margins": -0.2214149832725525, "rewards/rejected": 1.058039903640747, "step": 3203 }, { "epoch": 0.52, "learning_rate": 9.880339511222494e-07, "logits/chosen": -0.9659005999565125, "logits/rejected": -0.9793957471847534, "logps/chosen": -163.18118286132812, "logps/rejected": -49.17057800292969, "loss": 0.7754, "rewards/accuracies": 0.0, "rewards/chosen": 0.292001336812973, "rewards/margins": -0.3057583272457123, "rewards/rejected": 0.5977596640586853, "step": 3204 }, { "epoch": 0.52, "learning_rate": 9.880053537115688e-07, "logits/chosen": -0.7201816439628601, "logits/rejected": -0.6134873032569885, "logps/chosen": -48.34684753417969, "logps/rejected": -88.3191909790039, "loss": 1.047, "rewards/accuracies": 0.0, "rewards/chosen": 1.8220452070236206, "rewards/margins": -1.1178497076034546, "rewards/rejected": 2.939894914627075, "step": 3205 }, { "epoch": 0.52, "learning_rate": 9.87976722584336e-07, "logits/chosen": -0.28633710741996765, "logits/rejected": -0.3039138913154602, "logps/chosen": -61.463722229003906, "logps/rejected": -121.58187866210938, "loss": 0.268, "rewards/accuracies": 1.0, "rewards/chosen": 1.0538772344589233, "rewards/margins": 1.1710975170135498, "rewards/rejected": -0.11722030490636826, "step": 3206 }, { "epoch": 0.52, "learning_rate": 9.879480577425288e-07, "logits/chosen": -0.611770749092102, "logits/rejected": -0.5726118683815002, "logps/chosen": -67.44866943359375, "logps/rejected": -142.3453826904297, "loss": 0.7934, "rewards/accuracies": 1.0, "rewards/chosen": 1.1826263666152954, "rewards/margins": 0.8518539667129517, "rewards/rejected": 0.33077239990234375, "step": 3207 }, { "epoch": 0.52, "learning_rate": 9.879193591881278e-07, "logits/chosen": -0.393669992685318, "logits/rejected": -0.39671820402145386, "logps/chosen": -109.84357452392578, "logps/rejected": -91.39066314697266, "loss": 0.7605, "rewards/accuracies": 1.0, "rewards/chosen": 3.1589150428771973, "rewards/margins": 0.41953587532043457, "rewards/rejected": 2.7393791675567627, "step": 3208 }, { "epoch": 0.52, "learning_rate": 9.878906269231158e-07, "logits/chosen": -0.3776518404483795, "logits/rejected": -0.28598126769065857, "logps/chosen": -38.63129425048828, "logps/rejected": -45.94474792480469, "loss": 0.2973, "rewards/accuracies": 1.0, "rewards/chosen": 0.8344330191612244, "rewards/margins": 0.2960083484649658, "rewards/rejected": 0.5384246706962585, "step": 3209 }, { "epoch": 0.52, "learning_rate": 9.87861860949478e-07, "logits/chosen": -0.678544819355011, "logits/rejected": -0.6235386729240417, "logps/chosen": -86.0218276977539, "logps/rejected": -42.130611419677734, "loss": 0.2628, "rewards/accuracies": 1.0, "rewards/chosen": 1.5145012140274048, "rewards/margins": 1.3247989416122437, "rewards/rejected": 0.18970222771167755, "step": 3210 }, { "epoch": 0.52, "learning_rate": 9.878330612692017e-07, "logits/chosen": -0.7915661334991455, "logits/rejected": -0.8060092329978943, "logps/chosen": -118.37440490722656, "logps/rejected": -66.2214126586914, "loss": 1.2103, "rewards/accuracies": 0.0, "rewards/chosen": 0.7185356020927429, "rewards/margins": -1.3377037048339844, "rewards/rejected": 2.056239366531372, "step": 3211 }, { "epoch": 0.52, "learning_rate": 9.878042278842769e-07, "logits/chosen": -0.361816942691803, "logits/rejected": -0.29003965854644775, "logps/chosen": -90.84854125976562, "logps/rejected": -43.63010025024414, "loss": 0.1979, "rewards/accuracies": 1.0, "rewards/chosen": 2.183070421218872, "rewards/margins": 1.2616455554962158, "rewards/rejected": 0.9214248657226562, "step": 3212 }, { "epoch": 0.52, "learning_rate": 9.877753607966953e-07, "logits/chosen": -0.6499428749084473, "logits/rejected": -0.6044941544532776, "logps/chosen": -194.88156127929688, "logps/rejected": -50.492820739746094, "loss": 1.3953, "rewards/accuracies": 1.0, "rewards/chosen": 2.775290012359619, "rewards/margins": 1.1956696510314941, "rewards/rejected": 1.579620361328125, "step": 3213 }, { "epoch": 0.52, "learning_rate": 9.877464600084522e-07, "logits/chosen": -0.7179867029190063, "logits/rejected": -0.7149091958999634, "logps/chosen": -177.07562255859375, "logps/rejected": -68.63876342773438, "loss": 1.1789, "rewards/accuracies": 0.0, "rewards/chosen": 1.4928025007247925, "rewards/margins": -0.901199221611023, "rewards/rejected": 2.3940017223358154, "step": 3214 }, { "epoch": 0.52, "learning_rate": 9.877175255215434e-07, "logits/chosen": -0.47689011693000793, "logits/rejected": -0.4344939589500427, "logps/chosen": -92.7329330444336, "logps/rejected": -38.510746002197266, "loss": 0.7601, "rewards/accuracies": 1.0, "rewards/chosen": 1.1718841791152954, "rewards/margins": 0.8228862881660461, "rewards/rejected": 0.34899789094924927, "step": 3215 }, { "epoch": 0.52, "learning_rate": 9.876885573379685e-07, "logits/chosen": -0.3864230513572693, "logits/rejected": -0.279879093170166, "logps/chosen": -110.93215942382812, "logps/rejected": -59.69258117675781, "loss": 1.2977, "rewards/accuracies": 0.0, "rewards/chosen": 0.8370025753974915, "rewards/margins": -0.09910887479782104, "rewards/rejected": 0.9361114501953125, "step": 3216 }, { "epoch": 0.52, "learning_rate": 9.876595554597288e-07, "logits/chosen": -0.23979228734970093, "logits/rejected": -0.23979228734970093, "logps/chosen": -38.467552185058594, "logps/rejected": -38.467552185058594, "loss": 1.0133, "rewards/accuracies": 0.0, "rewards/chosen": 0.06672515720129013, "rewards/margins": 0.0, "rewards/rejected": 0.06672515720129013, "step": 3217 }, { "epoch": 0.52, "learning_rate": 9.876305198888282e-07, "logits/chosen": -0.566304624080658, "logits/rejected": -0.4815223515033722, "logps/chosen": -125.60578155517578, "logps/rejected": -76.09516906738281, "loss": 0.9812, "rewards/accuracies": 0.0, "rewards/chosen": 0.4735267758369446, "rewards/margins": -1.7276146411895752, "rewards/rejected": 2.201141357421875, "step": 3218 }, { "epoch": 0.52, "learning_rate": 9.876014506272726e-07, "logits/chosen": -0.8275668025016785, "logits/rejected": -0.7956293821334839, "logps/chosen": -79.2146987915039, "logps/rejected": -59.094482421875, "loss": 0.7045, "rewards/accuracies": 0.0, "rewards/chosen": 0.6409057974815369, "rewards/margins": -0.9863571524620056, "rewards/rejected": 1.6272629499435425, "step": 3219 }, { "epoch": 0.52, "learning_rate": 9.875723476770705e-07, "logits/chosen": -0.5041266679763794, "logits/rejected": -0.5219585299491882, "logps/chosen": -69.3675537109375, "logps/rejected": -143.6370086669922, "loss": 0.3736, "rewards/accuracies": 1.0, "rewards/chosen": 0.6916404962539673, "rewards/margins": 0.4680442810058594, "rewards/rejected": 0.22359620034694672, "step": 3220 }, { "epoch": 0.52, "learning_rate": 9.875432110402326e-07, "logits/chosen": -0.040009114891290665, "logits/rejected": -0.05935453623533249, "logps/chosen": -4.742823600769043, "logps/rejected": -75.44137573242188, "loss": 0.8155, "rewards/accuracies": 1.0, "rewards/chosen": 0.4789867401123047, "rewards/margins": 0.2013065218925476, "rewards/rejected": 0.2776802182197571, "step": 3221 }, { "epoch": 0.52, "learning_rate": 9.87514040718772e-07, "logits/chosen": -0.5754224061965942, "logits/rejected": -0.5938132405281067, "logps/chosen": -20.87071990966797, "logps/rejected": -62.15571594238281, "loss": 0.4982, "rewards/accuracies": 0.0, "rewards/chosen": 0.5541236996650696, "rewards/margins": -0.0515289306640625, "rewards/rejected": 0.6056526303291321, "step": 3222 }, { "epoch": 0.52, "learning_rate": 9.874848367147043e-07, "logits/chosen": -0.6239839792251587, "logits/rejected": -0.5177066326141357, "logps/chosen": -105.6053237915039, "logps/rejected": -56.836483001708984, "loss": 0.913, "rewards/accuracies": 1.0, "rewards/chosen": 1.3687553405761719, "rewards/margins": 0.14081454277038574, "rewards/rejected": 1.2279407978057861, "step": 3223 }, { "epoch": 0.52, "learning_rate": 9.874555990300469e-07, "logits/chosen": -0.5629388689994812, "logits/rejected": -0.5073824524879456, "logps/chosen": -227.4267578125, "logps/rejected": -108.6236572265625, "loss": 0.5009, "rewards/accuracies": 1.0, "rewards/chosen": 2.607836961746216, "rewards/margins": 1.084980845451355, "rewards/rejected": 1.5228561162948608, "step": 3224 }, { "epoch": 0.52, "learning_rate": 9.874263276668199e-07, "logits/chosen": -0.5858724117279053, "logits/rejected": -0.585634708404541, "logps/chosen": -93.48028564453125, "logps/rejected": -115.0784912109375, "loss": 1.4729, "rewards/accuracies": 1.0, "rewards/chosen": 0.7844276428222656, "rewards/margins": 0.1173248291015625, "rewards/rejected": 0.6671028137207031, "step": 3225 }, { "epoch": 0.52, "learning_rate": 9.873970226270456e-07, "logits/chosen": -0.2316664606332779, "logits/rejected": -0.24605382978916168, "logps/chosen": -20.127695083618164, "logps/rejected": -38.275970458984375, "loss": 0.6023, "rewards/accuracies": 0.0, "rewards/chosen": 0.1929437667131424, "rewards/margins": -0.06429843604564667, "rewards/rejected": 0.25724220275878906, "step": 3226 }, { "epoch": 0.52, "learning_rate": 9.87367683912749e-07, "logits/chosen": -0.658527672290802, "logits/rejected": -0.6714731454849243, "logps/chosen": -99.69721984863281, "logps/rejected": -61.95804977416992, "loss": 0.8488, "rewards/accuracies": 0.0, "rewards/chosen": 0.655744194984436, "rewards/margins": -1.1350085735321045, "rewards/rejected": 1.7907527685165405, "step": 3227 }, { "epoch": 0.52, "learning_rate": 9.873383115259569e-07, "logits/chosen": -0.7214078903198242, "logits/rejected": -0.6386032700538635, "logps/chosen": -109.20381164550781, "logps/rejected": -34.2733154296875, "loss": 0.1457, "rewards/accuracies": 1.0, "rewards/chosen": 1.5154014825820923, "rewards/margins": 1.132574439048767, "rewards/rejected": 0.3828270137310028, "step": 3228 }, { "epoch": 0.52, "learning_rate": 9.873089054686988e-07, "logits/chosen": -0.8013524413108826, "logits/rejected": -0.7351393699645996, "logps/chosen": -123.22777557373047, "logps/rejected": -41.560062408447266, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 1.1935539245605469, "rewards/margins": 1.0030590295791626, "rewards/rejected": 0.19049492478370667, "step": 3229 }, { "epoch": 0.52, "learning_rate": 9.87279465743006e-07, "logits/chosen": -0.4241482615470886, "logits/rejected": -0.4009231925010681, "logps/chosen": -29.701414108276367, "logps/rejected": -26.414188385009766, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 0.2595636546611786, "rewards/margins": 0.07053014636039734, "rewards/rejected": 0.18903350830078125, "step": 3230 }, { "epoch": 0.52, "learning_rate": 9.872499923509132e-07, "logits/chosen": -0.6647679805755615, "logits/rejected": -0.5758509039878845, "logps/chosen": -236.40371704101562, "logps/rejected": -88.70785522460938, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": 3.760449171066284, "rewards/margins": 1.0278983116149902, "rewards/rejected": 2.732550859451294, "step": 3231 }, { "epoch": 0.52, "learning_rate": 9.872204852944561e-07, "logits/chosen": -0.6145241260528564, "logits/rejected": -0.5356396436691284, "logps/chosen": -118.94514465332031, "logps/rejected": -52.776893615722656, "loss": 0.7664, "rewards/accuracies": 0.0, "rewards/chosen": -0.3767242431640625, "rewards/margins": -0.26218336820602417, "rewards/rejected": -0.11454086750745773, "step": 3232 }, { "epoch": 0.52, "learning_rate": 9.871909445756735e-07, "logits/chosen": -0.4706791937351227, "logits/rejected": -0.46483272314071655, "logps/chosen": -3.7155232429504395, "logps/rejected": -10.783064842224121, "loss": 0.3599, "rewards/accuracies": 0.0, "rewards/chosen": 0.02012944221496582, "rewards/margins": -0.008814288303256035, "rewards/rejected": 0.028943730518221855, "step": 3233 }, { "epoch": 0.52, "learning_rate": 9.871613701966066e-07, "logits/chosen": -0.5092735290527344, "logits/rejected": -0.5374007821083069, "logps/chosen": -134.48524475097656, "logps/rejected": -94.88499450683594, "loss": 0.8644, "rewards/accuracies": 0.0, "rewards/chosen": 0.5257980227470398, "rewards/margins": -1.0613198280334473, "rewards/rejected": 1.5871177911758423, "step": 3234 }, { "epoch": 0.53, "learning_rate": 9.871317621592986e-07, "logits/chosen": -0.5091466307640076, "logits/rejected": -0.45930054783821106, "logps/chosen": -101.31695556640625, "logps/rejected": -76.61614227294922, "loss": 0.6855, "rewards/accuracies": 0.0, "rewards/chosen": 0.974578857421875, "rewards/margins": -1.0650444030761719, "rewards/rejected": 2.039623260498047, "step": 3235 }, { "epoch": 0.53, "learning_rate": 9.871021204657952e-07, "logits/chosen": -0.3165260851383209, "logits/rejected": -0.33571290969848633, "logps/chosen": -48.37760925292969, "logps/rejected": -71.72635650634766, "loss": 1.2383, "rewards/accuracies": 0.0, "rewards/chosen": 1.985582709312439, "rewards/margins": -1.2840501070022583, "rewards/rejected": 3.2696328163146973, "step": 3236 }, { "epoch": 0.53, "learning_rate": 9.87072445118144e-07, "logits/chosen": -0.8241614699363708, "logits/rejected": -0.6782558560371399, "logps/chosen": -96.76773071289062, "logps/rejected": -21.32853889465332, "loss": 0.0548, "rewards/accuracies": 1.0, "rewards/chosen": 5.602206707000732, "rewards/margins": 5.342224597930908, "rewards/rejected": 0.2599819302558899, "step": 3237 }, { "epoch": 0.53, "learning_rate": 9.870427361183958e-07, "logits/chosen": -0.41062554717063904, "logits/rejected": -0.21330270171165466, "logps/chosen": -111.93989562988281, "logps/rejected": -56.135189056396484, "loss": 0.1398, "rewards/accuracies": 1.0, "rewards/chosen": 4.205580234527588, "rewards/margins": 2.3466732501983643, "rewards/rejected": 1.8589069843292236, "step": 3238 }, { "epoch": 0.53, "learning_rate": 9.870129934686028e-07, "logits/chosen": -0.42055439949035645, "logits/rejected": -0.4773891270160675, "logps/chosen": -117.53224182128906, "logps/rejected": -59.640533447265625, "loss": 0.9243, "rewards/accuracies": 1.0, "rewards/chosen": 5.091056823730469, "rewards/margins": 3.197627067565918, "rewards/rejected": 1.8934296369552612, "step": 3239 }, { "epoch": 0.53, "learning_rate": 9.869832171708203e-07, "logits/chosen": -0.6238434910774231, "logits/rejected": -0.6547348499298096, "logps/chosen": -130.22219848632812, "logps/rejected": -104.70661926269531, "loss": 0.7859, "rewards/accuracies": 0.0, "rewards/chosen": 1.224029541015625, "rewards/margins": -0.9109885692596436, "rewards/rejected": 2.1350181102752686, "step": 3240 }, { "epoch": 0.53, "learning_rate": 9.869534072271053e-07, "logits/chosen": -0.39068982005119324, "logits/rejected": -0.4252980947494507, "logps/chosen": -71.16194152832031, "logps/rejected": -80.22113800048828, "loss": 0.7077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9018279910087585, "rewards/margins": 0.45080792903900146, "rewards/rejected": 0.4510200619697571, "step": 3241 }, { "epoch": 0.53, "learning_rate": 9.869235636395176e-07, "logits/chosen": -0.696109414100647, "logits/rejected": -0.65793776512146, "logps/chosen": -60.03968048095703, "logps/rejected": -24.495254516601562, "loss": 1.3166, "rewards/accuracies": 1.0, "rewards/chosen": 1.7669228315353394, "rewards/margins": 1.058591365814209, "rewards/rejected": 0.7083315253257751, "step": 3242 }, { "epoch": 0.53, "learning_rate": 9.868936864101187e-07, "logits/chosen": -0.7135942578315735, "logits/rejected": -0.6927798390388489, "logps/chosen": -67.39306640625, "logps/rejected": -23.2602596282959, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 1.6661789417266846, "rewards/margins": 1.6627426147460938, "rewards/rejected": 0.0034362792503088713, "step": 3243 }, { "epoch": 0.53, "learning_rate": 9.868637755409733e-07, "logits/chosen": -0.4496474266052246, "logits/rejected": -0.3830864429473877, "logps/chosen": -104.9009017944336, "logps/rejected": -63.643592834472656, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": 2.9228675365448, "rewards/margins": 0.9362403154373169, "rewards/rejected": 1.986627221107483, "step": 3244 }, { "epoch": 0.53, "learning_rate": 9.868338310341477e-07, "logits/chosen": -0.7012104392051697, "logits/rejected": -0.5643790364265442, "logps/chosen": -188.458740234375, "logps/rejected": -97.05938720703125, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 2.824108839035034, "rewards/margins": 0.5254080295562744, "rewards/rejected": 2.2987008094787598, "step": 3245 }, { "epoch": 0.53, "learning_rate": 9.868038528917108e-07, "logits/chosen": -0.7690947651863098, "logits/rejected": -0.7173184156417847, "logps/chosen": -51.81321334838867, "logps/rejected": -55.46173095703125, "loss": 1.036, "rewards/accuracies": 0.0, "rewards/chosen": 1.9427226781845093, "rewards/margins": -0.19835245609283447, "rewards/rejected": 2.1410751342773438, "step": 3246 }, { "epoch": 0.53, "learning_rate": 9.867738411157338e-07, "logits/chosen": -0.7031509876251221, "logits/rejected": -0.5108370184898376, "logps/chosen": -73.28526306152344, "logps/rejected": -242.63377380371094, "loss": 2.1581, "rewards/accuracies": 0.0, "rewards/chosen": 3.024336338043213, "rewards/margins": -4.048226833343506, "rewards/rejected": 7.072563171386719, "step": 3247 }, { "epoch": 0.53, "learning_rate": 9.867437957082904e-07, "logits/chosen": -0.5087643265724182, "logits/rejected": -0.4080069065093994, "logps/chosen": -79.53993225097656, "logps/rejected": -22.925716400146484, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.2145980596542358, "rewards/margins": 0.9917745590209961, "rewards/rejected": 0.22282353043556213, "step": 3248 }, { "epoch": 0.53, "learning_rate": 9.867137166714564e-07, "logits/chosen": -1.2501304149627686, "logits/rejected": -1.2284120321273804, "logps/chosen": -142.22769165039062, "logps/rejected": -149.8658447265625, "loss": 0.7996, "rewards/accuracies": 1.0, "rewards/chosen": 4.2903594970703125, "rewards/margins": 0.8023085594177246, "rewards/rejected": 3.488050937652588, "step": 3249 }, { "epoch": 0.53, "learning_rate": 9.866836040073097e-07, "logits/chosen": -0.4758465886116028, "logits/rejected": -0.3171267509460449, "logps/chosen": -152.72889709472656, "logps/rejected": -15.331798553466797, "loss": 0.6442, "rewards/accuracies": 1.0, "rewards/chosen": 0.7384018301963806, "rewards/margins": 0.4448576271533966, "rewards/rejected": 0.293544203042984, "step": 3250 }, { "epoch": 0.53, "learning_rate": 9.86653457717931e-07, "logits/chosen": -0.44691845774650574, "logits/rejected": -0.413338303565979, "logps/chosen": -110.27345275878906, "logps/rejected": -103.95706939697266, "loss": 0.456, "rewards/accuracies": 0.0, "rewards/chosen": 0.3123512268066406, "rewards/margins": -0.04139861464500427, "rewards/rejected": 0.3537498414516449, "step": 3251 }, { "epoch": 0.53, "learning_rate": 9.866232778054032e-07, "logits/chosen": -0.6467587947845459, "logits/rejected": -0.5946018695831299, "logps/chosen": -59.460845947265625, "logps/rejected": -68.80941772460938, "loss": 0.2589, "rewards/accuracies": 1.0, "rewards/chosen": 2.137723684310913, "rewards/margins": 0.8925713300704956, "rewards/rejected": 1.2451523542404175, "step": 3252 }, { "epoch": 0.53, "learning_rate": 9.865930642718114e-07, "logits/chosen": -0.542646586894989, "logits/rejected": -0.532214879989624, "logps/chosen": -86.58983612060547, "logps/rejected": -129.2667236328125, "loss": 1.556, "rewards/accuracies": 0.0, "rewards/chosen": 1.6924209594726562, "rewards/margins": -1.2979629039764404, "rewards/rejected": 2.9903838634490967, "step": 3253 }, { "epoch": 0.53, "learning_rate": 9.86562817119243e-07, "logits/chosen": -0.5795087814331055, "logits/rejected": -0.5419571995735168, "logps/chosen": -111.54566192626953, "logps/rejected": -165.00717163085938, "loss": 0.2037, "rewards/accuracies": 1.0, "rewards/chosen": 0.666455864906311, "rewards/margins": 0.9669227600097656, "rewards/rejected": -0.300466924905777, "step": 3254 }, { "epoch": 0.53, "learning_rate": 9.86532536349788e-07, "logits/chosen": -0.45133844017982483, "logits/rejected": -0.3904229998588562, "logps/chosen": -53.108917236328125, "logps/rejected": -53.72187042236328, "loss": 0.3599, "rewards/accuracies": 1.0, "rewards/chosen": 1.1303542852401733, "rewards/margins": 1.034101128578186, "rewards/rejected": 0.09625320881605148, "step": 3255 }, { "epoch": 0.53, "learning_rate": 9.865022219655383e-07, "logits/chosen": -0.28392213582992554, "logits/rejected": -0.28157296776771545, "logps/chosen": -127.42874908447266, "logps/rejected": -73.23393249511719, "loss": 1.5437, "rewards/accuracies": 0.0, "rewards/chosen": -0.12325821071863174, "rewards/margins": -2.7640929222106934, "rewards/rejected": 2.6408348083496094, "step": 3256 }, { "epoch": 0.53, "learning_rate": 9.864718739685882e-07, "logits/chosen": -0.9929037094116211, "logits/rejected": -0.9543529748916626, "logps/chosen": -246.71664428710938, "logps/rejected": -179.42526245117188, "loss": 1.7098, "rewards/accuracies": 0.0, "rewards/chosen": 4.7723541259765625, "rewards/margins": -3.3846311569213867, "rewards/rejected": 8.15698528289795, "step": 3257 }, { "epoch": 0.53, "learning_rate": 9.864414923610348e-07, "logits/chosen": -0.9357783794403076, "logits/rejected": -0.8827460408210754, "logps/chosen": -88.46043395996094, "logps/rejected": -31.614479064941406, "loss": 0.8643, "rewards/accuracies": 0.0, "rewards/chosen": 0.008813477121293545, "rewards/margins": -0.087407685816288, "rewards/rejected": 0.09622116386890411, "step": 3258 }, { "epoch": 0.53, "learning_rate": 9.86411077144977e-07, "logits/chosen": -1.177405595779419, "logits/rejected": -1.308734655380249, "logps/chosen": -231.40301513671875, "logps/rejected": -25.559654235839844, "loss": 0.3191, "rewards/accuracies": 1.0, "rewards/chosen": 1.8982574939727783, "rewards/margins": 1.6364647150039673, "rewards/rejected": 0.26179274916648865, "step": 3259 }, { "epoch": 0.53, "learning_rate": 9.863806283225163e-07, "logits/chosen": -0.8305289149284363, "logits/rejected": -0.7318840622901917, "logps/chosen": -130.53367614746094, "logps/rejected": -88.8206787109375, "loss": 0.9033, "rewards/accuracies": 0.0, "rewards/chosen": 3.692836046218872, "rewards/margins": -1.1567504405975342, "rewards/rejected": 4.849586486816406, "step": 3260 }, { "epoch": 0.53, "learning_rate": 9.86350145895756e-07, "logits/chosen": -0.45568370819091797, "logits/rejected": -0.45568370819091797, "logps/chosen": -27.912967681884766, "logps/rejected": -27.912967681884766, "loss": 0.605, "rewards/accuracies": 0.0, "rewards/chosen": 1.5886669158935547, "rewards/margins": 0.0, "rewards/rejected": 1.5886669158935547, "step": 3261 }, { "epoch": 0.53, "learning_rate": 9.86319629866803e-07, "logits/chosen": -0.37803739309310913, "logits/rejected": -0.37803739309310913, "logps/chosen": -58.60181427001953, "logps/rejected": -58.60181427001953, "loss": 1.5777, "rewards/accuracies": 0.0, "rewards/chosen": 1.934404730796814, "rewards/margins": 0.0, "rewards/rejected": 1.934404730796814, "step": 3262 }, { "epoch": 0.53, "learning_rate": 9.86289080237765e-07, "logits/chosen": -0.8951709866523743, "logits/rejected": -1.2228219509124756, "logps/chosen": -93.86085510253906, "logps/rejected": -37.25495147705078, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 1.9581336975097656, "rewards/margins": 1.8091102838516235, "rewards/rejected": 0.14902344346046448, "step": 3263 }, { "epoch": 0.53, "learning_rate": 9.862584970107528e-07, "logits/chosen": -0.23113104701042175, "logits/rejected": -0.23304100334644318, "logps/chosen": -2.578582286834717, "logps/rejected": -8.985092163085938, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.2679741382598877, "rewards/margins": 0.09886087477207184, "rewards/rejected": 0.16911326348781586, "step": 3264 }, { "epoch": 0.53, "learning_rate": 9.862278801878794e-07, "logits/chosen": -0.8146096467971802, "logits/rejected": -0.8255428671836853, "logps/chosen": -86.42109680175781, "logps/rejected": -114.96442413330078, "loss": 0.5696, "rewards/accuracies": 0.0, "rewards/chosen": 0.25331878662109375, "rewards/margins": -0.47136688232421875, "rewards/rejected": 0.7246856689453125, "step": 3265 }, { "epoch": 0.53, "learning_rate": 9.861972297712603e-07, "logits/chosen": -0.42720288038253784, "logits/rejected": -0.38071978092193604, "logps/chosen": -92.81573486328125, "logps/rejected": -63.57615280151367, "loss": 0.4052, "rewards/accuracies": 1.0, "rewards/chosen": 1.7773956060409546, "rewards/margins": 0.22947955131530762, "rewards/rejected": 1.547916054725647, "step": 3266 }, { "epoch": 0.53, "learning_rate": 9.861665457630133e-07, "logits/chosen": -0.5113090872764587, "logits/rejected": -0.5053799748420715, "logps/chosen": -10.696681022644043, "logps/rejected": -5.648336410522461, "loss": 0.5021, "rewards/accuracies": 1.0, "rewards/chosen": 1.2977436780929565, "rewards/margins": 0.17646777629852295, "rewards/rejected": 1.1212759017944336, "step": 3267 }, { "epoch": 0.53, "learning_rate": 9.86135828165258e-07, "logits/chosen": -0.4496011734008789, "logits/rejected": -0.6797184348106384, "logps/chosen": -101.93302917480469, "logps/rejected": -127.23421478271484, "loss": 2.229, "rewards/accuracies": 0.0, "rewards/chosen": 1.9150093793869019, "rewards/margins": -3.58500337600708, "rewards/rejected": 5.5000128746032715, "step": 3268 }, { "epoch": 0.53, "learning_rate": 9.861050769801166e-07, "logits/chosen": -0.6462295651435852, "logits/rejected": -0.5542082786560059, "logps/chosen": -93.51432800292969, "logps/rejected": -82.07842254638672, "loss": 0.3698, "rewards/accuracies": 1.0, "rewards/chosen": 2.854994297027588, "rewards/margins": 0.44509363174438477, "rewards/rejected": 2.409900665283203, "step": 3269 }, { "epoch": 0.53, "learning_rate": 9.86074292209714e-07, "logits/chosen": -0.47219014167785645, "logits/rejected": -0.5121666789054871, "logps/chosen": -74.04669189453125, "logps/rejected": -94.68966674804688, "loss": 0.2512, "rewards/accuracies": 1.0, "rewards/chosen": 0.9429580569267273, "rewards/margins": 0.5875434875488281, "rewards/rejected": 0.35541459918022156, "step": 3270 }, { "epoch": 0.53, "learning_rate": 9.860434738561773e-07, "logits/chosen": -0.7617856860160828, "logits/rejected": -0.7384313344955444, "logps/chosen": -53.221656799316406, "logps/rejected": -62.05866622924805, "loss": 0.4561, "rewards/accuracies": 0.0, "rewards/chosen": 0.5094887018203735, "rewards/margins": -0.22197073698043823, "rewards/rejected": 0.7314594388008118, "step": 3271 }, { "epoch": 0.53, "learning_rate": 9.860126219216353e-07, "logits/chosen": -0.390743613243103, "logits/rejected": -0.31302160024642944, "logps/chosen": -50.719322204589844, "logps/rejected": -70.41154479980469, "loss": 0.1964, "rewards/accuracies": 1.0, "rewards/chosen": 2.4292564392089844, "rewards/margins": 0.8300895690917969, "rewards/rejected": 1.5991668701171875, "step": 3272 }, { "epoch": 0.53, "learning_rate": 9.859817364082201e-07, "logits/chosen": -0.48562145233154297, "logits/rejected": -0.44998568296432495, "logps/chosen": -103.21658325195312, "logps/rejected": -90.42387390136719, "loss": 0.4918, "rewards/accuracies": 1.0, "rewards/chosen": 0.6370132565498352, "rewards/margins": 0.19561156630516052, "rewards/rejected": 0.4414016902446747, "step": 3273 }, { "epoch": 0.53, "learning_rate": 9.859508173180653e-07, "logits/chosen": -0.41464537382125854, "logits/rejected": -0.2111193984746933, "logps/chosen": -91.49220275878906, "logps/rejected": -54.194801330566406, "loss": 0.7237, "rewards/accuracies": 1.0, "rewards/chosen": 1.5200958251953125, "rewards/margins": 0.15656352043151855, "rewards/rejected": 1.363532304763794, "step": 3274 }, { "epoch": 0.53, "learning_rate": 9.85919864653307e-07, "logits/chosen": -0.839573860168457, "logits/rejected": -0.423735111951828, "logps/chosen": -86.61785125732422, "logps/rejected": -221.17636108398438, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 1.6491936445236206, "rewards/margins": 0.759925127029419, "rewards/rejected": 0.8892685174942017, "step": 3275 }, { "epoch": 0.53, "learning_rate": 9.858888784160836e-07, "logits/chosen": -0.5767126679420471, "logits/rejected": -0.6010741591453552, "logps/chosen": -73.63362884521484, "logps/rejected": -92.61369323730469, "loss": 0.7717, "rewards/accuracies": 0.0, "rewards/chosen": 1.729962944984436, "rewards/margins": -0.424763560295105, "rewards/rejected": 2.154726505279541, "step": 3276 }, { "epoch": 0.53, "learning_rate": 9.858578586085366e-07, "logits/chosen": -0.5967643857002258, "logits/rejected": -0.5565604567527771, "logps/chosen": -159.0626220703125, "logps/rejected": -98.45954132080078, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 4.042642116546631, "rewards/margins": 1.591480016708374, "rewards/rejected": 2.451162099838257, "step": 3277 }, { "epoch": 0.53, "learning_rate": 9.858268052328088e-07, "logits/chosen": -0.22030845284461975, "logits/rejected": -0.2180420607328415, "logps/chosen": -7.548344135284424, "logps/rejected": -0.857515275478363, "loss": 0.4617, "rewards/accuracies": 0.0, "rewards/chosen": 0.08736415207386017, "rewards/margins": -0.10369925200939178, "rewards/rejected": 0.19106340408325195, "step": 3278 }, { "epoch": 0.53, "learning_rate": 9.857957182910455e-07, "logits/chosen": -0.7211032509803772, "logits/rejected": -0.7005343437194824, "logps/chosen": -98.72877502441406, "logps/rejected": -64.25466918945312, "loss": 0.5523, "rewards/accuracies": 0.0, "rewards/chosen": 1.6644859313964844, "rewards/margins": -0.4233894348144531, "rewards/rejected": 2.0878753662109375, "step": 3279 }, { "epoch": 0.53, "learning_rate": 9.857645977853947e-07, "logits/chosen": -0.6560417413711548, "logits/rejected": -0.6568878889083862, "logps/chosen": -123.4672622680664, "logps/rejected": -160.89041137695312, "loss": 0.6422, "rewards/accuracies": 0.0, "rewards/chosen": 4.238710880279541, "rewards/margins": -0.8764200210571289, "rewards/rejected": 5.11513090133667, "step": 3280 }, { "epoch": 0.53, "learning_rate": 9.857334437180068e-07, "logits/chosen": -0.9276300072669983, "logits/rejected": -0.8868907690048218, "logps/chosen": -81.51896667480469, "logps/rejected": -99.6792984008789, "loss": 1.9359, "rewards/accuracies": 0.0, "rewards/chosen": 1.7010421752929688, "rewards/margins": -3.786579132080078, "rewards/rejected": 5.487621307373047, "step": 3281 }, { "epoch": 0.53, "learning_rate": 9.857022560910337e-07, "logits/chosen": -0.4136921465396881, "logits/rejected": -0.2620478868484497, "logps/chosen": -40.152713775634766, "logps/rejected": -21.164400100708008, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 0.6855144500732422, "rewards/margins": 0.3291833698749542, "rewards/rejected": 0.35633108019828796, "step": 3282 }, { "epoch": 0.53, "learning_rate": 9.856710349066307e-07, "logits/chosen": -0.9897782206535339, "logits/rejected": -1.0330067873001099, "logps/chosen": -50.19281768798828, "logps/rejected": -18.829715728759766, "loss": 1.1017, "rewards/accuracies": 1.0, "rewards/chosen": 2.4115960597991943, "rewards/margins": 1.548803448677063, "rewards/rejected": 0.8627926111221313, "step": 3283 }, { "epoch": 0.53, "learning_rate": 9.856397801669545e-07, "logits/chosen": -0.7897588610649109, "logits/rejected": -0.8335152864456177, "logps/chosen": -72.12158203125, "logps/rejected": -120.29476165771484, "loss": 2.0243, "rewards/accuracies": 0.0, "rewards/chosen": 1.54607093334198, "rewards/margins": -1.5447684526443481, "rewards/rejected": 3.090839385986328, "step": 3284 }, { "epoch": 0.53, "learning_rate": 9.856084918741648e-07, "logits/chosen": -0.6517306566238403, "logits/rejected": -0.554492712020874, "logps/chosen": -58.39374542236328, "logps/rejected": -43.64011764526367, "loss": 1.1809, "rewards/accuracies": 1.0, "rewards/chosen": 2.9217560291290283, "rewards/margins": 1.5130141973495483, "rewards/rejected": 1.40874183177948, "step": 3285 }, { "epoch": 0.53, "learning_rate": 9.85577170030423e-07, "logits/chosen": -0.1815120428800583, "logits/rejected": -0.2131543606519699, "logps/chosen": -98.22386169433594, "logps/rejected": -54.44206619262695, "loss": 0.7955, "rewards/accuracies": 0.0, "rewards/chosen": -0.09319915622472763, "rewards/margins": -1.306321382522583, "rewards/rejected": 1.2131222486495972, "step": 3286 }, { "epoch": 0.53, "learning_rate": 9.855458146378934e-07, "logits/chosen": -0.5075379014015198, "logits/rejected": -0.4176505208015442, "logps/chosen": -54.430152893066406, "logps/rejected": -60.41603088378906, "loss": 0.5397, "rewards/accuracies": 1.0, "rewards/chosen": 1.2619285583496094, "rewards/margins": 0.07225418090820312, "rewards/rejected": 1.1896743774414062, "step": 3287 }, { "epoch": 0.53, "learning_rate": 9.855144256987423e-07, "logits/chosen": -0.5524964332580566, "logits/rejected": -0.5539431571960449, "logps/chosen": -55.12914276123047, "logps/rejected": -72.44039916992188, "loss": 0.2941, "rewards/accuracies": 1.0, "rewards/chosen": 1.8308517932891846, "rewards/margins": 0.7223999500274658, "rewards/rejected": 1.1084518432617188, "step": 3288 }, { "epoch": 0.53, "learning_rate": 9.854830032151383e-07, "logits/chosen": -0.21250884234905243, "logits/rejected": -0.18144744634628296, "logps/chosen": -83.06269836425781, "logps/rejected": -65.02313232421875, "loss": 0.5644, "rewards/accuracies": 0.0, "rewards/chosen": 0.6577423214912415, "rewards/margins": -0.5514755845069885, "rewards/rejected": 1.20921790599823, "step": 3289 }, { "epoch": 0.53, "learning_rate": 9.854515471892526e-07, "logits/chosen": -0.35563555359840393, "logits/rejected": -0.36617881059646606, "logps/chosen": -97.83688354492188, "logps/rejected": -80.08415222167969, "loss": 0.2359, "rewards/accuracies": 1.0, "rewards/chosen": 2.53806471824646, "rewards/margins": 0.6053208112716675, "rewards/rejected": 1.9327439069747925, "step": 3290 }, { "epoch": 0.53, "learning_rate": 9.854200576232582e-07, "logits/chosen": -0.5831631422042847, "logits/rejected": -0.5653451085090637, "logps/chosen": -45.72923278808594, "logps/rejected": -76.3804931640625, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": 1.3260811567306519, "rewards/margins": 0.41541218757629395, "rewards/rejected": 0.9106689691543579, "step": 3291 }, { "epoch": 0.53, "learning_rate": 9.853885345193312e-07, "logits/chosen": -0.9241297841072083, "logits/rejected": -0.6748477816581726, "logps/chosen": -193.79718017578125, "logps/rejected": -73.63536071777344, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 4.22091817855835, "rewards/margins": 1.8908522129058838, "rewards/rejected": 2.330065965652466, "step": 3292 }, { "epoch": 0.53, "learning_rate": 9.853569778796488e-07, "logits/chosen": -0.9014112949371338, "logits/rejected": -0.9131635427474976, "logps/chosen": -60.36122131347656, "logps/rejected": -61.97191619873047, "loss": 1.872, "rewards/accuracies": 1.0, "rewards/chosen": 3.0744354724884033, "rewards/margins": 0.6562583446502686, "rewards/rejected": 2.4181771278381348, "step": 3293 }, { "epoch": 0.53, "learning_rate": 9.85325387706392e-07, "logits/chosen": -0.07471205294132233, "logits/rejected": -0.15629129111766815, "logps/chosen": -108.11680603027344, "logps/rejected": -114.73542022705078, "loss": 1.8367, "rewards/accuracies": 0.0, "rewards/chosen": 0.3495796322822571, "rewards/margins": -3.5414047241210938, "rewards/rejected": 3.890984296798706, "step": 3294 }, { "epoch": 0.53, "learning_rate": 9.852937640017431e-07, "logits/chosen": -0.44464999437332153, "logits/rejected": -0.4321005940437317, "logps/chosen": -73.20149230957031, "logps/rejected": -108.1805419921875, "loss": 0.7639, "rewards/accuracies": 0.0, "rewards/chosen": 1.205683946609497, "rewards/margins": -0.9489045143127441, "rewards/rejected": 2.154588460922241, "step": 3295 }, { "epoch": 0.53, "learning_rate": 9.85262106767887e-07, "logits/chosen": -0.7903769016265869, "logits/rejected": -0.8231319189071655, "logps/chosen": -55.39071273803711, "logps/rejected": -74.84954833984375, "loss": 2.135, "rewards/accuracies": 0.0, "rewards/chosen": 2.3396222591400146, "rewards/margins": -0.8449268341064453, "rewards/rejected": 3.18454909324646, "step": 3296 }, { "epoch": 0.54, "learning_rate": 9.852304160070108e-07, "logits/chosen": -0.9408047795295715, "logits/rejected": -0.7112641930580139, "logps/chosen": -108.19365692138672, "logps/rejected": -21.22780418395996, "loss": 0.0289, "rewards/accuracies": 1.0, "rewards/chosen": 4.540595531463623, "rewards/margins": 4.136279582977295, "rewards/rejected": 0.4043159484863281, "step": 3297 }, { "epoch": 0.54, "learning_rate": 9.851986917213044e-07, "logits/chosen": -0.5346225500106812, "logits/rejected": -0.5477132201194763, "logps/chosen": -59.33414077758789, "logps/rejected": -37.459197998046875, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 2.2352147102355957, "rewards/margins": 0.565341591835022, "rewards/rejected": 1.6698731184005737, "step": 3298 }, { "epoch": 0.54, "learning_rate": 9.851669339129591e-07, "logits/chosen": -0.40819138288497925, "logits/rejected": -0.42362701892852783, "logps/chosen": -81.29498291015625, "logps/rejected": -90.78179168701172, "loss": 0.6635, "rewards/accuracies": 0.0, "rewards/chosen": 0.8649734854698181, "rewards/margins": -0.10054850578308105, "rewards/rejected": 0.9655219912528992, "step": 3299 }, { "epoch": 0.54, "learning_rate": 9.851351425841695e-07, "logits/chosen": -0.3611488342285156, "logits/rejected": -0.29732316732406616, "logps/chosen": -63.40286636352539, "logps/rejected": -104.47319030761719, "loss": 0.2769, "rewards/accuracies": 1.0, "rewards/chosen": 1.4220730066299438, "rewards/margins": 1.1585407257080078, "rewards/rejected": 0.26353225111961365, "step": 3300 }, { "epoch": 0.54, "learning_rate": 9.851033177371319e-07, "logits/chosen": -0.2944210469722748, "logits/rejected": -0.23091746866703033, "logps/chosen": -73.3594741821289, "logps/rejected": -74.36021423339844, "loss": 0.1568, "rewards/accuracies": 1.0, "rewards/chosen": 1.9910026788711548, "rewards/margins": 1.5202529430389404, "rewards/rejected": 0.4707496762275696, "step": 3301 }, { "epoch": 0.54, "learning_rate": 9.850714593740453e-07, "logits/chosen": -0.6678653955459595, "logits/rejected": -0.633012056350708, "logps/chosen": -94.85322570800781, "logps/rejected": -37.38957977294922, "loss": 1.0961, "rewards/accuracies": 1.0, "rewards/chosen": 1.0703636407852173, "rewards/margins": 0.4351200461387634, "rewards/rejected": 0.6352435946464539, "step": 3302 }, { "epoch": 0.54, "learning_rate": 9.850395674971105e-07, "logits/chosen": -0.5917452573776245, "logits/rejected": -0.5966182351112366, "logps/chosen": -67.359130859375, "logps/rejected": -45.4300537109375, "loss": 0.8092, "rewards/accuracies": 0.0, "rewards/chosen": 1.0076340436935425, "rewards/margins": -0.42051076889038086, "rewards/rejected": 1.4281448125839233, "step": 3303 }, { "epoch": 0.54, "learning_rate": 9.850076421085308e-07, "logits/chosen": -0.44591841101646423, "logits/rejected": -0.33354243636131287, "logps/chosen": -53.08506774902344, "logps/rejected": -17.43569564819336, "loss": 0.6395, "rewards/accuracies": 1.0, "rewards/chosen": 1.1534347534179688, "rewards/margins": 0.9536808133125305, "rewards/rejected": 0.19975395500659943, "step": 3304 }, { "epoch": 0.54, "learning_rate": 9.849756832105128e-07, "logits/chosen": -0.651798665523529, "logits/rejected": -0.5478319525718689, "logps/chosen": -141.81246948242188, "logps/rejected": -111.22392272949219, "loss": 0.6633, "rewards/accuracies": 0.0, "rewards/chosen": 2.577374219894409, "rewards/margins": -0.09222579002380371, "rewards/rejected": 2.669600009918213, "step": 3305 }, { "epoch": 0.54, "learning_rate": 9.849436908052636e-07, "logits/chosen": -1.0187342166900635, "logits/rejected": -1.0010483264923096, "logps/chosen": -52.07227325439453, "logps/rejected": -104.92354583740234, "loss": 0.4297, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957244992256165, "rewards/margins": 0.721239447593689, "rewards/rejected": 0.2744850218296051, "step": 3306 }, { "epoch": 0.54, "learning_rate": 9.84911664894994e-07, "logits/chosen": -0.7242097854614258, "logits/rejected": -0.8072027564048767, "logps/chosen": -73.14460754394531, "logps/rejected": -110.60064697265625, "loss": 1.8649, "rewards/accuracies": 0.0, "rewards/chosen": 1.5644890069961548, "rewards/margins": -3.1847453117370605, "rewards/rejected": 4.749234199523926, "step": 3307 }, { "epoch": 0.54, "learning_rate": 9.848796054819165e-07, "logits/chosen": -0.47705259919166565, "logits/rejected": -0.42514461278915405, "logps/chosen": -65.2657470703125, "logps/rejected": -70.89800262451172, "loss": 0.2601, "rewards/accuracies": 1.0, "rewards/chosen": 1.9051307439804077, "rewards/margins": 0.766642689704895, "rewards/rejected": 1.1384880542755127, "step": 3308 }, { "epoch": 0.54, "learning_rate": 9.848475125682465e-07, "logits/chosen": -0.4990662932395935, "logits/rejected": -0.49684858322143555, "logps/chosen": -61.097900390625, "logps/rejected": -55.33305358886719, "loss": 1.8248, "rewards/accuracies": 0.0, "rewards/chosen": 1.7656234502792358, "rewards/margins": -0.3655472993850708, "rewards/rejected": 2.1311707496643066, "step": 3309 }, { "epoch": 0.54, "learning_rate": 9.84815386156201e-07, "logits/chosen": -0.32694143056869507, "logits/rejected": -0.3542234003543854, "logps/chosen": -96.80400848388672, "logps/rejected": -48.85771560668945, "loss": 1.5921, "rewards/accuracies": 0.0, "rewards/chosen": 0.15492402017116547, "rewards/margins": -1.2452003955841064, "rewards/rejected": 1.400124430656433, "step": 3310 }, { "epoch": 0.54, "learning_rate": 9.847832262479996e-07, "logits/chosen": -0.574048638343811, "logits/rejected": -0.4986783564090729, "logps/chosen": -66.00359344482422, "logps/rejected": -99.23384094238281, "loss": 0.3918, "rewards/accuracies": 1.0, "rewards/chosen": 3.2850501537323, "rewards/margins": 0.7189993858337402, "rewards/rejected": 2.5660507678985596, "step": 3311 }, { "epoch": 0.54, "learning_rate": 9.847510328458642e-07, "logits/chosen": -0.19899486005306244, "logits/rejected": -0.1997418850660324, "logps/chosen": -3.506361484527588, "logps/rejected": -1.6613836288452148, "loss": 0.7408, "rewards/accuracies": 0.0, "rewards/chosen": 0.040109921246767044, "rewards/margins": -0.08297452330589294, "rewards/rejected": 0.12308444827795029, "step": 3312 }, { "epoch": 0.54, "learning_rate": 9.847188059520193e-07, "logits/chosen": -0.6318592429161072, "logits/rejected": -0.6229424476623535, "logps/chosen": -42.1137809753418, "logps/rejected": -36.47303009033203, "loss": 0.2903, "rewards/accuracies": 1.0, "rewards/chosen": 1.6702247858047485, "rewards/margins": 0.4665076732635498, "rewards/rejected": 1.2037171125411987, "step": 3313 }, { "epoch": 0.54, "learning_rate": 9.846865455686914e-07, "logits/chosen": -0.6185960173606873, "logits/rejected": -0.6072440147399902, "logps/chosen": -22.239501953125, "logps/rejected": -65.39494323730469, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": -0.04023914411664009, "rewards/margins": 0.32392406463623047, "rewards/rejected": -0.36416321992874146, "step": 3314 }, { "epoch": 0.54, "learning_rate": 9.846542516981093e-07, "logits/chosen": -0.38006094098091125, "logits/rejected": -0.38006094098091125, "logps/chosen": -1.927037000656128, "logps/rejected": -1.927037000656128, "loss": 0.3864, "rewards/accuracies": 0.0, "rewards/chosen": 0.22269542515277863, "rewards/margins": 0.0, "rewards/rejected": 0.22269542515277863, "step": 3315 }, { "epoch": 0.54, "learning_rate": 9.846219243425045e-07, "logits/chosen": -0.3360272943973541, "logits/rejected": -0.3001682162284851, "logps/chosen": -32.461090087890625, "logps/rejected": -11.268468856811523, "loss": 2.5908, "rewards/accuracies": 1.0, "rewards/chosen": 1.535787582397461, "rewards/margins": 0.2872117757797241, "rewards/rejected": 1.2485758066177368, "step": 3316 }, { "epoch": 0.54, "learning_rate": 9.8458956350411e-07, "logits/chosen": -0.6685526371002197, "logits/rejected": -0.6556094884872437, "logps/chosen": -99.13362121582031, "logps/rejected": -102.05769348144531, "loss": 0.3667, "rewards/accuracies": 1.0, "rewards/chosen": 1.3879508972167969, "rewards/margins": 0.9578574895858765, "rewards/rejected": 0.430093377828598, "step": 3317 }, { "epoch": 0.54, "learning_rate": 9.84557169185162e-07, "logits/chosen": -0.3793834447860718, "logits/rejected": -0.4260232150554657, "logps/chosen": -50.98857116699219, "logps/rejected": -88.03667449951172, "loss": 0.7512, "rewards/accuracies": 0.0, "rewards/chosen": 0.6821869015693665, "rewards/margins": -0.32379990816116333, "rewards/rejected": 1.0059868097305298, "step": 3318 }, { "epoch": 0.54, "learning_rate": 9.845247413878982e-07, "logits/chosen": -1.1951484680175781, "logits/rejected": -1.063559889793396, "logps/chosen": -133.6065216064453, "logps/rejected": -82.42567443847656, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": 5.46316385269165, "rewards/margins": 2.189406633377075, "rewards/rejected": 3.273757219314575, "step": 3319 }, { "epoch": 0.54, "learning_rate": 9.8449228011456e-07, "logits/chosen": -0.9070086479187012, "logits/rejected": -0.9050602912902832, "logps/chosen": -77.75580596923828, "logps/rejected": -78.33857727050781, "loss": 0.7317, "rewards/accuracies": 1.0, "rewards/chosen": 0.8378624320030212, "rewards/margins": 0.02536928653717041, "rewards/rejected": 0.8124931454658508, "step": 3320 }, { "epoch": 0.54, "learning_rate": 9.84459785367389e-07, "logits/chosen": -0.548969030380249, "logits/rejected": -0.42400693893432617, "logps/chosen": -72.83755493164062, "logps/rejected": -34.42767333984375, "loss": 0.9116, "rewards/accuracies": 1.0, "rewards/chosen": 0.5129165649414062, "rewards/margins": 0.32421112060546875, "rewards/rejected": 0.1887054443359375, "step": 3321 }, { "epoch": 0.54, "learning_rate": 9.844272571486311e-07, "logits/chosen": -0.6705501675605774, "logits/rejected": -0.5881650447845459, "logps/chosen": -123.14094543457031, "logps/rejected": -53.655574798583984, "loss": 0.0563, "rewards/accuracies": 1.0, "rewards/chosen": 4.139012336730957, "rewards/margins": 2.215174674987793, "rewards/rejected": 1.923837661743164, "step": 3322 }, { "epoch": 0.54, "learning_rate": 9.843946954605333e-07, "logits/chosen": -0.7425791621208191, "logits/rejected": -0.7270910739898682, "logps/chosen": -94.35633850097656, "logps/rejected": -105.34361267089844, "loss": 1.0763, "rewards/accuracies": 0.0, "rewards/chosen": -0.18460999429225922, "rewards/margins": -1.3898636102676392, "rewards/rejected": 1.2052536010742188, "step": 3323 }, { "epoch": 0.54, "learning_rate": 9.843621003053454e-07, "logits/chosen": -0.3433288335800171, "logits/rejected": -0.3518415093421936, "logps/chosen": -8.643766403198242, "logps/rejected": -3.241833209991455, "loss": 0.6551, "rewards/accuracies": 0.0, "rewards/chosen": 0.28267738223075867, "rewards/margins": -0.022568315267562866, "rewards/rejected": 0.30524569749832153, "step": 3324 }, { "epoch": 0.54, "learning_rate": 9.843294716853197e-07, "logits/chosen": -0.40023767948150635, "logits/rejected": -0.40023767948150635, "logps/chosen": -54.79890441894531, "logps/rejected": -54.79890441894531, "loss": 1.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.5542789697647095, "rewards/margins": 0.0, "rewards/rejected": 0.5542789697647095, "step": 3325 }, { "epoch": 0.54, "learning_rate": 9.8429680960271e-07, "logits/chosen": -0.3803062438964844, "logits/rejected": -0.35805121064186096, "logps/chosen": -88.50614166259766, "logps/rejected": -48.140586853027344, "loss": 0.7791, "rewards/accuracies": 0.0, "rewards/chosen": 1.605455756187439, "rewards/margins": -0.6645249128341675, "rewards/rejected": 2.2699806690216064, "step": 3326 }, { "epoch": 0.54, "learning_rate": 9.842641140597733e-07, "logits/chosen": -0.5577439665794373, "logits/rejected": -0.3802199363708496, "logps/chosen": -94.61114501953125, "logps/rejected": -195.8033447265625, "loss": 2.717, "rewards/accuracies": 0.0, "rewards/chosen": 0.6177231073379517, "rewards/margins": -2.3572540283203125, "rewards/rejected": 2.9749772548675537, "step": 3327 }, { "epoch": 0.54, "learning_rate": 9.842313850587687e-07, "logits/chosen": -0.24831518530845642, "logits/rejected": -0.23483769595623016, "logps/chosen": -4.989870071411133, "logps/rejected": -3.392977714538574, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": 0.19403114914894104, "rewards/margins": 0.11214519292116165, "rewards/rejected": 0.08188595622777939, "step": 3328 }, { "epoch": 0.54, "learning_rate": 9.84198622601957e-07, "logits/chosen": -0.5709309577941895, "logits/rejected": -0.5872658491134644, "logps/chosen": -96.43800354003906, "logps/rejected": -57.942569732666016, "loss": 1.4625, "rewards/accuracies": 1.0, "rewards/chosen": 0.9130813479423523, "rewards/margins": 0.0974048376083374, "rewards/rejected": 0.8156765103340149, "step": 3329 }, { "epoch": 0.54, "learning_rate": 9.84165826691602e-07, "logits/chosen": -0.3700850009918213, "logits/rejected": -0.2921386659145355, "logps/chosen": -102.59449768066406, "logps/rejected": -178.95510864257812, "loss": 0.5842, "rewards/accuracies": 1.0, "rewards/chosen": 0.02490844763815403, "rewards/margins": 0.7312911748886108, "rewards/rejected": -0.7063827514648438, "step": 3330 }, { "epoch": 0.54, "learning_rate": 9.841329973299697e-07, "logits/chosen": -0.759900689125061, "logits/rejected": -0.7254572510719299, "logps/chosen": -74.68434143066406, "logps/rejected": -84.59886169433594, "loss": 0.2971, "rewards/accuracies": 1.0, "rewards/chosen": 1.809698462486267, "rewards/margins": 0.4701545238494873, "rewards/rejected": 1.3395439386367798, "step": 3331 }, { "epoch": 0.54, "learning_rate": 9.84100134519328e-07, "logits/chosen": -1.5531357526779175, "logits/rejected": -0.8852695822715759, "logps/chosen": -109.86634826660156, "logps/rejected": -59.8629150390625, "loss": 1.1864, "rewards/accuracies": 0.0, "rewards/chosen": 0.9337677359580994, "rewards/margins": -0.801719605922699, "rewards/rejected": 1.7354873418807983, "step": 3332 }, { "epoch": 0.54, "learning_rate": 9.840672382619477e-07, "logits/chosen": -0.8408143520355225, "logits/rejected": -0.8557316660881042, "logps/chosen": -59.021366119384766, "logps/rejected": -70.0581283569336, "loss": 0.8501, "rewards/accuracies": 1.0, "rewards/chosen": 1.876627802848816, "rewards/margins": 1.2979474067687988, "rewards/rejected": 0.5786804556846619, "step": 3333 }, { "epoch": 0.54, "learning_rate": 9.840343085601016e-07, "logits/chosen": -0.5490761995315552, "logits/rejected": -0.3704063296318054, "logps/chosen": -86.24479675292969, "logps/rejected": -14.601293563842773, "loss": 0.1552, "rewards/accuracies": 1.0, "rewards/chosen": 2.558100938796997, "rewards/margins": 1.7978236675262451, "rewards/rejected": 0.7602772116661072, "step": 3334 }, { "epoch": 0.54, "learning_rate": 9.840013454160646e-07, "logits/chosen": -0.45519858598709106, "logits/rejected": -0.5222455859184265, "logps/chosen": -85.62957000732422, "logps/rejected": -85.75796508789062, "loss": 1.8621, "rewards/accuracies": 0.0, "rewards/chosen": 1.6297630071640015, "rewards/margins": -2.6190452575683594, "rewards/rejected": 4.24880838394165, "step": 3335 }, { "epoch": 0.54, "learning_rate": 9.839683488321143e-07, "logits/chosen": -0.3137970268726349, "logits/rejected": -0.3189399242401123, "logps/chosen": -3.9209749698638916, "logps/rejected": -12.226540565490723, "loss": 0.7477, "rewards/accuracies": 1.0, "rewards/chosen": 0.3592541515827179, "rewards/margins": 0.15399578213691711, "rewards/rejected": 0.20525836944580078, "step": 3336 }, { "epoch": 0.54, "learning_rate": 9.83935318810531e-07, "logits/chosen": -0.8548064231872559, "logits/rejected": -0.8470978736877441, "logps/chosen": -152.3143310546875, "logps/rejected": -201.58546447753906, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 4.637217998504639, "rewards/margins": -1.1159453392028809, "rewards/rejected": 5.7531633377075195, "step": 3337 }, { "epoch": 0.54, "learning_rate": 9.839022553535954e-07, "logits/chosen": -0.7141883969306946, "logits/rejected": -0.6381029486656189, "logps/chosen": -50.88203430175781, "logps/rejected": -80.1728286743164, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9266616702079773, "rewards/margins": 0.39282989501953125, "rewards/rejected": 0.533831775188446, "step": 3338 }, { "epoch": 0.54, "learning_rate": 9.838691584635931e-07, "logits/chosen": 0.08166458457708359, "logits/rejected": 0.08396273106336594, "logps/chosen": -12.527745246887207, "logps/rejected": -9.284940719604492, "loss": 0.3515, "rewards/accuracies": 1.0, "rewards/chosen": 0.35385236144065857, "rewards/margins": 0.2255890816450119, "rewards/rejected": 0.12826327979564667, "step": 3339 }, { "epoch": 0.54, "learning_rate": 9.838360281428104e-07, "logits/chosen": -0.6742199659347534, "logits/rejected": -0.511839747428894, "logps/chosen": -137.11944580078125, "logps/rejected": -103.81645965576172, "loss": 0.5764, "rewards/accuracies": 0.0, "rewards/chosen": 1.782312035560608, "rewards/margins": -0.730059027671814, "rewards/rejected": 2.512371063232422, "step": 3340 }, { "epoch": 0.54, "learning_rate": 9.838028643935362e-07, "logits/chosen": -0.49193909764289856, "logits/rejected": -0.3912639021873474, "logps/chosen": -106.05758666992188, "logps/rejected": -66.0487289428711, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 3.359396457672119, "rewards/margins": 2.91054630279541, "rewards/rejected": 0.44885024428367615, "step": 3341 }, { "epoch": 0.54, "learning_rate": 9.837696672180618e-07, "logits/chosen": -0.579693078994751, "logits/rejected": -0.6128714680671692, "logps/chosen": -51.38249969482422, "logps/rejected": -76.75938415527344, "loss": 0.517, "rewards/accuracies": 0.0, "rewards/chosen": 1.803697943687439, "rewards/margins": -0.5752280950546265, "rewards/rejected": 2.3789260387420654, "step": 3342 }, { "epoch": 0.54, "learning_rate": 9.837364366186807e-07, "logits/chosen": -0.37078288197517395, "logits/rejected": -0.3424989879131317, "logps/chosen": -68.99861145019531, "logps/rejected": -92.72660827636719, "loss": 0.2115, "rewards/accuracies": 1.0, "rewards/chosen": 2.9165198802948, "rewards/margins": 1.0578407049179077, "rewards/rejected": 1.858679175376892, "step": 3343 }, { "epoch": 0.54, "learning_rate": 9.837031725976892e-07, "logits/chosen": -0.43899157643318176, "logits/rejected": -0.4811873435974121, "logps/chosen": -82.97874450683594, "logps/rejected": -97.27345275878906, "loss": 0.6711, "rewards/accuracies": 0.0, "rewards/chosen": 0.13612671196460724, "rewards/margins": -1.0360060930252075, "rewards/rejected": 1.1721328496932983, "step": 3344 }, { "epoch": 0.54, "learning_rate": 9.836698751573854e-07, "logits/chosen": -0.7025238871574402, "logits/rejected": -0.5686019062995911, "logps/chosen": -77.1363525390625, "logps/rejected": -70.85020446777344, "loss": 1.1667, "rewards/accuracies": 0.0, "rewards/chosen": 1.328973412513733, "rewards/margins": -1.1901458501815796, "rewards/rejected": 2.5191192626953125, "step": 3345 }, { "epoch": 0.54, "learning_rate": 9.836365443000697e-07, "logits/chosen": -0.3671879768371582, "logits/rejected": -0.4172447621822357, "logps/chosen": -81.95315551757812, "logps/rejected": -106.3719482421875, "loss": 0.5264, "rewards/accuracies": 1.0, "rewards/chosen": 1.0313987731933594, "rewards/margins": 0.6188957095146179, "rewards/rejected": 0.41250306367874146, "step": 3346 }, { "epoch": 0.54, "learning_rate": 9.83603180028045e-07, "logits/chosen": -0.0857817679643631, "logits/rejected": -0.09765351563692093, "logps/chosen": -3.514317274093628, "logps/rejected": -27.39724349975586, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 0.2162865847349167, "rewards/margins": 0.4155465364456177, "rewards/rejected": -0.199259951710701, "step": 3347 }, { "epoch": 0.54, "learning_rate": 9.835697823436161e-07, "logits/chosen": -0.5961259603500366, "logits/rejected": -0.5613647103309631, "logps/chosen": -45.94032287597656, "logps/rejected": -54.00834655761719, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 2.532987356185913, "rewards/margins": 0.007215261459350586, "rewards/rejected": 2.5257720947265625, "step": 3348 }, { "epoch": 0.54, "learning_rate": 9.835363512490912e-07, "logits/chosen": -0.5896623730659485, "logits/rejected": -0.7170036435127258, "logps/chosen": -49.83232116699219, "logps/rejected": -51.60430908203125, "loss": 1.2449, "rewards/accuracies": 1.0, "rewards/chosen": 2.0711402893066406, "rewards/margins": 0.9174289703369141, "rewards/rejected": 1.1537113189697266, "step": 3349 }, { "epoch": 0.54, "learning_rate": 9.835028867467797e-07, "logits/chosen": -0.46532541513442993, "logits/rejected": -0.25664669275283813, "logps/chosen": -125.28434753417969, "logps/rejected": -48.28778076171875, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": 4.067732334136963, "rewards/margins": 2.1662368774414062, "rewards/rejected": 1.901495337486267, "step": 3350 }, { "epoch": 0.54, "learning_rate": 9.834693888389934e-07, "logits/chosen": -0.5678374171257019, "logits/rejected": -0.5654082894325256, "logps/chosen": -42.32123565673828, "logps/rejected": -20.320789337158203, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.7025306820869446, "rewards/margins": 0.45709115266799927, "rewards/rejected": 0.2454395294189453, "step": 3351 }, { "epoch": 0.54, "learning_rate": 9.834358575280473e-07, "logits/chosen": -0.7048075795173645, "logits/rejected": -0.6727501749992371, "logps/chosen": -114.32026672363281, "logps/rejected": -172.7940673828125, "loss": 1.9084, "rewards/accuracies": 0.0, "rewards/chosen": 3.564868211746216, "rewards/margins": -2.423243761062622, "rewards/rejected": 5.988111972808838, "step": 3352 }, { "epoch": 0.54, "learning_rate": 9.834022928162575e-07, "logits/chosen": -0.17180795967578888, "logits/rejected": -0.22992902994155884, "logps/chosen": -72.98277282714844, "logps/rejected": -65.97782897949219, "loss": 0.3641, "rewards/accuracies": 1.0, "rewards/chosen": 1.30584716796875, "rewards/margins": 0.24117279052734375, "rewards/rejected": 1.0646743774414062, "step": 3353 }, { "epoch": 0.54, "learning_rate": 9.833686947059434e-07, "logits/chosen": -0.4889688193798065, "logits/rejected": -0.5072718262672424, "logps/chosen": -78.40535736083984, "logps/rejected": -79.50383758544922, "loss": 0.8659, "rewards/accuracies": 0.0, "rewards/chosen": 1.4317810535430908, "rewards/margins": -1.4495582580566406, "rewards/rejected": 2.8813393115997314, "step": 3354 }, { "epoch": 0.54, "learning_rate": 9.83335063199426e-07, "logits/chosen": -0.1257191300392151, "logits/rejected": -0.1257191300392151, "logps/chosen": -19.80980110168457, "logps/rejected": -19.80980110168457, "loss": 0.421, "rewards/accuracies": 0.0, "rewards/chosen": 0.0701383575797081, "rewards/margins": 0.0, "rewards/rejected": 0.0701383575797081, "step": 3355 }, { "epoch": 0.54, "learning_rate": 9.833013982990293e-07, "logits/chosen": -0.9053449034690857, "logits/rejected": -1.2893236875534058, "logps/chosen": -134.4914093017578, "logps/rejected": -64.42801666259766, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 5.0008745193481445, "rewards/margins": 4.735002517700195, "rewards/rejected": 0.26587221026420593, "step": 3356 }, { "epoch": 0.54, "learning_rate": 9.83267700007079e-07, "logits/chosen": -0.5856055617332458, "logits/rejected": -0.41269516944885254, "logps/chosen": -182.28634643554688, "logps/rejected": -179.5929412841797, "loss": 1.0481, "rewards/accuracies": 1.0, "rewards/chosen": 4.605660915374756, "rewards/margins": 1.5231397151947021, "rewards/rejected": 3.0825212001800537, "step": 3357 }, { "epoch": 0.55, "learning_rate": 9.832339683259033e-07, "logits/chosen": -0.8158971071243286, "logits/rejected": -0.4284798502922058, "logps/chosen": -132.05845642089844, "logps/rejected": -61.51997375488281, "loss": 0.7971, "rewards/accuracies": 0.0, "rewards/chosen": 0.5436905026435852, "rewards/margins": -1.0425751209259033, "rewards/rejected": 1.5862655639648438, "step": 3358 }, { "epoch": 0.55, "learning_rate": 9.832002032578328e-07, "logits/chosen": -0.40976691246032715, "logits/rejected": -0.30423641204833984, "logps/chosen": -72.25579071044922, "logps/rejected": -22.53449249267578, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.316136121749878, "rewards/margins": 2.178556203842163, "rewards/rejected": 0.13757991790771484, "step": 3359 }, { "epoch": 0.55, "learning_rate": 9.831664048052002e-07, "logits/chosen": -0.5390074253082275, "logits/rejected": -0.5322228074073792, "logps/chosen": -1.4940727949142456, "logps/rejected": -8.913481712341309, "loss": 0.896, "rewards/accuracies": 1.0, "rewards/chosen": 0.31873542070388794, "rewards/margins": 0.16372747719287872, "rewards/rejected": 0.15500794351100922, "step": 3360 }, { "epoch": 0.55, "learning_rate": 9.83132572970341e-07, "logits/chosen": -0.7315891981124878, "logits/rejected": -0.7067036032676697, "logps/chosen": -73.97299194335938, "logps/rejected": -54.26987838745117, "loss": 0.6317, "rewards/accuracies": 1.0, "rewards/chosen": 0.7101883292198181, "rewards/margins": 0.2630169093608856, "rewards/rejected": 0.4471714198589325, "step": 3361 }, { "epoch": 0.55, "learning_rate": 9.830987077555924e-07, "logits/chosen": -0.7434436678886414, "logits/rejected": -0.7754374742507935, "logps/chosen": -80.09668731689453, "logps/rejected": -48.82884216308594, "loss": 1.4016, "rewards/accuracies": 0.0, "rewards/chosen": 0.10032882541418076, "rewards/margins": -2.0391364097595215, "rewards/rejected": 2.13946533203125, "step": 3362 }, { "epoch": 0.55, "learning_rate": 9.83064809163294e-07, "logits/chosen": -0.5698800086975098, "logits/rejected": -0.5398573875427246, "logps/chosen": -84.02590942382812, "logps/rejected": -61.28043746948242, "loss": 2.8234, "rewards/accuracies": 0.0, "rewards/chosen": 0.9122329950332642, "rewards/margins": -1.769816279411316, "rewards/rejected": 2.68204927444458, "step": 3363 }, { "epoch": 0.55, "learning_rate": 9.830308771957883e-07, "logits/chosen": -0.28026655316352844, "logits/rejected": -0.32593733072280884, "logps/chosen": -41.075340270996094, "logps/rejected": -69.6933364868164, "loss": 0.6442, "rewards/accuracies": 1.0, "rewards/chosen": 1.752085566520691, "rewards/margins": 0.2064685821533203, "rewards/rejected": 1.5456169843673706, "step": 3364 }, { "epoch": 0.55, "learning_rate": 9.829969118554195e-07, "logits/chosen": -0.6315164566040039, "logits/rejected": -0.5094074010848999, "logps/chosen": -168.619140625, "logps/rejected": -79.49469757080078, "loss": 1.5038, "rewards/accuracies": 0.0, "rewards/chosen": 0.8926498293876648, "rewards/margins": -2.6284561157226562, "rewards/rejected": 3.521106004714966, "step": 3365 }, { "epoch": 0.55, "learning_rate": 9.82962913144534e-07, "logits/chosen": -0.7539858222007751, "logits/rejected": -0.2493010312318802, "logps/chosen": -85.38352966308594, "logps/rejected": -231.9451904296875, "loss": 2.0567, "rewards/accuracies": 0.0, "rewards/chosen": 1.030726671218872, "rewards/margins": -3.764549493789673, "rewards/rejected": 4.795276165008545, "step": 3366 }, { "epoch": 0.55, "learning_rate": 9.829288810654814e-07, "logits/chosen": -0.8279820084571838, "logits/rejected": -0.8159845471382141, "logps/chosen": -63.59606170654297, "logps/rejected": -52.621463775634766, "loss": 0.9359, "rewards/accuracies": 0.0, "rewards/chosen": 2.4984848499298096, "rewards/margins": -0.04614138603210449, "rewards/rejected": 2.544626235961914, "step": 3367 }, { "epoch": 0.55, "learning_rate": 9.828948156206124e-07, "logits/chosen": -0.2798517942428589, "logits/rejected": -0.29048869013786316, "logps/chosen": -49.06689453125, "logps/rejected": -76.22298431396484, "loss": 0.6703, "rewards/accuracies": 0.0, "rewards/chosen": -0.14345169067382812, "rewards/margins": -0.3441009521484375, "rewards/rejected": 0.20064926147460938, "step": 3368 }, { "epoch": 0.55, "learning_rate": 9.828607168122808e-07, "logits/chosen": -0.6688746809959412, "logits/rejected": -0.6857969164848328, "logps/chosen": -206.42623901367188, "logps/rejected": -54.69816589355469, "loss": 0.2416, "rewards/accuracies": 1.0, "rewards/chosen": 2.5407776832580566, "rewards/margins": 0.5743744373321533, "rewards/rejected": 1.9664032459259033, "step": 3369 }, { "epoch": 0.55, "learning_rate": 9.828265846428428e-07, "logits/chosen": -0.5513418316841125, "logits/rejected": -1.1821311712265015, "logps/chosen": -104.47120666503906, "logps/rejected": -63.24431228637695, "loss": 0.5344, "rewards/accuracies": 1.0, "rewards/chosen": 0.8226280212402344, "rewards/margins": 0.37508201599121094, "rewards/rejected": 0.44754600524902344, "step": 3370 }, { "epoch": 0.55, "learning_rate": 9.827924191146561e-07, "logits/chosen": -0.2528821527957916, "logits/rejected": -0.29483872652053833, "logps/chosen": -113.25142669677734, "logps/rejected": -81.65802764892578, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": 1.909066081047058, "rewards/margins": 0.5714806318283081, "rewards/rejected": 1.33758544921875, "step": 3371 }, { "epoch": 0.55, "learning_rate": 9.827582202300815e-07, "logits/chosen": -0.5406650900840759, "logits/rejected": -0.48458701372146606, "logps/chosen": -45.31688690185547, "logps/rejected": -4.408982276916504, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 1.6165672540664673, "rewards/margins": 0.6868910789489746, "rewards/rejected": 0.9296761751174927, "step": 3372 }, { "epoch": 0.55, "learning_rate": 9.827239879914817e-07, "logits/chosen": -0.679229199886322, "logits/rejected": -0.6683895587921143, "logps/chosen": -109.48095703125, "logps/rejected": -78.44222259521484, "loss": 1.9552, "rewards/accuracies": 0.0, "rewards/chosen": 1.3499641418457031, "rewards/margins": -1.7207748889923096, "rewards/rejected": 3.0707390308380127, "step": 3373 }, { "epoch": 0.55, "learning_rate": 9.82689722401222e-07, "logits/chosen": -0.7719979286193848, "logits/rejected": -0.7536543011665344, "logps/chosen": -59.50151824951172, "logps/rejected": -112.77511596679688, "loss": 1.3453, "rewards/accuracies": 0.0, "rewards/chosen": 2.5802438259124756, "rewards/margins": -2.5547845363616943, "rewards/rejected": 5.13502836227417, "step": 3374 }, { "epoch": 0.55, "learning_rate": 9.8265542346167e-07, "logits/chosen": -0.4350142478942871, "logits/rejected": -0.4350142478942871, "logps/chosen": -71.80789184570312, "logps/rejected": -71.80789184570312, "loss": 0.7232, "rewards/accuracies": 0.0, "rewards/chosen": 1.1126540899276733, "rewards/margins": 0.0, "rewards/rejected": 1.1126540899276733, "step": 3375 }, { "epoch": 0.55, "learning_rate": 9.826210911751949e-07, "logits/chosen": -0.597791314125061, "logits/rejected": -0.6150924563407898, "logps/chosen": -95.50759887695312, "logps/rejected": -58.67973327636719, "loss": 0.8549, "rewards/accuracies": 0.0, "rewards/chosen": 1.135151743888855, "rewards/margins": -0.4756941795349121, "rewards/rejected": 1.610845923423767, "step": 3376 }, { "epoch": 0.55, "learning_rate": 9.825867255441688e-07, "logits/chosen": -0.6697909832000732, "logits/rejected": -0.6508654952049255, "logps/chosen": -222.42767333984375, "logps/rejected": -149.68255615234375, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 1.0520858764648438, "rewards/margins": 0.4601394534111023, "rewards/rejected": 0.5919464230537415, "step": 3377 }, { "epoch": 0.55, "learning_rate": 9.825523265709665e-07, "logits/chosen": -0.45798420906066895, "logits/rejected": -0.40219658613204956, "logps/chosen": -62.69586944580078, "logps/rejected": -71.94125366210938, "loss": 0.6447, "rewards/accuracies": 0.0, "rewards/chosen": 0.7748954892158508, "rewards/margins": -0.8202605843544006, "rewards/rejected": 1.5951560735702515, "step": 3378 }, { "epoch": 0.55, "learning_rate": 9.825178942579645e-07, "logits/chosen": -0.7729661464691162, "logits/rejected": -0.6950631141662598, "logps/chosen": -40.306053161621094, "logps/rejected": -59.295982360839844, "loss": 0.6032, "rewards/accuracies": 0.0, "rewards/chosen": 1.2631019353866577, "rewards/margins": -0.3244895935058594, "rewards/rejected": 1.587591528892517, "step": 3379 }, { "epoch": 0.55, "learning_rate": 9.824834286075413e-07, "logits/chosen": -0.4771804213523865, "logits/rejected": -0.38722124695777893, "logps/chosen": -57.20030212402344, "logps/rejected": -16.347448348999023, "loss": 0.2306, "rewards/accuracies": 1.0, "rewards/chosen": 1.1946510076522827, "rewards/margins": 0.7806994915008545, "rewards/rejected": 0.41395148634910583, "step": 3380 }, { "epoch": 0.55, "learning_rate": 9.824489296220788e-07, "logits/chosen": -0.42030084133148193, "logits/rejected": -0.37299051880836487, "logps/chosen": -91.62550354003906, "logps/rejected": -46.68586730957031, "loss": 0.6545, "rewards/accuracies": 0.0, "rewards/chosen": 0.8823555111885071, "rewards/margins": -0.04000931978225708, "rewards/rejected": 0.9223648309707642, "step": 3381 }, { "epoch": 0.55, "learning_rate": 9.824143973039602e-07, "logits/chosen": -0.6774590015411377, "logits/rejected": -0.6833183765411377, "logps/chosen": -126.84619903564453, "logps/rejected": -59.54050064086914, "loss": 1.5797, "rewards/accuracies": 0.0, "rewards/chosen": 0.37940749526023865, "rewards/margins": -0.8698993921279907, "rewards/rejected": 1.2493069171905518, "step": 3382 }, { "epoch": 0.55, "learning_rate": 9.823798316555712e-07, "logits/chosen": -0.032481808215379715, "logits/rejected": -0.032481808215379715, "logps/chosen": -1.7160024642944336, "logps/rejected": -1.7160024642944336, "loss": 0.8671, "rewards/accuracies": 0.0, "rewards/chosen": 0.31664013862609863, "rewards/margins": 0.0, "rewards/rejected": 0.31664013862609863, "step": 3383 }, { "epoch": 0.55, "learning_rate": 9.823452326793002e-07, "logits/chosen": -0.46948251128196716, "logits/rejected": -0.3861880600452423, "logps/chosen": -63.409828186035156, "logps/rejected": -55.09085464477539, "loss": 0.9332, "rewards/accuracies": 1.0, "rewards/chosen": 1.743508219718933, "rewards/margins": 1.4861302375793457, "rewards/rejected": 0.2573780119419098, "step": 3384 }, { "epoch": 0.55, "learning_rate": 9.823106003775378e-07, "logits/chosen": -0.7232860326766968, "logits/rejected": -0.7424780130386353, "logps/chosen": -226.3299560546875, "logps/rejected": -98.9930648803711, "loss": 0.2533, "rewards/accuracies": 1.0, "rewards/chosen": 3.093554735183716, "rewards/margins": 3.4426095485687256, "rewards/rejected": -0.3490547239780426, "step": 3385 }, { "epoch": 0.55, "learning_rate": 9.822759347526765e-07, "logits/chosen": -0.9112890362739563, "logits/rejected": -0.8412654399871826, "logps/chosen": -86.59750366210938, "logps/rejected": -32.48856735229492, "loss": 0.6172, "rewards/accuracies": 1.0, "rewards/chosen": 0.8062378168106079, "rewards/margins": 0.3460388481616974, "rewards/rejected": 0.4601989686489105, "step": 3386 }, { "epoch": 0.55, "learning_rate": 9.822412358071113e-07, "logits/chosen": -0.5975553393363953, "logits/rejected": -0.5619800686836243, "logps/chosen": -88.81157684326172, "logps/rejected": -65.40711975097656, "loss": 0.2949, "rewards/accuracies": 1.0, "rewards/chosen": 1.8716392517089844, "rewards/margins": 0.31344521045684814, "rewards/rejected": 1.5581940412521362, "step": 3387 }, { "epoch": 0.55, "learning_rate": 9.822065035432398e-07, "logits/chosen": -0.32510870695114136, "logits/rejected": -0.29718056321144104, "logps/chosen": -40.7320556640625, "logps/rejected": -40.35924530029297, "loss": 0.535, "rewards/accuracies": 0.0, "rewards/chosen": 0.6096740961074829, "rewards/margins": -0.44678568840026855, "rewards/rejected": 1.0564597845077515, "step": 3388 }, { "epoch": 0.55, "learning_rate": 9.821717379634616e-07, "logits/chosen": -0.6111131310462952, "logits/rejected": -0.5783882141113281, "logps/chosen": -54.43649673461914, "logps/rejected": -57.577293395996094, "loss": 0.2377, "rewards/accuracies": 1.0, "rewards/chosen": 1.789699912071228, "rewards/margins": 0.9231479167938232, "rewards/rejected": 0.8665519952774048, "step": 3389 }, { "epoch": 0.55, "learning_rate": 9.821369390701787e-07, "logits/chosen": 0.043980225920677185, "logits/rejected": 0.05832497403025627, "logps/chosen": -1.7143607139587402, "logps/rejected": -22.38490104675293, "loss": 0.449, "rewards/accuracies": 1.0, "rewards/chosen": 0.3145659565925598, "rewards/margins": 0.41256821155548096, "rewards/rejected": -0.09800224751234055, "step": 3390 }, { "epoch": 0.55, "learning_rate": 9.821021068657953e-07, "logits/chosen": -0.6324988603591919, "logits/rejected": -0.6331576108932495, "logps/chosen": -129.4278106689453, "logps/rejected": -163.36016845703125, "loss": 0.2938, "rewards/accuracies": 1.0, "rewards/chosen": 4.162675380706787, "rewards/margins": 0.8217694759368896, "rewards/rejected": 3.3409059047698975, "step": 3391 }, { "epoch": 0.55, "learning_rate": 9.82067241352718e-07, "logits/chosen": -0.5980064868927002, "logits/rejected": -0.5248802304267883, "logps/chosen": -119.03443908691406, "logps/rejected": -105.02828979492188, "loss": 0.2641, "rewards/accuracies": 1.0, "rewards/chosen": 4.343055725097656, "rewards/margins": 1.9058196544647217, "rewards/rejected": 2.4372360706329346, "step": 3392 }, { "epoch": 0.55, "learning_rate": 9.820323425333556e-07, "logits/chosen": -0.5933482646942139, "logits/rejected": -0.34342312812805176, "logps/chosen": -180.2088623046875, "logps/rejected": -97.88047790527344, "loss": 0.3022, "rewards/accuracies": 1.0, "rewards/chosen": 4.115475654602051, "rewards/margins": 0.6875321865081787, "rewards/rejected": 3.427943468093872, "step": 3393 }, { "epoch": 0.55, "learning_rate": 9.819974104101196e-07, "logits/chosen": -0.9605256915092468, "logits/rejected": -0.9408430457115173, "logps/chosen": -79.11988830566406, "logps/rejected": -82.05599975585938, "loss": 1.185, "rewards/accuracies": 0.0, "rewards/chosen": 1.0021514892578125, "rewards/margins": -1.4959831237792969, "rewards/rejected": 2.4981346130371094, "step": 3394 }, { "epoch": 0.55, "learning_rate": 9.819624449854231e-07, "logits/chosen": -0.3208352029323578, "logits/rejected": -0.3208352029323578, "logps/chosen": -85.48818969726562, "logps/rejected": -85.48818969726562, "loss": 0.3767, "rewards/accuracies": 0.0, "rewards/chosen": 2.534210205078125, "rewards/margins": 0.0, "rewards/rejected": 2.534210205078125, "step": 3395 }, { "epoch": 0.55, "learning_rate": 9.81927446261682e-07, "logits/chosen": -0.2989448010921478, "logits/rejected": -0.29615435004234314, "logps/chosen": -74.07208251953125, "logps/rejected": -83.46974182128906, "loss": 0.77, "rewards/accuracies": 0.0, "rewards/chosen": 0.7870720028877258, "rewards/margins": -0.44328993558883667, "rewards/rejected": 1.2303619384765625, "step": 3396 }, { "epoch": 0.55, "learning_rate": 9.818924142413143e-07, "logits/chosen": -0.6810586452484131, "logits/rejected": -0.6551181077957153, "logps/chosen": -60.16474914550781, "logps/rejected": -91.19097900390625, "loss": 0.3965, "rewards/accuracies": 1.0, "rewards/chosen": 2.704864501953125, "rewards/margins": 0.6045119762420654, "rewards/rejected": 2.1003525257110596, "step": 3397 }, { "epoch": 0.55, "learning_rate": 9.818573489267406e-07, "logits/chosen": -0.3892533481121063, "logits/rejected": -0.19997547566890717, "logps/chosen": -52.12097930908203, "logps/rejected": -78.50364685058594, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 1.9871643781661987, "rewards/margins": 0.31508636474609375, "rewards/rejected": 1.672078013420105, "step": 3398 }, { "epoch": 0.55, "learning_rate": 9.818222503203835e-07, "logits/chosen": -1.056283712387085, "logits/rejected": -0.6172910332679749, "logps/chosen": -135.750244140625, "logps/rejected": -91.31712341308594, "loss": 0.6804, "rewards/accuracies": 0.0, "rewards/chosen": 0.8461197018623352, "rewards/margins": -1.0516600608825684, "rewards/rejected": 1.8977798223495483, "step": 3399 }, { "epoch": 0.55, "learning_rate": 9.81787118424668e-07, "logits/chosen": -0.7168309688568115, "logits/rejected": -0.6831295490264893, "logps/chosen": -127.78807067871094, "logps/rejected": -138.76248168945312, "loss": 0.9438, "rewards/accuracies": 0.0, "rewards/chosen": 1.2005493640899658, "rewards/margins": -1.6197662353515625, "rewards/rejected": 2.8203155994415283, "step": 3400 }, { "epoch": 0.55, "learning_rate": 9.817519532420212e-07, "logits/chosen": -0.41561582684516907, "logits/rejected": -0.41561582684516907, "logps/chosen": -148.85507202148438, "logps/rejected": -148.85507202148438, "loss": 0.5124, "rewards/accuracies": 0.0, "rewards/chosen": 1.3955535888671875, "rewards/margins": 0.0, "rewards/rejected": 1.3955535888671875, "step": 3401 }, { "epoch": 0.55, "learning_rate": 9.817167547748728e-07, "logits/chosen": -0.042816419154405594, "logits/rejected": -0.04372984543442726, "logps/chosen": -2.098785638809204, "logps/rejected": -1.158512830734253, "loss": 0.8353, "rewards/accuracies": 0.0, "rewards/chosen": 0.1773044914007187, "rewards/margins": -0.15178106725215912, "rewards/rejected": 0.3290855586528778, "step": 3402 }, { "epoch": 0.55, "learning_rate": 9.816815230256548e-07, "logits/chosen": -0.5705797672271729, "logits/rejected": -0.6246430277824402, "logps/chosen": -60.83264923095703, "logps/rejected": -52.45447540283203, "loss": 0.8925, "rewards/accuracies": 0.0, "rewards/chosen": 0.5468330383300781, "rewards/margins": -1.2271705865859985, "rewards/rejected": 1.7740036249160767, "step": 3403 }, { "epoch": 0.55, "learning_rate": 9.816462579968013e-07, "logits/chosen": -1.1551746129989624, "logits/rejected": -1.141668677330017, "logps/chosen": -100.26203918457031, "logps/rejected": -107.1851806640625, "loss": 1.5422, "rewards/accuracies": 0.0, "rewards/chosen": 1.3264816999435425, "rewards/margins": -2.1299362182617188, "rewards/rejected": 3.4564177989959717, "step": 3404 }, { "epoch": 0.55, "learning_rate": 9.816109596907485e-07, "logits/chosen": -0.47226089239120483, "logits/rejected": -0.46242159605026245, "logps/chosen": -80.93966674804688, "logps/rejected": -57.96971893310547, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 4.389359951019287, "rewards/margins": 1.738875389099121, "rewards/rejected": 2.650484561920166, "step": 3405 }, { "epoch": 0.55, "learning_rate": 9.815756281099357e-07, "logits/chosen": -0.41984137892723083, "logits/rejected": -0.4245446026325226, "logps/chosen": -17.476486206054688, "logps/rejected": -2.9406163692474365, "loss": 0.7344, "rewards/accuracies": 1.0, "rewards/chosen": 0.16207276284694672, "rewards/margins": 0.04586704820394516, "rewards/rejected": 0.11620571464300156, "step": 3406 }, { "epoch": 0.55, "learning_rate": 9.815402632568035e-07, "logits/chosen": -0.6987470388412476, "logits/rejected": -0.7495698928833008, "logps/chosen": -95.27177429199219, "logps/rejected": -167.82455444335938, "loss": 1.2017, "rewards/accuracies": 0.0, "rewards/chosen": 0.9117538332939148, "rewards/margins": -1.6556320190429688, "rewards/rejected": 2.5673859119415283, "step": 3407 }, { "epoch": 0.55, "learning_rate": 9.815048651337956e-07, "logits/chosen": -0.44578951597213745, "logits/rejected": -0.3772051930427551, "logps/chosen": -59.47479248046875, "logps/rejected": -73.31594848632812, "loss": 1.062, "rewards/accuracies": 0.0, "rewards/chosen": 1.4242295026779175, "rewards/margins": -0.5135345458984375, "rewards/rejected": 1.937764048576355, "step": 3408 }, { "epoch": 0.55, "learning_rate": 9.814694337433576e-07, "logits/chosen": -0.5915122032165527, "logits/rejected": -0.5088091492652893, "logps/chosen": -94.79362487792969, "logps/rejected": -69.6537857055664, "loss": 0.3202, "rewards/accuracies": 1.0, "rewards/chosen": 2.9319145679473877, "rewards/margins": 1.8735755681991577, "rewards/rejected": 1.05833899974823, "step": 3409 }, { "epoch": 0.55, "learning_rate": 9.814339690879374e-07, "logits/chosen": -0.2864489257335663, "logits/rejected": -0.24219681322574615, "logps/chosen": -72.82655334472656, "logps/rejected": -52.42107391357422, "loss": 0.586, "rewards/accuracies": 0.0, "rewards/chosen": 1.3118469715118408, "rewards/margins": -0.4209449291229248, "rewards/rejected": 1.7327919006347656, "step": 3410 }, { "epoch": 0.55, "learning_rate": 9.813984711699852e-07, "logits/chosen": -0.598008394241333, "logits/rejected": -0.5203381180763245, "logps/chosen": -134.1104736328125, "logps/rejected": -107.63510131835938, "loss": 0.1147, "rewards/accuracies": 1.0, "rewards/chosen": 3.6993424892425537, "rewards/margins": 1.3842194080352783, "rewards/rejected": 2.3151230812072754, "step": 3411 }, { "epoch": 0.55, "learning_rate": 9.813629399919539e-07, "logits/chosen": -0.3124171793460846, "logits/rejected": -0.3455839455127716, "logps/chosen": -88.66215515136719, "logps/rejected": -39.97351837158203, "loss": 1.3548, "rewards/accuracies": 0.0, "rewards/chosen": 0.18735580146312714, "rewards/margins": -1.873533010482788, "rewards/rejected": 2.0608887672424316, "step": 3412 }, { "epoch": 0.55, "learning_rate": 9.81327375556298e-07, "logits/chosen": -0.8919181227684021, "logits/rejected": -0.8075422644615173, "logps/chosen": -50.342674255371094, "logps/rejected": -48.15639877319336, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 2.236212968826294, "rewards/margins": 0.3432224988937378, "rewards/rejected": 1.8929904699325562, "step": 3413 }, { "epoch": 0.55, "learning_rate": 9.812917778654747e-07, "logits/chosen": -0.8819672465324402, "logits/rejected": -0.8190591335296631, "logps/chosen": -106.19790649414062, "logps/rejected": -16.576154708862305, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 1.957170844078064, "rewards/margins": 1.7445508241653442, "rewards/rejected": 0.21261997520923615, "step": 3414 }, { "epoch": 0.55, "learning_rate": 9.812561469219439e-07, "logits/chosen": -0.8918707966804504, "logits/rejected": -0.8484019637107849, "logps/chosen": -149.37106323242188, "logps/rejected": -157.18740844726562, "loss": 0.9333, "rewards/accuracies": 0.0, "rewards/chosen": 5.769692897796631, "rewards/margins": -0.8033390045166016, "rewards/rejected": 6.573031902313232, "step": 3415 }, { "epoch": 0.55, "learning_rate": 9.812204827281667e-07, "logits/chosen": -0.14458169043064117, "logits/rejected": -0.14458169043064117, "logps/chosen": -59.4019775390625, "logps/rejected": -59.4019775390625, "loss": 0.3612, "rewards/accuracies": 0.0, "rewards/chosen": 0.997271716594696, "rewards/margins": 0.0, "rewards/rejected": 0.997271716594696, "step": 3416 }, { "epoch": 0.55, "learning_rate": 9.811847852866078e-07, "logits/chosen": -0.5437775254249573, "logits/rejected": -0.5505305528640747, "logps/chosen": -59.853492736816406, "logps/rejected": -192.7327117919922, "loss": 0.5746, "rewards/accuracies": 0.0, "rewards/chosen": 0.5218093991279602, "rewards/margins": -0.7231857180595398, "rewards/rejected": 1.2449951171875, "step": 3417 }, { "epoch": 0.55, "learning_rate": 9.811490545997329e-07, "logits/chosen": -0.2749660313129425, "logits/rejected": -0.28659796714782715, "logps/chosen": -9.368318557739258, "logps/rejected": -11.300507545471191, "loss": 0.3224, "rewards/accuracies": 1.0, "rewards/chosen": 0.008110237307846546, "rewards/margins": 0.12530088424682617, "rewards/rejected": -0.1171906515955925, "step": 3418 }, { "epoch": 0.55, "learning_rate": 9.811132906700113e-07, "logits/chosen": -0.94517582654953, "logits/rejected": -0.9778159260749817, "logps/chosen": -151.48268127441406, "logps/rejected": -158.3905029296875, "loss": 0.7718, "rewards/accuracies": 0.0, "rewards/chosen": 3.124812364578247, "rewards/margins": -1.1758482456207275, "rewards/rejected": 4.300660610198975, "step": 3419 }, { "epoch": 0.56, "learning_rate": 9.810774934999136e-07, "logits/chosen": -0.21421118080615997, "logits/rejected": -0.22619597613811493, "logps/chosen": -3.8517231941223145, "logps/rejected": -2.0817391872406006, "loss": 1.8235, "rewards/accuracies": 0.0, "rewards/chosen": 0.47172999382019043, "rewards/margins": -0.07683724164962769, "rewards/rejected": 0.5485672354698181, "step": 3420 }, { "epoch": 0.56, "learning_rate": 9.810416630919129e-07, "logits/chosen": -0.5087838172912598, "logits/rejected": -0.42473411560058594, "logps/chosen": -71.6331787109375, "logps/rejected": -10.64017105102539, "loss": 0.2424, "rewards/accuracies": 1.0, "rewards/chosen": 1.7817963361740112, "rewards/margins": 0.6881771087646484, "rewards/rejected": 1.0936192274093628, "step": 3421 }, { "epoch": 0.56, "learning_rate": 9.81005799448485e-07, "logits/chosen": -0.8018863797187805, "logits/rejected": -0.7048745155334473, "logps/chosen": -100.5158920288086, "logps/rejected": -17.457550048828125, "loss": 0.2389, "rewards/accuracies": 1.0, "rewards/chosen": 0.6920387148857117, "rewards/margins": 0.5631504058837891, "rewards/rejected": 0.1288883239030838, "step": 3422 }, { "epoch": 0.56, "learning_rate": 9.80969902572108e-07, "logits/chosen": -0.6027565598487854, "logits/rejected": -0.15209120512008667, "logps/chosen": -256.3921813964844, "logps/rejected": -149.63612365722656, "loss": 0.9355, "rewards/accuracies": 1.0, "rewards/chosen": 3.9770050048828125, "rewards/margins": 0.6255156993865967, "rewards/rejected": 3.351489305496216, "step": 3423 }, { "epoch": 0.56, "learning_rate": 9.809339724652612e-07, "logits/chosen": -0.36944764852523804, "logits/rejected": -0.3030844032764435, "logps/chosen": -129.51434326171875, "logps/rejected": -87.74385070800781, "loss": 0.8386, "rewards/accuracies": 0.0, "rewards/chosen": 1.113250732421875, "rewards/margins": -0.18668973445892334, "rewards/rejected": 1.2999404668807983, "step": 3424 }, { "epoch": 0.56, "learning_rate": 9.80898009130428e-07, "logits/chosen": -0.5304474234580994, "logits/rejected": -0.47021475434303284, "logps/chosen": -78.69404602050781, "logps/rejected": -78.03564453125, "loss": 0.3851, "rewards/accuracies": 1.0, "rewards/chosen": 3.0208892822265625, "rewards/margins": 0.5615324974060059, "rewards/rejected": 2.4593567848205566, "step": 3425 }, { "epoch": 0.56, "learning_rate": 9.808620125700924e-07, "logits/chosen": -0.5813329219818115, "logits/rejected": -0.13066409528255463, "logps/chosen": -135.74972534179688, "logps/rejected": -150.52732849121094, "loss": 0.593, "rewards/accuracies": 1.0, "rewards/chosen": 2.8837356567382812, "rewards/margins": 1.2588698863983154, "rewards/rejected": 1.6248657703399658, "step": 3426 }, { "epoch": 0.56, "learning_rate": 9.808259827867416e-07, "logits/chosen": -0.10219044983386993, "logits/rejected": -0.10219044983386993, "logps/chosen": -81.32051086425781, "logps/rejected": -81.32051086425781, "loss": 0.4443, "rewards/accuracies": 0.0, "rewards/chosen": 0.7248008847236633, "rewards/margins": 0.0, "rewards/rejected": 0.7248008847236633, "step": 3427 }, { "epoch": 0.56, "learning_rate": 9.807899197828653e-07, "logits/chosen": -1.0702069997787476, "logits/rejected": -0.956355094909668, "logps/chosen": -105.94293212890625, "logps/rejected": -59.21302795410156, "loss": 0.765, "rewards/accuracies": 1.0, "rewards/chosen": 3.527235507965088, "rewards/margins": 1.7418793439865112, "rewards/rejected": 1.7853561639785767, "step": 3428 }, { "epoch": 0.56, "learning_rate": 9.807538235609547e-07, "logits/chosen": -0.6047595739364624, "logits/rejected": -0.4636518359184265, "logps/chosen": -67.05436706542969, "logps/rejected": -77.00541687011719, "loss": 1.6471, "rewards/accuracies": 0.0, "rewards/chosen": 1.622473120689392, "rewards/margins": -0.4551805257797241, "rewards/rejected": 2.077653646469116, "step": 3429 }, { "epoch": 0.56, "learning_rate": 9.807176941235038e-07, "logits/chosen": -0.42605453729629517, "logits/rejected": -0.4717646837234497, "logps/chosen": -128.07810974121094, "logps/rejected": -195.03099060058594, "loss": 2.6529, "rewards/accuracies": 0.0, "rewards/chosen": 3.796473741531372, "rewards/margins": -2.575613260269165, "rewards/rejected": 6.372087001800537, "step": 3430 }, { "epoch": 0.56, "learning_rate": 9.806815314730088e-07, "logits/chosen": -0.4038234353065491, "logits/rejected": -0.359007328748703, "logps/chosen": -61.11134338378906, "logps/rejected": -65.70289611816406, "loss": 0.5142, "rewards/accuracies": 1.0, "rewards/chosen": 1.4465675354003906, "rewards/margins": 0.5623466372489929, "rewards/rejected": 0.8842208981513977, "step": 3431 }, { "epoch": 0.56, "learning_rate": 9.806453356119682e-07, "logits/chosen": -0.4114409387111664, "logits/rejected": -0.4422663450241089, "logps/chosen": -112.46932220458984, "logps/rejected": -115.79371643066406, "loss": 1.5777, "rewards/accuracies": 0.0, "rewards/chosen": 1.589952826499939, "rewards/margins": -3.0708365440368652, "rewards/rejected": 4.660789489746094, "step": 3432 }, { "epoch": 0.56, "learning_rate": 9.806091065428829e-07, "logits/chosen": -0.8923380374908447, "logits/rejected": -0.865195631980896, "logps/chosen": -212.81134033203125, "logps/rejected": -70.38449096679688, "loss": 0.3628, "rewards/accuracies": 1.0, "rewards/chosen": 4.320309638977051, "rewards/margins": 1.4892046451568604, "rewards/rejected": 2.8311049938201904, "step": 3433 }, { "epoch": 0.56, "learning_rate": 9.80572844268256e-07, "logits/chosen": -0.7084386348724365, "logits/rejected": -0.6553679704666138, "logps/chosen": -72.16853332519531, "logps/rejected": -63.53925323486328, "loss": 0.7415, "rewards/accuracies": 1.0, "rewards/chosen": 2.568063497543335, "rewards/margins": 0.0023050308227539062, "rewards/rejected": 2.565758466720581, "step": 3434 }, { "epoch": 0.56, "learning_rate": 9.805365487905926e-07, "logits/chosen": -0.6294546723365784, "logits/rejected": -0.5688015222549438, "logps/chosen": -77.10552978515625, "logps/rejected": -78.94671630859375, "loss": 0.2943, "rewards/accuracies": 1.0, "rewards/chosen": 2.482081651687622, "rewards/margins": 1.3605133295059204, "rewards/rejected": 1.1215683221817017, "step": 3435 }, { "epoch": 0.56, "learning_rate": 9.805002201124006e-07, "logits/chosen": -0.6022735238075256, "logits/rejected": -0.5730189085006714, "logps/chosen": -82.49373626708984, "logps/rejected": -91.08857727050781, "loss": 1.6844, "rewards/accuracies": 0.0, "rewards/chosen": 1.8517769575119019, "rewards/margins": -1.959572672843933, "rewards/rejected": 3.811349630355835, "step": 3436 }, { "epoch": 0.56, "learning_rate": 9.8046385823619e-07, "logits/chosen": -0.7929219007492065, "logits/rejected": -0.7393743395805359, "logps/chosen": -67.1346435546875, "logps/rejected": -94.26992797851562, "loss": 0.3205, "rewards/accuracies": 1.0, "rewards/chosen": 1.1514511108398438, "rewards/margins": 0.33474576473236084, "rewards/rejected": 0.8167053461074829, "step": 3437 }, { "epoch": 0.56, "learning_rate": 9.804274631644728e-07, "logits/chosen": -0.7829886674880981, "logits/rejected": -0.7518757581710815, "logps/chosen": -137.13589477539062, "logps/rejected": -87.03553771972656, "loss": 0.4707, "rewards/accuracies": 0.0, "rewards/chosen": 3.1787445545196533, "rewards/margins": -0.3701355457305908, "rewards/rejected": 3.548880100250244, "step": 3438 }, { "epoch": 0.56, "learning_rate": 9.80391034899764e-07, "logits/chosen": -0.40955650806427, "logits/rejected": -0.4270111918449402, "logps/chosen": -65.19136810302734, "logps/rejected": -153.65457153320312, "loss": 0.2803, "rewards/accuracies": 1.0, "rewards/chosen": 1.2112175226211548, "rewards/margins": 1.276018500328064, "rewards/rejected": -0.06480102986097336, "step": 3439 }, { "epoch": 0.56, "learning_rate": 9.8035457344458e-07, "logits/chosen": -0.7685928344726562, "logits/rejected": -0.8033947348594666, "logps/chosen": -120.66912078857422, "logps/rejected": -123.64695739746094, "loss": 2.4527, "rewards/accuracies": 0.0, "rewards/chosen": 1.6391884088516235, "rewards/margins": -4.841277599334717, "rewards/rejected": 6.480465888977051, "step": 3440 }, { "epoch": 0.56, "learning_rate": 9.803180788014402e-07, "logits/chosen": -0.17758244276046753, "logits/rejected": -0.18544556200504303, "logps/chosen": -5.0753889083862305, "logps/rejected": -3.907240390777588, "loss": 1.2343, "rewards/accuracies": 0.0, "rewards/chosen": 0.2357538789510727, "rewards/margins": -0.07024334371089935, "rewards/rejected": 0.30599722266197205, "step": 3441 }, { "epoch": 0.56, "learning_rate": 9.802815509728662e-07, "logits/chosen": -0.5637421607971191, "logits/rejected": -0.5637421607971191, "logps/chosen": -55.9606819152832, "logps/rejected": -55.9606819152832, "loss": 0.5269, "rewards/accuracies": 0.0, "rewards/chosen": 0.9607387781143188, "rewards/margins": 0.0, "rewards/rejected": 0.9607387781143188, "step": 3442 }, { "epoch": 0.56, "learning_rate": 9.802449899613811e-07, "logits/chosen": -0.412332147359848, "logits/rejected": -0.3315925598144531, "logps/chosen": -93.84331512451172, "logps/rejected": -59.58799743652344, "loss": 0.5868, "rewards/accuracies": 1.0, "rewards/chosen": 1.5627418756484985, "rewards/margins": 0.23864972591400146, "rewards/rejected": 1.324092149734497, "step": 3443 }, { "epoch": 0.56, "learning_rate": 9.802083957695114e-07, "logits/chosen": -0.7362022399902344, "logits/rejected": -0.7094830870628357, "logps/chosen": -93.2350845336914, "logps/rejected": -58.631553649902344, "loss": 0.5069, "rewards/accuracies": 0.0, "rewards/chosen": 1.4281028509140015, "rewards/margins": -0.48997199535369873, "rewards/rejected": 1.9180748462677002, "step": 3444 }, { "epoch": 0.56, "learning_rate": 9.801717683997856e-07, "logits/chosen": -0.5281264185905457, "logits/rejected": -0.5457303524017334, "logps/chosen": -79.56654357910156, "logps/rejected": -26.205684661865234, "loss": 0.9836, "rewards/accuracies": 1.0, "rewards/chosen": 1.2832252979278564, "rewards/margins": 0.5545029044151306, "rewards/rejected": 0.7287223935127258, "step": 3445 }, { "epoch": 0.56, "learning_rate": 9.801351078547337e-07, "logits/chosen": -0.28829777240753174, "logits/rejected": -0.2077483981847763, "logps/chosen": -57.63677978515625, "logps/rejected": -84.84246063232422, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": 1.0847961902618408, "rewards/margins": 0.5083199143409729, "rewards/rejected": 0.5764762759208679, "step": 3446 }, { "epoch": 0.56, "learning_rate": 9.800984141368891e-07, "logits/chosen": -0.8291355967521667, "logits/rejected": -0.6844117641448975, "logps/chosen": -129.0357666015625, "logps/rejected": -152.76199340820312, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 5.988018989562988, "rewards/margins": 1.8966097831726074, "rewards/rejected": 4.091409206390381, "step": 3447 }, { "epoch": 0.56, "learning_rate": 9.80061687248787e-07, "logits/chosen": -0.7321898341178894, "logits/rejected": -0.6749422550201416, "logps/chosen": -98.02482604980469, "logps/rejected": -81.85939025878906, "loss": 0.7179, "rewards/accuracies": 0.0, "rewards/chosen": 1.614338755607605, "rewards/margins": -0.7153304815292358, "rewards/rejected": 2.329669237136841, "step": 3448 }, { "epoch": 0.56, "learning_rate": 9.800249271929643e-07, "logits/chosen": -0.5816813111305237, "logits/rejected": -0.6332587003707886, "logps/chosen": -71.0980453491211, "logps/rejected": -98.63963317871094, "loss": 1.7724, "rewards/accuracies": 0.0, "rewards/chosen": 1.7177711725234985, "rewards/margins": -2.7760658264160156, "rewards/rejected": 4.493836879730225, "step": 3449 }, { "epoch": 0.56, "learning_rate": 9.799881339719614e-07, "logits/chosen": -0.6005874276161194, "logits/rejected": -0.6666544079780579, "logps/chosen": -99.81002044677734, "logps/rejected": -99.4112548828125, "loss": 1.6242, "rewards/accuracies": 0.0, "rewards/chosen": 0.32030564546585083, "rewards/margins": -1.3555190563201904, "rewards/rejected": 1.675824761390686, "step": 3450 }, { "epoch": 0.56, "learning_rate": 9.7995130758832e-07, "logits/chosen": -0.9916425347328186, "logits/rejected": -0.8689231872558594, "logps/chosen": -116.28025817871094, "logps/rejected": -22.28779411315918, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 1.0923141241073608, "rewards/margins": 0.5834577083587646, "rewards/rejected": 0.5088564157485962, "step": 3451 }, { "epoch": 0.56, "learning_rate": 9.799144480445847e-07, "logits/chosen": -0.6917713284492493, "logits/rejected": -0.6337227821350098, "logps/chosen": -187.9512939453125, "logps/rejected": -111.8926773071289, "loss": 2.1886, "rewards/accuracies": 0.0, "rewards/chosen": 3.5123047828674316, "rewards/margins": -2.650470733642578, "rewards/rejected": 6.16277551651001, "step": 3452 }, { "epoch": 0.56, "learning_rate": 9.798775553433022e-07, "logits/chosen": -0.26660484075546265, "logits/rejected": -0.2816479802131653, "logps/chosen": -10.727935791015625, "logps/rejected": -23.933149337768555, "loss": 0.573, "rewards/accuracies": 0.0, "rewards/chosen": -0.23116055130958557, "rewards/margins": -0.33195364475250244, "rewards/rejected": 0.10079307854175568, "step": 3453 }, { "epoch": 0.56, "learning_rate": 9.798406294870209e-07, "logits/chosen": -0.6212602257728577, "logits/rejected": -0.6675576567649841, "logps/chosen": -115.12501525878906, "logps/rejected": -129.328857421875, "loss": 2.4375, "rewards/accuracies": 0.0, "rewards/chosen": 1.828576683998108, "rewards/margins": -3.8131227493286133, "rewards/rejected": 5.641699314117432, "step": 3454 }, { "epoch": 0.56, "learning_rate": 9.798036704782925e-07, "logits/chosen": -0.7007220387458801, "logits/rejected": -0.646195650100708, "logps/chosen": -63.30693054199219, "logps/rejected": -104.01759338378906, "loss": 0.5987, "rewards/accuracies": 0.0, "rewards/chosen": 1.6308822631835938, "rewards/margins": -0.10286867618560791, "rewards/rejected": 1.7337509393692017, "step": 3455 }, { "epoch": 0.56, "learning_rate": 9.797666783196706e-07, "logits/chosen": -0.6313053965568542, "logits/rejected": -0.619291365146637, "logps/chosen": -64.4517822265625, "logps/rejected": -32.217132568359375, "loss": 0.9967, "rewards/accuracies": 0.0, "rewards/chosen": 1.5030921697616577, "rewards/margins": -0.2625160217285156, "rewards/rejected": 1.7656081914901733, "step": 3456 }, { "epoch": 0.56, "learning_rate": 9.797296530137107e-07, "logits/chosen": -0.34193533658981323, "logits/rejected": -0.3200160562992096, "logps/chosen": -67.07167053222656, "logps/rejected": -58.31195831298828, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 2.3824234008789062, "rewards/margins": 1.108361840248108, "rewards/rejected": 1.2740615606307983, "step": 3457 }, { "epoch": 0.56, "learning_rate": 9.796925945629709e-07, "logits/chosen": -0.6854826211929321, "logits/rejected": -0.7105558514595032, "logps/chosen": -83.83668518066406, "logps/rejected": -138.66195678710938, "loss": 0.6294, "rewards/accuracies": 1.0, "rewards/chosen": 0.5213684439659119, "rewards/margins": 0.49095919728279114, "rewards/rejected": 0.03040924109518528, "step": 3458 }, { "epoch": 0.56, "learning_rate": 9.796555029700118e-07, "logits/chosen": -0.7552865147590637, "logits/rejected": -0.7444643378257751, "logps/chosen": -207.610107421875, "logps/rejected": -168.78024291992188, "loss": 0.9265, "rewards/accuracies": 0.0, "rewards/chosen": 4.8916168212890625, "rewards/margins": -1.00203275680542, "rewards/rejected": 5.893649578094482, "step": 3459 }, { "epoch": 0.56, "learning_rate": 9.79618378237396e-07, "logits/chosen": -0.7426276206970215, "logits/rejected": -0.6605854630470276, "logps/chosen": -253.4423828125, "logps/rejected": -95.41120910644531, "loss": 0.3399, "rewards/accuracies": 1.0, "rewards/chosen": 3.885455369949341, "rewards/margins": 3.621424913406372, "rewards/rejected": 0.26403045654296875, "step": 3460 }, { "epoch": 0.56, "learning_rate": 9.795812203676885e-07, "logits/chosen": -0.8197286128997803, "logits/rejected": -0.7938342690467834, "logps/chosen": -35.29824447631836, "logps/rejected": -39.15331268310547, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 1.8566677570343018, "rewards/margins": 0.45262837409973145, "rewards/rejected": 1.4040393829345703, "step": 3461 }, { "epoch": 0.56, "learning_rate": 9.795440293634566e-07, "logits/chosen": -0.8562712073326111, "logits/rejected": -0.8338302969932556, "logps/chosen": -125.12210083007812, "logps/rejected": -74.45365142822266, "loss": 1.1167, "rewards/accuracies": 0.0, "rewards/chosen": 0.501635730266571, "rewards/margins": -1.4850304126739502, "rewards/rejected": 1.9866660833358765, "step": 3462 }, { "epoch": 0.56, "learning_rate": 9.795068052272697e-07, "logits/chosen": -0.5769624710083008, "logits/rejected": -0.43484243750572205, "logps/chosen": -164.82923889160156, "logps/rejected": -79.9316177368164, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 4.189175605773926, "rewards/margins": 2.613048553466797, "rewards/rejected": 1.5761269330978394, "step": 3463 }, { "epoch": 0.56, "learning_rate": 9.794695479616995e-07, "logits/chosen": -0.04222482070326805, "logits/rejected": -0.04416082426905632, "logps/chosen": -4.380559921264648, "logps/rejected": -8.91730785369873, "loss": 0.863, "rewards/accuracies": 1.0, "rewards/chosen": 0.22395172715187073, "rewards/margins": 0.31272250413894653, "rewards/rejected": -0.08877076953649521, "step": 3464 }, { "epoch": 0.56, "learning_rate": 9.794322575693206e-07, "logits/chosen": -0.17569860816001892, "logits/rejected": -0.17569860816001892, "logps/chosen": -4.707582950592041, "logps/rejected": -4.707582950592041, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.00413966178894043, "rewards/margins": 0.0, "rewards/rejected": -0.00413966178894043, "step": 3465 }, { "epoch": 0.56, "learning_rate": 9.793949340527089e-07, "logits/chosen": -0.8075571656227112, "logits/rejected": -0.7928254008293152, "logps/chosen": -232.66119384765625, "logps/rejected": -173.0408935546875, "loss": 0.3131, "rewards/accuracies": 1.0, "rewards/chosen": 4.095757961273193, "rewards/margins": 0.7591521739959717, "rewards/rejected": 3.3366057872772217, "step": 3466 }, { "epoch": 0.56, "learning_rate": 9.793575774144435e-07, "logits/chosen": -0.8638606071472168, "logits/rejected": -0.869555652141571, "logps/chosen": -134.93858337402344, "logps/rejected": -25.39401626586914, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/chosen": 1.0710738897323608, "rewards/margins": 1.1249911785125732, "rewards/rejected": -0.05391731485724449, "step": 3467 }, { "epoch": 0.56, "learning_rate": 9.79320187657105e-07, "logits/chosen": -1.1100252866744995, "logits/rejected": -1.0861186981201172, "logps/chosen": -90.19157409667969, "logps/rejected": -118.64612579345703, "loss": 1.0706, "rewards/accuracies": 0.0, "rewards/chosen": 0.9368011355400085, "rewards/margins": -1.9458153247833252, "rewards/rejected": 2.8826165199279785, "step": 3468 }, { "epoch": 0.56, "learning_rate": 9.79282764783277e-07, "logits/chosen": -0.5495539307594299, "logits/rejected": -0.3166946768760681, "logps/chosen": -125.92383575439453, "logps/rejected": -89.78021240234375, "loss": 0.3875, "rewards/accuracies": 1.0, "rewards/chosen": 1.441048502922058, "rewards/margins": 0.040155887603759766, "rewards/rejected": 1.4008926153182983, "step": 3469 }, { "epoch": 0.56, "learning_rate": 9.792453087955453e-07, "logits/chosen": -0.6105589866638184, "logits/rejected": -0.6299626231193542, "logps/chosen": -47.717864990234375, "logps/rejected": -26.90775489807129, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 0.0873844176530838, "rewards/margins": 0.39603614807128906, "rewards/rejected": -0.30865174531936646, "step": 3470 }, { "epoch": 0.56, "learning_rate": 9.79207819696497e-07, "logits/chosen": -0.29947301745414734, "logits/rejected": -0.22259467840194702, "logps/chosen": -18.68667984008789, "logps/rejected": -80.61430358886719, "loss": 1.3553, "rewards/accuracies": 0.0, "rewards/chosen": 0.22983188927173615, "rewards/margins": -2.616905689239502, "rewards/rejected": 2.8467376232147217, "step": 3471 }, { "epoch": 0.56, "learning_rate": 9.791702974887228e-07, "logits/chosen": -0.536187469959259, "logits/rejected": -0.4465210735797882, "logps/chosen": -124.97715759277344, "logps/rejected": -138.655029296875, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.6012771725654602, "rewards/margins": 0.2702530026435852, "rewards/rejected": 0.331024169921875, "step": 3472 }, { "epoch": 0.56, "learning_rate": 9.79132742174815e-07, "logits/chosen": -0.7753058671951294, "logits/rejected": -0.7145873308181763, "logps/chosen": -139.92076110839844, "logps/rejected": -169.1090545654297, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 5.6291399002075195, "rewards/margins": 1.41668701171875, "rewards/rejected": 4.2124528884887695, "step": 3473 }, { "epoch": 0.56, "learning_rate": 9.790951537573683e-07, "logits/chosen": -0.5947755575180054, "logits/rejected": -0.5870063900947571, "logps/chosen": -57.15764617919922, "logps/rejected": -73.18061828613281, "loss": 0.4054, "rewards/accuracies": 0.0, "rewards/chosen": 1.1306403875350952, "rewards/margins": -0.1454247236251831, "rewards/rejected": 1.2760651111602783, "step": 3474 }, { "epoch": 0.56, "learning_rate": 9.790575322389797e-07, "logits/chosen": -0.33017662167549133, "logits/rejected": -0.36402082443237305, "logps/chosen": -26.822513580322266, "logps/rejected": -75.38526153564453, "loss": 0.8168, "rewards/accuracies": 0.0, "rewards/chosen": 0.2649608552455902, "rewards/margins": -1.3798458576202393, "rewards/rejected": 1.6448067426681519, "step": 3475 }, { "epoch": 0.56, "learning_rate": 9.790198776222487e-07, "logits/chosen": -0.6325555443763733, "logits/rejected": -0.5341715216636658, "logps/chosen": -66.04578399658203, "logps/rejected": -63.287696838378906, "loss": 0.4111, "rewards/accuracies": 1.0, "rewards/chosen": 2.708742618560791, "rewards/margins": 0.5749282836914062, "rewards/rejected": 2.1338143348693848, "step": 3476 }, { "epoch": 0.56, "learning_rate": 9.789821899097766e-07, "logits/chosen": -0.32920557260513306, "logits/rejected": -0.08353034406900406, "logps/chosen": -74.4881362915039, "logps/rejected": -49.92205810546875, "loss": 0.5472, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532540082931519, "rewards/margins": 1.4373539686203003, "rewards/rejected": 0.11590003967285156, "step": 3477 }, { "epoch": 0.56, "learning_rate": 9.789444691041672e-07, "logits/chosen": -0.6817445158958435, "logits/rejected": -0.7088775038719177, "logps/chosen": -38.66657257080078, "logps/rejected": -213.53848266601562, "loss": 2.5298, "rewards/accuracies": 0.0, "rewards/chosen": 1.75786292552948, "rewards/margins": -3.456529140472412, "rewards/rejected": 5.214392185211182, "step": 3478 }, { "epoch": 0.56, "learning_rate": 9.789067152080268e-07, "logits/chosen": -0.6349693536758423, "logits/rejected": -0.6349693536758423, "logps/chosen": -75.70757293701172, "logps/rejected": -75.70757293701172, "loss": 0.3736, "rewards/accuracies": 0.0, "rewards/chosen": 1.4968093633651733, "rewards/margins": 0.0, "rewards/rejected": 1.4968093633651733, "step": 3479 }, { "epoch": 0.56, "learning_rate": 9.78868928223964e-07, "logits/chosen": -0.6796392202377319, "logits/rejected": -0.6796392202377319, "logps/chosen": -27.50164222717285, "logps/rejected": -27.50164222717285, "loss": 0.3926, "rewards/accuracies": 0.0, "rewards/chosen": 1.0752824544906616, "rewards/margins": 0.0, "rewards/rejected": 1.0752824544906616, "step": 3480 }, { "epoch": 0.57, "learning_rate": 9.788311081545893e-07, "logits/chosen": -0.24192577600479126, "logits/rejected": -0.24325089156627655, "logps/chosen": -6.54388427734375, "logps/rejected": -1.7893675565719604, "loss": 1.1312, "rewards/accuracies": 0.0, "rewards/chosen": 0.17468976974487305, "rewards/margins": -0.3767484426498413, "rewards/rejected": 0.5514382123947144, "step": 3481 }, { "epoch": 0.57, "learning_rate": 9.787932550025157e-07, "logits/chosen": -0.27366793155670166, "logits/rejected": -0.2887999713420868, "logps/chosen": -16.701709747314453, "logps/rejected": -25.59781265258789, "loss": 0.5953, "rewards/accuracies": 0.0, "rewards/chosen": 0.2286764234304428, "rewards/margins": -0.08708496391773224, "rewards/rejected": 0.31576138734817505, "step": 3482 }, { "epoch": 0.57, "learning_rate": 9.787553687703584e-07, "logits/chosen": -1.2729476690292358, "logits/rejected": -1.2577638626098633, "logps/chosen": -72.61624145507812, "logps/rejected": -35.754249572753906, "loss": 1.0194, "rewards/accuracies": 1.0, "rewards/chosen": 1.4896621704101562, "rewards/margins": 1.1174671649932861, "rewards/rejected": 0.3721950650215149, "step": 3483 }, { "epoch": 0.57, "learning_rate": 9.787174494607354e-07, "logits/chosen": -0.41178253293037415, "logits/rejected": -0.4107564687728882, "logps/chosen": -94.52806091308594, "logps/rejected": -107.99443054199219, "loss": 0.4648, "rewards/accuracies": 0.0, "rewards/chosen": 2.2606003284454346, "rewards/margins": -0.29689788818359375, "rewards/rejected": 2.5574982166290283, "step": 3484 }, { "epoch": 0.57, "learning_rate": 9.786794970762663e-07, "logits/chosen": -0.5274497866630554, "logits/rejected": -0.49281054735183716, "logps/chosen": -82.99076080322266, "logps/rejected": -80.06988525390625, "loss": 1.1199, "rewards/accuracies": 0.0, "rewards/chosen": 0.8649528622627258, "rewards/margins": -1.788515567779541, "rewards/rejected": 2.653468370437622, "step": 3485 }, { "epoch": 0.57, "learning_rate": 9.786415116195732e-07, "logits/chosen": -0.6519834399223328, "logits/rejected": -0.7200620174407959, "logps/chosen": -210.71939086914062, "logps/rejected": -115.42754364013672, "loss": 0.8327, "rewards/accuracies": 0.0, "rewards/chosen": 3.3067626953125, "rewards/margins": -0.8017830848693848, "rewards/rejected": 4.108545780181885, "step": 3486 }, { "epoch": 0.57, "learning_rate": 9.786034930932807e-07, "logits/chosen": -0.7229905724525452, "logits/rejected": -0.6655986905097961, "logps/chosen": -113.529052734375, "logps/rejected": -56.289344787597656, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 4.9494524002075195, "rewards/margins": 3.031514883041382, "rewards/rejected": 1.9179375171661377, "step": 3487 }, { "epoch": 0.57, "learning_rate": 9.785654415000153e-07, "logits/chosen": -0.5499765872955322, "logits/rejected": -0.5443966388702393, "logps/chosen": -154.6983184814453, "logps/rejected": -104.55785369873047, "loss": 2.331, "rewards/accuracies": 1.0, "rewards/chosen": 2.746067762374878, "rewards/margins": 1.518485188484192, "rewards/rejected": 1.227582573890686, "step": 3488 }, { "epoch": 0.57, "learning_rate": 9.785273568424062e-07, "logits/chosen": -0.7693368196487427, "logits/rejected": -0.7860043048858643, "logps/chosen": -128.89794921875, "logps/rejected": -128.50082397460938, "loss": 1.1659, "rewards/accuracies": 0.0, "rewards/chosen": 1.3697983026504517, "rewards/margins": -2.217310905456543, "rewards/rejected": 3.587109327316284, "step": 3489 }, { "epoch": 0.57, "learning_rate": 9.784892391230845e-07, "logits/chosen": -0.8267812132835388, "logits/rejected": -0.8897089958190918, "logps/chosen": -264.44921875, "logps/rejected": -146.4645538330078, "loss": 1.8642, "rewards/accuracies": 0.0, "rewards/chosen": 4.767889499664307, "rewards/margins": -3.336195468902588, "rewards/rejected": 8.104084968566895, "step": 3490 }, { "epoch": 0.57, "learning_rate": 9.78451088344684e-07, "logits/chosen": -0.5957364439964294, "logits/rejected": -0.4634442627429962, "logps/chosen": -44.0307502746582, "logps/rejected": -68.0948486328125, "loss": 0.4273, "rewards/accuracies": 1.0, "rewards/chosen": 1.678659439086914, "rewards/margins": 0.7522349953651428, "rewards/rejected": 0.9264244437217712, "step": 3491 }, { "epoch": 0.57, "learning_rate": 9.784129045098404e-07, "logits/chosen": -0.7606127262115479, "logits/rejected": -0.5781604051589966, "logps/chosen": -163.84539794921875, "logps/rejected": -78.3668212890625, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 3.967608690261841, "rewards/margins": 1.4518952369689941, "rewards/rejected": 2.5157134532928467, "step": 3492 }, { "epoch": 0.57, "learning_rate": 9.783746876211917e-07, "logits/chosen": -0.7728057503700256, "logits/rejected": -0.794828474521637, "logps/chosen": -133.77005004882812, "logps/rejected": -54.281097412109375, "loss": 0.9432, "rewards/accuracies": 0.0, "rewards/chosen": 0.5302154421806335, "rewards/margins": -1.592583417892456, "rewards/rejected": 2.1227989196777344, "step": 3493 }, { "epoch": 0.57, "learning_rate": 9.783364376813787e-07, "logits/chosen": -0.5775553584098816, "logits/rejected": -0.530596911907196, "logps/chosen": -55.39262008666992, "logps/rejected": -19.757213592529297, "loss": 1.2311, "rewards/accuracies": 1.0, "rewards/chosen": 0.7363109588623047, "rewards/margins": 0.40658798813819885, "rewards/rejected": 0.32972297072410583, "step": 3494 }, { "epoch": 0.57, "learning_rate": 9.78298154693044e-07, "logits/chosen": -0.7496643662452698, "logits/rejected": -0.6973792314529419, "logps/chosen": -85.61170959472656, "logps/rejected": -79.87138366699219, "loss": 0.5029, "rewards/accuracies": 0.0, "rewards/chosen": 2.2503738403320312, "rewards/margins": -0.25206995010375977, "rewards/rejected": 2.502443790435791, "step": 3495 }, { "epoch": 0.57, "learning_rate": 9.782598386588324e-07, "logits/chosen": -0.47675567865371704, "logits/rejected": -0.44374823570251465, "logps/chosen": -77.5748291015625, "logps/rejected": -47.81261444091797, "loss": 0.3821, "rewards/accuracies": 0.0, "rewards/chosen": 1.3804367780685425, "rewards/margins": -0.08229446411132812, "rewards/rejected": 1.4627312421798706, "step": 3496 }, { "epoch": 0.57, "learning_rate": 9.782214895813913e-07, "logits/chosen": -0.9673125147819519, "logits/rejected": -0.33979660272598267, "logps/chosen": -203.97821044921875, "logps/rejected": -109.30487060546875, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 3.706613302230835, "rewards/margins": 1.7843995094299316, "rewards/rejected": 1.9222137928009033, "step": 3497 }, { "epoch": 0.57, "learning_rate": 9.781831074633703e-07, "logits/chosen": -0.5703891515731812, "logits/rejected": -0.5325918197631836, "logps/chosen": -57.06983947753906, "logps/rejected": -110.7895736694336, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 2.8496689796447754, "rewards/margins": 1.2595216035842896, "rewards/rejected": 1.5901473760604858, "step": 3498 }, { "epoch": 0.57, "learning_rate": 9.78144692307421e-07, "logits/chosen": -0.6489604711532593, "logits/rejected": -0.5625087022781372, "logps/chosen": -78.06008911132812, "logps/rejected": -140.85931396484375, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 1.971028208732605, "rewards/margins": 1.6806808710098267, "rewards/rejected": 0.29034730792045593, "step": 3499 }, { "epoch": 0.57, "learning_rate": 9.781062441161979e-07, "logits/chosen": -0.6152195334434509, "logits/rejected": -0.47928038239479065, "logps/chosen": -74.28327941894531, "logps/rejected": -92.09858703613281, "loss": 0.7227, "rewards/accuracies": 0.0, "rewards/chosen": 1.079951524734497, "rewards/margins": -1.1589736938476562, "rewards/rejected": 2.2389252185821533, "step": 3500 }, { "epoch": 0.57, "learning_rate": 9.78067762892357e-07, "logits/chosen": -0.6624125242233276, "logits/rejected": -0.19705474376678467, "logps/chosen": -113.87701416015625, "logps/rejected": -62.88312530517578, "loss": 1.4977, "rewards/accuracies": 0.0, "rewards/chosen": 0.3971725404262543, "rewards/margins": -2.892949104309082, "rewards/rejected": 3.290121555328369, "step": 3501 }, { "epoch": 0.57, "learning_rate": 9.780292486385574e-07, "logits/chosen": -0.9254971742630005, "logits/rejected": -0.9171141386032104, "logps/chosen": -73.77203369140625, "logps/rejected": -85.8656997680664, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 2.578325033187866, "rewards/margins": 0.30597615242004395, "rewards/rejected": 2.2723488807678223, "step": 3502 }, { "epoch": 0.57, "learning_rate": 9.779907013574598e-07, "logits/chosen": -0.4662835896015167, "logits/rejected": -0.4662835896015167, "logps/chosen": -76.8767318725586, "logps/rejected": -76.8767318725586, "loss": 0.7946, "rewards/accuracies": 0.0, "rewards/chosen": 1.7751991748809814, "rewards/margins": 0.0, "rewards/rejected": 1.7751991748809814, "step": 3503 }, { "epoch": 0.57, "learning_rate": 9.779521210517275e-07, "logits/chosen": -0.5330418348312378, "logits/rejected": -0.5971832275390625, "logps/chosen": -86.06402587890625, "logps/rejected": -50.003662109375, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 1.4810211658477783, "rewards/margins": -0.7751967906951904, "rewards/rejected": 2.2562179565429688, "step": 3504 }, { "epoch": 0.57, "learning_rate": 9.779135077240262e-07, "logits/chosen": -0.3678792119026184, "logits/rejected": -0.2833269536495209, "logps/chosen": -64.51470947265625, "logps/rejected": -171.52560424804688, "loss": 0.1735, "rewards/accuracies": 1.0, "rewards/chosen": 1.8991577625274658, "rewards/margins": 0.9622558951377869, "rewards/rejected": 0.936901867389679, "step": 3505 }, { "epoch": 0.57, "learning_rate": 9.778748613770234e-07, "logits/chosen": -0.7335308194160461, "logits/rejected": -0.5851157903671265, "logps/chosen": -133.1642303466797, "logps/rejected": -92.9794921875, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 6.220855712890625, "rewards/margins": 4.136260986328125, "rewards/rejected": 2.0845947265625, "step": 3506 }, { "epoch": 0.57, "learning_rate": 9.778361820133894e-07, "logits/chosen": -0.3027375638484955, "logits/rejected": -0.320033460855484, "logps/chosen": -56.656185150146484, "logps/rejected": -18.936420440673828, "loss": 0.413, "rewards/accuracies": 0.0, "rewards/chosen": 0.1572238951921463, "rewards/margins": -0.19534684717655182, "rewards/rejected": 0.3525707423686981, "step": 3507 }, { "epoch": 0.57, "learning_rate": 9.777974696357967e-07, "logits/chosen": -0.17695273458957672, "logits/rejected": -0.16452309489250183, "logps/chosen": -99.47091674804688, "logps/rejected": -46.92988586425781, "loss": 1.1822, "rewards/accuracies": 0.0, "rewards/chosen": 0.3046112060546875, "rewards/margins": -0.5358421206474304, "rewards/rejected": 0.8404533267021179, "step": 3508 }, { "epoch": 0.57, "learning_rate": 9.777587242469196e-07, "logits/chosen": -0.4008547365665436, "logits/rejected": -0.4008547365665436, "logps/chosen": -21.116533279418945, "logps/rejected": -21.116533279418945, "loss": 0.5222, "rewards/accuracies": 0.0, "rewards/chosen": 0.26456186175346375, "rewards/margins": 0.0, "rewards/rejected": 0.26456186175346375, "step": 3509 }, { "epoch": 0.57, "learning_rate": 9.777199458494354e-07, "logits/chosen": -0.468239426612854, "logits/rejected": -0.4281143844127655, "logps/chosen": -82.04715728759766, "logps/rejected": -34.73799133300781, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 2.2928879261016846, "rewards/margins": 0.785448431968689, "rewards/rejected": 1.5074394941329956, "step": 3510 }, { "epoch": 0.57, "learning_rate": 9.776811344460231e-07, "logits/chosen": -0.6850350499153137, "logits/rejected": -0.648315966129303, "logps/chosen": -223.17318725585938, "logps/rejected": -105.31640625, "loss": 0.4122, "rewards/accuracies": 1.0, "rewards/chosen": 2.5784270763397217, "rewards/margins": 1.4610892534255981, "rewards/rejected": 1.1173378229141235, "step": 3511 }, { "epoch": 0.57, "learning_rate": 9.776422900393644e-07, "logits/chosen": -0.30928662419319153, "logits/rejected": -0.29968035221099854, "logps/chosen": -2.109323263168335, "logps/rejected": -13.240303039550781, "loss": 0.7738, "rewards/accuracies": 1.0, "rewards/chosen": 0.3470481336116791, "rewards/margins": 0.29401031136512756, "rewards/rejected": 0.05303783342242241, "step": 3512 }, { "epoch": 0.57, "learning_rate": 9.776034126321429e-07, "logits/chosen": -0.6189419031143188, "logits/rejected": -0.5908763408660889, "logps/chosen": -49.0069465637207, "logps/rejected": -105.98500061035156, "loss": 0.8498, "rewards/accuracies": 0.0, "rewards/chosen": 0.8509395718574524, "rewards/margins": -0.2932468056678772, "rewards/rejected": 1.1441863775253296, "step": 3513 }, { "epoch": 0.57, "learning_rate": 9.775645022270446e-07, "logits/chosen": -0.2700534760951996, "logits/rejected": -0.25739169120788574, "logps/chosen": -64.21680450439453, "logps/rejected": -39.734073638916016, "loss": 0.6855, "rewards/accuracies": 0.0, "rewards/chosen": 0.5424125790596008, "rewards/margins": -0.32383692264556885, "rewards/rejected": 0.8662495017051697, "step": 3514 }, { "epoch": 0.57, "learning_rate": 9.77525558826758e-07, "logits/chosen": -0.38396456837654114, "logits/rejected": -0.3004607558250427, "logps/chosen": -69.47836303710938, "logps/rejected": -162.95758056640625, "loss": 0.1776, "rewards/accuracies": 1.0, "rewards/chosen": 1.8344688415527344, "rewards/margins": 0.9029136300086975, "rewards/rejected": 0.9315552115440369, "step": 3515 }, { "epoch": 0.57, "learning_rate": 9.774865824339737e-07, "logits/chosen": -0.5937822461128235, "logits/rejected": -0.5898321866989136, "logps/chosen": -93.62470245361328, "logps/rejected": -86.45231628417969, "loss": 0.9753, "rewards/accuracies": 0.0, "rewards/chosen": 0.7242271304130554, "rewards/margins": -1.0468392372131348, "rewards/rejected": 1.7710663080215454, "step": 3516 }, { "epoch": 0.57, "learning_rate": 9.774475730513847e-07, "logits/chosen": -0.4553029239177704, "logits/rejected": -0.36472758650779724, "logps/chosen": -35.99686813354492, "logps/rejected": -53.465232849121094, "loss": 0.7274, "rewards/accuracies": 0.0, "rewards/chosen": 0.8821979761123657, "rewards/margins": -0.7875049114227295, "rewards/rejected": 1.6697028875350952, "step": 3517 }, { "epoch": 0.57, "learning_rate": 9.774085306816857e-07, "logits/chosen": -0.6892122030258179, "logits/rejected": -0.6143578886985779, "logps/chosen": -92.67611694335938, "logps/rejected": -140.23855590820312, "loss": 0.0936, "rewards/accuracies": 1.0, "rewards/chosen": 3.494004964828491, "rewards/margins": 1.8184586763381958, "rewards/rejected": 1.6755462884902954, "step": 3518 }, { "epoch": 0.57, "learning_rate": 9.77369455327575e-07, "logits/chosen": -0.5064883828163147, "logits/rejected": -0.31219053268432617, "logps/chosen": -133.64736938476562, "logps/rejected": -58.228271484375, "loss": 0.4044, "rewards/accuracies": 1.0, "rewards/chosen": 4.107509136199951, "rewards/margins": 1.8565301895141602, "rewards/rejected": 2.250978946685791, "step": 3519 }, { "epoch": 0.57, "learning_rate": 9.773303469917514e-07, "logits/chosen": -1.1120705604553223, "logits/rejected": -0.917327880859375, "logps/chosen": -227.26815795898438, "logps/rejected": -196.3707275390625, "loss": 0.5229, "rewards/accuracies": 0.0, "rewards/chosen": 5.188500881195068, "rewards/margins": -0.5296297073364258, "rewards/rejected": 5.718130588531494, "step": 3520 }, { "epoch": 0.57, "learning_rate": 9.772912056769175e-07, "logits/chosen": -0.7252746820449829, "logits/rejected": -0.7003710269927979, "logps/chosen": -48.56951141357422, "logps/rejected": -72.48919677734375, "loss": 0.4965, "rewards/accuracies": 1.0, "rewards/chosen": 2.4637811183929443, "rewards/margins": 0.6226433515548706, "rewards/rejected": 1.8411377668380737, "step": 3521 }, { "epoch": 0.57, "learning_rate": 9.772520313857775e-07, "logits/chosen": -0.6600974798202515, "logits/rejected": -0.5219656825065613, "logps/chosen": -116.06741333007812, "logps/rejected": -55.27871322631836, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 4.996285915374756, "rewards/margins": 3.1859803199768066, "rewards/rejected": 1.8103054761886597, "step": 3522 }, { "epoch": 0.57, "learning_rate": 9.77212824121038e-07, "logits/chosen": -0.48606935143470764, "logits/rejected": -0.5151095986366272, "logps/chosen": -132.2799072265625, "logps/rejected": -126.93059539794922, "loss": 1.7972, "rewards/accuracies": 0.0, "rewards/chosen": 2.587146043777466, "rewards/margins": -3.5504038333892822, "rewards/rejected": 6.137549877166748, "step": 3523 }, { "epoch": 0.57, "learning_rate": 9.771735838854077e-07, "logits/chosen": -0.3580770492553711, "logits/rejected": -0.3494298458099365, "logps/chosen": -33.399147033691406, "logps/rejected": -60.64561462402344, "loss": 0.4132, "rewards/accuracies": 0.0, "rewards/chosen": 1.5901802778244019, "rewards/margins": -0.12840187549591064, "rewards/rejected": 1.7185821533203125, "step": 3524 }, { "epoch": 0.57, "learning_rate": 9.77134310681598e-07, "logits/chosen": -0.6237379908561707, "logits/rejected": -0.5363926887512207, "logps/chosen": -80.72964477539062, "logps/rejected": -75.14936828613281, "loss": 0.2737, "rewards/accuracies": 1.0, "rewards/chosen": 2.03193736076355, "rewards/margins": 0.45061635971069336, "rewards/rejected": 1.5813210010528564, "step": 3525 }, { "epoch": 0.57, "learning_rate": 9.770950045123218e-07, "logits/chosen": -0.6208292841911316, "logits/rejected": -0.49639204144477844, "logps/chosen": -65.37934112548828, "logps/rejected": -27.0098876953125, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": 1.6054916381835938, "rewards/margins": 1.2907419204711914, "rewards/rejected": 0.31474971771240234, "step": 3526 }, { "epoch": 0.57, "learning_rate": 9.770556653802953e-07, "logits/chosen": -0.40782395005226135, "logits/rejected": -0.38096973299980164, "logps/chosen": -80.08969116210938, "logps/rejected": -61.54174041748047, "loss": 1.0186, "rewards/accuracies": 0.0, "rewards/chosen": 0.14114074409008026, "rewards/margins": -1.3105034828186035, "rewards/rejected": 1.4516441822052002, "step": 3527 }, { "epoch": 0.57, "learning_rate": 9.770162932882363e-07, "logits/chosen": -0.19124634563922882, "logits/rejected": -0.20367348194122314, "logps/chosen": -20.124114990234375, "logps/rejected": -6.753070831298828, "loss": 0.8266, "rewards/accuracies": 0.0, "rewards/chosen": -0.13006821274757385, "rewards/margins": -0.2920398712158203, "rewards/rejected": 0.16197167336940765, "step": 3528 }, { "epoch": 0.57, "learning_rate": 9.769768882388647e-07, "logits/chosen": -0.8341838121414185, "logits/rejected": -0.7431140542030334, "logps/chosen": -50.225372314453125, "logps/rejected": -88.20857238769531, "loss": 0.7673, "rewards/accuracies": 1.0, "rewards/chosen": 2.5881593227386475, "rewards/margins": 0.36066150665283203, "rewards/rejected": 2.2274978160858154, "step": 3529 }, { "epoch": 0.57, "learning_rate": 9.769374502349036e-07, "logits/chosen": -0.9596173763275146, "logits/rejected": -0.8869929909706116, "logps/chosen": -68.32275390625, "logps/rejected": -89.09693908691406, "loss": 0.5016, "rewards/accuracies": 0.0, "rewards/chosen": 0.98263019323349, "rewards/margins": -0.13146203756332397, "rewards/rejected": 1.114092230796814, "step": 3530 }, { "epoch": 0.57, "learning_rate": 9.768979792790776e-07, "logits/chosen": -0.9207679629325867, "logits/rejected": -0.9108160734176636, "logps/chosen": -107.21394348144531, "logps/rejected": -103.47137451171875, "loss": 0.5557, "rewards/accuracies": 1.0, "rewards/chosen": 0.9636398553848267, "rewards/margins": 0.45590364933013916, "rewards/rejected": 0.5077362060546875, "step": 3531 }, { "epoch": 0.57, "learning_rate": 9.768584753741134e-07, "logits/chosen": -0.550719141960144, "logits/rejected": -0.5240522623062134, "logps/chosen": -82.19134521484375, "logps/rejected": -96.65835571289062, "loss": 0.5632, "rewards/accuracies": 0.0, "rewards/chosen": 1.6730835437774658, "rewards/margins": -0.7076506614685059, "rewards/rejected": 2.3807342052459717, "step": 3532 }, { "epoch": 0.57, "learning_rate": 9.768189385227409e-07, "logits/chosen": -0.7621759176254272, "logits/rejected": -0.7512902617454529, "logps/chosen": -63.97065734863281, "logps/rejected": -94.61219787597656, "loss": 3.0916, "rewards/accuracies": 0.0, "rewards/chosen": 1.754280924797058, "rewards/margins": -3.9802207946777344, "rewards/rejected": 5.734501838684082, "step": 3533 }, { "epoch": 0.57, "learning_rate": 9.767793687276911e-07, "logits/chosen": -0.6591843366622925, "logits/rejected": -0.5940648913383484, "logps/chosen": -88.3468017578125, "logps/rejected": -100.28128814697266, "loss": 0.6297, "rewards/accuracies": 0.0, "rewards/chosen": 3.3366806507110596, "rewards/margins": -0.8984811305999756, "rewards/rejected": 4.235161781311035, "step": 3534 }, { "epoch": 0.57, "learning_rate": 9.767397659916986e-07, "logits/chosen": -0.17414532601833344, "logits/rejected": -0.31971845030784607, "logps/chosen": -82.15618896484375, "logps/rejected": -95.10107421875, "loss": 1.0763, "rewards/accuracies": 0.0, "rewards/chosen": 1.1364082098007202, "rewards/margins": -1.8512252569198608, "rewards/rejected": 2.987633466720581, "step": 3535 }, { "epoch": 0.57, "learning_rate": 9.76700130317499e-07, "logits/chosen": -0.4774476885795593, "logits/rejected": -0.4034948945045471, "logps/chosen": -81.36480712890625, "logps/rejected": -32.247802734375, "loss": 0.6756, "rewards/accuracies": 0.0, "rewards/chosen": 0.7007614374160767, "rewards/margins": -0.11010891199111938, "rewards/rejected": 0.810870349407196, "step": 3536 }, { "epoch": 0.57, "learning_rate": 9.76660461707831e-07, "logits/chosen": -0.6456011533737183, "logits/rejected": -0.6679009199142456, "logps/chosen": -56.463233947753906, "logps/rejected": -49.74953842163086, "loss": 0.7296, "rewards/accuracies": 1.0, "rewards/chosen": 2.0919196605682373, "rewards/margins": 0.5448504686355591, "rewards/rejected": 1.5470691919326782, "step": 3537 }, { "epoch": 0.57, "learning_rate": 9.766207601654355e-07, "logits/chosen": -0.5676514506340027, "logits/rejected": -0.5849047303199768, "logps/chosen": -76.82815551757812, "logps/rejected": -63.13408660888672, "loss": 0.5584, "rewards/accuracies": 1.0, "rewards/chosen": 1.5404342412948608, "rewards/margins": 0.4225219488143921, "rewards/rejected": 1.1179122924804688, "step": 3538 }, { "epoch": 0.57, "learning_rate": 9.76581025693055e-07, "logits/chosen": -0.5858599543571472, "logits/rejected": -0.4793175756931305, "logps/chosen": -78.11251068115234, "logps/rejected": -143.1910858154297, "loss": 0.2297, "rewards/accuracies": 1.0, "rewards/chosen": 1.7486251592636108, "rewards/margins": 1.3843169212341309, "rewards/rejected": 0.3643081784248352, "step": 3539 }, { "epoch": 0.57, "learning_rate": 9.765412582934353e-07, "logits/chosen": -0.15978670120239258, "logits/rejected": -0.09032110124826431, "logps/chosen": -83.95816802978516, "logps/rejected": -32.88943862915039, "loss": 0.2882, "rewards/accuracies": 1.0, "rewards/chosen": 3.5429298877716064, "rewards/margins": 2.5755882263183594, "rewards/rejected": 0.9673416018486023, "step": 3540 }, { "epoch": 0.57, "learning_rate": 9.765014579693237e-07, "logits/chosen": -0.4787325859069824, "logits/rejected": -0.5515598058700562, "logps/chosen": -81.88214874267578, "logps/rejected": -82.100341796875, "loss": 0.8025, "rewards/accuracies": 0.0, "rewards/chosen": 1.071254014968872, "rewards/margins": -1.1946594715118408, "rewards/rejected": 2.265913486480713, "step": 3541 }, { "epoch": 0.57, "learning_rate": 9.764616247234701e-07, "logits/chosen": -0.5433064103126526, "logits/rejected": -0.43389958143234253, "logps/chosen": -82.60260772705078, "logps/rejected": -63.58222961425781, "loss": 0.4741, "rewards/accuracies": 0.0, "rewards/chosen": 0.649396538734436, "rewards/margins": -0.25228196382522583, "rewards/rejected": 0.9016785025596619, "step": 3542 }, { "epoch": 0.58, "learning_rate": 9.764217585586263e-07, "logits/chosen": -0.5636003613471985, "logits/rejected": -0.5653981566429138, "logps/chosen": -94.85167694091797, "logps/rejected": -131.3643798828125, "loss": 0.4447, "rewards/accuracies": 1.0, "rewards/chosen": 1.4434776306152344, "rewards/margins": 0.9956108331680298, "rewards/rejected": 0.447866827249527, "step": 3543 }, { "epoch": 0.58, "learning_rate": 9.763818594775473e-07, "logits/chosen": -0.5595394372940063, "logits/rejected": -0.45508286356925964, "logps/chosen": -46.55227279663086, "logps/rejected": -62.04737091064453, "loss": 0.5505, "rewards/accuracies": 0.0, "rewards/chosen": 1.589704155921936, "rewards/margins": -0.6836816072463989, "rewards/rejected": 2.273385763168335, "step": 3544 }, { "epoch": 0.58, "learning_rate": 9.763419274829892e-07, "logits/chosen": -0.41308486461639404, "logits/rejected": -0.40035054087638855, "logps/chosen": -75.38285064697266, "logps/rejected": -46.50110626220703, "loss": 0.8024, "rewards/accuracies": 0.0, "rewards/chosen": 0.3596237301826477, "rewards/margins": -0.4374191164970398, "rewards/rejected": 0.7970428466796875, "step": 3545 }, { "epoch": 0.58, "learning_rate": 9.76301962577711e-07, "logits/chosen": -0.47492700815200806, "logits/rejected": -0.4948804974555969, "logps/chosen": -98.15655517578125, "logps/rejected": -142.23849487304688, "loss": 0.2452, "rewards/accuracies": 1.0, "rewards/chosen": 1.043971300125122, "rewards/margins": 0.5078155994415283, "rewards/rejected": 0.5361557006835938, "step": 3546 }, { "epoch": 0.58, "learning_rate": 9.76261964764474e-07, "logits/chosen": -0.4560889005661011, "logits/rejected": -0.4560889005661011, "logps/chosen": -80.10957336425781, "logps/rejected": -80.10957336425781, "loss": 0.3899, "rewards/accuracies": 0.0, "rewards/chosen": 1.8119720220565796, "rewards/margins": 0.0, "rewards/rejected": 1.8119720220565796, "step": 3547 }, { "epoch": 0.58, "learning_rate": 9.762219340460418e-07, "logits/chosen": -0.5558434128761292, "logits/rejected": -0.4139263331890106, "logps/chosen": -45.190093994140625, "logps/rejected": -69.25723266601562, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 1.5672367811203003, "rewards/margins": 0.16522789001464844, "rewards/rejected": 1.4020088911056519, "step": 3548 }, { "epoch": 0.58, "learning_rate": 9.7618187042518e-07, "logits/chosen": -0.41049787402153015, "logits/rejected": -0.40690478682518005, "logps/chosen": -37.869415283203125, "logps/rejected": -52.29609298706055, "loss": 0.6322, "rewards/accuracies": 1.0, "rewards/chosen": 2.0412490367889404, "rewards/margins": 0.7717562913894653, "rewards/rejected": 1.269492745399475, "step": 3549 }, { "epoch": 0.58, "learning_rate": 9.761417739046565e-07, "logits/chosen": -0.7193246483802795, "logits/rejected": -0.7402538657188416, "logps/chosen": -77.6125717163086, "logps/rejected": -97.94019317626953, "loss": 0.6522, "rewards/accuracies": 1.0, "rewards/chosen": 1.986585259437561, "rewards/margins": 0.8209679126739502, "rewards/rejected": 1.1656173467636108, "step": 3550 }, { "epoch": 0.58, "learning_rate": 9.761016444872418e-07, "logits/chosen": -0.5297698974609375, "logits/rejected": -0.5473296046257019, "logps/chosen": -85.8954086303711, "logps/rejected": -102.63644409179688, "loss": 0.4048, "rewards/accuracies": 0.0, "rewards/chosen": 0.12817993760108948, "rewards/margins": -0.19579774141311646, "rewards/rejected": 0.32397767901420593, "step": 3551 }, { "epoch": 0.58, "learning_rate": 9.760614821757084e-07, "logits/chosen": -0.8201118111610413, "logits/rejected": -0.809814453125, "logps/chosen": -130.7972412109375, "logps/rejected": -68.6690902709961, "loss": 1.8192, "rewards/accuracies": 0.0, "rewards/chosen": 0.300933837890625, "rewards/margins": -1.6722389459609985, "rewards/rejected": 1.9731727838516235, "step": 3552 }, { "epoch": 0.58, "learning_rate": 9.76021286972831e-07, "logits/chosen": -0.28627243638038635, "logits/rejected": -0.3431062698364258, "logps/chosen": -78.40863037109375, "logps/rejected": -102.31723022460938, "loss": 2.4493, "rewards/accuracies": 0.0, "rewards/chosen": 1.646173119544983, "rewards/margins": -2.980630874633789, "rewards/rejected": 4.626803874969482, "step": 3553 }, { "epoch": 0.58, "learning_rate": 9.75981058881387e-07, "logits/chosen": -0.6654559969902039, "logits/rejected": -0.5934580564498901, "logps/chosen": -145.0582275390625, "logps/rejected": -106.3382568359375, "loss": 1.0372, "rewards/accuracies": 0.0, "rewards/chosen": 0.497964471578598, "rewards/margins": -0.3648239076137543, "rewards/rejected": 0.8627883791923523, "step": 3554 }, { "epoch": 0.58, "learning_rate": 9.759407979041557e-07, "logits/chosen": -0.9920147657394409, "logits/rejected": -0.9572678804397583, "logps/chosen": -94.22738647460938, "logps/rejected": -112.75497436523438, "loss": 1.0334, "rewards/accuracies": 0.0, "rewards/chosen": 3.1291940212249756, "rewards/margins": -1.3423912525177002, "rewards/rejected": 4.471585273742676, "step": 3555 }, { "epoch": 0.58, "learning_rate": 9.759005040439184e-07, "logits/chosen": -0.2965681552886963, "logits/rejected": -0.39607566595077515, "logps/chosen": -111.70808410644531, "logps/rejected": -109.611572265625, "loss": 1.8847, "rewards/accuracies": 0.0, "rewards/chosen": 1.144673228263855, "rewards/margins": -2.2887773513793945, "rewards/rejected": 3.43345046043396, "step": 3556 }, { "epoch": 0.58, "learning_rate": 9.758601773034594e-07, "logits/chosen": -0.6916265487670898, "logits/rejected": -0.6757829189300537, "logps/chosen": -228.260009765625, "logps/rejected": -45.671478271484375, "loss": 0.9113, "rewards/accuracies": 1.0, "rewards/chosen": 5.700308322906494, "rewards/margins": 4.730250835418701, "rewards/rejected": 0.9700576663017273, "step": 3557 }, { "epoch": 0.58, "learning_rate": 9.758198176855646e-07, "logits/chosen": -0.6948537826538086, "logits/rejected": -0.6299307942390442, "logps/chosen": -89.7841567993164, "logps/rejected": -43.787696838378906, "loss": 1.1314, "rewards/accuracies": 1.0, "rewards/chosen": 1.6076011657714844, "rewards/margins": 0.17206990718841553, "rewards/rejected": 1.4355312585830688, "step": 3558 }, { "epoch": 0.58, "learning_rate": 9.75779425193023e-07, "logits/chosen": -0.834237277507782, "logits/rejected": -0.7678024768829346, "logps/chosen": -113.6628189086914, "logps/rejected": -33.079307556152344, "loss": 0.8307, "rewards/accuracies": 1.0, "rewards/chosen": 0.6646491885185242, "rewards/margins": 0.46197202801704407, "rewards/rejected": 0.2026771605014801, "step": 3559 }, { "epoch": 0.58, "learning_rate": 9.757389998286245e-07, "logits/chosen": -0.694293737411499, "logits/rejected": -0.6547988653182983, "logps/chosen": -192.39349365234375, "logps/rejected": -143.73309326171875, "loss": 0.6007, "rewards/accuracies": 0.0, "rewards/chosen": 5.1410369873046875, "rewards/margins": -0.4140167236328125, "rewards/rejected": 5.5550537109375, "step": 3560 }, { "epoch": 0.58, "learning_rate": 9.75698541595163e-07, "logits/chosen": -0.7249189615249634, "logits/rejected": -0.6462122201919556, "logps/chosen": -91.29403686523438, "logps/rejected": -182.98373413085938, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 1.6841728687286377, "rewards/margins": 0.9975517392158508, "rewards/rejected": 0.6866211295127869, "step": 3561 }, { "epoch": 0.58, "learning_rate": 9.756580504954333e-07, "logits/chosen": -0.4452897012233734, "logits/rejected": -0.42388713359832764, "logps/chosen": -50.13395690917969, "logps/rejected": -3.6897549629211426, "loss": 1.6623, "rewards/accuracies": 0.0, "rewards/chosen": -0.16785545647144318, "rewards/margins": -0.6155170798301697, "rewards/rejected": 0.4476616382598877, "step": 3562 }, { "epoch": 0.58, "learning_rate": 9.75617526532233e-07, "logits/chosen": -0.7077615261077881, "logits/rejected": -0.6559725999832153, "logps/chosen": -125.97824096679688, "logps/rejected": -38.389808654785156, "loss": 0.2709, "rewards/accuracies": 1.0, "rewards/chosen": 1.083099365234375, "rewards/margins": 0.4491836428642273, "rewards/rejected": 0.6339157223701477, "step": 3563 }, { "epoch": 0.58, "learning_rate": 9.755769697083618e-07, "logits/chosen": -0.521777331829071, "logits/rejected": -0.3156037926673889, "logps/chosen": -55.718116760253906, "logps/rejected": -87.93653869628906, "loss": 0.6125, "rewards/accuracies": 0.0, "rewards/chosen": 1.951751708984375, "rewards/margins": -0.7483711242675781, "rewards/rejected": 2.700122833251953, "step": 3564 }, { "epoch": 0.58, "learning_rate": 9.75536380026622e-07, "logits/chosen": -0.9251478910446167, "logits/rejected": -0.9380836486816406, "logps/chosen": -120.90052032470703, "logps/rejected": -170.832275390625, "loss": 0.9408, "rewards/accuracies": 0.0, "rewards/chosen": -0.21446533501148224, "rewards/margins": -0.55767822265625, "rewards/rejected": 0.34321290254592896, "step": 3565 }, { "epoch": 0.58, "learning_rate": 9.754957574898182e-07, "logits/chosen": -0.5725653171539307, "logits/rejected": -0.5915774703025818, "logps/chosen": -37.48411178588867, "logps/rejected": -52.902652740478516, "loss": 1.3273, "rewards/accuracies": 0.0, "rewards/chosen": 0.5703258514404297, "rewards/margins": -0.8618423938751221, "rewards/rejected": 1.4321682453155518, "step": 3566 }, { "epoch": 0.58, "learning_rate": 9.754551021007565e-07, "logits/chosen": -0.4002135992050171, "logits/rejected": -0.3771544098854065, "logps/chosen": -32.38618087768555, "logps/rejected": -77.31024932861328, "loss": 0.5982, "rewards/accuracies": 0.0, "rewards/chosen": 0.9000717401504517, "rewards/margins": -0.694793701171875, "rewards/rejected": 1.5948654413223267, "step": 3567 }, { "epoch": 0.58, "learning_rate": 9.75414413862246e-07, "logits/chosen": -0.4757261872291565, "logits/rejected": -0.4343995749950409, "logps/chosen": -119.66976928710938, "logps/rejected": -160.79843139648438, "loss": 0.851, "rewards/accuracies": 0.0, "rewards/chosen": 2.445272922515869, "rewards/margins": -1.468632459640503, "rewards/rejected": 3.913905382156372, "step": 3568 }, { "epoch": 0.58, "learning_rate": 9.753736927770982e-07, "logits/chosen": -0.8431782722473145, "logits/rejected": -0.9310163855552673, "logps/chosen": -198.64224243164062, "logps/rejected": -145.554443359375, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 2.736210584640503, "rewards/margins": 2.4811110496520996, "rewards/rejected": 0.25509950518608093, "step": 3569 }, { "epoch": 0.58, "learning_rate": 9.753329388481259e-07, "logits/chosen": -0.6295238137245178, "logits/rejected": -0.623655378818512, "logps/chosen": -64.6041259765625, "logps/rejected": -84.71817779541016, "loss": 0.8773, "rewards/accuracies": 1.0, "rewards/chosen": 1.6321067810058594, "rewards/margins": 1.1712859869003296, "rewards/rejected": 0.4608207643032074, "step": 3570 }, { "epoch": 0.58, "learning_rate": 9.752921520781453e-07, "logits/chosen": -0.606978178024292, "logits/rejected": -0.5071275234222412, "logps/chosen": -179.08047485351562, "logps/rejected": -90.72718811035156, "loss": 1.0907, "rewards/accuracies": 1.0, "rewards/chosen": 4.78997802734375, "rewards/margins": 2.167759656906128, "rewards/rejected": 2.622218370437622, "step": 3571 }, { "epoch": 0.58, "learning_rate": 9.752513324699742e-07, "logits/chosen": -0.9320738315582275, "logits/rejected": -0.8294890522956848, "logps/chosen": -120.8968505859375, "logps/rejected": -19.04262924194336, "loss": 0.2479, "rewards/accuracies": 1.0, "rewards/chosen": 4.520053386688232, "rewards/margins": 4.0993242263793945, "rewards/rejected": 0.42072924971580505, "step": 3572 }, { "epoch": 0.58, "learning_rate": 9.75210480026433e-07, "logits/chosen": -1.085985779762268, "logits/rejected": -1.3546216487884521, "logps/chosen": -92.44613647460938, "logps/rejected": -34.90144729614258, "loss": 0.5606, "rewards/accuracies": 1.0, "rewards/chosen": 0.9471282958984375, "rewards/margins": 0.6268306970596313, "rewards/rejected": 0.32029762864112854, "step": 3573 }, { "epoch": 0.58, "learning_rate": 9.751695947503442e-07, "logits/chosen": -0.5554243922233582, "logits/rejected": -0.471437007188797, "logps/chosen": -87.39006805419922, "logps/rejected": -61.8026123046875, "loss": 0.2024, "rewards/accuracies": 1.0, "rewards/chosen": 2.0806968212127686, "rewards/margins": 0.7326613664627075, "rewards/rejected": 1.348035454750061, "step": 3574 }, { "epoch": 0.58, "learning_rate": 9.751286766445322e-07, "logits/chosen": -0.7052590250968933, "logits/rejected": -0.6917517185211182, "logps/chosen": -108.54584503173828, "logps/rejected": -75.21605682373047, "loss": 0.9583, "rewards/accuracies": 0.0, "rewards/chosen": 0.2894477844238281, "rewards/margins": -1.3701523542404175, "rewards/rejected": 1.6596001386642456, "step": 3575 }, { "epoch": 0.58, "learning_rate": 9.750877257118247e-07, "logits/chosen": -0.11147180199623108, "logits/rejected": -0.10519145429134369, "logps/chosen": -46.37040710449219, "logps/rejected": -17.569591522216797, "loss": 1.6914, "rewards/accuracies": 1.0, "rewards/chosen": 1.1290390491485596, "rewards/margins": 0.13289529085159302, "rewards/rejected": 0.9961437582969666, "step": 3576 }, { "epoch": 0.58, "learning_rate": 9.750467419550504e-07, "logits/chosen": -0.5547188520431519, "logits/rejected": -0.47327813506126404, "logps/chosen": -105.76190185546875, "logps/rejected": -38.02964782714844, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 0.9649879336357117, "rewards/margins": 1.0528110265731812, "rewards/rejected": -0.08782310783863068, "step": 3577 }, { "epoch": 0.58, "learning_rate": 9.750057253770411e-07, "logits/chosen": -0.672957718372345, "logits/rejected": -0.6692870855331421, "logps/chosen": -53.84151077270508, "logps/rejected": -66.5953369140625, "loss": 0.9881, "rewards/accuracies": 1.0, "rewards/chosen": 1.6171635389328003, "rewards/margins": 0.07654154300689697, "rewards/rejected": 1.5406219959259033, "step": 3578 }, { "epoch": 0.58, "learning_rate": 9.74964675980631e-07, "logits/chosen": -0.8549844026565552, "logits/rejected": -0.8406412601470947, "logps/chosen": -136.97657775878906, "logps/rejected": -168.1359100341797, "loss": 0.7568, "rewards/accuracies": 0.0, "rewards/chosen": 1.14043128490448, "rewards/margins": -1.098835825920105, "rewards/rejected": 2.239267110824585, "step": 3579 }, { "epoch": 0.58, "learning_rate": 9.749235937686558e-07, "logits/chosen": -0.9106483459472656, "logits/rejected": -0.7864723801612854, "logps/chosen": -66.69422912597656, "logps/rejected": -14.827888488769531, "loss": 0.5084, "rewards/accuracies": 1.0, "rewards/chosen": 1.1366554498672485, "rewards/margins": 0.4316331744194031, "rewards/rejected": 0.7050222754478455, "step": 3580 }, { "epoch": 0.58, "learning_rate": 9.74882478743954e-07, "logits/chosen": -0.7273836731910706, "logits/rejected": -0.7202067971229553, "logps/chosen": -78.19863891601562, "logps/rejected": -79.40542602539062, "loss": 1.0039, "rewards/accuracies": 0.0, "rewards/chosen": 0.35202255845069885, "rewards/margins": -0.8379387855529785, "rewards/rejected": 1.189961314201355, "step": 3581 }, { "epoch": 0.58, "learning_rate": 9.748413309093665e-07, "logits/chosen": -0.4289989173412323, "logits/rejected": -0.5238640904426575, "logps/chosen": -126.42619323730469, "logps/rejected": -141.65231323242188, "loss": 1.6401, "rewards/accuracies": 0.0, "rewards/chosen": 0.44704437255859375, "rewards/margins": -3.0939316749572754, "rewards/rejected": 3.540976047515869, "step": 3582 }, { "epoch": 0.58, "learning_rate": 9.74800150267736e-07, "logits/chosen": -0.7595140933990479, "logits/rejected": -0.7288039326667786, "logps/chosen": -118.4239273071289, "logps/rejected": -190.9969024658203, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": 3.8637092113494873, "rewards/margins": 0.45389628410339355, "rewards/rejected": 3.4098129272460938, "step": 3583 }, { "epoch": 0.58, "learning_rate": 9.747589368219075e-07, "logits/chosen": -0.5177592039108276, "logits/rejected": -0.5408796668052673, "logps/chosen": -54.23103332519531, "logps/rejected": -61.04692077636719, "loss": 1.0586, "rewards/accuracies": 0.0, "rewards/chosen": 1.2665207386016846, "rewards/margins": -1.1379828453063965, "rewards/rejected": 2.404503583908081, "step": 3584 }, { "epoch": 0.58, "learning_rate": 9.747176905747288e-07, "logits/chosen": -0.9170122742652893, "logits/rejected": -0.8752193450927734, "logps/chosen": -239.77468872070312, "logps/rejected": -80.67741394042969, "loss": 0.3623, "rewards/accuracies": 1.0, "rewards/chosen": 3.5820863246917725, "rewards/margins": 0.2035233974456787, "rewards/rejected": 3.3785629272460938, "step": 3585 }, { "epoch": 0.58, "learning_rate": 9.746764115290494e-07, "logits/chosen": -0.4673401415348053, "logits/rejected": -0.40349170565605164, "logps/chosen": -70.61772155761719, "logps/rejected": -92.58490753173828, "loss": 0.2851, "rewards/accuracies": 1.0, "rewards/chosen": 2.1775002479553223, "rewards/margins": 0.41336214542388916, "rewards/rejected": 1.764138102531433, "step": 3586 }, { "epoch": 0.58, "learning_rate": 9.746350996877214e-07, "logits/chosen": -0.6115829348564148, "logits/rejected": -0.5857995748519897, "logps/chosen": -83.81920623779297, "logps/rejected": -88.83553314208984, "loss": 0.8467, "rewards/accuracies": 1.0, "rewards/chosen": 1.6814689636230469, "rewards/margins": 1.328467607498169, "rewards/rejected": 0.3530014157295227, "step": 3587 }, { "epoch": 0.58, "learning_rate": 9.745937550535992e-07, "logits/chosen": -0.43158701062202454, "logits/rejected": -0.4261128902435303, "logps/chosen": -121.55979919433594, "logps/rejected": -117.94316864013672, "loss": 2.1433, "rewards/accuracies": 1.0, "rewards/chosen": 1.6631546020507812, "rewards/margins": 0.4754524230957031, "rewards/rejected": 1.1877021789550781, "step": 3588 }, { "epoch": 0.58, "learning_rate": 9.745523776295393e-07, "logits/chosen": -0.19725091755390167, "logits/rejected": -0.16462238132953644, "logps/chosen": -36.2008056640625, "logps/rejected": -20.963838577270508, "loss": 0.8461, "rewards/accuracies": 0.0, "rewards/chosen": -0.017289353534579277, "rewards/margins": -0.2135000228881836, "rewards/rejected": 0.19621066749095917, "step": 3589 }, { "epoch": 0.58, "learning_rate": 9.745109674184e-07, "logits/chosen": -0.41089433431625366, "logits/rejected": -0.36736083030700684, "logps/chosen": -67.06390380859375, "logps/rejected": -85.23209381103516, "loss": 0.5673, "rewards/accuracies": 0.0, "rewards/chosen": 1.9102753400802612, "rewards/margins": -0.3030921220779419, "rewards/rejected": 2.213367462158203, "step": 3590 }, { "epoch": 0.58, "learning_rate": 9.74469524423043e-07, "logits/chosen": -0.4778347313404083, "logits/rejected": -0.4635104537010193, "logps/chosen": -84.7115478515625, "logps/rejected": -154.43856811523438, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 1.8565781116485596, "rewards/margins": 1.261610507965088, "rewards/rejected": 0.5949676632881165, "step": 3591 }, { "epoch": 0.58, "learning_rate": 9.744280486463313e-07, "logits/chosen": -1.2071161270141602, "logits/rejected": -1.1633092164993286, "logps/chosen": -88.9185791015625, "logps/rejected": -148.0079345703125, "loss": 2.0222, "rewards/accuracies": 0.0, "rewards/chosen": 3.276933431625366, "rewards/margins": -1.618398904800415, "rewards/rejected": 4.895332336425781, "step": 3592 }, { "epoch": 0.58, "learning_rate": 9.743865400911304e-07, "logits/chosen": -0.8132737278938293, "logits/rejected": -0.9479347467422485, "logps/chosen": -86.29273986816406, "logps/rejected": -73.69168853759766, "loss": 0.592, "rewards/accuracies": 0.0, "rewards/chosen": 2.6454193592071533, "rewards/margins": -0.5913383960723877, "rewards/rejected": 3.236757755279541, "step": 3593 }, { "epoch": 0.58, "learning_rate": 9.74344998760308e-07, "logits/chosen": -0.5455507636070251, "logits/rejected": -0.43515002727508545, "logps/chosen": -127.19922637939453, "logps/rejected": -78.51409912109375, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 4.287383556365967, "rewards/margins": 2.1010241508483887, "rewards/rejected": 2.186359405517578, "step": 3594 }, { "epoch": 0.58, "learning_rate": 9.74303424656735e-07, "logits/chosen": -0.5709218382835388, "logits/rejected": -0.6173950433731079, "logps/chosen": -73.99750518798828, "logps/rejected": -170.22979736328125, "loss": 0.7718, "rewards/accuracies": 0.0, "rewards/chosen": 0.3902534544467926, "rewards/margins": -1.2502602338790894, "rewards/rejected": 1.6405136585235596, "step": 3595 }, { "epoch": 0.58, "learning_rate": 9.74261817783283e-07, "logits/chosen": -0.6504011154174805, "logits/rejected": -0.5025476813316345, "logps/chosen": -145.09603881835938, "logps/rejected": -81.32952880859375, "loss": 0.9326, "rewards/accuracies": 1.0, "rewards/chosen": 5.266156196594238, "rewards/margins": 2.897888422012329, "rewards/rejected": 2.368267774581909, "step": 3596 }, { "epoch": 0.58, "learning_rate": 9.74220178142827e-07, "logits/chosen": -0.5250923037528992, "logits/rejected": -0.5236225128173828, "logps/chosen": -46.746795654296875, "logps/rejected": -35.48478317260742, "loss": 0.4516, "rewards/accuracies": 0.0, "rewards/chosen": 0.3479881286621094, "rewards/margins": -0.2692314386367798, "rewards/rejected": 0.6172195672988892, "step": 3597 }, { "epoch": 0.58, "learning_rate": 9.741785057382436e-07, "logits/chosen": -0.6091664433479309, "logits/rejected": -0.5936225652694702, "logps/chosen": -87.56560516357422, "logps/rejected": -59.351783752441406, "loss": 0.3175, "rewards/accuracies": 1.0, "rewards/chosen": 1.5427674055099487, "rewards/margins": 0.39249956607818604, "rewards/rejected": 1.1502678394317627, "step": 3598 }, { "epoch": 0.58, "learning_rate": 9.741368005724124e-07, "logits/chosen": -0.411272257566452, "logits/rejected": -0.411272257566452, "logps/chosen": -51.98161315917969, "logps/rejected": -51.98161315917969, "loss": 0.3677, "rewards/accuracies": 0.0, "rewards/chosen": 1.2269699573516846, "rewards/margins": 0.0, "rewards/rejected": 1.2269699573516846, "step": 3599 }, { "epoch": 0.58, "learning_rate": 9.740950626482144e-07, "logits/chosen": -0.5932608246803284, "logits/rejected": -0.6013739705085754, "logps/chosen": -15.576919555664062, "logps/rejected": -6.554634094238281, "loss": 1.2116, "rewards/accuracies": 0.0, "rewards/chosen": 0.1812458038330078, "rewards/margins": -0.16933020949363708, "rewards/rejected": 0.3505760133266449, "step": 3600 }, { "epoch": 0.58, "learning_rate": 9.740532919685339e-07, "logits/chosen": -0.5486035943031311, "logits/rejected": -0.5486035943031311, "logps/chosen": -58.782161712646484, "logps/rejected": -58.782161712646484, "loss": 0.5231, "rewards/accuracies": 0.0, "rewards/chosen": 1.268850326538086, "rewards/margins": 0.0, "rewards/rejected": 1.268850326538086, "step": 3601 }, { "epoch": 0.58, "learning_rate": 9.74011488536256e-07, "logits/chosen": -0.49988681077957153, "logits/rejected": -0.6213319301605225, "logps/chosen": -83.33808898925781, "logps/rejected": -147.19583129882812, "loss": 2.4324, "rewards/accuracies": 0.0, "rewards/chosen": 2.1667983531951904, "rewards/margins": -4.826324462890625, "rewards/rejected": 6.9931230545043945, "step": 3602 }, { "epoch": 0.58, "learning_rate": 9.739696523542696e-07, "logits/chosen": -0.5604768395423889, "logits/rejected": -0.4987705945968628, "logps/chosen": -136.53399658203125, "logps/rejected": -70.34648132324219, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 4.3022003173828125, "rewards/margins": 1.9207572937011719, "rewards/rejected": 2.3814430236816406, "step": 3603 }, { "epoch": 0.58, "learning_rate": 9.739277834254649e-07, "logits/chosen": -0.7190118432044983, "logits/rejected": -0.6381463408470154, "logps/chosen": -75.29073333740234, "logps/rejected": -17.63956069946289, "loss": 0.4306, "rewards/accuracies": 1.0, "rewards/chosen": 2.7449791431427, "rewards/margins": 1.9027690887451172, "rewards/rejected": 0.8422099947929382, "step": 3604 }, { "epoch": 0.59, "learning_rate": 9.738858817527347e-07, "logits/chosen": -0.7860316634178162, "logits/rejected": -0.857833206653595, "logps/chosen": -185.29830932617188, "logps/rejected": -56.666465759277344, "loss": 0.464, "rewards/accuracies": 1.0, "rewards/chosen": 2.350384473800659, "rewards/margins": 0.1731734275817871, "rewards/rejected": 2.177211046218872, "step": 3605 }, { "epoch": 0.59, "learning_rate": 9.73843947338974e-07, "logits/chosen": -0.5916155576705933, "logits/rejected": -0.5422084331512451, "logps/chosen": -39.03017044067383, "logps/rejected": -6.596080780029297, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 1.5815280675888062, "rewards/margins": 0.9407424330711365, "rewards/rejected": 0.6407856345176697, "step": 3606 }, { "epoch": 0.59, "learning_rate": 9.738019801870802e-07, "logits/chosen": -0.9282316565513611, "logits/rejected": -0.9662988781929016, "logps/chosen": -107.03885650634766, "logps/rejected": -38.88106918334961, "loss": 0.7544, "rewards/accuracies": 0.0, "rewards/chosen": 1.3119102716445923, "rewards/margins": -0.8312240839004517, "rewards/rejected": 2.143134355545044, "step": 3607 }, { "epoch": 0.59, "learning_rate": 9.737599802999528e-07, "logits/chosen": -0.42620792984962463, "logits/rejected": -0.4272043704986572, "logps/chosen": -2.130671262741089, "logps/rejected": -3.338109016418457, "loss": 0.5058, "rewards/accuracies": 0.0, "rewards/chosen": 0.3369171917438507, "rewards/margins": -0.09217703342437744, "rewards/rejected": 0.42909422516822815, "step": 3608 }, { "epoch": 0.59, "learning_rate": 9.737179476804932e-07, "logits/chosen": -0.8358864784240723, "logits/rejected": -0.7408928871154785, "logps/chosen": -103.86465454101562, "logps/rejected": -104.62806701660156, "loss": 0.5141, "rewards/accuracies": 0.0, "rewards/chosen": 1.1302422285079956, "rewards/margins": -0.5656195878982544, "rewards/rejected": 1.69586181640625, "step": 3609 }, { "epoch": 0.59, "learning_rate": 9.73675882331606e-07, "logits/chosen": -0.42230430245399475, "logits/rejected": -0.4268430471420288, "logps/chosen": -57.470603942871094, "logps/rejected": -103.21863555908203, "loss": 1.65, "rewards/accuracies": 0.0, "rewards/chosen": 0.1533641815185547, "rewards/margins": -0.5150432586669922, "rewards/rejected": 0.6684074401855469, "step": 3610 }, { "epoch": 0.59, "learning_rate": 9.736337842561972e-07, "logits/chosen": -0.557183563709259, "logits/rejected": -0.557183563709259, "logps/chosen": -24.66394805908203, "logps/rejected": -24.66394805908203, "loss": 0.5977, "rewards/accuracies": 0.0, "rewards/chosen": -0.004093170166015625, "rewards/margins": 0.0, "rewards/rejected": -0.004093170166015625, "step": 3611 }, { "epoch": 0.59, "learning_rate": 9.735916534571756e-07, "logits/chosen": -0.5666142702102661, "logits/rejected": -0.5710386037826538, "logps/chosen": -86.83240509033203, "logps/rejected": -104.15252685546875, "loss": 1.2488, "rewards/accuracies": 0.0, "rewards/chosen": 0.4313697814941406, "rewards/margins": -0.3703811764717102, "rewards/rejected": 0.8017509579658508, "step": 3612 }, { "epoch": 0.59, "learning_rate": 9.73549489937452e-07, "logits/chosen": -0.5992671847343445, "logits/rejected": -0.4761013090610504, "logps/chosen": -52.28522491455078, "logps/rejected": -121.24000549316406, "loss": 0.854, "rewards/accuracies": 0.0, "rewards/chosen": 0.9256588220596313, "rewards/margins": -0.10258984565734863, "rewards/rejected": 1.02824866771698, "step": 3613 }, { "epoch": 0.59, "learning_rate": 9.73507293699939e-07, "logits/chosen": -0.38046231865882874, "logits/rejected": -0.38046231865882874, "logps/chosen": -25.866640090942383, "logps/rejected": -25.866640090942383, "loss": 0.8983, "rewards/accuracies": 0.0, "rewards/chosen": 1.0733057260513306, "rewards/margins": 0.0, "rewards/rejected": 1.0733057260513306, "step": 3614 }, { "epoch": 0.59, "learning_rate": 9.73465064747553e-07, "logits/chosen": -0.9525198340415955, "logits/rejected": -0.9726635813713074, "logps/chosen": -98.52935791015625, "logps/rejected": -63.7771110534668, "loss": 0.9325, "rewards/accuracies": 0.0, "rewards/chosen": 1.2856765985488892, "rewards/margins": -0.31984448432922363, "rewards/rejected": 1.6055210828781128, "step": 3615 }, { "epoch": 0.59, "learning_rate": 9.734228030832103e-07, "logits/chosen": -0.3378733992576599, "logits/rejected": -0.3378733992576599, "logps/chosen": -31.6221923828125, "logps/rejected": -31.6221923828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.3357502222061157, "rewards/margins": 0.0, "rewards/rejected": 1.3357502222061157, "step": 3616 }, { "epoch": 0.59, "learning_rate": 9.73380508709832e-07, "logits/chosen": -0.4102061986923218, "logits/rejected": -0.4622715711593628, "logps/chosen": -120.63021850585938, "logps/rejected": -133.26535034179688, "loss": 1.4873, "rewards/accuracies": 0.0, "rewards/chosen": 0.9554229974746704, "rewards/margins": -2.908024787902832, "rewards/rejected": 3.863447666168213, "step": 3617 }, { "epoch": 0.59, "learning_rate": 9.733381816303394e-07, "logits/chosen": -0.44816723465919495, "logits/rejected": -0.44816723465919495, "logps/chosen": -4.1908464431762695, "logps/rejected": -4.1908464431762695, "loss": 0.4051, "rewards/accuracies": 0.0, "rewards/chosen": 0.14172668755054474, "rewards/margins": 0.0, "rewards/rejected": 0.14172668755054474, "step": 3618 }, { "epoch": 0.59, "learning_rate": 9.732958218476573e-07, "logits/chosen": -0.2131706178188324, "logits/rejected": -0.2131706178188324, "logps/chosen": -76.17642211914062, "logps/rejected": -76.17642211914062, "loss": 0.7271, "rewards/accuracies": 0.0, "rewards/chosen": 1.8406356573104858, "rewards/margins": 0.0, "rewards/rejected": 1.8406356573104858, "step": 3619 }, { "epoch": 0.59, "learning_rate": 9.732534293647123e-07, "logits/chosen": -0.604089081287384, "logits/rejected": -0.5524577498435974, "logps/chosen": -105.67245483398438, "logps/rejected": -49.28058624267578, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 5.150045871734619, "rewards/margins": 3.059163808822632, "rewards/rejected": 2.0908820629119873, "step": 3620 }, { "epoch": 0.59, "learning_rate": 9.732110041844333e-07, "logits/chosen": -0.7117273807525635, "logits/rejected": -0.5590892434120178, "logps/chosen": -236.6697998046875, "logps/rejected": -55.196983337402344, "loss": 0.0757, "rewards/accuracies": 1.0, "rewards/chosen": 5.208996772766113, "rewards/margins": 2.8306963443756104, "rewards/rejected": 2.378300428390503, "step": 3621 }, { "epoch": 0.59, "learning_rate": 9.731685463097516e-07, "logits/chosen": -0.5488340854644775, "logits/rejected": -0.4992039203643799, "logps/chosen": -210.37796020507812, "logps/rejected": -98.23800659179688, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": 3.737286329269409, "rewards/margins": 1.5991911888122559, "rewards/rejected": 2.1380951404571533, "step": 3622 }, { "epoch": 0.59, "learning_rate": 9.731260557436003e-07, "logits/chosen": -0.359896183013916, "logits/rejected": -0.399817556142807, "logps/chosen": -129.90516662597656, "logps/rejected": -86.05992889404297, "loss": 0.3216, "rewards/accuracies": 1.0, "rewards/chosen": 2.3410842418670654, "rewards/margins": 0.34084153175354004, "rewards/rejected": 2.0002427101135254, "step": 3623 }, { "epoch": 0.59, "learning_rate": 9.730835324889155e-07, "logits/chosen": -0.48017194867134094, "logits/rejected": -0.5704972743988037, "logps/chosen": -123.70841979980469, "logps/rejected": -120.9309310913086, "loss": 2.1929, "rewards/accuracies": 0.0, "rewards/chosen": 0.8488754630088806, "rewards/margins": -4.268067359924316, "rewards/rejected": 5.116942882537842, "step": 3624 }, { "epoch": 0.59, "learning_rate": 9.73040976548635e-07, "logits/chosen": -0.7446248531341553, "logits/rejected": -0.7020704746246338, "logps/chosen": -82.0352554321289, "logps/rejected": -62.824581146240234, "loss": 0.7899, "rewards/accuracies": 0.0, "rewards/chosen": 1.2624915838241577, "rewards/margins": -1.2128194570541382, "rewards/rejected": 2.475311040878296, "step": 3625 }, { "epoch": 0.59, "learning_rate": 9.729983879256986e-07, "logits/chosen": -0.49478664994239807, "logits/rejected": -0.5971609354019165, "logps/chosen": -45.54920959472656, "logps/rejected": -118.9988021850586, "loss": 1.3842, "rewards/accuracies": 1.0, "rewards/chosen": 1.9654388427734375, "rewards/margins": 0.26368939876556396, "rewards/rejected": 1.7017494440078735, "step": 3626 }, { "epoch": 0.59, "learning_rate": 9.729557666230494e-07, "logits/chosen": -0.6804348230361938, "logits/rejected": -0.6527903079986572, "logps/chosen": -58.373130798339844, "logps/rejected": -70.25994873046875, "loss": 0.8435, "rewards/accuracies": 0.0, "rewards/chosen": 0.6590995788574219, "rewards/margins": -1.1443970203399658, "rewards/rejected": 1.8034965991973877, "step": 3627 }, { "epoch": 0.59, "learning_rate": 9.72913112643632e-07, "logits/chosen": -0.7355701327323914, "logits/rejected": -0.704825758934021, "logps/chosen": -74.75011444091797, "logps/rejected": -70.92811584472656, "loss": 1.1047, "rewards/accuracies": 0.0, "rewards/chosen": 0.6290473937988281, "rewards/margins": -1.7188668251037598, "rewards/rejected": 2.347914218902588, "step": 3628 }, { "epoch": 0.59, "learning_rate": 9.72870425990393e-07, "logits/chosen": -0.8092342615127563, "logits/rejected": -0.8303768038749695, "logps/chosen": -48.21746826171875, "logps/rejected": -95.63606262207031, "loss": 0.4968, "rewards/accuracies": 0.0, "rewards/chosen": 0.20147399604320526, "rewards/margins": -0.4473968744277954, "rewards/rejected": 0.6488708853721619, "step": 3629 }, { "epoch": 0.59, "learning_rate": 9.72827706666282e-07, "logits/chosen": -0.40938353538513184, "logits/rejected": -0.40734371542930603, "logps/chosen": -64.53911590576172, "logps/rejected": -36.10939407348633, "loss": 0.945, "rewards/accuracies": 0.0, "rewards/chosen": 1.0868148803710938, "rewards/margins": -0.7975025177001953, "rewards/rejected": 1.884317398071289, "step": 3630 }, { "epoch": 0.59, "learning_rate": 9.727849546742506e-07, "logits/chosen": -0.2888195216655731, "logits/rejected": -0.29840442538261414, "logps/chosen": -4.087017059326172, "logps/rejected": -1.8926011323928833, "loss": 0.6836, "rewards/accuracies": 0.0, "rewards/chosen": 0.17574191093444824, "rewards/margins": -0.21202334761619568, "rewards/rejected": 0.3877652585506439, "step": 3631 }, { "epoch": 0.59, "learning_rate": 9.72742170017252e-07, "logits/chosen": -0.6400720477104187, "logits/rejected": -0.6480879187583923, "logps/chosen": -52.208518981933594, "logps/rejected": -48.57112121582031, "loss": 0.7792, "rewards/accuracies": 0.0, "rewards/chosen": 0.8517902493476868, "rewards/margins": -1.083808422088623, "rewards/rejected": 1.9355987310409546, "step": 3632 }, { "epoch": 0.59, "learning_rate": 9.72699352698243e-07, "logits/chosen": -0.7338889837265015, "logits/rejected": -0.7338889837265015, "logps/chosen": -4.6131062507629395, "logps/rejected": -4.6131062507629395, "loss": 1.1381, "rewards/accuracies": 0.0, "rewards/chosen": 0.5977930426597595, "rewards/margins": 0.0, "rewards/rejected": 0.5977930426597595, "step": 3633 }, { "epoch": 0.59, "learning_rate": 9.726565027201813e-07, "logits/chosen": -0.27988290786743164, "logits/rejected": -0.2063920646905899, "logps/chosen": -62.17332458496094, "logps/rejected": -56.23524856567383, "loss": 1.1674, "rewards/accuracies": 0.0, "rewards/chosen": 1.2155426740646362, "rewards/margins": -1.1887584924697876, "rewards/rejected": 2.404301166534424, "step": 3634 }, { "epoch": 0.59, "learning_rate": 9.726136200860273e-07, "logits/chosen": -0.542510449886322, "logits/rejected": -0.5324831008911133, "logps/chosen": -88.88221740722656, "logps/rejected": -74.71066284179688, "loss": 0.7632, "rewards/accuracies": 0.0, "rewards/chosen": 0.5899208188056946, "rewards/margins": -0.766131579875946, "rewards/rejected": 1.3560523986816406, "step": 3635 }, { "epoch": 0.59, "learning_rate": 9.725707047987443e-07, "logits/chosen": -0.827311635017395, "logits/rejected": -0.8488110303878784, "logps/chosen": -78.0055923461914, "logps/rejected": -144.23374938964844, "loss": 0.9755, "rewards/accuracies": 0.0, "rewards/chosen": 1.5790550708770752, "rewards/margins": -0.056899189949035645, "rewards/rejected": 1.6359542608261108, "step": 3636 }, { "epoch": 0.59, "learning_rate": 9.72527756861297e-07, "logits/chosen": -0.795861005783081, "logits/rejected": -0.7411721348762512, "logps/chosen": -64.12893676757812, "logps/rejected": -19.82815170288086, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": 1.1539344787597656, "rewards/margins": 0.7409602999687195, "rewards/rejected": 0.41297417879104614, "step": 3637 }, { "epoch": 0.59, "learning_rate": 9.72484776276653e-07, "logits/chosen": -0.6165683269500732, "logits/rejected": -0.5213137269020081, "logps/chosen": -62.62884521484375, "logps/rejected": -20.961917877197266, "loss": 0.9182, "rewards/accuracies": 1.0, "rewards/chosen": 2.1008827686309814, "rewards/margins": 1.4674065113067627, "rewards/rejected": 0.6334762573242188, "step": 3638 }, { "epoch": 0.59, "learning_rate": 9.724417630477815e-07, "logits/chosen": -0.6654958128929138, "logits/rejected": -0.5728248953819275, "logps/chosen": -95.96345520019531, "logps/rejected": -62.99311828613281, "loss": 0.7641, "rewards/accuracies": 0.0, "rewards/chosen": 1.562689185142517, "rewards/margins": -0.2661949396133423, "rewards/rejected": 1.8288841247558594, "step": 3639 }, { "epoch": 0.59, "learning_rate": 9.723987171776545e-07, "logits/chosen": -0.46462562680244446, "logits/rejected": -0.46462562680244446, "logps/chosen": -2.065477132797241, "logps/rejected": -2.065477132797241, "loss": 0.3542, "rewards/accuracies": 0.0, "rewards/chosen": 0.2908169627189636, "rewards/margins": 0.0, "rewards/rejected": 0.2908169627189636, "step": 3640 }, { "epoch": 0.59, "learning_rate": 9.72355638669246e-07, "logits/chosen": -0.7659043073654175, "logits/rejected": -0.6882256865501404, "logps/chosen": -68.58619689941406, "logps/rejected": -88.55511474609375, "loss": 0.4619, "rewards/accuracies": 1.0, "rewards/chosen": 1.3911850452423096, "rewards/margins": 0.38961029052734375, "rewards/rejected": 1.0015747547149658, "step": 3641 }, { "epoch": 0.59, "learning_rate": 9.723125275255323e-07, "logits/chosen": -0.7334756255149841, "logits/rejected": -0.5977514982223511, "logps/chosen": -115.89480590820312, "logps/rejected": -101.85240936279297, "loss": 0.9381, "rewards/accuracies": 1.0, "rewards/chosen": 2.958822727203369, "rewards/margins": 0.36334776878356934, "rewards/rejected": 2.5954749584198, "step": 3642 }, { "epoch": 0.59, "learning_rate": 9.722693837494922e-07, "logits/chosen": -0.7677997350692749, "logits/rejected": -0.7663711905479431, "logps/chosen": -81.34392547607422, "logps/rejected": -133.1471710205078, "loss": 0.6188, "rewards/accuracies": 0.0, "rewards/chosen": 1.502711534500122, "rewards/margins": -0.03647768497467041, "rewards/rejected": 1.5391892194747925, "step": 3643 }, { "epoch": 0.59, "learning_rate": 9.722262073441062e-07, "logits/chosen": -0.5982029438018799, "logits/rejected": -0.5183539390563965, "logps/chosen": -110.6989974975586, "logps/rejected": -68.93086242675781, "loss": 0.4846, "rewards/accuracies": 0.0, "rewards/chosen": 2.550412893295288, "rewards/margins": -0.4343910217285156, "rewards/rejected": 2.9848039150238037, "step": 3644 }, { "epoch": 0.59, "learning_rate": 9.721829983123575e-07, "logits/chosen": -0.35880064964294434, "logits/rejected": -0.2429274618625641, "logps/chosen": -69.84098052978516, "logps/rejected": -46.751808166503906, "loss": 0.8209, "rewards/accuracies": 1.0, "rewards/chosen": 0.41802141070365906, "rewards/margins": 0.29735490679740906, "rewards/rejected": 0.12066650390625, "step": 3645 }, { "epoch": 0.59, "learning_rate": 9.721397566572313e-07, "logits/chosen": -0.23798972368240356, "logits/rejected": -0.20220869779586792, "logps/chosen": -52.77342224121094, "logps/rejected": -46.467620849609375, "loss": 0.3248, "rewards/accuracies": 1.0, "rewards/chosen": 2.2400033473968506, "rewards/margins": 0.908998966217041, "rewards/rejected": 1.3310043811798096, "step": 3646 }, { "epoch": 0.59, "learning_rate": 9.720964823817157e-07, "logits/chosen": -0.5981970429420471, "logits/rejected": -0.6125998497009277, "logps/chosen": -86.2538070678711, "logps/rejected": -58.48845672607422, "loss": 0.5344, "rewards/accuracies": 0.0, "rewards/chosen": 1.275397539138794, "rewards/margins": -0.6140700578689575, "rewards/rejected": 1.8894675970077515, "step": 3647 }, { "epoch": 0.59, "learning_rate": 9.720531754888e-07, "logits/chosen": -0.29884061217308044, "logits/rejected": -0.219352126121521, "logps/chosen": -88.15345764160156, "logps/rejected": -87.74630737304688, "loss": 0.8737, "rewards/accuracies": 0.0, "rewards/chosen": 1.243719458580017, "rewards/margins": -1.525152564048767, "rewards/rejected": 2.768872022628784, "step": 3648 }, { "epoch": 0.59, "learning_rate": 9.720098359814763e-07, "logits/chosen": -0.5930348634719849, "logits/rejected": -0.5964899063110352, "logps/chosen": -6.5840349197387695, "logps/rejected": -3.878863573074341, "loss": 1.1713, "rewards/accuracies": 0.0, "rewards/chosen": 0.06977663189172745, "rewards/margins": -0.1943526268005371, "rewards/rejected": 0.26412925124168396, "step": 3649 }, { "epoch": 0.59, "learning_rate": 9.719664638627394e-07, "logits/chosen": -0.45652535557746887, "logits/rejected": -0.37165582180023193, "logps/chosen": -57.56866455078125, "logps/rejected": -19.05170440673828, "loss": 0.302, "rewards/accuracies": 1.0, "rewards/chosen": 0.7736579775810242, "rewards/margins": 0.4818292558193207, "rewards/rejected": 0.2918287217617035, "step": 3650 }, { "epoch": 0.59, "learning_rate": 9.719230591355857e-07, "logits/chosen": -0.5618603229522705, "logits/rejected": -0.4027262032032013, "logps/chosen": -105.51553344726562, "logps/rejected": -81.21066284179688, "loss": 0.4672, "rewards/accuracies": 1.0, "rewards/chosen": 0.5083595514297485, "rewards/margins": 0.3357948660850525, "rewards/rejected": 0.17256470024585724, "step": 3651 }, { "epoch": 0.59, "learning_rate": 9.718796218030137e-07, "logits/chosen": -0.4862203598022461, "logits/rejected": -0.5171658992767334, "logps/chosen": -75.79681396484375, "logps/rejected": -98.68437957763672, "loss": 1.273, "rewards/accuracies": 0.0, "rewards/chosen": 1.4046531915664673, "rewards/margins": -2.443903923034668, "rewards/rejected": 3.848557233810425, "step": 3652 }, { "epoch": 0.59, "learning_rate": 9.718361518680249e-07, "logits/chosen": -0.5333731770515442, "logits/rejected": -0.47793617844581604, "logps/chosen": -56.52109909057617, "logps/rejected": -125.05105590820312, "loss": 3.1505, "rewards/accuracies": 0.0, "rewards/chosen": 2.0799381732940674, "rewards/margins": -3.8234035968780518, "rewards/rejected": 5.903341770172119, "step": 3653 }, { "epoch": 0.59, "learning_rate": 9.717926493336226e-07, "logits/chosen": -0.8121545910835266, "logits/rejected": -0.7836490273475647, "logps/chosen": -116.4349136352539, "logps/rejected": -84.09651184082031, "loss": 1.2453, "rewards/accuracies": 0.0, "rewards/chosen": 0.8228965997695923, "rewards/margins": -2.3470029830932617, "rewards/rejected": 3.1698997020721436, "step": 3654 }, { "epoch": 0.59, "learning_rate": 9.717491142028125e-07, "logits/chosen": -0.6864644885063171, "logits/rejected": -0.7038664817810059, "logps/chosen": -129.66595458984375, "logps/rejected": -56.32585144042969, "loss": 0.7156, "rewards/accuracies": 1.0, "rewards/chosen": 3.1612274646759033, "rewards/margins": 0.7938704490661621, "rewards/rejected": 2.367357015609741, "step": 3655 }, { "epoch": 0.59, "learning_rate": 9.717055464786021e-07, "logits/chosen": -1.0604534149169922, "logits/rejected": -1.0732585191726685, "logps/chosen": -173.58425903320312, "logps/rejected": -99.59660339355469, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 2.4933807849884033, "rewards/margins": 0.9925743341445923, "rewards/rejected": 1.500806450843811, "step": 3656 }, { "epoch": 0.59, "learning_rate": 9.716619461640019e-07, "logits/chosen": -0.9354894161224365, "logits/rejected": -0.9142134189605713, "logps/chosen": -69.33177185058594, "logps/rejected": -79.16908264160156, "loss": 0.6973, "rewards/accuracies": 1.0, "rewards/chosen": 2.7767410278320312, "rewards/margins": 1.0418593883514404, "rewards/rejected": 1.7348816394805908, "step": 3657 }, { "epoch": 0.59, "learning_rate": 9.716183132620241e-07, "logits/chosen": 0.0679776594042778, "logits/rejected": 0.06649588793516159, "logps/chosen": -5.178131580352783, "logps/rejected": -4.133542060852051, "loss": 0.4205, "rewards/accuracies": 1.0, "rewards/chosen": 0.09031128883361816, "rewards/margins": 0.028253696858882904, "rewards/rejected": 0.06205759197473526, "step": 3658 }, { "epoch": 0.59, "learning_rate": 9.715746477756835e-07, "logits/chosen": -0.530738353729248, "logits/rejected": -0.530738353729248, "logps/chosen": -77.54522705078125, "logps/rejected": -77.54522705078125, "loss": 0.7446, "rewards/accuracies": 0.0, "rewards/chosen": 0.8267745971679688, "rewards/margins": 0.0, "rewards/rejected": 0.8267745971679688, "step": 3659 }, { "epoch": 0.59, "learning_rate": 9.715309497079966e-07, "logits/chosen": -0.37576743960380554, "logits/rejected": -0.3688647449016571, "logps/chosen": -11.209978103637695, "logps/rejected": -8.145637512207031, "loss": 0.7256, "rewards/accuracies": 1.0, "rewards/chosen": 0.24637480080127716, "rewards/margins": 0.007355883717536926, "rewards/rejected": 0.23901891708374023, "step": 3660 }, { "epoch": 0.59, "learning_rate": 9.714872190619827e-07, "logits/chosen": -0.7470726370811462, "logits/rejected": -0.7078404426574707, "logps/chosen": -86.04154205322266, "logps/rejected": -104.2620620727539, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 1.418413519859314, "rewards/margins": 0.7432311773300171, "rewards/rejected": 0.6751823425292969, "step": 3661 }, { "epoch": 0.59, "learning_rate": 9.714434558406634e-07, "logits/chosen": -0.38824188709259033, "logits/rejected": -0.378002792596817, "logps/chosen": -23.06434440612793, "logps/rejected": -1.6962053775787354, "loss": 0.7342, "rewards/accuracies": 0.0, "rewards/chosen": -0.09701328724622726, "rewards/margins": -0.5124074816703796, "rewards/rejected": 0.41539421677589417, "step": 3662 }, { "epoch": 0.59, "learning_rate": 9.713996600470622e-07, "logits/chosen": -0.858106791973114, "logits/rejected": -0.9289137721061707, "logps/chosen": -300.29803466796875, "logps/rejected": -163.9344482421875, "loss": 1.3991, "rewards/accuracies": 0.0, "rewards/chosen": 2.268942356109619, "rewards/margins": -2.6414170265197754, "rewards/rejected": 4.9103593826293945, "step": 3663 }, { "epoch": 0.59, "learning_rate": 9.713558316842047e-07, "logits/chosen": -0.4430571496486664, "logits/rejected": -0.34164705872535706, "logps/chosen": -47.520042419433594, "logps/rejected": -53.5396614074707, "loss": 1.7917, "rewards/accuracies": 1.0, "rewards/chosen": 1.3664321899414062, "rewards/margins": 0.7566967010498047, "rewards/rejected": 0.6097354888916016, "step": 3664 }, { "epoch": 0.59, "learning_rate": 9.713119707551192e-07, "logits/chosen": -0.5047406554222107, "logits/rejected": -0.5258166790008545, "logps/chosen": -173.92282104492188, "logps/rejected": -50.1790771484375, "loss": 0.2491, "rewards/accuracies": 1.0, "rewards/chosen": 2.585108995437622, "rewards/margins": 1.3415402173995972, "rewards/rejected": 1.243568778038025, "step": 3665 }, { "epoch": 0.6, "learning_rate": 9.712680772628363e-07, "logits/chosen": -0.4749583601951599, "logits/rejected": -0.475381463766098, "logps/chosen": -4.0258073806762695, "logps/rejected": -14.038742065429688, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": 0.084180548787117, "rewards/margins": 0.1344596892595291, "rewards/rejected": -0.05027914047241211, "step": 3666 }, { "epoch": 0.6, "learning_rate": 9.712241512103883e-07, "logits/chosen": -0.30886343121528625, "logits/rejected": -0.3033088147640228, "logps/chosen": -125.35078430175781, "logps/rejected": -89.58987426757812, "loss": 0.3896, "rewards/accuracies": 1.0, "rewards/chosen": 3.816793918609619, "rewards/margins": 0.9023149013519287, "rewards/rejected": 2.9144790172576904, "step": 3667 }, { "epoch": 0.6, "learning_rate": 9.711801926008105e-07, "logits/chosen": -0.3606771230697632, "logits/rejected": -0.23948167264461517, "logps/chosen": -126.50932312011719, "logps/rejected": -98.47868347167969, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 3.568202257156372, "rewards/margins": 1.680883765220642, "rewards/rejected": 1.88731849193573, "step": 3668 }, { "epoch": 0.6, "learning_rate": 9.711362014371393e-07, "logits/chosen": -1.0380260944366455, "logits/rejected": -0.9450899958610535, "logps/chosen": -117.47145080566406, "logps/rejected": -21.862550735473633, "loss": 0.0489, "rewards/accuracies": 1.0, "rewards/chosen": 6.321531772613525, "rewards/margins": 5.8669962882995605, "rewards/rejected": 0.4545353055000305, "step": 3669 }, { "epoch": 0.6, "learning_rate": 9.710921777224147e-07, "logits/chosen": -0.993916392326355, "logits/rejected": -0.9531379342079163, "logps/chosen": -196.7030029296875, "logps/rejected": -146.99557495117188, "loss": 1.1382, "rewards/accuracies": 0.0, "rewards/chosen": 3.877699375152588, "rewards/margins": -1.2761626243591309, "rewards/rejected": 5.153861999511719, "step": 3670 }, { "epoch": 0.6, "learning_rate": 9.710481214596785e-07, "logits/chosen": -0.6665138602256775, "logits/rejected": -0.6647756695747375, "logps/chosen": -170.91458129882812, "logps/rejected": -88.06773376464844, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 1.758703589439392, "rewards/margins": 1.508928656578064, "rewards/rejected": 0.24977493286132812, "step": 3671 }, { "epoch": 0.6, "learning_rate": 9.710040326519737e-07, "logits/chosen": -0.5562587380409241, "logits/rejected": -0.53801029920578, "logps/chosen": -85.55612182617188, "logps/rejected": -50.41786193847656, "loss": 0.5386, "rewards/accuracies": 0.0, "rewards/chosen": 0.8665657043457031, "rewards/margins": -0.36612164974212646, "rewards/rejected": 1.2326873540878296, "step": 3672 }, { "epoch": 0.6, "learning_rate": 9.709599113023472e-07, "logits/chosen": -0.5321239829063416, "logits/rejected": -0.5479617714881897, "logps/chosen": -109.08192443847656, "logps/rejected": -73.40194702148438, "loss": 0.7014, "rewards/accuracies": 1.0, "rewards/chosen": 1.0295876264572144, "rewards/margins": 0.3263534903526306, "rewards/rejected": 0.7032341361045837, "step": 3673 }, { "epoch": 0.6, "learning_rate": 9.70915757413847e-07, "logits/chosen": -0.3097497224807739, "logits/rejected": -0.22600997984409332, "logps/chosen": -81.4955062866211, "logps/rejected": -37.452964782714844, "loss": 0.7705, "rewards/accuracies": 0.0, "rewards/chosen": 0.61110919713974, "rewards/margins": -0.40746158361434937, "rewards/rejected": 1.0185707807540894, "step": 3674 }, { "epoch": 0.6, "learning_rate": 9.708715709895237e-07, "logits/chosen": -0.9071955680847168, "logits/rejected": -0.8829414248466492, "logps/chosen": -111.43126678466797, "logps/rejected": -68.25972747802734, "loss": 1.6115, "rewards/accuracies": 0.0, "rewards/chosen": 0.7607582211494446, "rewards/margins": -3.0780029296875, "rewards/rejected": 3.8387610912323, "step": 3675 }, { "epoch": 0.6, "learning_rate": 9.708273520324306e-07, "logits/chosen": -0.6461285352706909, "logits/rejected": -0.6376320123672485, "logps/chosen": -113.35261535644531, "logps/rejected": -100.2953109741211, "loss": 0.3563, "rewards/accuracies": 1.0, "rewards/chosen": 2.9666764736175537, "rewards/margins": 0.2991218566894531, "rewards/rejected": 2.6675546169281006, "step": 3676 }, { "epoch": 0.6, "learning_rate": 9.707831005456222e-07, "logits/chosen": -0.17751240730285645, "logits/rejected": -0.30082741379737854, "logps/chosen": -85.487060546875, "logps/rejected": -82.68672180175781, "loss": 0.6668, "rewards/accuracies": 0.0, "rewards/chosen": 0.7767059206962585, "rewards/margins": -0.9575241208076477, "rewards/rejected": 1.7342300415039062, "step": 3677 }, { "epoch": 0.6, "learning_rate": 9.707388165321561e-07, "logits/chosen": -0.79241544008255, "logits/rejected": -0.7646543383598328, "logps/chosen": -51.0001335144043, "logps/rejected": -57.75390625, "loss": 0.314, "rewards/accuracies": 1.0, "rewards/chosen": 2.2050023078918457, "rewards/margins": 1.093376636505127, "rewards/rejected": 1.1116256713867188, "step": 3678 }, { "epoch": 0.6, "learning_rate": 9.706944999950921e-07, "logits/chosen": -0.999271810054779, "logits/rejected": -1.024536371231079, "logps/chosen": -68.79141235351562, "logps/rejected": -153.3663330078125, "loss": 3.0344, "rewards/accuracies": 0.0, "rewards/chosen": 1.2720855474472046, "rewards/margins": -5.477067470550537, "rewards/rejected": 6.749153137207031, "step": 3679 }, { "epoch": 0.6, "learning_rate": 9.70650150937492e-07, "logits/chosen": -0.47274380922317505, "logits/rejected": -0.47058558464050293, "logps/chosen": -2.1510744094848633, "logps/rejected": -17.342464447021484, "loss": 0.421, "rewards/accuracies": 1.0, "rewards/chosen": 0.20644746720790863, "rewards/margins": 0.1978808045387268, "rewards/rejected": 0.008566665463149548, "step": 3680 }, { "epoch": 0.6, "learning_rate": 9.706057693624197e-07, "logits/chosen": -0.2534290850162506, "logits/rejected": -0.27639898657798767, "logps/chosen": -1.1346235275268555, "logps/rejected": -23.432382583618164, "loss": 0.3388, "rewards/accuracies": 1.0, "rewards/chosen": 0.27580782771110535, "rewards/margins": 0.12665125727653503, "rewards/rejected": 0.1491565704345703, "step": 3681 }, { "epoch": 0.6, "learning_rate": 9.705613552729415e-07, "logits/chosen": -0.6906984448432922, "logits/rejected": -0.6975403428077698, "logps/chosen": -10.07746696472168, "logps/rejected": -2.8446848392486572, "loss": 0.6134, "rewards/accuracies": 0.0, "rewards/chosen": 0.13952389359474182, "rewards/margins": -0.2856833338737488, "rewards/rejected": 0.4252072274684906, "step": 3682 }, { "epoch": 0.6, "learning_rate": 9.705169086721264e-07, "logits/chosen": -0.5585240721702576, "logits/rejected": -0.4834645688533783, "logps/chosen": -56.763633728027344, "logps/rejected": -71.17000579833984, "loss": 1.1752, "rewards/accuracies": 0.0, "rewards/chosen": 2.0964455604553223, "rewards/margins": -1.1016905307769775, "rewards/rejected": 3.1981360912323, "step": 3683 }, { "epoch": 0.6, "learning_rate": 9.704724295630447e-07, "logits/chosen": -0.8660466074943542, "logits/rejected": -0.7532925009727478, "logps/chosen": -166.95477294921875, "logps/rejected": -95.34685516357422, "loss": 1.803, "rewards/accuracies": 0.0, "rewards/chosen": 0.4994369447231293, "rewards/margins": -1.4959602355957031, "rewards/rejected": 1.9953972101211548, "step": 3684 }, { "epoch": 0.6, "learning_rate": 9.7042791794877e-07, "logits/chosen": -0.40219730138778687, "logits/rejected": -0.42271196842193604, "logps/chosen": -77.82257843017578, "logps/rejected": -66.12386322021484, "loss": 0.9248, "rewards/accuracies": 0.0, "rewards/chosen": 0.5795829892158508, "rewards/margins": -0.811444103717804, "rewards/rejected": 1.3910270929336548, "step": 3685 }, { "epoch": 0.6, "learning_rate": 9.703833738323772e-07, "logits/chosen": -0.3605530261993408, "logits/rejected": -0.21030810475349426, "logps/chosen": -64.166748046875, "logps/rejected": -32.25853729248047, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 1.2010284662246704, "rewards/margins": 1.0057308673858643, "rewards/rejected": 0.19529762864112854, "step": 3686 }, { "epoch": 0.6, "learning_rate": 9.703387972169443e-07, "logits/chosen": -0.4108312726020813, "logits/rejected": -0.4293150305747986, "logps/chosen": -169.45301818847656, "logps/rejected": -86.11560821533203, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 3.2897064685821533, "rewards/margins": 2.5861458778381348, "rewards/rejected": 0.7035606503486633, "step": 3687 }, { "epoch": 0.6, "learning_rate": 9.702941881055508e-07, "logits/chosen": -0.08596208691596985, "logits/rejected": -0.08596208691596985, "logps/chosen": -2.292422294616699, "logps/rejected": -2.292422294616699, "loss": 0.5499, "rewards/accuracies": 0.0, "rewards/chosen": 0.20316290855407715, "rewards/margins": 0.0, "rewards/rejected": 0.20316290855407715, "step": 3688 }, { "epoch": 0.6, "learning_rate": 9.702495465012787e-07, "logits/chosen": -0.5585616230964661, "logits/rejected": -0.4963398873806, "logps/chosen": -54.4046630859375, "logps/rejected": -73.48516845703125, "loss": 1.2367, "rewards/accuracies": 0.0, "rewards/chosen": 1.2965888977050781, "rewards/margins": -1.207183837890625, "rewards/rejected": 2.503772735595703, "step": 3689 }, { "epoch": 0.6, "learning_rate": 9.702048724072127e-07, "logits/chosen": -0.6029136776924133, "logits/rejected": -0.6029136776924133, "logps/chosen": -79.1390380859375, "logps/rejected": -79.1390380859375, "loss": 0.7606, "rewards/accuracies": 0.0, "rewards/chosen": 1.9519981145858765, "rewards/margins": 0.0, "rewards/rejected": 1.9519981145858765, "step": 3690 }, { "epoch": 0.6, "learning_rate": 9.70160165826439e-07, "logits/chosen": -0.5988553166389465, "logits/rejected": -0.5654775500297546, "logps/chosen": -62.668617248535156, "logps/rejected": -59.75458908081055, "loss": 1.5277, "rewards/accuracies": 0.0, "rewards/chosen": 2.269153594970703, "rewards/margins": -0.691725492477417, "rewards/rejected": 2.96087908744812, "step": 3691 }, { "epoch": 0.6, "learning_rate": 9.701154267620468e-07, "logits/chosen": -0.5951094627380371, "logits/rejected": -0.3565857708454132, "logps/chosen": -68.7480239868164, "logps/rejected": -53.254974365234375, "loss": 0.7096, "rewards/accuracies": 1.0, "rewards/chosen": 2.7349159717559814, "rewards/margins": 1.2606849670410156, "rewards/rejected": 1.4742310047149658, "step": 3692 }, { "epoch": 0.6, "learning_rate": 9.700706552171267e-07, "logits/chosen": -1.2006733417510986, "logits/rejected": -1.221759557723999, "logps/chosen": -70.72219848632812, "logps/rejected": -58.74812698364258, "loss": 2.0239, "rewards/accuracies": 0.0, "rewards/chosen": 0.5806289911270142, "rewards/margins": -1.811008095741272, "rewards/rejected": 2.391637086868286, "step": 3693 }, { "epoch": 0.6, "learning_rate": 9.700258511947722e-07, "logits/chosen": -0.7329253554344177, "logits/rejected": -0.8826974630355835, "logps/chosen": -136.13185119628906, "logps/rejected": -126.13297271728516, "loss": 2.3452, "rewards/accuracies": 0.0, "rewards/chosen": 1.5398346185684204, "rewards/margins": -2.9687461853027344, "rewards/rejected": 4.508580684661865, "step": 3694 }, { "epoch": 0.6, "learning_rate": 9.699810146980788e-07, "logits/chosen": -0.3227679431438446, "logits/rejected": -0.3227679431438446, "logps/chosen": -0.9657512307167053, "logps/rejected": -0.9657512307167053, "loss": 1.8538, "rewards/accuracies": 0.0, "rewards/chosen": 0.05168599635362625, "rewards/margins": 0.0, "rewards/rejected": 0.05168599635362625, "step": 3695 }, { "epoch": 0.6, "learning_rate": 9.699361457301443e-07, "logits/chosen": -0.5442081689834595, "logits/rejected": -0.44651171565055847, "logps/chosen": -134.03384399414062, "logps/rejected": -64.30823516845703, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": 3.820513963699341, "rewards/margins": 1.172844648361206, "rewards/rejected": 2.6476693153381348, "step": 3696 }, { "epoch": 0.6, "learning_rate": 9.698912442940685e-07, "logits/chosen": -0.5570109486579895, "logits/rejected": -0.5430555939674377, "logps/chosen": -113.81573486328125, "logps/rejected": -53.06760787963867, "loss": 0.9843, "rewards/accuracies": 0.0, "rewards/chosen": 1.2361290454864502, "rewards/margins": -0.6131983995437622, "rewards/rejected": 1.8493274450302124, "step": 3697 }, { "epoch": 0.6, "learning_rate": 9.698463103929541e-07, "logits/chosen": -0.5442333817481995, "logits/rejected": -0.5712334513664246, "logps/chosen": -45.94684982299805, "logps/rejected": -104.20952606201172, "loss": 0.5102, "rewards/accuracies": 0.0, "rewards/chosen": 0.8745647668838501, "rewards/margins": -0.25094640254974365, "rewards/rejected": 1.1255111694335938, "step": 3698 }, { "epoch": 0.6, "learning_rate": 9.698013440299054e-07, "logits/chosen": -0.3448772430419922, "logits/rejected": -0.3595713675022125, "logps/chosen": -1.461925745010376, "logps/rejected": -38.48250961303711, "loss": 0.4456, "rewards/accuracies": 0.0, "rewards/chosen": 0.35161712765693665, "rewards/margins": -0.19283214211463928, "rewards/rejected": 0.5444492697715759, "step": 3699 }, { "epoch": 0.6, "learning_rate": 9.697563452080291e-07, "logits/chosen": -0.5817950963973999, "logits/rejected": -0.5513469576835632, "logps/chosen": -104.30120086669922, "logps/rejected": -102.7960205078125, "loss": 0.4955, "rewards/accuracies": 1.0, "rewards/chosen": 1.1945641040802002, "rewards/margins": 0.17256391048431396, "rewards/rejected": 1.0220001935958862, "step": 3700 }, { "epoch": 0.6, "learning_rate": 9.69711313930434e-07, "logits/chosen": -0.6086913347244263, "logits/rejected": -0.6098082065582275, "logps/chosen": -143.57615661621094, "logps/rejected": -75.08395385742188, "loss": 1.4764, "rewards/accuracies": 0.0, "rewards/chosen": 1.3006607294082642, "rewards/margins": -1.679772973060608, "rewards/rejected": 2.980433702468872, "step": 3701 }, { "epoch": 0.6, "learning_rate": 9.696662502002318e-07, "logits/chosen": -0.9363308548927307, "logits/rejected": -0.3696412146091461, "logps/chosen": -91.29257202148438, "logps/rejected": -151.50918579101562, "loss": 1.5277, "rewards/accuracies": 0.0, "rewards/chosen": 0.6420562863349915, "rewards/margins": -0.30626678466796875, "rewards/rejected": 0.9483230710029602, "step": 3702 }, { "epoch": 0.6, "learning_rate": 9.696211540205358e-07, "logits/chosen": -1.0577027797698975, "logits/rejected": -0.9643285870552063, "logps/chosen": -139.61390686035156, "logps/rejected": -20.350112915039062, "loss": 0.3295, "rewards/accuracies": 1.0, "rewards/chosen": 7.505568027496338, "rewards/margins": 6.920517444610596, "rewards/rejected": 0.5850505828857422, "step": 3703 }, { "epoch": 0.6, "learning_rate": 9.695760253944613e-07, "logits/chosen": -0.5455767512321472, "logits/rejected": -0.5816189646720886, "logps/chosen": -91.43473815917969, "logps/rejected": -108.6017837524414, "loss": 0.2053, "rewards/accuracies": 1.0, "rewards/chosen": 1.2655242681503296, "rewards/margins": 0.698417603969574, "rewards/rejected": 0.5671066641807556, "step": 3704 }, { "epoch": 0.6, "learning_rate": 9.69530864325127e-07, "logits/chosen": -0.5745729207992554, "logits/rejected": -0.5216684341430664, "logps/chosen": -185.25628662109375, "logps/rejected": -122.04910278320312, "loss": 0.347, "rewards/accuracies": 1.0, "rewards/chosen": 6.140251159667969, "rewards/margins": 0.13212251663208008, "rewards/rejected": 6.008128643035889, "step": 3705 }, { "epoch": 0.6, "learning_rate": 9.694856708156524e-07, "logits/chosen": -0.4439564347267151, "logits/rejected": -0.4439564347267151, "logps/chosen": -75.27881622314453, "logps/rejected": -75.27881622314453, "loss": 0.9471, "rewards/accuracies": 0.0, "rewards/chosen": 1.8041718006134033, "rewards/margins": 0.0, "rewards/rejected": 1.8041718006134033, "step": 3706 }, { "epoch": 0.6, "learning_rate": 9.694404448691606e-07, "logits/chosen": -0.8561505675315857, "logits/rejected": -0.8051905035972595, "logps/chosen": -127.40318298339844, "logps/rejected": -207.83676147460938, "loss": 1.1209, "rewards/accuracies": 0.0, "rewards/chosen": 3.64164137840271, "rewards/margins": -2.055968999862671, "rewards/rejected": 5.697610378265381, "step": 3707 }, { "epoch": 0.6, "learning_rate": 9.693951864887758e-07, "logits/chosen": -0.5288459658622742, "logits/rejected": -0.3883366286754608, "logps/chosen": -90.79364013671875, "logps/rejected": -36.193973541259766, "loss": 0.9278, "rewards/accuracies": 1.0, "rewards/chosen": 0.9103805422782898, "rewards/margins": 0.7516086101531982, "rewards/rejected": 0.15877190232276917, "step": 3708 }, { "epoch": 0.6, "learning_rate": 9.69349895677625e-07, "logits/chosen": -0.1275627166032791, "logits/rejected": -0.1674039363861084, "logps/chosen": -88.40113830566406, "logps/rejected": -59.6633186340332, "loss": 1.017, "rewards/accuracies": 0.0, "rewards/chosen": 1.9436630010604858, "rewards/margins": -0.7115689516067505, "rewards/rejected": 2.6552319526672363, "step": 3709 }, { "epoch": 0.6, "learning_rate": 9.693045724388374e-07, "logits/chosen": -0.8191643953323364, "logits/rejected": -0.6543196439743042, "logps/chosen": -102.26197814941406, "logps/rejected": -169.5921173095703, "loss": 1.0153, "rewards/accuracies": 0.0, "rewards/chosen": 4.030156135559082, "rewards/margins": -0.9119687080383301, "rewards/rejected": 4.942124843597412, "step": 3710 }, { "epoch": 0.6, "learning_rate": 9.692592167755445e-07, "logits/chosen": -0.7334868907928467, "logits/rejected": -0.6522631645202637, "logps/chosen": -73.566650390625, "logps/rejected": -66.55762481689453, "loss": 0.5665, "rewards/accuracies": 0.0, "rewards/chosen": 1.3267822265625, "rewards/margins": -0.7340476512908936, "rewards/rejected": 2.0608298778533936, "step": 3711 }, { "epoch": 0.6, "learning_rate": 9.6921382869088e-07, "logits/chosen": -0.6939009428024292, "logits/rejected": -0.619737446308136, "logps/chosen": -177.08840942382812, "logps/rejected": -132.37335205078125, "loss": 0.2578, "rewards/accuracies": 1.0, "rewards/chosen": 4.896018981933594, "rewards/margins": 0.47484731674194336, "rewards/rejected": 4.42117166519165, "step": 3712 }, { "epoch": 0.6, "learning_rate": 9.691684081879796e-07, "logits/chosen": -0.6641635298728943, "logits/rejected": -0.6776178479194641, "logps/chosen": -86.10257720947266, "logps/rejected": -185.90493774414062, "loss": 0.7725, "rewards/accuracies": 1.0, "rewards/chosen": 1.1630836725234985, "rewards/margins": 1.6281838417053223, "rewards/rejected": -0.46510010957717896, "step": 3713 }, { "epoch": 0.6, "learning_rate": 9.691229552699815e-07, "logits/chosen": -0.7428998351097107, "logits/rejected": -0.5849242806434631, "logps/chosen": -62.85682678222656, "logps/rejected": -66.98693084716797, "loss": 0.2852, "rewards/accuracies": 1.0, "rewards/chosen": 2.670170545578003, "rewards/margins": 1.014243245124817, "rewards/rejected": 1.655927300453186, "step": 3714 }, { "epoch": 0.6, "learning_rate": 9.69077469940026e-07, "logits/chosen": -0.42269811034202576, "logits/rejected": -0.42861899733543396, "logps/chosen": -80.61312866210938, "logps/rejected": -90.50595092773438, "loss": 0.389, "rewards/accuracies": 1.0, "rewards/chosen": 1.0773544311523438, "rewards/margins": 0.11047667264938354, "rewards/rejected": 0.9668777585029602, "step": 3715 }, { "epoch": 0.6, "learning_rate": 9.69031952201256e-07, "logits/chosen": -0.914051353931427, "logits/rejected": -0.8191784024238586, "logps/chosen": -177.2191162109375, "logps/rejected": -167.83460998535156, "loss": 0.8045, "rewards/accuracies": 0.0, "rewards/chosen": 4.263909816741943, "rewards/margins": -1.2342958450317383, "rewards/rejected": 5.498205661773682, "step": 3716 }, { "epoch": 0.6, "learning_rate": 9.68986402056816e-07, "logits/chosen": -0.7341442704200745, "logits/rejected": -0.6897657513618469, "logps/chosen": -45.65298843383789, "logps/rejected": -82.79950714111328, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 1.4375457763671875, "rewards/margins": 0.6960998177528381, "rewards/rejected": 0.7414459586143494, "step": 3717 }, { "epoch": 0.6, "learning_rate": 9.689408195098532e-07, "logits/chosen": -0.7587566375732422, "logits/rejected": -0.6657955646514893, "logps/chosen": -77.3929214477539, "logps/rejected": -78.13311767578125, "loss": 0.5804, "rewards/accuracies": 0.0, "rewards/chosen": 1.8285354375839233, "rewards/margins": -0.6153825521469116, "rewards/rejected": 2.443917989730835, "step": 3718 }, { "epoch": 0.6, "learning_rate": 9.688952045635167e-07, "logits/chosen": -0.6648828983306885, "logits/rejected": -0.5732777118682861, "logps/chosen": -167.9044189453125, "logps/rejected": -109.48518371582031, "loss": 0.1012, "rewards/accuracies": 1.0, "rewards/chosen": 4.5593719482421875, "rewards/margins": 2.632493495941162, "rewards/rejected": 1.9268783330917358, "step": 3719 }, { "epoch": 0.6, "learning_rate": 9.688495572209585e-07, "logits/chosen": -0.9544409513473511, "logits/rejected": -0.8713975548744202, "logps/chosen": -148.78147888183594, "logps/rejected": -171.40927124023438, "loss": 0.2604, "rewards/accuracies": 1.0, "rewards/chosen": 4.1735124588012695, "rewards/margins": 1.6553804874420166, "rewards/rejected": 2.518131971359253, "step": 3720 }, { "epoch": 0.6, "learning_rate": 9.688038774853322e-07, "logits/chosen": -0.858515202999115, "logits/rejected": -0.8043506145477295, "logps/chosen": -63.51018142700195, "logps/rejected": -70.2010726928711, "loss": 1.2145, "rewards/accuracies": 0.0, "rewards/chosen": 0.8292140960693359, "rewards/margins": -1.071346640586853, "rewards/rejected": 1.900560736656189, "step": 3721 }, { "epoch": 0.6, "learning_rate": 9.687581653597939e-07, "logits/chosen": -0.4407600462436676, "logits/rejected": -0.4306260049343109, "logps/chosen": -66.66508483886719, "logps/rejected": -256.78643798828125, "loss": 2.2093, "rewards/accuracies": 0.0, "rewards/chosen": 1.2282875776290894, "rewards/margins": -4.017601490020752, "rewards/rejected": 5.245889186859131, "step": 3722 }, { "epoch": 0.6, "learning_rate": 9.687124208475017e-07, "logits/chosen": -0.4959854781627655, "logits/rejected": -0.40971794724464417, "logps/chosen": -159.01983642578125, "logps/rejected": -121.87161254882812, "loss": 0.2613, "rewards/accuracies": 1.0, "rewards/chosen": 0.958453357219696, "rewards/margins": 0.7306067943572998, "rewards/rejected": 0.22784653306007385, "step": 3723 }, { "epoch": 0.6, "learning_rate": 9.686666439516163e-07, "logits/chosen": -0.43157297372817993, "logits/rejected": -0.4614083468914032, "logps/chosen": -47.406307220458984, "logps/rejected": -105.66441345214844, "loss": 0.5691, "rewards/accuracies": 0.0, "rewards/chosen": 2.3388829231262207, "rewards/margins": -0.6660702228546143, "rewards/rejected": 3.004953145980835, "step": 3724 }, { "epoch": 0.6, "learning_rate": 9.686208346753005e-07, "logits/chosen": -0.5607666373252869, "logits/rejected": -0.4786287248134613, "logps/chosen": -45.852874755859375, "logps/rejected": -61.852088928222656, "loss": 0.5406, "rewards/accuracies": 0.0, "rewards/chosen": 0.8425361514091492, "rewards/margins": -0.21552056074142456, "rewards/rejected": 1.0580567121505737, "step": 3725 }, { "epoch": 0.6, "learning_rate": 9.68574993021719e-07, "logits/chosen": -0.5626500248908997, "logits/rejected": -0.558991551399231, "logps/chosen": -64.60377502441406, "logps/rejected": -78.98701477050781, "loss": 0.08, "rewards/accuracies": 1.0, "rewards/chosen": 4.1577582359313965, "rewards/margins": 2.4991445541381836, "rewards/rejected": 1.6586135625839233, "step": 3726 }, { "epoch": 0.6, "learning_rate": 9.68529118994039e-07, "logits/chosen": -0.63153076171875, "logits/rejected": -0.5690500736236572, "logps/chosen": -99.23942565917969, "logps/rejected": -104.86224365234375, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 3.6781814098358154, "rewards/margins": 1.167750358581543, "rewards/rejected": 2.5104310512542725, "step": 3727 }, { "epoch": 0.61, "learning_rate": 9.684832125954303e-07, "logits/chosen": -0.8993227481842041, "logits/rejected": -0.9047488570213318, "logps/chosen": -169.55247497558594, "logps/rejected": -130.65322875976562, "loss": 2.4202, "rewards/accuracies": 0.0, "rewards/chosen": 2.9875075817108154, "rewards/margins": -3.903434991836548, "rewards/rejected": 6.890942573547363, "step": 3728 }, { "epoch": 0.61, "learning_rate": 9.684372738290645e-07, "logits/chosen": -0.7140520811080933, "logits/rejected": -0.648170530796051, "logps/chosen": -29.692052841186523, "logps/rejected": -106.51173400878906, "loss": 0.4666, "rewards/accuracies": 1.0, "rewards/chosen": 2.1732287406921387, "rewards/margins": 0.1155390739440918, "rewards/rejected": 2.057689666748047, "step": 3729 }, { "epoch": 0.61, "learning_rate": 9.683913026981154e-07, "logits/chosen": -0.8351004123687744, "logits/rejected": -0.7864686250686646, "logps/chosen": -137.13682556152344, "logps/rejected": -152.67398071289062, "loss": 0.2012, "rewards/accuracies": 1.0, "rewards/chosen": 1.5765609741210938, "rewards/margins": 1.2600631713867188, "rewards/rejected": 0.316497802734375, "step": 3730 }, { "epoch": 0.61, "learning_rate": 9.683452992057593e-07, "logits/chosen": -0.45909935235977173, "logits/rejected": -0.4095415771007538, "logps/chosen": -65.99996948242188, "logps/rejected": -69.97576904296875, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 0.5217254757881165, "rewards/margins": 0.01250535249710083, "rewards/rejected": 0.5092201232910156, "step": 3731 }, { "epoch": 0.61, "learning_rate": 9.682992633551743e-07, "logits/chosen": -0.8769598603248596, "logits/rejected": -0.8769598603248596, "logps/chosen": -47.17803955078125, "logps/rejected": -47.17803955078125, "loss": 0.5836, "rewards/accuracies": 0.0, "rewards/chosen": 1.9545990228652954, "rewards/margins": 0.0, "rewards/rejected": 1.9545990228652954, "step": 3732 }, { "epoch": 0.61, "learning_rate": 9.682531951495416e-07, "logits/chosen": -0.9301584959030151, "logits/rejected": -0.9164735078811646, "logps/chosen": -45.295127868652344, "logps/rejected": -67.04837799072266, "loss": 0.1194, "rewards/accuracies": 1.0, "rewards/chosen": 1.6867049932479858, "rewards/margins": 1.3158988952636719, "rewards/rejected": 0.37080612778663635, "step": 3733 }, { "epoch": 0.61, "learning_rate": 9.682070945920436e-07, "logits/chosen": -0.142860546708107, "logits/rejected": -0.15807422995567322, "logps/chosen": -4.860714912414551, "logps/rejected": -1.7806556224822998, "loss": 0.6289, "rewards/accuracies": 0.0, "rewards/chosen": 0.1299430876970291, "rewards/margins": -0.08663459122180939, "rewards/rejected": 0.2165776789188385, "step": 3734 }, { "epoch": 0.61, "learning_rate": 9.681609616858657e-07, "logits/chosen": -1.2305394411087036, "logits/rejected": -1.2423946857452393, "logps/chosen": -82.75639343261719, "logps/rejected": -49.28096008300781, "loss": 0.8365, "rewards/accuracies": 1.0, "rewards/chosen": 0.7343673706054688, "rewards/margins": 0.6755630373954773, "rewards/rejected": 0.05880432203412056, "step": 3735 }, { "epoch": 0.61, "learning_rate": 9.681147964341952e-07, "logits/chosen": -0.9031802415847778, "logits/rejected": -0.8803644776344299, "logps/chosen": -39.2719841003418, "logps/rejected": -80.76240539550781, "loss": 0.5811, "rewards/accuracies": 0.0, "rewards/chosen": 1.558346152305603, "rewards/margins": -0.45570576190948486, "rewards/rejected": 2.014051914215088, "step": 3736 }, { "epoch": 0.61, "learning_rate": 9.680685988402212e-07, "logits/chosen": -0.31131264567375183, "logits/rejected": -0.3242477476596832, "logps/chosen": -56.059349060058594, "logps/rejected": -55.21140670776367, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 0.6975288391113281, "rewards/margins": 0.33121833205223083, "rewards/rejected": 0.3663105070590973, "step": 3737 }, { "epoch": 0.61, "learning_rate": 9.680223689071362e-07, "logits/chosen": -0.7378262877464294, "logits/rejected": -0.6803282499313354, "logps/chosen": -93.01353454589844, "logps/rejected": -62.518798828125, "loss": 0.557, "rewards/accuracies": 0.0, "rewards/chosen": 0.3771255612373352, "rewards/margins": -0.33547133207321167, "rewards/rejected": 0.7125968933105469, "step": 3738 }, { "epoch": 0.61, "learning_rate": 9.679761066381341e-07, "logits/chosen": -0.5589299201965332, "logits/rejected": -0.559602677822113, "logps/chosen": -8.538320541381836, "logps/rejected": -7.577293872833252, "loss": 1.4438, "rewards/accuracies": 0.0, "rewards/chosen": -0.03385677561163902, "rewards/margins": -0.29224249720573425, "rewards/rejected": 0.25838571786880493, "step": 3739 }, { "epoch": 0.61, "learning_rate": 9.67929812036411e-07, "logits/chosen": -0.6662235856056213, "logits/rejected": -0.56965571641922, "logps/chosen": -59.179718017578125, "logps/rejected": -116.38435363769531, "loss": 0.3288, "rewards/accuracies": 1.0, "rewards/chosen": 4.173234462738037, "rewards/margins": 0.2008969783782959, "rewards/rejected": 3.972337484359741, "step": 3740 }, { "epoch": 0.61, "learning_rate": 9.678834851051653e-07, "logits/chosen": -0.7614873051643372, "logits/rejected": -0.7356504201889038, "logps/chosen": -14.076343536376953, "logps/rejected": -19.125709533691406, "loss": 0.591, "rewards/accuracies": 1.0, "rewards/chosen": 1.438608169555664, "rewards/margins": 0.10167086124420166, "rewards/rejected": 1.3369373083114624, "step": 3741 }, { "epoch": 0.61, "learning_rate": 9.67837125847598e-07, "logits/chosen": -0.6944959759712219, "logits/rejected": -0.58872389793396, "logps/chosen": -68.09302520751953, "logps/rejected": -96.1755599975586, "loss": 0.3349, "rewards/accuracies": 1.0, "rewards/chosen": 2.074927568435669, "rewards/margins": 0.3675888776779175, "rewards/rejected": 1.7073386907577515, "step": 3742 }, { "epoch": 0.61, "learning_rate": 9.677907342669123e-07, "logits/chosen": -0.7377906441688538, "logits/rejected": -0.6066121459007263, "logps/chosen": -170.62356567382812, "logps/rejected": -85.20846557617188, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 5.99916410446167, "rewards/margins": 3.018782377243042, "rewards/rejected": 2.980381727218628, "step": 3743 }, { "epoch": 0.61, "learning_rate": 9.677443103663128e-07, "logits/chosen": -0.2957858145236969, "logits/rejected": -0.2957858145236969, "logps/chosen": -47.1635856628418, "logps/rejected": -47.1635856628418, "loss": 1.0406, "rewards/accuracies": 0.0, "rewards/chosen": -0.32067224383354187, "rewards/margins": 0.0, "rewards/rejected": -0.32067224383354187, "step": 3744 }, { "epoch": 0.61, "learning_rate": 9.676978541490074e-07, "logits/chosen": -0.6875135898590088, "logits/rejected": -0.6718629598617554, "logps/chosen": -77.58147430419922, "logps/rejected": -57.130409240722656, "loss": 0.7006, "rewards/accuracies": 1.0, "rewards/chosen": 0.9162521362304688, "rewards/margins": 0.034487903118133545, "rewards/rejected": 0.8817642331123352, "step": 3745 }, { "epoch": 0.61, "learning_rate": 9.676513656182057e-07, "logits/chosen": -0.3648735284805298, "logits/rejected": -0.3485172986984253, "logps/chosen": -53.39577865600586, "logps/rejected": -78.71994018554688, "loss": 0.5041, "rewards/accuracies": 0.0, "rewards/chosen": 1.8183170557022095, "rewards/margins": -0.5015407800674438, "rewards/rejected": 2.3198578357696533, "step": 3746 }, { "epoch": 0.61, "learning_rate": 9.676048447771198e-07, "logits/chosen": -0.3526248037815094, "logits/rejected": -0.4089733362197876, "logps/chosen": -99.91897583007812, "logps/rejected": -94.94062805175781, "loss": 0.2826, "rewards/accuracies": 1.0, "rewards/chosen": 4.023338317871094, "rewards/margins": 1.9947631359100342, "rewards/rejected": 2.0285751819610596, "step": 3747 }, { "epoch": 0.61, "learning_rate": 9.675582916289633e-07, "logits/chosen": -0.49340441823005676, "logits/rejected": -0.49340441823005676, "logps/chosen": -0.6330450773239136, "logps/rejected": -0.6330450773239136, "loss": 1.7097, "rewards/accuracies": 0.0, "rewards/chosen": 0.22321200370788574, "rewards/margins": 0.0, "rewards/rejected": 0.22321200370788574, "step": 3748 }, { "epoch": 0.61, "learning_rate": 9.675117061769532e-07, "logits/chosen": -0.5792956948280334, "logits/rejected": -0.5454835891723633, "logps/chosen": -48.01633071899414, "logps/rejected": -61.131004333496094, "loss": 0.6866, "rewards/accuracies": 0.0, "rewards/chosen": 1.3862133026123047, "rewards/margins": -0.41805076599121094, "rewards/rejected": 1.8042640686035156, "step": 3749 }, { "epoch": 0.61, "learning_rate": 9.674650884243074e-07, "logits/chosen": -0.7372555732727051, "logits/rejected": -0.7384741902351379, "logps/chosen": -52.939048767089844, "logps/rejected": -30.050445556640625, "loss": 0.5356, "rewards/accuracies": 0.0, "rewards/chosen": 1.0535004138946533, "rewards/margins": -0.5672146081924438, "rewards/rejected": 1.6207150220870972, "step": 3750 }, { "epoch": 0.61, "learning_rate": 9.674184383742475e-07, "logits/chosen": -0.8546059131622314, "logits/rejected": -0.7922844886779785, "logps/chosen": -86.40299987792969, "logps/rejected": -84.56745910644531, "loss": 1.4012, "rewards/accuracies": 0.0, "rewards/chosen": 1.58025062084198, "rewards/margins": -0.25194549560546875, "rewards/rejected": 1.8321961164474487, "step": 3751 }, { "epoch": 0.61, "learning_rate": 9.673717560299963e-07, "logits/chosen": -0.276508629322052, "logits/rejected": -0.276508629322052, "logps/chosen": -18.860143661499023, "logps/rejected": -18.860143661499023, "loss": 0.6632, "rewards/accuracies": 0.0, "rewards/chosen": 0.09032421559095383, "rewards/margins": 0.0, "rewards/rejected": 0.09032421559095383, "step": 3752 }, { "epoch": 0.61, "learning_rate": 9.673250413947791e-07, "logits/chosen": -0.5842404961585999, "logits/rejected": -0.5923036932945251, "logps/chosen": -25.12897491455078, "logps/rejected": -25.52873992919922, "loss": 0.6381, "rewards/accuracies": 0.0, "rewards/chosen": 0.1274135559797287, "rewards/margins": -0.020048141479492188, "rewards/rejected": 0.1474616974592209, "step": 3753 }, { "epoch": 0.61, "learning_rate": 9.672782944718233e-07, "logits/chosen": -0.7383846044540405, "logits/rejected": -0.722331166267395, "logps/chosen": -56.01128005981445, "logps/rejected": -61.336669921875, "loss": 0.7949, "rewards/accuracies": 0.0, "rewards/chosen": 0.7794368863105774, "rewards/margins": -1.1605732440948486, "rewards/rejected": 1.9400100708007812, "step": 3754 }, { "epoch": 0.61, "learning_rate": 9.672315152643587e-07, "logits/chosen": -0.7032737731933594, "logits/rejected": -0.6405837535858154, "logps/chosen": -100.74452209472656, "logps/rejected": -94.75483703613281, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 4.3803391456604, "rewards/margins": 1.7580125331878662, "rewards/rejected": 2.622326612472534, "step": 3755 }, { "epoch": 0.61, "learning_rate": 9.671847037756176e-07, "logits/chosen": -0.5012564063072205, "logits/rejected": -0.4796891212463379, "logps/chosen": -26.887109756469727, "logps/rejected": -77.41061401367188, "loss": 0.3373, "rewards/accuracies": 1.0, "rewards/chosen": 0.5523756146430969, "rewards/margins": 0.3614484667778015, "rewards/rejected": 0.19092713296413422, "step": 3756 }, { "epoch": 0.61, "learning_rate": 9.671378600088338e-07, "logits/chosen": -0.5966686010360718, "logits/rejected": -0.5572707653045654, "logps/chosen": -82.61966705322266, "logps/rejected": -137.96514892578125, "loss": 0.7707, "rewards/accuracies": 1.0, "rewards/chosen": 0.5070526003837585, "rewards/margins": 0.3082031011581421, "rewards/rejected": 0.19884948432445526, "step": 3757 }, { "epoch": 0.61, "learning_rate": 9.670909839672441e-07, "logits/chosen": -0.40966880321502686, "logits/rejected": -0.42334625124931335, "logps/chosen": -44.41849899291992, "logps/rejected": -50.414791107177734, "loss": 1.2734, "rewards/accuracies": 0.0, "rewards/chosen": 1.1314095258712769, "rewards/margins": -0.7771953344345093, "rewards/rejected": 1.9086048603057861, "step": 3758 }, { "epoch": 0.61, "learning_rate": 9.67044075654087e-07, "logits/chosen": -0.7196587920188904, "logits/rejected": -0.716496467590332, "logps/chosen": -97.74697875976562, "logps/rejected": -79.41673278808594, "loss": 0.7183, "rewards/accuracies": 0.0, "rewards/chosen": 1.4917182922363281, "rewards/margins": -0.9700424671173096, "rewards/rejected": 2.4617607593536377, "step": 3759 }, { "epoch": 0.61, "learning_rate": 9.669971350726035e-07, "logits/chosen": -0.9932551980018616, "logits/rejected": -0.9265726208686829, "logps/chosen": -81.77088928222656, "logps/rejected": -87.63023376464844, "loss": 0.6088, "rewards/accuracies": 0.0, "rewards/chosen": 0.8277389407157898, "rewards/margins": -0.6859809756278992, "rewards/rejected": 1.513719916343689, "step": 3760 }, { "epoch": 0.61, "learning_rate": 9.669501622260367e-07, "logits/chosen": -0.385382741689682, "logits/rejected": -0.43427833914756775, "logps/chosen": -67.61665344238281, "logps/rejected": -45.103668212890625, "loss": 1.3095, "rewards/accuracies": 0.0, "rewards/chosen": 0.8881897330284119, "rewards/margins": -0.9894790053367615, "rewards/rejected": 1.8776687383651733, "step": 3761 }, { "epoch": 0.61, "learning_rate": 9.669031571176323e-07, "logits/chosen": -0.5256052017211914, "logits/rejected": -0.43951523303985596, "logps/chosen": -59.73701477050781, "logps/rejected": -28.38078498840332, "loss": 0.5768, "rewards/accuracies": 1.0, "rewards/chosen": 2.2177658081054688, "rewards/margins": 1.2978544235229492, "rewards/rejected": 0.9199113845825195, "step": 3762 }, { "epoch": 0.61, "learning_rate": 9.668561197506374e-07, "logits/chosen": -0.6732537150382996, "logits/rejected": -0.6016129851341248, "logps/chosen": -86.38629150390625, "logps/rejected": -47.53407287597656, "loss": 2.1722, "rewards/accuracies": 0.0, "rewards/chosen": 0.2855484187602997, "rewards/margins": -1.0278511047363281, "rewards/rejected": 1.3133995532989502, "step": 3763 }, { "epoch": 0.61, "learning_rate": 9.66809050128302e-07, "logits/chosen": -0.48099440336227417, "logits/rejected": -0.44985467195510864, "logps/chosen": -74.33806610107422, "logps/rejected": -142.98028564453125, "loss": 2.0031, "rewards/accuracies": 0.0, "rewards/chosen": 0.7162208557128906, "rewards/margins": -3.46517276763916, "rewards/rejected": 4.181393623352051, "step": 3764 }, { "epoch": 0.61, "learning_rate": 9.667619482538783e-07, "logits/chosen": -0.4921621084213257, "logits/rejected": -0.4711441695690155, "logps/chosen": -242.823486328125, "logps/rejected": -105.5382308959961, "loss": 0.6626, "rewards/accuracies": 1.0, "rewards/chosen": 2.5606415271759033, "rewards/margins": 1.7645232677459717, "rewards/rejected": 0.7961181998252869, "step": 3765 }, { "epoch": 0.61, "learning_rate": 9.667148141306205e-07, "logits/chosen": -0.510274350643158, "logits/rejected": -0.3902609348297119, "logps/chosen": -105.7483901977539, "logps/rejected": -104.81779479980469, "loss": 0.7537, "rewards/accuracies": 0.0, "rewards/chosen": 0.9367119073867798, "rewards/margins": -0.17444229125976562, "rewards/rejected": 1.1111541986465454, "step": 3766 }, { "epoch": 0.61, "learning_rate": 9.66667647761785e-07, "logits/chosen": -0.5244315266609192, "logits/rejected": -0.5976465940475464, "logps/chosen": -113.52525329589844, "logps/rejected": -175.73410034179688, "loss": 0.7989, "rewards/accuracies": 0.0, "rewards/chosen": 4.179539680480957, "rewards/margins": -0.3024430274963379, "rewards/rejected": 4.481982707977295, "step": 3767 }, { "epoch": 0.61, "learning_rate": 9.666204491506308e-07, "logits/chosen": -0.6462138295173645, "logits/rejected": -0.630308210849762, "logps/chosen": -87.35271453857422, "logps/rejected": -51.95018768310547, "loss": 0.7623, "rewards/accuracies": 0.0, "rewards/chosen": 0.9303909540176392, "rewards/margins": -0.2515380382537842, "rewards/rejected": 1.1819289922714233, "step": 3768 }, { "epoch": 0.61, "learning_rate": 9.66573218300419e-07, "logits/chosen": -0.7836886644363403, "logits/rejected": -0.6949011087417603, "logps/chosen": -126.4420166015625, "logps/rejected": -65.77494812011719, "loss": 0.1079, "rewards/accuracies": 1.0, "rewards/chosen": 4.98707914352417, "rewards/margins": 3.033393383026123, "rewards/rejected": 1.9536857604980469, "step": 3769 }, { "epoch": 0.61, "learning_rate": 9.665259552144122e-07, "logits/chosen": -0.5054336786270142, "logits/rejected": -0.5054336786270142, "logps/chosen": -62.712501525878906, "logps/rejected": -62.712501525878906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.1228859424591064, "rewards/margins": 0.0, "rewards/rejected": 1.1228859424591064, "step": 3770 }, { "epoch": 0.61, "learning_rate": 9.664786598958762e-07, "logits/chosen": -0.5857647657394409, "logits/rejected": -0.5538245439529419, "logps/chosen": -56.66733169555664, "logps/rejected": -90.17329406738281, "loss": 0.5874, "rewards/accuracies": 1.0, "rewards/chosen": 1.6029186248779297, "rewards/margins": 0.33368563652038574, "rewards/rejected": 1.269232988357544, "step": 3771 }, { "epoch": 0.61, "learning_rate": 9.66431332348079e-07, "logits/chosen": -0.5994093418121338, "logits/rejected": -0.6182006001472473, "logps/chosen": -106.06065368652344, "logps/rejected": -96.3130111694336, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 1.004076361656189, "rewards/margins": 0.17231440544128418, "rewards/rejected": 0.8317619562149048, "step": 3772 }, { "epoch": 0.61, "learning_rate": 9.663839725742899e-07, "logits/chosen": -0.9898622632026672, "logits/rejected": -1.0189787149429321, "logps/chosen": -144.2523193359375, "logps/rejected": -40.236328125, "loss": 0.44, "rewards/accuracies": 1.0, "rewards/chosen": 3.022264242172241, "rewards/margins": 1.5822430849075317, "rewards/rejected": 1.4400211572647095, "step": 3773 }, { "epoch": 0.61, "learning_rate": 9.663365805777814e-07, "logits/chosen": -0.6005105376243591, "logits/rejected": -0.6158022880554199, "logps/chosen": -91.56564331054688, "logps/rejected": -127.69881439208984, "loss": 0.8978, "rewards/accuracies": 0.0, "rewards/chosen": 2.8195037841796875, "rewards/margins": -0.698767900466919, "rewards/rejected": 3.5182716846466064, "step": 3774 }, { "epoch": 0.61, "learning_rate": 9.662891563618277e-07, "logits/chosen": -1.1224470138549805, "logits/rejected": -1.0170533657073975, "logps/chosen": -163.42237854003906, "logps/rejected": -21.514320373535156, "loss": 0.0455, "rewards/accuracies": 1.0, "rewards/chosen": 5.356062412261963, "rewards/margins": 4.885106563568115, "rewards/rejected": 0.47095605731010437, "step": 3775 }, { "epoch": 0.61, "learning_rate": 9.662416999297052e-07, "logits/chosen": -0.9852888584136963, "logits/rejected": -0.9555004239082336, "logps/chosen": -50.018428802490234, "logps/rejected": -109.66302490234375, "loss": 0.5614, "rewards/accuracies": 1.0, "rewards/chosen": 0.8484417200088501, "rewards/margins": 0.38530999422073364, "rewards/rejected": 0.46313172578811646, "step": 3776 }, { "epoch": 0.61, "learning_rate": 9.661942112846929e-07, "logits/chosen": -0.8946751952171326, "logits/rejected": -0.921213686466217, "logps/chosen": -71.1532211303711, "logps/rejected": -72.6966552734375, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 3.021498918533325, "rewards/margins": 0.053014278411865234, "rewards/rejected": 2.96848464012146, "step": 3777 }, { "epoch": 0.61, "learning_rate": 9.66146690430072e-07, "logits/chosen": -0.5258342027664185, "logits/rejected": -0.5258342027664185, "logps/chosen": -39.213096618652344, "logps/rejected": -39.213096618652344, "loss": 1.6938, "rewards/accuracies": 0.0, "rewards/chosen": 1.6106147766113281, "rewards/margins": 0.0, "rewards/rejected": 1.6106147766113281, "step": 3778 }, { "epoch": 0.61, "learning_rate": 9.660991373691252e-07, "logits/chosen": -0.7087271213531494, "logits/rejected": -0.715026319026947, "logps/chosen": -75.95033264160156, "logps/rejected": -124.25802612304688, "loss": 0.4247, "rewards/accuracies": 1.0, "rewards/chosen": 1.0256439447402954, "rewards/margins": 0.7642624378204346, "rewards/rejected": 0.2613815367221832, "step": 3779 }, { "epoch": 0.61, "learning_rate": 9.660515521051384e-07, "logits/chosen": -0.5720645189285278, "logits/rejected": -0.5722308158874512, "logps/chosen": -3.464590072631836, "logps/rejected": -5.035315990447998, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 0.30911922454833984, "rewards/margins": 0.06995968520641327, "rewards/rejected": 0.23915953934192657, "step": 3780 }, { "epoch": 0.61, "learning_rate": 9.660039346413992e-07, "logits/chosen": -0.3322024345397949, "logits/rejected": -0.17766129970550537, "logps/chosen": -122.82363891601562, "logps/rejected": -78.46142578125, "loss": 0.127, "rewards/accuracies": 1.0, "rewards/chosen": 3.8500640392303467, "rewards/margins": 2.3256995677948, "rewards/rejected": 1.5243644714355469, "step": 3781 }, { "epoch": 0.61, "learning_rate": 9.659562849811974e-07, "logits/chosen": -0.5225801467895508, "logits/rejected": -0.5382717847824097, "logps/chosen": -45.28856658935547, "logps/rejected": -52.49443435668945, "loss": 0.716, "rewards/accuracies": 0.0, "rewards/chosen": 1.400410532951355, "rewards/margins": -0.9398661851882935, "rewards/rejected": 2.3402767181396484, "step": 3782 }, { "epoch": 0.61, "learning_rate": 9.659086031278254e-07, "logits/chosen": -0.7381349802017212, "logits/rejected": -0.7188076972961426, "logps/chosen": -83.15668487548828, "logps/rejected": -115.59043884277344, "loss": 0.488, "rewards/accuracies": 0.0, "rewards/chosen": 2.3809432983398438, "rewards/margins": -0.4884033203125, "rewards/rejected": 2.8693466186523438, "step": 3783 }, { "epoch": 0.61, "learning_rate": 9.658608890845771e-07, "logits/chosen": -0.48392343521118164, "logits/rejected": -0.5278359055519104, "logps/chosen": -68.05352783203125, "logps/rejected": -68.46943664550781, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 1.5620269775390625, "rewards/margins": 1.1155608892440796, "rewards/rejected": 0.4464660584926605, "step": 3784 }, { "epoch": 0.61, "learning_rate": 9.658131428547498e-07, "logits/chosen": -0.08640534430742264, "logits/rejected": -0.08640534430742264, "logps/chosen": -46.128517150878906, "logps/rejected": -46.128517150878906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.01521148718893528, "rewards/margins": 0.0, "rewards/rejected": 0.01521148718893528, "step": 3785 }, { "epoch": 0.61, "learning_rate": 9.657653644416418e-07, "logits/chosen": -0.5885348320007324, "logits/rejected": -0.5605877041816711, "logps/chosen": -124.41134643554688, "logps/rejected": -209.06631469726562, "loss": 0.7714, "rewards/accuracies": 0.0, "rewards/chosen": 3.254107713699341, "rewards/margins": -0.9547045230865479, "rewards/rejected": 4.208812236785889, "step": 3786 }, { "epoch": 0.61, "learning_rate": 9.65717553848554e-07, "logits/chosen": -0.5584083199501038, "logits/rejected": -0.559467077255249, "logps/chosen": -37.25701141357422, "logps/rejected": -55.48735046386719, "loss": 0.3861, "rewards/accuracies": 0.0, "rewards/chosen": 1.3376601934432983, "rewards/margins": -0.08183526992797852, "rewards/rejected": 1.4194954633712769, "step": 3787 }, { "epoch": 0.61, "learning_rate": 9.6566971107879e-07, "logits/chosen": -0.7269356846809387, "logits/rejected": -0.7269356846809387, "logps/chosen": -47.33903503417969, "logps/rejected": -47.33903503417969, "loss": 2.108, "rewards/accuracies": 0.0, "rewards/chosen": 2.284841299057007, "rewards/margins": 0.0, "rewards/rejected": 2.284841299057007, "step": 3788 }, { "epoch": 0.61, "learning_rate": 9.65621836135655e-07, "logits/chosen": -0.4390813410282135, "logits/rejected": -0.30738964676856995, "logps/chosen": -60.99961853027344, "logps/rejected": -46.142723083496094, "loss": 0.7234, "rewards/accuracies": 1.0, "rewards/chosen": 2.441021680831909, "rewards/margins": 0.6888045072555542, "rewards/rejected": 1.752217173576355, "step": 3789 }, { "epoch": 0.62, "learning_rate": 9.65573929022457e-07, "logits/chosen": -0.5677700638771057, "logits/rejected": -0.4911814332008362, "logps/chosen": -81.00245666503906, "logps/rejected": -9.183518409729004, "loss": 1.075, "rewards/accuracies": 1.0, "rewards/chosen": 1.0352051258087158, "rewards/margins": 0.13279491662979126, "rewards/rejected": 0.9024102091789246, "step": 3790 }, { "epoch": 0.62, "learning_rate": 9.65525989742506e-07, "logits/chosen": -0.7022539377212524, "logits/rejected": -0.5945366621017456, "logps/chosen": -51.78001022338867, "logps/rejected": -28.49380874633789, "loss": 0.7177, "rewards/accuracies": 1.0, "rewards/chosen": 0.9117469787597656, "rewards/margins": 0.09838634729385376, "rewards/rejected": 0.8133606314659119, "step": 3791 }, { "epoch": 0.62, "learning_rate": 9.654780182991138e-07, "logits/chosen": -0.5575560927391052, "logits/rejected": -0.556832492351532, "logps/chosen": -10.226430892944336, "logps/rejected": -2.9212193489074707, "loss": 0.4949, "rewards/accuracies": 0.0, "rewards/chosen": 0.17153683304786682, "rewards/margins": -0.3284001350402832, "rewards/rejected": 0.49993696808815, "step": 3792 }, { "epoch": 0.62, "learning_rate": 9.65430014695595e-07, "logits/chosen": 0.0960630401968956, "logits/rejected": 0.11332877725362778, "logps/chosen": -5.1283674240112305, "logps/rejected": -5.202162742614746, "loss": 0.4804, "rewards/accuracies": 0.0, "rewards/chosen": -0.025528525933623314, "rewards/margins": -0.37938231229782104, "rewards/rejected": 0.3538537919521332, "step": 3793 }, { "epoch": 0.62, "learning_rate": 9.65381978935266e-07, "logits/chosen": -0.7701901793479919, "logits/rejected": -0.7455785274505615, "logps/chosen": -87.5542221069336, "logps/rejected": -69.3321304321289, "loss": 0.31, "rewards/accuracies": 1.0, "rewards/chosen": 0.9094139337539673, "rewards/margins": 0.1845901608467102, "rewards/rejected": 0.7248237729072571, "step": 3794 }, { "epoch": 0.62, "learning_rate": 9.653339110214458e-07, "logits/chosen": -0.4854969084262848, "logits/rejected": -0.5287335515022278, "logps/chosen": -70.61404418945312, "logps/rejected": -155.58152770996094, "loss": 1.4826, "rewards/accuracies": 0.0, "rewards/chosen": 1.8246322870254517, "rewards/margins": -1.4612046480178833, "rewards/rejected": 3.285836935043335, "step": 3795 }, { "epoch": 0.62, "learning_rate": 9.652858109574552e-07, "logits/chosen": -0.062443722039461136, "logits/rejected": -0.03413164243102074, "logps/chosen": -75.29631805419922, "logps/rejected": -60.85979080200195, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 3.2957863807678223, "rewards/margins": 1.8135143518447876, "rewards/rejected": 1.4822720289230347, "step": 3796 }, { "epoch": 0.62, "learning_rate": 9.652376787466178e-07, "logits/chosen": -0.6116744875907898, "logits/rejected": -0.5969892144203186, "logps/chosen": -36.343482971191406, "logps/rejected": -36.21228790283203, "loss": 1.7007, "rewards/accuracies": 0.0, "rewards/chosen": 0.17438851296901703, "rewards/margins": -0.3160964846611023, "rewards/rejected": 0.4904850125312805, "step": 3797 }, { "epoch": 0.62, "learning_rate": 9.65189514392259e-07, "logits/chosen": -0.8126810789108276, "logits/rejected": -0.8954996466636658, "logps/chosen": -179.92730712890625, "logps/rejected": -10.273835182189941, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 3.1501846313476562, "rewards/margins": 2.199465036392212, "rewards/rejected": 0.9507195353507996, "step": 3798 }, { "epoch": 0.62, "learning_rate": 9.651413178977064e-07, "logits/chosen": -0.39504677057266235, "logits/rejected": -0.4279518723487854, "logps/chosen": -205.38641357421875, "logps/rejected": -117.0320053100586, "loss": 0.9772, "rewards/accuracies": 0.0, "rewards/chosen": 3.169032335281372, "rewards/margins": -1.6864464282989502, "rewards/rejected": 4.855478763580322, "step": 3799 }, { "epoch": 0.62, "learning_rate": 9.6509308926629e-07, "logits/chosen": -0.6777706742286682, "logits/rejected": -0.5997236371040344, "logps/chosen": -98.40640258789062, "logps/rejected": -46.02701187133789, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 0.8149795532226562, "rewards/margins": -0.8939586877822876, "rewards/rejected": 1.7089382410049438, "step": 3800 }, { "epoch": 0.62, "learning_rate": 9.650448285013417e-07, "logits/chosen": -0.5945090055465698, "logits/rejected": -0.6650243401527405, "logps/chosen": -86.31034088134766, "logps/rejected": -78.3368148803711, "loss": 2.1514, "rewards/accuracies": 0.0, "rewards/chosen": 2.2902519702911377, "rewards/margins": -0.9863250255584717, "rewards/rejected": 3.2765769958496094, "step": 3801 }, { "epoch": 0.62, "learning_rate": 9.64996535606196e-07, "logits/chosen": -0.0035005654208362103, "logits/rejected": -0.03106646053493023, "logps/chosen": -69.70195770263672, "logps/rejected": -61.27360534667969, "loss": 2.3986, "rewards/accuracies": 0.0, "rewards/chosen": 0.5736038088798523, "rewards/margins": -1.1628754138946533, "rewards/rejected": 1.7364791631698608, "step": 3802 }, { "epoch": 0.62, "learning_rate": 9.649482105841898e-07, "logits/chosen": -0.6593624353408813, "logits/rejected": -0.6228350400924683, "logps/chosen": -36.290489196777344, "logps/rejected": -49.857154846191406, "loss": 0.9427, "rewards/accuracies": 0.0, "rewards/chosen": 0.32627755403518677, "rewards/margins": -1.225752592086792, "rewards/rejected": 1.5520302057266235, "step": 3803 }, { "epoch": 0.62, "learning_rate": 9.648998534386615e-07, "logits/chosen": -0.5314232110977173, "logits/rejected": -0.5344172716140747, "logps/chosen": -59.51609802246094, "logps/rejected": -90.33610534667969, "loss": 0.6523, "rewards/accuracies": 1.0, "rewards/chosen": 0.49557802081108093, "rewards/margins": 0.10425415635108948, "rewards/rejected": 0.39132386445999146, "step": 3804 }, { "epoch": 0.62, "learning_rate": 9.648514641729522e-07, "logits/chosen": -0.26163920760154724, "logits/rejected": -0.26163920760154724, "logps/chosen": -40.75434875488281, "logps/rejected": -40.75434875488281, "loss": 0.3986, "rewards/accuracies": 0.0, "rewards/chosen": 0.24527131021022797, "rewards/margins": 0.0, "rewards/rejected": 0.24527131021022797, "step": 3805 }, { "epoch": 0.62, "learning_rate": 9.64803042790405e-07, "logits/chosen": -0.2744521498680115, "logits/rejected": -0.18931041657924652, "logps/chosen": -54.90042495727539, "logps/rejected": -68.08281707763672, "loss": 0.9347, "rewards/accuracies": 1.0, "rewards/chosen": 2.086296558380127, "rewards/margins": 0.4789860248565674, "rewards/rejected": 1.6073105335235596, "step": 3806 }, { "epoch": 0.62, "learning_rate": 9.647545892943657e-07, "logits/chosen": -0.712181568145752, "logits/rejected": -0.7008618116378784, "logps/chosen": -60.330322265625, "logps/rejected": -19.559181213378906, "loss": 1.0877, "rewards/accuracies": 1.0, "rewards/chosen": 0.5652931332588196, "rewards/margins": 0.3017200529575348, "rewards/rejected": 0.2635730803012848, "step": 3807 }, { "epoch": 0.62, "learning_rate": 9.647061036881821e-07, "logits/chosen": -0.46503496170043945, "logits/rejected": -0.4139900207519531, "logps/chosen": -116.66863250732422, "logps/rejected": -64.51309204101562, "loss": 0.7287, "rewards/accuracies": 0.0, "rewards/chosen": 1.4547722339630127, "rewards/margins": -0.3122931718826294, "rewards/rejected": 1.767065405845642, "step": 3808 }, { "epoch": 0.62, "learning_rate": 9.646575859752035e-07, "logits/chosen": -0.6998966336250305, "logits/rejected": -0.671498715877533, "logps/chosen": -64.90269470214844, "logps/rejected": -47.93313217163086, "loss": 0.4541, "rewards/accuracies": 0.0, "rewards/chosen": 1.6469993591308594, "rewards/margins": -0.37906527519226074, "rewards/rejected": 2.02606463432312, "step": 3809 }, { "epoch": 0.62, "learning_rate": 9.646090361587827e-07, "logits/chosen": -0.5961005687713623, "logits/rejected": -0.5901542901992798, "logps/chosen": -86.6092758178711, "logps/rejected": -25.471576690673828, "loss": 0.7873, "rewards/accuracies": 0.0, "rewards/chosen": 0.7701011896133423, "rewards/margins": -0.9567482471466064, "rewards/rejected": 1.7268494367599487, "step": 3810 }, { "epoch": 0.62, "learning_rate": 9.645604542422732e-07, "logits/chosen": -0.5372639894485474, "logits/rejected": -0.5256133079528809, "logps/chosen": -49.35976028442383, "logps/rejected": -53.26740264892578, "loss": 0.3735, "rewards/accuracies": 1.0, "rewards/chosen": 2.8930721282958984, "rewards/margins": 0.4279346466064453, "rewards/rejected": 2.465137481689453, "step": 3811 }, { "epoch": 0.62, "learning_rate": 9.645118402290324e-07, "logits/chosen": -0.8378502726554871, "logits/rejected": -0.7845337986946106, "logps/chosen": -129.3952178955078, "logps/rejected": -99.7280044555664, "loss": 0.1903, "rewards/accuracies": 1.0, "rewards/chosen": 5.271693706512451, "rewards/margins": 1.6431376934051514, "rewards/rejected": 3.6285560131073, "step": 3812 }, { "epoch": 0.62, "learning_rate": 9.644631941224184e-07, "logits/chosen": -0.5990808606147766, "logits/rejected": -0.5990808606147766, "logps/chosen": -42.31013107299805, "logps/rejected": -42.31013107299805, "loss": 0.3799, "rewards/accuracies": 0.0, "rewards/chosen": 0.187591552734375, "rewards/margins": 0.0, "rewards/rejected": 0.187591552734375, "step": 3813 }, { "epoch": 0.62, "learning_rate": 9.644145159257927e-07, "logits/chosen": -0.8400272130966187, "logits/rejected": -0.7987784147262573, "logps/chosen": -158.85702514648438, "logps/rejected": -20.744792938232422, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 4.291134834289551, "rewards/margins": 3.91805362701416, "rewards/rejected": 0.3730812072753906, "step": 3814 }, { "epoch": 0.62, "learning_rate": 9.643658056425183e-07, "logits/chosen": -0.6385319232940674, "logits/rejected": -0.6013761162757874, "logps/chosen": -40.02351379394531, "logps/rejected": -33.447120666503906, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 1.4817756414413452, "rewards/margins": 0.38076090812683105, "rewards/rejected": 1.1010147333145142, "step": 3815 }, { "epoch": 0.62, "learning_rate": 9.643170632759606e-07, "logits/chosen": -0.18448226153850555, "logits/rejected": -0.18448226153850555, "logps/chosen": -3.291187286376953, "logps/rejected": -3.291187286376953, "loss": 0.8211, "rewards/accuracies": 0.0, "rewards/chosen": 0.07965221256017685, "rewards/margins": 0.0, "rewards/rejected": 0.07965221256017685, "step": 3816 }, { "epoch": 0.62, "learning_rate": 9.64268288829487e-07, "logits/chosen": -0.2933020293712616, "logits/rejected": -0.2873878479003906, "logps/chosen": -27.729454040527344, "logps/rejected": -2.618867874145508, "loss": 0.8691, "rewards/accuracies": 0.0, "rewards/chosen": 0.24682407081127167, "rewards/margins": -0.1814599186182022, "rewards/rejected": 0.4282839894294739, "step": 3817 }, { "epoch": 0.62, "learning_rate": 9.642194823064677e-07, "logits/chosen": -0.5754467844963074, "logits/rejected": -0.5075235366821289, "logps/chosen": -92.44783020019531, "logps/rejected": -129.720458984375, "loss": 0.3555, "rewards/accuracies": 1.0, "rewards/chosen": 3.1016814708709717, "rewards/margins": 1.9592925310134888, "rewards/rejected": 1.142388939857483, "step": 3818 }, { "epoch": 0.62, "learning_rate": 9.641706437102748e-07, "logits/chosen": -0.6543228626251221, "logits/rejected": -0.6258866786956787, "logps/chosen": -89.11964416503906, "logps/rejected": -63.499229431152344, "loss": 0.9413, "rewards/accuracies": 0.0, "rewards/chosen": 0.8669372797012329, "rewards/margins": -1.1541153192520142, "rewards/rejected": 2.021052598953247, "step": 3819 }, { "epoch": 0.62, "learning_rate": 9.641217730442824e-07, "logits/chosen": -0.5636354088783264, "logits/rejected": -0.5652418732643127, "logps/chosen": -113.5538101196289, "logps/rejected": -99.64306640625, "loss": 0.6248, "rewards/accuracies": 0.0, "rewards/chosen": 2.144573211669922, "rewards/margins": -0.35517358779907227, "rewards/rejected": 2.499746799468994, "step": 3820 }, { "epoch": 0.62, "learning_rate": 9.640728703118668e-07, "logits/chosen": -0.4146899878978729, "logits/rejected": -0.3365340530872345, "logps/chosen": -94.91109466552734, "logps/rejected": -79.92164611816406, "loss": 0.854, "rewards/accuracies": 0.0, "rewards/chosen": 0.2980812191963196, "rewards/margins": -0.5648094415664673, "rewards/rejected": 0.8628906607627869, "step": 3821 }, { "epoch": 0.62, "learning_rate": 9.640239355164073e-07, "logits/chosen": -0.8120242953300476, "logits/rejected": -0.7857897877693176, "logps/chosen": -114.2390365600586, "logps/rejected": -62.49512481689453, "loss": 0.6445, "rewards/accuracies": 0.0, "rewards/chosen": 0.5139335989952087, "rewards/margins": -0.811663806438446, "rewards/rejected": 1.3255974054336548, "step": 3822 }, { "epoch": 0.62, "learning_rate": 9.639749686612842e-07, "logits/chosen": -0.6150091886520386, "logits/rejected": -0.6484158039093018, "logps/chosen": -81.01538848876953, "logps/rejected": -97.30735778808594, "loss": 0.5794, "rewards/accuracies": 0.0, "rewards/chosen": 1.220617651939392, "rewards/margins": -0.0871880054473877, "rewards/rejected": 1.3078056573867798, "step": 3823 }, { "epoch": 0.62, "learning_rate": 9.63925969749881e-07, "logits/chosen": -0.5735546946525574, "logits/rejected": -0.40393882989883423, "logps/chosen": -173.94351196289062, "logps/rejected": -59.66429138183594, "loss": 0.0533, "rewards/accuracies": 1.0, "rewards/chosen": 4.528203010559082, "rewards/margins": 2.5730302333831787, "rewards/rejected": 1.9551727771759033, "step": 3824 }, { "epoch": 0.62, "learning_rate": 9.638769387855832e-07, "logits/chosen": -0.39952343702316284, "logits/rejected": -0.39952343702316284, "logps/chosen": -0.39154621958732605, "logps/rejected": -0.39154621958732605, "loss": 1.3939, "rewards/accuracies": 0.0, "rewards/chosen": 0.07485847920179367, "rewards/margins": 0.0, "rewards/rejected": 0.07485847920179367, "step": 3825 }, { "epoch": 0.62, "learning_rate": 9.638278757717779e-07, "logits/chosen": -0.6778713464736938, "logits/rejected": -0.5269997119903564, "logps/chosen": -64.81745910644531, "logps/rejected": -22.31169319152832, "loss": 0.2385, "rewards/accuracies": 1.0, "rewards/chosen": 1.7905257940292358, "rewards/margins": 1.081482172012329, "rewards/rejected": 0.7090436816215515, "step": 3826 }, { "epoch": 0.62, "learning_rate": 9.637787807118553e-07, "logits/chosen": -0.793666660785675, "logits/rejected": -0.7658379077911377, "logps/chosen": -88.97834777832031, "logps/rejected": -114.4570541381836, "loss": 0.3769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9446777701377869, "rewards/margins": 0.44497913122177124, "rewards/rejected": 0.4996986389160156, "step": 3827 }, { "epoch": 0.62, "learning_rate": 9.637296536092074e-07, "logits/chosen": -0.6360651254653931, "logits/rejected": -0.6029577255249023, "logps/chosen": -104.9500503540039, "logps/rejected": -65.09770202636719, "loss": 0.1153, "rewards/accuracies": 1.0, "rewards/chosen": 4.0686469078063965, "rewards/margins": 1.9316504001617432, "rewards/rejected": 2.1369965076446533, "step": 3828 }, { "epoch": 0.62, "learning_rate": 9.636804944672282e-07, "logits/chosen": -0.793168842792511, "logits/rejected": -0.6562057733535767, "logps/chosen": -67.6273422241211, "logps/rejected": -52.74797058105469, "loss": 0.381, "rewards/accuracies": 0.0, "rewards/chosen": 2.8153724670410156, "rewards/margins": -0.06446623802185059, "rewards/rejected": 2.879838705062866, "step": 3829 }, { "epoch": 0.62, "learning_rate": 9.636313032893142e-07, "logits/chosen": -0.42828240990638733, "logits/rejected": -0.44040679931640625, "logps/chosen": -71.37741088867188, "logps/rejected": -68.30741882324219, "loss": 0.6055, "rewards/accuracies": 0.0, "rewards/chosen": 1.6344345808029175, "rewards/margins": -0.12420117855072021, "rewards/rejected": 1.7586357593536377, "step": 3830 }, { "epoch": 0.62, "learning_rate": 9.635820800788638e-07, "logits/chosen": -0.9946209788322449, "logits/rejected": -1.0094105005264282, "logps/chosen": -229.13833618164062, "logps/rejected": -94.79489135742188, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 3.3877227306365967, "rewards/margins": -0.20586872100830078, "rewards/rejected": 3.5935914516448975, "step": 3831 }, { "epoch": 0.62, "learning_rate": 9.635328248392784e-07, "logits/chosen": -0.7657275795936584, "logits/rejected": -0.5763903260231018, "logps/chosen": -111.40280151367188, "logps/rejected": -31.0556640625, "loss": 0.1183, "rewards/accuracies": 1.0, "rewards/chosen": 3.103628635406494, "rewards/margins": 2.9139657020568848, "rewards/rejected": 0.18966293334960938, "step": 3832 }, { "epoch": 0.62, "learning_rate": 9.63483537573961e-07, "logits/chosen": -0.4553302228450775, "logits/rejected": -0.4553302228450775, "logps/chosen": -29.104690551757812, "logps/rejected": -29.104690551757812, "loss": 0.7057, "rewards/accuracies": 0.0, "rewards/chosen": 0.9925994873046875, "rewards/margins": 0.0, "rewards/rejected": 0.9925994873046875, "step": 3833 }, { "epoch": 0.62, "learning_rate": 9.634342182863162e-07, "logits/chosen": -0.4916262924671173, "logits/rejected": -0.5239298343658447, "logps/chosen": -72.74269104003906, "logps/rejected": -49.57939910888672, "loss": 1.0607, "rewards/accuracies": 0.0, "rewards/chosen": 1.1208785772323608, "rewards/margins": -0.3485618829727173, "rewards/rejected": 1.4694404602050781, "step": 3834 }, { "epoch": 0.62, "learning_rate": 9.633848669797523e-07, "logits/chosen": -0.8653336763381958, "logits/rejected": -0.770959198474884, "logps/chosen": -84.57817077636719, "logps/rejected": -103.86687469482422, "loss": 0.8047, "rewards/accuracies": 0.0, "rewards/chosen": 0.9346832633018494, "rewards/margins": -1.1595184803009033, "rewards/rejected": 2.0942018032073975, "step": 3835 }, { "epoch": 0.62, "learning_rate": 9.633354836576785e-07, "logits/chosen": -0.7550974488258362, "logits/rejected": -0.711258053779602, "logps/chosen": -111.84060668945312, "logps/rejected": -135.84356689453125, "loss": 1.2603, "rewards/accuracies": 1.0, "rewards/chosen": 3.847247362136841, "rewards/margins": 0.2347259521484375, "rewards/rejected": 3.6125214099884033, "step": 3836 }, { "epoch": 0.62, "learning_rate": 9.63286068323507e-07, "logits/chosen": -1.0943753719329834, "logits/rejected": -1.0622386932373047, "logps/chosen": -96.6517333984375, "logps/rejected": -55.3308219909668, "loss": 0.9141, "rewards/accuracies": 0.0, "rewards/chosen": 0.48057785630226135, "rewards/margins": -0.7471928596496582, "rewards/rejected": 1.2277706861495972, "step": 3837 }, { "epoch": 0.62, "learning_rate": 9.632366209806518e-07, "logits/chosen": -0.5716902017593384, "logits/rejected": -0.5298311710357666, "logps/chosen": -75.68788146972656, "logps/rejected": -86.51185607910156, "loss": 0.2369, "rewards/accuracies": 1.0, "rewards/chosen": 2.2146332263946533, "rewards/margins": 0.534382700920105, "rewards/rejected": 1.6802505254745483, "step": 3838 }, { "epoch": 0.62, "learning_rate": 9.631871416325293e-07, "logits/chosen": -0.6696366667747498, "logits/rejected": -0.6504749059677124, "logps/chosen": -46.54519271850586, "logps/rejected": -54.82997131347656, "loss": 0.3034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8081005215644836, "rewards/margins": 0.2456134557723999, "rewards/rejected": 0.5624870657920837, "step": 3839 }, { "epoch": 0.62, "learning_rate": 9.63137630282558e-07, "logits/chosen": -0.5763489007949829, "logits/rejected": -0.5763489007949829, "logps/chosen": -79.46121215820312, "logps/rejected": -79.46121215820312, "loss": 0.5303, "rewards/accuracies": 0.0, "rewards/chosen": 1.5917068719863892, "rewards/margins": 0.0, "rewards/rejected": 1.5917068719863892, "step": 3840 }, { "epoch": 0.62, "learning_rate": 9.630880869341587e-07, "logits/chosen": -0.3103734254837036, "logits/rejected": -0.26279276609420776, "logps/chosen": -41.89013671875, "logps/rejected": -50.088661193847656, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 1.3128811120986938, "rewards/margins": 0.9259002804756165, "rewards/rejected": 0.3869808316230774, "step": 3841 }, { "epoch": 0.62, "learning_rate": 9.630385115907544e-07, "logits/chosen": -0.4934629499912262, "logits/rejected": -0.4188261926174164, "logps/chosen": -60.10985565185547, "logps/rejected": -57.57511520385742, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 1.6453545093536377, "rewards/margins": 1.3226295709609985, "rewards/rejected": 0.3227249085903168, "step": 3842 }, { "epoch": 0.62, "learning_rate": 9.629889042557704e-07, "logits/chosen": -0.6781800985336304, "logits/rejected": -0.6783241629600525, "logps/chosen": -74.05574798583984, "logps/rejected": -89.33847045898438, "loss": 0.3999, "rewards/accuracies": 0.0, "rewards/chosen": 1.8192436695098877, "rewards/margins": -0.1680976152420044, "rewards/rejected": 1.987341284751892, "step": 3843 }, { "epoch": 0.62, "learning_rate": 9.629392649326337e-07, "logits/chosen": -1.0183790922164917, "logits/rejected": -0.8509995341300964, "logps/chosen": -141.20968627929688, "logps/rejected": -82.48590087890625, "loss": 0.479, "rewards/accuracies": 1.0, "rewards/chosen": 3.5094926357269287, "rewards/margins": 1.2420473098754883, "rewards/rejected": 2.2674453258514404, "step": 3844 }, { "epoch": 0.62, "learning_rate": 9.628895936247742e-07, "logits/chosen": -0.6658157706260681, "logits/rejected": -0.4348418712615967, "logps/chosen": -228.73822021484375, "logps/rejected": -26.73373031616211, "loss": 0.7925, "rewards/accuracies": 1.0, "rewards/chosen": 3.46793532371521, "rewards/margins": 3.1340763568878174, "rewards/rejected": 0.33385905623435974, "step": 3845 }, { "epoch": 0.62, "learning_rate": 9.628398903356239e-07, "logits/chosen": -0.5954428911209106, "logits/rejected": -0.4914308786392212, "logps/chosen": -96.18385314941406, "logps/rejected": -15.758260726928711, "loss": 0.6983, "rewards/accuracies": 1.0, "rewards/chosen": 0.7199394106864929, "rewards/margins": 0.005746066570281982, "rewards/rejected": 0.7141933441162109, "step": 3846 }, { "epoch": 0.62, "learning_rate": 9.627901550686163e-07, "logits/chosen": -0.9107956886291504, "logits/rejected": -0.7018696665763855, "logps/chosen": -126.29124450683594, "logps/rejected": -91.07115173339844, "loss": 0.2154, "rewards/accuracies": 1.0, "rewards/chosen": 4.192807197570801, "rewards/margins": 2.3443942070007324, "rewards/rejected": 1.848413109779358, "step": 3847 }, { "epoch": 0.62, "learning_rate": 9.627403878271882e-07, "logits/chosen": -1.0526996850967407, "logits/rejected": -0.9795450568199158, "logps/chosen": -180.97555541992188, "logps/rejected": -26.62575912475586, "loss": 0.3289, "rewards/accuracies": 1.0, "rewards/chosen": 4.537791728973389, "rewards/margins": 3.6833419799804688, "rewards/rejected": 0.8544498682022095, "step": 3848 }, { "epoch": 0.62, "learning_rate": 9.626905886147779e-07, "logits/chosen": -0.7076638340950012, "logits/rejected": -0.7051262855529785, "logps/chosen": -102.65861511230469, "logps/rejected": -88.95091247558594, "loss": 1.5227, "rewards/accuracies": 0.0, "rewards/chosen": 1.1058151721954346, "rewards/margins": -2.6862807273864746, "rewards/rejected": 3.792095899581909, "step": 3849 }, { "epoch": 0.62, "learning_rate": 9.626407574348257e-07, "logits/chosen": -0.7569440007209778, "logits/rejected": -0.8530953526496887, "logps/chosen": -106.18833923339844, "logps/rejected": -104.89569854736328, "loss": 2.548, "rewards/accuracies": 0.0, "rewards/chosen": 2.1879501342773438, "rewards/margins": -2.960334300994873, "rewards/rejected": 5.148284435272217, "step": 3850 }, { "epoch": 0.63, "learning_rate": 9.625908942907747e-07, "logits/chosen": -0.268963485956192, "logits/rejected": -0.25910693407058716, "logps/chosen": -39.33139419555664, "logps/rejected": -45.633304595947266, "loss": 0.3624, "rewards/accuracies": 1.0, "rewards/chosen": -0.07153778523206711, "rewards/margins": 0.08885612338781357, "rewards/rejected": -0.16039390861988068, "step": 3851 }, { "epoch": 0.63, "learning_rate": 9.6254099918607e-07, "logits/chosen": -0.9141325950622559, "logits/rejected": -0.92220538854599, "logps/chosen": -102.24830627441406, "logps/rejected": -136.57904052734375, "loss": 3.3077, "rewards/accuracies": 0.0, "rewards/chosen": 0.36015626788139343, "rewards/margins": -4.854143142700195, "rewards/rejected": 5.214299201965332, "step": 3852 }, { "epoch": 0.63, "learning_rate": 9.624910721241588e-07, "logits/chosen": -0.5414304733276367, "logits/rejected": -0.5688794255256653, "logps/chosen": -75.73373413085938, "logps/rejected": -93.7469253540039, "loss": 2.1873, "rewards/accuracies": 1.0, "rewards/chosen": 1.4657105207443237, "rewards/margins": 0.1911308765411377, "rewards/rejected": 1.274579644203186, "step": 3853 }, { "epoch": 0.63, "learning_rate": 9.624411131084908e-07, "logits/chosen": -0.6249743103981018, "logits/rejected": -0.4728127419948578, "logps/chosen": -92.60363006591797, "logps/rejected": -58.784217834472656, "loss": 0.1382, "rewards/accuracies": 1.0, "rewards/chosen": 3.2087464332580566, "rewards/margins": 1.705439805984497, "rewards/rejected": 1.5033066272735596, "step": 3854 }, { "epoch": 0.63, "learning_rate": 9.623911221425174e-07, "logits/chosen": -0.6506456732749939, "logits/rejected": -0.6483326554298401, "logps/chosen": -0.7746951580047607, "logps/rejected": -4.572246074676514, "loss": 0.561, "rewards/accuracies": 0.0, "rewards/chosen": 0.23332026600837708, "rewards/margins": -0.057582974433898926, "rewards/rejected": 0.290903240442276, "step": 3855 }, { "epoch": 0.63, "learning_rate": 9.623410992296929e-07, "logits/chosen": -0.7153545022010803, "logits/rejected": -0.7164190411567688, "logps/chosen": -207.39035034179688, "logps/rejected": -108.03558349609375, "loss": 0.0999, "rewards/accuracies": 1.0, "rewards/chosen": 4.798291206359863, "rewards/margins": 2.2563233375549316, "rewards/rejected": 2.5419678688049316, "step": 3856 }, { "epoch": 0.63, "learning_rate": 9.62291044373473e-07, "logits/chosen": -0.7217284440994263, "logits/rejected": -0.6950057148933411, "logps/chosen": -70.97824096679688, "logps/rejected": -85.51254272460938, "loss": 0.3395, "rewards/accuracies": 1.0, "rewards/chosen": 1.5759705305099487, "rewards/margins": 0.7316864728927612, "rewards/rejected": 0.8442840576171875, "step": 3857 }, { "epoch": 0.63, "learning_rate": 9.62240957577316e-07, "logits/chosen": -0.33393800258636475, "logits/rejected": -0.2285861074924469, "logps/chosen": -43.177337646484375, "logps/rejected": -35.90254211425781, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 1.0658305883407593, "rewards/margins": 0.21926230192184448, "rewards/rejected": 0.8465682864189148, "step": 3858 }, { "epoch": 0.63, "learning_rate": 9.62190838844683e-07, "logits/chosen": -0.7585852146148682, "logits/rejected": -0.7951102256774902, "logps/chosen": -131.61727905273438, "logps/rejected": -78.68106079101562, "loss": 1.0521, "rewards/accuracies": 0.0, "rewards/chosen": 1.1362946033477783, "rewards/margins": -1.8577666282653809, "rewards/rejected": 2.994061231613159, "step": 3859 }, { "epoch": 0.63, "learning_rate": 9.62140688179036e-07, "logits/chosen": -0.6174051761627197, "logits/rejected": -0.5813974142074585, "logps/chosen": -23.70351791381836, "logps/rejected": -19.47440528869629, "loss": 0.6584, "rewards/accuracies": 0.0, "rewards/chosen": 1.032392144203186, "rewards/margins": -0.011034011840820312, "rewards/rejected": 1.0434261560440063, "step": 3860 }, { "epoch": 0.63, "learning_rate": 9.620905055838402e-07, "logits/chosen": -0.7043513059616089, "logits/rejected": -0.5896024107933044, "logps/chosen": -88.56617736816406, "logps/rejected": -53.031497955322266, "loss": 0.9794, "rewards/accuracies": 0.0, "rewards/chosen": 1.9514694213867188, "rewards/margins": -0.37256884574890137, "rewards/rejected": 2.32403826713562, "step": 3861 }, { "epoch": 0.63, "learning_rate": 9.62040291062563e-07, "logits/chosen": -0.7674242854118347, "logits/rejected": -0.7445173263549805, "logps/chosen": -72.31425476074219, "logps/rejected": -39.91703796386719, "loss": 1.5397, "rewards/accuracies": 0.0, "rewards/chosen": 0.3758690059185028, "rewards/margins": -0.8249588012695312, "rewards/rejected": 1.2008278369903564, "step": 3862 }, { "epoch": 0.63, "learning_rate": 9.619900446186734e-07, "logits/chosen": -1.0240461826324463, "logits/rejected": -0.9488001465797424, "logps/chosen": -115.53063201904297, "logps/rejected": -85.44975280761719, "loss": 0.7552, "rewards/accuracies": 0.0, "rewards/chosen": 0.3326621949672699, "rewards/margins": -1.2120338678359985, "rewards/rejected": 1.5446960926055908, "step": 3863 }, { "epoch": 0.63, "learning_rate": 9.619397662556433e-07, "logits/chosen": -0.9052680730819702, "logits/rejected": -0.8955774307250977, "logps/chosen": -93.53094482421875, "logps/rejected": -39.15162658691406, "loss": 0.1969, "rewards/accuracies": 1.0, "rewards/chosen": 1.802533745765686, "rewards/margins": 1.5387961864471436, "rewards/rejected": 0.2637374997138977, "step": 3864 }, { "epoch": 0.63, "learning_rate": 9.618894559769462e-07, "logits/chosen": -0.4753715693950653, "logits/rejected": -0.4753715693950653, "logps/chosen": -96.98788452148438, "logps/rejected": -96.98788452148438, "loss": 0.3533, "rewards/accuracies": 0.0, "rewards/chosen": 1.7099014520645142, "rewards/margins": 0.0, "rewards/rejected": 1.7099014520645142, "step": 3865 }, { "epoch": 0.63, "learning_rate": 9.618391137860582e-07, "logits/chosen": -0.5704286098480225, "logits/rejected": -0.6225592494010925, "logps/chosen": -93.66617584228516, "logps/rejected": -116.59835815429688, "loss": 1.4653, "rewards/accuracies": 0.0, "rewards/chosen": 1.1562172174453735, "rewards/margins": -2.277637481689453, "rewards/rejected": 3.433854818344116, "step": 3866 }, { "epoch": 0.63, "learning_rate": 9.617887396864573e-07, "logits/chosen": -0.7591677308082581, "logits/rejected": -0.7688509225845337, "logps/chosen": -72.63970184326172, "logps/rejected": -77.60812377929688, "loss": 1.1005, "rewards/accuracies": 0.0, "rewards/chosen": 1.6316200494766235, "rewards/margins": -1.696001410484314, "rewards/rejected": 3.3276214599609375, "step": 3867 }, { "epoch": 0.63, "learning_rate": 9.61738333681624e-07, "logits/chosen": -0.31674468517303467, "logits/rejected": -0.5605230927467346, "logps/chosen": -104.22920989990234, "logps/rejected": -111.05668640136719, "loss": 1.1291, "rewards/accuracies": 0.0, "rewards/chosen": 1.5972007513046265, "rewards/margins": -1.3052605390548706, "rewards/rejected": 2.902461290359497, "step": 3868 }, { "epoch": 0.63, "learning_rate": 9.616878957750407e-07, "logits/chosen": -0.41760730743408203, "logits/rejected": -0.4033220708370209, "logps/chosen": -54.28076934814453, "logps/rejected": -57.48664093017578, "loss": 2.1553, "rewards/accuracies": 0.0, "rewards/chosen": 1.5527397394180298, "rewards/margins": -0.593854546546936, "rewards/rejected": 2.146594285964966, "step": 3869 }, { "epoch": 0.63, "learning_rate": 9.616374259701925e-07, "logits/chosen": -0.6698117256164551, "logits/rejected": -0.7023324966430664, "logps/chosen": -78.33455657958984, "logps/rejected": -174.47537231445312, "loss": 0.469, "rewards/accuracies": 0.0, "rewards/chosen": 0.3047126829624176, "rewards/margins": -0.43134382367134094, "rewards/rejected": 0.7360565066337585, "step": 3870 }, { "epoch": 0.63, "learning_rate": 9.615869242705663e-07, "logits/chosen": -0.7079372406005859, "logits/rejected": -0.6852352023124695, "logps/chosen": -42.30827331542969, "logps/rejected": -30.199586868286133, "loss": 0.3607, "rewards/accuracies": 1.0, "rewards/chosen": 2.424025058746338, "rewards/margins": 0.8757418394088745, "rewards/rejected": 1.5482832193374634, "step": 3871 }, { "epoch": 0.63, "learning_rate": 9.615363906796509e-07, "logits/chosen": -0.46438196301460266, "logits/rejected": -0.47972559928894043, "logps/chosen": -48.82539367675781, "logps/rejected": -86.32347869873047, "loss": 0.7556, "rewards/accuracies": 1.0, "rewards/chosen": 0.31610146164894104, "rewards/margins": 0.027338802814483643, "rewards/rejected": 0.2887626588344574, "step": 3872 }, { "epoch": 0.63, "learning_rate": 9.614858252009384e-07, "logits/chosen": -0.7009881734848022, "logits/rejected": -0.739149808883667, "logps/chosen": -83.11395263671875, "logps/rejected": -57.82919692993164, "loss": 0.441, "rewards/accuracies": 0.0, "rewards/chosen": 1.2734123468399048, "rewards/margins": -0.11262929439544678, "rewards/rejected": 1.3860416412353516, "step": 3873 }, { "epoch": 0.63, "learning_rate": 9.614352278379216e-07, "logits/chosen": -0.66954106092453, "logits/rejected": -0.6113938689231873, "logps/chosen": -60.53855895996094, "logps/rejected": -65.99312591552734, "loss": 0.9703, "rewards/accuracies": 0.0, "rewards/chosen": 0.9037376642227173, "rewards/margins": -0.5887085199356079, "rewards/rejected": 1.4924461841583252, "step": 3874 }, { "epoch": 0.63, "learning_rate": 9.613845985940969e-07, "logits/chosen": -0.8838486671447754, "logits/rejected": -0.79311603307724, "logps/chosen": -141.1103515625, "logps/rejected": -77.96209716796875, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": 3.1198577880859375, "rewards/margins": 1.578820824623108, "rewards/rejected": 1.5410369634628296, "step": 3875 }, { "epoch": 0.63, "learning_rate": 9.61333937472962e-07, "logits/chosen": -0.5744134187698364, "logits/rejected": -0.60503751039505, "logps/chosen": -146.88818359375, "logps/rejected": -72.67218017578125, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.9471115469932556, "rewards/margins": -1.0467581748962402, "rewards/rejected": 1.9938697814941406, "step": 3876 }, { "epoch": 0.63, "learning_rate": 9.612832444780175e-07, "logits/chosen": -0.5171766877174377, "logits/rejected": -0.5905799269676208, "logps/chosen": -73.2656478881836, "logps/rejected": -105.85308074951172, "loss": 1.8433, "rewards/accuracies": 0.0, "rewards/chosen": 0.7024444937705994, "rewards/margins": -2.4179511070251465, "rewards/rejected": 3.1203956604003906, "step": 3877 }, { "epoch": 0.63, "learning_rate": 9.612325196127654e-07, "logits/chosen": -0.5517037510871887, "logits/rejected": -0.5335257649421692, "logps/chosen": -40.53905487060547, "logps/rejected": -69.86074829101562, "loss": 0.5598, "rewards/accuracies": 0.0, "rewards/chosen": 1.745622992515564, "rewards/margins": -0.7061723470687866, "rewards/rejected": 2.4517953395843506, "step": 3878 }, { "epoch": 0.63, "learning_rate": 9.611817628807103e-07, "logits/chosen": -0.3319704532623291, "logits/rejected": -0.3586447536945343, "logps/chosen": -5.247481822967529, "logps/rejected": -41.047794342041016, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 0.6222399473190308, "rewards/margins": 0.5021765232086182, "rewards/rejected": 0.1200634017586708, "step": 3879 }, { "epoch": 0.63, "learning_rate": 9.61130974285359e-07, "logits/chosen": -0.8754441142082214, "logits/rejected": -0.7686243653297424, "logps/chosen": -143.465576171875, "logps/rejected": -101.49159240722656, "loss": 0.2835, "rewards/accuracies": 1.0, "rewards/chosen": 3.5839920043945312, "rewards/margins": 0.3229491710662842, "rewards/rejected": 3.261042833328247, "step": 3880 }, { "epoch": 0.63, "learning_rate": 9.610801538302207e-07, "logits/chosen": -0.4682905077934265, "logits/rejected": -0.44770434498786926, "logps/chosen": -160.84788513183594, "logps/rejected": -69.21939849853516, "loss": 0.6181, "rewards/accuracies": 1.0, "rewards/chosen": 3.1722047328948975, "rewards/margins": 1.3376351594924927, "rewards/rejected": 1.8345695734024048, "step": 3881 }, { "epoch": 0.63, "learning_rate": 9.610293015188067e-07, "logits/chosen": -0.8060232996940613, "logits/rejected": -0.6399542689323425, "logps/chosen": -68.5684814453125, "logps/rejected": -118.14765930175781, "loss": 0.8332, "rewards/accuracies": 0.0, "rewards/chosen": 0.9816848635673523, "rewards/margins": -0.35185545682907104, "rewards/rejected": 1.3335403203964233, "step": 3882 }, { "epoch": 0.63, "learning_rate": 9.609784173546302e-07, "logits/chosen": -0.43052414059638977, "logits/rejected": -0.6382544636726379, "logps/chosen": -140.00538635253906, "logps/rejected": -58.352455139160156, "loss": 0.2428, "rewards/accuracies": 1.0, "rewards/chosen": 3.122576951980591, "rewards/margins": 1.0040335655212402, "rewards/rejected": 2.1185433864593506, "step": 3883 }, { "epoch": 0.63, "learning_rate": 9.60927501341207e-07, "logits/chosen": -0.7298829555511475, "logits/rejected": -0.758101761341095, "logps/chosen": -97.29967498779297, "logps/rejected": -70.20874786376953, "loss": 1.1528, "rewards/accuracies": 0.0, "rewards/chosen": 1.3371574878692627, "rewards/margins": -0.8932967185974121, "rewards/rejected": 2.230454206466675, "step": 3884 }, { "epoch": 0.63, "learning_rate": 9.608765534820547e-07, "logits/chosen": -0.3426210582256317, "logits/rejected": -0.35454660654067993, "logps/chosen": -75.74301147460938, "logps/rejected": -34.433658599853516, "loss": 2.7017, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961571097373962, "rewards/margins": 0.010225296020507812, "rewards/rejected": 0.9859318137168884, "step": 3885 }, { "epoch": 0.63, "learning_rate": 9.608255737806932e-07, "logits/chosen": -0.24181579053401947, "logits/rejected": -0.2638546824455261, "logps/chosen": -15.29220199584961, "logps/rejected": -3.1390380859375, "loss": 0.876, "rewards/accuracies": 0.0, "rewards/chosen": -0.3032381236553192, "rewards/margins": -0.3655601143836975, "rewards/rejected": 0.06232199817895889, "step": 3886 }, { "epoch": 0.63, "learning_rate": 9.60774562240645e-07, "logits/chosen": -0.6365565657615662, "logits/rejected": -0.4938722550868988, "logps/chosen": -122.5624771118164, "logps/rejected": -53.246463775634766, "loss": 0.0862, "rewards/accuracies": 1.0, "rewards/chosen": 3.7851829528808594, "rewards/margins": 2.1021220684051514, "rewards/rejected": 1.683060884475708, "step": 3887 }, { "epoch": 0.63, "learning_rate": 9.607235188654349e-07, "logits/chosen": -0.8676555156707764, "logits/rejected": -0.8706309795379639, "logps/chosen": -108.353271484375, "logps/rejected": -73.71774291992188, "loss": 1.0583, "rewards/accuracies": 0.0, "rewards/chosen": 0.9342567324638367, "rewards/margins": -1.0195114612579346, "rewards/rejected": 1.9537681341171265, "step": 3888 }, { "epoch": 0.63, "learning_rate": 9.606724436585885e-07, "logits/chosen": -0.7610240578651428, "logits/rejected": -0.7250264883041382, "logps/chosen": -183.33517456054688, "logps/rejected": -175.0226593017578, "loss": 1.5124, "rewards/accuracies": 0.0, "rewards/chosen": 3.2841522693634033, "rewards/margins": -2.969377279281616, "rewards/rejected": 6.2535295486450195, "step": 3889 }, { "epoch": 0.63, "learning_rate": 9.606213366236353e-07, "logits/chosen": -0.700269341468811, "logits/rejected": -0.7157238721847534, "logps/chosen": -71.75262451171875, "logps/rejected": -62.467041015625, "loss": 0.2985, "rewards/accuracies": 1.0, "rewards/chosen": 1.4727143049240112, "rewards/margins": 0.22360694408416748, "rewards/rejected": 1.2491073608398438, "step": 3890 }, { "epoch": 0.63, "learning_rate": 9.605701977641063e-07, "logits/chosen": -0.6707380414009094, "logits/rejected": -0.6370737552642822, "logps/chosen": -70.98037719726562, "logps/rejected": -88.96601867675781, "loss": 0.3224, "rewards/accuracies": 1.0, "rewards/chosen": 2.576899766921997, "rewards/margins": 0.30966949462890625, "rewards/rejected": 2.267230272293091, "step": 3891 }, { "epoch": 0.63, "learning_rate": 9.605190270835346e-07, "logits/chosen": -0.6480754017829895, "logits/rejected": -0.5649492144584656, "logps/chosen": -75.82581329345703, "logps/rejected": -73.82083129882812, "loss": 0.6385, "rewards/accuracies": 0.0, "rewards/chosen": 2.35835337638855, "rewards/margins": -0.06755828857421875, "rewards/rejected": 2.4259116649627686, "step": 3892 }, { "epoch": 0.63, "learning_rate": 9.604678245854555e-07, "logits/chosen": -0.5899699330329895, "logits/rejected": -0.5899699330329895, "logps/chosen": -51.38722610473633, "logps/rejected": -51.38722610473633, "loss": 2.0381, "rewards/accuracies": 0.0, "rewards/chosen": 1.4160289764404297, "rewards/margins": 0.0, "rewards/rejected": 1.4160289764404297, "step": 3893 }, { "epoch": 0.63, "learning_rate": 9.604165902734068e-07, "logits/chosen": -0.4949242174625397, "logits/rejected": -0.40183261036872864, "logps/chosen": -64.66586303710938, "logps/rejected": -58.7592658996582, "loss": 0.2958, "rewards/accuracies": 1.0, "rewards/chosen": 2.1703553199768066, "rewards/margins": 0.44420063495635986, "rewards/rejected": 1.7261546850204468, "step": 3894 }, { "epoch": 0.63, "learning_rate": 9.60365324150928e-07, "logits/chosen": -0.2932579815387726, "logits/rejected": -0.2932579815387726, "logps/chosen": -93.1070556640625, "logps/rejected": -93.1070556640625, "loss": 0.3689, "rewards/accuracies": 0.0, "rewards/chosen": 1.09393310546875, "rewards/margins": 0.0, "rewards/rejected": 1.09393310546875, "step": 3895 }, { "epoch": 0.63, "learning_rate": 9.603140262215616e-07, "logits/chosen": -0.7185393571853638, "logits/rejected": -0.690753161907196, "logps/chosen": -61.04265594482422, "logps/rejected": -9.602983474731445, "loss": 0.5556, "rewards/accuracies": 1.0, "rewards/chosen": 1.286536455154419, "rewards/margins": 0.5049389004707336, "rewards/rejected": 0.7815975546836853, "step": 3896 }, { "epoch": 0.63, "learning_rate": 9.602626964888514e-07, "logits/chosen": -1.0678784847259521, "logits/rejected": -1.0133593082427979, "logps/chosen": -144.9253387451172, "logps/rejected": -189.19757080078125, "loss": 1.3306, "rewards/accuracies": 0.0, "rewards/chosen": 0.9222946166992188, "rewards/margins": -2.255091905593872, "rewards/rejected": 3.177386522293091, "step": 3897 }, { "epoch": 0.63, "learning_rate": 9.602113349563438e-07, "logits/chosen": -0.7759073376655579, "logits/rejected": -0.6607097387313843, "logps/chosen": -104.04205322265625, "logps/rejected": -91.72897338867188, "loss": 0.2128, "rewards/accuracies": 1.0, "rewards/chosen": 4.422868251800537, "rewards/margins": 1.0510129928588867, "rewards/rejected": 3.3718552589416504, "step": 3898 }, { "epoch": 0.63, "learning_rate": 9.601599416275877e-07, "logits/chosen": -0.7486725449562073, "logits/rejected": -0.6856604218482971, "logps/chosen": -47.88889694213867, "logps/rejected": -47.13616943359375, "loss": 0.7671, "rewards/accuracies": 0.0, "rewards/chosen": 2.1158719062805176, "rewards/margins": -0.2768855094909668, "rewards/rejected": 2.3927574157714844, "step": 3899 }, { "epoch": 0.63, "learning_rate": 9.601085165061336e-07, "logits/chosen": -0.6408460736274719, "logits/rejected": -0.5823169946670532, "logps/chosen": -102.490234375, "logps/rejected": -127.42413330078125, "loss": 1.6345, "rewards/accuracies": 0.0, "rewards/chosen": 3.19085693359375, "rewards/margins": -3.1923828125, "rewards/rejected": 6.38323974609375, "step": 3900 }, { "epoch": 0.63, "learning_rate": 9.600570595955346e-07, "logits/chosen": -0.689588725566864, "logits/rejected": -0.5482984781265259, "logps/chosen": -64.4497299194336, "logps/rejected": -60.81779861450195, "loss": 1.373, "rewards/accuracies": 0.0, "rewards/chosen": 1.0345535278320312, "rewards/margins": -1.7134883403778076, "rewards/rejected": 2.748041868209839, "step": 3901 }, { "epoch": 0.63, "learning_rate": 9.60005570899346e-07, "logits/chosen": -0.619868278503418, "logits/rejected": -0.5428964495658875, "logps/chosen": -194.57496643066406, "logps/rejected": -126.32868957519531, "loss": 0.4582, "rewards/accuracies": 0.0, "rewards/chosen": 0.7559829950332642, "rewards/margins": -0.06130063533782959, "rewards/rejected": 0.8172836303710938, "step": 3902 }, { "epoch": 0.63, "learning_rate": 9.59954050421125e-07, "logits/chosen": -0.9395692348480225, "logits/rejected": -0.8610444068908691, "logps/chosen": -148.12765502929688, "logps/rejected": -38.723785400390625, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 3.536677598953247, "rewards/margins": 2.8444125652313232, "rewards/rejected": 0.692264974117279, "step": 3903 }, { "epoch": 0.63, "learning_rate": 9.599024981644312e-07, "logits/chosen": -0.5477415323257446, "logits/rejected": -0.6039231419563293, "logps/chosen": -92.72891235351562, "logps/rejected": -118.30045318603516, "loss": 0.6253, "rewards/accuracies": 0.0, "rewards/chosen": 1.2595665454864502, "rewards/margins": -0.8119521141052246, "rewards/rejected": 2.071518659591675, "step": 3904 }, { "epoch": 0.63, "learning_rate": 9.598509141328263e-07, "logits/chosen": -0.7160147428512573, "logits/rejected": -0.6904642581939697, "logps/chosen": -52.53618621826172, "logps/rejected": -119.05401611328125, "loss": 0.434, "rewards/accuracies": 0.0, "rewards/chosen": 0.7939972281455994, "rewards/margins": -0.189208984375, "rewards/rejected": 0.9832062125205994, "step": 3905 }, { "epoch": 0.63, "learning_rate": 9.597992983298745e-07, "logits/chosen": -0.873237669467926, "logits/rejected": -0.7836378216743469, "logps/chosen": -179.21435546875, "logps/rejected": -13.965258598327637, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": 3.4111390113830566, "rewards/margins": 2.653815269470215, "rewards/rejected": 0.7573237419128418, "step": 3906 }, { "epoch": 0.63, "learning_rate": 9.59747650759142e-07, "logits/chosen": -0.23990498483181, "logits/rejected": -0.23990498483181, "logps/chosen": -38.24822998046875, "logps/rejected": -38.24822998046875, "loss": 1.8671, "rewards/accuracies": 0.0, "rewards/chosen": 1.912279486656189, "rewards/margins": 0.0, "rewards/rejected": 1.912279486656189, "step": 3907 }, { "epoch": 0.63, "learning_rate": 9.596959714241968e-07, "logits/chosen": -0.48698869347572327, "logits/rejected": -0.4953574240207672, "logps/chosen": -72.67411041259766, "logps/rejected": -94.22012329101562, "loss": 1.3462, "rewards/accuracies": 0.0, "rewards/chosen": 1.6973587274551392, "rewards/margins": -0.8821853399276733, "rewards/rejected": 2.5795440673828125, "step": 3908 }, { "epoch": 0.63, "learning_rate": 9.596442603286097e-07, "logits/chosen": -0.6776076555252075, "logits/rejected": -0.674701452255249, "logps/chosen": -120.00716400146484, "logps/rejected": -165.32046508789062, "loss": 0.4276, "rewards/accuracies": 1.0, "rewards/chosen": 4.738556861877441, "rewards/margins": 1.777355432510376, "rewards/rejected": 2.9612014293670654, "step": 3909 }, { "epoch": 0.63, "learning_rate": 9.595925174759537e-07, "logits/chosen": -0.7750880122184753, "logits/rejected": -0.6826991438865662, "logps/chosen": -73.68109893798828, "logps/rejected": -54.63670349121094, "loss": 0.1662, "rewards/accuracies": 1.0, "rewards/chosen": 2.396573781967163, "rewards/margins": 1.136917233467102, "rewards/rejected": 1.259656548500061, "step": 3910 }, { "epoch": 0.63, "learning_rate": 9.59540742869803e-07, "logits/chosen": -0.6402163505554199, "logits/rejected": -0.5971899628639221, "logps/chosen": -72.67424011230469, "logps/rejected": -44.20346450805664, "loss": 0.5666, "rewards/accuracies": 1.0, "rewards/chosen": 0.8926803469657898, "rewards/margins": 0.022279322147369385, "rewards/rejected": 0.8704010248184204, "step": 3911 }, { "epoch": 0.63, "learning_rate": 9.594889365137352e-07, "logits/chosen": -0.605861485004425, "logits/rejected": -0.6600908041000366, "logps/chosen": -237.90370178222656, "logps/rejected": -46.72687530517578, "loss": 1.01, "rewards/accuracies": 1.0, "rewards/chosen": 3.2810380458831787, "rewards/margins": 1.0949013233184814, "rewards/rejected": 2.1861367225646973, "step": 3912 }, { "epoch": 0.64, "learning_rate": 9.5943709841133e-07, "logits/chosen": -0.7388917803764343, "logits/rejected": -0.6643965840339661, "logps/chosen": -92.76985931396484, "logps/rejected": -53.913394927978516, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": 3.3571083545684814, "rewards/margins": 2.091794967651367, "rewards/rejected": 1.2653133869171143, "step": 3913 }, { "epoch": 0.64, "learning_rate": 9.593852285661683e-07, "logits/chosen": -0.38940879702568054, "logits/rejected": -0.4055361747741699, "logps/chosen": -51.02998352050781, "logps/rejected": -64.59873962402344, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 1.027103066444397, "rewards/margins": -0.7270419597625732, "rewards/rejected": 1.7541450262069702, "step": 3914 }, { "epoch": 0.64, "learning_rate": 9.59333326981834e-07, "logits/chosen": -0.30480411648750305, "logits/rejected": -0.3256235420703888, "logps/chosen": -80.80828857421875, "logps/rejected": -97.09364318847656, "loss": 1.8757, "rewards/accuracies": 0.0, "rewards/chosen": 1.4890708923339844, "rewards/margins": -3.634772777557373, "rewards/rejected": 5.123843669891357, "step": 3915 }, { "epoch": 0.64, "learning_rate": 9.59281393661913e-07, "logits/chosen": -0.586395263671875, "logits/rejected": -0.5767417550086975, "logps/chosen": -53.44864273071289, "logps/rejected": -109.98646545410156, "loss": 0.7911, "rewards/accuracies": 1.0, "rewards/chosen": 1.2832424640655518, "rewards/margins": 0.6494671106338501, "rewards/rejected": 0.6337753534317017, "step": 3916 }, { "epoch": 0.64, "learning_rate": 9.592294286099938e-07, "logits/chosen": -0.3870979845523834, "logits/rejected": -0.38398537039756775, "logps/chosen": -0.7846599817276001, "logps/rejected": -2.0668585300445557, "loss": 0.5803, "rewards/accuracies": 1.0, "rewards/chosen": 0.17147402465343475, "rewards/margins": 0.08242752403020859, "rewards/rejected": 0.08904650062322617, "step": 3917 }, { "epoch": 0.64, "learning_rate": 9.59177431829666e-07, "logits/chosen": -0.33115264773368835, "logits/rejected": -0.26392948627471924, "logps/chosen": -43.0191650390625, "logps/rejected": -38.488990783691406, "loss": 0.7965, "rewards/accuracies": 1.0, "rewards/chosen": 1.8491318225860596, "rewards/margins": 0.5796276330947876, "rewards/rejected": 1.269504189491272, "step": 3918 }, { "epoch": 0.64, "learning_rate": 9.591254033245227e-07, "logits/chosen": -0.5544798374176025, "logits/rejected": -0.5308942794799805, "logps/chosen": -18.124849319458008, "logps/rejected": -50.089866638183594, "loss": 0.5938, "rewards/accuracies": 0.0, "rewards/chosen": 1.0937174558639526, "rewards/margins": -0.6057103872299194, "rewards/rejected": 1.699427843093872, "step": 3919 }, { "epoch": 0.64, "learning_rate": 9.590733430981582e-07, "logits/chosen": -0.5749043822288513, "logits/rejected": -0.5586249232292175, "logps/chosen": -77.86865234375, "logps/rejected": -88.75227355957031, "loss": 1.0523, "rewards/accuracies": 0.0, "rewards/chosen": 0.8861252069473267, "rewards/margins": -1.923226237297058, "rewards/rejected": 2.8093514442443848, "step": 3920 }, { "epoch": 0.64, "learning_rate": 9.590212511541693e-07, "logits/chosen": -0.28339865803718567, "logits/rejected": -0.29514920711517334, "logps/chosen": -36.38800811767578, "logps/rejected": -55.28481674194336, "loss": 1.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.9883934259414673, "rewards/margins": -0.24849355220794678, "rewards/rejected": 1.236886978149414, "step": 3921 }, { "epoch": 0.64, "learning_rate": 9.589691274961555e-07, "logits/chosen": -0.8294582962989807, "logits/rejected": -0.7239663004875183, "logps/chosen": -146.25137329101562, "logps/rejected": -32.086822509765625, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 4.063027858734131, "rewards/margins": 3.17458176612854, "rewards/rejected": 0.888446033000946, "step": 3922 }, { "epoch": 0.64, "learning_rate": 9.589169721277177e-07, "logits/chosen": -0.8614311218261719, "logits/rejected": -0.8335428833961487, "logps/chosen": -100.16461181640625, "logps/rejected": -84.33573150634766, "loss": 1.1863, "rewards/accuracies": 0.0, "rewards/chosen": 0.0009094238630495965, "rewards/margins": -2.1574602127075195, "rewards/rejected": 2.158369541168213, "step": 3923 }, { "epoch": 0.64, "learning_rate": 9.588647850524594e-07, "logits/chosen": -1.008200764656067, "logits/rejected": -0.9762060046195984, "logps/chosen": -107.19400024414062, "logps/rejected": -24.38083267211914, "loss": 0.1177, "rewards/accuracies": 1.0, "rewards/chosen": 1.669641137123108, "rewards/margins": 1.5008134841918945, "rewards/rejected": 0.16882763803005219, "step": 3924 }, { "epoch": 0.64, "learning_rate": 9.588125662739862e-07, "logits/chosen": -0.28947627544403076, "logits/rejected": -0.2922915518283844, "logps/chosen": -75.01434326171875, "logps/rejected": -72.78561401367188, "loss": 0.6342, "rewards/accuracies": 1.0, "rewards/chosen": 1.9427833557128906, "rewards/margins": 1.183234453201294, "rewards/rejected": 0.7595489621162415, "step": 3925 }, { "epoch": 0.64, "learning_rate": 9.587603157959062e-07, "logits/chosen": -0.5627397298812866, "logits/rejected": -0.5057664513587952, "logps/chosen": -143.520263671875, "logps/rejected": -20.393362045288086, "loss": 0.1593, "rewards/accuracies": 1.0, "rewards/chosen": 6.697845458984375, "rewards/margins": 6.413607597351074, "rewards/rejected": 0.2842378616333008, "step": 3926 }, { "epoch": 0.64, "learning_rate": 9.587080336218292e-07, "logits/chosen": -0.5017815828323364, "logits/rejected": -0.5017815828323364, "logps/chosen": -81.27713012695312, "logps/rejected": -81.27713012695312, "loss": 0.4887, "rewards/accuracies": 0.0, "rewards/chosen": 0.9699211120605469, "rewards/margins": 0.0, "rewards/rejected": 0.9699211120605469, "step": 3927 }, { "epoch": 0.64, "learning_rate": 9.586557197553673e-07, "logits/chosen": -0.2605953514575958, "logits/rejected": -0.4157112240791321, "logps/chosen": -121.40176391601562, "logps/rejected": -143.7111053466797, "loss": 1.0956, "rewards/accuracies": 0.0, "rewards/chosen": 1.2458374500274658, "rewards/margins": -1.9867935180664062, "rewards/rejected": 3.232630968093872, "step": 3928 }, { "epoch": 0.64, "learning_rate": 9.58603374200135e-07, "logits/chosen": -0.9415435194969177, "logits/rejected": -0.9172530174255371, "logps/chosen": -37.472808837890625, "logps/rejected": -30.547874450683594, "loss": 0.457, "rewards/accuracies": 0.0, "rewards/chosen": 0.6034378409385681, "rewards/margins": -0.2493610382080078, "rewards/rejected": 0.8527988791465759, "step": 3929 }, { "epoch": 0.64, "learning_rate": 9.58550996959749e-07, "logits/chosen": -0.08816104382276535, "logits/rejected": -0.041001688688993454, "logps/chosen": -44.51606369018555, "logps/rejected": -49.7159423828125, "loss": 0.314, "rewards/accuracies": 1.0, "rewards/chosen": 1.7501472234725952, "rewards/margins": 1.2818301916122437, "rewards/rejected": 0.46831703186035156, "step": 3930 }, { "epoch": 0.64, "learning_rate": 9.584985880378278e-07, "logits/chosen": -0.6459586024284363, "logits/rejected": -0.6749672293663025, "logps/chosen": -54.1646614074707, "logps/rejected": -52.985774993896484, "loss": 0.2994, "rewards/accuracies": 1.0, "rewards/chosen": 1.6014240980148315, "rewards/margins": 0.3096306324005127, "rewards/rejected": 1.2917934656143188, "step": 3931 }, { "epoch": 0.64, "learning_rate": 9.584461474379925e-07, "logits/chosen": -0.6528204083442688, "logits/rejected": -0.5353260040283203, "logps/chosen": -119.38291931152344, "logps/rejected": -28.2239933013916, "loss": 0.5324, "rewards/accuracies": 1.0, "rewards/chosen": 1.2936325073242188, "rewards/margins": 0.7257135510444641, "rewards/rejected": 0.5679189562797546, "step": 3932 }, { "epoch": 0.64, "learning_rate": 9.583936751638666e-07, "logits/chosen": -0.9679537415504456, "logits/rejected": -0.9289447069168091, "logps/chosen": -159.6131591796875, "logps/rejected": -115.8704605102539, "loss": 0.3513, "rewards/accuracies": 1.0, "rewards/chosen": 2.465986728668213, "rewards/margins": 1.415570855140686, "rewards/rejected": 1.0504158735275269, "step": 3933 }, { "epoch": 0.64, "learning_rate": 9.583411712190749e-07, "logits/chosen": -0.4926486611366272, "logits/rejected": -0.5066066980361938, "logps/chosen": -65.31501007080078, "logps/rejected": -117.31485748291016, "loss": 1.6381, "rewards/accuracies": 0.0, "rewards/chosen": 1.1240028142929077, "rewards/margins": -0.45661473274230957, "rewards/rejected": 1.5806175470352173, "step": 3934 }, { "epoch": 0.64, "learning_rate": 9.58288635607245e-07, "logits/chosen": -0.6512258648872375, "logits/rejected": -0.5516232848167419, "logps/chosen": -47.28474426269531, "logps/rejected": -24.381301879882812, "loss": 0.5188, "rewards/accuracies": 1.0, "rewards/chosen": 0.96010822057724, "rewards/margins": 0.5924572348594666, "rewards/rejected": 0.36765098571777344, "step": 3935 }, { "epoch": 0.64, "learning_rate": 9.582360683320068e-07, "logits/chosen": -0.7920616865158081, "logits/rejected": -0.7137195467948914, "logps/chosen": -159.46707153320312, "logps/rejected": -116.5131607055664, "loss": 0.9437, "rewards/accuracies": 0.0, "rewards/chosen": 0.9098953604698181, "rewards/margins": -0.9571815133094788, "rewards/rejected": 1.8670768737792969, "step": 3936 }, { "epoch": 0.64, "learning_rate": 9.581834693969923e-07, "logits/chosen": -0.5760580897331238, "logits/rejected": -0.34376996755599976, "logps/chosen": -131.34942626953125, "logps/rejected": -67.38700866699219, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 4.6600341796875, "rewards/margins": 1.7286818027496338, "rewards/rejected": 2.931352376937866, "step": 3937 }, { "epoch": 0.64, "learning_rate": 9.581308388058354e-07, "logits/chosen": -0.5844846963882446, "logits/rejected": -0.6507666707038879, "logps/chosen": -62.894691467285156, "logps/rejected": -105.81283569335938, "loss": 0.5487, "rewards/accuracies": 1.0, "rewards/chosen": 2.6410224437713623, "rewards/margins": 0.052689313888549805, "rewards/rejected": 2.5883331298828125, "step": 3938 }, { "epoch": 0.64, "learning_rate": 9.580781765621724e-07, "logits/chosen": -0.6611924767494202, "logits/rejected": -0.5738396048545837, "logps/chosen": -38.33424377441406, "logps/rejected": -40.173744201660156, "loss": 0.6979, "rewards/accuracies": 1.0, "rewards/chosen": 2.053395986557007, "rewards/margins": 0.32859885692596436, "rewards/rejected": 1.7247971296310425, "step": 3939 }, { "epoch": 0.64, "learning_rate": 9.580254826696417e-07, "logits/chosen": -0.885199785232544, "logits/rejected": -0.8220933675765991, "logps/chosen": -219.57290649414062, "logps/rejected": -129.8494110107422, "loss": 1.1818, "rewards/accuracies": 0.0, "rewards/chosen": 3.438319444656372, "rewards/margins": -1.6320602893829346, "rewards/rejected": 5.070379734039307, "step": 3940 }, { "epoch": 0.64, "learning_rate": 9.57972757131884e-07, "logits/chosen": -0.6979947686195374, "logits/rejected": -0.615516722202301, "logps/chosen": -161.3405303955078, "logps/rejected": -26.80620574951172, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 5.119960308074951, "rewards/margins": 4.910943508148193, "rewards/rejected": 0.2090167999267578, "step": 3941 }, { "epoch": 0.64, "learning_rate": 9.579199999525423e-07, "logits/chosen": -0.4828949272632599, "logits/rejected": -0.44883444905281067, "logps/chosen": -45.20718002319336, "logps/rejected": -2.352548599243164, "loss": 1.2897, "rewards/accuracies": 0.0, "rewards/chosen": -0.2149818390607834, "rewards/margins": -0.6357134580612183, "rewards/rejected": 0.4207316040992737, "step": 3942 }, { "epoch": 0.64, "learning_rate": 9.578672111352613e-07, "logits/chosen": -0.8653886318206787, "logits/rejected": -0.6834530830383301, "logps/chosen": -140.6063232421875, "logps/rejected": -195.98379516601562, "loss": 0.8342, "rewards/accuracies": 0.0, "rewards/chosen": 4.185406684875488, "rewards/margins": -1.3729643821716309, "rewards/rejected": 5.558371067047119, "step": 3943 }, { "epoch": 0.64, "learning_rate": 9.578143906836884e-07, "logits/chosen": -0.8634841442108154, "logits/rejected": -0.7602157592773438, "logps/chosen": -90.07645416259766, "logps/rejected": -122.06166076660156, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 1.8783591985702515, "rewards/margins": 0.5744452476501465, "rewards/rejected": 1.303913950920105, "step": 3944 }, { "epoch": 0.64, "learning_rate": 9.577615386014732e-07, "logits/chosen": -0.7006195187568665, "logits/rejected": -0.5375313758850098, "logps/chosen": -138.191650390625, "logps/rejected": -191.2179412841797, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": 7.013354778289795, "rewards/margins": 0.2462329864501953, "rewards/rejected": 6.7671217918396, "step": 3945 }, { "epoch": 0.64, "learning_rate": 9.577086548922668e-07, "logits/chosen": -1.1911464929580688, "logits/rejected": -1.2159066200256348, "logps/chosen": -156.83291625976562, "logps/rejected": -85.53588104248047, "loss": 1.1115, "rewards/accuracies": 0.0, "rewards/chosen": 1.6088287830352783, "rewards/margins": -1.938378095626831, "rewards/rejected": 3.5472068786621094, "step": 3946 }, { "epoch": 0.64, "learning_rate": 9.576557395597236e-07, "logits/chosen": -0.5260149240493774, "logits/rejected": -0.5292813181877136, "logps/chosen": -79.92161560058594, "logps/rejected": -118.58525085449219, "loss": 1.0488, "rewards/accuracies": 0.0, "rewards/chosen": 3.260491132736206, "rewards/margins": -1.005047082901001, "rewards/rejected": 4.265538215637207, "step": 3947 }, { "epoch": 0.64, "learning_rate": 9.57602792607499e-07, "logits/chosen": -0.5645658373832703, "logits/rejected": -0.6639061570167542, "logps/chosen": -95.35533905029297, "logps/rejected": -145.66091918945312, "loss": 1.115, "rewards/accuracies": 0.0, "rewards/chosen": 1.3108834028244019, "rewards/margins": -0.13247597217559814, "rewards/rejected": 1.443359375, "step": 3948 }, { "epoch": 0.64, "learning_rate": 9.57549814039251e-07, "logits/chosen": -0.614405632019043, "logits/rejected": -0.6512407660484314, "logps/chosen": -83.5229263305664, "logps/rejected": -98.05859375, "loss": 1.0529, "rewards/accuracies": 0.0, "rewards/chosen": 1.3472099304199219, "rewards/margins": -1.5066163539886475, "rewards/rejected": 2.8538262844085693, "step": 3949 }, { "epoch": 0.64, "learning_rate": 9.574968038586406e-07, "logits/chosen": -0.8716657757759094, "logits/rejected": -0.8858553171157837, "logps/chosen": -103.86006164550781, "logps/rejected": -101.52986145019531, "loss": 1.0603, "rewards/accuracies": 0.0, "rewards/chosen": 0.7020325064659119, "rewards/margins": -1.95393967628479, "rewards/rejected": 2.6559722423553467, "step": 3950 }, { "epoch": 0.64, "learning_rate": 9.5744376206933e-07, "logits/chosen": -0.43941572308540344, "logits/rejected": -0.416640043258667, "logps/chosen": -62.70744323730469, "logps/rejected": -113.84468841552734, "loss": 1.1775, "rewards/accuracies": 0.0, "rewards/chosen": 0.6579757928848267, "rewards/margins": -1.6326926946640015, "rewards/rejected": 2.290668487548828, "step": 3951 }, { "epoch": 0.64, "learning_rate": 9.573906886749834e-07, "logits/chosen": -0.3494465947151184, "logits/rejected": -0.4371252954006195, "logps/chosen": -78.32984161376953, "logps/rejected": -127.68541717529297, "loss": 1.109, "rewards/accuracies": 0.0, "rewards/chosen": 2.4041268825531006, "rewards/margins": -2.0890748500823975, "rewards/rejected": 4.493201732635498, "step": 3952 }, { "epoch": 0.64, "learning_rate": 9.573375836792683e-07, "logits/chosen": -0.8116142153739929, "logits/rejected": -0.8572458624839783, "logps/chosen": -90.27195739746094, "logps/rejected": -104.36351013183594, "loss": 1.3952, "rewards/accuracies": 0.0, "rewards/chosen": 0.8909942507743835, "rewards/margins": -2.4594619274139404, "rewards/rejected": 3.3504562377929688, "step": 3953 }, { "epoch": 0.64, "learning_rate": 9.572844470858537e-07, "logits/chosen": -0.3490494191646576, "logits/rejected": -0.39605894684791565, "logps/chosen": -60.28558349609375, "logps/rejected": -62.97597885131836, "loss": 0.8712, "rewards/accuracies": 0.0, "rewards/chosen": 1.0598000288009644, "rewards/margins": -1.1817349195480347, "rewards/rejected": 2.241534948348999, "step": 3954 }, { "epoch": 0.64, "learning_rate": 9.572312788984105e-07, "logits/chosen": -0.8169330358505249, "logits/rejected": -0.9159201979637146, "logps/chosen": -58.487308502197266, "logps/rejected": -156.67507934570312, "loss": 1.3136, "rewards/accuracies": 0.0, "rewards/chosen": 0.7447948455810547, "rewards/margins": -2.5129506587982178, "rewards/rejected": 3.2577455043792725, "step": 3955 }, { "epoch": 0.64, "learning_rate": 9.571780791206121e-07, "logits/chosen": -0.3824627101421356, "logits/rejected": -0.38076624274253845, "logps/chosen": -4.042613983154297, "logps/rejected": -1.6418139934539795, "loss": 0.7854, "rewards/accuracies": 0.0, "rewards/chosen": 0.03966346010565758, "rewards/margins": -0.11259274184703827, "rewards/rejected": 0.15225620567798615, "step": 3956 }, { "epoch": 0.64, "learning_rate": 9.571248477561346e-07, "logits/chosen": -0.32548823952674866, "logits/rejected": -0.3230390250682831, "logps/chosen": -24.258014678955078, "logps/rejected": -23.77600860595703, "loss": 0.5198, "rewards/accuracies": 0.0, "rewards/chosen": 0.11009140312671661, "rewards/margins": -0.3090408444404602, "rewards/rejected": 0.4191322326660156, "step": 3957 }, { "epoch": 0.64, "learning_rate": 9.570715848086555e-07, "logits/chosen": -0.7291668057441711, "logits/rejected": -0.421208918094635, "logps/chosen": -88.14168548583984, "logps/rejected": -69.30868530273438, "loss": 1.76, "rewards/accuracies": 1.0, "rewards/chosen": 2.0445945262908936, "rewards/margins": 1.3076765537261963, "rewards/rejected": 0.7369179129600525, "step": 3958 }, { "epoch": 0.64, "learning_rate": 9.570182902818544e-07, "logits/chosen": -0.8888363838195801, "logits/rejected": -0.9014866352081299, "logps/chosen": -109.03746795654297, "logps/rejected": -120.76241302490234, "loss": 0.6277, "rewards/accuracies": 1.0, "rewards/chosen": 0.3501853942871094, "rewards/margins": 0.12494811415672302, "rewards/rejected": 0.22523728013038635, "step": 3959 }, { "epoch": 0.64, "learning_rate": 9.569649641794141e-07, "logits/chosen": -0.7046456933021545, "logits/rejected": -0.6378418207168579, "logps/chosen": -106.04824829101562, "logps/rejected": -61.7192268371582, "loss": 1.4587, "rewards/accuracies": 0.0, "rewards/chosen": 0.4132141172885895, "rewards/margins": -0.5176212787628174, "rewards/rejected": 0.9308353662490845, "step": 3960 }, { "epoch": 0.64, "learning_rate": 9.569116065050185e-07, "logits/chosen": -1.0011149644851685, "logits/rejected": -0.9714066386222839, "logps/chosen": -148.99624633789062, "logps/rejected": -96.90896606445312, "loss": 0.8814, "rewards/accuracies": 0.0, "rewards/chosen": 3.154414415359497, "rewards/margins": -0.8697159290313721, "rewards/rejected": 4.024130344390869, "step": 3961 }, { "epoch": 0.64, "learning_rate": 9.568582172623543e-07, "logits/chosen": -0.6821130514144897, "logits/rejected": -0.6538734436035156, "logps/chosen": -150.58795166015625, "logps/rejected": -91.52210998535156, "loss": 0.442, "rewards/accuracies": 0.0, "rewards/chosen": 5.507803440093994, "rewards/margins": -0.12226247787475586, "rewards/rejected": 5.63006591796875, "step": 3962 }, { "epoch": 0.64, "learning_rate": 9.5680479645511e-07, "logits/chosen": -0.6352466344833374, "logits/rejected": -0.599195122718811, "logps/chosen": -70.65623474121094, "logps/rejected": -150.2843475341797, "loss": 1.4402, "rewards/accuracies": 0.0, "rewards/chosen": 0.4780540466308594, "rewards/margins": -0.8836692571640015, "rewards/rejected": 1.3617233037948608, "step": 3963 }, { "epoch": 0.64, "learning_rate": 9.567513440869767e-07, "logits/chosen": -0.38480308651924133, "logits/rejected": -0.39271658658981323, "logps/chosen": -6.293004035949707, "logps/rejected": -2.000062942504883, "loss": 1.1857, "rewards/accuracies": 0.0, "rewards/chosen": 0.3503377139568329, "rewards/margins": -0.29436710476875305, "rewards/rejected": 0.6447048187255859, "step": 3964 }, { "epoch": 0.64, "learning_rate": 9.566978601616473e-07, "logits/chosen": -0.44385790824890137, "logits/rejected": -0.44385790824890137, "logps/chosen": -51.936248779296875, "logps/rejected": -51.936248779296875, "loss": 1.2712, "rewards/accuracies": 0.0, "rewards/chosen": 0.49200019240379333, "rewards/margins": 0.0, "rewards/rejected": 0.49200019240379333, "step": 3965 }, { "epoch": 0.64, "learning_rate": 9.56644344682817e-07, "logits/chosen": -0.9447527527809143, "logits/rejected": -0.9447527527809143, "logps/chosen": -45.18810272216797, "logps/rejected": -45.18810272216797, "loss": 0.6537, "rewards/accuracies": 0.0, "rewards/chosen": 0.30762290954589844, "rewards/margins": 0.0, "rewards/rejected": 0.30762290954589844, "step": 3966 }, { "epoch": 0.64, "learning_rate": 9.565907976541833e-07, "logits/chosen": -0.42276668548583984, "logits/rejected": -0.4196866750717163, "logps/chosen": -94.06964111328125, "logps/rejected": -45.13272476196289, "loss": 0.2208, "rewards/accuracies": 1.0, "rewards/chosen": 1.553430199623108, "rewards/margins": 0.9066761136054993, "rewards/rejected": 0.6467540860176086, "step": 3967 }, { "epoch": 0.64, "learning_rate": 9.56537219079446e-07, "logits/chosen": -0.7117382884025574, "logits/rejected": -0.6889176368713379, "logps/chosen": -74.8411636352539, "logps/rejected": -91.73799133300781, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 0.819256603717804, "rewards/margins": 0.10507357120513916, "rewards/rejected": 0.7141830325126648, "step": 3968 }, { "epoch": 0.64, "learning_rate": 9.564836089623064e-07, "logits/chosen": -0.3601991534233093, "logits/rejected": -0.24445168673992157, "logps/chosen": -87.04232788085938, "logps/rejected": -69.68819427490234, "loss": 0.3949, "rewards/accuracies": 1.0, "rewards/chosen": 4.573150634765625, "rewards/margins": 2.746868133544922, "rewards/rejected": 1.8262825012207031, "step": 3969 }, { "epoch": 0.64, "learning_rate": 9.56429967306469e-07, "logits/chosen": -0.5967769026756287, "logits/rejected": -0.4786682426929474, "logps/chosen": -193.59078979492188, "logps/rejected": -156.07534790039062, "loss": 1.3764, "rewards/accuracies": 0.0, "rewards/chosen": 3.1291656494140625, "rewards/margins": -0.09279489517211914, "rewards/rejected": 3.2219605445861816, "step": 3970 }, { "epoch": 0.64, "learning_rate": 9.563762941156395e-07, "logits/chosen": -0.880434513092041, "logits/rejected": -0.9416149854660034, "logps/chosen": -97.85466003417969, "logps/rejected": -118.93955993652344, "loss": 1.462, "rewards/accuracies": 0.0, "rewards/chosen": 0.22790680825710297, "rewards/margins": -2.857687473297119, "rewards/rejected": 3.0855941772460938, "step": 3971 }, { "epoch": 0.64, "learning_rate": 9.563225893935264e-07, "logits/chosen": -1.0392245054244995, "logits/rejected": -1.001883864402771, "logps/chosen": -87.56314849853516, "logps/rejected": -168.3113555908203, "loss": 2.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.9752426147460938, "rewards/margins": -2.0085601806640625, "rewards/rejected": 2.9838027954101562, "step": 3972 }, { "epoch": 0.64, "learning_rate": 9.562688531438398e-07, "logits/chosen": -0.6299446225166321, "logits/rejected": -0.603992223739624, "logps/chosen": -105.25840759277344, "logps/rejected": -69.24849700927734, "loss": 0.2717, "rewards/accuracies": 1.0, "rewards/chosen": 1.5394691228866577, "rewards/margins": 0.42194437980651855, "rewards/rejected": 1.1175247430801392, "step": 3973 }, { "epoch": 0.65, "learning_rate": 9.56215085370293e-07, "logits/chosen": -0.8526538014411926, "logits/rejected": -0.8002105951309204, "logps/chosen": -153.3438720703125, "logps/rejected": -82.88804626464844, "loss": 0.6036, "rewards/accuracies": 1.0, "rewards/chosen": 3.836747884750366, "rewards/margins": 0.7834854125976562, "rewards/rejected": 3.05326247215271, "step": 3974 }, { "epoch": 0.65, "learning_rate": 9.561612860766006e-07, "logits/chosen": -0.678403913974762, "logits/rejected": -0.6445494294166565, "logps/chosen": -51.62321853637695, "logps/rejected": -52.008846282958984, "loss": 0.5339, "rewards/accuracies": 0.0, "rewards/chosen": 0.9307575225830078, "rewards/margins": -0.3454322814941406, "rewards/rejected": 1.2761898040771484, "step": 3975 }, { "epoch": 0.65, "learning_rate": 9.561074552664794e-07, "logits/chosen": -0.5897209644317627, "logits/rejected": -0.6035733819007874, "logps/chosen": -103.03500366210938, "logps/rejected": -111.48747253417969, "loss": 0.9247, "rewards/accuracies": 0.0, "rewards/chosen": 0.9380546808242798, "rewards/margins": -1.368308186531067, "rewards/rejected": 2.3063628673553467, "step": 3976 }, { "epoch": 0.65, "learning_rate": 9.560535929436487e-07, "logits/chosen": -0.6081411838531494, "logits/rejected": -0.6007627248764038, "logps/chosen": -51.06983947753906, "logps/rejected": -69.09397888183594, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 1.1391716003417969, "rewards/margins": 0.2167617678642273, "rewards/rejected": 0.9224098324775696, "step": 3977 }, { "epoch": 0.65, "learning_rate": 9.559996991118303e-07, "logits/chosen": -0.21132637560367584, "logits/rejected": -0.22010578215122223, "logps/chosen": -3.072474956512451, "logps/rejected": -1.4581420421600342, "loss": 0.4504, "rewards/accuracies": 0.0, "rewards/chosen": 0.2821565270423889, "rewards/margins": -0.1448567807674408, "rewards/rejected": 0.4270133078098297, "step": 3978 }, { "epoch": 0.65, "learning_rate": 9.55945773774747e-07, "logits/chosen": -0.7343106269836426, "logits/rejected": -0.7228154540061951, "logps/chosen": -132.65234375, "logps/rejected": -137.55386352539062, "loss": 0.3578, "rewards/accuracies": 1.0, "rewards/chosen": 1.4068390130996704, "rewards/margins": 0.7376754879951477, "rewards/rejected": 0.6691635251045227, "step": 3979 }, { "epoch": 0.65, "learning_rate": 9.558918169361251e-07, "logits/chosen": -0.7420266270637512, "logits/rejected": -0.7710869312286377, "logps/chosen": -161.53298950195312, "logps/rejected": -98.59241485595703, "loss": 0.2499, "rewards/accuracies": 1.0, "rewards/chosen": 3.000836133956909, "rewards/margins": 0.6763129234313965, "rewards/rejected": 2.3245232105255127, "step": 3980 }, { "epoch": 0.65, "learning_rate": 9.558378285996924e-07, "logits/chosen": -0.4039476811885834, "logits/rejected": -0.41059204936027527, "logps/chosen": -247.260986328125, "logps/rejected": -40.3423957824707, "loss": 3.2298, "rewards/accuracies": 1.0, "rewards/chosen": 3.07358717918396, "rewards/margins": 1.6324323415756226, "rewards/rejected": 1.4411548376083374, "step": 3981 }, { "epoch": 0.65, "learning_rate": 9.55783808769179e-07, "logits/chosen": -0.9893394708633423, "logits/rejected": -0.9918815493583679, "logps/chosen": -158.59857177734375, "logps/rejected": -179.6856689453125, "loss": 1.2064, "rewards/accuracies": 0.0, "rewards/chosen": 2.90915846824646, "rewards/margins": -2.2297699451446533, "rewards/rejected": 5.138928413391113, "step": 3982 }, { "epoch": 0.65, "learning_rate": 9.55729757448317e-07, "logits/chosen": -0.3627694547176361, "logits/rejected": -0.36543789505958557, "logps/chosen": -8.527168273925781, "logps/rejected": -3.4644203186035156, "loss": 0.7399, "rewards/accuracies": 0.0, "rewards/chosen": 0.4209124743938446, "rewards/margins": -0.1790667474269867, "rewards/rejected": 0.5999792218208313, "step": 3983 }, { "epoch": 0.65, "learning_rate": 9.556756746408409e-07, "logits/chosen": -0.4467327892780304, "logits/rejected": -0.4447043240070343, "logps/chosen": -89.53868865966797, "logps/rejected": -108.27315521240234, "loss": 1.9928, "rewards/accuracies": 0.0, "rewards/chosen": 1.3633384704589844, "rewards/margins": -3.081437587738037, "rewards/rejected": 4.4447760581970215, "step": 3984 }, { "epoch": 0.65, "learning_rate": 9.556215603504873e-07, "logits/chosen": -0.7639504671096802, "logits/rejected": -0.6797647476196289, "logps/chosen": -70.50658416748047, "logps/rejected": -248.23509216308594, "loss": 1.8602, "rewards/accuracies": 0.0, "rewards/chosen": 2.641680955886841, "rewards/margins": -3.600480794906616, "rewards/rejected": 6.242161750793457, "step": 3985 }, { "epoch": 0.65, "learning_rate": 9.55567414580995e-07, "logits/chosen": -0.15153706073760986, "logits/rejected": -0.15582887828350067, "logps/chosen": -5.76395845413208, "logps/rejected": -3.815715789794922, "loss": 0.6066, "rewards/accuracies": 0.0, "rewards/chosen": 0.012080383487045765, "rewards/margins": -0.1530981957912445, "rewards/rejected": 0.165178582072258, "step": 3986 }, { "epoch": 0.65, "learning_rate": 9.55513237336105e-07, "logits/chosen": -0.49918368458747864, "logits/rejected": -0.43384140729904175, "logps/chosen": -67.19890594482422, "logps/rejected": -41.807254791259766, "loss": 0.5004, "rewards/accuracies": 1.0, "rewards/chosen": 2.2682671546936035, "rewards/margins": 0.835936427116394, "rewards/rejected": 1.4323307275772095, "step": 3987 }, { "epoch": 0.65, "learning_rate": 9.554590286195604e-07, "logits/chosen": -0.49723631143569946, "logits/rejected": -0.5065729022026062, "logps/chosen": -66.99119567871094, "logps/rejected": -112.63475036621094, "loss": 0.7327, "rewards/accuracies": 1.0, "rewards/chosen": 0.342062383890152, "rewards/margins": 0.7228577136993408, "rewards/rejected": -0.38079530000686646, "step": 3988 }, { "epoch": 0.65, "learning_rate": 9.554047884351064e-07, "logits/chosen": -0.8712450861930847, "logits/rejected": -0.7975491285324097, "logps/chosen": -81.63532257080078, "logps/rejected": -22.770418167114258, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": 1.2380844354629517, "rewards/margins": 0.41634124517440796, "rewards/rejected": 0.8217431902885437, "step": 3989 }, { "epoch": 0.65, "learning_rate": 9.55350516786491e-07, "logits/chosen": -0.7960328459739685, "logits/rejected": -0.8031396269798279, "logps/chosen": -66.38723754882812, "logps/rejected": -52.78227615356445, "loss": 0.4686, "rewards/accuracies": 0.0, "rewards/chosen": 0.6388320922851562, "rewards/margins": -0.05661964416503906, "rewards/rejected": 0.6954517364501953, "step": 3990 }, { "epoch": 0.65, "learning_rate": 9.55296213677463e-07, "logits/chosen": -0.47489506006240845, "logits/rejected": -0.3880139887332916, "logps/chosen": -36.95754623413086, "logps/rejected": -93.27595520019531, "loss": 0.1622, "rewards/accuracies": 1.0, "rewards/chosen": 1.1907482147216797, "rewards/margins": 1.0700709819793701, "rewards/rejected": 0.12067718803882599, "step": 3991 }, { "epoch": 0.65, "learning_rate": 9.552418791117746e-07, "logits/chosen": -0.6218522787094116, "logits/rejected": -0.51814866065979, "logps/chosen": -61.527793884277344, "logps/rejected": -87.33114624023438, "loss": 0.5137, "rewards/accuracies": 1.0, "rewards/chosen": 2.0119972229003906, "rewards/margins": 0.06572341918945312, "rewards/rejected": 1.9462738037109375, "step": 3992 }, { "epoch": 0.65, "learning_rate": 9.551875130931803e-07, "logits/chosen": -0.045387160032987595, "logits/rejected": 0.04329637065529823, "logps/chosen": -54.35542678833008, "logps/rejected": -3.314225673675537, "loss": 0.8392, "rewards/accuracies": 0.0, "rewards/chosen": -0.1206279769539833, "rewards/margins": -0.3564980924129486, "rewards/rejected": 0.2358701229095459, "step": 3993 }, { "epoch": 0.65, "learning_rate": 9.551331156254357e-07, "logits/chosen": -0.5187651515007019, "logits/rejected": -0.5187651515007019, "logps/chosen": -129.5386962890625, "logps/rejected": -129.5386962890625, "loss": 0.3719, "rewards/accuracies": 0.0, "rewards/chosen": 2.38944411277771, "rewards/margins": 0.0, "rewards/rejected": 2.38944411277771, "step": 3994 }, { "epoch": 0.65, "learning_rate": 9.550786867122994e-07, "logits/chosen": -0.768727719783783, "logits/rejected": -0.7317959666252136, "logps/chosen": -73.90311431884766, "logps/rejected": -51.32063293457031, "loss": 0.557, "rewards/accuracies": 1.0, "rewards/chosen": 1.3143638372421265, "rewards/margins": 0.6051215529441833, "rewards/rejected": 0.7092422842979431, "step": 3995 }, { "epoch": 0.65, "learning_rate": 9.550242263575317e-07, "logits/chosen": -0.6975825428962708, "logits/rejected": -0.7153056859970093, "logps/chosen": -107.14591217041016, "logps/rejected": -146.8743438720703, "loss": 1.5464, "rewards/accuracies": 0.0, "rewards/chosen": 2.309119462966919, "rewards/margins": -3.039597272872925, "rewards/rejected": 5.348716735839844, "step": 3996 }, { "epoch": 0.65, "learning_rate": 9.549697345648955e-07, "logits/chosen": -0.3561571538448334, "logits/rejected": -0.3704271614551544, "logps/chosen": -1.0579617023468018, "logps/rejected": -35.786476135253906, "loss": 0.7995, "rewards/accuracies": 1.0, "rewards/chosen": 0.28659799695014954, "rewards/margins": 0.08754442632198334, "rewards/rejected": 0.1990535706281662, "step": 3997 }, { "epoch": 0.65, "learning_rate": 9.549152113381556e-07, "logits/chosen": -0.5258463621139526, "logits/rejected": -0.5148816108703613, "logps/chosen": -53.26892852783203, "logps/rejected": -103.42919921875, "loss": 0.1789, "rewards/accuracies": 1.0, "rewards/chosen": 1.9508278369903564, "rewards/margins": 1.1669899225234985, "rewards/rejected": 0.7838379144668579, "step": 3998 }, { "epoch": 0.65, "learning_rate": 9.54860656681079e-07, "logits/chosen": -0.6776643395423889, "logits/rejected": -0.489528626203537, "logps/chosen": -86.47091674804688, "logps/rejected": -90.3193359375, "loss": 2.1872, "rewards/accuracies": 0.0, "rewards/chosen": 1.2453140020370483, "rewards/margins": -1.6293915510177612, "rewards/rejected": 2.8747055530548096, "step": 3999 }, { "epoch": 0.65, "learning_rate": 9.548060705974352e-07, "logits/chosen": -0.6887620091438293, "logits/rejected": -0.6832363605499268, "logps/chosen": -148.98277282714844, "logps/rejected": -71.25875091552734, "loss": 1.4454, "rewards/accuracies": 0.0, "rewards/chosen": 0.6816329956054688, "rewards/margins": -0.3957970142364502, "rewards/rejected": 1.077430009841919, "step": 4000 }, { "epoch": 0.65, "learning_rate": 9.54751453090995e-07, "logits/chosen": -0.3003106415271759, "logits/rejected": -0.2903527319431305, "logps/chosen": -90.85938262939453, "logps/rejected": -52.57541275024414, "loss": 0.8408, "rewards/accuracies": 0.0, "rewards/chosen": 0.49664536118507385, "rewards/margins": -0.26380351185798645, "rewards/rejected": 0.7604488730430603, "step": 4001 }, { "epoch": 0.65, "learning_rate": 9.546968041655325e-07, "logits/chosen": -0.6628832817077637, "logits/rejected": -0.5089678168296814, "logps/chosen": -63.23002624511719, "logps/rejected": -28.935375213623047, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": 1.928248643875122, "rewards/margins": 1.7284802198410034, "rewards/rejected": 0.19976845383644104, "step": 4002 }, { "epoch": 0.65, "learning_rate": 9.546421238248233e-07, "logits/chosen": -0.5077962279319763, "logits/rejected": -0.47066161036491394, "logps/chosen": -52.773521423339844, "logps/rejected": -59.431739807128906, "loss": 0.7364, "rewards/accuracies": 1.0, "rewards/chosen": 1.9682525396347046, "rewards/margins": 0.6607657670974731, "rewards/rejected": 1.3074867725372314, "step": 4003 }, { "epoch": 0.65, "learning_rate": 9.54587412072645e-07, "logits/chosen": -0.38261187076568604, "logits/rejected": -0.36001983284950256, "logps/chosen": -98.23985290527344, "logps/rejected": -53.15833282470703, "loss": 0.4429, "rewards/accuracies": 0.0, "rewards/chosen": 1.1262375116348267, "rewards/margins": -0.3090766668319702, "rewards/rejected": 1.4353141784667969, "step": 4004 }, { "epoch": 0.65, "learning_rate": 9.54532668912778e-07, "logits/chosen": -0.8608939051628113, "logits/rejected": -0.8264440894126892, "logps/chosen": -77.74893188476562, "logps/rejected": -45.661033630371094, "loss": 1.2452, "rewards/accuracies": 0.0, "rewards/chosen": 1.2554848194122314, "rewards/margins": -1.0080101490020752, "rewards/rejected": 2.2634949684143066, "step": 4005 }, { "epoch": 0.65, "learning_rate": 9.544778943490042e-07, "logits/chosen": -0.7908056974411011, "logits/rejected": -0.8226975798606873, "logps/chosen": -192.76156616210938, "logps/rejected": -95.4677734375, "loss": 0.3646, "rewards/accuracies": 0.0, "rewards/chosen": 3.8377411365509033, "rewards/margins": -0.06158447265625, "rewards/rejected": 3.8993256092071533, "step": 4006 }, { "epoch": 0.65, "learning_rate": 9.544230883851084e-07, "logits/chosen": -0.5320281386375427, "logits/rejected": -0.47672200202941895, "logps/chosen": -112.9986572265625, "logps/rejected": -129.91258239746094, "loss": 1.8435, "rewards/accuracies": 0.0, "rewards/chosen": 1.8884857892990112, "rewards/margins": -3.426971435546875, "rewards/rejected": 5.315457344055176, "step": 4007 }, { "epoch": 0.65, "learning_rate": 9.54368251024877e-07, "logits/chosen": -0.48874422907829285, "logits/rejected": -0.4520843029022217, "logps/chosen": -130.42779541015625, "logps/rejected": -96.37451171875, "loss": 0.5006, "rewards/accuracies": 1.0, "rewards/chosen": 4.791128635406494, "rewards/margins": 1.4963548183441162, "rewards/rejected": 3.294773817062378, "step": 4008 }, { "epoch": 0.65, "learning_rate": 9.543133822720985e-07, "logits/chosen": -0.5562872886657715, "logits/rejected": -0.5369732975959778, "logps/chosen": -100.59230041503906, "logps/rejected": -68.52096557617188, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 3.6909453868865967, "rewards/margins": 2.1014907360076904, "rewards/rejected": 1.5894546508789062, "step": 4009 }, { "epoch": 0.65, "learning_rate": 9.542584821305643e-07, "logits/chosen": -0.6797667145729065, "logits/rejected": -0.6709675192832947, "logps/chosen": -52.30921936035156, "logps/rejected": -123.95172882080078, "loss": 0.2339, "rewards/accuracies": 1.0, "rewards/chosen": 0.42357102036476135, "rewards/margins": 1.3174484968185425, "rewards/rejected": -0.8938774466514587, "step": 4010 }, { "epoch": 0.65, "learning_rate": 9.542035506040671e-07, "logits/chosen": -0.5829183459281921, "logits/rejected": -0.5906193256378174, "logps/chosen": -78.46015930175781, "logps/rejected": -98.97212219238281, "loss": 0.4483, "rewards/accuracies": 0.0, "rewards/chosen": 1.9266624450683594, "rewards/margins": -0.21026921272277832, "rewards/rejected": 2.1369316577911377, "step": 4011 }, { "epoch": 0.65, "learning_rate": 9.541485876964022e-07, "logits/chosen": -0.11835605651140213, "logits/rejected": -0.12013117969036102, "logps/chosen": -11.057476043701172, "logps/rejected": -1.9041647911071777, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": 0.27377986907958984, "rewards/margins": 0.008459985256195068, "rewards/rejected": 0.2653198838233948, "step": 4012 }, { "epoch": 0.65, "learning_rate": 9.540935934113671e-07, "logits/chosen": -0.8466732501983643, "logits/rejected": -0.8241083025932312, "logps/chosen": -74.48576354980469, "logps/rejected": -67.58282470703125, "loss": 0.5481, "rewards/accuracies": 0.0, "rewards/chosen": 0.8693267703056335, "rewards/margins": -0.5798432230949402, "rewards/rejected": 1.4491699934005737, "step": 4013 }, { "epoch": 0.65, "learning_rate": 9.540385677527615e-07, "logits/chosen": -0.683490514755249, "logits/rejected": -0.6906818151473999, "logps/chosen": -130.46994018554688, "logps/rejected": -76.19432830810547, "loss": 0.2226, "rewards/accuracies": 1.0, "rewards/chosen": 3.32615065574646, "rewards/margins": 0.8176445960998535, "rewards/rejected": 2.5085060596466064, "step": 4014 }, { "epoch": 0.65, "learning_rate": 9.53983510724387e-07, "logits/chosen": -0.960374653339386, "logits/rejected": -1.033949613571167, "logps/chosen": -218.642822265625, "logps/rejected": -233.75485229492188, "loss": 1.7574, "rewards/accuracies": 0.0, "rewards/chosen": 4.798492431640625, "rewards/margins": -1.2199249267578125, "rewards/rejected": 6.0184173583984375, "step": 4015 }, { "epoch": 0.65, "learning_rate": 9.539284223300475e-07, "logits/chosen": -0.49492955207824707, "logits/rejected": -0.44438639283180237, "logps/chosen": -93.66769409179688, "logps/rejected": -125.65522766113281, "loss": 0.7626, "rewards/accuracies": 0.0, "rewards/chosen": 1.3224884271621704, "rewards/margins": -1.2579468488693237, "rewards/rejected": 2.580435276031494, "step": 4016 }, { "epoch": 0.65, "learning_rate": 9.538733025735493e-07, "logits/chosen": -0.6329213976860046, "logits/rejected": -0.6588714718818665, "logps/chosen": -69.1861801147461, "logps/rejected": -116.46315002441406, "loss": 0.8158, "rewards/accuracies": 1.0, "rewards/chosen": 1.7624809741973877, "rewards/margins": 0.18260276317596436, "rewards/rejected": 1.5798782110214233, "step": 4017 }, { "epoch": 0.65, "learning_rate": 9.538181514587003e-07, "logits/chosen": -0.6058780550956726, "logits/rejected": -0.6734794974327087, "logps/chosen": -52.807090759277344, "logps/rejected": -85.3592758178711, "loss": 1.0407, "rewards/accuracies": 0.0, "rewards/chosen": 0.3780319392681122, "rewards/margins": -0.10265424847602844, "rewards/rejected": 0.4806861877441406, "step": 4018 }, { "epoch": 0.65, "learning_rate": 9.53762968989311e-07, "logits/chosen": -0.177201047539711, "logits/rejected": -0.18503214418888092, "logps/chosen": -2.039400577545166, "logps/rejected": -2.959364891052246, "loss": 0.6476, "rewards/accuracies": 0.0, "rewards/chosen": 0.2453947514295578, "rewards/margins": -0.0173921138048172, "rewards/rejected": 0.262786865234375, "step": 4019 }, { "epoch": 0.65, "learning_rate": 9.537077551691941e-07, "logits/chosen": -0.3124249279499054, "logits/rejected": -0.32996657490730286, "logps/chosen": -8.441046714782715, "logps/rejected": -42.49861145019531, "loss": 0.6013, "rewards/accuracies": 1.0, "rewards/chosen": 0.10228271782398224, "rewards/margins": 0.6032360196113586, "rewards/rejected": -0.5009533166885376, "step": 4020 }, { "epoch": 0.65, "learning_rate": 9.536525100021647e-07, "logits/chosen": -0.5616958141326904, "logits/rejected": -0.5534599423408508, "logps/chosen": -112.97772216796875, "logps/rejected": -115.04861450195312, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": 4.916348457336426, "rewards/margins": 2.6937623023986816, "rewards/rejected": 2.222586154937744, "step": 4021 }, { "epoch": 0.65, "learning_rate": 9.53597233492039e-07, "logits/chosen": -0.8839482069015503, "logits/rejected": -0.877628743648529, "logps/chosen": -112.8204345703125, "logps/rejected": -223.5719451904297, "loss": 1.9458, "rewards/accuracies": 0.0, "rewards/chosen": 1.3641022443771362, "rewards/margins": -1.7168701887130737, "rewards/rejected": 3.08097243309021, "step": 4022 }, { "epoch": 0.65, "learning_rate": 9.535419256426365e-07, "logits/chosen": -0.57523512840271, "logits/rejected": -0.6202818155288696, "logps/chosen": -190.10757446289062, "logps/rejected": -93.58767700195312, "loss": 1.3894, "rewards/accuracies": 1.0, "rewards/chosen": 3.4478394985198975, "rewards/margins": 2.2683045864105225, "rewards/rejected": 1.179534912109375, "step": 4023 }, { "epoch": 0.65, "learning_rate": 9.534865864577783e-07, "logits/chosen": -0.6449266076087952, "logits/rejected": -0.6558610200881958, "logps/chosen": -89.89227294921875, "logps/rejected": -81.85023498535156, "loss": 0.5658, "rewards/accuracies": 0.0, "rewards/chosen": 1.4484809637069702, "rewards/margins": -0.44871222972869873, "rewards/rejected": 1.897193193435669, "step": 4024 }, { "epoch": 0.65, "learning_rate": 9.534312159412881e-07, "logits/chosen": -0.4456055164337158, "logits/rejected": -0.4456055164337158, "logps/chosen": -85.8578872680664, "logps/rejected": -85.8578872680664, "loss": 0.3702, "rewards/accuracies": 0.0, "rewards/chosen": 1.7522499561309814, "rewards/margins": 0.0, "rewards/rejected": 1.7522499561309814, "step": 4025 }, { "epoch": 0.65, "learning_rate": 9.533758140969912e-07, "logits/chosen": -0.6458935737609863, "logits/rejected": -0.8237974047660828, "logps/chosen": -86.03892517089844, "logps/rejected": -71.00029754638672, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.001251220703125, "rewards/margins": 0.33519822359085083, "rewards/rejected": 0.6660529971122742, "step": 4026 }, { "epoch": 0.65, "learning_rate": 9.533203809287156e-07, "logits/chosen": -0.5848807096481323, "logits/rejected": -0.5147203207015991, "logps/chosen": -102.66935729980469, "logps/rejected": -41.08378219604492, "loss": 0.0825, "rewards/accuracies": 1.0, "rewards/chosen": 2.635319471359253, "rewards/margins": 2.0103678703308105, "rewards/rejected": 0.6249515414237976, "step": 4027 }, { "epoch": 0.65, "learning_rate": 9.532649164402908e-07, "logits/chosen": -0.5455293655395508, "logits/rejected": -0.5560210943222046, "logps/chosen": -62.450462341308594, "logps/rejected": -61.221458435058594, "loss": 0.8918, "rewards/accuracies": 1.0, "rewards/chosen": 0.7512565851211548, "rewards/margins": 0.5306224822998047, "rewards/rejected": 0.2206340879201889, "step": 4028 }, { "epoch": 0.65, "learning_rate": 9.532094206355492e-07, "logits/chosen": -0.4369996190071106, "logits/rejected": -0.45901432633399963, "logps/chosen": -115.93315124511719, "logps/rejected": -112.15118408203125, "loss": 1.2871, "rewards/accuracies": 0.0, "rewards/chosen": 2.1439788341522217, "rewards/margins": -2.344433546066284, "rewards/rejected": 4.488412380218506, "step": 4029 }, { "epoch": 0.65, "learning_rate": 9.531538935183249e-07, "logits/chosen": -0.5530422329902649, "logits/rejected": -0.5275819897651672, "logps/chosen": -116.52069091796875, "logps/rejected": -60.31775665283203, "loss": 0.267, "rewards/accuracies": 1.0, "rewards/chosen": 2.788900852203369, "rewards/margins": 0.39334797859191895, "rewards/rejected": 2.39555287361145, "step": 4030 }, { "epoch": 0.65, "learning_rate": 9.530983350924544e-07, "logits/chosen": -0.6498866081237793, "logits/rejected": -0.6394702196121216, "logps/chosen": -65.57373046875, "logps/rejected": -50.747276306152344, "loss": 1.0464, "rewards/accuracies": 0.0, "rewards/chosen": 1.2277328968048096, "rewards/margins": -0.2951880693435669, "rewards/rejected": 1.5229209661483765, "step": 4031 }, { "epoch": 0.65, "learning_rate": 9.530427453617762e-07, "logits/chosen": -0.8655884861946106, "logits/rejected": -0.8736299872398376, "logps/chosen": -236.94776916503906, "logps/rejected": -93.70452880859375, "loss": 0.0612, "rewards/accuracies": 1.0, "rewards/chosen": 4.340257167816162, "rewards/margins": 2.3217527866363525, "rewards/rejected": 2.0185043811798096, "step": 4032 }, { "epoch": 0.65, "learning_rate": 9.529871243301311e-07, "logits/chosen": -0.8895387649536133, "logits/rejected": -0.8971101641654968, "logps/chosen": -56.81415557861328, "logps/rejected": -75.05955505371094, "loss": 0.5055, "rewards/accuracies": 1.0, "rewards/chosen": 1.3825668096542358, "rewards/margins": 0.02543783187866211, "rewards/rejected": 1.3571289777755737, "step": 4033 }, { "epoch": 0.65, "learning_rate": 9.529314720013618e-07, "logits/chosen": -0.6358028054237366, "logits/rejected": -0.6348137259483337, "logps/chosen": -36.2210693359375, "logps/rejected": -17.67643928527832, "loss": 0.8267, "rewards/accuracies": 1.0, "rewards/chosen": 0.27611809968948364, "rewards/margins": 0.014422029256820679, "rewards/rejected": 0.26169607043266296, "step": 4034 }, { "epoch": 0.65, "learning_rate": 9.528757883793134e-07, "logits/chosen": -0.8322089910507202, "logits/rejected": -0.8264533281326294, "logps/chosen": -214.7548828125, "logps/rejected": -93.49466705322266, "loss": 0.9529, "rewards/accuracies": 0.0, "rewards/chosen": 3.6807923316955566, "rewards/margins": -0.20654821395874023, "rewards/rejected": 3.887340545654297, "step": 4035 }, { "epoch": 0.66, "learning_rate": 9.528200734678332e-07, "logits/chosen": -0.41581374406814575, "logits/rejected": -0.45169052481651306, "logps/chosen": -62.934181213378906, "logps/rejected": -76.72348022460938, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 1.4984734058380127, "rewards/margins": 0.13299179077148438, "rewards/rejected": 1.3654816150665283, "step": 4036 }, { "epoch": 0.66, "learning_rate": 9.527643272707706e-07, "logits/chosen": -0.7466206550598145, "logits/rejected": -0.7317546606063843, "logps/chosen": -99.22346496582031, "logps/rejected": -72.84870910644531, "loss": 0.8449, "rewards/accuracies": 0.0, "rewards/chosen": 3.0090959072113037, "rewards/margins": -1.009667158126831, "rewards/rejected": 4.018763065338135, "step": 4037 }, { "epoch": 0.66, "learning_rate": 9.527085497919772e-07, "logits/chosen": -0.13881482183933258, "logits/rejected": -0.07081866264343262, "logps/chosen": -83.09423828125, "logps/rejected": -89.52297973632812, "loss": 2.4189, "rewards/accuracies": 1.0, "rewards/chosen": 1.9981491565704346, "rewards/margins": 0.053250908851623535, "rewards/rejected": 1.944898247718811, "step": 4038 }, { "epoch": 0.66, "learning_rate": 9.526527410353063e-07, "logits/chosen": -0.6198537349700928, "logits/rejected": -0.540705680847168, "logps/chosen": -47.55259323120117, "logps/rejected": -18.408588409423828, "loss": 0.8669, "rewards/accuracies": 1.0, "rewards/chosen": 0.9838886260986328, "rewards/margins": 0.6512660980224609, "rewards/rejected": 0.3326225280761719, "step": 4039 }, { "epoch": 0.66, "learning_rate": 9.525969010046142e-07, "logits/chosen": -0.8203610777854919, "logits/rejected": -0.7680962085723877, "logps/chosen": -49.59264373779297, "logps/rejected": -97.19074249267578, "loss": 1.9845, "rewards/accuracies": 1.0, "rewards/chosen": 1.9189239740371704, "rewards/margins": 0.4277534484863281, "rewards/rejected": 1.4911705255508423, "step": 4040 }, { "epoch": 0.66, "learning_rate": 9.525410297037589e-07, "logits/chosen": -0.13505667448043823, "logits/rejected": -0.18291442096233368, "logps/chosen": -20.84109878540039, "logps/rejected": -29.353288650512695, "loss": 0.466, "rewards/accuracies": 0.0, "rewards/chosen": 0.31026098132133484, "rewards/margins": -0.02655714750289917, "rewards/rejected": 0.336818128824234, "step": 4041 }, { "epoch": 0.66, "learning_rate": 9.524851271366001e-07, "logits/chosen": -0.578524649143219, "logits/rejected": -0.5008665323257446, "logps/chosen": -49.81718063354492, "logps/rejected": -52.10013961791992, "loss": 0.9626, "rewards/accuracies": 0.0, "rewards/chosen": 1.2541477680206299, "rewards/margins": -0.419281005859375, "rewards/rejected": 1.6734287738800049, "step": 4042 }, { "epoch": 0.66, "learning_rate": 9.524291933070007e-07, "logits/chosen": -0.471147745847702, "logits/rejected": -0.46489009261131287, "logps/chosen": -381.63531494140625, "logps/rejected": -107.84786987304688, "loss": 0.097, "rewards/accuracies": 1.0, "rewards/chosen": 4.21697998046875, "rewards/margins": 2.0115020275115967, "rewards/rejected": 2.2054779529571533, "step": 4043 }, { "epoch": 0.66, "learning_rate": 9.523732282188249e-07, "logits/chosen": -0.7951818108558655, "logits/rejected": -0.6870245933532715, "logps/chosen": -73.0494613647461, "logps/rejected": -13.971765518188477, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 1.211962103843689, "rewards/margins": 0.3803046941757202, "rewards/rejected": 0.8316574096679688, "step": 4044 }, { "epoch": 0.66, "learning_rate": 9.523172318759394e-07, "logits/chosen": -0.4428720474243164, "logits/rejected": -0.4662904143333435, "logps/chosen": -55.89141082763672, "logps/rejected": -57.734596252441406, "loss": 0.765, "rewards/accuracies": 0.0, "rewards/chosen": 0.11549492180347443, "rewards/margins": -0.18804512917995453, "rewards/rejected": 0.30354005098342896, "step": 4045 }, { "epoch": 0.66, "learning_rate": 9.522612042822131e-07, "logits/chosen": -0.6034885048866272, "logits/rejected": -0.25780969858169556, "logps/chosen": -153.40452575683594, "logps/rejected": -104.91314697265625, "loss": 0.5907, "rewards/accuracies": 1.0, "rewards/chosen": 3.576350450515747, "rewards/margins": 0.01732945442199707, "rewards/rejected": 3.55902099609375, "step": 4046 }, { "epoch": 0.66, "learning_rate": 9.52205145441517e-07, "logits/chosen": -0.25420114398002625, "logits/rejected": -0.24511316418647766, "logps/chosen": -38.379398345947266, "logps/rejected": -41.26048278808594, "loss": 0.4379, "rewards/accuracies": 0.0, "rewards/chosen": 1.7193794250488281, "rewards/margins": -0.24294281005859375, "rewards/rejected": 1.9623222351074219, "step": 4047 }, { "epoch": 0.66, "learning_rate": 9.521490553577241e-07, "logits/chosen": -0.28036555647850037, "logits/rejected": -0.23821963369846344, "logps/chosen": -86.41218566894531, "logps/rejected": -106.24868774414062, "loss": 0.9473, "rewards/accuracies": 0.0, "rewards/chosen": 2.01432204246521, "rewards/margins": -1.6738309860229492, "rewards/rejected": 3.688153028488159, "step": 4048 }, { "epoch": 0.66, "learning_rate": 9.520929340347096e-07, "logits/chosen": -0.3385130763053894, "logits/rejected": -0.29232871532440186, "logps/chosen": -95.48456573486328, "logps/rejected": -65.73130798339844, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 4.176912784576416, "rewards/margins": 2.1968202590942383, "rewards/rejected": 1.9800926446914673, "step": 4049 }, { "epoch": 0.66, "learning_rate": 9.520367814763513e-07, "logits/chosen": -0.8208695650100708, "logits/rejected": -0.6338099837303162, "logps/chosen": -134.68603515625, "logps/rejected": -76.31682586669922, "loss": 1.0392, "rewards/accuracies": 0.0, "rewards/chosen": 1.4808685779571533, "rewards/margins": -0.18015515804290771, "rewards/rejected": 1.661023736000061, "step": 4050 }, { "epoch": 0.66, "learning_rate": 9.519805976865284e-07, "logits/chosen": -0.7418591380119324, "logits/rejected": -0.7115374207496643, "logps/chosen": -57.25544738769531, "logps/rejected": -142.10568237304688, "loss": 1.0108, "rewards/accuracies": 1.0, "rewards/chosen": 1.3884941339492798, "rewards/margins": 0.5767814517021179, "rewards/rejected": 0.8117126822471619, "step": 4051 }, { "epoch": 0.66, "learning_rate": 9.51924382669123e-07, "logits/chosen": -0.4646783769130707, "logits/rejected": -0.4666637182235718, "logps/chosen": -65.49044036865234, "logps/rejected": -88.00169372558594, "loss": 0.5965, "rewards/accuracies": 1.0, "rewards/chosen": 1.0323371887207031, "rewards/margins": 0.4329795837402344, "rewards/rejected": 0.5993576049804688, "step": 4052 }, { "epoch": 0.66, "learning_rate": 9.518681364280189e-07, "logits/chosen": -0.26739150285720825, "logits/rejected": -0.26739150285720825, "logps/chosen": -89.05416107177734, "logps/rejected": -89.05416107177734, "loss": 0.4416, "rewards/accuracies": 0.0, "rewards/chosen": 1.5612815618515015, "rewards/margins": 0.0, "rewards/rejected": 1.5612815618515015, "step": 4053 }, { "epoch": 0.66, "learning_rate": 9.518118589671023e-07, "logits/chosen": -0.49415233731269836, "logits/rejected": -0.3772934377193451, "logps/chosen": -62.085487365722656, "logps/rejected": -23.78194808959961, "loss": 0.5997, "rewards/accuracies": 1.0, "rewards/chosen": 1.744707465171814, "rewards/margins": 1.3953973054885864, "rewards/rejected": 0.34931012988090515, "step": 4054 }, { "epoch": 0.66, "learning_rate": 9.517555502902611e-07, "logits/chosen": -0.264670729637146, "logits/rejected": -0.3091261088848114, "logps/chosen": -70.60269165039062, "logps/rejected": -51.1461181640625, "loss": 0.6774, "rewards/accuracies": 0.0, "rewards/chosen": 1.3060104846954346, "rewards/margins": -0.8841843605041504, "rewards/rejected": 2.190194845199585, "step": 4055 }, { "epoch": 0.66, "learning_rate": 9.51699210401386e-07, "logits/chosen": -0.48718753457069397, "logits/rejected": -0.4635288119316101, "logps/chosen": -59.45623779296875, "logps/rejected": -54.66197967529297, "loss": 0.7704, "rewards/accuracies": 0.0, "rewards/chosen": 0.4375164210796356, "rewards/margins": -0.09917029738426208, "rewards/rejected": 0.5366867184638977, "step": 4056 }, { "epoch": 0.66, "learning_rate": 9.516428393043696e-07, "logits/chosen": -0.3550927937030792, "logits/rejected": -0.2796694338321686, "logps/chosen": -153.30239868164062, "logps/rejected": -131.3837890625, "loss": 0.6196, "rewards/accuracies": 0.0, "rewards/chosen": 3.8336808681488037, "rewards/margins": -0.8312790393829346, "rewards/rejected": 4.664959907531738, "step": 4057 }, { "epoch": 0.66, "learning_rate": 9.515864370031063e-07, "logits/chosen": -0.5669592618942261, "logits/rejected": -0.5908092856407166, "logps/chosen": -121.52147674560547, "logps/rejected": -73.20573425292969, "loss": 0.9961, "rewards/accuracies": 0.0, "rewards/chosen": 1.2770546674728394, "rewards/margins": -1.391919732093811, "rewards/rejected": 2.6689743995666504, "step": 4058 }, { "epoch": 0.66, "learning_rate": 9.515300035014933e-07, "logits/chosen": -0.767228364944458, "logits/rejected": -0.767228364944458, "logps/chosen": -56.38434982299805, "logps/rejected": -56.38434982299805, "loss": 0.3844, "rewards/accuracies": 0.0, "rewards/chosen": 2.709413528442383, "rewards/margins": 0.0, "rewards/rejected": 2.709413528442383, "step": 4059 }, { "epoch": 0.66, "learning_rate": 9.514735388034294e-07, "logits/chosen": -0.6673396825790405, "logits/rejected": -0.592591404914856, "logps/chosen": -93.33892059326172, "logps/rejected": -90.53002166748047, "loss": 1.2552, "rewards/accuracies": 0.0, "rewards/chosen": 1.9576271772384644, "rewards/margins": -1.4151214361190796, "rewards/rejected": 3.372748613357544, "step": 4060 }, { "epoch": 0.66, "learning_rate": 9.514170429128159e-07, "logits/chosen": -0.5334551930427551, "logits/rejected": -0.5388184189796448, "logps/chosen": -14.130906105041504, "logps/rejected": -14.350010871887207, "loss": 0.7219, "rewards/accuracies": 0.0, "rewards/chosen": 1.0766692161560059, "rewards/margins": -0.11185109615325928, "rewards/rejected": 1.1885203123092651, "step": 4061 }, { "epoch": 0.66, "learning_rate": 9.513605158335562e-07, "logits/chosen": -0.718533992767334, "logits/rejected": -0.7703663110733032, "logps/chosen": -83.5768051147461, "logps/rejected": -103.98785400390625, "loss": 1.5383, "rewards/accuracies": 0.0, "rewards/chosen": 1.513741374015808, "rewards/margins": -2.898268699645996, "rewards/rejected": 4.412010192871094, "step": 4062 }, { "epoch": 0.66, "learning_rate": 9.513039575695554e-07, "logits/chosen": -0.5839517116546631, "logits/rejected": -0.5839517116546631, "logps/chosen": -1.785269021987915, "logps/rejected": -1.785269021987915, "loss": 0.5729, "rewards/accuracies": 0.0, "rewards/chosen": 0.16787464916706085, "rewards/margins": 0.0, "rewards/rejected": 0.16787464916706085, "step": 4063 }, { "epoch": 0.66, "learning_rate": 9.512473681247217e-07, "logits/chosen": -0.5247439742088318, "logits/rejected": -0.5088874101638794, "logps/chosen": -20.963577270507812, "logps/rejected": -20.920164108276367, "loss": 1.6223, "rewards/accuracies": 1.0, "rewards/chosen": 1.0285618305206299, "rewards/margins": 0.1531481146812439, "rewards/rejected": 0.875413715839386, "step": 4064 }, { "epoch": 0.66, "learning_rate": 9.511907475029643e-07, "logits/chosen": -0.5389898419380188, "logits/rejected": -0.5834090113639832, "logps/chosen": -158.83441162109375, "logps/rejected": -99.455078125, "loss": 0.2389, "rewards/accuracies": 1.0, "rewards/chosen": 4.221438884735107, "rewards/margins": 0.53305983543396, "rewards/rejected": 3.6883790493011475, "step": 4065 }, { "epoch": 0.66, "learning_rate": 9.511340957081957e-07, "logits/chosen": -0.7478919625282288, "logits/rejected": -0.7131432890892029, "logps/chosen": -98.07557678222656, "logps/rejected": -88.40510559082031, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 3.505793809890747, "rewards/margins": 0.537015438079834, "rewards/rejected": 2.968778371810913, "step": 4066 }, { "epoch": 0.66, "learning_rate": 9.510774127443298e-07, "logits/chosen": -0.27090537548065186, "logits/rejected": -0.22314535081386566, "logps/chosen": -59.06755065917969, "logps/rejected": -58.25917434692383, "loss": 1.4591, "rewards/accuracies": 1.0, "rewards/chosen": 1.3626785278320312, "rewards/margins": 1.1690826416015625, "rewards/rejected": 0.19359588623046875, "step": 4067 }, { "epoch": 0.66, "learning_rate": 9.510206986152826e-07, "logits/chosen": -0.56797194480896, "logits/rejected": -0.5319659113883972, "logps/chosen": -99.82630920410156, "logps/rejected": -118.764404296875, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 2.975294589996338, "rewards/margins": 0.7354416847229004, "rewards/rejected": 2.2398529052734375, "step": 4068 }, { "epoch": 0.66, "learning_rate": 9.509639533249727e-07, "logits/chosen": -0.45815590023994446, "logits/rejected": -0.4677366614341736, "logps/chosen": -123.3014144897461, "logps/rejected": -82.68692016601562, "loss": 0.7893, "rewards/accuracies": 0.0, "rewards/chosen": 2.0665199756622314, "rewards/margins": -0.3560798168182373, "rewards/rejected": 2.4225997924804688, "step": 4069 }, { "epoch": 0.66, "learning_rate": 9.509071768773208e-07, "logits/chosen": -0.5512593984603882, "logits/rejected": -0.5948979258537292, "logps/chosen": -74.72960662841797, "logps/rejected": -105.36832427978516, "loss": 1.0427, "rewards/accuracies": 0.0, "rewards/chosen": 2.2219719886779785, "rewards/margins": -1.7147018909454346, "rewards/rejected": 3.936673879623413, "step": 4070 }, { "epoch": 0.66, "learning_rate": 9.508503692762496e-07, "logits/chosen": -0.2490493506193161, "logits/rejected": -0.23845107853412628, "logps/chosen": -3.9913041591644287, "logps/rejected": -9.868520736694336, "loss": 0.9151, "rewards/accuracies": 1.0, "rewards/chosen": 0.3053095042705536, "rewards/margins": 0.49123117327690125, "rewards/rejected": -0.18592166900634766, "step": 4071 }, { "epoch": 0.66, "learning_rate": 9.507935305256837e-07, "logits/chosen": -0.6536049842834473, "logits/rejected": -0.054634448140859604, "logps/chosen": -76.11184692382812, "logps/rejected": -113.32876586914062, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": 3.0492584705352783, "rewards/margins": 0.47952890396118164, "rewards/rejected": 2.5697295665740967, "step": 4072 }, { "epoch": 0.66, "learning_rate": 9.507366606295504e-07, "logits/chosen": -0.6414368748664856, "logits/rejected": -0.6189642548561096, "logps/chosen": -62.255271911621094, "logps/rejected": -86.27886962890625, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.9570960998535156, "rewards/margins": -0.21994400024414062, "rewards/rejected": 1.1770401000976562, "step": 4073 }, { "epoch": 0.66, "learning_rate": 9.506797595917786e-07, "logits/chosen": -0.22163595259189606, "logits/rejected": -0.20536112785339355, "logps/chosen": -30.831157684326172, "logps/rejected": -1.5098581314086914, "loss": 0.8061, "rewards/accuracies": 0.0, "rewards/chosen": 0.17342758178710938, "rewards/margins": -0.37623876333236694, "rewards/rejected": 0.5496663451194763, "step": 4074 }, { "epoch": 0.66, "learning_rate": 9.506228274162999e-07, "logits/chosen": -0.5072762370109558, "logits/rejected": -0.48558786511421204, "logps/chosen": -80.23040771484375, "logps/rejected": -90.07353973388672, "loss": 0.3941, "rewards/accuracies": 1.0, "rewards/chosen": 1.294196367263794, "rewards/margins": 0.7351608872413635, "rewards/rejected": 0.5590354800224304, "step": 4075 }, { "epoch": 0.66, "learning_rate": 9.505658641070476e-07, "logits/chosen": -0.759108304977417, "logits/rejected": -0.8848235607147217, "logps/chosen": -149.84805297851562, "logps/rejected": -173.97784423828125, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 7.268972873687744, "rewards/margins": 2.6132354736328125, "rewards/rejected": 4.655737400054932, "step": 4076 }, { "epoch": 0.66, "learning_rate": 9.505088696679575e-07, "logits/chosen": -0.8064607977867126, "logits/rejected": -0.7632606625556946, "logps/chosen": -112.66771697998047, "logps/rejected": -46.63880920410156, "loss": 0.5815, "rewards/accuracies": 0.0, "rewards/chosen": 0.7280616760253906, "rewards/margins": -0.7710456848144531, "rewards/rejected": 1.4991073608398438, "step": 4077 }, { "epoch": 0.66, "learning_rate": 9.504518441029672e-07, "logits/chosen": -0.5234326720237732, "logits/rejected": -0.45349064469337463, "logps/chosen": -63.168678283691406, "logps/rejected": -82.52471160888672, "loss": 1.1076, "rewards/accuracies": 0.0, "rewards/chosen": 2.770989179611206, "rewards/margins": -0.02855682373046875, "rewards/rejected": 2.799546003341675, "step": 4078 }, { "epoch": 0.66, "learning_rate": 9.503947874160166e-07, "logits/chosen": -0.20784811675548553, "logits/rejected": -0.24175846576690674, "logps/chosen": -97.38430786132812, "logps/rejected": -62.154579162597656, "loss": 1.1749, "rewards/accuracies": 0.0, "rewards/chosen": 0.3042411804199219, "rewards/margins": -1.2922958135604858, "rewards/rejected": 1.5965369939804077, "step": 4079 }, { "epoch": 0.66, "learning_rate": 9.50337699611048e-07, "logits/chosen": -0.7521182894706726, "logits/rejected": -0.6590770483016968, "logps/chosen": -81.51978302001953, "logps/rejected": -60.592864990234375, "loss": 1.5134, "rewards/accuracies": 0.0, "rewards/chosen": 1.13774573802948, "rewards/margins": -0.40870201587677, "rewards/rejected": 1.54644775390625, "step": 4080 }, { "epoch": 0.66, "learning_rate": 9.502805806920055e-07, "logits/chosen": -0.5101031064987183, "logits/rejected": -0.27725905179977417, "logps/chosen": -298.75079345703125, "logps/rejected": -19.868610382080078, "loss": 0.2721, "rewards/accuracies": 1.0, "rewards/chosen": 2.320648193359375, "rewards/margins": 1.932795763015747, "rewards/rejected": 0.3878524899482727, "step": 4081 }, { "epoch": 0.66, "learning_rate": 9.502234306628354e-07, "logits/chosen": -0.8442601561546326, "logits/rejected": -0.8618500232696533, "logps/chosen": -102.30441284179688, "logps/rejected": -70.81198120117188, "loss": 0.7877, "rewards/accuracies": 0.0, "rewards/chosen": 3.473092794418335, "rewards/margins": -0.40706324577331543, "rewards/rejected": 3.8801560401916504, "step": 4082 }, { "epoch": 0.66, "learning_rate": 9.501662495274863e-07, "logits/chosen": -0.5780047178268433, "logits/rejected": -0.5299502611160278, "logps/chosen": -92.69499206542969, "logps/rejected": -186.19723510742188, "loss": 0.1848, "rewards/accuracies": 1.0, "rewards/chosen": 2.0885040760040283, "rewards/margins": 0.9073241949081421, "rewards/rejected": 1.1811798810958862, "step": 4083 }, { "epoch": 0.66, "learning_rate": 9.501090372899088e-07, "logits/chosen": -0.4322538375854492, "logits/rejected": -0.36731746792793274, "logps/chosen": -64.46361541748047, "logps/rejected": -92.1827392578125, "loss": 0.8672, "rewards/accuracies": 0.0, "rewards/chosen": 1.464422583580017, "rewards/margins": -1.475732445716858, "rewards/rejected": 2.940155029296875, "step": 4084 }, { "epoch": 0.66, "learning_rate": 9.500517939540558e-07, "logits/chosen": -0.7697654366493225, "logits/rejected": -0.7316488027572632, "logps/chosen": -122.55775451660156, "logps/rejected": -123.56822204589844, "loss": 1.8599, "rewards/accuracies": 0.0, "rewards/chosen": 3.4061295986175537, "rewards/margins": -0.7319090366363525, "rewards/rejected": 4.138038635253906, "step": 4085 }, { "epoch": 0.66, "learning_rate": 9.499945195238824e-07, "logits/chosen": -0.5461064577102661, "logits/rejected": -0.5799337029457092, "logps/chosen": -70.56245422363281, "logps/rejected": -91.36471557617188, "loss": 0.4852, "rewards/accuracies": 0.0, "rewards/chosen": 1.338324785232544, "rewards/margins": -0.27276909351348877, "rewards/rejected": 1.6110938787460327, "step": 4086 }, { "epoch": 0.66, "learning_rate": 9.499372140033455e-07, "logits/chosen": -0.4502003788948059, "logits/rejected": -0.4502003788948059, "logps/chosen": -36.81169891357422, "logps/rejected": -36.81169891357422, "loss": 0.7832, "rewards/accuracies": 0.0, "rewards/chosen": 1.483221411705017, "rewards/margins": 0.0, "rewards/rejected": 1.483221411705017, "step": 4087 }, { "epoch": 0.66, "learning_rate": 9.498798773964045e-07, "logits/chosen": -1.2020987272262573, "logits/rejected": -1.2089040279388428, "logps/chosen": -75.88877868652344, "logps/rejected": -47.328102111816406, "loss": 0.2019, "rewards/accuracies": 1.0, "rewards/chosen": 1.0782829523086548, "rewards/margins": 0.8484375476837158, "rewards/rejected": 0.22984543442726135, "step": 4088 }, { "epoch": 0.66, "learning_rate": 9.498225097070207e-07, "logits/chosen": -0.4872773289680481, "logits/rejected": -0.44652625918388367, "logps/chosen": -25.97222900390625, "logps/rejected": -54.00276565551758, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 1.7686840295791626, "rewards/margins": 0.40015947818756104, "rewards/rejected": 1.3685245513916016, "step": 4089 }, { "epoch": 0.66, "learning_rate": 9.497651109391578e-07, "logits/chosen": -0.7626256346702576, "logits/rejected": -0.7467461824417114, "logps/chosen": -45.342342376708984, "logps/rejected": -60.18751525878906, "loss": 1.3336, "rewards/accuracies": 0.0, "rewards/chosen": 1.9711993932724, "rewards/margins": -0.3160351514816284, "rewards/rejected": 2.2872345447540283, "step": 4090 }, { "epoch": 0.66, "learning_rate": 9.497076810967815e-07, "logits/chosen": -1.1100444793701172, "logits/rejected": -1.023332118988037, "logps/chosen": -102.16532135009766, "logps/rejected": -219.24407958984375, "loss": 0.797, "rewards/accuracies": 0.0, "rewards/chosen": 1.4806610345840454, "rewards/margins": -1.246627926826477, "rewards/rejected": 2.7272889614105225, "step": 4091 }, { "epoch": 0.66, "learning_rate": 9.496502201838597e-07, "logits/chosen": -0.6197192668914795, "logits/rejected": -0.6120585799217224, "logps/chosen": -100.36913299560547, "logps/rejected": -74.15913391113281, "loss": 1.3382, "rewards/accuracies": 0.0, "rewards/chosen": 1.7574516534805298, "rewards/margins": -1.226192593574524, "rewards/rejected": 2.9836442470550537, "step": 4092 }, { "epoch": 0.66, "learning_rate": 9.495927282043621e-07, "logits/chosen": -0.6277218461036682, "logits/rejected": -0.5963220596313477, "logps/chosen": -198.86370849609375, "logps/rejected": -165.27365112304688, "loss": 0.0844, "rewards/accuracies": 1.0, "rewards/chosen": 4.829104900360107, "rewards/margins": 2.3010103702545166, "rewards/rejected": 2.528094530105591, "step": 4093 }, { "epoch": 0.66, "learning_rate": 9.495352051622611e-07, "logits/chosen": -0.7568410634994507, "logits/rejected": -0.6838467121124268, "logps/chosen": -157.23426818847656, "logps/rejected": -136.25030517578125, "loss": 1.5582, "rewards/accuracies": 0.0, "rewards/chosen": 3.879438877105713, "rewards/margins": -1.142329216003418, "rewards/rejected": 5.021768093109131, "step": 4094 }, { "epoch": 0.66, "learning_rate": 9.49477651061531e-07, "logits/chosen": -1.1105259656906128, "logits/rejected": -1.1433970928192139, "logps/chosen": -84.28422546386719, "logps/rejected": -60.205177307128906, "loss": 0.5031, "rewards/accuracies": 1.0, "rewards/chosen": 4.329654693603516, "rewards/margins": 2.793919324874878, "rewards/rejected": 1.5357353687286377, "step": 4095 }, { "epoch": 0.66, "learning_rate": 9.494200659061482e-07, "logits/chosen": -0.5213602185249329, "logits/rejected": -0.5242191553115845, "logps/chosen": -55.67767333984375, "logps/rejected": -140.33251953125, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8673920035362244, "rewards/margins": 0.6517670154571533, "rewards/rejected": 0.21562500298023224, "step": 4096 }, { "epoch": 0.66, "learning_rate": 9.493624497000914e-07, "logits/chosen": -0.23481178283691406, "logits/rejected": -0.272646963596344, "logps/chosen": -57.52608871459961, "logps/rejected": -121.33875274658203, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 1.5070606470108032, "rewards/margins": 1.964121699333191, "rewards/rejected": -0.4570610225200653, "step": 4097 }, { "epoch": 0.67, "learning_rate": 9.493048024473411e-07, "logits/chosen": -0.8396955132484436, "logits/rejected": -0.8593517541885376, "logps/chosen": -103.96192932128906, "logps/rejected": -92.33851623535156, "loss": 0.7959, "rewards/accuracies": 0.0, "rewards/chosen": 0.6725494265556335, "rewards/margins": -1.0394272804260254, "rewards/rejected": 1.7119766473770142, "step": 4098 }, { "epoch": 0.67, "learning_rate": 9.492471241518802e-07, "logits/chosen": -0.9925147294998169, "logits/rejected": -0.9441007375717163, "logps/chosen": -136.57208251953125, "logps/rejected": -108.6395034790039, "loss": 2.223, "rewards/accuracies": 0.0, "rewards/chosen": 1.0803406238555908, "rewards/margins": -4.256069183349609, "rewards/rejected": 5.336410045623779, "step": 4099 }, { "epoch": 0.67, "learning_rate": 9.491894148176941e-07, "logits/chosen": -0.7484617829322815, "logits/rejected": -0.6568279266357422, "logps/chosen": -43.217933654785156, "logps/rejected": -152.24615478515625, "loss": 0.5812, "rewards/accuracies": 1.0, "rewards/chosen": 1.744136095046997, "rewards/margins": 0.513081431388855, "rewards/rejected": 1.231054663658142, "step": 4100 }, { "epoch": 0.67, "learning_rate": 9.491316744487697e-07, "logits/chosen": -0.837498128414154, "logits/rejected": -0.7908021807670593, "logps/chosen": -166.93539428710938, "logps/rejected": -117.61914825439453, "loss": 1.3706, "rewards/accuracies": 0.0, "rewards/chosen": 4.075906276702881, "rewards/margins": -2.4519996643066406, "rewards/rejected": 6.5279059410095215, "step": 4101 }, { "epoch": 0.67, "learning_rate": 9.490739030490962e-07, "logits/chosen": -0.5576450228691101, "logits/rejected": -0.5045180916786194, "logps/chosen": -99.97444915771484, "logps/rejected": -152.10658264160156, "loss": 1.3605, "rewards/accuracies": 0.0, "rewards/chosen": 0.9417740106582642, "rewards/margins": -2.571676731109619, "rewards/rejected": 3.5134506225585938, "step": 4102 }, { "epoch": 0.67, "learning_rate": 9.490161006226651e-07, "logits/chosen": -0.9336709976196289, "logits/rejected": -0.9041796326637268, "logps/chosen": -43.76469039916992, "logps/rejected": -40.722049713134766, "loss": 0.2652, "rewards/accuracies": 1.0, "rewards/chosen": 2.3179523944854736, "rewards/margins": 0.4232734441757202, "rewards/rejected": 1.8946789503097534, "step": 4103 }, { "epoch": 0.67, "learning_rate": 9.489582671734702e-07, "logits/chosen": -0.6128136515617371, "logits/rejected": -0.6128136515617371, "logps/chosen": -126.66703796386719, "logps/rejected": -126.66703796386719, "loss": 1.39, "rewards/accuracies": 0.0, "rewards/chosen": 1.960475206375122, "rewards/margins": 0.0, "rewards/rejected": 1.960475206375122, "step": 4104 }, { "epoch": 0.67, "learning_rate": 9.48900402705507e-07, "logits/chosen": -0.6547951698303223, "logits/rejected": -0.7016898989677429, "logps/chosen": -88.39134216308594, "logps/rejected": -137.34689331054688, "loss": 1.3164, "rewards/accuracies": 0.0, "rewards/chosen": 1.5925003290176392, "rewards/margins": -1.7845200300216675, "rewards/rejected": 3.3770203590393066, "step": 4105 }, { "epoch": 0.67, "learning_rate": 9.488425072227737e-07, "logits/chosen": -0.8434640169143677, "logits/rejected": -0.7845290303230286, "logps/chosen": -97.85629272460938, "logps/rejected": -41.19227600097656, "loss": 0.8196, "rewards/accuracies": 1.0, "rewards/chosen": 1.3011878728866577, "rewards/margins": 1.224514365196228, "rewards/rejected": 0.07667350769042969, "step": 4106 }, { "epoch": 0.67, "learning_rate": 9.487845807292701e-07, "logits/chosen": -0.5529274940490723, "logits/rejected": -0.47293123602867126, "logps/chosen": -75.07005310058594, "logps/rejected": -51.75261306762695, "loss": 0.6501, "rewards/accuracies": 0.0, "rewards/chosen": -0.07034149020910263, "rewards/margins": -0.9056987762451172, "rewards/rejected": 0.8353573083877563, "step": 4107 }, { "epoch": 0.67, "learning_rate": 9.487266232289982e-07, "logits/chosen": -0.4721076786518097, "logits/rejected": -0.5619553327560425, "logps/chosen": -195.28492736816406, "logps/rejected": -136.6986083984375, "loss": 0.1541, "rewards/accuracies": 1.0, "rewards/chosen": 3.0348281860351562, "rewards/margins": 1.9505783319473267, "rewards/rejected": 1.0842498540878296, "step": 4108 }, { "epoch": 0.67, "learning_rate": 9.486686347259627e-07, "logits/chosen": -0.45280763506889343, "logits/rejected": -0.44616320729255676, "logps/chosen": -64.65693664550781, "logps/rejected": -62.68717956542969, "loss": 0.6136, "rewards/accuracies": 0.0, "rewards/chosen": 1.1308501958847046, "rewards/margins": -0.871505856513977, "rewards/rejected": 2.0023560523986816, "step": 4109 }, { "epoch": 0.67, "learning_rate": 9.486106152241699e-07, "logits/chosen": -0.22812671959400177, "logits/rejected": -0.2624896168708801, "logps/chosen": -19.599918365478516, "logps/rejected": -38.124385833740234, "loss": 1.635, "rewards/accuracies": 0.0, "rewards/chosen": 0.27156639099121094, "rewards/margins": -0.1870216429233551, "rewards/rejected": 0.45858803391456604, "step": 4110 }, { "epoch": 0.67, "learning_rate": 9.485525647276282e-07, "logits/chosen": -0.4635717272758484, "logits/rejected": -0.4434826970100403, "logps/chosen": -74.15351104736328, "logps/rejected": -89.64324951171875, "loss": 0.2334, "rewards/accuracies": 1.0, "rewards/chosen": 1.8133400678634644, "rewards/margins": 0.5352073907852173, "rewards/rejected": 1.278132677078247, "step": 4111 }, { "epoch": 0.67, "learning_rate": 9.484944832403488e-07, "logits/chosen": -0.36715906858444214, "logits/rejected": -0.19071225821971893, "logps/chosen": -108.27528381347656, "logps/rejected": -25.407367706298828, "loss": 0.1166, "rewards/accuracies": 1.0, "rewards/chosen": 2.379368543624878, "rewards/margins": 1.6743460893630981, "rewards/rejected": 0.7050224542617798, "step": 4112 }, { "epoch": 0.67, "learning_rate": 9.484363707663441e-07, "logits/chosen": -0.2901882827281952, "logits/rejected": -0.27118533849716187, "logps/chosen": -53.17986297607422, "logps/rejected": -87.77760314941406, "loss": 0.6164, "rewards/accuracies": 1.0, "rewards/chosen": 1.4304252862930298, "rewards/margins": 0.3347480297088623, "rewards/rejected": 1.0956772565841675, "step": 4113 }, { "epoch": 0.67, "learning_rate": 9.483782273096295e-07, "logits/chosen": -0.7282240390777588, "logits/rejected": -0.7171679735183716, "logps/chosen": -85.71033477783203, "logps/rejected": -78.20769500732422, "loss": 0.7033, "rewards/accuracies": 0.0, "rewards/chosen": 1.012673258781433, "rewards/margins": -0.9488067626953125, "rewards/rejected": 1.9614800214767456, "step": 4114 }, { "epoch": 0.67, "learning_rate": 9.483200528742217e-07, "logits/chosen": -0.5107132196426392, "logits/rejected": -0.3725185692310333, "logps/chosen": -152.94546508789062, "logps/rejected": -106.63650512695312, "loss": 0.1933, "rewards/accuracies": 1.0, "rewards/chosen": 4.497784614562988, "rewards/margins": 1.1141541004180908, "rewards/rejected": 3.3836305141448975, "step": 4115 }, { "epoch": 0.67, "learning_rate": 9.482618474641405e-07, "logits/chosen": -0.8325769901275635, "logits/rejected": -0.7962417602539062, "logps/chosen": -78.2391357421875, "logps/rejected": -71.78416442871094, "loss": 0.7458, "rewards/accuracies": 0.0, "rewards/chosen": 0.8083114624023438, "rewards/margins": -0.7835365533828735, "rewards/rejected": 1.5918480157852173, "step": 4116 }, { "epoch": 0.67, "learning_rate": 9.482036110834071e-07, "logits/chosen": -0.15455439686775208, "logits/rejected": -0.16228121519088745, "logps/chosen": -2.775778293609619, "logps/rejected": -9.705926895141602, "loss": 0.8406, "rewards/accuracies": 1.0, "rewards/chosen": 0.15990686416625977, "rewards/margins": 0.05253753811120987, "rewards/rejected": 0.1073693260550499, "step": 4117 }, { "epoch": 0.67, "learning_rate": 9.481453437360452e-07, "logits/chosen": -0.5736282467842102, "logits/rejected": -0.5281445980072021, "logps/chosen": -75.71343994140625, "logps/rejected": -82.5108871459961, "loss": 1.0641, "rewards/accuracies": 1.0, "rewards/chosen": 1.8408775329589844, "rewards/margins": 0.10923230648040771, "rewards/rejected": 1.7316452264785767, "step": 4118 }, { "epoch": 0.67, "learning_rate": 9.480870454260803e-07, "logits/chosen": -0.2706579864025116, "logits/rejected": 0.2083204984664917, "logps/chosen": -41.01930236816406, "logps/rejected": -36.92995071411133, "loss": 0.7675, "rewards/accuracies": 1.0, "rewards/chosen": 1.3602936267852783, "rewards/margins": 1.0475399494171143, "rewards/rejected": 0.31275367736816406, "step": 4119 }, { "epoch": 0.67, "learning_rate": 9.480287161575405e-07, "logits/chosen": -0.9690780639648438, "logits/rejected": -0.9586668610572815, "logps/chosen": -86.65279388427734, "logps/rejected": -99.13097381591797, "loss": 1.7742, "rewards/accuracies": 0.0, "rewards/chosen": 0.7245674133300781, "rewards/margins": -2.489523410797119, "rewards/rejected": 3.2140908241271973, "step": 4120 }, { "epoch": 0.67, "learning_rate": 9.479703559344556e-07, "logits/chosen": -0.31731554865837097, "logits/rejected": -0.3089577555656433, "logps/chosen": -0.628622829914093, "logps/rejected": -7.700274467468262, "loss": 0.6195, "rewards/accuracies": 1.0, "rewards/chosen": 0.1802099496126175, "rewards/margins": 0.1994231939315796, "rewards/rejected": -0.019213248044252396, "step": 4121 }, { "epoch": 0.67, "learning_rate": 9.479119647608579e-07, "logits/chosen": -0.898286759853363, "logits/rejected": -0.836938202381134, "logps/chosen": -42.922218322753906, "logps/rejected": -22.02635955810547, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 1.467342734336853, "rewards/margins": 1.0405579805374146, "rewards/rejected": 0.4267847239971161, "step": 4122 }, { "epoch": 0.67, "learning_rate": 9.478535426407817e-07, "logits/chosen": -0.3241950273513794, "logits/rejected": -0.325709730386734, "logps/chosen": -59.849830627441406, "logps/rejected": -90.53414154052734, "loss": 0.8533, "rewards/accuracies": 0.0, "rewards/chosen": 0.2464759796857834, "rewards/margins": -0.2691757082939148, "rewards/rejected": 0.5156517028808594, "step": 4123 }, { "epoch": 0.67, "learning_rate": 9.477950895782632e-07, "logits/chosen": -0.6277198195457458, "logits/rejected": -0.6357884407043457, "logps/chosen": -110.6792221069336, "logps/rejected": -72.2369384765625, "loss": 0.4956, "rewards/accuracies": 0.0, "rewards/chosen": 3.358496904373169, "rewards/margins": -0.22832345962524414, "rewards/rejected": 3.586820363998413, "step": 4124 }, { "epoch": 0.67, "learning_rate": 9.47736605577341e-07, "logits/chosen": -0.6848301887512207, "logits/rejected": -0.6460272073745728, "logps/chosen": -45.95818328857422, "logps/rejected": -61.27235412597656, "loss": 1.4688, "rewards/accuracies": 1.0, "rewards/chosen": 2.869061231613159, "rewards/margins": 0.7670974731445312, "rewards/rejected": 2.101963758468628, "step": 4125 }, { "epoch": 0.67, "learning_rate": 9.47678090642056e-07, "logits/chosen": -0.6733140349388123, "logits/rejected": -0.6938102841377258, "logps/chosen": -190.69515991210938, "logps/rejected": -75.92123413085938, "loss": 0.2771, "rewards/accuracies": 1.0, "rewards/chosen": 3.7608582973480225, "rewards/margins": 0.38449716567993164, "rewards/rejected": 3.376361131668091, "step": 4126 }, { "epoch": 0.67, "learning_rate": 9.47619544776451e-07, "logits/chosen": -0.6389926075935364, "logits/rejected": -0.62074214220047, "logps/chosen": -133.14373779296875, "logps/rejected": -125.02708435058594, "loss": 0.9719, "rewards/accuracies": 0.0, "rewards/chosen": 4.930563449859619, "rewards/margins": -0.16002941131591797, "rewards/rejected": 5.090592861175537, "step": 4127 }, { "epoch": 0.67, "learning_rate": 9.475609679845708e-07, "logits/chosen": -0.348310261964798, "logits/rejected": -0.38366591930389404, "logps/chosen": -79.80899047851562, "logps/rejected": -113.98518371582031, "loss": 0.7438, "rewards/accuracies": 0.0, "rewards/chosen": 1.1206741333007812, "rewards/margins": -1.0638000965118408, "rewards/rejected": 2.184474229812622, "step": 4128 }, { "epoch": 0.67, "learning_rate": 9.475023602704625e-07, "logits/chosen": -0.48420804738998413, "logits/rejected": -0.4297856092453003, "logps/chosen": -64.17157745361328, "logps/rejected": -48.7651481628418, "loss": 0.1886, "rewards/accuracies": 1.0, "rewards/chosen": 2.5140016078948975, "rewards/margins": 1.7528557777404785, "rewards/rejected": 0.7611457705497742, "step": 4129 }, { "epoch": 0.67, "learning_rate": 9.474437216381754e-07, "logits/chosen": -0.7541351318359375, "logits/rejected": -0.7502908706665039, "logps/chosen": -49.324520111083984, "logps/rejected": -10.591507911682129, "loss": 0.2443, "rewards/accuracies": 1.0, "rewards/chosen": 1.592875361442566, "rewards/margins": 0.8328134417533875, "rewards/rejected": 0.7600619196891785, "step": 4130 }, { "epoch": 0.67, "learning_rate": 9.47385052091761e-07, "logits/chosen": -0.6142253279685974, "logits/rejected": -0.3844488561153412, "logps/chosen": -95.60549926757812, "logps/rejected": -59.13127136230469, "loss": 0.9077, "rewards/accuracies": 1.0, "rewards/chosen": 3.3860504627227783, "rewards/margins": 1.6579574346542358, "rewards/rejected": 1.7280930280685425, "step": 4131 }, { "epoch": 0.67, "learning_rate": 9.473263516352727e-07, "logits/chosen": -0.55543053150177, "logits/rejected": -0.39509841799736023, "logps/chosen": -75.48664093017578, "logps/rejected": -70.95310974121094, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 2.678333282470703, "rewards/margins": 0.477689266204834, "rewards/rejected": 2.200644016265869, "step": 4132 }, { "epoch": 0.67, "learning_rate": 9.472676202727661e-07, "logits/chosen": -0.5222344994544983, "logits/rejected": -0.8346758484840393, "logps/chosen": -137.8472900390625, "logps/rejected": -50.584617614746094, "loss": 0.3872, "rewards/accuracies": 1.0, "rewards/chosen": 4.488873481750488, "rewards/margins": 3.9439733028411865, "rewards/rejected": 0.544900119304657, "step": 4133 }, { "epoch": 0.67, "learning_rate": 9.47208858008299e-07, "logits/chosen": -0.7396162748336792, "logits/rejected": -0.7396162748336792, "logps/chosen": -36.324405670166016, "logps/rejected": -36.324405670166016, "loss": 2.0294, "rewards/accuracies": 0.0, "rewards/chosen": 2.033238649368286, "rewards/margins": 0.0, "rewards/rejected": 2.033238649368286, "step": 4134 }, { "epoch": 0.67, "learning_rate": 9.471500648459314e-07, "logits/chosen": -0.466157466173172, "logits/rejected": -0.39915934205055237, "logps/chosen": -74.8472900390625, "logps/rejected": -55.742767333984375, "loss": 0.728, "rewards/accuracies": 0.0, "rewards/chosen": 1.7658051252365112, "rewards/margins": -0.42025911808013916, "rewards/rejected": 2.1860642433166504, "step": 4135 }, { "epoch": 0.67, "learning_rate": 9.470912407897251e-07, "logits/chosen": -0.6330217123031616, "logits/rejected": -0.6029636263847351, "logps/chosen": -72.57601165771484, "logps/rejected": -84.45423889160156, "loss": 0.9209, "rewards/accuracies": 0.0, "rewards/chosen": 2.167736053466797, "rewards/margins": -0.7672333717346191, "rewards/rejected": 2.934969425201416, "step": 4136 }, { "epoch": 0.67, "learning_rate": 9.470323858437448e-07, "logits/chosen": -0.43053409457206726, "logits/rejected": -0.4356076419353485, "logps/chosen": -3.089423894882202, "logps/rejected": -1.7801002264022827, "loss": 0.4648, "rewards/accuracies": 0.0, "rewards/chosen": 0.0787668451666832, "rewards/margins": -0.19092971086502075, "rewards/rejected": 0.26969656348228455, "step": 4137 }, { "epoch": 0.67, "learning_rate": 9.469735000120562e-07, "logits/chosen": -0.4248112440109253, "logits/rejected": -0.25902625918388367, "logps/chosen": -55.028656005859375, "logps/rejected": -52.22762680053711, "loss": 0.9554, "rewards/accuracies": 0.0, "rewards/chosen": 1.356953501701355, "rewards/margins": -0.1808100938796997, "rewards/rejected": 1.5377635955810547, "step": 4138 }, { "epoch": 0.67, "learning_rate": 9.469145832987282e-07, "logits/chosen": -0.17202092707157135, "logits/rejected": -0.17202092707157135, "logps/chosen": -0.4965689182281494, "logps/rejected": -0.4965689182281494, "loss": 0.9963, "rewards/accuracies": 0.0, "rewards/chosen": 0.0756986141204834, "rewards/margins": 0.0, "rewards/rejected": 0.0756986141204834, "step": 4139 }, { "epoch": 0.67, "learning_rate": 9.468556357078312e-07, "logits/chosen": -0.6912813782691956, "logits/rejected": -0.6912813782691956, "logps/chosen": -23.70745849609375, "logps/rejected": -23.70745849609375, "loss": 1.2286, "rewards/accuracies": 0.0, "rewards/chosen": 0.6807309985160828, "rewards/margins": 0.0, "rewards/rejected": 0.6807309985160828, "step": 4140 }, { "epoch": 0.67, "learning_rate": 9.46796657243438e-07, "logits/chosen": -0.717999279499054, "logits/rejected": -0.667783796787262, "logps/chosen": -88.26983642578125, "logps/rejected": -79.98966217041016, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 5.36104154586792, "rewards/margins": 2.6990153789520264, "rewards/rejected": 2.6620261669158936, "step": 4141 }, { "epoch": 0.67, "learning_rate": 9.467376479096234e-07, "logits/chosen": -0.7742135524749756, "logits/rejected": -0.7970010042190552, "logps/chosen": -52.2740478515625, "logps/rejected": -96.19430541992188, "loss": 0.7796, "rewards/accuracies": 1.0, "rewards/chosen": 1.27916419506073, "rewards/margins": 0.304036021232605, "rewards/rejected": 0.975128173828125, "step": 4142 }, { "epoch": 0.67, "learning_rate": 9.466786077104645e-07, "logits/chosen": -0.7355023622512817, "logits/rejected": -0.7080119848251343, "logps/chosen": -87.90505981445312, "logps/rejected": -130.5308380126953, "loss": 1.0931, "rewards/accuracies": 1.0, "rewards/chosen": 0.7267059683799744, "rewards/margins": 0.8994003534317017, "rewards/rejected": -0.1726943999528885, "step": 4143 }, { "epoch": 0.67, "learning_rate": 9.466195366500401e-07, "logits/chosen": -0.7695962190628052, "logits/rejected": -0.7665526866912842, "logps/chosen": -73.2249984741211, "logps/rejected": -64.28649139404297, "loss": 2.4507, "rewards/accuracies": 1.0, "rewards/chosen": 2.298884630203247, "rewards/margins": 0.1943519115447998, "rewards/rejected": 2.1045327186584473, "step": 4144 }, { "epoch": 0.67, "learning_rate": 9.465604347324318e-07, "logits/chosen": -0.7744958400726318, "logits/rejected": -0.8209068775177002, "logps/chosen": -7.40232515335083, "logps/rejected": -72.5306625366211, "loss": 1.1994, "rewards/accuracies": 1.0, "rewards/chosen": 0.46859604120254517, "rewards/margins": 0.2190263420343399, "rewards/rejected": 0.24956969916820526, "step": 4145 }, { "epoch": 0.67, "learning_rate": 9.465013019617228e-07, "logits/chosen": -0.958491325378418, "logits/rejected": -0.9258902668952942, "logps/chosen": -43.57036590576172, "logps/rejected": -38.63821029663086, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 1.5624805688858032, "rewards/margins": 1.4349597692489624, "rewards/rejected": 0.12752075493335724, "step": 4146 }, { "epoch": 0.67, "learning_rate": 9.464421383419987e-07, "logits/chosen": -0.9193508625030518, "logits/rejected": -0.7940930724143982, "logps/chosen": -150.490966796875, "logps/rejected": -60.42181396484375, "loss": 0.1219, "rewards/accuracies": 1.0, "rewards/chosen": 3.781481981277466, "rewards/margins": 3.9148614406585693, "rewards/rejected": -0.13337936997413635, "step": 4147 }, { "epoch": 0.67, "learning_rate": 9.463829438773472e-07, "logits/chosen": -0.7776678800582886, "logits/rejected": -0.6560678482055664, "logps/chosen": -132.25772094726562, "logps/rejected": -30.49066162109375, "loss": 0.9554, "rewards/accuracies": 1.0, "rewards/chosen": 4.783793926239014, "rewards/margins": 2.0745325088500977, "rewards/rejected": 2.709261417388916, "step": 4148 }, { "epoch": 0.67, "learning_rate": 9.463237185718579e-07, "logits/chosen": -0.4918142855167389, "logits/rejected": -0.547680675983429, "logps/chosen": -73.18939971923828, "logps/rejected": -110.39765930175781, "loss": 1.4627, "rewards/accuracies": 0.0, "rewards/chosen": 1.4123162031173706, "rewards/margins": -2.8124213218688965, "rewards/rejected": 4.224737644195557, "step": 4149 }, { "epoch": 0.67, "learning_rate": 9.462644624296227e-07, "logits/chosen": -0.7191466093063354, "logits/rejected": -0.6491402983665466, "logps/chosen": -43.34608459472656, "logps/rejected": -40.534141540527344, "loss": 0.4625, "rewards/accuracies": 1.0, "rewards/chosen": 1.7747093439102173, "rewards/margins": 1.1157078742980957, "rewards/rejected": 0.6590015292167664, "step": 4150 }, { "epoch": 0.67, "learning_rate": 9.46205175454736e-07, "logits/chosen": -0.7054585218429565, "logits/rejected": -0.6538897752761841, "logps/chosen": -90.8783950805664, "logps/rejected": -103.8499755859375, "loss": 0.4407, "rewards/accuracies": 1.0, "rewards/chosen": 1.5971702337265015, "rewards/margins": 1.2144500017166138, "rewards/rejected": 0.3827202022075653, "step": 4151 }, { "epoch": 0.67, "learning_rate": 9.461458576512935e-07, "logits/chosen": -0.303093820810318, "logits/rejected": -0.2997361123561859, "logps/chosen": -66.4110107421875, "logps/rejected": -42.81889343261719, "loss": 1.1677, "rewards/accuracies": 0.0, "rewards/chosen": 0.6048866510391235, "rewards/margins": -0.2640666961669922, "rewards/rejected": 0.8689533472061157, "step": 4152 }, { "epoch": 0.67, "learning_rate": 9.460865090233938e-07, "logits/chosen": -0.8960814476013184, "logits/rejected": -0.7840815186500549, "logps/chosen": -124.03509521484375, "logps/rejected": -68.05465698242188, "loss": 0.7416, "rewards/accuracies": 0.0, "rewards/chosen": 0.8578552603721619, "rewards/margins": -0.6499618887901306, "rewards/rejected": 1.5078171491622925, "step": 4153 }, { "epoch": 0.67, "learning_rate": 9.460271295751372e-07, "logits/chosen": -0.915382981300354, "logits/rejected": -0.8448740243911743, "logps/chosen": -247.7301025390625, "logps/rejected": -162.41357421875, "loss": 0.7599, "rewards/accuracies": 0.0, "rewards/chosen": 4.701562404632568, "rewards/margins": -1.1488466262817383, "rewards/rejected": 5.850409030914307, "step": 4154 }, { "epoch": 0.67, "learning_rate": 9.459677193106264e-07, "logits/chosen": -0.2146741896867752, "logits/rejected": -0.16783937811851501, "logps/chosen": -46.614501953125, "logps/rejected": -94.12300109863281, "loss": 1.1438, "rewards/accuracies": 0.0, "rewards/chosen": 0.8420150876045227, "rewards/margins": -1.7365760803222656, "rewards/rejected": 2.5785911083221436, "step": 4155 }, { "epoch": 0.67, "learning_rate": 9.459082782339658e-07, "logits/chosen": -0.6505964398384094, "logits/rejected": -0.5984814763069153, "logps/chosen": -107.13179779052734, "logps/rejected": -106.03687286376953, "loss": 0.2842, "rewards/accuracies": 1.0, "rewards/chosen": 1.7593109607696533, "rewards/margins": 0.9487663507461548, "rewards/rejected": 0.8105446100234985, "step": 4156 }, { "epoch": 0.67, "learning_rate": 9.458488063492624e-07, "logits/chosen": -0.6325541138648987, "logits/rejected": -0.6361705660820007, "logps/chosen": -182.72457885742188, "logps/rejected": -66.0498275756836, "loss": 0.8329, "rewards/accuracies": 1.0, "rewards/chosen": 3.381979465484619, "rewards/margins": 1.5723450183868408, "rewards/rejected": 1.8096344470977783, "step": 4157 }, { "epoch": 0.67, "learning_rate": 9.457893036606253e-07, "logits/chosen": -0.41623860597610474, "logits/rejected": -0.42339032888412476, "logps/chosen": -58.620140075683594, "logps/rejected": -109.10668182373047, "loss": 0.3853, "rewards/accuracies": 1.0, "rewards/chosen": 1.4638786315917969, "rewards/margins": 0.8324203491210938, "rewards/rejected": 0.6314582824707031, "step": 4158 }, { "epoch": 0.68, "learning_rate": 9.457297701721654e-07, "logits/chosen": -0.7161113619804382, "logits/rejected": -0.6396090388298035, "logps/chosen": -108.69178771972656, "logps/rejected": -78.17692565917969, "loss": 0.1242, "rewards/accuracies": 1.0, "rewards/chosen": 2.5959153175354004, "rewards/margins": 1.4657585620880127, "rewards/rejected": 1.1301567554473877, "step": 4159 }, { "epoch": 0.68, "learning_rate": 9.456702058879957e-07, "logits/chosen": -0.45840296149253845, "logits/rejected": -0.4061520993709564, "logps/chosen": -59.48835372924805, "logps/rejected": -48.61206817626953, "loss": 1.5891, "rewards/accuracies": 1.0, "rewards/chosen": 0.9926090240478516, "rewards/margins": 0.14464527368545532, "rewards/rejected": 0.8479637503623962, "step": 4160 }, { "epoch": 0.68, "learning_rate": 9.456106108122319e-07, "logits/chosen": -0.5951054692268372, "logits/rejected": -0.6006450653076172, "logps/chosen": -90.50056457519531, "logps/rejected": -40.779579162597656, "loss": 0.4713, "rewards/accuracies": 0.0, "rewards/chosen": 0.8128204345703125, "rewards/margins": -0.35757148265838623, "rewards/rejected": 1.1703919172286987, "step": 4161 }, { "epoch": 0.68, "learning_rate": 9.455509849489913e-07, "logits/chosen": -1.1713411808013916, "logits/rejected": -1.2040272951126099, "logps/chosen": -84.60771942138672, "logps/rejected": -103.14069366455078, "loss": 0.6386, "rewards/accuracies": 0.0, "rewards/chosen": 0.3996635377407074, "rewards/margins": -0.6372169256210327, "rewards/rejected": 1.0368804931640625, "step": 4162 }, { "epoch": 0.68, "learning_rate": 9.454913283023935e-07, "logits/chosen": -0.6872979998588562, "logits/rejected": -0.630859375, "logps/chosen": -67.35286712646484, "logps/rejected": -73.81393432617188, "loss": 0.8587, "rewards/accuracies": 0.0, "rewards/chosen": 1.5385032892227173, "rewards/margins": -1.1758795976638794, "rewards/rejected": 2.7143828868865967, "step": 4163 }, { "epoch": 0.68, "learning_rate": 9.454316408765603e-07, "logits/chosen": -0.32897332310676575, "logits/rejected": -0.24546952545642853, "logps/chosen": -163.4808807373047, "logps/rejected": -80.46499633789062, "loss": 0.0905, "rewards/accuracies": 1.0, "rewards/chosen": 5.21614408493042, "rewards/margins": 1.7109224796295166, "rewards/rejected": 3.5052216053009033, "step": 4164 }, { "epoch": 0.68, "learning_rate": 9.453719226756152e-07, "logits/chosen": -0.6692962646484375, "logits/rejected": -0.6364915370941162, "logps/chosen": -103.25788116455078, "logps/rejected": -62.09428405761719, "loss": 0.909, "rewards/accuracies": 0.0, "rewards/chosen": 1.3475478887557983, "rewards/margins": -0.4883819818496704, "rewards/rejected": 1.8359298706054688, "step": 4165 }, { "epoch": 0.68, "learning_rate": 9.453121737036845e-07, "logits/chosen": -0.701936662197113, "logits/rejected": -0.6942176818847656, "logps/chosen": -53.6938362121582, "logps/rejected": -92.26485443115234, "loss": 1.0306, "rewards/accuracies": 1.0, "rewards/chosen": 1.5359890460968018, "rewards/margins": 0.3500187397003174, "rewards/rejected": 1.1859703063964844, "step": 4166 }, { "epoch": 0.68, "learning_rate": 9.452523939648962e-07, "logits/chosen": -0.584419846534729, "logits/rejected": -0.6275789141654968, "logps/chosen": -102.26412963867188, "logps/rejected": -81.4629135131836, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9676055908203125, "rewards/margins": -0.5444068908691406, "rewards/rejected": 1.5120124816894531, "step": 4167 }, { "epoch": 0.68, "learning_rate": 9.451925834633804e-07, "logits/chosen": -0.5178254842758179, "logits/rejected": -0.4464789927005768, "logps/chosen": -52.73313903808594, "logps/rejected": -92.02481079101562, "loss": 0.3221, "rewards/accuracies": 1.0, "rewards/chosen": 0.46291467547416687, "rewards/margins": 0.3056362271308899, "rewards/rejected": 0.15727844834327698, "step": 4168 }, { "epoch": 0.68, "learning_rate": 9.451327422032696e-07, "logits/chosen": -1.003997564315796, "logits/rejected": -0.9150537848472595, "logps/chosen": -114.53471374511719, "logps/rejected": -55.65833282470703, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547905087471008, "rewards/margins": 0.001002490520477295, "rewards/rejected": 0.8537880182266235, "step": 4169 }, { "epoch": 0.68, "learning_rate": 9.450728701886983e-07, "logits/chosen": -0.34423011541366577, "logits/rejected": -0.3532553017139435, "logps/chosen": -2.9996111392974854, "logps/rejected": -1.6367931365966797, "loss": 0.5221, "rewards/accuracies": 0.0, "rewards/chosen": 0.14953505992889404, "rewards/margins": -0.03381752967834473, "rewards/rejected": 0.18335258960723877, "step": 4170 }, { "epoch": 0.68, "learning_rate": 9.450129674238029e-07, "logits/chosen": -0.6118547320365906, "logits/rejected": -0.6118547320365906, "logps/chosen": -41.176605224609375, "logps/rejected": -41.176605224609375, "loss": 0.4301, "rewards/accuracies": 0.0, "rewards/chosen": 0.7827106714248657, "rewards/margins": 0.0, "rewards/rejected": 0.7827106714248657, "step": 4171 }, { "epoch": 0.68, "learning_rate": 9.449530339127221e-07, "logits/chosen": -0.8105239868164062, "logits/rejected": -0.8411191701889038, "logps/chosen": -37.27784729003906, "logps/rejected": -75.12064361572266, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.8500053882598877, "rewards/margins": 0.08194279670715332, "rewards/rejected": 1.7680625915527344, "step": 4172 }, { "epoch": 0.68, "learning_rate": 9.448930696595969e-07, "logits/chosen": -0.4514974057674408, "logits/rejected": -0.45894676446914673, "logps/chosen": -69.54193115234375, "logps/rejected": -60.643699645996094, "loss": 1.1232, "rewards/accuracies": 0.0, "rewards/chosen": 0.19849777221679688, "rewards/margins": -1.8154594898223877, "rewards/rejected": 2.0139572620391846, "step": 4173 }, { "epoch": 0.68, "learning_rate": 9.448330746685704e-07, "logits/chosen": -0.734131932258606, "logits/rejected": -0.7442768812179565, "logps/chosen": -71.96356201171875, "logps/rejected": -78.88871002197266, "loss": 1.0105, "rewards/accuracies": 0.0, "rewards/chosen": 1.9632889032363892, "rewards/margins": -0.7109931707382202, "rewards/rejected": 2.6742820739746094, "step": 4174 }, { "epoch": 0.68, "learning_rate": 9.447730489437873e-07, "logits/chosen": -0.663227915763855, "logits/rejected": -0.6617934107780457, "logps/chosen": -44.14252471923828, "logps/rejected": -89.79170227050781, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.6525714993476868, "rewards/margins": 0.10945016145706177, "rewards/rejected": 0.543121337890625, "step": 4175 }, { "epoch": 0.68, "learning_rate": 9.447129924893949e-07, "logits/chosen": -0.8767761588096619, "logits/rejected": -0.9478719234466553, "logps/chosen": -287.09539794921875, "logps/rejected": -84.38353729248047, "loss": 0.5013, "rewards/accuracies": 1.0, "rewards/chosen": 4.414849758148193, "rewards/margins": 2.8495733737945557, "rewards/rejected": 1.5652763843536377, "step": 4176 }, { "epoch": 0.68, "learning_rate": 9.446529053095429e-07, "logits/chosen": -0.22721849381923676, "logits/rejected": -0.33140119910240173, "logps/chosen": -79.90496826171875, "logps/rejected": -115.32493591308594, "loss": 1.8387, "rewards/accuracies": 0.0, "rewards/chosen": 0.6740585565567017, "rewards/margins": -3.2769713401794434, "rewards/rejected": 3.9510300159454346, "step": 4177 }, { "epoch": 0.68, "learning_rate": 9.445927874083823e-07, "logits/chosen": -0.6111600995063782, "logits/rejected": -0.5712829232215881, "logps/chosen": -119.53560638427734, "logps/rejected": -62.79322052001953, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 3.3565773963928223, "rewards/margins": 2.1862754821777344, "rewards/rejected": 1.1703017950057983, "step": 4178 }, { "epoch": 0.68, "learning_rate": 9.44532638790067e-07, "logits/chosen": -0.6311511993408203, "logits/rejected": -0.6833068132400513, "logps/chosen": -51.924537658691406, "logps/rejected": -70.1443862915039, "loss": 2.0742, "rewards/accuracies": 0.0, "rewards/chosen": 2.294100284576416, "rewards/margins": -0.6269552707672119, "rewards/rejected": 2.921055555343628, "step": 4179 }, { "epoch": 0.68, "learning_rate": 9.444724594587523e-07, "logits/chosen": -0.7536593675613403, "logits/rejected": -0.6657230257987976, "logps/chosen": -74.69964599609375, "logps/rejected": -64.9977035522461, "loss": 0.2828, "rewards/accuracies": 1.0, "rewards/chosen": 1.5703552961349487, "rewards/margins": 0.5817665457725525, "rewards/rejected": 0.9885887503623962, "step": 4180 }, { "epoch": 0.68, "learning_rate": 9.444122494185966e-07, "logits/chosen": -0.5071842670440674, "logits/rejected": -0.3722604811191559, "logps/chosen": -46.80180740356445, "logps/rejected": -9.750818252563477, "loss": 0.2555, "rewards/accuracies": 1.0, "rewards/chosen": 1.5609924793243408, "rewards/margins": 0.6039045453071594, "rewards/rejected": 0.9570879340171814, "step": 4181 }, { "epoch": 0.68, "learning_rate": 9.443520086737593e-07, "logits/chosen": -0.5108985304832458, "logits/rejected": -0.42198798060417175, "logps/chosen": -66.3612289428711, "logps/rejected": -35.703704833984375, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 1.2787697315216064, "rewards/margins": 1.193576455116272, "rewards/rejected": 0.08519325405359268, "step": 4182 }, { "epoch": 0.68, "learning_rate": 9.442917372284029e-07, "logits/chosen": -0.45775264501571655, "logits/rejected": -0.4085720181465149, "logps/chosen": -30.785160064697266, "logps/rejected": -46.38591003417969, "loss": 1.1878, "rewards/accuracies": 1.0, "rewards/chosen": 1.1749523878097534, "rewards/margins": 0.022807717323303223, "rewards/rejected": 1.1521446704864502, "step": 4183 }, { "epoch": 0.68, "learning_rate": 9.442314350866912e-07, "logits/chosen": -0.7554047703742981, "logits/rejected": -0.6221252083778381, "logps/chosen": -125.21171569824219, "logps/rejected": -148.11729431152344, "loss": 0.2204, "rewards/accuracies": 1.0, "rewards/chosen": 5.47707986831665, "rewards/margins": 0.7080140113830566, "rewards/rejected": 4.769065856933594, "step": 4184 }, { "epoch": 0.68, "learning_rate": 9.441711022527907e-07, "logits/chosen": -0.5903143882751465, "logits/rejected": -0.4323576092720032, "logps/chosen": -50.7447395324707, "logps/rejected": -62.02243423461914, "loss": 0.6145, "rewards/accuracies": 0.0, "rewards/chosen": 1.6334842443466187, "rewards/margins": -0.8100219964981079, "rewards/rejected": 2.4435062408447266, "step": 4185 }, { "epoch": 0.68, "learning_rate": 9.4411073873087e-07, "logits/chosen": -0.5895805954933167, "logits/rejected": -0.595908522605896, "logps/chosen": -106.43959045410156, "logps/rejected": -88.17852783203125, "loss": 0.5804, "rewards/accuracies": 0.0, "rewards/chosen": 1.2171363830566406, "rewards/margins": -0.6636428833007812, "rewards/rejected": 1.8807792663574219, "step": 4186 }, { "epoch": 0.68, "learning_rate": 9.440503445250993e-07, "logits/chosen": -0.5440989136695862, "logits/rejected": -0.47446584701538086, "logps/chosen": -78.57144165039062, "logps/rejected": -13.080336570739746, "loss": 0.2808, "rewards/accuracies": 1.0, "rewards/chosen": 1.1675010919570923, "rewards/margins": 0.3338923454284668, "rewards/rejected": 0.8336087465286255, "step": 4187 }, { "epoch": 0.68, "learning_rate": 9.439899196396515e-07, "logits/chosen": -0.8620376586914062, "logits/rejected": -0.8620376586914062, "logps/chosen": -65.57859802246094, "logps/rejected": -65.57859802246094, "loss": 0.4497, "rewards/accuracies": 0.0, "rewards/chosen": 3.2720229625701904, "rewards/margins": 0.0, "rewards/rejected": 3.2720229625701904, "step": 4188 }, { "epoch": 0.68, "learning_rate": 9.439294640787013e-07, "logits/chosen": -0.2869587540626526, "logits/rejected": -0.22235006093978882, "logps/chosen": -46.38300704956055, "logps/rejected": -13.61161994934082, "loss": 0.4829, "rewards/accuracies": 0.0, "rewards/chosen": 0.8273830413818359, "rewards/margins": -0.07757514715194702, "rewards/rejected": 0.904958188533783, "step": 4189 }, { "epoch": 0.68, "learning_rate": 9.438689778464257e-07, "logits/chosen": -0.6497086882591248, "logits/rejected": -0.6721515655517578, "logps/chosen": -105.92213439941406, "logps/rejected": -106.08244323730469, "loss": 0.9836, "rewards/accuracies": 0.0, "rewards/chosen": 0.16492386162281036, "rewards/margins": -0.9459999203681946, "rewards/rejected": 1.1109237670898438, "step": 4190 }, { "epoch": 0.68, "learning_rate": 9.438084609470036e-07, "logits/chosen": -0.8361846804618835, "logits/rejected": -0.8512778282165527, "logps/chosen": -65.54583740234375, "logps/rejected": -45.14909362792969, "loss": 1.1138, "rewards/accuracies": 0.0, "rewards/chosen": 0.48243027925491333, "rewards/margins": -1.2006492614746094, "rewards/rejected": 1.6830796003341675, "step": 4191 }, { "epoch": 0.68, "learning_rate": 9.437479133846162e-07, "logits/chosen": -0.562811553478241, "logits/rejected": -0.5472373366355896, "logps/chosen": -59.663543701171875, "logps/rejected": -42.01247024536133, "loss": 0.8439, "rewards/accuracies": 1.0, "rewards/chosen": 1.564910888671875, "rewards/margins": 0.18543505668640137, "rewards/rejected": 1.3794758319854736, "step": 4192 }, { "epoch": 0.68, "learning_rate": 9.436873351634469e-07, "logits/chosen": -0.3072302043437958, "logits/rejected": -0.30366694927215576, "logps/chosen": -72.77279663085938, "logps/rejected": -58.463279724121094, "loss": 0.8207, "rewards/accuracies": 0.0, "rewards/chosen": 1.296966552734375, "rewards/margins": -0.28535306453704834, "rewards/rejected": 1.5823196172714233, "step": 4193 }, { "epoch": 0.68, "learning_rate": 9.436267262876807e-07, "logits/chosen": -0.5863978862762451, "logits/rejected": -0.5913426280021667, "logps/chosen": -47.127784729003906, "logps/rejected": -70.97854614257812, "loss": 0.7966, "rewards/accuracies": 0.0, "rewards/chosen": 1.1255199909210205, "rewards/margins": -0.33645057678222656, "rewards/rejected": 1.461970567703247, "step": 4194 }, { "epoch": 0.68, "learning_rate": 9.435660867615057e-07, "logits/chosen": -1.148460030555725, "logits/rejected": -1.0369479656219482, "logps/chosen": -87.95417785644531, "logps/rejected": -53.22338104248047, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 3.8096847534179688, "rewards/margins": 1.9115585088729858, "rewards/rejected": 1.898126244544983, "step": 4195 }, { "epoch": 0.68, "learning_rate": 9.435054165891108e-07, "logits/chosen": -0.8384459614753723, "logits/rejected": -0.8165360689163208, "logps/chosen": -68.58245086669922, "logps/rejected": -34.66679382324219, "loss": 1.0655, "rewards/accuracies": 1.0, "rewards/chosen": 1.5453414916992188, "rewards/margins": 0.6242965459823608, "rewards/rejected": 0.9210449457168579, "step": 4196 }, { "epoch": 0.68, "learning_rate": 9.434447157746883e-07, "logits/chosen": -0.7447773814201355, "logits/rejected": -0.8043686747550964, "logps/chosen": -46.669212341308594, "logps/rejected": -89.41995239257812, "loss": 1.1965, "rewards/accuracies": 0.0, "rewards/chosen": 1.2761650085449219, "rewards/margins": -1.3168563842773438, "rewards/rejected": 2.5930213928222656, "step": 4197 }, { "epoch": 0.68, "learning_rate": 9.433839843224318e-07, "logits/chosen": -0.42496052384376526, "logits/rejected": -0.45671653747558594, "logps/chosen": -1.4139201641082764, "logps/rejected": -48.570011138916016, "loss": 0.5001, "rewards/accuracies": 1.0, "rewards/chosen": 0.3929426074028015, "rewards/margins": 0.12272393703460693, "rewards/rejected": 0.2702186703681946, "step": 4198 }, { "epoch": 0.68, "learning_rate": 9.433232222365372e-07, "logits/chosen": -0.5039800405502319, "logits/rejected": -0.5176867246627808, "logps/chosen": -110.80329895019531, "logps/rejected": -128.2064666748047, "loss": 1.2197, "rewards/accuracies": 0.0, "rewards/chosen": 0.8039771914482117, "rewards/margins": -2.295133113861084, "rewards/rejected": 3.0991103649139404, "step": 4199 }, { "epoch": 0.68, "learning_rate": 9.432624295212029e-07, "logits/chosen": -0.7061492800712585, "logits/rejected": -0.742749810218811, "logps/chosen": -47.824310302734375, "logps/rejected": -58.001243591308594, "loss": 0.4219, "rewards/accuracies": 1.0, "rewards/chosen": 2.434863328933716, "rewards/margins": 0.24681782722473145, "rewards/rejected": 2.1880455017089844, "step": 4200 }, { "epoch": 0.68, "learning_rate": 9.43201606180629e-07, "logits/chosen": -0.3690885305404663, "logits/rejected": -0.2190071940422058, "logps/chosen": -60.81813430786133, "logps/rejected": -50.164302825927734, "loss": 1.0393, "rewards/accuracies": 0.0, "rewards/chosen": 1.7492557764053345, "rewards/margins": -0.13873744010925293, "rewards/rejected": 1.8879932165145874, "step": 4201 }, { "epoch": 0.68, "learning_rate": 9.431407522190175e-07, "logits/chosen": -0.736210823059082, "logits/rejected": -0.7024587988853455, "logps/chosen": -106.91017150878906, "logps/rejected": -127.06278228759766, "loss": 0.4807, "rewards/accuracies": 1.0, "rewards/chosen": 3.116162061691284, "rewards/margins": 1.0656471252441406, "rewards/rejected": 2.0505149364471436, "step": 4202 }, { "epoch": 0.68, "learning_rate": 9.430798676405732e-07, "logits/chosen": -1.0255686044692993, "logits/rejected": -0.9802693724632263, "logps/chosen": -127.8852310180664, "logps/rejected": -74.2330322265625, "loss": 0.236, "rewards/accuracies": 1.0, "rewards/chosen": 4.22808313369751, "rewards/margins": 0.8986594676971436, "rewards/rejected": 3.329423666000366, "step": 4203 }, { "epoch": 0.68, "learning_rate": 9.430189524495022e-07, "logits/chosen": -0.8045378923416138, "logits/rejected": -0.787354588508606, "logps/chosen": -203.7347412109375, "logps/rejected": -192.2421875, "loss": 0.3759, "rewards/accuracies": 0.0, "rewards/chosen": 5.151355266571045, "rewards/margins": -0.0261077880859375, "rewards/rejected": 5.177463054656982, "step": 4204 }, { "epoch": 0.68, "learning_rate": 9.429580066500138e-07, "logits/chosen": -0.42324554920196533, "logits/rejected": -0.4069548547267914, "logps/chosen": -36.11015701293945, "logps/rejected": -21.769502639770508, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 0.005191802978515625, "rewards/margins": -0.6385661959648132, "rewards/rejected": 0.6437579989433289, "step": 4205 }, { "epoch": 0.68, "learning_rate": 9.428970302463184e-07, "logits/chosen": -0.31132692098617554, "logits/rejected": -0.27745357155799866, "logps/chosen": -49.829654693603516, "logps/rejected": -7.290560722351074, "loss": 0.7991, "rewards/accuracies": 0.0, "rewards/chosen": 0.55877685546875, "rewards/margins": -0.19572937488555908, "rewards/rejected": 0.7545062303543091, "step": 4206 }, { "epoch": 0.68, "learning_rate": 9.428360232426289e-07, "logits/chosen": -0.52435302734375, "logits/rejected": -0.5961713790893555, "logps/chosen": -44.274349212646484, "logps/rejected": -41.9840087890625, "loss": 1.4835, "rewards/accuracies": 0.0, "rewards/chosen": 0.740753173828125, "rewards/margins": -1.4935500621795654, "rewards/rejected": 2.2343032360076904, "step": 4207 }, { "epoch": 0.68, "learning_rate": 9.427749856431603e-07, "logits/chosen": -0.561028778553009, "logits/rejected": -0.5537370443344116, "logps/chosen": -63.43023681640625, "logps/rejected": -89.94244384765625, "loss": 0.9431, "rewards/accuracies": 1.0, "rewards/chosen": 2.32441782951355, "rewards/margins": 0.44383764266967773, "rewards/rejected": 1.880580186843872, "step": 4208 }, { "epoch": 0.68, "learning_rate": 9.427139174521296e-07, "logits/chosen": -0.24791589379310608, "logits/rejected": -0.2021685540676117, "logps/chosen": -42.734344482421875, "logps/rejected": -112.97403717041016, "loss": 1.4999, "rewards/accuracies": 0.0, "rewards/chosen": 2.0028176307678223, "rewards/margins": -2.4697937965393066, "rewards/rejected": 4.472611427307129, "step": 4209 }, { "epoch": 0.68, "learning_rate": 9.426528186737565e-07, "logits/chosen": -1.0450184345245361, "logits/rejected": -0.9710652828216553, "logps/chosen": -114.68986511230469, "logps/rejected": -84.25921630859375, "loss": 0.7303, "rewards/accuracies": 0.0, "rewards/chosen": 1.3976181745529175, "rewards/margins": -1.1781593561172485, "rewards/rejected": 2.575777530670166, "step": 4210 }, { "epoch": 0.68, "learning_rate": 9.425916893122621e-07, "logits/chosen": -1.0046460628509521, "logits/rejected": -1.0329068899154663, "logps/chosen": -112.36488342285156, "logps/rejected": -35.88039016723633, "loss": 0.5332, "rewards/accuracies": 0.0, "rewards/chosen": 0.1802017241716385, "rewards/margins": -0.09983329474925995, "rewards/rejected": 0.28003501892089844, "step": 4211 }, { "epoch": 0.68, "learning_rate": 9.425305293718696e-07, "logits/chosen": -0.429545134305954, "logits/rejected": -0.4296957552433014, "logps/chosen": -74.6653823852539, "logps/rejected": -65.23448181152344, "loss": 0.7529, "rewards/accuracies": 0.0, "rewards/chosen": 1.8467857837677002, "rewards/margins": -0.8916275501251221, "rewards/rejected": 2.7384133338928223, "step": 4212 }, { "epoch": 0.68, "learning_rate": 9.424693388568048e-07, "logits/chosen": -0.7034047842025757, "logits/rejected": -0.6159101128578186, "logps/chosen": -64.19985961914062, "logps/rejected": -49.146575927734375, "loss": 0.5117, "rewards/accuracies": 1.0, "rewards/chosen": 2.2418534755706787, "rewards/margins": 0.629658579826355, "rewards/rejected": 1.6121948957443237, "step": 4213 }, { "epoch": 0.68, "learning_rate": 9.424081177712955e-07, "logits/chosen": -0.600995659828186, "logits/rejected": -0.5850239396095276, "logps/chosen": -64.44637298583984, "logps/rejected": -104.86914825439453, "loss": 1.0817, "rewards/accuracies": 1.0, "rewards/chosen": 1.780678629875183, "rewards/margins": 0.8650063276290894, "rewards/rejected": 0.9156723022460938, "step": 4214 }, { "epoch": 0.68, "learning_rate": 9.423468661195712e-07, "logits/chosen": -0.6991456747055054, "logits/rejected": -0.7521236538887024, "logps/chosen": -76.09231567382812, "logps/rejected": -107.40753173828125, "loss": 1.2627, "rewards/accuracies": 0.0, "rewards/chosen": 1.5433791875839233, "rewards/margins": -2.2250657081604004, "rewards/rejected": 3.768444776535034, "step": 4215 }, { "epoch": 0.68, "learning_rate": 9.422855839058641e-07, "logits/chosen": -0.7020348906517029, "logits/rejected": -0.6946772933006287, "logps/chosen": -27.974037170410156, "logps/rejected": -63.5391731262207, "loss": 0.2006, "rewards/accuracies": 1.0, "rewards/chosen": 1.7779461145401, "rewards/margins": 0.8224540948867798, "rewards/rejected": 0.9554920196533203, "step": 4216 }, { "epoch": 0.68, "learning_rate": 9.42224271134408e-07, "logits/chosen": -0.6340319514274597, "logits/rejected": -0.5638229250907898, "logps/chosen": -65.76509094238281, "logps/rejected": -25.037609100341797, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 1.9173774719238281, "rewards/margins": 1.6697040796279907, "rewards/rejected": 0.2476734220981598, "step": 4217 }, { "epoch": 0.68, "learning_rate": 9.421629278094391e-07, "logits/chosen": -0.41586732864379883, "logits/rejected": -0.4448539614677429, "logps/chosen": -56.536094665527344, "logps/rejected": -80.41215515136719, "loss": 0.6204, "rewards/accuracies": 0.0, "rewards/chosen": 1.1585838794708252, "rewards/margins": -0.012911200523376465, "rewards/rejected": 1.1714950799942017, "step": 4218 }, { "epoch": 0.68, "learning_rate": 9.42101553935196e-07, "logits/chosen": -0.7129813432693481, "logits/rejected": -0.7129813432693481, "logps/chosen": -40.09124755859375, "logps/rejected": -40.09124755859375, "loss": 0.6052, "rewards/accuracies": 0.0, "rewards/chosen": 1.5785518884658813, "rewards/margins": 0.0, "rewards/rejected": 1.5785518884658813, "step": 4219 }, { "epoch": 0.68, "learning_rate": 9.420401495159183e-07, "logits/chosen": 0.04967444762587547, "logits/rejected": 0.08235276490449905, "logps/chosen": -6.761810302734375, "logps/rejected": -11.865660667419434, "loss": 0.6405, "rewards/accuracies": 0.0, "rewards/chosen": 0.5224645733833313, "rewards/margins": -0.242059588432312, "rewards/rejected": 0.7645241618156433, "step": 4220 }, { "epoch": 0.69, "learning_rate": 9.419787145558491e-07, "logits/chosen": -0.1157510057091713, "logits/rejected": -0.1157510057091713, "logps/chosen": -63.77911376953125, "logps/rejected": -63.77911376953125, "loss": 0.4103, "rewards/accuracies": 0.0, "rewards/chosen": 0.5114235281944275, "rewards/margins": 0.0, "rewards/rejected": 0.5114235281944275, "step": 4221 }, { "epoch": 0.69, "learning_rate": 9.419172490592328e-07, "logits/chosen": -1.1528737545013428, "logits/rejected": -1.1495082378387451, "logps/chosen": -84.59790802001953, "logps/rejected": -65.63842010498047, "loss": 1.1766, "rewards/accuracies": 0.0, "rewards/chosen": 0.38361892104148865, "rewards/margins": -1.6946839094161987, "rewards/rejected": 2.0783028602600098, "step": 4222 }, { "epoch": 0.69, "learning_rate": 9.41855753030316e-07, "logits/chosen": -0.2815377712249756, "logits/rejected": -0.2665815055370331, "logps/chosen": -69.93497467041016, "logps/rejected": -75.1866226196289, "loss": 0.5906, "rewards/accuracies": 0.0, "rewards/chosen": 0.4855293333530426, "rewards/margins": -0.07842710614204407, "rewards/rejected": 0.5639564394950867, "step": 4223 }, { "epoch": 0.69, "learning_rate": 9.417942264733476e-07, "logits/chosen": -0.37490516901016235, "logits/rejected": -0.3476516008377075, "logps/chosen": -17.098363876342773, "logps/rejected": -6.282075881958008, "loss": 0.7325, "rewards/accuracies": 0.0, "rewards/chosen": 0.3664722442626953, "rewards/margins": -0.4056794047355652, "rewards/rejected": 0.7721516489982605, "step": 4224 }, { "epoch": 0.69, "learning_rate": 9.417326693925783e-07, "logits/chosen": -0.24762754142284393, "logits/rejected": -0.3476787805557251, "logps/chosen": -90.32672882080078, "logps/rejected": -105.01737976074219, "loss": 2.7955, "rewards/accuracies": 0.0, "rewards/chosen": 0.7775444388389587, "rewards/margins": -1.7531516551971436, "rewards/rejected": 2.530696153640747, "step": 4225 }, { "epoch": 0.69, "learning_rate": 9.416710817922614e-07, "logits/chosen": -0.5583885312080383, "logits/rejected": -0.48367783427238464, "logps/chosen": -90.97528839111328, "logps/rejected": -63.299110412597656, "loss": 1.6648, "rewards/accuracies": 0.0, "rewards/chosen": 1.3628562688827515, "rewards/margins": -0.014215946197509766, "rewards/rejected": 1.3770722150802612, "step": 4226 }, { "epoch": 0.69, "learning_rate": 9.416094636766519e-07, "logits/chosen": -0.2514791786670685, "logits/rejected": -0.23330138623714447, "logps/chosen": -77.19564819335938, "logps/rejected": -137.40737915039062, "loss": 1.0421, "rewards/accuracies": 1.0, "rewards/chosen": 0.6034652590751648, "rewards/margins": 0.2803970277309418, "rewards/rejected": 0.323068231344223, "step": 4227 }, { "epoch": 0.69, "learning_rate": 9.415478150500069e-07, "logits/chosen": -0.5226263403892517, "logits/rejected": -0.5262283086776733, "logps/chosen": -2.1918373107910156, "logps/rejected": -1.3271819353103638, "loss": 0.8812, "rewards/accuracies": 1.0, "rewards/chosen": 0.270770788192749, "rewards/margins": 0.0189608633518219, "rewards/rejected": 0.2518099248409271, "step": 4228 }, { "epoch": 0.69, "learning_rate": 9.414861359165858e-07, "logits/chosen": -1.0505200624465942, "logits/rejected": -1.0803887844085693, "logps/chosen": -238.80206298828125, "logps/rejected": -37.938026428222656, "loss": 0.0349, "rewards/accuracies": 1.0, "rewards/chosen": 4.704541206359863, "rewards/margins": 4.22603702545166, "rewards/rejected": 0.4785041809082031, "step": 4229 }, { "epoch": 0.69, "learning_rate": 9.414244262806501e-07, "logits/chosen": -0.7692509293556213, "logits/rejected": -0.6935677528381348, "logps/chosen": -162.28440856933594, "logps/rejected": -61.822872161865234, "loss": 0.2898, "rewards/accuracies": 1.0, "rewards/chosen": 2.2399659156799316, "rewards/margins": 0.49259912967681885, "rewards/rejected": 1.7473667860031128, "step": 4230 }, { "epoch": 0.69, "learning_rate": 9.413626861464634e-07, "logits/chosen": -0.45896148681640625, "logits/rejected": -0.4776919484138489, "logps/chosen": -50.89322280883789, "logps/rejected": -73.53036499023438, "loss": 0.7932, "rewards/accuracies": 0.0, "rewards/chosen": 1.2478207349777222, "rewards/margins": -1.2615970373153687, "rewards/rejected": 2.509417772293091, "step": 4231 }, { "epoch": 0.69, "learning_rate": 9.413009155182913e-07, "logits/chosen": -0.7355940937995911, "logits/rejected": -0.713650107383728, "logps/chosen": -72.9808349609375, "logps/rejected": -106.87074279785156, "loss": 0.6549, "rewards/accuracies": 1.0, "rewards/chosen": 1.5945533514022827, "rewards/margins": 0.7864051461219788, "rewards/rejected": 0.808148205280304, "step": 4232 }, { "epoch": 0.69, "learning_rate": 9.412391144004017e-07, "logits/chosen": -0.32525646686553955, "logits/rejected": -0.34390491247177124, "logps/chosen": -76.57713317871094, "logps/rejected": -132.3372039794922, "loss": 0.6342, "rewards/accuracies": 1.0, "rewards/chosen": 0.9921607971191406, "rewards/margins": 0.21543043851852417, "rewards/rejected": 0.7767303586006165, "step": 4233 }, { "epoch": 0.69, "learning_rate": 9.411772827970641e-07, "logits/chosen": -0.7718244194984436, "logits/rejected": -0.7769259810447693, "logps/chosen": -86.00242614746094, "logps/rejected": -141.42247009277344, "loss": 3.0268, "rewards/accuracies": 0.0, "rewards/chosen": 1.7149620056152344, "rewards/margins": -4.889235973358154, "rewards/rejected": 6.604197978973389, "step": 4234 }, { "epoch": 0.69, "learning_rate": 9.411154207125509e-07, "logits/chosen": -0.26390963792800903, "logits/rejected": -0.24226832389831543, "logps/chosen": -95.82059478759766, "logps/rejected": -61.14774703979492, "loss": 0.3704, "rewards/accuracies": 1.0, "rewards/chosen": 1.943762183189392, "rewards/margins": 1.1286861896514893, "rewards/rejected": 0.8150760531425476, "step": 4235 }, { "epoch": 0.69, "learning_rate": 9.410535281511358e-07, "logits/chosen": -0.5891399383544922, "logits/rejected": -0.6711212396621704, "logps/chosen": -116.64136505126953, "logps/rejected": -97.88613891601562, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 2.4686546325683594, "rewards/margins": -0.23610234260559082, "rewards/rejected": 2.70475697517395, "step": 4236 }, { "epoch": 0.69, "learning_rate": 9.409916051170954e-07, "logits/chosen": -0.7961018085479736, "logits/rejected": -0.7773359417915344, "logps/chosen": -137.5112762451172, "logps/rejected": -103.89399719238281, "loss": 0.2741, "rewards/accuracies": 1.0, "rewards/chosen": 1.7063690423965454, "rewards/margins": 0.3868980407714844, "rewards/rejected": 1.319471001625061, "step": 4237 }, { "epoch": 0.69, "learning_rate": 9.409296516147077e-07, "logits/chosen": -0.24754954874515533, "logits/rejected": -0.24754954874515533, "logps/chosen": -3.3671910762786865, "logps/rejected": -3.3671910762786865, "loss": 0.6069, "rewards/accuracies": 0.0, "rewards/chosen": 0.2589676082134247, "rewards/margins": 0.0, "rewards/rejected": 0.2589676082134247, "step": 4238 }, { "epoch": 0.69, "learning_rate": 9.408676676482533e-07, "logits/chosen": -0.6795094609260559, "logits/rejected": -0.7660223245620728, "logps/chosen": -47.6007080078125, "logps/rejected": -150.18133544921875, "loss": 3.1497, "rewards/accuracies": 0.0, "rewards/chosen": 0.8525974154472351, "rewards/margins": -3.665910005569458, "rewards/rejected": 4.518507480621338, "step": 4239 }, { "epoch": 0.69, "learning_rate": 9.408056532220143e-07, "logits/chosen": -0.8330383896827698, "logits/rejected": -0.7107995748519897, "logps/chosen": -111.82781219482422, "logps/rejected": -92.01753234863281, "loss": 0.2564, "rewards/accuracies": 1.0, "rewards/chosen": 4.299510955810547, "rewards/margins": 2.7278099060058594, "rewards/rejected": 1.5717010498046875, "step": 4240 }, { "epoch": 0.69, "learning_rate": 9.407436083402758e-07, "logits/chosen": -0.35091161727905273, "logits/rejected": -0.3524511456489563, "logps/chosen": -20.347820281982422, "logps/rejected": -6.758059978485107, "loss": 0.7288, "rewards/accuracies": 0.0, "rewards/chosen": -0.02728881873190403, "rewards/margins": -0.16859184205532074, "rewards/rejected": 0.14130301773548126, "step": 4241 }, { "epoch": 0.69, "learning_rate": 9.406815330073244e-07, "logits/chosen": -1.0241554975509644, "logits/rejected": -0.8740309476852417, "logps/chosen": -120.7223892211914, "logps/rejected": -32.48411560058594, "loss": 0.3425, "rewards/accuracies": 1.0, "rewards/chosen": 0.6650421023368835, "rewards/margins": 0.42719417810440063, "rewards/rejected": 0.23784790933132172, "step": 4242 }, { "epoch": 0.69, "learning_rate": 9.406194272274488e-07, "logits/chosen": -0.3389248251914978, "logits/rejected": -0.1169319599866867, "logps/chosen": -72.17117309570312, "logps/rejected": -68.98858642578125, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 3.859081983566284, "rewards/margins": 1.652003288269043, "rewards/rejected": 2.207078695297241, "step": 4243 }, { "epoch": 0.69, "learning_rate": 9.405572910049397e-07, "logits/chosen": -0.2877078950405121, "logits/rejected": -0.2877078950405121, "logps/chosen": -56.180206298828125, "logps/rejected": -56.180206298828125, "loss": 0.6223, "rewards/accuracies": 0.0, "rewards/chosen": 2.3955628871917725, "rewards/margins": 0.0, "rewards/rejected": 2.3955628871917725, "step": 4244 }, { "epoch": 0.69, "learning_rate": 9.404951243440908e-07, "logits/chosen": -0.902759850025177, "logits/rejected": -0.8955175280570984, "logps/chosen": -85.98918151855469, "logps/rejected": -133.2559356689453, "loss": 0.7988, "rewards/accuracies": 0.0, "rewards/chosen": 4.507382392883301, "rewards/margins": -1.3040494918823242, "rewards/rejected": 5.811431884765625, "step": 4245 }, { "epoch": 0.69, "learning_rate": 9.404329272491965e-07, "logits/chosen": -0.8018043041229248, "logits/rejected": -0.8051937222480774, "logps/chosen": -71.68592834472656, "logps/rejected": -16.390792846679688, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 3.920146942138672, "rewards/margins": 3.286557197570801, "rewards/rejected": 0.6335897445678711, "step": 4246 }, { "epoch": 0.69, "learning_rate": 9.403706997245544e-07, "logits/chosen": -0.5274531841278076, "logits/rejected": -0.41097113490104675, "logps/chosen": -76.53073120117188, "logps/rejected": -86.1500244140625, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 2.0664658546447754, "rewards/margins": 1.1634438037872314, "rewards/rejected": 0.9030219912528992, "step": 4247 }, { "epoch": 0.69, "learning_rate": 9.403084417744639e-07, "logits/chosen": -0.846714437007904, "logits/rejected": -0.8622156977653503, "logps/chosen": -82.7758560180664, "logps/rejected": -108.04054260253906, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 3.5911171436309814, "rewards/margins": 0.48415446281433105, "rewards/rejected": 3.1069626808166504, "step": 4248 }, { "epoch": 0.69, "learning_rate": 9.402461534032263e-07, "logits/chosen": -0.6559526920318604, "logits/rejected": -0.6447555422782898, "logps/chosen": -76.48567199707031, "logps/rejected": -75.11078643798828, "loss": 0.7664, "rewards/accuracies": 0.0, "rewards/chosen": 2.6380441188812256, "rewards/margins": -0.17873001098632812, "rewards/rejected": 2.8167741298675537, "step": 4249 }, { "epoch": 0.69, "learning_rate": 9.401838346151449e-07, "logits/chosen": -0.4063740372657776, "logits/rejected": -0.4296194016933441, "logps/chosen": -50.38255310058594, "logps/rejected": -115.58619689941406, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.8973770141601562, "rewards/margins": 0.2439819574356079, "rewards/rejected": 1.6533950567245483, "step": 4250 }, { "epoch": 0.69, "learning_rate": 9.401214854145259e-07, "logits/chosen": -0.7706952691078186, "logits/rejected": -0.5781722068786621, "logps/chosen": -68.18284606933594, "logps/rejected": -170.55484008789062, "loss": 0.4928, "rewards/accuracies": 1.0, "rewards/chosen": 2.178802490234375, "rewards/margins": 0.4373321533203125, "rewards/rejected": 1.7414703369140625, "step": 4251 }, { "epoch": 0.69, "learning_rate": 9.400591058056766e-07, "logits/chosen": -0.1298139989376068, "logits/rejected": -0.1298139989376068, "logps/chosen": -74.72061920166016, "logps/rejected": -74.72061920166016, "loss": 1.2327, "rewards/accuracies": 0.0, "rewards/chosen": 0.2110191434621811, "rewards/margins": 0.0, "rewards/rejected": 0.2110191434621811, "step": 4252 }, { "epoch": 0.69, "learning_rate": 9.399966957929069e-07, "logits/chosen": -0.6049283742904663, "logits/rejected": -0.6160023212432861, "logps/chosen": -58.4370231628418, "logps/rejected": -46.23323059082031, "loss": 1.1703, "rewards/accuracies": 0.0, "rewards/chosen": 0.5921863913536072, "rewards/margins": -1.3500049114227295, "rewards/rejected": 1.9421913623809814, "step": 4253 }, { "epoch": 0.69, "learning_rate": 9.399342553805289e-07, "logits/chosen": -0.5943347215652466, "logits/rejected": -0.5610462427139282, "logps/chosen": -44.23168182373047, "logps/rejected": -55.063575744628906, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 1.3026542663574219, "rewards/margins": 0.37765729427337646, "rewards/rejected": 0.9249969720840454, "step": 4254 }, { "epoch": 0.69, "learning_rate": 9.398717845728566e-07, "logits/chosen": -0.3785579800605774, "logits/rejected": -0.37964892387390137, "logps/chosen": -67.27423095703125, "logps/rejected": -43.6241340637207, "loss": 0.7563, "rewards/accuracies": 0.0, "rewards/chosen": 0.7201889157295227, "rewards/margins": -1.2604224681854248, "rewards/rejected": 1.9806114435195923, "step": 4255 }, { "epoch": 0.69, "learning_rate": 9.398092833742058e-07, "logits/chosen": -0.5379986763000488, "logits/rejected": -0.6006803512573242, "logps/chosen": -62.076416015625, "logps/rejected": -94.06466674804688, "loss": 1.0366, "rewards/accuracies": 0.0, "rewards/chosen": 1.9055877923965454, "rewards/margins": -1.8340119123458862, "rewards/rejected": 3.7395997047424316, "step": 4256 }, { "epoch": 0.69, "learning_rate": 9.397467517888953e-07, "logits/chosen": -0.3968435227870941, "logits/rejected": -0.420939564704895, "logps/chosen": -36.487403869628906, "logps/rejected": -81.60377502441406, "loss": 0.8982, "rewards/accuracies": 0.0, "rewards/chosen": 1.2582988739013672, "rewards/margins": -0.8568494319915771, "rewards/rejected": 2.1151483058929443, "step": 4257 }, { "epoch": 0.69, "learning_rate": 9.39684189821245e-07, "logits/chosen": -0.29592758417129517, "logits/rejected": -0.29620200395584106, "logps/chosen": -100.82371520996094, "logps/rejected": -94.82847595214844, "loss": 0.5494, "rewards/accuracies": 0.0, "rewards/chosen": 0.7727310061454773, "rewards/margins": -0.5666877627372742, "rewards/rejected": 1.3394187688827515, "step": 4258 }, { "epoch": 0.69, "learning_rate": 9.396215974755777e-07, "logits/chosen": -0.15634050965309143, "logits/rejected": -0.16133782267570496, "logps/chosen": -4.245631217956543, "logps/rejected": -0.9482298493385315, "loss": 0.6606, "rewards/accuracies": 0.0, "rewards/chosen": 0.14638057351112366, "rewards/margins": -0.04664206504821777, "rewards/rejected": 0.19302263855934143, "step": 4259 }, { "epoch": 0.69, "learning_rate": 9.395589747562177e-07, "logits/chosen": -0.2525080740451813, "logits/rejected": -0.2525080740451813, "logps/chosen": -17.328540802001953, "logps/rejected": -17.328540802001953, "loss": 0.3636, "rewards/accuracies": 0.0, "rewards/chosen": 0.061127472668886185, "rewards/margins": 0.0, "rewards/rejected": 0.061127472668886185, "step": 4260 }, { "epoch": 0.69, "learning_rate": 9.394963216674917e-07, "logits/chosen": -0.4331844747066498, "logits/rejected": -0.30966824293136597, "logps/chosen": -99.74378967285156, "logps/rejected": -15.309416770935059, "loss": 0.1376, "rewards/accuracies": 1.0, "rewards/chosen": 1.9597259759902954, "rewards/margins": 1.67716383934021, "rewards/rejected": 0.28256216645240784, "step": 4261 }, { "epoch": 0.69, "learning_rate": 9.394336382137284e-07, "logits/chosen": -0.8692618608474731, "logits/rejected": -0.7573243975639343, "logps/chosen": -211.7113494873047, "logps/rejected": -13.247673034667969, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 4.16732931137085, "rewards/margins": 3.5859649181365967, "rewards/rejected": 0.5813644528388977, "step": 4262 }, { "epoch": 0.69, "learning_rate": 9.393709243992587e-07, "logits/chosen": -0.4798763394355774, "logits/rejected": -0.3952469527721405, "logps/chosen": -211.18536376953125, "logps/rejected": -114.70492553710938, "loss": 2.1445, "rewards/accuracies": 0.0, "rewards/chosen": 1.0340973138809204, "rewards/margins": -4.212251663208008, "rewards/rejected": 5.246348857879639, "step": 4263 }, { "epoch": 0.69, "learning_rate": 9.393081802284153e-07, "logits/chosen": -0.4527646601200104, "logits/rejected": -0.4930263161659241, "logps/chosen": -111.45174407958984, "logps/rejected": -93.41433715820312, "loss": 0.4462, "rewards/accuracies": 0.0, "rewards/chosen": 0.8930404782295227, "rewards/margins": -0.3356773257255554, "rewards/rejected": 1.2287178039550781, "step": 4264 }, { "epoch": 0.69, "learning_rate": 9.392454057055337e-07, "logits/chosen": -0.9163420796394348, "logits/rejected": -0.9307947158813477, "logps/chosen": -93.22657775878906, "logps/rejected": -91.39598083496094, "loss": 0.4503, "rewards/accuracies": 1.0, "rewards/chosen": 0.4130721986293793, "rewards/margins": 0.414581298828125, "rewards/rejected": -0.0015090942615643144, "step": 4265 }, { "epoch": 0.69, "learning_rate": 9.391826008349506e-07, "logits/chosen": -0.6737143397331238, "logits/rejected": -0.6174978017807007, "logps/chosen": -78.48270416259766, "logps/rejected": -53.145263671875, "loss": 0.7372, "rewards/accuracies": 1.0, "rewards/chosen": 2.856609344482422, "rewards/margins": 0.6199285984039307, "rewards/rejected": 2.236680746078491, "step": 4266 }, { "epoch": 0.69, "learning_rate": 9.391197656210052e-07, "logits/chosen": -0.771083652973175, "logits/rejected": -0.7065130472183228, "logps/chosen": -79.88662719726562, "logps/rejected": -96.47945404052734, "loss": 0.9056, "rewards/accuracies": 0.0, "rewards/chosen": 1.573126196861267, "rewards/margins": -1.3275467157363892, "rewards/rejected": 2.9006729125976562, "step": 4267 }, { "epoch": 0.69, "learning_rate": 9.390569000680393e-07, "logits/chosen": -0.45933765172958374, "logits/rejected": -0.4654853641986847, "logps/chosen": -30.14654541015625, "logps/rejected": -7.470460891723633, "loss": 1.0319, "rewards/accuracies": 1.0, "rewards/chosen": 1.1663410663604736, "rewards/margins": 0.5433712601661682, "rewards/rejected": 0.6229698061943054, "step": 4268 }, { "epoch": 0.69, "learning_rate": 9.389940041803959e-07, "logits/chosen": -0.2738039493560791, "logits/rejected": -0.3039967119693756, "logps/chosen": -15.378090858459473, "logps/rejected": -29.6241512298584, "loss": 0.9602, "rewards/accuracies": 0.0, "rewards/chosen": 0.7942517399787903, "rewards/margins": -0.06061220169067383, "rewards/rejected": 0.8548639416694641, "step": 4269 }, { "epoch": 0.69, "learning_rate": 9.389310779624204e-07, "logits/chosen": -0.6403490304946899, "logits/rejected": -0.4922638535499573, "logps/chosen": -73.20856475830078, "logps/rejected": -38.68191146850586, "loss": 0.2133, "rewards/accuracies": 1.0, "rewards/chosen": 2.2328758239746094, "rewards/margins": 0.8817447423934937, "rewards/rejected": 1.3511310815811157, "step": 4270 }, { "epoch": 0.69, "learning_rate": 9.388681214184609e-07, "logits/chosen": -0.5854862332344055, "logits/rejected": -0.6743075847625732, "logps/chosen": -180.56007385253906, "logps/rejected": -83.98905944824219, "loss": 1.4373, "rewards/accuracies": 1.0, "rewards/chosen": 4.922177314758301, "rewards/margins": 0.24751901626586914, "rewards/rejected": 4.674658298492432, "step": 4271 }, { "epoch": 0.69, "learning_rate": 9.388051345528667e-07, "logits/chosen": -0.4637202322483063, "logits/rejected": -0.45837903022766113, "logps/chosen": -47.21403503417969, "logps/rejected": -53.31372833251953, "loss": 1.4867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8747844696044922, "rewards/margins": 0.13243675231933594, "rewards/rejected": 0.7423477172851562, "step": 4272 }, { "epoch": 0.69, "learning_rate": 9.387421173699896e-07, "logits/chosen": -0.629591703414917, "logits/rejected": -0.5418626666069031, "logps/chosen": -75.8063735961914, "logps/rejected": -16.182170867919922, "loss": 1.8229, "rewards/accuracies": 1.0, "rewards/chosen": 1.630113959312439, "rewards/margins": 1.0838470458984375, "rewards/rejected": 0.5462669730186462, "step": 4273 }, { "epoch": 0.69, "learning_rate": 9.386790698741837e-07, "logits/chosen": -0.3886849284172058, "logits/rejected": -0.3886849284172058, "logps/chosen": -40.28496551513672, "logps/rejected": -40.28496551513672, "loss": 0.3549, "rewards/accuracies": 0.0, "rewards/chosen": 1.292951226234436, "rewards/margins": 0.0, "rewards/rejected": 1.292951226234436, "step": 4274 }, { "epoch": 0.69, "learning_rate": 9.38615992069805e-07, "logits/chosen": -0.6605362296104431, "logits/rejected": -0.7636741399765015, "logps/chosen": -159.80160522460938, "logps/rejected": -147.32513427734375, "loss": 2.7474, "rewards/accuracies": 0.0, "rewards/chosen": 0.2349899262189865, "rewards/margins": -5.009953022003174, "rewards/rejected": 5.244943141937256, "step": 4275 }, { "epoch": 0.69, "learning_rate": 9.385528839612114e-07, "logits/chosen": -0.813766360282898, "logits/rejected": -0.7737908959388733, "logps/chosen": -47.02246856689453, "logps/rejected": -77.02203369140625, "loss": 0.4372, "rewards/accuracies": 1.0, "rewards/chosen": 2.825631856918335, "rewards/margins": 0.23924732208251953, "rewards/rejected": 2.5863845348358154, "step": 4276 }, { "epoch": 0.69, "learning_rate": 9.384897455527632e-07, "logits/chosen": -0.3319351077079773, "logits/rejected": -0.24517694115638733, "logps/chosen": -51.30031967163086, "logps/rejected": -14.84127426147461, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 2.1783206462860107, "rewards/margins": 2.097522497177124, "rewards/rejected": 0.08079814910888672, "step": 4277 }, { "epoch": 0.69, "learning_rate": 9.384265768488224e-07, "logits/chosen": -0.4828280806541443, "logits/rejected": -0.39622047543525696, "logps/chosen": -148.15289306640625, "logps/rejected": -69.43173217773438, "loss": 1.1972, "rewards/accuracies": 1.0, "rewards/chosen": 1.3198364973068237, "rewards/margins": 0.13826298713684082, "rewards/rejected": 1.181573510169983, "step": 4278 }, { "epoch": 0.69, "learning_rate": 9.383633778537539e-07, "logits/chosen": -0.4531274437904358, "logits/rejected": -0.4114178419113159, "logps/chosen": -76.83915710449219, "logps/rejected": -84.55982971191406, "loss": 0.4473, "rewards/accuracies": 0.0, "rewards/chosen": 0.797808825969696, "rewards/margins": -0.13187485933303833, "rewards/rejected": 0.9296836853027344, "step": 4279 }, { "epoch": 0.69, "learning_rate": 9.383001485719235e-07, "logits/chosen": -1.0285202264785767, "logits/rejected": -1.0728434324264526, "logps/chosen": -293.001953125, "logps/rejected": -155.255615234375, "loss": 1.1866, "rewards/accuracies": 0.0, "rewards/chosen": 2.9514801502227783, "rewards/margins": -1.3783724308013916, "rewards/rejected": 4.32985258102417, "step": 4280 }, { "epoch": 0.69, "learning_rate": 9.382368890077002e-07, "logits/chosen": -0.5575268268585205, "logits/rejected": -0.38324815034866333, "logps/chosen": -195.88821411132812, "logps/rejected": -122.30929565429688, "loss": 1.5312, "rewards/accuracies": 1.0, "rewards/chosen": 4.888571262359619, "rewards/margins": 1.5424485206604004, "rewards/rejected": 3.3461227416992188, "step": 4281 }, { "epoch": 0.7, "learning_rate": 9.381735991654546e-07, "logits/chosen": -0.3745037615299225, "logits/rejected": -0.44552677869796753, "logps/chosen": -18.35468101501465, "logps/rejected": -76.10404968261719, "loss": 1.021, "rewards/accuracies": 0.0, "rewards/chosen": 0.9481161236763, "rewards/margins": -1.6337313652038574, "rewards/rejected": 2.5818474292755127, "step": 4282 }, { "epoch": 0.7, "learning_rate": 9.381102790495592e-07, "logits/chosen": -0.7097358107566833, "logits/rejected": -0.7004063129425049, "logps/chosen": -128.57501220703125, "logps/rejected": -52.16991424560547, "loss": 0.6625, "rewards/accuracies": 0.0, "rewards/chosen": 0.5585876703262329, "rewards/margins": -0.904167890548706, "rewards/rejected": 1.462755560874939, "step": 4283 }, { "epoch": 0.7, "learning_rate": 9.380469286643891e-07, "logits/chosen": -0.7300475239753723, "logits/rejected": -0.629126787185669, "logps/chosen": -105.2144775390625, "logps/rejected": -70.9880599975586, "loss": 0.4304, "rewards/accuracies": 0.0, "rewards/chosen": 1.050850749015808, "rewards/margins": -0.20960462093353271, "rewards/rejected": 1.2604553699493408, "step": 4284 }, { "epoch": 0.7, "learning_rate": 9.379835480143209e-07, "logits/chosen": -0.9726497530937195, "logits/rejected": -0.9628008008003235, "logps/chosen": -83.09507751464844, "logps/rejected": -40.325687408447266, "loss": 0.4025, "rewards/accuracies": 0.0, "rewards/chosen": 0.7105072140693665, "rewards/margins": -0.09519577026367188, "rewards/rejected": 0.8057029843330383, "step": 4285 }, { "epoch": 0.7, "learning_rate": 9.379201371037338e-07, "logits/chosen": -0.38757601380348206, "logits/rejected": -0.36558955907821655, "logps/chosen": -112.98301696777344, "logps/rejected": -147.244873046875, "loss": 1.2313, "rewards/accuracies": 1.0, "rewards/chosen": 1.8344100713729858, "rewards/margins": 0.8409133553504944, "rewards/rejected": 0.9934967160224915, "step": 4286 }, { "epoch": 0.7, "learning_rate": 9.378566959370089e-07, "logits/chosen": -0.5572156310081482, "logits/rejected": -0.45222899317741394, "logps/chosen": -98.1621322631836, "logps/rejected": -84.46990966796875, "loss": 1.3014, "rewards/accuracies": 1.0, "rewards/chosen": 3.057760000228882, "rewards/margins": 1.4178979396820068, "rewards/rejected": 1.639862060546875, "step": 4287 }, { "epoch": 0.7, "learning_rate": 9.377932245185294e-07, "logits/chosen": -0.4205343723297119, "logits/rejected": -0.46743154525756836, "logps/chosen": -146.57321166992188, "logps/rejected": -129.82691955566406, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 5.464561462402344, "rewards/margins": 0.476531982421875, "rewards/rejected": 4.988029479980469, "step": 4288 }, { "epoch": 0.7, "learning_rate": 9.377297228526805e-07, "logits/chosen": -0.6930266618728638, "logits/rejected": -0.5100476145744324, "logps/chosen": -106.24482727050781, "logps/rejected": -75.61824798583984, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 3.6755921840667725, "rewards/margins": 1.272261142730713, "rewards/rejected": 2.4033310413360596, "step": 4289 }, { "epoch": 0.7, "learning_rate": 9.376661909438494e-07, "logits/chosen": -0.6969194412231445, "logits/rejected": -0.6483510732650757, "logps/chosen": -62.271766662597656, "logps/rejected": -63.598690032958984, "loss": 0.5677, "rewards/accuracies": 0.0, "rewards/chosen": 1.7125953435897827, "rewards/margins": -0.22065699100494385, "rewards/rejected": 1.9332523345947266, "step": 4290 }, { "epoch": 0.7, "learning_rate": 9.37602628796426e-07, "logits/chosen": -0.7937602400779724, "logits/rejected": -0.8475335836410522, "logps/chosen": -148.9658966064453, "logps/rejected": -120.19099426269531, "loss": 0.8346, "rewards/accuracies": 0.0, "rewards/chosen": 5.917851448059082, "rewards/margins": -0.1569061279296875, "rewards/rejected": 6.0747575759887695, "step": 4291 }, { "epoch": 0.7, "learning_rate": 9.375390364148015e-07, "logits/chosen": -0.613617479801178, "logits/rejected": -0.43518489599227905, "logps/chosen": -86.57756042480469, "logps/rejected": -19.40224838256836, "loss": 0.2316, "rewards/accuracies": 1.0, "rewards/chosen": 2.921891927719116, "rewards/margins": 1.5834126472473145, "rewards/rejected": 1.3384792804718018, "step": 4292 }, { "epoch": 0.7, "learning_rate": 9.374754138033695e-07, "logits/chosen": -0.6618756651878357, "logits/rejected": -0.341369092464447, "logps/chosen": -188.06381225585938, "logps/rejected": -90.89071655273438, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 7.073464870452881, "rewards/margins": 4.1920294761657715, "rewards/rejected": 2.8814353942871094, "step": 4293 }, { "epoch": 0.7, "learning_rate": 9.374117609665261e-07, "logits/chosen": -0.4846075475215912, "logits/rejected": -0.479855477809906, "logps/chosen": -81.7807846069336, "logps/rejected": -69.43023681640625, "loss": 0.9347, "rewards/accuracies": 0.0, "rewards/chosen": 0.9292404055595398, "rewards/margins": -0.257179319858551, "rewards/rejected": 1.1864197254180908, "step": 4294 }, { "epoch": 0.7, "learning_rate": 9.373480779086687e-07, "logits/chosen": -0.5740000009536743, "logits/rejected": -0.5654948949813843, "logps/chosen": -138.7555694580078, "logps/rejected": -71.14324951171875, "loss": 1.4604, "rewards/accuracies": 0.0, "rewards/chosen": 3.8780274391174316, "rewards/margins": -0.5699095726013184, "rewards/rejected": 4.44793701171875, "step": 4295 }, { "epoch": 0.7, "learning_rate": 9.372843646341972e-07, "logits/chosen": -0.7952702641487122, "logits/rejected": -0.7402437329292297, "logps/chosen": -150.606201171875, "logps/rejected": -40.74491500854492, "loss": 0.6443, "rewards/accuracies": 1.0, "rewards/chosen": 3.6655242443084717, "rewards/margins": 1.749111533164978, "rewards/rejected": 1.9164127111434937, "step": 4296 }, { "epoch": 0.7, "learning_rate": 9.37220621147514e-07, "logits/chosen": -0.8125397562980652, "logits/rejected": -0.8655152916908264, "logps/chosen": -78.38239288330078, "logps/rejected": -175.8064422607422, "loss": 1.3194, "rewards/accuracies": 0.0, "rewards/chosen": 1.0219535827636719, "rewards/margins": -1.2833092212677002, "rewards/rejected": 2.305262804031372, "step": 4297 }, { "epoch": 0.7, "learning_rate": 9.371568474530227e-07, "logits/chosen": -0.21559903025627136, "logits/rejected": -0.2880522906780243, "logps/chosen": -79.64810180664062, "logps/rejected": -107.40669250488281, "loss": 1.7419, "rewards/accuracies": 0.0, "rewards/chosen": 1.4999443292617798, "rewards/margins": -3.3882226943969727, "rewards/rejected": 4.888166904449463, "step": 4298 }, { "epoch": 0.7, "learning_rate": 9.370930435551298e-07, "logits/chosen": -1.041811466217041, "logits/rejected": -1.0384892225265503, "logps/chosen": -184.34014892578125, "logps/rejected": -60.523338317871094, "loss": 0.7084, "rewards/accuracies": 1.0, "rewards/chosen": 4.341836452484131, "rewards/margins": 1.921431541442871, "rewards/rejected": 2.4204049110412598, "step": 4299 }, { "epoch": 0.7, "learning_rate": 9.370292094582433e-07, "logits/chosen": -0.2496623396873474, "logits/rejected": -0.23386749625205994, "logps/chosen": -53.47140121459961, "logps/rejected": -86.9283218383789, "loss": 0.5785, "rewards/accuracies": 0.0, "rewards/chosen": -0.10298042744398117, "rewards/margins": -0.4605617821216583, "rewards/rejected": 0.35758134722709656, "step": 4300 }, { "epoch": 0.7, "learning_rate": 9.369653451667736e-07, "logits/chosen": -0.5754032731056213, "logits/rejected": -0.5602530241012573, "logps/chosen": -101.3914794921875, "logps/rejected": -38.06138610839844, "loss": 0.1814, "rewards/accuracies": 1.0, "rewards/chosen": 4.271111965179443, "rewards/margins": 1.9960246086120605, "rewards/rejected": 2.275087356567383, "step": 4301 }, { "epoch": 0.7, "learning_rate": 9.369014506851332e-07, "logits/chosen": -0.5933612585067749, "logits/rejected": -0.520753026008606, "logps/chosen": -99.84590911865234, "logps/rejected": -78.98857116699219, "loss": 0.3453, "rewards/accuracies": 1.0, "rewards/chosen": 4.6788859367370605, "rewards/margins": 1.2568690776824951, "rewards/rejected": 3.4220168590545654, "step": 4302 }, { "epoch": 0.7, "learning_rate": 9.368375260177366e-07, "logits/chosen": -0.5483176708221436, "logits/rejected": -0.5193665623664856, "logps/chosen": -52.535865783691406, "logps/rejected": -70.73165893554688, "loss": 0.5125, "rewards/accuracies": 1.0, "rewards/chosen": 1.28044593334198, "rewards/margins": 0.14643096923828125, "rewards/rejected": 1.1340149641036987, "step": 4303 }, { "epoch": 0.7, "learning_rate": 9.367735711690004e-07, "logits/chosen": -0.987282931804657, "logits/rejected": -0.9253027439117432, "logps/chosen": -74.13232421875, "logps/rejected": -82.44070434570312, "loss": 2.8582, "rewards/accuracies": 0.0, "rewards/chosen": 2.067486524581909, "rewards/margins": -1.3731727600097656, "rewards/rejected": 3.440659284591675, "step": 4304 }, { "epoch": 0.7, "learning_rate": 9.367095861433432e-07, "logits/chosen": -0.3490636944770813, "logits/rejected": -0.3947473466396332, "logps/chosen": -56.6563720703125, "logps/rejected": -79.97787475585938, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 1.1936005353927612, "rewards/margins": 0.9881439805030823, "rewards/rejected": 0.20545653998851776, "step": 4305 }, { "epoch": 0.7, "learning_rate": 9.366455709451856e-07, "logits/chosen": -0.39720478653907776, "logits/rejected": -0.47455909848213196, "logps/chosen": -109.7080078125, "logps/rejected": -135.35708618164062, "loss": 1.1266, "rewards/accuracies": 0.0, "rewards/chosen": 2.299938917160034, "rewards/margins": -1.3328933715820312, "rewards/rejected": 3.6328322887420654, "step": 4306 }, { "epoch": 0.7, "learning_rate": 9.365815255789507e-07, "logits/chosen": -0.824140191078186, "logits/rejected": -0.817213237285614, "logps/chosen": -46.95219421386719, "logps/rejected": -48.98165512084961, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 2.0004501342773438, "rewards/margins": 1.0094902515411377, "rewards/rejected": 0.9909599423408508, "step": 4307 }, { "epoch": 0.7, "learning_rate": 9.365174500490634e-07, "logits/chosen": -0.46518993377685547, "logits/rejected": -0.37281084060668945, "logps/chosen": -72.61579895019531, "logps/rejected": -43.03802490234375, "loss": 0.4155, "rewards/accuracies": 0.0, "rewards/chosen": 2.028573751449585, "rewards/margins": -0.03236722946166992, "rewards/rejected": 2.060940980911255, "step": 4308 }, { "epoch": 0.7, "learning_rate": 9.364533443599507e-07, "logits/chosen": -0.46182680130004883, "logits/rejected": -0.46182680130004883, "logps/chosen": -56.40791702270508, "logps/rejected": -56.40791702270508, "loss": 0.9007, "rewards/accuracies": 0.0, "rewards/chosen": 1.173003077507019, "rewards/margins": 0.0, "rewards/rejected": 1.173003077507019, "step": 4309 }, { "epoch": 0.7, "learning_rate": 9.363892085160417e-07, "logits/chosen": -0.456640362739563, "logits/rejected": -0.45909348130226135, "logps/chosen": -66.20722961425781, "logps/rejected": -176.99282836914062, "loss": 1.0231, "rewards/accuracies": 0.0, "rewards/chosen": 3.6856491565704346, "rewards/margins": -0.9777357578277588, "rewards/rejected": 4.663384914398193, "step": 4310 }, { "epoch": 0.7, "learning_rate": 9.363250425217675e-07, "logits/chosen": -0.5777485966682434, "logits/rejected": -0.596296489238739, "logps/chosen": -87.00984954833984, "logps/rejected": -171.7552490234375, "loss": 0.6015, "rewards/accuracies": 1.0, "rewards/chosen": 1.0690406560897827, "rewards/margins": 0.50674968957901, "rewards/rejected": 0.5622909665107727, "step": 4311 }, { "epoch": 0.7, "learning_rate": 9.362608463815613e-07, "logits/chosen": -0.4975818693637848, "logits/rejected": -0.451671838760376, "logps/chosen": -62.75212097167969, "logps/rejected": -51.94133758544922, "loss": 0.5746, "rewards/accuracies": 1.0, "rewards/chosen": 1.527058482170105, "rewards/margins": 0.5483574271202087, "rewards/rejected": 0.9787010550498962, "step": 4312 }, { "epoch": 0.7, "learning_rate": 9.361966200998586e-07, "logits/chosen": -0.34854957461357117, "logits/rejected": -0.34854957461357117, "logps/chosen": -80.17967987060547, "logps/rejected": -80.17967987060547, "loss": 1.3637, "rewards/accuracies": 0.0, "rewards/chosen": 2.1861443519592285, "rewards/margins": 0.0, "rewards/rejected": 2.1861443519592285, "step": 4313 }, { "epoch": 0.7, "learning_rate": 9.36132363681097e-07, "logits/chosen": -0.7775280475616455, "logits/rejected": -0.7076764702796936, "logps/chosen": -83.4873046875, "logps/rejected": -75.69233703613281, "loss": 1.6635, "rewards/accuracies": 0.0, "rewards/chosen": 1.283606767654419, "rewards/margins": -1.170382022857666, "rewards/rejected": 2.453988790512085, "step": 4314 }, { "epoch": 0.7, "learning_rate": 9.360680771297154e-07, "logits/chosen": -0.5181690454483032, "logits/rejected": -0.514435350894928, "logps/chosen": -47.99837112426758, "logps/rejected": -120.29689025878906, "loss": 1.4496, "rewards/accuracies": 0.0, "rewards/chosen": 1.665910005569458, "rewards/margins": -2.7402255535125732, "rewards/rejected": 4.406135559082031, "step": 4315 }, { "epoch": 0.7, "learning_rate": 9.360037604501561e-07, "logits/chosen": -0.6911210417747498, "logits/rejected": -0.6419315338134766, "logps/chosen": -46.31504440307617, "logps/rejected": -35.26112747192383, "loss": 0.4595, "rewards/accuracies": 0.0, "rewards/chosen": 1.9811336994171143, "rewards/margins": -0.19521760940551758, "rewards/rejected": 2.176351308822632, "step": 4316 }, { "epoch": 0.7, "learning_rate": 9.359394136468624e-07, "logits/chosen": -0.607651948928833, "logits/rejected": -0.5499160289764404, "logps/chosen": -134.44744873046875, "logps/rejected": -13.541122436523438, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": 4.141334533691406, "rewards/margins": 3.6469712257385254, "rewards/rejected": 0.494363397359848, "step": 4317 }, { "epoch": 0.7, "learning_rate": 9.358750367242801e-07, "logits/chosen": -0.5567362904548645, "logits/rejected": -0.5732932686805725, "logps/chosen": -194.81549072265625, "logps/rejected": -75.45175170898438, "loss": 1.3173, "rewards/accuracies": 1.0, "rewards/chosen": 2.9434266090393066, "rewards/margins": 1.0677177906036377, "rewards/rejected": 1.875708818435669, "step": 4318 }, { "epoch": 0.7, "learning_rate": 9.358106296868569e-07, "logits/chosen": -0.7713152766227722, "logits/rejected": -0.6956820487976074, "logps/chosen": -62.15364074707031, "logps/rejected": -78.99263000488281, "loss": 0.2478, "rewards/accuracies": 1.0, "rewards/chosen": 2.0622360706329346, "rewards/margins": 0.5946060419082642, "rewards/rejected": 1.4676300287246704, "step": 4319 }, { "epoch": 0.7, "learning_rate": 9.35746192539043e-07, "logits/chosen": -0.7776398658752441, "logits/rejected": -0.7720342874526978, "logps/chosen": -159.19091796875, "logps/rejected": -112.82493591308594, "loss": 0.253, "rewards/accuracies": 1.0, "rewards/chosen": 1.7327759265899658, "rewards/margins": 0.55511474609375, "rewards/rejected": 1.1776611804962158, "step": 4320 }, { "epoch": 0.7, "learning_rate": 9.356817252852902e-07, "logits/chosen": -0.588193953037262, "logits/rejected": -0.7207173109054565, "logps/chosen": -102.05331420898438, "logps/rejected": -75.98307800292969, "loss": 2.6839, "rewards/accuracies": 0.0, "rewards/chosen": 0.3542068600654602, "rewards/margins": -2.8256845474243164, "rewards/rejected": 3.179891347885132, "step": 4321 }, { "epoch": 0.7, "learning_rate": 9.356172279300527e-07, "logits/chosen": -0.22007215023040771, "logits/rejected": -0.22492846846580505, "logps/chosen": -6.529431343078613, "logps/rejected": -3.2381391525268555, "loss": 0.5394, "rewards/accuracies": 1.0, "rewards/chosen": 0.2530835270881653, "rewards/margins": 0.013760760426521301, "rewards/rejected": 0.23932276666164398, "step": 4322 }, { "epoch": 0.7, "learning_rate": 9.355527004777867e-07, "logits/chosen": -0.557883620262146, "logits/rejected": -0.5987032055854797, "logps/chosen": -101.3756332397461, "logps/rejected": -106.47198486328125, "loss": 1.0248, "rewards/accuracies": 0.0, "rewards/chosen": 1.1081825494766235, "rewards/margins": -0.6558326482772827, "rewards/rejected": 1.7640151977539062, "step": 4323 }, { "epoch": 0.7, "learning_rate": 9.354881429329502e-07, "logits/chosen": -0.4442098140716553, "logits/rejected": -0.40788766741752625, "logps/chosen": -62.25196838378906, "logps/rejected": -74.23828125, "loss": 0.7397, "rewards/accuracies": 1.0, "rewards/chosen": 1.172271728515625, "rewards/margins": 0.3041595220565796, "rewards/rejected": 0.8681122064590454, "step": 4324 }, { "epoch": 0.7, "learning_rate": 9.354235553000036e-07, "logits/chosen": -0.19380402565002441, "logits/rejected": -0.19380402565002441, "logps/chosen": -71.59710693359375, "logps/rejected": -71.59710693359375, "loss": 0.5458, "rewards/accuracies": 0.0, "rewards/chosen": 0.7892013788223267, "rewards/margins": 0.0, "rewards/rejected": 0.7892013788223267, "step": 4325 }, { "epoch": 0.7, "learning_rate": 9.353589375834095e-07, "logits/chosen": -0.7682753801345825, "logits/rejected": -0.6623170375823975, "logps/chosen": -128.27694702148438, "logps/rejected": -101.73942565917969, "loss": 0.1368, "rewards/accuracies": 1.0, "rewards/chosen": 3.242405652999878, "rewards/margins": 1.1725730895996094, "rewards/rejected": 2.0698325634002686, "step": 4326 }, { "epoch": 0.7, "learning_rate": 9.352942897876321e-07, "logits/chosen": -0.570851743221283, "logits/rejected": -0.6128516793251038, "logps/chosen": -87.09386444091797, "logps/rejected": -44.26701354980469, "loss": 1.2406, "rewards/accuracies": 0.0, "rewards/chosen": 0.11027755588293076, "rewards/margins": -1.6027077436447144, "rewards/rejected": 1.7129852771759033, "step": 4327 }, { "epoch": 0.7, "learning_rate": 9.352296119171382e-07, "logits/chosen": -0.43163278698921204, "logits/rejected": -0.27271828055381775, "logps/chosen": -78.62623596191406, "logps/rejected": -52.18263244628906, "loss": 0.5598, "rewards/accuracies": 0.0, "rewards/chosen": 0.8306862115859985, "rewards/margins": -0.23967206478118896, "rewards/rejected": 1.0703582763671875, "step": 4328 }, { "epoch": 0.7, "learning_rate": 9.351649039763962e-07, "logits/chosen": -0.5872485637664795, "logits/rejected": -0.4600640535354614, "logps/chosen": -114.52835083007812, "logps/rejected": -144.49880981445312, "loss": 1.1045, "rewards/accuracies": 0.0, "rewards/chosen": 4.198394775390625, "rewards/margins": -0.0931549072265625, "rewards/rejected": 4.2915496826171875, "step": 4329 }, { "epoch": 0.7, "learning_rate": 9.351001659698769e-07, "logits/chosen": -0.5737985968589783, "logits/rejected": -0.5470746755599976, "logps/chosen": -79.62489318847656, "logps/rejected": -41.466148376464844, "loss": 0.9992, "rewards/accuracies": 1.0, "rewards/chosen": 1.9152244329452515, "rewards/margins": 0.025620222091674805, "rewards/rejected": 1.8896042108535767, "step": 4330 }, { "epoch": 0.7, "learning_rate": 9.35035397902053e-07, "logits/chosen": -0.7932001948356628, "logits/rejected": -0.7940382361412048, "logps/chosen": -70.02484130859375, "logps/rejected": -86.84387969970703, "loss": 0.8943, "rewards/accuracies": 1.0, "rewards/chosen": 1.5581070184707642, "rewards/margins": 0.2677955627441406, "rewards/rejected": 1.2903114557266235, "step": 4331 }, { "epoch": 0.7, "learning_rate": 9.349705997773995e-07, "logits/chosen": -0.38529592752456665, "logits/rejected": -0.3520401120185852, "logps/chosen": -114.44224548339844, "logps/rejected": -68.73080444335938, "loss": 1.6154, "rewards/accuracies": 0.0, "rewards/chosen": 0.11042022705078125, "rewards/margins": -1.11504065990448, "rewards/rejected": 1.2254608869552612, "step": 4332 }, { "epoch": 0.7, "learning_rate": 9.349057716003934e-07, "logits/chosen": -0.5013989806175232, "logits/rejected": -0.504417359828949, "logps/chosen": -4.765583515167236, "logps/rejected": -2.0727851390838623, "loss": 0.7785, "rewards/accuracies": 0.0, "rewards/chosen": 0.19900035858154297, "rewards/margins": -0.056252360343933105, "rewards/rejected": 0.2552527189254761, "step": 4333 }, { "epoch": 0.7, "learning_rate": 9.348409133755135e-07, "logits/chosen": -0.43698909878730774, "logits/rejected": -0.9995251893997192, "logps/chosen": -82.62886047363281, "logps/rejected": -35.28892517089844, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 2.2895944118499756, "rewards/margins": 2.0304243564605713, "rewards/rejected": 0.25917014479637146, "step": 4334 }, { "epoch": 0.7, "learning_rate": 9.34776025107241e-07, "logits/chosen": -0.19693823158740997, "logits/rejected": -0.04879337176680565, "logps/chosen": -58.20597839355469, "logps/rejected": -5.160846710205078, "loss": 0.3723, "rewards/accuracies": 1.0, "rewards/chosen": 1.6724213361740112, "rewards/margins": 0.9949316382408142, "rewards/rejected": 0.677489697933197, "step": 4335 }, { "epoch": 0.7, "learning_rate": 9.347111068000591e-07, "logits/chosen": -1.0262417793273926, "logits/rejected": -0.9889417886734009, "logps/chosen": -233.8976287841797, "logps/rejected": -135.42550659179688, "loss": 0.7425, "rewards/accuracies": 0.0, "rewards/chosen": 3.4849166870117188, "rewards/margins": -1.196380615234375, "rewards/rejected": 4.681297302246094, "step": 4336 }, { "epoch": 0.7, "learning_rate": 9.34646158458453e-07, "logits/chosen": -0.4972589313983917, "logits/rejected": -0.5056207180023193, "logps/chosen": -134.0954132080078, "logps/rejected": -118.3272476196289, "loss": 0.2815, "rewards/accuracies": 1.0, "rewards/chosen": 5.089738368988037, "rewards/margins": 0.8987751007080078, "rewards/rejected": 4.190963268280029, "step": 4337 }, { "epoch": 0.7, "learning_rate": 9.345811800869099e-07, "logits/chosen": -0.5677815675735474, "logits/rejected": -0.5422582030296326, "logps/chosen": -79.23538208007812, "logps/rejected": -113.95506286621094, "loss": 1.3696, "rewards/accuracies": 0.0, "rewards/chosen": 2.3281686305999756, "rewards/margins": -2.5138251781463623, "rewards/rejected": 4.841993808746338, "step": 4338 }, { "epoch": 0.7, "learning_rate": 9.345161716899195e-07, "logits/chosen": -0.8641003370285034, "logits/rejected": -0.8043949007987976, "logps/chosen": -71.48849487304688, "logps/rejected": -95.61399841308594, "loss": 2.8216, "rewards/accuracies": 0.0, "rewards/chosen": 2.5692551136016846, "rewards/margins": -1.8037903308868408, "rewards/rejected": 4.373045444488525, "step": 4339 }, { "epoch": 0.7, "learning_rate": 9.344511332719728e-07, "logits/chosen": -0.35375767946243286, "logits/rejected": -0.35375767946243286, "logps/chosen": -46.276554107666016, "logps/rejected": -46.276554107666016, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 0.29613038897514343, "rewards/margins": 0.0, "rewards/rejected": 0.29613038897514343, "step": 4340 }, { "epoch": 0.7, "learning_rate": 9.343860648375639e-07, "logits/chosen": -0.6818355917930603, "logits/rejected": -0.5268754363059998, "logps/chosen": -84.43293762207031, "logps/rejected": -45.31222915649414, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": 4.1886515617370605, "rewards/margins": 2.6727194786071777, "rewards/rejected": 1.5159320831298828, "step": 4341 }, { "epoch": 0.7, "learning_rate": 9.34320966391188e-07, "logits/chosen": -0.27541375160217285, "logits/rejected": -0.2158304899930954, "logps/chosen": -59.125389099121094, "logps/rejected": -35.80091094970703, "loss": 0.7499, "rewards/accuracies": 0.0, "rewards/chosen": 0.9345917105674744, "rewards/margins": -0.10826414823532104, "rewards/rejected": 1.0428558588027954, "step": 4342 }, { "epoch": 0.7, "learning_rate": 9.342558379373428e-07, "logits/chosen": -0.35922175645828247, "logits/rejected": -0.32811039686203003, "logps/chosen": -55.625885009765625, "logps/rejected": -179.0643310546875, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 1.3671387434005737, "rewards/margins": 1.2503082752227783, "rewards/rejected": 0.11683044582605362, "step": 4343 }, { "epoch": 0.71, "learning_rate": 9.341906794805283e-07, "logits/chosen": -0.7279819250106812, "logits/rejected": -0.7271758913993835, "logps/chosen": -178.30328369140625, "logps/rejected": -166.28680419921875, "loss": 0.2076, "rewards/accuracies": 1.0, "rewards/chosen": 5.538702487945557, "rewards/margins": 0.80810546875, "rewards/rejected": 4.730597019195557, "step": 4344 }, { "epoch": 0.71, "learning_rate": 9.34125491025246e-07, "logits/chosen": -0.7513236403465271, "logits/rejected": -0.7513236403465271, "logps/chosen": -49.21845245361328, "logps/rejected": -49.21845245361328, "loss": 1.4519, "rewards/accuracies": 0.0, "rewards/chosen": 2.511098623275757, "rewards/margins": 0.0, "rewards/rejected": 2.511098623275757, "step": 4345 }, { "epoch": 0.71, "learning_rate": 9.340602725760003e-07, "logits/chosen": -0.20216147601604462, "logits/rejected": -0.1955418437719345, "logps/chosen": -42.69035720825195, "logps/rejected": -46.41014099121094, "loss": 0.8741, "rewards/accuracies": 0.0, "rewards/chosen": 0.7962902188301086, "rewards/margins": -0.7332466244697571, "rewards/rejected": 1.5295368432998657, "step": 4346 }, { "epoch": 0.71, "learning_rate": 9.339950241372967e-07, "logits/chosen": -0.5519098043441772, "logits/rejected": -0.5295903086662292, "logps/chosen": -59.687435150146484, "logps/rejected": -119.36399841308594, "loss": 0.2889, "rewards/accuracies": 1.0, "rewards/chosen": 1.3762180805206299, "rewards/margins": 1.3145084381103516, "rewards/rejected": 0.06170959398150444, "step": 4347 }, { "epoch": 0.71, "learning_rate": 9.339297457136434e-07, "logits/chosen": -0.8451635241508484, "logits/rejected": -0.7817555665969849, "logps/chosen": -102.47499084472656, "logps/rejected": -92.13837432861328, "loss": 0.2598, "rewards/accuracies": 1.0, "rewards/chosen": 1.5244048833847046, "rewards/margins": 0.43170166015625, "rewards/rejected": 1.0927032232284546, "step": 4348 }, { "epoch": 0.71, "learning_rate": 9.338644373095505e-07, "logits/chosen": -0.49721699953079224, "logits/rejected": -0.5150039196014404, "logps/chosen": -101.6812744140625, "logps/rejected": -164.69119262695312, "loss": 3.105, "rewards/accuracies": 0.0, "rewards/chosen": 0.9522964358329773, "rewards/margins": -2.6473755836486816, "rewards/rejected": 3.5996720790863037, "step": 4349 }, { "epoch": 0.71, "learning_rate": 9.337990989295304e-07, "logits/chosen": -0.7493950128555298, "logits/rejected": -0.7493950128555298, "logps/chosen": -38.945655822753906, "logps/rejected": -38.945655822753906, "loss": 0.9543, "rewards/accuracies": 0.0, "rewards/chosen": 1.8905490636825562, "rewards/margins": 0.0, "rewards/rejected": 1.8905490636825562, "step": 4350 }, { "epoch": 0.71, "learning_rate": 9.337337305780972e-07, "logits/chosen": -0.5926372408866882, "logits/rejected": -0.5926372408866882, "logps/chosen": -66.21145629882812, "logps/rejected": -66.21145629882812, "loss": 0.5904, "rewards/accuracies": 0.0, "rewards/chosen": 1.2665894031524658, "rewards/margins": 0.0, "rewards/rejected": 1.2665894031524658, "step": 4351 }, { "epoch": 0.71, "learning_rate": 9.336683322597673e-07, "logits/chosen": -0.5443207025527954, "logits/rejected": -0.5987805128097534, "logps/chosen": -65.64584350585938, "logps/rejected": -65.66382598876953, "loss": 0.5643, "rewards/accuracies": 0.0, "rewards/chosen": 1.434657335281372, "rewards/margins": -0.560509443283081, "rewards/rejected": 1.9951667785644531, "step": 4352 }, { "epoch": 0.71, "learning_rate": 9.336029039790589e-07, "logits/chosen": -0.32286667823791504, "logits/rejected": -0.3162271976470947, "logps/chosen": -77.95822143554688, "logps/rejected": -72.40859985351562, "loss": 1.0854, "rewards/accuracies": 0.0, "rewards/chosen": 0.7997833490371704, "rewards/margins": -0.5963256359100342, "rewards/rejected": 1.3961089849472046, "step": 4353 }, { "epoch": 0.71, "learning_rate": 9.335374457404927e-07, "logits/chosen": -0.753407895565033, "logits/rejected": -0.6184066534042358, "logps/chosen": -88.84170532226562, "logps/rejected": -115.54714965820312, "loss": 0.8443, "rewards/accuracies": 0.0, "rewards/chosen": 2.438666582107544, "rewards/margins": -0.5020699501037598, "rewards/rejected": 2.9407365322113037, "step": 4354 }, { "epoch": 0.71, "learning_rate": 9.334719575485912e-07, "logits/chosen": -0.5222935080528259, "logits/rejected": -0.5886974930763245, "logps/chosen": -121.5076904296875, "logps/rejected": -119.9193344116211, "loss": 0.5345, "rewards/accuracies": 1.0, "rewards/chosen": 2.702160596847534, "rewards/margins": 1.9923841953277588, "rewards/rejected": 0.7097763419151306, "step": 4355 }, { "epoch": 0.71, "learning_rate": 9.334064394078789e-07, "logits/chosen": -0.7673305869102478, "logits/rejected": -0.9323155283927917, "logps/chosen": -35.50238037109375, "logps/rejected": -151.9688262939453, "loss": 1.0976, "rewards/accuracies": 0.0, "rewards/chosen": 1.6510345935821533, "rewards/margins": -1.9107666015625, "rewards/rejected": 3.5618011951446533, "step": 4356 }, { "epoch": 0.71, "learning_rate": 9.333408913228825e-07, "logits/chosen": -0.7867393493652344, "logits/rejected": -0.8064056634902954, "logps/chosen": -84.15507507324219, "logps/rejected": -82.15689086914062, "loss": 1.4272, "rewards/accuracies": 0.0, "rewards/chosen": 0.9037445187568665, "rewards/margins": -1.0506668090820312, "rewards/rejected": 1.9544113874435425, "step": 4357 }, { "epoch": 0.71, "learning_rate": 9.332753132981311e-07, "logits/chosen": -1.243409276008606, "logits/rejected": -1.1697787046432495, "logps/chosen": -85.7634506225586, "logps/rejected": -6.716448783874512, "loss": 2.121, "rewards/accuracies": 1.0, "rewards/chosen": 1.2647392749786377, "rewards/margins": 0.5312274098396301, "rewards/rejected": 0.7335118651390076, "step": 4358 }, { "epoch": 0.71, "learning_rate": 9.332097053381549e-07, "logits/chosen": -0.5051860809326172, "logits/rejected": -0.26350080966949463, "logps/chosen": -139.27989196777344, "logps/rejected": -17.316802978515625, "loss": 0.4371, "rewards/accuracies": 1.0, "rewards/chosen": 4.425544738769531, "rewards/margins": 3.8585827350616455, "rewards/rejected": 0.5669620633125305, "step": 4359 }, { "epoch": 0.71, "learning_rate": 9.331440674474873e-07, "logits/chosen": -0.5798359513282776, "logits/rejected": -0.25069281458854675, "logps/chosen": -191.01434326171875, "logps/rejected": -44.05687713623047, "loss": 0.9684, "rewards/accuracies": 1.0, "rewards/chosen": 4.820953369140625, "rewards/margins": 3.727307081222534, "rewards/rejected": 1.0936462879180908, "step": 4360 }, { "epoch": 0.71, "learning_rate": 9.33078399630663e-07, "logits/chosen": -0.9143871068954468, "logits/rejected": -0.8765371441841125, "logps/chosen": -206.972900390625, "logps/rejected": -39.92334747314453, "loss": 0.7892, "rewards/accuracies": 1.0, "rewards/chosen": 0.6519882082939148, "rewards/margins": 0.5146957039833069, "rewards/rejected": 0.13729248940944672, "step": 4361 }, { "epoch": 0.71, "learning_rate": 9.330127018922193e-07, "logits/chosen": -0.6249938011169434, "logits/rejected": -0.6431516408920288, "logps/chosen": -102.12513732910156, "logps/rejected": -93.20075225830078, "loss": 2.4132, "rewards/accuracies": 0.0, "rewards/chosen": 0.2785964906215668, "rewards/margins": -4.204728126525879, "rewards/rejected": 4.4833245277404785, "step": 4362 }, { "epoch": 0.71, "learning_rate": 9.32946974236695e-07, "logits/chosen": -0.3383297026157379, "logits/rejected": -0.2943699359893799, "logps/chosen": -17.73072052001953, "logps/rejected": -7.091090202331543, "loss": 0.3726, "rewards/accuracies": 1.0, "rewards/chosen": 1.4830849170684814, "rewards/margins": 0.5769036412239075, "rewards/rejected": 0.906181275844574, "step": 4363 }, { "epoch": 0.71, "learning_rate": 9.328812166686313e-07, "logits/chosen": -1.026401400566101, "logits/rejected": -0.9829040169715881, "logps/chosen": -140.34622192382812, "logps/rejected": -96.43737030029297, "loss": 0.8114, "rewards/accuracies": 1.0, "rewards/chosen": 4.587751865386963, "rewards/margins": 1.561375379562378, "rewards/rejected": 3.026376485824585, "step": 4364 }, { "epoch": 0.71, "learning_rate": 9.328154291925716e-07, "logits/chosen": -0.4522121548652649, "logits/rejected": -0.39680543541908264, "logps/chosen": -59.373287200927734, "logps/rejected": -77.35285186767578, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 1.5192760229110718, "rewards/margins": 0.2588397264480591, "rewards/rejected": 1.2604362964630127, "step": 4365 }, { "epoch": 0.71, "learning_rate": 9.327496118130609e-07, "logits/chosen": -0.6437459588050842, "logits/rejected": -0.6437459588050842, "logps/chosen": -32.772552490234375, "logps/rejected": -32.772552490234375, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.23653793334960938, "rewards/margins": 0.0, "rewards/rejected": 0.23653793334960938, "step": 4366 }, { "epoch": 0.71, "learning_rate": 9.32683764534647e-07, "logits/chosen": -0.8798960447311401, "logits/rejected": -0.6611306071281433, "logps/chosen": -113.456298828125, "logps/rejected": -83.75238037109375, "loss": 0.2756, "rewards/accuracies": 1.0, "rewards/chosen": 3.6628921031951904, "rewards/margins": 1.4597883224487305, "rewards/rejected": 2.20310378074646, "step": 4367 }, { "epoch": 0.71, "learning_rate": 9.326178873618789e-07, "logits/chosen": -0.7069904804229736, "logits/rejected": -0.7045205235481262, "logps/chosen": -87.83561706542969, "logps/rejected": -59.71302032470703, "loss": 0.776, "rewards/accuracies": 0.0, "rewards/chosen": 0.08232956379652023, "rewards/margins": -1.2132980823516846, "rewards/rejected": 1.2956275939941406, "step": 4368 }, { "epoch": 0.71, "learning_rate": 9.325519802993083e-07, "logits/chosen": -0.4020286500453949, "logits/rejected": -0.5160485506057739, "logps/chosen": -111.81484985351562, "logps/rejected": -155.12086486816406, "loss": 2.0756, "rewards/accuracies": 0.0, "rewards/chosen": 0.30188676714897156, "rewards/margins": -3.271709442138672, "rewards/rejected": 3.573596239089966, "step": 4369 }, { "epoch": 0.71, "learning_rate": 9.324860433514887e-07, "logits/chosen": -0.7395817637443542, "logits/rejected": -0.6113326549530029, "logps/chosen": -100.13872528076172, "logps/rejected": -37.972930908203125, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 3.7460427284240723, "rewards/margins": 1.7425103187561035, "rewards/rejected": 2.0035324096679688, "step": 4370 }, { "epoch": 0.71, "learning_rate": 9.324200765229757e-07, "logits/chosen": -1.0117993354797363, "logits/rejected": -0.9659056067466736, "logps/chosen": -72.44413757324219, "logps/rejected": -88.68507385253906, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 1.9444572925567627, "rewards/margins": 1.2131248712539673, "rewards/rejected": 0.7313324213027954, "step": 4371 }, { "epoch": 0.71, "learning_rate": 9.323540798183269e-07, "logits/chosen": -0.6881487369537354, "logits/rejected": -0.7175222039222717, "logps/chosen": -44.93189239501953, "logps/rejected": -50.593353271484375, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 1.9534591436386108, "rewards/margins": 0.4460716247558594, "rewards/rejected": 1.5073875188827515, "step": 4372 }, { "epoch": 0.71, "learning_rate": 9.322880532421023e-07, "logits/chosen": -0.8349059820175171, "logits/rejected": -0.47342678904533386, "logps/chosen": -133.14903259277344, "logps/rejected": -96.3365478515625, "loss": 0.8133, "rewards/accuracies": 0.0, "rewards/chosen": 1.507991075515747, "rewards/margins": -0.5701661109924316, "rewards/rejected": 2.0781571865081787, "step": 4373 }, { "epoch": 0.71, "learning_rate": 9.322219967988636e-07, "logits/chosen": -0.33395373821258545, "logits/rejected": -0.33395373821258545, "logps/chosen": -47.16549301147461, "logps/rejected": -47.16549301147461, "loss": 0.9439, "rewards/accuracies": 0.0, "rewards/chosen": 0.523624837398529, "rewards/margins": 0.0, "rewards/rejected": 0.523624837398529, "step": 4374 }, { "epoch": 0.71, "learning_rate": 9.321559104931745e-07, "logits/chosen": -0.36081600189208984, "logits/rejected": -0.22370858490467072, "logps/chosen": -101.93870544433594, "logps/rejected": -151.62283325195312, "loss": 1.3523, "rewards/accuracies": 0.0, "rewards/chosen": 3.3864212036132812, "rewards/margins": -0.2059495449066162, "rewards/rejected": 3.5923707485198975, "step": 4375 }, { "epoch": 0.71, "learning_rate": 9.320897943296011e-07, "logits/chosen": -0.5587760210037231, "logits/rejected": -0.5512075424194336, "logps/chosen": -50.32814025878906, "logps/rejected": -43.35175323486328, "loss": 0.566, "rewards/accuracies": 1.0, "rewards/chosen": 1.4524033069610596, "rewards/margins": 0.57857745885849, "rewards/rejected": 0.8738258481025696, "step": 4376 }, { "epoch": 0.71, "learning_rate": 9.320236483127115e-07, "logits/chosen": -0.6254480481147766, "logits/rejected": -0.4819659888744354, "logps/chosen": -120.49361419677734, "logps/rejected": -19.03382110595703, "loss": 0.7782, "rewards/accuracies": 1.0, "rewards/chosen": 1.2791557312011719, "rewards/margins": 0.5869100689888, "rewards/rejected": 0.6922456622123718, "step": 4377 }, { "epoch": 0.71, "learning_rate": 9.319574724470756e-07, "logits/chosen": -0.7012693881988525, "logits/rejected": -0.7662066221237183, "logps/chosen": -124.43241119384766, "logps/rejected": -121.36564636230469, "loss": 1.9438, "rewards/accuracies": 0.0, "rewards/chosen": 2.6553871631622314, "rewards/margins": -3.6828773021698, "rewards/rejected": 6.338264465332031, "step": 4378 }, { "epoch": 0.71, "learning_rate": 9.318912667372656e-07, "logits/chosen": -0.6413218975067139, "logits/rejected": -0.5464869737625122, "logps/chosen": -116.04808044433594, "logps/rejected": -51.18792724609375, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 4.642894268035889, "rewards/margins": 2.91109037399292, "rewards/rejected": 1.7318038940429688, "step": 4379 }, { "epoch": 0.71, "learning_rate": 9.318250311878557e-07, "logits/chosen": -1.0096280574798584, "logits/rejected": -0.8773118257522583, "logps/chosen": -79.95552062988281, "logps/rejected": -121.14205932617188, "loss": 0.8191, "rewards/accuracies": 0.0, "rewards/chosen": 3.788372039794922, "rewards/margins": -0.6887140274047852, "rewards/rejected": 4.477086067199707, "step": 4380 }, { "epoch": 0.71, "learning_rate": 9.31758765803422e-07, "logits/chosen": -0.22970245778560638, "logits/rejected": -0.22970245778560638, "logps/chosen": -30.64596176147461, "logps/rejected": -30.64596176147461, "loss": 0.6085, "rewards/accuracies": 0.0, "rewards/chosen": 1.8465522527694702, "rewards/margins": 0.0, "rewards/rejected": 1.8465522527694702, "step": 4381 }, { "epoch": 0.71, "learning_rate": 9.31692470588543e-07, "logits/chosen": -0.8595191836357117, "logits/rejected": -0.9021703004837036, "logps/chosen": -181.42132568359375, "logps/rejected": -127.1681900024414, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 5.010846138000488, "rewards/margins": 0.3148965835571289, "rewards/rejected": 4.695949554443359, "step": 4382 }, { "epoch": 0.71, "learning_rate": 9.31626145547799e-07, "logits/chosen": -0.5995998978614807, "logits/rejected": -0.5995998978614807, "logps/chosen": -104.7531509399414, "logps/rejected": -104.7531509399414, "loss": 0.3504, "rewards/accuracies": 0.0, "rewards/chosen": 0.8920127749443054, "rewards/margins": 0.0, "rewards/rejected": 0.8920127749443054, "step": 4383 }, { "epoch": 0.71, "learning_rate": 9.315597906857723e-07, "logits/chosen": -0.5858647227287292, "logits/rejected": -0.513693630695343, "logps/chosen": -106.31248474121094, "logps/rejected": -220.6320343017578, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8060073852539062, "rewards/margins": 0.6287597417831421, "rewards/rejected": 0.17724762856960297, "step": 4384 }, { "epoch": 0.71, "learning_rate": 9.314934060070476e-07, "logits/chosen": -0.7105188369750977, "logits/rejected": -0.5636173486709595, "logps/chosen": -89.58462524414062, "logps/rejected": -240.89859008789062, "loss": 3.5686, "rewards/accuracies": 0.0, "rewards/chosen": 0.8741500973701477, "rewards/margins": -6.954920768737793, "rewards/rejected": 7.829071044921875, "step": 4385 }, { "epoch": 0.71, "learning_rate": 9.314269915162114e-07, "logits/chosen": -0.39395949244499207, "logits/rejected": -0.33031222224235535, "logps/chosen": -81.54195404052734, "logps/rejected": -99.25186157226562, "loss": 1.121, "rewards/accuracies": 0.0, "rewards/chosen": 0.4089553952217102, "rewards/margins": -0.15870362520217896, "rewards/rejected": 0.5676590204238892, "step": 4386 }, { "epoch": 0.71, "learning_rate": 9.313605472178522e-07, "logits/chosen": -0.8412762880325317, "logits/rejected": -0.8412762880325317, "logps/chosen": -55.83067321777344, "logps/rejected": -55.83067321777344, "loss": 0.3945, "rewards/accuracies": 0.0, "rewards/chosen": 0.12004967033863068, "rewards/margins": 0.0, "rewards/rejected": 0.12004967033863068, "step": 4387 }, { "epoch": 0.71, "learning_rate": 9.312940731165608e-07, "logits/chosen": -0.541813850402832, "logits/rejected": -0.5168560147285461, "logps/chosen": -67.90203857421875, "logps/rejected": -32.441062927246094, "loss": 0.7487, "rewards/accuracies": 0.0, "rewards/chosen": 0.8576721549034119, "rewards/margins": -0.35207098722457886, "rewards/rejected": 1.2097431421279907, "step": 4388 }, { "epoch": 0.71, "learning_rate": 9.3122756921693e-07, "logits/chosen": -0.5693382620811462, "logits/rejected": -0.36815157532691956, "logps/chosen": -166.26901245117188, "logps/rejected": -33.811378479003906, "loss": 0.0785, "rewards/accuracies": 1.0, "rewards/chosen": 3.2201263904571533, "rewards/margins": 2.1849236488342285, "rewards/rejected": 1.0352028608322144, "step": 4389 }, { "epoch": 0.71, "learning_rate": 9.311610355235544e-07, "logits/chosen": -0.52834153175354, "logits/rejected": -0.47494810819625854, "logps/chosen": -51.01557922363281, "logps/rejected": -49.725162506103516, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": 0.8848800659179688, "rewards/margins": 0.12260091304779053, "rewards/rejected": 0.7622791528701782, "step": 4390 }, { "epoch": 0.71, "learning_rate": 9.310944720410309e-07, "logits/chosen": -0.5331861972808838, "logits/rejected": -0.5421582460403442, "logps/chosen": -10.212020874023438, "logps/rejected": -3.5990042686462402, "loss": 0.5927, "rewards/accuracies": 0.0, "rewards/chosen": 0.17916584014892578, "rewards/margins": -0.2576596736907959, "rewards/rejected": 0.4368255138397217, "step": 4391 }, { "epoch": 0.71, "learning_rate": 9.310278787739585e-07, "logits/chosen": -0.7871829867362976, "logits/rejected": -0.7944913506507874, "logps/chosen": -113.26093292236328, "logps/rejected": -87.61266326904297, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 1.4928200244903564, "rewards/margins": 0.06036221981048584, "rewards/rejected": 1.4324578046798706, "step": 4392 }, { "epoch": 0.71, "learning_rate": 9.30961255726938e-07, "logits/chosen": -0.44038689136505127, "logits/rejected": -0.44038689136505127, "logps/chosen": -79.86015319824219, "logps/rejected": -79.86015319824219, "loss": 0.3805, "rewards/accuracies": 0.0, "rewards/chosen": 1.07988440990448, "rewards/margins": 0.0, "rewards/rejected": 1.07988440990448, "step": 4393 }, { "epoch": 0.71, "learning_rate": 9.308946029045726e-07, "logits/chosen": -0.7712985873222351, "logits/rejected": -0.6889463067054749, "logps/chosen": -98.98973846435547, "logps/rejected": -104.32475280761719, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 4.6598076820373535, "rewards/margins": 1.9125525951385498, "rewards/rejected": 2.7472550868988037, "step": 4394 }, { "epoch": 0.71, "learning_rate": 9.308279203114673e-07, "logits/chosen": -0.7499799132347107, "logits/rejected": -0.7542651891708374, "logps/chosen": -73.47895050048828, "logps/rejected": -107.76010131835938, "loss": 0.8626, "rewards/accuracies": 0.0, "rewards/chosen": 1.1792770624160767, "rewards/margins": -0.4387214183807373, "rewards/rejected": 1.617998480796814, "step": 4395 }, { "epoch": 0.71, "learning_rate": 9.307612079522292e-07, "logits/chosen": -0.5469846129417419, "logits/rejected": -0.4552743434906006, "logps/chosen": -105.65138244628906, "logps/rejected": -87.9468002319336, "loss": 0.7699, "rewards/accuracies": 0.0, "rewards/chosen": 1.4227020740509033, "rewards/margins": -0.8124732971191406, "rewards/rejected": 2.235175371170044, "step": 4396 }, { "epoch": 0.71, "learning_rate": 9.306944658314676e-07, "logits/chosen": -0.9045267701148987, "logits/rejected": -0.9664616584777832, "logps/chosen": -183.9191131591797, "logps/rejected": -65.74436950683594, "loss": 0.3344, "rewards/accuracies": 1.0, "rewards/chosen": 2.7566909790039062, "rewards/margins": 1.9719184637069702, "rewards/rejected": 0.784772515296936, "step": 4397 }, { "epoch": 0.71, "learning_rate": 9.306276939537936e-07, "logits/chosen": -0.8013283014297485, "logits/rejected": -0.8138564229011536, "logps/chosen": -94.52043151855469, "logps/rejected": -62.59738540649414, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 2.5045006275177, "rewards/margins": 1.0172756910324097, "rewards/rejected": 1.4872249364852905, "step": 4398 }, { "epoch": 0.71, "learning_rate": 9.305608923238206e-07, "logits/chosen": -0.5172331929206848, "logits/rejected": -0.4726892411708832, "logps/chosen": -89.68859100341797, "logps/rejected": -20.511268615722656, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 1.5044609308242798, "rewards/margins": 1.290845274925232, "rewards/rejected": 0.21361561119556427, "step": 4399 }, { "epoch": 0.71, "learning_rate": 9.30494060946164e-07, "logits/chosen": -1.0855361223220825, "logits/rejected": -1.1158982515335083, "logps/chosen": -226.13168334960938, "logps/rejected": -100.91901397705078, "loss": 0.0335, "rewards/accuracies": 1.0, "rewards/chosen": 4.398105144500732, "rewards/margins": 2.8184900283813477, "rewards/rejected": 1.5796149969100952, "step": 4400 }, { "epoch": 0.71, "learning_rate": 9.304271998254411e-07, "logits/chosen": -0.5062708854675293, "logits/rejected": -0.5180004835128784, "logps/chosen": -47.86192321777344, "logps/rejected": -71.60185241699219, "loss": 0.5021, "rewards/accuracies": 1.0, "rewards/chosen": 1.7840203046798706, "rewards/margins": 0.4021873474121094, "rewards/rejected": 1.3818329572677612, "step": 4401 }, { "epoch": 0.71, "learning_rate": 9.303603089662715e-07, "logits/chosen": -0.30963820219039917, "logits/rejected": -0.3120007812976837, "logps/chosen": -22.863990783691406, "logps/rejected": -18.968332290649414, "loss": 1.2818, "rewards/accuracies": 0.0, "rewards/chosen": 0.26331445574760437, "rewards/margins": -0.10560834407806396, "rewards/rejected": 0.36892279982566833, "step": 4402 }, { "epoch": 0.71, "learning_rate": 9.302933883732767e-07, "logits/chosen": -0.43929511308670044, "logits/rejected": -0.28135260939598083, "logps/chosen": -178.08457946777344, "logps/rejected": -73.19587707519531, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": 1.8629013299942017, "rewards/margins": 0.311370849609375, "rewards/rejected": 1.5515304803848267, "step": 4403 }, { "epoch": 0.71, "learning_rate": 9.3022643805108e-07, "logits/chosen": -0.6999619603157043, "logits/rejected": -0.6855908632278442, "logps/chosen": -57.27193069458008, "logps/rejected": -49.80674743652344, "loss": 0.3436, "rewards/accuracies": 1.0, "rewards/chosen": 2.078608274459839, "rewards/margins": 0.030649900436401367, "rewards/rejected": 2.0479583740234375, "step": 4404 }, { "epoch": 0.71, "learning_rate": 9.301594580043075e-07, "logits/chosen": -0.6013200879096985, "logits/rejected": -0.6294469237327576, "logps/chosen": -25.372373580932617, "logps/rejected": -18.990631103515625, "loss": 1.3833, "rewards/accuracies": 0.0, "rewards/chosen": 0.060572054237127304, "rewards/margins": -0.23350849747657776, "rewards/rejected": 0.29408055543899536, "step": 4405 }, { "epoch": 0.72, "learning_rate": 9.300924482375865e-07, "logits/chosen": -0.8070774674415588, "logits/rejected": -0.6271428465843201, "logps/chosen": -113.56078338623047, "logps/rejected": -129.17962646484375, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": 6.258158206939697, "rewards/margins": 2.119185447692871, "rewards/rejected": 4.138972759246826, "step": 4406 }, { "epoch": 0.72, "learning_rate": 9.30025408755547e-07, "logits/chosen": -0.5695411562919617, "logits/rejected": -0.6013673543930054, "logps/chosen": -57.86865997314453, "logps/rejected": -70.4708251953125, "loss": 0.9343, "rewards/accuracies": 0.0, "rewards/chosen": 1.6889053583145142, "rewards/margins": -0.44723808765411377, "rewards/rejected": 2.136143445968628, "step": 4407 }, { "epoch": 0.72, "learning_rate": 9.299583395628207e-07, "logits/chosen": -0.3824956715106964, "logits/rejected": -0.3824956715106964, "logps/chosen": -46.1285285949707, "logps/rejected": -46.1285285949707, "loss": 1.0933, "rewards/accuracies": 0.0, "rewards/chosen": 0.17220841348171234, "rewards/margins": 0.0, "rewards/rejected": 0.17220841348171234, "step": 4408 }, { "epoch": 0.72, "learning_rate": 9.298912406640413e-07, "logits/chosen": -0.7404610514640808, "logits/rejected": -0.7436873316764832, "logps/chosen": -40.13634490966797, "logps/rejected": -26.99912452697754, "loss": 0.4103, "rewards/accuracies": 0.0, "rewards/chosen": 1.5645760297775269, "rewards/margins": -0.08420801162719727, "rewards/rejected": 1.6487840414047241, "step": 4409 }, { "epoch": 0.72, "learning_rate": 9.29824112063845e-07, "logits/chosen": -0.4435885548591614, "logits/rejected": -0.40049511194229126, "logps/chosen": -75.71430969238281, "logps/rejected": -58.81645584106445, "loss": 0.7244, "rewards/accuracies": 1.0, "rewards/chosen": 2.1922943592071533, "rewards/margins": 0.8452366590499878, "rewards/rejected": 1.3470577001571655, "step": 4410 }, { "epoch": 0.72, "learning_rate": 9.297569537668696e-07, "logits/chosen": -0.5552634596824646, "logits/rejected": -0.12552490830421448, "logps/chosen": -46.1120719909668, "logps/rejected": -102.12854766845703, "loss": 0.1811, "rewards/accuracies": 1.0, "rewards/chosen": 1.7998722791671753, "rewards/margins": 0.8695507645606995, "rewards/rejected": 0.9303215146064758, "step": 4411 }, { "epoch": 0.72, "learning_rate": 9.296897657777551e-07, "logits/chosen": -1.220568060874939, "logits/rejected": -1.120609164237976, "logps/chosen": -115.43611907958984, "logps/rejected": -104.20355224609375, "loss": 1.3463, "rewards/accuracies": 0.0, "rewards/chosen": 1.1365875005722046, "rewards/margins": -0.6584442853927612, "rewards/rejected": 1.7950317859649658, "step": 4412 }, { "epoch": 0.72, "learning_rate": 9.296225481011435e-07, "logits/chosen": -0.42206671833992004, "logits/rejected": -0.42206671833992004, "logps/chosen": -112.58892059326172, "logps/rejected": -112.58892059326172, "loss": 0.4238, "rewards/accuracies": 0.0, "rewards/chosen": 0.725311279296875, "rewards/margins": 0.0, "rewards/rejected": 0.725311279296875, "step": 4413 }, { "epoch": 0.72, "learning_rate": 9.295553007416789e-07, "logits/chosen": -0.6614567637443542, "logits/rejected": -0.6809381246566772, "logps/chosen": -120.78118896484375, "logps/rejected": -29.61150550842285, "loss": 0.5259, "rewards/accuracies": 0.0, "rewards/chosen": 0.693402111530304, "rewards/margins": -0.5280718207359314, "rewards/rejected": 1.2214739322662354, "step": 4414 }, { "epoch": 0.72, "learning_rate": 9.294880237040074e-07, "logits/chosen": -0.7670720219612122, "logits/rejected": -0.6831430196762085, "logps/chosen": -147.65106201171875, "logps/rejected": -179.26168823242188, "loss": 1.5227, "rewards/accuracies": 0.0, "rewards/chosen": 3.7355315685272217, "rewards/margins": -2.235260248184204, "rewards/rejected": 5.970791816711426, "step": 4415 }, { "epoch": 0.72, "learning_rate": 9.294207169927776e-07, "logits/chosen": -0.3620644509792328, "logits/rejected": -0.34126123785972595, "logps/chosen": -102.51515197753906, "logps/rejected": -77.86723327636719, "loss": 0.5196, "rewards/accuracies": 0.0, "rewards/chosen": 0.2553001344203949, "rewards/margins": -0.5964920520782471, "rewards/rejected": 0.8517921566963196, "step": 4416 }, { "epoch": 0.72, "learning_rate": 9.293533806126393e-07, "logits/chosen": -0.5250020027160645, "logits/rejected": -0.4033565819263458, "logps/chosen": -75.2427978515625, "logps/rejected": -67.73943328857422, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 1.1438156366348267, "rewards/margins": 0.06846237182617188, "rewards/rejected": 1.0753532648086548, "step": 4417 }, { "epoch": 0.72, "learning_rate": 9.292860145682451e-07, "logits/chosen": -0.9908769726753235, "logits/rejected": -1.0397236347198486, "logps/chosen": -74.94711303710938, "logps/rejected": -212.386474609375, "loss": 3.442, "rewards/accuracies": 0.0, "rewards/chosen": 1.9269866943359375, "rewards/margins": -6.5964508056640625, "rewards/rejected": 8.5234375, "step": 4418 }, { "epoch": 0.72, "learning_rate": 9.29218618864249e-07, "logits/chosen": -0.490317165851593, "logits/rejected": -0.5519043207168579, "logps/chosen": -50.2385368347168, "logps/rejected": -143.1713104248047, "loss": 2.508, "rewards/accuracies": 0.0, "rewards/chosen": 2.1650960445404053, "rewards/margins": -2.4882137775421143, "rewards/rejected": 4.6533098220825195, "step": 4419 }, { "epoch": 0.72, "learning_rate": 9.291511935053078e-07, "logits/chosen": -0.6456934213638306, "logits/rejected": -0.6341884732246399, "logps/chosen": -84.4468994140625, "logps/rejected": -95.74893188476562, "loss": 1.6678, "rewards/accuracies": 0.0, "rewards/chosen": 1.9447319507598877, "rewards/margins": -0.7658271789550781, "rewards/rejected": 2.710559129714966, "step": 4420 }, { "epoch": 0.72, "learning_rate": 9.290837384960799e-07, "logits/chosen": -0.7259968519210815, "logits/rejected": -0.6625600457191467, "logps/chosen": -92.21376037597656, "logps/rejected": -30.078344345092773, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 1.2463181018829346, "rewards/margins": 0.9672685861587524, "rewards/rejected": 0.27904948592185974, "step": 4421 }, { "epoch": 0.72, "learning_rate": 9.290162538412255e-07, "logits/chosen": -0.5928614139556885, "logits/rejected": -0.6171126365661621, "logps/chosen": -16.583642959594727, "logps/rejected": -2.56941819190979, "loss": 1.7223, "rewards/accuracies": 0.0, "rewards/chosen": -0.225428968667984, "rewards/margins": -0.5047212839126587, "rewards/rejected": 0.2792922854423523, "step": 4422 }, { "epoch": 0.72, "learning_rate": 9.289487395454075e-07, "logits/chosen": -0.5863499641418457, "logits/rejected": -0.62642902135849, "logps/chosen": -72.43113708496094, "logps/rejected": -51.962188720703125, "loss": 0.8981, "rewards/accuracies": 0.0, "rewards/chosen": 1.0439094305038452, "rewards/margins": -1.0791558027267456, "rewards/rejected": 2.123065233230591, "step": 4423 }, { "epoch": 0.72, "learning_rate": 9.288811956132903e-07, "logits/chosen": -0.6944414973258972, "logits/rejected": -0.5761764049530029, "logps/chosen": -60.731143951416016, "logps/rejected": -106.03573608398438, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 3.1076931953430176, "rewards/margins": 0.024889707565307617, "rewards/rejected": 3.08280348777771, "step": 4424 }, { "epoch": 0.72, "learning_rate": 9.288136220495405e-07, "logits/chosen": -1.1145117282867432, "logits/rejected": -1.2201828956604004, "logps/chosen": -222.22547912597656, "logps/rejected": -167.236083984375, "loss": 1.1893, "rewards/accuracies": 0.0, "rewards/chosen": 4.4022417068481445, "rewards/margins": -1.8823041915893555, "rewards/rejected": 6.2845458984375, "step": 4425 }, { "epoch": 0.72, "learning_rate": 9.28746018858827e-07, "logits/chosen": -0.48844045400619507, "logits/rejected": -0.5066688656806946, "logps/chosen": -120.08417510986328, "logps/rejected": -106.06462097167969, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": 2.0506904125213623, "rewards/margins": 2.3435683250427246, "rewards/rejected": -0.2928779721260071, "step": 4426 }, { "epoch": 0.72, "learning_rate": 9.286783860458203e-07, "logits/chosen": -0.9585549235343933, "logits/rejected": -0.9374247193336487, "logps/chosen": -70.385009765625, "logps/rejected": -18.986391067504883, "loss": 0.3373, "rewards/accuracies": 1.0, "rewards/chosen": 3.7307136058807373, "rewards/margins": 3.2390079498291016, "rewards/rejected": 0.4917057156562805, "step": 4427 }, { "epoch": 0.72, "learning_rate": 9.286107236151934e-07, "logits/chosen": -0.6789819598197937, "logits/rejected": -0.5923100709915161, "logps/chosen": -58.54225540161133, "logps/rejected": -56.41419982910156, "loss": 1.2685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8888241052627563, "rewards/margins": 0.345758855342865, "rewards/rejected": 0.5430652499198914, "step": 4428 }, { "epoch": 0.72, "learning_rate": 9.285430315716211e-07, "logits/chosen": -0.6979382038116455, "logits/rejected": -0.7166646718978882, "logps/chosen": -70.8563461303711, "logps/rejected": -38.864078521728516, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 4.0367655754089355, "rewards/margins": 2.0969622135162354, "rewards/rejected": 1.9398033618927002, "step": 4429 }, { "epoch": 0.72, "learning_rate": 9.284753099197802e-07, "logits/chosen": -0.6567183136940002, "logits/rejected": -0.5778396725654602, "logps/chosen": -116.8193359375, "logps/rejected": -71.8989486694336, "loss": 0.3356, "rewards/accuracies": 1.0, "rewards/chosen": 1.1303452253341675, "rewards/margins": 0.7671456336975098, "rewards/rejected": 0.3631996214389801, "step": 4430 }, { "epoch": 0.72, "learning_rate": 9.284075586643496e-07, "logits/chosen": -0.49339720606803894, "logits/rejected": -0.4180382490158081, "logps/chosen": -63.20457458496094, "logps/rejected": -60.67509078979492, "loss": 0.3353, "rewards/accuracies": 1.0, "rewards/chosen": 1.343257188796997, "rewards/margins": 0.18566250801086426, "rewards/rejected": 1.1575946807861328, "step": 4431 }, { "epoch": 0.72, "learning_rate": 9.283397778100104e-07, "logits/chosen": -0.9285212159156799, "logits/rejected": -0.6757450103759766, "logps/chosen": -123.62509155273438, "logps/rejected": -103.4473876953125, "loss": 1.0999, "rewards/accuracies": 1.0, "rewards/chosen": 3.291468858718872, "rewards/margins": 1.632562279701233, "rewards/rejected": 1.6589065790176392, "step": 4432 }, { "epoch": 0.72, "learning_rate": 9.282719673614455e-07, "logits/chosen": -0.3382149934768677, "logits/rejected": -0.35317713022232056, "logps/chosen": -6.688004970550537, "logps/rejected": -3.009725570678711, "loss": 0.4362, "rewards/accuracies": 0.0, "rewards/chosen": -0.14255891740322113, "rewards/margins": -0.30905699729919434, "rewards/rejected": 0.1664980947971344, "step": 4433 }, { "epoch": 0.72, "learning_rate": 9.282041273233401e-07, "logits/chosen": -0.5030642747879028, "logits/rejected": -0.5030642747879028, "logps/chosen": -79.44439697265625, "logps/rejected": -79.44439697265625, "loss": 0.354, "rewards/accuracies": 0.0, "rewards/chosen": 2.0850846767425537, "rewards/margins": 0.0, "rewards/rejected": 2.0850846767425537, "step": 4434 }, { "epoch": 0.72, "learning_rate": 9.28136257700381e-07, "logits/chosen": -0.6731181740760803, "logits/rejected": -0.6432923674583435, "logps/chosen": -146.99652099609375, "logps/rejected": -103.88845825195312, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 5.455386638641357, "rewards/margins": 1.5174760818481445, "rewards/rejected": 3.937910556793213, "step": 4435 }, { "epoch": 0.72, "learning_rate": 9.280683584972577e-07, "logits/chosen": -0.7412692904472351, "logits/rejected": -0.6336196064949036, "logps/chosen": -115.59496307373047, "logps/rejected": -203.75265502929688, "loss": 0.987, "rewards/accuracies": 0.0, "rewards/chosen": 4.138916969299316, "rewards/margins": -1.770369529724121, "rewards/rejected": 5.9092864990234375, "step": 4436 }, { "epoch": 0.72, "learning_rate": 9.280004297186612e-07, "logits/chosen": -0.4755067527294159, "logits/rejected": -0.49642661213874817, "logps/chosen": -6.990107536315918, "logps/rejected": -23.789508819580078, "loss": 0.6819, "rewards/accuracies": 0.0, "rewards/chosen": 0.4182778298854828, "rewards/margins": -0.46690627932548523, "rewards/rejected": 0.885184109210968, "step": 4437 }, { "epoch": 0.72, "learning_rate": 9.279324713692849e-07, "logits/chosen": -0.5440104007720947, "logits/rejected": -0.5310472846031189, "logps/chosen": -106.290771484375, "logps/rejected": -64.28031158447266, "loss": 0.5342, "rewards/accuracies": 0.0, "rewards/chosen": 1.608712077140808, "rewards/margins": -0.5442749261856079, "rewards/rejected": 2.152987003326416, "step": 4438 }, { "epoch": 0.72, "learning_rate": 9.278644834538238e-07, "logits/chosen": -0.40895479917526245, "logits/rejected": -0.5646855235099792, "logps/chosen": -78.57162475585938, "logps/rejected": -80.72401428222656, "loss": 1.4061, "rewards/accuracies": 0.0, "rewards/chosen": 1.9640945196151733, "rewards/margins": -2.6320676803588867, "rewards/rejected": 4.59616231918335, "step": 4439 }, { "epoch": 0.72, "learning_rate": 9.277964659769754e-07, "logits/chosen": -0.571536660194397, "logits/rejected": -0.5995473861694336, "logps/chosen": -67.84329223632812, "logps/rejected": -61.63544464111328, "loss": 0.5044, "rewards/accuracies": 1.0, "rewards/chosen": 1.9679771661758423, "rewards/margins": 0.04562222957611084, "rewards/rejected": 1.9223549365997314, "step": 4440 }, { "epoch": 0.72, "learning_rate": 9.277284189434393e-07, "logits/chosen": -0.8967954516410828, "logits/rejected": -0.7842740416526794, "logps/chosen": -127.23098754882812, "logps/rejected": -103.14826965332031, "loss": 2.519, "rewards/accuracies": 1.0, "rewards/chosen": 5.310069561004639, "rewards/margins": 0.24954843521118164, "rewards/rejected": 5.060521125793457, "step": 4441 }, { "epoch": 0.72, "learning_rate": 9.276603423579164e-07, "logits/chosen": -0.3791086971759796, "logits/rejected": -0.342572957277298, "logps/chosen": -102.97087097167969, "logps/rejected": -66.09335327148438, "loss": 1.8427, "rewards/accuracies": 0.0, "rewards/chosen": 0.2812759578227997, "rewards/margins": -2.555945634841919, "rewards/rejected": 2.837221622467041, "step": 4442 }, { "epoch": 0.72, "learning_rate": 9.275922362251105e-07, "logits/chosen": -0.5255663990974426, "logits/rejected": -0.5283852815628052, "logps/chosen": -9.77403736114502, "logps/rejected": -1.0372021198272705, "loss": 0.4441, "rewards/accuracies": 0.0, "rewards/chosen": -0.10936222225427628, "rewards/margins": -0.32968562841415405, "rewards/rejected": 0.22032339870929718, "step": 4443 }, { "epoch": 0.72, "learning_rate": 9.27524100549727e-07, "logits/chosen": -0.619667649269104, "logits/rejected": -0.6159442663192749, "logps/chosen": -45.047855377197266, "logps/rejected": -61.37621307373047, "loss": 0.6764, "rewards/accuracies": 0.0, "rewards/chosen": -0.014894867315888405, "rewards/margins": -0.25028496980667114, "rewards/rejected": 0.2353900969028473, "step": 4444 }, { "epoch": 0.72, "learning_rate": 9.274559353364733e-07, "logits/chosen": -0.8481937646865845, "logits/rejected": -0.8978702425956726, "logps/chosen": -113.37872314453125, "logps/rejected": -102.83415222167969, "loss": 0.9792, "rewards/accuracies": 0.0, "rewards/chosen": 0.5963608026504517, "rewards/margins": -0.504492998123169, "rewards/rejected": 1.1008538007736206, "step": 4445 }, { "epoch": 0.72, "learning_rate": 9.273877405900592e-07, "logits/chosen": -0.32727059721946716, "logits/rejected": -0.32727059721946716, "logps/chosen": -1.8064450025558472, "logps/rejected": -1.8064450025558472, "loss": 0.7837, "rewards/accuracies": 0.0, "rewards/chosen": 0.18631206452846527, "rewards/margins": 0.0, "rewards/rejected": 0.18631206452846527, "step": 4446 }, { "epoch": 0.72, "learning_rate": 9.273195163151962e-07, "logits/chosen": -0.7465245127677917, "logits/rejected": -0.7217717170715332, "logps/chosen": -78.01221466064453, "logps/rejected": -4.249942779541016, "loss": 2.1549, "rewards/accuracies": 1.0, "rewards/chosen": 1.3717941045761108, "rewards/margins": 0.7539529204368591, "rewards/rejected": 0.6178411841392517, "step": 4447 }, { "epoch": 0.72, "learning_rate": 9.272512625165978e-07, "logits/chosen": -0.2502056360244751, "logits/rejected": -0.2612419128417969, "logps/chosen": -20.961669921875, "logps/rejected": -34.349517822265625, "loss": 1.6896, "rewards/accuracies": 0.0, "rewards/chosen": 0.8689729571342468, "rewards/margins": -0.42054086923599243, "rewards/rejected": 1.2895138263702393, "step": 4448 }, { "epoch": 0.72, "learning_rate": 9.2718297919898e-07, "logits/chosen": -0.1901608258485794, "logits/rejected": -0.20198936760425568, "logps/chosen": -3.1016769409179688, "logps/rejected": -18.50173568725586, "loss": 1.6283, "rewards/accuracies": 0.0, "rewards/chosen": 0.20095311105251312, "rewards/margins": -0.14562930166721344, "rewards/rejected": 0.34658241271972656, "step": 4449 }, { "epoch": 0.72, "learning_rate": 9.271146663670604e-07, "logits/chosen": -0.29493242502212524, "logits/rejected": -0.29493242502212524, "logps/chosen": -0.5992403626441956, "logps/rejected": -0.5992403626441956, "loss": 0.4717, "rewards/accuracies": 0.0, "rewards/chosen": 0.21406905353069305, "rewards/margins": 0.0, "rewards/rejected": 0.21406905353069305, "step": 4450 }, { "epoch": 0.72, "learning_rate": 9.270463240255587e-07, "logits/chosen": -0.5691829919815063, "logits/rejected": -0.48189017176628113, "logps/chosen": -71.82653045654297, "logps/rejected": -94.86722564697266, "loss": 0.5497, "rewards/accuracies": 1.0, "rewards/chosen": 2.7444443702697754, "rewards/margins": 0.10682916641235352, "rewards/rejected": 2.637615203857422, "step": 4451 }, { "epoch": 0.72, "learning_rate": 9.269779521791967e-07, "logits/chosen": -0.9343696236610413, "logits/rejected": -0.8584387898445129, "logps/chosen": -88.66572570800781, "logps/rejected": -80.51922607421875, "loss": 0.3787, "rewards/accuracies": 1.0, "rewards/chosen": 2.3255012035369873, "rewards/margins": 0.25191402435302734, "rewards/rejected": 2.07358717918396, "step": 4452 }, { "epoch": 0.72, "learning_rate": 9.269095508326985e-07, "logits/chosen": -0.7560783624649048, "logits/rejected": -0.7159688472747803, "logps/chosen": -140.44772338867188, "logps/rejected": -72.868896484375, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 3.265738010406494, "rewards/margins": 1.7568939924240112, "rewards/rejected": 1.508844017982483, "step": 4453 }, { "epoch": 0.72, "learning_rate": 9.268411199907896e-07, "logits/chosen": -0.42058032751083374, "logits/rejected": -0.39573943614959717, "logps/chosen": -28.391883850097656, "logps/rejected": -5.591991901397705, "loss": 1.452, "rewards/accuracies": 0.0, "rewards/chosen": 0.02477550506591797, "rewards/margins": -0.6792787313461304, "rewards/rejected": 0.7040542364120483, "step": 4454 }, { "epoch": 0.72, "learning_rate": 9.267726596581982e-07, "logits/chosen": -0.518571674823761, "logits/rejected": -0.5052822232246399, "logps/chosen": -69.58151245117188, "logps/rejected": -91.37213134765625, "loss": 2.2289, "rewards/accuracies": 0.0, "rewards/chosen": 1.6583572626113892, "rewards/margins": -1.3002265691757202, "rewards/rejected": 2.9585838317871094, "step": 4455 }, { "epoch": 0.72, "learning_rate": 9.267041698396543e-07, "logits/chosen": -0.40587303042411804, "logits/rejected": -0.3456282317638397, "logps/chosen": -82.88166809082031, "logps/rejected": -121.40788269042969, "loss": 1.1134, "rewards/accuracies": 0.0, "rewards/chosen": 2.5099990367889404, "rewards/margins": -0.4332306385040283, "rewards/rejected": 2.9432296752929688, "step": 4456 }, { "epoch": 0.72, "learning_rate": 9.266356505398896e-07, "logits/chosen": -0.7486363053321838, "logits/rejected": -0.7669616937637329, "logps/chosen": -159.07936096191406, "logps/rejected": -48.02180099487305, "loss": 0.2898, "rewards/accuracies": 1.0, "rewards/chosen": 3.2138137817382812, "rewards/margins": 0.3613109588623047, "rewards/rejected": 2.8525028228759766, "step": 4457 }, { "epoch": 0.72, "learning_rate": 9.265671017636382e-07, "logits/chosen": -0.7197182178497314, "logits/rejected": -0.8101359009742737, "logps/chosen": -143.2694091796875, "logps/rejected": -207.12063598632812, "loss": 1.2331, "rewards/accuracies": 0.0, "rewards/chosen": 3.5519044399261475, "rewards/margins": -2.2133023738861084, "rewards/rejected": 5.765206813812256, "step": 4458 }, { "epoch": 0.72, "learning_rate": 9.264985235156366e-07, "logits/chosen": -0.5920026302337646, "logits/rejected": -0.5591976046562195, "logps/chosen": -89.73194122314453, "logps/rejected": -112.31041717529297, "loss": 0.2431, "rewards/accuracies": 1.0, "rewards/chosen": 1.192853569984436, "rewards/margins": 1.2671082019805908, "rewards/rejected": -0.074254609644413, "step": 4459 }, { "epoch": 0.72, "learning_rate": 9.264299158006223e-07, "logits/chosen": -0.4107578992843628, "logits/rejected": -0.46438121795654297, "logps/chosen": -110.61672973632812, "logps/rejected": -145.08444213867188, "loss": 2.0009, "rewards/accuracies": 0.0, "rewards/chosen": 1.2134827375411987, "rewards/margins": -3.3817243576049805, "rewards/rejected": 4.595207214355469, "step": 4460 }, { "epoch": 0.72, "learning_rate": 9.263612786233359e-07, "logits/chosen": -0.58193039894104, "logits/rejected": -0.6193259954452515, "logps/chosen": -40.10250473022461, "logps/rejected": -3.542603015899658, "loss": 1.2322, "rewards/accuracies": 0.0, "rewards/chosen": 0.0053916932083666325, "rewards/margins": -0.489271879196167, "rewards/rejected": 0.4946635663509369, "step": 4461 }, { "epoch": 0.72, "learning_rate": 9.262926119885194e-07, "logits/chosen": -0.5525349378585815, "logits/rejected": -0.5514163970947266, "logps/chosen": -68.0959243774414, "logps/rejected": -87.98665618896484, "loss": 0.5577, "rewards/accuracies": 1.0, "rewards/chosen": 1.686348795890808, "rewards/margins": 1.2523574829101562, "rewards/rejected": 0.4339912533760071, "step": 4462 }, { "epoch": 0.72, "learning_rate": 9.262239159009171e-07, "logits/chosen": -0.6780915856361389, "logits/rejected": -0.6128690838813782, "logps/chosen": -75.78970336914062, "logps/rejected": -59.71596145629883, "loss": 0.6628, "rewards/accuracies": 1.0, "rewards/chosen": 3.164738416671753, "rewards/margins": 1.1506106853485107, "rewards/rejected": 2.014127731323242, "step": 4463 }, { "epoch": 0.72, "learning_rate": 9.261551903652751e-07, "logits/chosen": -0.7102136611938477, "logits/rejected": -0.6581602692604065, "logps/chosen": -179.15484619140625, "logps/rejected": -46.384620666503906, "loss": 0.0773, "rewards/accuracies": 1.0, "rewards/chosen": 5.753225803375244, "rewards/margins": 3.528517246246338, "rewards/rejected": 2.2247085571289062, "step": 4464 }, { "epoch": 0.72, "learning_rate": 9.260864353863418e-07, "logits/chosen": -0.6154959201812744, "logits/rejected": -0.571302056312561, "logps/chosen": -56.3001708984375, "logps/rejected": -77.83741760253906, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 2.0977463722229004, "rewards/margins": 0.9429672956466675, "rewards/rejected": 1.154779076576233, "step": 4465 }, { "epoch": 0.72, "learning_rate": 9.260176509688673e-07, "logits/chosen": -0.8726900219917297, "logits/rejected": -0.9581258296966553, "logps/chosen": -211.03115844726562, "logps/rejected": -158.92343139648438, "loss": 1.3261, "rewards/accuracies": 0.0, "rewards/chosen": 3.7931549549102783, "rewards/margins": -1.3857543468475342, "rewards/rejected": 5.1789093017578125, "step": 4466 }, { "epoch": 0.73, "learning_rate": 9.259488371176043e-07, "logits/chosen": -0.7191573977470398, "logits/rejected": -0.7015841603279114, "logps/chosen": -68.796875, "logps/rejected": -52.157405853271484, "loss": 0.858, "rewards/accuracies": 0.0, "rewards/chosen": 0.7930518984794617, "rewards/margins": -0.6183056235313416, "rewards/rejected": 1.4113575220108032, "step": 4467 }, { "epoch": 0.73, "learning_rate": 9.25879993837307e-07, "logits/chosen": -0.7139649391174316, "logits/rejected": -0.6887525320053101, "logps/chosen": -123.5420150756836, "logps/rejected": -95.56553649902344, "loss": 1.1988, "rewards/accuracies": 0.0, "rewards/chosen": 1.614484429359436, "rewards/margins": -0.5382834672927856, "rewards/rejected": 2.1527678966522217, "step": 4468 }, { "epoch": 0.73, "learning_rate": 9.258111211327319e-07, "logits/chosen": -0.7718206644058228, "logits/rejected": -0.7756332159042358, "logps/chosen": -124.45614624023438, "logps/rejected": -108.08387756347656, "loss": 1.0937, "rewards/accuracies": 1.0, "rewards/chosen": 0.5879928469657898, "rewards/margins": 0.2207397222518921, "rewards/rejected": 0.3672531247138977, "step": 4469 }, { "epoch": 0.73, "learning_rate": 9.257422190086372e-07, "logits/chosen": -0.6255404949188232, "logits/rejected": -0.5583403706550598, "logps/chosen": -116.05284118652344, "logps/rejected": -112.32441711425781, "loss": 1.244, "rewards/accuracies": 0.0, "rewards/chosen": 1.3554047346115112, "rewards/margins": -1.2079421281814575, "rewards/rejected": 2.5633468627929688, "step": 4470 }, { "epoch": 0.73, "learning_rate": 9.256732874697838e-07, "logits/chosen": -0.5061609745025635, "logits/rejected": -0.4686611592769623, "logps/chosen": -46.91802978515625, "logps/rejected": -68.60784149169922, "loss": 0.821, "rewards/accuracies": 0.0, "rewards/chosen": 1.195076823234558, "rewards/margins": -0.6491721868515015, "rewards/rejected": 1.8442490100860596, "step": 4471 }, { "epoch": 0.73, "learning_rate": 9.256043265209339e-07, "logits/chosen": -1.088003158569336, "logits/rejected": -0.9586992263793945, "logps/chosen": -126.20995330810547, "logps/rejected": -32.27751159667969, "loss": 0.0682, "rewards/accuracies": 1.0, "rewards/chosen": 4.547950267791748, "rewards/margins": 4.003906726837158, "rewards/rejected": 0.5440437197685242, "step": 4472 }, { "epoch": 0.73, "learning_rate": 9.255353361668521e-07, "logits/chosen": -0.7007997632026672, "logits/rejected": -0.5367834568023682, "logps/chosen": -152.25486755371094, "logps/rejected": -93.97781372070312, "loss": 1.2991, "rewards/accuracies": 1.0, "rewards/chosen": 4.362575054168701, "rewards/margins": 2.5575242042541504, "rewards/rejected": 1.8050507307052612, "step": 4473 }, { "epoch": 0.73, "learning_rate": 9.254663164123051e-07, "logits/chosen": -0.5494135022163391, "logits/rejected": -0.4328748285770416, "logps/chosen": -77.07874298095703, "logps/rejected": -35.745819091796875, "loss": 0.8495, "rewards/accuracies": 1.0, "rewards/chosen": 1.439143419265747, "rewards/margins": 1.4439525604248047, "rewards/rejected": -0.00480918912217021, "step": 4474 }, { "epoch": 0.73, "learning_rate": 9.253972672620614e-07, "logits/chosen": -0.1285722404718399, "logits/rejected": -0.1285722404718399, "logps/chosen": -68.04087829589844, "logps/rejected": -68.04087829589844, "loss": 0.3813, "rewards/accuracies": 0.0, "rewards/chosen": 0.587860107421875, "rewards/margins": 0.0, "rewards/rejected": 0.587860107421875, "step": 4475 }, { "epoch": 0.73, "learning_rate": 9.253281887208917e-07, "logits/chosen": -0.8610523343086243, "logits/rejected": -0.7418348789215088, "logps/chosen": -72.959228515625, "logps/rejected": -25.288976669311523, "loss": 1.1792, "rewards/accuracies": 0.0, "rewards/chosen": 0.1377410888671875, "rewards/margins": -0.36591970920562744, "rewards/rejected": 0.5036607980728149, "step": 4476 }, { "epoch": 0.73, "learning_rate": 9.252590807935685e-07, "logits/chosen": -1.0752695798873901, "logits/rejected": -1.0692170858383179, "logps/chosen": -70.5290756225586, "logps/rejected": -80.49102783203125, "loss": 0.797, "rewards/accuracies": 1.0, "rewards/chosen": 1.5423065423965454, "rewards/margins": 0.04636991024017334, "rewards/rejected": 1.495936632156372, "step": 4477 }, { "epoch": 0.73, "learning_rate": 9.251899434848669e-07, "logits/chosen": -0.3516583740711212, "logits/rejected": -0.30924221873283386, "logps/chosen": -47.04175567626953, "logps/rejected": -60.052520751953125, "loss": 0.4741, "rewards/accuracies": 1.0, "rewards/chosen": 1.9543602466583252, "rewards/margins": 1.137209415435791, "rewards/rejected": 0.817150890827179, "step": 4478 }, { "epoch": 0.73, "learning_rate": 9.251207767995632e-07, "logits/chosen": -0.38674479722976685, "logits/rejected": -0.3530738353729248, "logps/chosen": -134.9197235107422, "logps/rejected": -117.09505462646484, "loss": 1.7518, "rewards/accuracies": 0.0, "rewards/chosen": 0.7259308099746704, "rewards/margins": -2.1846399307250977, "rewards/rejected": 2.9105706214904785, "step": 4479 }, { "epoch": 0.73, "learning_rate": 9.250515807424364e-07, "logits/chosen": -0.3614422678947449, "logits/rejected": -0.421167254447937, "logps/chosen": -85.96583557128906, "logps/rejected": -59.19133758544922, "loss": 1.2538, "rewards/accuracies": 0.0, "rewards/chosen": 0.4428520202636719, "rewards/margins": -0.7698951959609985, "rewards/rejected": 1.2127472162246704, "step": 4480 }, { "epoch": 0.73, "learning_rate": 9.249823553182674e-07, "logits/chosen": -0.8126763701438904, "logits/rejected": -0.7454639077186584, "logps/chosen": -94.25475311279297, "logps/rejected": -75.80537414550781, "loss": 0.5719, "rewards/accuracies": 0.0, "rewards/chosen": 2.235811710357666, "rewards/margins": -0.0013053417205810547, "rewards/rejected": 2.237117052078247, "step": 4481 }, { "epoch": 0.73, "learning_rate": 9.249131005318387e-07, "logits/chosen": -0.5142086744308472, "logits/rejected": -0.4517960846424103, "logps/chosen": -62.50617980957031, "logps/rejected": -114.74267578125, "loss": 0.2486, "rewards/accuracies": 1.0, "rewards/chosen": 1.82243812084198, "rewards/margins": 0.5064438581466675, "rewards/rejected": 1.3159942626953125, "step": 4482 }, { "epoch": 0.73, "learning_rate": 9.248438163879353e-07, "logits/chosen": -0.5765714049339294, "logits/rejected": -0.49672478437423706, "logps/chosen": -128.60873413085938, "logps/rejected": -128.1753387451172, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": 3.4423065185546875, "rewards/margins": 0.8215880393981934, "rewards/rejected": 2.620718479156494, "step": 4483 }, { "epoch": 0.73, "learning_rate": 9.247745028913442e-07, "logits/chosen": -0.7331507802009583, "logits/rejected": -0.7926045060157776, "logps/chosen": -196.66860961914062, "logps/rejected": -108.35220336914062, "loss": 0.4932, "rewards/accuracies": 1.0, "rewards/chosen": 3.4342010021209717, "rewards/margins": 1.074758768081665, "rewards/rejected": 2.3594422340393066, "step": 4484 }, { "epoch": 0.73, "learning_rate": 9.247051600468541e-07, "logits/chosen": -0.940640926361084, "logits/rejected": -0.8977195620536804, "logps/chosen": -69.72651672363281, "logps/rejected": -106.87472534179688, "loss": 0.4335, "rewards/accuracies": 0.0, "rewards/chosen": 1.799098253250122, "rewards/margins": -0.26760339736938477, "rewards/rejected": 2.066701650619507, "step": 4485 }, { "epoch": 0.73, "learning_rate": 9.246357878592561e-07, "logits/chosen": -0.4422588646411896, "logits/rejected": -0.3782367408275604, "logps/chosen": -65.3553695678711, "logps/rejected": -61.1051139831543, "loss": 1.1461, "rewards/accuracies": 0.0, "rewards/chosen": 0.6191062927246094, "rewards/margins": -0.6638065576553345, "rewards/rejected": 1.2829128503799438, "step": 4486 }, { "epoch": 0.73, "learning_rate": 9.245663863333432e-07, "logits/chosen": -0.6995888948440552, "logits/rejected": -0.694485068321228, "logps/chosen": -104.98869323730469, "logps/rejected": -125.94422149658203, "loss": 0.546, "rewards/accuracies": 0.0, "rewards/chosen": 0.7949935793876648, "rewards/margins": -0.15615618228912354, "rewards/rejected": 0.9511497616767883, "step": 4487 }, { "epoch": 0.73, "learning_rate": 9.244969554739101e-07, "logits/chosen": -0.6330756545066833, "logits/rejected": -0.5836802124977112, "logps/chosen": -88.2959213256836, "logps/rejected": -41.97698211669922, "loss": 0.0776, "rewards/accuracies": 1.0, "rewards/chosen": 2.2674758434295654, "rewards/margins": 1.8946692943572998, "rewards/rejected": 0.3728065490722656, "step": 4488 }, { "epoch": 0.73, "learning_rate": 9.244274952857542e-07, "logits/chosen": -0.07925001531839371, "logits/rejected": -0.09765234589576721, "logps/chosen": -12.82046890258789, "logps/rejected": -31.63747215270996, "loss": 1.4693, "rewards/accuracies": 0.0, "rewards/chosen": -0.17168740928173065, "rewards/margins": -0.4777102470397949, "rewards/rejected": 0.30602285265922546, "step": 4489 }, { "epoch": 0.73, "learning_rate": 9.243580057736742e-07, "logits/chosen": -0.42027294635772705, "logits/rejected": -0.44558683037757874, "logps/chosen": -94.33495330810547, "logps/rejected": -47.048797607421875, "loss": 1.1042, "rewards/accuracies": 0.0, "rewards/chosen": 1.9688278436660767, "rewards/margins": -0.6119240522384644, "rewards/rejected": 2.580751895904541, "step": 4490 }, { "epoch": 0.73, "learning_rate": 9.242884869424714e-07, "logits/chosen": -0.4643772542476654, "logits/rejected": -0.4556260406970978, "logps/chosen": -91.72311401367188, "logps/rejected": -113.84536743164062, "loss": 1.0513, "rewards/accuracies": 0.0, "rewards/chosen": 0.33145904541015625, "rewards/margins": -1.675886631011963, "rewards/rejected": 2.007345676422119, "step": 4491 }, { "epoch": 0.73, "learning_rate": 9.242189387969488e-07, "logits/chosen": -0.7163199186325073, "logits/rejected": -0.586554765701294, "logps/chosen": -111.8287353515625, "logps/rejected": -59.775474548339844, "loss": 1.1526, "rewards/accuracies": 1.0, "rewards/chosen": 4.754147529602051, "rewards/margins": 3.46095609664917, "rewards/rejected": 1.2931915521621704, "step": 4492 }, { "epoch": 0.73, "learning_rate": 9.241493613419114e-07, "logits/chosen": -0.9629707932472229, "logits/rejected": -0.9156345129013062, "logps/chosen": -143.93643188476562, "logps/rejected": -84.06076049804688, "loss": 0.8017, "rewards/accuracies": 1.0, "rewards/chosen": 5.229071140289307, "rewards/margins": 3.823697566986084, "rewards/rejected": 1.405373454093933, "step": 4493 }, { "epoch": 0.73, "learning_rate": 9.240797545821666e-07, "logits/chosen": -0.5753885507583618, "logits/rejected": -0.5720141530036926, "logps/chosen": -25.428016662597656, "logps/rejected": -36.897003173828125, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.09552879631519318, "rewards/margins": 0.03483162075281143, "rewards/rejected": 0.060697175562381744, "step": 4494 }, { "epoch": 0.73, "learning_rate": 9.240101185225233e-07, "logits/chosen": -0.7097488045692444, "logits/rejected": -0.5532420873641968, "logps/chosen": -102.96963500976562, "logps/rejected": -96.35186004638672, "loss": 2.0648, "rewards/accuracies": 0.0, "rewards/chosen": 1.4821723699569702, "rewards/margins": -1.080985426902771, "rewards/rejected": 2.563157796859741, "step": 4495 }, { "epoch": 0.73, "learning_rate": 9.23940453167793e-07, "logits/chosen": -0.6770290732383728, "logits/rejected": -0.6770290732383728, "logps/chosen": -76.89520263671875, "logps/rejected": -76.89520263671875, "loss": 0.3968, "rewards/accuracies": 0.0, "rewards/chosen": 2.7123076915740967, "rewards/margins": 0.0, "rewards/rejected": 2.7123076915740967, "step": 4496 }, { "epoch": 0.73, "learning_rate": 9.238707585227886e-07, "logits/chosen": -0.24640484154224396, "logits/rejected": -0.29380667209625244, "logps/chosen": -76.34183502197266, "logps/rejected": -70.6279296875, "loss": 0.7265, "rewards/accuracies": 0.0, "rewards/chosen": 0.4763984680175781, "rewards/margins": -0.686719536781311, "rewards/rejected": 1.1631180047988892, "step": 4497 }, { "epoch": 0.73, "learning_rate": 9.238010345923256e-07, "logits/chosen": -0.7226834297180176, "logits/rejected": -0.8031399846076965, "logps/chosen": -73.12886047363281, "logps/rejected": -84.76056671142578, "loss": 1.1021, "rewards/accuracies": 0.0, "rewards/chosen": 1.0741074085235596, "rewards/margins": -1.9073104858398438, "rewards/rejected": 2.9814178943634033, "step": 4498 }, { "epoch": 0.73, "learning_rate": 9.237312813812212e-07, "logits/chosen": -0.6251843571662903, "logits/rejected": -0.628593921661377, "logps/chosen": -61.988197326660156, "logps/rejected": -67.6791763305664, "loss": 1.0828, "rewards/accuracies": 0.0, "rewards/chosen": 2.0050697326660156, "rewards/margins": -1.21185302734375, "rewards/rejected": 3.2169227600097656, "step": 4499 }, { "epoch": 0.73, "learning_rate": 9.236614988942945e-07, "logits/chosen": -0.6682198643684387, "logits/rejected": -0.48190274834632874, "logps/chosen": -118.11032104492188, "logps/rejected": -141.63636779785156, "loss": 0.7945, "rewards/accuracies": 0.0, "rewards/chosen": 5.264794826507568, "rewards/margins": -0.5070972442626953, "rewards/rejected": 5.771892070770264, "step": 4500 }, { "epoch": 0.73, "learning_rate": 9.23591687136367e-07, "logits/chosen": -0.3426834046840668, "logits/rejected": -0.3495168685913086, "logps/chosen": -59.757240295410156, "logps/rejected": -71.26678466796875, "loss": 0.7117, "rewards/accuracies": 0.0, "rewards/chosen": 1.2639518976211548, "rewards/margins": -0.24067997932434082, "rewards/rejected": 1.5046318769454956, "step": 4501 }, { "epoch": 0.73, "learning_rate": 9.23521846112262e-07, "logits/chosen": -0.6352595090866089, "logits/rejected": -0.6771734356880188, "logps/chosen": -64.17144012451172, "logps/rejected": -92.60591125488281, "loss": 0.7341, "rewards/accuracies": 0.0, "rewards/chosen": 0.8567054867744446, "rewards/margins": -1.1705849170684814, "rewards/rejected": 2.0272903442382812, "step": 4502 }, { "epoch": 0.73, "learning_rate": 9.234519758268047e-07, "logits/chosen": -0.6756551265716553, "logits/rejected": -0.7169656157493591, "logps/chosen": -80.72235107421875, "logps/rejected": -67.15312957763672, "loss": 1.9399, "rewards/accuracies": 0.0, "rewards/chosen": 1.4318389892578125, "rewards/margins": -0.48697590827941895, "rewards/rejected": 1.9188148975372314, "step": 4503 }, { "epoch": 0.73, "learning_rate": 9.233820762848228e-07, "logits/chosen": -0.8753997683525085, "logits/rejected": -0.8849860429763794, "logps/chosen": -71.2657699584961, "logps/rejected": -41.90720748901367, "loss": 0.3454, "rewards/accuracies": 1.0, "rewards/chosen": 1.3578239679336548, "rewards/margins": 0.023848295211791992, "rewards/rejected": 1.3339756727218628, "step": 4504 }, { "epoch": 0.73, "learning_rate": 9.233121474911454e-07, "logits/chosen": -0.21408437192440033, "logits/rejected": -0.21235127747058868, "logps/chosen": -4.383726119995117, "logps/rejected": -10.854249954223633, "loss": 0.394, "rewards/accuracies": 1.0, "rewards/chosen": 0.13381238281726837, "rewards/margins": 0.17757868766784668, "rewards/rejected": -0.04376630857586861, "step": 4505 }, { "epoch": 0.73, "learning_rate": 9.232421894506042e-07, "logits/chosen": -0.6443017721176147, "logits/rejected": -0.6662771105766296, "logps/chosen": -66.83268737792969, "logps/rejected": -76.95298767089844, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 1.6858993768692017, "rewards/margins": 0.01988983154296875, "rewards/rejected": 1.666009545326233, "step": 4506 }, { "epoch": 0.73, "learning_rate": 9.231722021680321e-07, "logits/chosen": -0.5901490449905396, "logits/rejected": -0.5725196003913879, "logps/chosen": -43.03741455078125, "logps/rejected": -67.29530334472656, "loss": 0.8169, "rewards/accuracies": 1.0, "rewards/chosen": 2.580221652984619, "rewards/margins": 0.15805363655090332, "rewards/rejected": 2.422168016433716, "step": 4507 }, { "epoch": 0.73, "learning_rate": 9.231021856482652e-07, "logits/chosen": -0.6631198525428772, "logits/rejected": -0.46781569719314575, "logps/chosen": -137.53009033203125, "logps/rejected": -61.6448974609375, "loss": 0.8848, "rewards/accuracies": 1.0, "rewards/chosen": 3.7743194103240967, "rewards/margins": 2.5394179821014404, "rewards/rejected": 1.2349014282226562, "step": 4508 }, { "epoch": 0.73, "learning_rate": 9.230321398961407e-07, "logits/chosen": -0.9024990797042847, "logits/rejected": -0.956902027130127, "logps/chosen": -237.81326293945312, "logps/rejected": -171.07533264160156, "loss": 0.2588, "rewards/accuracies": 1.0, "rewards/chosen": 3.991687059402466, "rewards/margins": 2.6214981079101562, "rewards/rejected": 1.3701889514923096, "step": 4509 }, { "epoch": 0.73, "learning_rate": 9.22962064916498e-07, "logits/chosen": -0.3776054084300995, "logits/rejected": -0.34043580293655396, "logps/chosen": -75.9739990234375, "logps/rejected": -68.46691131591797, "loss": 0.566, "rewards/accuracies": 0.0, "rewards/chosen": 0.8912857174873352, "rewards/margins": -0.31319814920425415, "rewards/rejected": 1.2044838666915894, "step": 4510 }, { "epoch": 0.73, "learning_rate": 9.228919607141787e-07, "logits/chosen": -0.46581119298934937, "logits/rejected": -0.4523060917854309, "logps/chosen": -1.1908286809921265, "logps/rejected": -19.233877182006836, "loss": 0.6402, "rewards/accuracies": 1.0, "rewards/chosen": 0.41777536273002625, "rewards/margins": 0.22423554956912994, "rewards/rejected": 0.1935398131608963, "step": 4511 }, { "epoch": 0.73, "learning_rate": 9.228218272940265e-07, "logits/chosen": -0.3978612422943115, "logits/rejected": -0.5251027941703796, "logps/chosen": -124.21436309814453, "logps/rejected": -143.65032958984375, "loss": 1.7346, "rewards/accuracies": 0.0, "rewards/chosen": 0.3983146846294403, "rewards/margins": -2.180518388748169, "rewards/rejected": 2.5788331031799316, "step": 4512 }, { "epoch": 0.73, "learning_rate": 9.227516646608868e-07, "logits/chosen": -0.5425582528114319, "logits/rejected": -0.478976845741272, "logps/chosen": -44.67076110839844, "logps/rejected": -59.27037048339844, "loss": 0.3984, "rewards/accuracies": 1.0, "rewards/chosen": 1.8853164911270142, "rewards/margins": 0.16200411319732666, "rewards/rejected": 1.7233123779296875, "step": 4513 }, { "epoch": 0.73, "learning_rate": 9.226814728196071e-07, "logits/chosen": -0.5987380146980286, "logits/rejected": -0.5616615414619446, "logps/chosen": -102.02067565917969, "logps/rejected": -19.817852020263672, "loss": 1.323, "rewards/accuracies": 0.0, "rewards/chosen": -0.04459686204791069, "rewards/margins": -0.46863269805908203, "rewards/rejected": 0.42403584718704224, "step": 4514 }, { "epoch": 0.73, "learning_rate": 9.226112517750371e-07, "logits/chosen": -0.3003918528556824, "logits/rejected": -0.3426503837108612, "logps/chosen": -41.298851013183594, "logps/rejected": -114.35549926757812, "loss": 0.6289, "rewards/accuracies": 0.0, "rewards/chosen": 0.18729935586452484, "rewards/margins": -0.27960890531539917, "rewards/rejected": 0.4669082760810852, "step": 4515 }, { "epoch": 0.73, "learning_rate": 9.225410015320284e-07, "logits/chosen": -0.6941034197807312, "logits/rejected": -0.30128636956214905, "logps/chosen": -60.47850799560547, "logps/rejected": -102.1139144897461, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 4.722538948059082, "rewards/margins": 0.0893106460571289, "rewards/rejected": 4.633228302001953, "step": 4516 }, { "epoch": 0.73, "learning_rate": 9.224707220954346e-07, "logits/chosen": -0.536121666431427, "logits/rejected": -0.536121666431427, "logps/chosen": -82.32286071777344, "logps/rejected": -82.32286071777344, "loss": 0.4769, "rewards/accuracies": 0.0, "rewards/chosen": 1.5397347211837769, "rewards/margins": 0.0, "rewards/rejected": 1.5397347211837769, "step": 4517 }, { "epoch": 0.73, "learning_rate": 9.224004134701113e-07, "logits/chosen": -0.5690377354621887, "logits/rejected": -0.48551493883132935, "logps/chosen": -61.823387145996094, "logps/rejected": -85.08598327636719, "loss": 0.8889, "rewards/accuracies": 0.0, "rewards/chosen": 1.0919106006622314, "rewards/margins": -0.46729278564453125, "rewards/rejected": 1.5592033863067627, "step": 4518 }, { "epoch": 0.73, "learning_rate": 9.223300756609164e-07, "logits/chosen": -0.6759617328643799, "logits/rejected": -0.7129489779472351, "logps/chosen": -92.93754577636719, "logps/rejected": -98.96649932861328, "loss": 1.1416, "rewards/accuracies": 1.0, "rewards/chosen": 1.527502417564392, "rewards/margins": 0.5705916881561279, "rewards/rejected": 0.9569107294082642, "step": 4519 }, { "epoch": 0.73, "learning_rate": 9.222597086727093e-07, "logits/chosen": -0.2245071977376938, "logits/rejected": -0.2900964915752411, "logps/chosen": -75.86152648925781, "logps/rejected": -67.65515899658203, "loss": 0.8019, "rewards/accuracies": 0.0, "rewards/chosen": 1.0417816638946533, "rewards/margins": -0.45129168033599854, "rewards/rejected": 1.4930733442306519, "step": 4520 }, { "epoch": 0.73, "learning_rate": 9.221893125103517e-07, "logits/chosen": -0.23840942978858948, "logits/rejected": -0.18475449085235596, "logps/chosen": -72.1017074584961, "logps/rejected": -81.16759490966797, "loss": 0.2787, "rewards/accuracies": 1.0, "rewards/chosen": 1.9502747058868408, "rewards/margins": 0.6575736999511719, "rewards/rejected": 1.292701005935669, "step": 4521 }, { "epoch": 0.73, "learning_rate": 9.221188871787075e-07, "logits/chosen": -0.6611543297767639, "logits/rejected": -0.6611543297767639, "logps/chosen": -65.5212631225586, "logps/rejected": -65.5212631225586, "loss": 0.3923, "rewards/accuracies": 0.0, "rewards/chosen": 1.7399208545684814, "rewards/margins": 0.0, "rewards/rejected": 1.7399208545684814, "step": 4522 }, { "epoch": 0.73, "learning_rate": 9.220484326826422e-07, "logits/chosen": -0.8349273204803467, "logits/rejected": -0.7874026894569397, "logps/chosen": -107.65129089355469, "logps/rejected": -65.2458267211914, "loss": 0.7301, "rewards/accuracies": 0.0, "rewards/chosen": 0.25316619873046875, "rewards/margins": -0.922620415687561, "rewards/rejected": 1.1757866144180298, "step": 4523 }, { "epoch": 0.73, "learning_rate": 9.219779490270237e-07, "logits/chosen": -0.9102550745010376, "logits/rejected": -0.884361207485199, "logps/chosen": -133.387939453125, "logps/rejected": -82.50260925292969, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 1.501007080078125, "rewards/margins": 0.2938476800918579, "rewards/rejected": 1.207159399986267, "step": 4524 }, { "epoch": 0.73, "learning_rate": 9.219074362167218e-07, "logits/chosen": -0.90683513879776, "logits/rejected": -0.8442291021347046, "logps/chosen": -145.99502563476562, "logps/rejected": -86.71737670898438, "loss": 0.2426, "rewards/accuracies": 1.0, "rewards/chosen": 4.007546901702881, "rewards/margins": 3.103607177734375, "rewards/rejected": 0.9039398431777954, "step": 4525 }, { "epoch": 0.73, "learning_rate": 9.218368942566081e-07, "logits/chosen": -0.20735371112823486, "logits/rejected": -0.12968027591705322, "logps/chosen": -75.17060852050781, "logps/rejected": -61.10948181152344, "loss": 1.0743, "rewards/accuracies": 1.0, "rewards/chosen": 2.1978211402893066, "rewards/margins": 1.5359848737716675, "rewards/rejected": 0.6618362665176392, "step": 4526 }, { "epoch": 0.73, "learning_rate": 9.217663231515566e-07, "logits/chosen": -0.702272355556488, "logits/rejected": -0.6472679972648621, "logps/chosen": -101.40397644042969, "logps/rejected": -73.7436294555664, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 1.7034225463867188, "rewards/margins": 0.11211013793945312, "rewards/rejected": 1.5913124084472656, "step": 4527 }, { "epoch": 0.73, "learning_rate": 9.216957229064428e-07, "logits/chosen": -0.5887742638587952, "logits/rejected": -0.526350200176239, "logps/chosen": -93.52333068847656, "logps/rejected": -71.63973999023438, "loss": 0.8128, "rewards/accuracies": 1.0, "rewards/chosen": 2.1194214820861816, "rewards/margins": 0.7493348121643066, "rewards/rejected": 1.370086669921875, "step": 4528 }, { "epoch": 0.74, "learning_rate": 9.216250935261447e-07, "logits/chosen": -0.3418900966644287, "logits/rejected": -0.3418900966644287, "logps/chosen": -28.520414352416992, "logps/rejected": -28.520414352416992, "loss": 0.3722, "rewards/accuracies": 0.0, "rewards/chosen": 0.09842319786548615, "rewards/margins": 0.0, "rewards/rejected": 0.09842319786548615, "step": 4529 }, { "epoch": 0.74, "learning_rate": 9.215544350155422e-07, "logits/chosen": -1.0129114389419556, "logits/rejected": -1.0104618072509766, "logps/chosen": -153.78125, "logps/rejected": -67.6198959350586, "loss": 1.9017, "rewards/accuracies": 1.0, "rewards/chosen": 3.4355087280273438, "rewards/margins": 0.8742926120758057, "rewards/rejected": 2.561216115951538, "step": 4530 }, { "epoch": 0.74, "learning_rate": 9.214837473795169e-07, "logits/chosen": -0.6753084659576416, "logits/rejected": -0.6762819290161133, "logps/chosen": -62.97864532470703, "logps/rejected": -5.753457546234131, "loss": 0.5652, "rewards/accuracies": 0.0, "rewards/chosen": 0.20971070230007172, "rewards/margins": -0.6405776739120483, "rewards/rejected": 0.8502883911132812, "step": 4531 }, { "epoch": 0.74, "learning_rate": 9.214130306229528e-07, "logits/chosen": -0.5783911943435669, "logits/rejected": -0.5840852856636047, "logps/chosen": -2.782133102416992, "logps/rejected": -1.135576605796814, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.3119398057460785, "rewards/margins": -0.04692840576171875, "rewards/rejected": 0.35886821150779724, "step": 4532 }, { "epoch": 0.74, "learning_rate": 9.213422847507357e-07, "logits/chosen": -0.6079601049423218, "logits/rejected": -0.6173952221870422, "logps/chosen": -82.70016479492188, "logps/rejected": -199.87078857421875, "loss": 0.7313, "rewards/accuracies": 1.0, "rewards/chosen": 1.4958893060684204, "rewards/margins": 1.0828521251678467, "rewards/rejected": 0.41303712129592896, "step": 4533 }, { "epoch": 0.74, "learning_rate": 9.212715097677537e-07, "logits/chosen": -0.772222638130188, "logits/rejected": -0.7397837042808533, "logps/chosen": -34.572059631347656, "logps/rejected": -80.16303253173828, "loss": 0.8645, "rewards/accuracies": 0.0, "rewards/chosen": 1.7573235034942627, "rewards/margins": -0.6081664562225342, "rewards/rejected": 2.365489959716797, "step": 4534 }, { "epoch": 0.74, "learning_rate": 9.212007056788963e-07, "logits/chosen": -0.46649351716041565, "logits/rejected": -0.4405899941921234, "logps/chosen": -57.279170989990234, "logps/rejected": -68.40101623535156, "loss": 0.7513, "rewards/accuracies": 0.0, "rewards/chosen": 2.4745090007781982, "rewards/margins": -0.4962582588195801, "rewards/rejected": 2.9707672595977783, "step": 4535 }, { "epoch": 0.74, "learning_rate": 9.211298724890557e-07, "logits/chosen": -0.6874807476997375, "logits/rejected": -0.5428788661956787, "logps/chosen": -92.45368194580078, "logps/rejected": -50.99631118774414, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 3.8743646144866943, "rewards/margins": 3.197205066680908, "rewards/rejected": 0.6771594882011414, "step": 4536 }, { "epoch": 0.74, "learning_rate": 9.210590102031255e-07, "logits/chosen": -0.4468912184238434, "logits/rejected": -0.3911302387714386, "logps/chosen": -47.686378479003906, "logps/rejected": -41.57640838623047, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 2.067190647125244, "rewards/margins": 1.5664681196212769, "rewards/rejected": 0.5007225275039673, "step": 4537 }, { "epoch": 0.74, "learning_rate": 9.209881188260019e-07, "logits/chosen": -0.6413002014160156, "logits/rejected": -0.5244241952896118, "logps/chosen": -71.65718841552734, "logps/rejected": -41.575706481933594, "loss": 0.4771, "rewards/accuracies": 0.0, "rewards/chosen": 2.1350929737091064, "rewards/margins": -0.417330265045166, "rewards/rejected": 2.5524232387542725, "step": 4538 }, { "epoch": 0.74, "learning_rate": 9.209171983625828e-07, "logits/chosen": -0.5305606126785278, "logits/rejected": -0.44225168228149414, "logps/chosen": -63.384010314941406, "logps/rejected": -71.75521850585938, "loss": 0.7956, "rewards/accuracies": 1.0, "rewards/chosen": 2.7413246631622314, "rewards/margins": 0.20094060897827148, "rewards/rejected": 2.54038405418396, "step": 4539 }, { "epoch": 0.74, "learning_rate": 9.208462488177678e-07, "logits/chosen": -0.6111338138580322, "logits/rejected": -0.6299529075622559, "logps/chosen": -106.940673828125, "logps/rejected": -91.973388671875, "loss": 0.7612, "rewards/accuracies": 0.0, "rewards/chosen": 0.6470527648925781, "rewards/margins": -0.906781792640686, "rewards/rejected": 1.5538345575332642, "step": 4540 }, { "epoch": 0.74, "learning_rate": 9.207752701964592e-07, "logits/chosen": -0.49279099702835083, "logits/rejected": -0.2802596092224121, "logps/chosen": -104.59620666503906, "logps/rejected": -117.93832397460938, "loss": 1.1618, "rewards/accuracies": 0.0, "rewards/chosen": 2.447627305984497, "rewards/margins": -1.415025234222412, "rewards/rejected": 3.862652540206909, "step": 4541 }, { "epoch": 0.74, "learning_rate": 9.20704262503561e-07, "logits/chosen": -0.601993203163147, "logits/rejected": -0.5779768228530884, "logps/chosen": -53.48133850097656, "logps/rejected": -77.70877838134766, "loss": 0.2089, "rewards/accuracies": 1.0, "rewards/chosen": 1.6258690357208252, "rewards/margins": 0.7272049188613892, "rewards/rejected": 0.898664116859436, "step": 4542 }, { "epoch": 0.74, "learning_rate": 9.206332257439788e-07, "logits/chosen": -0.7608296871185303, "logits/rejected": -0.7004323601722717, "logps/chosen": -70.09152221679688, "logps/rejected": -84.429931640625, "loss": 0.9147, "rewards/accuracies": 0.0, "rewards/chosen": 1.271131157875061, "rewards/margins": -0.964098334312439, "rewards/rejected": 2.2352294921875, "step": 4543 }, { "epoch": 0.74, "learning_rate": 9.205621599226209e-07, "logits/chosen": -0.1567794680595398, "logits/rejected": -0.04470246285200119, "logps/chosen": -40.37038803100586, "logps/rejected": -71.76725769042969, "loss": 0.9998, "rewards/accuracies": 0.0, "rewards/chosen": 0.7838504910469055, "rewards/margins": -0.7721050381660461, "rewards/rejected": 1.5559555292129517, "step": 4544 }, { "epoch": 0.74, "learning_rate": 9.204910650443971e-07, "logits/chosen": -0.8847072720527649, "logits/rejected": -0.7830358743667603, "logps/chosen": -103.96833038330078, "logps/rejected": -92.02093505859375, "loss": 0.5171, "rewards/accuracies": 0.0, "rewards/chosen": 1.2692375183105469, "rewards/margins": -0.38429033756256104, "rewards/rejected": 1.653527855873108, "step": 4545 }, { "epoch": 0.74, "learning_rate": 9.204199411142195e-07, "logits/chosen": -0.5208456516265869, "logits/rejected": -0.5419979095458984, "logps/chosen": -9.585016250610352, "logps/rejected": -2.42437481880188, "loss": 0.7563, "rewards/accuracies": 0.0, "rewards/chosen": 0.17469798028469086, "rewards/margins": -0.6173315644264221, "rewards/rejected": 0.7920295596122742, "step": 4546 }, { "epoch": 0.74, "learning_rate": 9.203487881370019e-07, "logits/chosen": -0.787684440612793, "logits/rejected": -0.6925686001777649, "logps/chosen": -122.77342224121094, "logps/rejected": -80.00019836425781, "loss": 0.6106, "rewards/accuracies": 0.0, "rewards/chosen": 1.0942124128341675, "rewards/margins": -0.842509388923645, "rewards/rejected": 1.9367218017578125, "step": 4547 }, { "epoch": 0.74, "learning_rate": 9.202776061176605e-07, "logits/chosen": -0.8637714982032776, "logits/rejected": -0.8735727667808533, "logps/chosen": -213.5486602783203, "logps/rejected": -114.28636169433594, "loss": 0.6489, "rewards/accuracies": 1.0, "rewards/chosen": 4.362913608551025, "rewards/margins": 2.9096083641052246, "rewards/rejected": 1.4533051252365112, "step": 4548 }, { "epoch": 0.74, "learning_rate": 9.202063950611132e-07, "logits/chosen": -0.8332533240318298, "logits/rejected": -0.8035038113594055, "logps/chosen": -279.1202392578125, "logps/rejected": -147.17425537109375, "loss": 0.7675, "rewards/accuracies": 0.0, "rewards/chosen": 4.77717924118042, "rewards/margins": -1.2636656761169434, "rewards/rejected": 6.040844917297363, "step": 4549 }, { "epoch": 0.74, "learning_rate": 9.2013515497228e-07, "logits/chosen": -0.7917097806930542, "logits/rejected": -0.7135879993438721, "logps/chosen": -67.33772277832031, "logps/rejected": -58.163169860839844, "loss": 1.0909, "rewards/accuracies": 1.0, "rewards/chosen": 2.339520215988159, "rewards/margins": 1.4602515697479248, "rewards/rejected": 0.8792686462402344, "step": 4550 }, { "epoch": 0.74, "learning_rate": 9.200638858560829e-07, "logits/chosen": -0.7994915843009949, "logits/rejected": -0.7866077423095703, "logps/chosen": -134.99362182617188, "logps/rejected": -83.40037536621094, "loss": 0.1502, "rewards/accuracies": 1.0, "rewards/chosen": 3.769305467605591, "rewards/margins": 1.519073486328125, "rewards/rejected": 2.250231981277466, "step": 4551 }, { "epoch": 0.74, "learning_rate": 9.19992587717446e-07, "logits/chosen": -0.3661856949329376, "logits/rejected": -0.46713364124298096, "logps/chosen": -291.5992126464844, "logps/rejected": -115.4432144165039, "loss": 0.2518, "rewards/accuracies": 1.0, "rewards/chosen": 3.9539825916290283, "rewards/margins": 0.4626288414001465, "rewards/rejected": 3.491353750228882, "step": 4552 }, { "epoch": 0.74, "learning_rate": 9.199212605612954e-07, "logits/chosen": -0.3926340639591217, "logits/rejected": -0.33982083201408386, "logps/chosen": -51.887657165527344, "logps/rejected": -58.14225769042969, "loss": 0.7076, "rewards/accuracies": 0.0, "rewards/chosen": 2.350267171859741, "rewards/margins": -1.1098990440368652, "rewards/rejected": 3.4601662158966064, "step": 4553 }, { "epoch": 0.74, "learning_rate": 9.198499043925589e-07, "logits/chosen": -0.22620126605033875, "logits/rejected": -0.22620126605033875, "logps/chosen": -21.61210060119629, "logps/rejected": -21.61210060119629, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.17583008110523224, "rewards/margins": 0.0, "rewards/rejected": 0.17583008110523224, "step": 4554 }, { "epoch": 0.74, "learning_rate": 9.197785192161668e-07, "logits/chosen": -0.3263635039329529, "logits/rejected": -0.3263635039329529, "logps/chosen": -28.24390983581543, "logps/rejected": -28.24390983581543, "loss": 0.4157, "rewards/accuracies": 0.0, "rewards/chosen": 0.09249477833509445, "rewards/margins": 0.0, "rewards/rejected": 0.09249477833509445, "step": 4555 }, { "epoch": 0.74, "learning_rate": 9.197071050370508e-07, "logits/chosen": -0.7337676286697388, "logits/rejected": -0.7178928256034851, "logps/chosen": -81.96234893798828, "logps/rejected": -61.58473205566406, "loss": 1.7223, "rewards/accuracies": 0.0, "rewards/chosen": 1.051533579826355, "rewards/margins": -0.5404616594314575, "rewards/rejected": 1.5919952392578125, "step": 4556 }, { "epoch": 0.74, "learning_rate": 9.196356618601453e-07, "logits/chosen": -1.0251777172088623, "logits/rejected": -1.0521974563598633, "logps/chosen": -132.26220703125, "logps/rejected": -135.50897216796875, "loss": 1.6237, "rewards/accuracies": 0.0, "rewards/chosen": 2.194000244140625, "rewards/margins": -3.0902605056762695, "rewards/rejected": 5.2842607498168945, "step": 4557 }, { "epoch": 0.74, "learning_rate": 9.195641896903862e-07, "logits/chosen": -0.6489868760108948, "logits/rejected": -0.5864080190658569, "logps/chosen": -81.04801940917969, "logps/rejected": -102.04083251953125, "loss": 1.9323, "rewards/accuracies": 0.0, "rewards/chosen": 1.3752365112304688, "rewards/margins": -2.093815565109253, "rewards/rejected": 3.4690520763397217, "step": 4558 }, { "epoch": 0.74, "learning_rate": 9.194926885327114e-07, "logits/chosen": -0.15868833661079407, "logits/rejected": -0.1587502807378769, "logps/chosen": -1.4715715646743774, "logps/rejected": -2.164283037185669, "loss": 0.9653, "rewards/accuracies": 0.0, "rewards/chosen": 0.20428991317749023, "rewards/margins": -0.05267536640167236, "rewards/rejected": 0.2569652795791626, "step": 4559 }, { "epoch": 0.74, "learning_rate": 9.194211583920613e-07, "logits/chosen": -0.6769906878471375, "logits/rejected": -0.601049542427063, "logps/chosen": -77.32902526855469, "logps/rejected": -81.60772705078125, "loss": 0.9937, "rewards/accuracies": 0.0, "rewards/chosen": 1.975738525390625, "rewards/margins": -1.7922630310058594, "rewards/rejected": 3.7680015563964844, "step": 4560 }, { "epoch": 0.74, "learning_rate": 9.193495992733777e-07, "logits/chosen": -0.5339527130126953, "logits/rejected": -0.47798478603363037, "logps/chosen": -86.81248474121094, "logps/rejected": -107.18997192382812, "loss": 0.6372, "rewards/accuracies": 0.0, "rewards/chosen": 4.966723918914795, "rewards/margins": -0.7815823554992676, "rewards/rejected": 5.7483062744140625, "step": 4561 }, { "epoch": 0.74, "learning_rate": 9.192780111816046e-07, "logits/chosen": -0.9068474769592285, "logits/rejected": -0.9624819755554199, "logps/chosen": -215.29220581054688, "logps/rejected": -121.61315155029297, "loss": 0.4954, "rewards/accuracies": 1.0, "rewards/chosen": 3.513568162918091, "rewards/margins": 0.053894758224487305, "rewards/rejected": 3.4596734046936035, "step": 4562 }, { "epoch": 0.74, "learning_rate": 9.192063941216883e-07, "logits/chosen": -0.7932250499725342, "logits/rejected": -0.7822163701057434, "logps/chosen": -27.591394424438477, "logps/rejected": -1.8125264644622803, "loss": 0.8899, "rewards/accuracies": 0.0, "rewards/chosen": 0.41907501220703125, "rewards/margins": -0.19139426946640015, "rewards/rejected": 0.6104692816734314, "step": 4563 }, { "epoch": 0.74, "learning_rate": 9.191347480985767e-07, "logits/chosen": -0.09059975296258926, "logits/rejected": -0.12300443649291992, "logps/chosen": -107.72599792480469, "logps/rejected": -84.20652770996094, "loss": 1.1401, "rewards/accuracies": 0.0, "rewards/chosen": 0.8955482840538025, "rewards/margins": -1.5721275806427002, "rewards/rejected": 2.4676759243011475, "step": 4564 }, { "epoch": 0.74, "learning_rate": 9.190630731172198e-07, "logits/chosen": -0.609084963798523, "logits/rejected": -0.5578758120536804, "logps/chosen": -48.6787223815918, "logps/rejected": -69.75833129882812, "loss": 0.376, "rewards/accuracies": 0.0, "rewards/chosen": 2.088169574737549, "rewards/margins": -0.0005657672882080078, "rewards/rejected": 2.088735342025757, "step": 4565 }, { "epoch": 0.74, "learning_rate": 9.189913691825699e-07, "logits/chosen": -0.7831341624259949, "logits/rejected": -0.7203761339187622, "logps/chosen": -121.46617889404297, "logps/rejected": -15.197848320007324, "loss": 0.1802, "rewards/accuracies": 1.0, "rewards/chosen": 1.2869209051132202, "rewards/margins": 1.000417947769165, "rewards/rejected": 0.2865029275417328, "step": 4566 }, { "epoch": 0.74, "learning_rate": 9.189196362995808e-07, "logits/chosen": -0.5685099363327026, "logits/rejected": -0.420012503862381, "logps/chosen": -190.4175567626953, "logps/rejected": -142.2642822265625, "loss": 0.3076, "rewards/accuracies": 1.0, "rewards/chosen": 6.191398620605469, "rewards/margins": 0.1692976951599121, "rewards/rejected": 6.022100925445557, "step": 4567 }, { "epoch": 0.74, "learning_rate": 9.188478744732088e-07, "logits/chosen": -0.7274424433708191, "logits/rejected": -0.6596813201904297, "logps/chosen": -77.89556884765625, "logps/rejected": -34.42573165893555, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 1.4419678449630737, "rewards/margins": 1.0010654926300049, "rewards/rejected": 0.44090232253074646, "step": 4568 }, { "epoch": 0.74, "learning_rate": 9.187760837084117e-07, "logits/chosen": -0.47383710741996765, "logits/rejected": -0.3969898521900177, "logps/chosen": -32.49571228027344, "logps/rejected": -28.54908561706543, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 2.2884862422943115, "rewards/margins": 1.9495110511779785, "rewards/rejected": 0.3389751613140106, "step": 4569 }, { "epoch": 0.74, "learning_rate": 9.187042640101499e-07, "logits/chosen": -0.4019680619239807, "logits/rejected": -0.4090289771556854, "logps/chosen": -4.380350112915039, "logps/rejected": -2.06722092628479, "loss": 1.3645, "rewards/accuracies": 0.0, "rewards/chosen": 0.1293410360813141, "rewards/margins": -0.22523567080497742, "rewards/rejected": 0.3545767068862915, "step": 4570 }, { "epoch": 0.74, "learning_rate": 9.186324153833853e-07, "logits/chosen": -0.5586015582084656, "logits/rejected": -0.6351633667945862, "logps/chosen": -59.02341842651367, "logps/rejected": -100.70762634277344, "loss": 1.1325, "rewards/accuracies": 0.0, "rewards/chosen": 2.223313570022583, "rewards/margins": -1.695138931274414, "rewards/rejected": 3.918452501296997, "step": 4571 }, { "epoch": 0.74, "learning_rate": 9.185605378330818e-07, "logits/chosen": -0.44253721833229065, "logits/rejected": -0.40089523792266846, "logps/chosen": -40.30575180053711, "logps/rejected": -50.60948944091797, "loss": 0.3563, "rewards/accuracies": 1.0, "rewards/chosen": 2.1585140228271484, "rewards/margins": 0.2782313823699951, "rewards/rejected": 1.8802826404571533, "step": 4572 }, { "epoch": 0.74, "learning_rate": 9.184886313642056e-07, "logits/chosen": -0.6611416935920715, "logits/rejected": -0.7188169956207275, "logps/chosen": -56.07929992675781, "logps/rejected": -126.60533142089844, "loss": 0.8168, "rewards/accuracies": 0.0, "rewards/chosen": 2.067091464996338, "rewards/margins": -0.9874632358551025, "rewards/rejected": 3.0545547008514404, "step": 4573 }, { "epoch": 0.74, "learning_rate": 9.184166959817247e-07, "logits/chosen": -0.7882656455039978, "logits/rejected": -0.7723920345306396, "logps/chosen": -38.82035446166992, "logps/rejected": -55.42472839355469, "loss": 0.5341, "rewards/accuracies": 0.0, "rewards/chosen": 1.400125503540039, "rewards/margins": -0.5889675617218018, "rewards/rejected": 1.9890930652618408, "step": 4574 }, { "epoch": 0.74, "learning_rate": 9.183447316906093e-07, "logits/chosen": -0.34158262610435486, "logits/rejected": -0.37013205885887146, "logps/chosen": -36.92623519897461, "logps/rejected": -48.07677459716797, "loss": 0.5528, "rewards/accuracies": 0.0, "rewards/chosen": 1.2778819799423218, "rewards/margins": -0.31976282596588135, "rewards/rejected": 1.5976448059082031, "step": 4575 }, { "epoch": 0.74, "learning_rate": 9.182727384958313e-07, "logits/chosen": -0.8627992272377014, "logits/rejected": -0.8498122096061707, "logps/chosen": -50.03535461425781, "logps/rejected": -39.730106353759766, "loss": 0.4011, "rewards/accuracies": 1.0, "rewards/chosen": 3.2989234924316406, "rewards/margins": 1.423712134361267, "rewards/rejected": 1.8752113580703735, "step": 4576 }, { "epoch": 0.74, "learning_rate": 9.182007164023649e-07, "logits/chosen": -0.9230340719223022, "logits/rejected": -0.9022557139396667, "logps/chosen": -50.39851379394531, "logps/rejected": -61.77598190307617, "loss": 0.8705, "rewards/accuracies": 1.0, "rewards/chosen": 2.1751809120178223, "rewards/margins": 0.27263081073760986, "rewards/rejected": 1.9025501012802124, "step": 4577 }, { "epoch": 0.74, "learning_rate": 9.181286654151859e-07, "logits/chosen": -0.6577504873275757, "logits/rejected": -0.622046172618866, "logps/chosen": -62.16462707519531, "logps/rejected": -145.2447967529297, "loss": 0.8537, "rewards/accuracies": 0.0, "rewards/chosen": -0.07753143459558487, "rewards/margins": -0.7706878781318665, "rewards/rejected": 0.6931564211845398, "step": 4578 }, { "epoch": 0.74, "learning_rate": 9.180565855392725e-07, "logits/chosen": -0.2704541087150574, "logits/rejected": -0.3014567196369171, "logps/chosen": -47.24345779418945, "logps/rejected": -90.36754608154297, "loss": 0.1979, "rewards/accuracies": 1.0, "rewards/chosen": 0.6598522067070007, "rewards/margins": 0.8451137542724609, "rewards/rejected": -0.185261532664299, "step": 4579 }, { "epoch": 0.74, "learning_rate": 9.179844767796047e-07, "logits/chosen": -0.5733179450035095, "logits/rejected": -0.5733179450035095, "logps/chosen": -20.284791946411133, "logps/rejected": -20.284791946411133, "loss": 0.5768, "rewards/accuracies": 0.0, "rewards/chosen": 0.024098968133330345, "rewards/margins": 0.0, "rewards/rejected": 0.024098968133330345, "step": 4580 }, { "epoch": 0.74, "learning_rate": 9.179123391411647e-07, "logits/chosen": -0.5520451068878174, "logits/rejected": -0.5506324768066406, "logps/chosen": -2.468719720840454, "logps/rejected": -17.23173713684082, "loss": 0.4497, "rewards/accuracies": 1.0, "rewards/chosen": 0.39111918210983276, "rewards/margins": 0.03845176100730896, "rewards/rejected": 0.3526674211025238, "step": 4581 }, { "epoch": 0.74, "learning_rate": 9.178401726289364e-07, "logits/chosen": -0.8023631572723389, "logits/rejected": -0.7384888529777527, "logps/chosen": -67.26602935791016, "logps/rejected": -110.12688446044922, "loss": 1.0028, "rewards/accuracies": 0.0, "rewards/chosen": 2.7115700244903564, "rewards/margins": -1.5998289585113525, "rewards/rejected": 4.311398983001709, "step": 4582 }, { "epoch": 0.74, "learning_rate": 9.177679772479057e-07, "logits/chosen": -0.43289005756378174, "logits/rejected": -0.43289005756378174, "logps/chosen": -36.130313873291016, "logps/rejected": -36.130313873291016, "loss": 0.4952, "rewards/accuracies": 0.0, "rewards/chosen": 0.19855843484401703, "rewards/margins": 0.0, "rewards/rejected": 0.19855843484401703, "step": 4583 }, { "epoch": 0.74, "learning_rate": 9.176957530030608e-07, "logits/chosen": -0.8190450072288513, "logits/rejected": -0.8021039962768555, "logps/chosen": -154.539306640625, "logps/rejected": -188.025146484375, "loss": 0.7736, "rewards/accuracies": 0.0, "rewards/chosen": 4.0131683349609375, "rewards/margins": -1.2900238037109375, "rewards/rejected": 5.303192138671875, "step": 4584 }, { "epoch": 0.74, "learning_rate": 9.176234998993917e-07, "logits/chosen": -0.618904709815979, "logits/rejected": -0.7418584227561951, "logps/chosen": -77.83409881591797, "logps/rejected": -129.4665985107422, "loss": 1.3311, "rewards/accuracies": 0.0, "rewards/chosen": 2.293083906173706, "rewards/margins": -1.5564630031585693, "rewards/rejected": 3.8495469093322754, "step": 4585 }, { "epoch": 0.74, "learning_rate": 9.175512179418901e-07, "logits/chosen": -0.3318506181240082, "logits/rejected": -0.3318506181240082, "logps/chosen": -32.607601165771484, "logps/rejected": -32.607601165771484, "loss": 1.2826, "rewards/accuracies": 0.0, "rewards/chosen": 1.4910560846328735, "rewards/margins": 0.0, "rewards/rejected": 1.4910560846328735, "step": 4586 }, { "epoch": 0.74, "learning_rate": 9.174789071355504e-07, "logits/chosen": -0.019597403705120087, "logits/rejected": 0.055116813629865646, "logps/chosen": -47.124839782714844, "logps/rejected": -70.73101806640625, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 1.4364128112792969, "rewards/margins": 0.1923224925994873, "rewards/rejected": 1.2440903186798096, "step": 4587 }, { "epoch": 0.74, "learning_rate": 9.174065674853686e-07, "logits/chosen": -0.34164267778396606, "logits/rejected": -0.31298017501831055, "logps/chosen": -62.03018569946289, "logps/rejected": -85.80441284179688, "loss": 0.7712, "rewards/accuracies": 0.0, "rewards/chosen": 0.7939014434814453, "rewards/margins": -1.0112789869308472, "rewards/rejected": 1.8051804304122925, "step": 4588 }, { "epoch": 0.74, "learning_rate": 9.173341989963424e-07, "logits/chosen": -0.5253044366836548, "logits/rejected": -0.4977494776248932, "logps/chosen": -59.894264221191406, "logps/rejected": -17.584367752075195, "loss": 0.6056, "rewards/accuracies": 0.0, "rewards/chosen": 0.5328613519668579, "rewards/margins": -0.2318304181098938, "rewards/rejected": 0.7646917700767517, "step": 4589 }, { "epoch": 0.75, "learning_rate": 9.172618016734718e-07, "logits/chosen": -0.8166757225990295, "logits/rejected": -0.8088793754577637, "logps/chosen": -187.8516845703125, "logps/rejected": -173.6844940185547, "loss": 0.2647, "rewards/accuracies": 1.0, "rewards/chosen": 4.523303508758545, "rewards/margins": 0.5548815727233887, "rewards/rejected": 3.9684219360351562, "step": 4590 }, { "epoch": 0.75, "learning_rate": 9.171893755217589e-07, "logits/chosen": -0.5165128707885742, "logits/rejected": -0.41865670680999756, "logps/chosen": -71.69917297363281, "logps/rejected": -15.468013763427734, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.9196075797080994, "rewards/margins": 0.0334320068359375, "rewards/rejected": 0.8861755728721619, "step": 4591 }, { "epoch": 0.75, "learning_rate": 9.171169205462077e-07, "logits/chosen": -0.5900247097015381, "logits/rejected": -0.5297209620475769, "logps/chosen": -34.417293548583984, "logps/rejected": -60.46429443359375, "loss": 1.1547, "rewards/accuracies": 0.0, "rewards/chosen": 1.3278019428253174, "rewards/margins": -0.19269287586212158, "rewards/rejected": 1.520494818687439, "step": 4592 }, { "epoch": 0.75, "learning_rate": 9.170444367518241e-07, "logits/chosen": -0.6309233903884888, "logits/rejected": -0.5308614373207092, "logps/chosen": -95.54704284667969, "logps/rejected": -30.41932487487793, "loss": 0.2009, "rewards/accuracies": 1.0, "rewards/chosen": 2.314593553543091, "rewards/margins": 1.7761626243591309, "rewards/rejected": 0.5384309887886047, "step": 4593 }, { "epoch": 0.75, "learning_rate": 9.16971924143616e-07, "logits/chosen": -0.45934826135635376, "logits/rejected": -0.37632638216018677, "logps/chosen": -100.76673889160156, "logps/rejected": -81.85175323486328, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.8619041442871094, "rewards/margins": 1.3417160511016846, "rewards/rejected": 0.5201881527900696, "step": 4594 }, { "epoch": 0.75, "learning_rate": 9.168993827265934e-07, "logits/chosen": -0.6644569635391235, "logits/rejected": -0.4976005256175995, "logps/chosen": -80.00875854492188, "logps/rejected": -87.61711883544922, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 4.182478427886963, "rewards/margins": 2.061399221420288, "rewards/rejected": 2.121079206466675, "step": 4595 }, { "epoch": 0.75, "learning_rate": 9.168268125057682e-07, "logits/chosen": -0.5046507716178894, "logits/rejected": -0.5046507716178894, "logps/chosen": -113.21991729736328, "logps/rejected": -113.21991729736328, "loss": 1.2089, "rewards/accuracies": 0.0, "rewards/chosen": 0.35971298813819885, "rewards/margins": 0.0, "rewards/rejected": 0.35971298813819885, "step": 4596 }, { "epoch": 0.75, "learning_rate": 9.167542134861541e-07, "logits/chosen": -0.629980742931366, "logits/rejected": -0.5901793241500854, "logps/chosen": -55.14466857910156, "logps/rejected": -74.36221313476562, "loss": 0.3909, "rewards/accuracies": 0.0, "rewards/chosen": 0.9702041745185852, "rewards/margins": -0.06504672765731812, "rewards/rejected": 1.0352509021759033, "step": 4597 }, { "epoch": 0.75, "learning_rate": 9.166815856727674e-07, "logits/chosen": -0.4068721532821655, "logits/rejected": -0.4233555495738983, "logps/chosen": -27.443527221679688, "logps/rejected": -67.97329711914062, "loss": 0.5587, "rewards/accuracies": 0.0, "rewards/chosen": 0.3120701014995575, "rewards/margins": -0.18915745615959167, "rewards/rejected": 0.5012275576591492, "step": 4598 }, { "epoch": 0.75, "learning_rate": 9.166089290706259e-07, "logits/chosen": -0.2008666694164276, "logits/rejected": -0.2589402496814728, "logps/chosen": -66.5291748046875, "logps/rejected": -65.96577453613281, "loss": 0.7154, "rewards/accuracies": 0.0, "rewards/chosen": 2.697988271713257, "rewards/margins": -0.17529821395874023, "rewards/rejected": 2.873286485671997, "step": 4599 }, { "epoch": 0.75, "learning_rate": 9.165362436847493e-07, "logits/chosen": -0.5062891244888306, "logits/rejected": -0.472825825214386, "logps/chosen": -47.909942626953125, "logps/rejected": -127.41256713867188, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 1.6453148126602173, "rewards/margins": 1.3364677429199219, "rewards/rejected": 0.308847039937973, "step": 4600 }, { "epoch": 0.75, "learning_rate": 9.164635295201595e-07, "logits/chosen": -0.8592768907546997, "logits/rejected": -0.7684589624404907, "logps/chosen": -84.35798645019531, "logps/rejected": -59.4285774230957, "loss": 0.4342, "rewards/accuracies": 1.0, "rewards/chosen": 4.242189884185791, "rewards/margins": 1.9553508758544922, "rewards/rejected": 2.286839008331299, "step": 4601 }, { "epoch": 0.75, "learning_rate": 9.163907865818806e-07, "logits/chosen": -0.5591405630111694, "logits/rejected": -0.7015565633773804, "logps/chosen": -79.81781005859375, "logps/rejected": -129.6222686767578, "loss": 1.8104, "rewards/accuracies": 0.0, "rewards/chosen": 1.498094916343689, "rewards/margins": -1.937242865562439, "rewards/rejected": 3.435337781906128, "step": 4602 }, { "epoch": 0.75, "learning_rate": 9.16318014874938e-07, "logits/chosen": -0.7063292264938354, "logits/rejected": -0.7619547247886658, "logps/chosen": -89.7723617553711, "logps/rejected": -104.73551940917969, "loss": 0.6344, "rewards/accuracies": 0.0, "rewards/chosen": 2.449347734451294, "rewards/margins": -0.9245491027832031, "rewards/rejected": 3.373896837234497, "step": 4603 }, { "epoch": 0.75, "learning_rate": 9.1624521440436e-07, "logits/chosen": -0.7284233570098877, "logits/rejected": -0.6734281182289124, "logps/chosen": -71.25102996826172, "logps/rejected": -79.59756469726562, "loss": 0.967, "rewards/accuracies": 1.0, "rewards/chosen": 1.868299126625061, "rewards/margins": 0.011594414710998535, "rewards/rejected": 1.8567047119140625, "step": 4604 }, { "epoch": 0.75, "learning_rate": 9.161723851751762e-07, "logits/chosen": -0.02667653188109398, "logits/rejected": -0.045770276337862015, "logps/chosen": -11.427199363708496, "logps/rejected": -55.29893112182617, "loss": 0.3693, "rewards/accuracies": 0.0, "rewards/chosen": 0.18671445548534393, "rewards/margins": -0.019039630889892578, "rewards/rejected": 0.2057540863752365, "step": 4605 }, { "epoch": 0.75, "learning_rate": 9.160995271924183e-07, "logits/chosen": -0.7521967887878418, "logits/rejected": -0.696565568447113, "logps/chosen": -71.99996948242188, "logps/rejected": -67.46073913574219, "loss": 0.783, "rewards/accuracies": 0.0, "rewards/chosen": 0.7404029965400696, "rewards/margins": -0.37124401330947876, "rewards/rejected": 1.1116470098495483, "step": 4606 }, { "epoch": 0.75, "learning_rate": 9.160266404611205e-07, "logits/chosen": -0.7682543992996216, "logits/rejected": -0.7377541661262512, "logps/chosen": -44.27634048461914, "logps/rejected": -51.437416076660156, "loss": 0.4611, "rewards/accuracies": 1.0, "rewards/chosen": 2.097282886505127, "rewards/margins": 0.8686901330947876, "rewards/rejected": 1.2285927534103394, "step": 4607 }, { "epoch": 0.75, "learning_rate": 9.15953724986318e-07, "logits/chosen": -0.870204508304596, "logits/rejected": -0.7443106770515442, "logps/chosen": -89.12462615966797, "logps/rejected": -54.58268356323242, "loss": 0.1753, "rewards/accuracies": 1.0, "rewards/chosen": 3.791670322418213, "rewards/margins": 2.0197672843933105, "rewards/rejected": 1.7719029188156128, "step": 4608 }, { "epoch": 0.75, "learning_rate": 9.158807807730492e-07, "logits/chosen": -0.44350263476371765, "logits/rejected": -0.3582923114299774, "logps/chosen": -125.16764831542969, "logps/rejected": -71.62210083007812, "loss": 0.5182, "rewards/accuracies": 0.0, "rewards/chosen": 2.570817708969116, "rewards/margins": -0.2830932140350342, "rewards/rejected": 2.8539109230041504, "step": 4609 }, { "epoch": 0.75, "learning_rate": 9.158078078263535e-07, "logits/chosen": -0.6584526896476746, "logits/rejected": -0.6985464692115784, "logps/chosen": -59.10251235961914, "logps/rejected": -45.368072509765625, "loss": 1.1225, "rewards/accuracies": 0.0, "rewards/chosen": 0.03344307094812393, "rewards/margins": -1.1530849933624268, "rewards/rejected": 1.1865280866622925, "step": 4610 }, { "epoch": 0.75, "learning_rate": 9.157348061512726e-07, "logits/chosen": -0.6578210592269897, "logits/rejected": -0.6575688719749451, "logps/chosen": -41.95552062988281, "logps/rejected": -19.154010772705078, "loss": 0.3752, "rewards/accuracies": 0.0, "rewards/chosen": 0.35671767592430115, "rewards/margins": -0.08982354402542114, "rewards/rejected": 0.4465412199497223, "step": 4611 }, { "epoch": 0.75, "learning_rate": 9.156617757528503e-07, "logits/chosen": -0.8888647556304932, "logits/rejected": -0.9210639595985413, "logps/chosen": -83.73844909667969, "logps/rejected": -74.841552734375, "loss": 0.9766, "rewards/accuracies": 0.0, "rewards/chosen": 0.04165191575884819, "rewards/margins": -1.6329742670059204, "rewards/rejected": 1.6746262311935425, "step": 4612 }, { "epoch": 0.75, "learning_rate": 9.155887166361324e-07, "logits/chosen": -0.5986691117286682, "logits/rejected": -0.6242672801017761, "logps/chosen": -86.42431640625, "logps/rejected": -78.56752014160156, "loss": 0.3829, "rewards/accuracies": 0.0, "rewards/chosen": 1.5083328485488892, "rewards/margins": -0.0516204833984375, "rewards/rejected": 1.5599533319473267, "step": 4613 }, { "epoch": 0.75, "learning_rate": 9.155156288061665e-07, "logits/chosen": -0.9871943593025208, "logits/rejected": -0.7196252346038818, "logps/chosen": -94.02607727050781, "logps/rejected": -60.927650451660156, "loss": 1.2526, "rewards/accuracies": 1.0, "rewards/chosen": 3.669956922531128, "rewards/margins": 2.157945156097412, "rewards/rejected": 1.5120117664337158, "step": 4614 }, { "epoch": 0.75, "learning_rate": 9.154425122680023e-07, "logits/chosen": -0.5823734402656555, "logits/rejected": -0.4755052328109741, "logps/chosen": -120.96360778808594, "logps/rejected": -34.41047286987305, "loss": 0.2313, "rewards/accuracies": 1.0, "rewards/chosen": 3.8261094093322754, "rewards/margins": 1.574821949005127, "rewards/rejected": 2.2512874603271484, "step": 4615 }, { "epoch": 0.75, "learning_rate": 9.153693670266913e-07, "logits/chosen": -0.7213962078094482, "logits/rejected": -0.6517710089683533, "logps/chosen": -40.61687469482422, "logps/rejected": -97.69876098632812, "loss": 0.8086, "rewards/accuracies": 1.0, "rewards/chosen": 1.7579479217529297, "rewards/margins": 0.5977870225906372, "rewards/rejected": 1.1601608991622925, "step": 4616 }, { "epoch": 0.75, "learning_rate": 9.152961930872876e-07, "logits/chosen": -0.4801209568977356, "logits/rejected": -0.48755785822868347, "logps/chosen": -57.845428466796875, "logps/rejected": -97.12373352050781, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 1.8751229047775269, "rewards/margins": 0.24763953685760498, "rewards/rejected": 1.6274833679199219, "step": 4617 }, { "epoch": 0.75, "learning_rate": 9.152229904548463e-07, "logits/chosen": -0.28383252024650574, "logits/rejected": -0.244442418217659, "logps/chosen": -36.97793960571289, "logps/rejected": -19.889713287353516, "loss": 0.8317, "rewards/accuracies": 1.0, "rewards/chosen": 1.4880104064941406, "rewards/margins": 0.11110568046569824, "rewards/rejected": 1.3769047260284424, "step": 4618 }, { "epoch": 0.75, "learning_rate": 9.151497591344253e-07, "logits/chosen": -0.9003598690032959, "logits/rejected": -0.8136032819747925, "logps/chosen": -183.50241088867188, "logps/rejected": -52.843841552734375, "loss": 0.4238, "rewards/accuracies": 1.0, "rewards/chosen": 3.9750823974609375, "rewards/margins": 1.661973476409912, "rewards/rejected": 2.3131089210510254, "step": 4619 }, { "epoch": 0.75, "learning_rate": 9.150764991310841e-07, "logits/chosen": -0.7033792734146118, "logits/rejected": -0.6408289670944214, "logps/chosen": -106.60643768310547, "logps/rejected": -96.1786117553711, "loss": 1.3892, "rewards/accuracies": 0.0, "rewards/chosen": 0.7544273734092712, "rewards/margins": -1.7417662143707275, "rewards/rejected": 2.4961936473846436, "step": 4620 }, { "epoch": 0.75, "learning_rate": 9.150032104498844e-07, "logits/chosen": -0.6634955406188965, "logits/rejected": -0.5933115482330322, "logps/chosen": -51.89512634277344, "logps/rejected": -86.61398315429688, "loss": 0.6565, "rewards/accuracies": 0.0, "rewards/chosen": 2.475853681564331, "rewards/margins": -0.9691858291625977, "rewards/rejected": 3.4450395107269287, "step": 4621 }, { "epoch": 0.75, "learning_rate": 9.149298930958896e-07, "logits/chosen": -0.4922214448451996, "logits/rejected": -0.5201058983802795, "logps/chosen": -87.8870849609375, "logps/rejected": -27.16900062561035, "loss": 1.5502, "rewards/accuracies": 1.0, "rewards/chosen": 0.7721328735351562, "rewards/margins": 0.412815660238266, "rewards/rejected": 0.35931721329689026, "step": 4622 }, { "epoch": 0.75, "learning_rate": 9.148565470741652e-07, "logits/chosen": -0.6545908451080322, "logits/rejected": -0.6652770638465881, "logps/chosen": -65.44424438476562, "logps/rejected": -161.197021484375, "loss": 0.3372, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252182245254517, "rewards/margins": 0.27397459745407104, "rewards/rejected": 0.6512436270713806, "step": 4623 }, { "epoch": 0.75, "learning_rate": 9.147831723897788e-07, "logits/chosen": -0.5756394267082214, "logits/rejected": -0.5756394267082214, "logps/chosen": -54.05314636230469, "logps/rejected": -54.05314636230469, "loss": 0.3935, "rewards/accuracies": 0.0, "rewards/chosen": 1.4811333417892456, "rewards/margins": 0.0, "rewards/rejected": 1.4811333417892456, "step": 4624 }, { "epoch": 0.75, "learning_rate": 9.147097690478001e-07, "logits/chosen": -0.19823960959911346, "logits/rejected": -0.17579258978366852, "logps/chosen": -43.67793273925781, "logps/rejected": -86.66307067871094, "loss": 1.5731, "rewards/accuracies": 1.0, "rewards/chosen": 1.4306381940841675, "rewards/margins": 0.8213348984718323, "rewards/rejected": 0.6093032956123352, "step": 4625 }, { "epoch": 0.75, "learning_rate": 9.146363370533003e-07, "logits/chosen": -0.18346530199050903, "logits/rejected": -0.17329582571983337, "logps/chosen": -46.55609893798828, "logps/rejected": -47.01553726196289, "loss": 0.754, "rewards/accuracies": 0.0, "rewards/chosen": 1.9165420532226562, "rewards/margins": -0.22118186950683594, "rewards/rejected": 2.137723922729492, "step": 4626 }, { "epoch": 0.75, "learning_rate": 9.145628764113529e-07, "logits/chosen": -0.4177575409412384, "logits/rejected": -0.3987732231616974, "logps/chosen": -36.943668365478516, "logps/rejected": -26.365890502929688, "loss": 0.7444, "rewards/accuracies": 0.0, "rewards/chosen": -0.264120489358902, "rewards/margins": -0.4369047284126282, "rewards/rejected": 0.1727842390537262, "step": 4627 }, { "epoch": 0.75, "learning_rate": 9.144893871270333e-07, "logits/chosen": -0.9646211266517639, "logits/rejected": -0.9408396482467651, "logps/chosen": -84.65402221679688, "logps/rejected": -70.9051284790039, "loss": 0.7937, "rewards/accuracies": 0.0, "rewards/chosen": 1.2226539850234985, "rewards/margins": -1.1179107427597046, "rewards/rejected": 2.340564727783203, "step": 4628 }, { "epoch": 0.75, "learning_rate": 9.14415869205419e-07, "logits/chosen": -0.4432016611099243, "logits/rejected": -0.4512689709663391, "logps/chosen": -12.851491928100586, "logps/rejected": -2.336716651916504, "loss": 0.4749, "rewards/accuracies": 0.0, "rewards/chosen": -0.13159218430519104, "rewards/margins": -0.4582892656326294, "rewards/rejected": 0.32669708132743835, "step": 4629 }, { "epoch": 0.75, "learning_rate": 9.143423226515894e-07, "logits/chosen": -0.23715892434120178, "logits/rejected": -0.23715892434120178, "logps/chosen": -29.216476440429688, "logps/rejected": -29.216476440429688, "loss": 0.3574, "rewards/accuracies": 0.0, "rewards/chosen": 1.1644309759140015, "rewards/margins": 0.0, "rewards/rejected": 1.1644309759140015, "step": 4630 }, { "epoch": 0.75, "learning_rate": 9.142687474706259e-07, "logits/chosen": -0.6385440230369568, "logits/rejected": -0.7579100728034973, "logps/chosen": -50.57012176513672, "logps/rejected": -126.42829895019531, "loss": 2.1481, "rewards/accuracies": 0.0, "rewards/chosen": 2.1461219787597656, "rewards/margins": -3.529346466064453, "rewards/rejected": 5.675468444824219, "step": 4631 }, { "epoch": 0.75, "learning_rate": 9.141951436676118e-07, "logits/chosen": -0.7139652967453003, "logits/rejected": -0.6947386860847473, "logps/chosen": -96.45321655273438, "logps/rejected": -103.63805389404297, "loss": 1.7576, "rewards/accuracies": 0.0, "rewards/chosen": 0.7766990661621094, "rewards/margins": -2.0191314220428467, "rewards/rejected": 2.795830488204956, "step": 4632 }, { "epoch": 0.75, "learning_rate": 9.141215112476324e-07, "logits/chosen": -0.5569774508476257, "logits/rejected": -0.554046094417572, "logps/chosen": -52.47584533691406, "logps/rejected": -106.73257446289062, "loss": 0.9027, "rewards/accuracies": 1.0, "rewards/chosen": 1.7323074340820312, "rewards/margins": 0.5269156694412231, "rewards/rejected": 1.205391764640808, "step": 4633 }, { "epoch": 0.75, "learning_rate": 9.140478502157749e-07, "logits/chosen": -0.6789112091064453, "logits/rejected": -0.6453746557235718, "logps/chosen": -24.548505783081055, "logps/rejected": -76.4712905883789, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 1.339007019996643, "rewards/margins": 0.617802619934082, "rewards/rejected": 0.721204400062561, "step": 4634 }, { "epoch": 0.75, "learning_rate": 9.13974160577129e-07, "logits/chosen": -0.8459085822105408, "logits/rejected": -0.44270142912864685, "logps/chosen": -94.88090515136719, "logps/rejected": -93.560546875, "loss": 1.4821, "rewards/accuracies": 0.0, "rewards/chosen": 1.51048743724823, "rewards/margins": -1.4299827814102173, "rewards/rejected": 2.9404702186584473, "step": 4635 }, { "epoch": 0.75, "learning_rate": 9.139004423367853e-07, "logits/chosen": -0.5146788358688354, "logits/rejected": -0.6296740770339966, "logps/chosen": -181.3546600341797, "logps/rejected": -63.4093017578125, "loss": 0.2071, "rewards/accuracies": 1.0, "rewards/chosen": 2.8716018199920654, "rewards/margins": 0.9475973844528198, "rewards/rejected": 1.9240044355392456, "step": 4636 }, { "epoch": 0.75, "learning_rate": 9.138266954998377e-07, "logits/chosen": -0.35503557324409485, "logits/rejected": -0.36076676845550537, "logps/chosen": -136.3153533935547, "logps/rejected": -77.0290298461914, "loss": 1.0278, "rewards/accuracies": 0.0, "rewards/chosen": 2.421208143234253, "rewards/margins": -0.47847461700439453, "rewards/rejected": 2.8996827602386475, "step": 4637 }, { "epoch": 0.75, "learning_rate": 9.137529200713809e-07, "logits/chosen": -0.5052119493484497, "logits/rejected": -0.4873215854167938, "logps/chosen": -79.95330047607422, "logps/rejected": -91.16250610351562, "loss": 0.5536, "rewards/accuracies": 0.0, "rewards/chosen": 2.074495792388916, "rewards/margins": -0.640007734298706, "rewards/rejected": 2.714503526687622, "step": 4638 }, { "epoch": 0.75, "learning_rate": 9.136791160565125e-07, "logits/chosen": -0.715090811252594, "logits/rejected": -0.7227451205253601, "logps/chosen": -125.92037200927734, "logps/rejected": -52.12860107421875, "loss": 1.6956, "rewards/accuracies": 0.0, "rewards/chosen": 1.308972954750061, "rewards/margins": -0.46974873542785645, "rewards/rejected": 1.7787216901779175, "step": 4639 }, { "epoch": 0.75, "learning_rate": 9.136052834603313e-07, "logits/chosen": 0.026735378429293633, "logits/rejected": 0.026735378429293633, "logps/chosen": -41.40913772583008, "logps/rejected": -41.40913772583008, "loss": 0.7653, "rewards/accuracies": 0.0, "rewards/chosen": 0.25964394211769104, "rewards/margins": 0.0, "rewards/rejected": 0.25964394211769104, "step": 4640 }, { "epoch": 0.75, "learning_rate": 9.135314222879387e-07, "logits/chosen": -0.698678731918335, "logits/rejected": -0.6856586933135986, "logps/chosen": -55.43598175048828, "logps/rejected": -39.88311767578125, "loss": 2.1914, "rewards/accuracies": 0.0, "rewards/chosen": 1.445460557937622, "rewards/margins": -0.4231768846511841, "rewards/rejected": 1.8686374425888062, "step": 4641 }, { "epoch": 0.75, "learning_rate": 9.134575325444375e-07, "logits/chosen": -0.8470311760902405, "logits/rejected": -0.8193715214729309, "logps/chosen": -15.044428825378418, "logps/rejected": -22.65392303466797, "loss": 0.7559, "rewards/accuracies": 0.0, "rewards/chosen": 0.44063082337379456, "rewards/margins": -0.8133548498153687, "rewards/rejected": 1.2539856433868408, "step": 4642 }, { "epoch": 0.75, "learning_rate": 9.133836142349331e-07, "logits/chosen": -0.38028398156166077, "logits/rejected": -0.378426730632782, "logps/chosen": -26.356246948242188, "logps/rejected": -5.6761980056762695, "loss": 1.348, "rewards/accuracies": 0.0, "rewards/chosen": -0.01718750037252903, "rewards/margins": -0.15014877915382385, "rewards/rejected": 0.13296127319335938, "step": 4643 }, { "epoch": 0.75, "learning_rate": 9.133096673645324e-07, "logits/chosen": -0.3981175124645233, "logits/rejected": -0.3793617784976959, "logps/chosen": -45.67868423461914, "logps/rejected": -45.392555236816406, "loss": 0.7044, "rewards/accuracies": 1.0, "rewards/chosen": 1.6429370641708374, "rewards/margins": 0.8204089999198914, "rewards/rejected": 0.822528064250946, "step": 4644 }, { "epoch": 0.75, "learning_rate": 9.132356919383445e-07, "logits/chosen": -0.95494544506073, "logits/rejected": -0.8939681053161621, "logps/chosen": -86.5597915649414, "logps/rejected": -35.68425750732422, "loss": 0.3167, "rewards/accuracies": 1.0, "rewards/chosen": 1.3721923828125, "rewards/margins": 0.5091041326522827, "rewards/rejected": 0.8630882501602173, "step": 4645 }, { "epoch": 0.75, "learning_rate": 9.131616879614803e-07, "logits/chosen": -0.9353500008583069, "logits/rejected": -0.8538886308670044, "logps/chosen": -104.93629455566406, "logps/rejected": -62.28185272216797, "loss": 0.2404, "rewards/accuracies": 1.0, "rewards/chosen": 0.8908950686454773, "rewards/margins": 0.5416069030761719, "rewards/rejected": 0.3492881953716278, "step": 4646 }, { "epoch": 0.75, "learning_rate": 9.130876554390529e-07, "logits/chosen": -0.7691515684127808, "logits/rejected": -0.6021450161933899, "logps/chosen": -132.9533233642578, "logps/rejected": -66.70466613769531, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 5.025346279144287, "rewards/margins": 3.028455972671509, "rewards/rejected": 1.9968903064727783, "step": 4647 }, { "epoch": 0.75, "learning_rate": 9.130135943761771e-07, "logits/chosen": -0.6298506259918213, "logits/rejected": -0.6067172288894653, "logps/chosen": -148.76644897460938, "logps/rejected": -77.58660125732422, "loss": 0.3479, "rewards/accuracies": 1.0, "rewards/chosen": 4.865043640136719, "rewards/margins": 2.4782652854919434, "rewards/rejected": 2.3867783546447754, "step": 4648 }, { "epoch": 0.75, "learning_rate": 9.1293950477797e-07, "logits/chosen": -0.22276514768600464, "logits/rejected": -0.24716763198375702, "logps/chosen": -28.277118682861328, "logps/rejected": -21.154769897460938, "loss": 0.4628, "rewards/accuracies": 0.0, "rewards/chosen": -0.17848359048366547, "rewards/margins": -0.220918670296669, "rewards/rejected": 0.04243507608771324, "step": 4649 }, { "epoch": 0.75, "learning_rate": 9.128653866495503e-07, "logits/chosen": -0.7414372563362122, "logits/rejected": -0.7134868502616882, "logps/chosen": -82.08839416503906, "logps/rejected": -114.6970443725586, "loss": 0.5911, "rewards/accuracies": 0.0, "rewards/chosen": 1.8113449811935425, "rewards/margins": -0.1841193437576294, "rewards/rejected": 1.9954643249511719, "step": 4650 }, { "epoch": 0.75, "learning_rate": 9.127912399960389e-07, "logits/chosen": -0.3191535472869873, "logits/rejected": -0.2627716362476349, "logps/chosen": -35.593231201171875, "logps/rejected": -56.958763122558594, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 2.0027425289154053, "rewards/margins": 1.4984192848205566, "rewards/rejected": 0.5043231844902039, "step": 4651 }, { "epoch": 0.76, "learning_rate": 9.127170648225588e-07, "logits/chosen": -0.5446518659591675, "logits/rejected": -0.7029462456703186, "logps/chosen": -70.73677062988281, "logps/rejected": -183.97555541992188, "loss": 2.8385, "rewards/accuracies": 0.0, "rewards/chosen": 0.4751243591308594, "rewards/margins": -4.977069854736328, "rewards/rejected": 5.4521942138671875, "step": 4652 }, { "epoch": 0.76, "learning_rate": 9.126428611342347e-07, "logits/chosen": -0.7642130255699158, "logits/rejected": -0.759539008140564, "logps/chosen": -44.24953842163086, "logps/rejected": -56.34836196899414, "loss": 0.5622, "rewards/accuracies": 0.0, "rewards/chosen": 1.3236016035079956, "rewards/margins": -0.1565936803817749, "rewards/rejected": 1.4801952838897705, "step": 4653 }, { "epoch": 0.76, "learning_rate": 9.125686289361933e-07, "logits/chosen": -0.7697785496711731, "logits/rejected": -0.7196125984191895, "logps/chosen": -54.63652801513672, "logps/rejected": -99.6369857788086, "loss": 0.3368, "rewards/accuracies": 1.0, "rewards/chosen": 1.874505639076233, "rewards/margins": 0.2441192865371704, "rewards/rejected": 1.6303863525390625, "step": 4654 }, { "epoch": 0.76, "learning_rate": 9.124943682335634e-07, "logits/chosen": -0.4916890263557434, "logits/rejected": -0.3110707402229309, "logps/chosen": -120.00984954833984, "logps/rejected": -16.70138168334961, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": 2.460508108139038, "rewards/margins": 1.3408843278884888, "rewards/rejected": 1.1196237802505493, "step": 4655 }, { "epoch": 0.76, "learning_rate": 9.124200790314758e-07, "logits/chosen": -0.5956804156303406, "logits/rejected": -0.5747297406196594, "logps/chosen": -72.20708465576172, "logps/rejected": -109.94651794433594, "loss": 0.2854, "rewards/accuracies": 1.0, "rewards/chosen": 1.8342819213867188, "rewards/margins": 0.28785240650177, "rewards/rejected": 1.5464295148849487, "step": 4656 }, { "epoch": 0.76, "learning_rate": 9.12345761335063e-07, "logits/chosen": -0.7908903956413269, "logits/rejected": -0.815925121307373, "logps/chosen": -165.90127563476562, "logps/rejected": -140.07733154296875, "loss": 2.1469, "rewards/accuracies": 0.0, "rewards/chosen": 1.9429749250411987, "rewards/margins": -3.48478364944458, "rewards/rejected": 5.427758693695068, "step": 4657 }, { "epoch": 0.76, "learning_rate": 9.122714151494597e-07, "logits/chosen": -0.5210556983947754, "logits/rejected": -0.5007708668708801, "logps/chosen": -67.81614685058594, "logps/rejected": -101.61761474609375, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": 1.7372757196426392, "rewards/margins": 0.7286331653594971, "rewards/rejected": 1.008642554283142, "step": 4658 }, { "epoch": 0.76, "learning_rate": 9.121970404798026e-07, "logits/chosen": -0.6075393557548523, "logits/rejected": -0.5338006615638733, "logps/chosen": -59.31990432739258, "logps/rejected": -66.74131774902344, "loss": 0.2563, "rewards/accuracies": 1.0, "rewards/chosen": 3.2046115398406982, "rewards/margins": 0.6450252532958984, "rewards/rejected": 2.5595862865448, "step": 4659 }, { "epoch": 0.76, "learning_rate": 9.121226373312303e-07, "logits/chosen": -0.4612298905849457, "logits/rejected": -0.38161906599998474, "logps/chosen": -168.4922637939453, "logps/rejected": -22.698505401611328, "loss": 0.1363, "rewards/accuracies": 1.0, "rewards/chosen": 3.6200790405273438, "rewards/margins": 3.4292242527008057, "rewards/rejected": 0.19085483253002167, "step": 4660 }, { "epoch": 0.76, "learning_rate": 9.120482057088832e-07, "logits/chosen": -0.5357998609542847, "logits/rejected": -0.5357998609542847, "logps/chosen": -26.666706085205078, "logps/rejected": -26.666706085205078, "loss": 1.3519, "rewards/accuracies": 0.0, "rewards/chosen": 1.0143784284591675, "rewards/margins": 0.0, "rewards/rejected": 1.0143784284591675, "step": 4661 }, { "epoch": 0.76, "learning_rate": 9.119737456179039e-07, "logits/chosen": -0.6491206288337708, "logits/rejected": -0.6076536178588867, "logps/chosen": -70.84996032714844, "logps/rejected": -77.04510498046875, "loss": 0.3708, "rewards/accuracies": 1.0, "rewards/chosen": 2.4321823120117188, "rewards/margins": 0.44326019287109375, "rewards/rejected": 1.988922119140625, "step": 4662 }, { "epoch": 0.76, "learning_rate": 9.118992570634372e-07, "logits/chosen": -0.9693272709846497, "logits/rejected": -0.7444891333580017, "logps/chosen": -154.62623596191406, "logps/rejected": -129.59967041015625, "loss": 2.341, "rewards/accuracies": 0.0, "rewards/chosen": 1.9473053216934204, "rewards/margins": -4.657168865203857, "rewards/rejected": 6.604474067687988, "step": 4663 }, { "epoch": 0.76, "learning_rate": 9.118247400506289e-07, "logits/chosen": -0.7975751757621765, "logits/rejected": -0.7880740761756897, "logps/chosen": -158.01943969726562, "logps/rejected": -215.52854919433594, "loss": 0.9299, "rewards/accuracies": 0.0, "rewards/chosen": 5.589895725250244, "rewards/margins": -1.630894660949707, "rewards/rejected": 7.220790386199951, "step": 4664 }, { "epoch": 0.76, "learning_rate": 9.11750194584628e-07, "logits/chosen": -0.4742915630340576, "logits/rejected": -0.4742915630340576, "logps/chosen": -0.8839157223701477, "logps/rejected": -0.8839157223701477, "loss": 0.3788, "rewards/accuracies": 0.0, "rewards/chosen": 0.45294928550720215, "rewards/margins": 0.0, "rewards/rejected": 0.45294928550720215, "step": 4665 }, { "epoch": 0.76, "learning_rate": 9.116756206705847e-07, "logits/chosen": -0.5799290537834167, "logits/rejected": -0.5543769001960754, "logps/chosen": -190.64398193359375, "logps/rejected": -60.4918327331543, "loss": 0.0915, "rewards/accuracies": 1.0, "rewards/chosen": 3.7155792713165283, "rewards/margins": 2.022430896759033, "rewards/rejected": 1.6931484937667847, "step": 4666 }, { "epoch": 0.76, "learning_rate": 9.116010183136511e-07, "logits/chosen": -0.7267791032791138, "logits/rejected": -0.6810115575790405, "logps/chosen": -197.7405242919922, "logps/rejected": -74.97535705566406, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 4.386203289031982, "rewards/margins": 1.560882806777954, "rewards/rejected": 2.8253204822540283, "step": 4667 }, { "epoch": 0.76, "learning_rate": 9.11526387518982e-07, "logits/chosen": -0.45897752046585083, "logits/rejected": -0.677010715007782, "logps/chosen": -54.732913970947266, "logps/rejected": -34.27412796020508, "loss": 0.2425, "rewards/accuracies": 1.0, "rewards/chosen": 2.3199520111083984, "rewards/margins": 0.6675609350204468, "rewards/rejected": 1.6523910760879517, "step": 4668 }, { "epoch": 0.76, "learning_rate": 9.114517282917334e-07, "logits/chosen": -0.510737955570221, "logits/rejected": -0.485316663980484, "logps/chosen": -35.09589385986328, "logps/rejected": -24.02030372619629, "loss": 2.0634, "rewards/accuracies": 0.0, "rewards/chosen": 1.5295273065567017, "rewards/margins": -0.19770336151123047, "rewards/rejected": 1.7272306680679321, "step": 4669 }, { "epoch": 0.76, "learning_rate": 9.113770406370633e-07, "logits/chosen": -0.5223398804664612, "logits/rejected": -0.4119873344898224, "logps/chosen": -72.26518249511719, "logps/rejected": -65.60254669189453, "loss": 0.2464, "rewards/accuracies": 1.0, "rewards/chosen": 1.7951080799102783, "rewards/margins": 0.5067535638809204, "rewards/rejected": 1.288354516029358, "step": 4670 }, { "epoch": 0.76, "learning_rate": 9.113023245601323e-07, "logits/chosen": -0.6645308136940002, "logits/rejected": -0.6025019288063049, "logps/chosen": -98.41429138183594, "logps/rejected": -55.01015853881836, "loss": 0.8496, "rewards/accuracies": 1.0, "rewards/chosen": 1.814428687095642, "rewards/margins": 1.591786503791809, "rewards/rejected": 0.22264213860034943, "step": 4671 }, { "epoch": 0.76, "learning_rate": 9.112275800661026e-07, "logits/chosen": -0.15998578071594238, "logits/rejected": -0.14514076709747314, "logps/chosen": -3.364753484725952, "logps/rejected": -18.417505264282227, "loss": 0.9791, "rewards/accuracies": 1.0, "rewards/chosen": 0.40997228026390076, "rewards/margins": 0.28849077224731445, "rewards/rejected": 0.1214815154671669, "step": 4672 }, { "epoch": 0.76, "learning_rate": 9.111528071601381e-07, "logits/chosen": -0.7626171112060547, "logits/rejected": -0.6603047847747803, "logps/chosen": -90.0218505859375, "logps/rejected": -73.67208862304688, "loss": 1.2757, "rewards/accuracies": 1.0, "rewards/chosen": 3.290818929672241, "rewards/margins": 1.6911011934280396, "rewards/rejected": 1.5997177362442017, "step": 4673 }, { "epoch": 0.76, "learning_rate": 9.11078005847405e-07, "logits/chosen": -0.6975429058074951, "logits/rejected": -0.7196676135063171, "logps/chosen": -65.60870361328125, "logps/rejected": -80.3421859741211, "loss": 0.7959, "rewards/accuracies": 0.0, "rewards/chosen": 1.8980941772460938, "rewards/margins": -0.27402496337890625, "rewards/rejected": 2.172119140625, "step": 4674 }, { "epoch": 0.76, "learning_rate": 9.110031761330712e-07, "logits/chosen": -0.14196357131004333, "logits/rejected": -0.14190135896205902, "logps/chosen": -2.378493070602417, "logps/rejected": -8.868011474609375, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 0.2131955921649933, "rewards/margins": 0.21651676297187805, "rewards/rejected": -0.0033211708068847656, "step": 4675 }, { "epoch": 0.76, "learning_rate": 9.109283180223071e-07, "logits/chosen": -0.9380735158920288, "logits/rejected": -0.8989457488059998, "logps/chosen": -83.50359344482422, "logps/rejected": -81.79718017578125, "loss": 0.5406, "rewards/accuracies": 0.0, "rewards/chosen": 0.3070366084575653, "rewards/margins": -0.5899680852890015, "rewards/rejected": 0.8970047235488892, "step": 4676 }, { "epoch": 0.76, "learning_rate": 9.108534315202844e-07, "logits/chosen": -0.6339023113250732, "logits/rejected": -0.5532987713813782, "logps/chosen": -103.26514434814453, "logps/rejected": -89.22846984863281, "loss": 0.1726, "rewards/accuracies": 1.0, "rewards/chosen": 4.539974212646484, "rewards/margins": 0.8965888023376465, "rewards/rejected": 3.643385410308838, "step": 4677 }, { "epoch": 0.76, "learning_rate": 9.107785166321771e-07, "logits/chosen": -0.7316462993621826, "logits/rejected": -0.7036880850791931, "logps/chosen": -146.5474853515625, "logps/rejected": -57.574771881103516, "loss": 0.4061, "rewards/accuracies": 1.0, "rewards/chosen": 0.544384777545929, "rewards/margins": 0.03307455778121948, "rewards/rejected": 0.5113102197647095, "step": 4678 }, { "epoch": 0.76, "learning_rate": 9.107035733631612e-07, "logits/chosen": -0.5085078477859497, "logits/rejected": -0.5394033789634705, "logps/chosen": -26.157482147216797, "logps/rejected": -67.54960632324219, "loss": 0.3535, "rewards/accuracies": 1.0, "rewards/chosen": 0.5360473990440369, "rewards/margins": 0.5575012564659119, "rewards/rejected": -0.021453857421875, "step": 4679 }, { "epoch": 0.76, "learning_rate": 9.106286017184143e-07, "logits/chosen": -0.4276554584503174, "logits/rejected": -0.31045109033584595, "logps/chosen": -60.58152389526367, "logps/rejected": -69.4444808959961, "loss": 0.8884, "rewards/accuracies": 1.0, "rewards/chosen": 2.9295995235443115, "rewards/margins": 1.825804591178894, "rewards/rejected": 1.1037949323654175, "step": 4680 }, { "epoch": 0.76, "learning_rate": 9.105536017031166e-07, "logits/chosen": -0.9548473954200745, "logits/rejected": -0.8937512040138245, "logps/chosen": -105.25869750976562, "logps/rejected": -99.01702117919922, "loss": 1.339, "rewards/accuracies": 1.0, "rewards/chosen": 3.756848096847534, "rewards/margins": 0.17190170288085938, "rewards/rejected": 3.584946393966675, "step": 4681 }, { "epoch": 0.76, "learning_rate": 9.104785733224496e-07, "logits/chosen": -0.7654789686203003, "logits/rejected": -0.7113957405090332, "logps/chosen": -68.5242691040039, "logps/rejected": -49.1137809753418, "loss": 0.1569, "rewards/accuracies": 1.0, "rewards/chosen": 2.8773086071014404, "rewards/margins": 1.0863689184188843, "rewards/rejected": 1.7909396886825562, "step": 4682 }, { "epoch": 0.76, "learning_rate": 9.104035165815971e-07, "logits/chosen": -0.6422898769378662, "logits/rejected": -0.5079272985458374, "logps/chosen": -104.95658111572266, "logps/rejected": -81.34956359863281, "loss": 0.8029, "rewards/accuracies": 1.0, "rewards/chosen": 3.420635223388672, "rewards/margins": 0.3507354259490967, "rewards/rejected": 3.069899797439575, "step": 4683 }, { "epoch": 0.76, "learning_rate": 9.103284314857451e-07, "logits/chosen": -0.8864518404006958, "logits/rejected": -0.36611422896385193, "logps/chosen": -83.31686401367188, "logps/rejected": -103.54544830322266, "loss": 0.6073, "rewards/accuracies": 0.0, "rewards/chosen": 1.5814125537872314, "rewards/margins": -0.8451447486877441, "rewards/rejected": 2.4265573024749756, "step": 4684 }, { "epoch": 0.76, "learning_rate": 9.102533180400809e-07, "logits/chosen": -0.29201820492744446, "logits/rejected": -0.3159154951572418, "logps/chosen": -57.5017204284668, "logps/rejected": -42.18889617919922, "loss": 0.802, "rewards/accuracies": 0.0, "rewards/chosen": 0.49965134263038635, "rewards/margins": -1.3292884826660156, "rewards/rejected": 1.8289397954940796, "step": 4685 }, { "epoch": 0.76, "learning_rate": 9.101781762497943e-07, "logits/chosen": -0.515151858329773, "logits/rejected": -0.5255914330482483, "logps/chosen": -96.6365737915039, "logps/rejected": -42.01784133911133, "loss": 1.3075, "rewards/accuracies": 0.0, "rewards/chosen": 0.9180518984794617, "rewards/margins": -0.10223275423049927, "rewards/rejected": 1.020284652709961, "step": 4686 }, { "epoch": 0.76, "learning_rate": 9.10103006120077e-07, "logits/chosen": -0.7957761883735657, "logits/rejected": -0.7435178160667419, "logps/chosen": -49.08119201660156, "logps/rejected": -11.997528076171875, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 1.2224044799804688, "rewards/margins": 0.5928968191146851, "rewards/rejected": 0.6295076608657837, "step": 4687 }, { "epoch": 0.76, "learning_rate": 9.100278076561222e-07, "logits/chosen": -0.6410240530967712, "logits/rejected": -0.7095699906349182, "logps/chosen": -62.85045623779297, "logps/rejected": -55.4880485534668, "loss": 0.8096, "rewards/accuracies": 0.0, "rewards/chosen": 1.0593849420547485, "rewards/margins": -1.0214282274246216, "rewards/rejected": 2.08081316947937, "step": 4688 }, { "epoch": 0.76, "learning_rate": 9.099525808631256e-07, "logits/chosen": -0.6708809733390808, "logits/rejected": -0.5995337963104248, "logps/chosen": -47.93631362915039, "logps/rejected": -139.13221740722656, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 1.8148243427276611, "rewards/margins": 0.6786472797393799, "rewards/rejected": 1.1361770629882812, "step": 4689 }, { "epoch": 0.76, "learning_rate": 9.098773257462848e-07, "logits/chosen": -0.5266875624656677, "logits/rejected": -0.48909351229667664, "logps/chosen": -70.55142211914062, "logps/rejected": -98.4775390625, "loss": 1.7832, "rewards/accuracies": 0.0, "rewards/chosen": 1.787251353263855, "rewards/margins": -1.2081772089004517, "rewards/rejected": 2.9954285621643066, "step": 4690 }, { "epoch": 0.76, "learning_rate": 9.09802042310799e-07, "logits/chosen": -0.6482741832733154, "logits/rejected": -0.7433050274848938, "logps/chosen": -83.46098327636719, "logps/rejected": -130.0347900390625, "loss": 3.0112, "rewards/accuracies": 0.0, "rewards/chosen": 1.2369674444198608, "rewards/margins": -3.3441481590270996, "rewards/rejected": 4.58111572265625, "step": 4691 }, { "epoch": 0.76, "learning_rate": 9.097267305618698e-07, "logits/chosen": -0.3005402684211731, "logits/rejected": -0.3095865845680237, "logps/chosen": -3.2867348194122314, "logps/rejected": -1.3796875476837158, "loss": 0.4362, "rewards/accuracies": 0.0, "rewards/chosen": 0.1825226992368698, "rewards/margins": -0.11431147158145905, "rewards/rejected": 0.29683417081832886, "step": 4692 }, { "epoch": 0.76, "learning_rate": 9.096513905047002e-07, "logits/chosen": -0.3778480291366577, "logits/rejected": -0.4024624824523926, "logps/chosen": -37.342689514160156, "logps/rejected": -74.14791870117188, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": 1.2959213256835938, "rewards/margins": -0.27311861515045166, "rewards/rejected": 1.5690399408340454, "step": 4693 }, { "epoch": 0.76, "learning_rate": 9.095760221444959e-07, "logits/chosen": -0.6234432458877563, "logits/rejected": -0.5920825600624084, "logps/chosen": -109.31716918945312, "logps/rejected": -84.06809997558594, "loss": 1.408, "rewards/accuracies": 0.0, "rewards/chosen": -0.31592559814453125, "rewards/margins": -1.2171661853790283, "rewards/rejected": 0.9012405276298523, "step": 4694 }, { "epoch": 0.76, "learning_rate": 9.095006254864638e-07, "logits/chosen": -1.006260633468628, "logits/rejected": -1.0131433010101318, "logps/chosen": -129.43017578125, "logps/rejected": -87.70318603515625, "loss": 1.0614, "rewards/accuracies": 0.0, "rewards/chosen": 1.3841583728790283, "rewards/margins": -0.9432449340820312, "rewards/rejected": 2.3274033069610596, "step": 4695 }, { "epoch": 0.76, "learning_rate": 9.094252005358132e-07, "logits/chosen": -0.8560337424278259, "logits/rejected": -0.6667025685310364, "logps/chosen": -67.81297302246094, "logps/rejected": -97.98786926269531, "loss": 1.0509, "rewards/accuracies": 0.0, "rewards/chosen": 0.7027832269668579, "rewards/margins": -1.1656792163848877, "rewards/rejected": 1.8684624433517456, "step": 4696 }, { "epoch": 0.76, "learning_rate": 9.093497472977551e-07, "logits/chosen": -0.9214292168617249, "logits/rejected": -0.8709401488304138, "logps/chosen": -72.36145782470703, "logps/rejected": -34.07283401489258, "loss": 1.7538, "rewards/accuracies": 1.0, "rewards/chosen": 2.4294211864471436, "rewards/margins": 2.1739752292633057, "rewards/rejected": 0.2554458677768707, "step": 4697 }, { "epoch": 0.76, "learning_rate": 9.09274265777503e-07, "logits/chosen": -0.8591983318328857, "logits/rejected": -0.7759979367256165, "logps/chosen": -111.67808532714844, "logps/rejected": -190.6467742919922, "loss": 1.1634, "rewards/accuracies": 0.0, "rewards/chosen": 4.113511562347412, "rewards/margins": -2.1102967262268066, "rewards/rejected": 6.223808288574219, "step": 4698 }, { "epoch": 0.76, "learning_rate": 9.091987559802716e-07, "logits/chosen": -0.5994354486465454, "logits/rejected": -0.5927208662033081, "logps/chosen": -24.980472564697266, "logps/rejected": -0.932968020439148, "loss": 0.8315, "rewards/accuracies": 0.0, "rewards/chosen": -0.19211597740650177, "rewards/margins": -0.49317634105682373, "rewards/rejected": 0.30106034874916077, "step": 4699 }, { "epoch": 0.76, "learning_rate": 9.091232179112781e-07, "logits/chosen": -0.593722939491272, "logits/rejected": -0.5088589191436768, "logps/chosen": -100.54722595214844, "logps/rejected": -86.4166259765625, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 0.777362048625946, "rewards/margins": 0.16793286800384521, "rewards/rejected": 0.6094291806221008, "step": 4700 }, { "epoch": 0.76, "learning_rate": 9.090476515757415e-07, "logits/chosen": -0.3407232165336609, "logits/rejected": -0.32981595396995544, "logps/chosen": -102.09019470214844, "logps/rejected": -91.09381103515625, "loss": 0.9408, "rewards/accuracies": 0.0, "rewards/chosen": 0.7668853998184204, "rewards/margins": -0.6635879278182983, "rewards/rejected": 1.4304733276367188, "step": 4701 }, { "epoch": 0.76, "learning_rate": 9.089720569788824e-07, "logits/chosen": -0.5698906183242798, "logits/rejected": -0.5698906183242798, "logps/chosen": -46.138702392578125, "logps/rejected": -46.138702392578125, "loss": 0.4654, "rewards/accuracies": 0.0, "rewards/chosen": 1.0747276544570923, "rewards/margins": 0.0, "rewards/rejected": 1.0747276544570923, "step": 4702 }, { "epoch": 0.76, "learning_rate": 9.08896434125924e-07, "logits/chosen": -0.6856665015220642, "logits/rejected": -0.6401086449623108, "logps/chosen": -71.79241180419922, "logps/rejected": -54.687801361083984, "loss": 0.4819, "rewards/accuracies": 0.0, "rewards/chosen": 1.6321624517440796, "rewards/margins": -0.09760940074920654, "rewards/rejected": 1.7297718524932861, "step": 4703 }, { "epoch": 0.76, "learning_rate": 9.088207830220911e-07, "logits/chosen": -0.5433340072631836, "logits/rejected": -0.5433340072631836, "logps/chosen": -63.928096771240234, "logps/rejected": -63.928096771240234, "loss": 0.3628, "rewards/accuracies": 0.0, "rewards/chosen": 2.3395602703094482, "rewards/margins": 0.0, "rewards/rejected": 2.3395602703094482, "step": 4704 }, { "epoch": 0.76, "learning_rate": 9.087451036726102e-07, "logits/chosen": -1.2320345640182495, "logits/rejected": -1.2198224067687988, "logps/chosen": -78.77603149414062, "logps/rejected": -42.62760925292969, "loss": 0.2399, "rewards/accuracies": 1.0, "rewards/chosen": 0.9323089718818665, "rewards/margins": 0.7543563842773438, "rewards/rejected": 0.1779525727033615, "step": 4705 }, { "epoch": 0.76, "learning_rate": 9.086693960827105e-07, "logits/chosen": -0.900638222694397, "logits/rejected": -0.9290069937705994, "logps/chosen": -61.822593688964844, "logps/rejected": -110.4930419921875, "loss": 1.1478, "rewards/accuracies": 0.0, "rewards/chosen": 0.47885438799858093, "rewards/margins": -0.3160301148891449, "rewards/rejected": 0.7948845028877258, "step": 4706 }, { "epoch": 0.76, "learning_rate": 9.085936602576221e-07, "logits/chosen": -0.9676588773727417, "logits/rejected": -0.9450328946113586, "logps/chosen": -124.82677459716797, "logps/rejected": -120.76750946044922, "loss": 0.8027, "rewards/accuracies": 0.0, "rewards/chosen": 5.049243927001953, "rewards/margins": -1.3781280517578125, "rewards/rejected": 6.427371978759766, "step": 4707 }, { "epoch": 0.76, "learning_rate": 9.085178962025782e-07, "logits/chosen": -0.6829895973205566, "logits/rejected": -0.6829895973205566, "logps/chosen": -56.719703674316406, "logps/rejected": -56.719703674316406, "loss": 0.349, "rewards/accuracies": 0.0, "rewards/chosen": 0.9471901059150696, "rewards/margins": 0.0, "rewards/rejected": 0.9471901059150696, "step": 4708 }, { "epoch": 0.76, "learning_rate": 9.084421039228129e-07, "logits/chosen": -0.31514984369277954, "logits/rejected": -0.3163047134876251, "logps/chosen": -5.659461975097656, "logps/rejected": -4.500885963439941, "loss": 0.7155, "rewards/accuracies": 0.0, "rewards/chosen": 0.2163061648607254, "rewards/margins": -0.08764056861400604, "rewards/rejected": 0.30394673347473145, "step": 4709 }, { "epoch": 0.76, "learning_rate": 9.083662834235629e-07, "logits/chosen": -1.0553393363952637, "logits/rejected": -0.8394725918769836, "logps/chosen": -169.58200073242188, "logps/rejected": -21.475740432739258, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 4.666592597961426, "rewards/margins": 4.232169151306152, "rewards/rejected": 0.4344232678413391, "step": 4710 }, { "epoch": 0.76, "learning_rate": 9.082904347100668e-07, "logits/chosen": -0.5078539252281189, "logits/rejected": -0.5292877554893494, "logps/chosen": -57.140708923339844, "logps/rejected": -54.78771209716797, "loss": 0.7031, "rewards/accuracies": 0.0, "rewards/chosen": 0.5228626132011414, "rewards/margins": -1.1175205707550049, "rewards/rejected": 1.6403831243515015, "step": 4711 }, { "epoch": 0.76, "learning_rate": 9.082145577875648e-07, "logits/chosen": -0.7306612730026245, "logits/rejected": -0.7015475034713745, "logps/chosen": -62.44090270996094, "logps/rejected": -104.49520874023438, "loss": 1.5042, "rewards/accuracies": 1.0, "rewards/chosen": 1.5586349964141846, "rewards/margins": 0.24858248233795166, "rewards/rejected": 1.310052514076233, "step": 4712 }, { "epoch": 0.76, "learning_rate": 9.081386526612998e-07, "logits/chosen": -0.41164135932922363, "logits/rejected": -0.30538395047187805, "logps/chosen": -30.169593811035156, "logps/rejected": -45.234554290771484, "loss": 0.8468, "rewards/accuracies": 1.0, "rewards/chosen": 2.2410848140716553, "rewards/margins": 0.43965041637420654, "rewards/rejected": 1.8014343976974487, "step": 4713 }, { "epoch": 0.77, "learning_rate": 9.080627193365154e-07, "logits/chosen": -0.5437769293785095, "logits/rejected": -0.49904000759124756, "logps/chosen": -77.56864929199219, "logps/rejected": -102.81625366210938, "loss": 0.8426, "rewards/accuracies": 1.0, "rewards/chosen": 1.054174780845642, "rewards/margins": 0.03690338134765625, "rewards/rejected": 1.0172713994979858, "step": 4714 }, { "epoch": 0.77, "learning_rate": 9.079867578184584e-07, "logits/chosen": -0.5846872329711914, "logits/rejected": -0.5485960245132446, "logps/chosen": -107.27507019042969, "logps/rejected": -64.48175811767578, "loss": 0.3338, "rewards/accuracies": 1.0, "rewards/chosen": 4.992732524871826, "rewards/margins": 2.776322364807129, "rewards/rejected": 2.2164101600646973, "step": 4715 }, { "epoch": 0.77, "learning_rate": 9.079107681123766e-07, "logits/chosen": -0.9171425700187683, "logits/rejected": -0.8037817478179932, "logps/chosen": -69.3177490234375, "logps/rejected": -79.10855102539062, "loss": 1.7988, "rewards/accuracies": 1.0, "rewards/chosen": 1.8628501892089844, "rewards/margins": 1.1108527183532715, "rewards/rejected": 0.7519974112510681, "step": 4716 }, { "epoch": 0.77, "learning_rate": 9.078347502235206e-07, "logits/chosen": -0.5077962279319763, "logits/rejected": -0.45690295100212097, "logps/chosen": -58.36911392211914, "logps/rejected": -77.11852264404297, "loss": 0.7104, "rewards/accuracies": 0.0, "rewards/chosen": 0.6947200894355774, "rewards/margins": -1.04429292678833, "rewards/rejected": 1.7390129566192627, "step": 4717 }, { "epoch": 0.77, "learning_rate": 9.077587041571424e-07, "logits/chosen": -0.15067829191684723, "logits/rejected": -0.15067829191684723, "logps/chosen": -79.91254425048828, "logps/rejected": -79.91254425048828, "loss": 0.7356, "rewards/accuracies": 0.0, "rewards/chosen": 1.1859687566757202, "rewards/margins": 0.0, "rewards/rejected": 1.1859687566757202, "step": 4718 }, { "epoch": 0.77, "learning_rate": 9.076826299184958e-07, "logits/chosen": -0.4660082459449768, "logits/rejected": -0.41168907284736633, "logps/chosen": -70.75257873535156, "logps/rejected": -132.14976501464844, "loss": 0.2164, "rewards/accuracies": 1.0, "rewards/chosen": 1.1022919416427612, "rewards/margins": 0.7207214832305908, "rewards/rejected": 0.381570428609848, "step": 4719 }, { "epoch": 0.77, "learning_rate": 9.07606527512837e-07, "logits/chosen": -0.6287827491760254, "logits/rejected": -0.6477063298225403, "logps/chosen": -74.16697692871094, "logps/rejected": -107.69850158691406, "loss": 0.316, "rewards/accuracies": 1.0, "rewards/chosen": 5.114366054534912, "rewards/margins": 0.17093467712402344, "rewards/rejected": 4.943431377410889, "step": 4720 }, { "epoch": 0.77, "learning_rate": 9.075303969454242e-07, "logits/chosen": -0.8554795980453491, "logits/rejected": -0.6864609122276306, "logps/chosen": -136.1253662109375, "logps/rejected": -49.95444107055664, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 5.4833269119262695, "rewards/margins": 3.7700324058532715, "rewards/rejected": 1.7132946252822876, "step": 4721 }, { "epoch": 0.77, "learning_rate": 9.074542382215169e-07, "logits/chosen": 0.05079231411218643, "logits/rejected": 0.034607287496328354, "logps/chosen": -3.196237325668335, "logps/rejected": -30.95334815979004, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 0.3858107030391693, "rewards/margins": 0.1224537193775177, "rewards/rejected": 0.2633569836616516, "step": 4722 }, { "epoch": 0.77, "learning_rate": 9.073780513463772e-07, "logits/chosen": -1.0333625078201294, "logits/rejected": -0.9735295176506042, "logps/chosen": -61.159324645996094, "logps/rejected": -25.136837005615234, "loss": 0.6464, "rewards/accuracies": 1.0, "rewards/chosen": 2.017113447189331, "rewards/margins": 0.5041564702987671, "rewards/rejected": 1.512956976890564, "step": 4723 }, { "epoch": 0.77, "learning_rate": 9.073018363252687e-07, "logits/chosen": -0.8228040337562561, "logits/rejected": -0.7797767519950867, "logps/chosen": -67.05183410644531, "logps/rejected": -36.75881576538086, "loss": 0.7836, "rewards/accuracies": 1.0, "rewards/chosen": 2.7623116970062256, "rewards/margins": 0.24391651153564453, "rewards/rejected": 2.518395185470581, "step": 4724 }, { "epoch": 0.77, "learning_rate": 9.072255931634572e-07, "logits/chosen": -0.6545127034187317, "logits/rejected": -0.49924859404563904, "logps/chosen": -101.3916015625, "logps/rejected": -19.009292602539062, "loss": 0.2052, "rewards/accuracies": 1.0, "rewards/chosen": 1.6258010864257812, "rewards/margins": 1.35099196434021, "rewards/rejected": 0.2748090922832489, "step": 4725 }, { "epoch": 0.77, "learning_rate": 9.071493218662106e-07, "logits/chosen": -0.8427174687385559, "logits/rejected": -0.7959205508232117, "logps/chosen": -41.701011657714844, "logps/rejected": -50.595741271972656, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 1.7584251165390015, "rewards/margins": 0.4670424461364746, "rewards/rejected": 1.2913826704025269, "step": 4726 }, { "epoch": 0.77, "learning_rate": 9.070730224387982e-07, "logits/chosen": -0.40837591886520386, "logits/rejected": -0.40837591886520386, "logps/chosen": -48.79966735839844, "logps/rejected": -48.79966735839844, "loss": 2.3527, "rewards/accuracies": 0.0, "rewards/chosen": 1.3858299255371094, "rewards/margins": 0.0, "rewards/rejected": 1.3858299255371094, "step": 4727 }, { "epoch": 0.77, "learning_rate": 9.069966948864916e-07, "logits/chosen": -0.7738280296325684, "logits/rejected": -0.6484619379043579, "logps/chosen": -95.57936096191406, "logps/rejected": -71.31153869628906, "loss": 0.8914, "rewards/accuracies": 0.0, "rewards/chosen": 0.977435290813446, "rewards/margins": -1.4701316356658936, "rewards/rejected": 2.4475669860839844, "step": 4728 }, { "epoch": 0.77, "learning_rate": 9.069203392145646e-07, "logits/chosen": -0.7939937710762024, "logits/rejected": -0.8293649554252625, "logps/chosen": -97.2886962890625, "logps/rejected": -145.84341430664062, "loss": 2.4728, "rewards/accuracies": 0.0, "rewards/chosen": 0.2673545777797699, "rewards/margins": -4.311986923217773, "rewards/rejected": 4.579341411590576, "step": 4729 }, { "epoch": 0.77, "learning_rate": 9.068439554282923e-07, "logits/chosen": -0.8071238994598389, "logits/rejected": -0.8129372596740723, "logps/chosen": -175.065673828125, "logps/rejected": -164.50567626953125, "loss": 0.4617, "rewards/accuracies": 0.0, "rewards/chosen": 4.0832366943359375, "rewards/margins": -0.15366506576538086, "rewards/rejected": 4.236901760101318, "step": 4730 }, { "epoch": 0.77, "learning_rate": 9.067675435329524e-07, "logits/chosen": -0.8809047937393188, "logits/rejected": -0.6391706466674805, "logps/chosen": -170.68841552734375, "logps/rejected": -53.231353759765625, "loss": 0.1062, "rewards/accuracies": 1.0, "rewards/chosen": 4.755810737609863, "rewards/margins": 3.733757972717285, "rewards/rejected": 1.0220527648925781, "step": 4731 }, { "epoch": 0.77, "learning_rate": 9.066911035338242e-07, "logits/chosen": -0.8220865726470947, "logits/rejected": -0.7143179774284363, "logps/chosen": -73.47904968261719, "logps/rejected": -70.03392791748047, "loss": 0.0626, "rewards/accuracies": 1.0, "rewards/chosen": 4.444904327392578, "rewards/margins": 2.2868804931640625, "rewards/rejected": 2.1580238342285156, "step": 4732 }, { "epoch": 0.77, "learning_rate": 9.066146354361888e-07, "logits/chosen": -0.7231014966964722, "logits/rejected": -0.504599392414093, "logps/chosen": -106.87263488769531, "logps/rejected": -73.06713104248047, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 3.0614471435546875, "rewards/margins": 1.682396650314331, "rewards/rejected": 1.3790504932403564, "step": 4733 }, { "epoch": 0.77, "learning_rate": 9.065381392453295e-07, "logits/chosen": -0.9224610924720764, "logits/rejected": -0.8970758318901062, "logps/chosen": -202.1165008544922, "logps/rejected": -27.896907806396484, "loss": 0.0371, "rewards/accuracies": 1.0, "rewards/chosen": 5.100773811340332, "rewards/margins": 2.634533166885376, "rewards/rejected": 2.466240644454956, "step": 4734 }, { "epoch": 0.77, "learning_rate": 9.064616149665314e-07, "logits/chosen": -0.710368812084198, "logits/rejected": -0.6344711184501648, "logps/chosen": -151.08944702148438, "logps/rejected": -62.18003845214844, "loss": 0.5384, "rewards/accuracies": 0.0, "rewards/chosen": 1.12860107421875, "rewards/margins": -0.3560851812362671, "rewards/rejected": 1.484686255455017, "step": 4735 }, { "epoch": 0.77, "learning_rate": 9.063850626050818e-07, "logits/chosen": -0.7584587335586548, "logits/rejected": -0.7619423866271973, "logps/chosen": -86.77648162841797, "logps/rejected": -52.91487503051758, "loss": 0.9756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8260429501533508, "rewards/margins": 0.3761848509311676, "rewards/rejected": 0.4498580992221832, "step": 4736 }, { "epoch": 0.77, "learning_rate": 9.063084821662697e-07, "logits/chosen": -0.7545908689498901, "logits/rejected": -0.6866264939308167, "logps/chosen": -48.84430694580078, "logps/rejected": -82.0135726928711, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 2.2751471996307373, "rewards/margins": 0.6695173978805542, "rewards/rejected": 1.605629801750183, "step": 4737 }, { "epoch": 0.77, "learning_rate": 9.062318736553858e-07, "logits/chosen": -0.22154679894447327, "logits/rejected": -0.17342247068881989, "logps/chosen": -68.56908416748047, "logps/rejected": -48.961158752441406, "loss": 0.2207, "rewards/accuracies": 1.0, "rewards/chosen": 1.8528168201446533, "rewards/margins": 0.7993221282958984, "rewards/rejected": 1.0534946918487549, "step": 4738 }, { "epoch": 0.77, "learning_rate": 9.061552370777235e-07, "logits/chosen": -1.077582836151123, "logits/rejected": -0.643615186214447, "logps/chosen": -166.2394561767578, "logps/rejected": -89.13243865966797, "loss": 0.1325, "rewards/accuracies": 1.0, "rewards/chosen": 4.102043151855469, "rewards/margins": 1.208833932876587, "rewards/rejected": 2.893209218978882, "step": 4739 }, { "epoch": 0.77, "learning_rate": 9.060785724385771e-07, "logits/chosen": -0.5784476399421692, "logits/rejected": -0.6195259690284729, "logps/chosen": -52.289955139160156, "logps/rejected": -50.82875061035156, "loss": 1.4035, "rewards/accuracies": 0.0, "rewards/chosen": 0.3041320741176605, "rewards/margins": -1.4711906909942627, "rewards/rejected": 1.7753227949142456, "step": 4740 }, { "epoch": 0.77, "learning_rate": 9.060018797432438e-07, "logits/chosen": -0.5290610790252686, "logits/rejected": -0.5118591785430908, "logps/chosen": -53.134735107421875, "logps/rejected": -22.512096405029297, "loss": 0.587, "rewards/accuracies": 1.0, "rewards/chosen": 0.5047497153282166, "rewards/margins": 0.03160497546195984, "rewards/rejected": 0.4731447398662567, "step": 4741 }, { "epoch": 0.77, "learning_rate": 9.059251589970222e-07, "logits/chosen": -0.367081880569458, "logits/rejected": -0.2770191729068756, "logps/chosen": -82.68876647949219, "logps/rejected": -57.15985107421875, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 1.5312073230743408, "rewards/margins": 0.8529686331748962, "rewards/rejected": 0.6782386898994446, "step": 4742 }, { "epoch": 0.77, "learning_rate": 9.058484102052131e-07, "logits/chosen": -0.9836173057556152, "logits/rejected": -1.0198709964752197, "logps/chosen": -186.52099609375, "logps/rejected": -138.05368041992188, "loss": 0.8438, "rewards/accuracies": 1.0, "rewards/chosen": 3.54221510887146, "rewards/margins": 2.328901767730713, "rewards/rejected": 1.213313341140747, "step": 4743 }, { "epoch": 0.77, "learning_rate": 9.057716333731192e-07, "logits/chosen": -0.7483422756195068, "logits/rejected": -0.7382368445396423, "logps/chosen": -209.23611450195312, "logps/rejected": -62.70130157470703, "loss": 0.8357, "rewards/accuracies": 1.0, "rewards/chosen": 5.838116645812988, "rewards/margins": 3.0198495388031006, "rewards/rejected": 2.8182671070098877, "step": 4744 }, { "epoch": 0.77, "learning_rate": 9.056948285060446e-07, "logits/chosen": -1.0546139478683472, "logits/rejected": -0.7651075720787048, "logps/chosen": -159.19078063964844, "logps/rejected": -28.365371704101562, "loss": 0.0463, "rewards/accuracies": 1.0, "rewards/chosen": 5.1078386306762695, "rewards/margins": 4.5906548500061035, "rewards/rejected": 0.5171837210655212, "step": 4745 }, { "epoch": 0.77, "learning_rate": 9.056179956092961e-07, "logits/chosen": -0.7588016986846924, "logits/rejected": -0.5909772515296936, "logps/chosen": -174.29367065429688, "logps/rejected": -53.474761962890625, "loss": 0.3593, "rewards/accuracies": 1.0, "rewards/chosen": 5.511059761047363, "rewards/margins": 4.234586715698242, "rewards/rejected": 1.2764732837677002, "step": 4746 }, { "epoch": 0.77, "learning_rate": 9.055411346881822e-07, "logits/chosen": -0.5249190926551819, "logits/rejected": -0.5249190926551819, "logps/chosen": -71.91830444335938, "logps/rejected": -71.91830444335938, "loss": 0.4946, "rewards/accuracies": 0.0, "rewards/chosen": 1.4063538312911987, "rewards/margins": 0.0, "rewards/rejected": 1.4063538312911987, "step": 4747 }, { "epoch": 0.77, "learning_rate": 9.05464245748013e-07, "logits/chosen": -0.7920733690261841, "logits/rejected": -0.6800966858863831, "logps/chosen": -157.28448486328125, "logps/rejected": -20.522357940673828, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 5.433221340179443, "rewards/margins": 4.868706226348877, "rewards/rejected": 0.5645151138305664, "step": 4748 }, { "epoch": 0.77, "learning_rate": 9.053873287941011e-07, "logits/chosen": -0.8701030611991882, "logits/rejected": -0.7280883193016052, "logps/chosen": -90.69734191894531, "logps/rejected": -62.81769561767578, "loss": 0.3308, "rewards/accuracies": 1.0, "rewards/chosen": 2.803331136703491, "rewards/margins": 0.9098130464553833, "rewards/rejected": 1.893518090248108, "step": 4749 }, { "epoch": 0.77, "learning_rate": 9.053103838317606e-07, "logits/chosen": -0.45967215299606323, "logits/rejected": -0.591838002204895, "logps/chosen": -112.8056411743164, "logps/rejected": -99.14291381835938, "loss": 1.6874, "rewards/accuracies": 0.0, "rewards/chosen": 1.0770469903945923, "rewards/margins": -3.330817699432373, "rewards/rejected": 4.407864570617676, "step": 4750 }, { "epoch": 0.77, "learning_rate": 9.052334108663076e-07, "logits/chosen": -0.8544053435325623, "logits/rejected": -0.8101393580436707, "logps/chosen": -62.276893615722656, "logps/rejected": -83.28593444824219, "loss": 0.3568, "rewards/accuracies": 0.0, "rewards/chosen": 1.8274520635604858, "rewards/margins": -0.00919961929321289, "rewards/rejected": 1.8366516828536987, "step": 4751 }, { "epoch": 0.77, "learning_rate": 9.051564099030603e-07, "logits/chosen": -0.6144696474075317, "logits/rejected": -0.6137675046920776, "logps/chosen": -2.983340263366699, "logps/rejected": -1.9590046405792236, "loss": 0.6709, "rewards/accuracies": 0.0, "rewards/chosen": 0.4661714732646942, "rewards/margins": -0.14068928360939026, "rewards/rejected": 0.6068607568740845, "step": 4752 }, { "epoch": 0.77, "learning_rate": 9.050793809473387e-07, "logits/chosen": -0.08975386619567871, "logits/rejected": -0.09187798202037811, "logps/chosen": -3.5748140811920166, "logps/rejected": -1.5915093421936035, "loss": 0.6248, "rewards/accuracies": 0.0, "rewards/chosen": 0.2991364300251007, "rewards/margins": -0.02509063482284546, "rewards/rejected": 0.32422706484794617, "step": 4753 }, { "epoch": 0.77, "learning_rate": 9.050023240044648e-07, "logits/chosen": -0.6586208343505859, "logits/rejected": -0.696503221988678, "logps/chosen": -73.65983581542969, "logps/rejected": -81.64321899414062, "loss": 0.8349, "rewards/accuracies": 0.0, "rewards/chosen": 1.6426972150802612, "rewards/margins": -0.9434119462966919, "rewards/rejected": 2.586109161376953, "step": 4754 }, { "epoch": 0.77, "learning_rate": 9.049252390797623e-07, "logits/chosen": -0.6846190094947815, "logits/rejected": -0.6892214417457581, "logps/chosen": -189.0386962890625, "logps/rejected": -174.17030334472656, "loss": 1.0549, "rewards/accuracies": 0.0, "rewards/chosen": 5.1231231689453125, "rewards/margins": -1.1768784523010254, "rewards/rejected": 6.300001621246338, "step": 4755 }, { "epoch": 0.77, "learning_rate": 9.048481261785574e-07, "logits/chosen": -0.3545285761356354, "logits/rejected": -0.3545285761356354, "logps/chosen": -0.7368467450141907, "logps/rejected": -0.7368467450141907, "loss": 0.6775, "rewards/accuracies": 0.0, "rewards/chosen": 0.220004603266716, "rewards/margins": 0.0, "rewards/rejected": 0.220004603266716, "step": 4756 }, { "epoch": 0.77, "learning_rate": 9.047709853061776e-07, "logits/chosen": -0.4237384498119354, "logits/rejected": -0.42214271426200867, "logps/chosen": -271.3101806640625, "logps/rejected": -59.92857360839844, "loss": 1.3712, "rewards/accuracies": 1.0, "rewards/chosen": 4.192413330078125, "rewards/margins": 1.7026252746582031, "rewards/rejected": 2.489788055419922, "step": 4757 }, { "epoch": 0.77, "learning_rate": 9.046938164679529e-07, "logits/chosen": -0.6952183246612549, "logits/rejected": -0.6952183246612549, "logps/chosen": -78.44863891601562, "logps/rejected": -78.44863891601562, "loss": 0.4399, "rewards/accuracies": 0.0, "rewards/chosen": 2.4157555103302, "rewards/margins": 0.0, "rewards/rejected": 2.4157555103302, "step": 4758 }, { "epoch": 0.77, "learning_rate": 9.046166196692144e-07, "logits/chosen": -0.8427486419677734, "logits/rejected": -0.6316574215888977, "logps/chosen": -120.49848937988281, "logps/rejected": -68.23685455322266, "loss": 0.5214, "rewards/accuracies": 1.0, "rewards/chosen": 8.22275447845459, "rewards/margins": 6.371894836425781, "rewards/rejected": 1.8508598804473877, "step": 4759 }, { "epoch": 0.77, "learning_rate": 9.045393949152962e-07, "logits/chosen": -0.3427710235118866, "logits/rejected": -0.3427710235118866, "logps/chosen": -24.9522705078125, "logps/rejected": -24.9522705078125, "loss": 0.9109, "rewards/accuracies": 0.0, "rewards/chosen": 0.5323654413223267, "rewards/margins": 0.0, "rewards/rejected": 0.5323654413223267, "step": 4760 }, { "epoch": 0.77, "learning_rate": 9.044621422115336e-07, "logits/chosen": -0.9903049468994141, "logits/rejected": -1.2226731777191162, "logps/chosen": -181.62176513671875, "logps/rejected": -35.82383346557617, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 0.653961181640625, "rewards/margins": 0.4050517976284027, "rewards/rejected": 0.2489093840122223, "step": 4761 }, { "epoch": 0.77, "learning_rate": 9.043848615632641e-07, "logits/chosen": -0.6417644619941711, "logits/rejected": -0.6493350267410278, "logps/chosen": -322.5718078613281, "logps/rejected": -114.09136962890625, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 5.846380710601807, "rewards/margins": 3.9769835472106934, "rewards/rejected": 1.8693970441818237, "step": 4762 }, { "epoch": 0.77, "learning_rate": 9.043075529758268e-07, "logits/chosen": -0.23025760054588318, "logits/rejected": -0.23025760054588318, "logps/chosen": -111.21783447265625, "logps/rejected": -111.21783447265625, "loss": 0.4883, "rewards/accuracies": 0.0, "rewards/chosen": 1.1231437921524048, "rewards/margins": 0.0, "rewards/rejected": 1.1231437921524048, "step": 4763 }, { "epoch": 0.77, "learning_rate": 9.042302164545632e-07, "logits/chosen": -0.6125547885894775, "logits/rejected": -0.48949798941612244, "logps/chosen": -71.73990631103516, "logps/rejected": -20.408733367919922, "loss": 0.4645, "rewards/accuracies": 0.0, "rewards/chosen": 0.8552314639091492, "rewards/margins": -0.03349095582962036, "rewards/rejected": 0.8887224197387695, "step": 4764 }, { "epoch": 0.77, "learning_rate": 9.041528520048167e-07, "logits/chosen": -1.052648663520813, "logits/rejected": -0.9721935391426086, "logps/chosen": -75.290771484375, "logps/rejected": -97.28509521484375, "loss": 0.1976, "rewards/accuracies": 1.0, "rewards/chosen": 3.89103102684021, "rewards/margins": 0.8064453601837158, "rewards/rejected": 3.084585666656494, "step": 4765 }, { "epoch": 0.77, "learning_rate": 9.040754596319321e-07, "logits/chosen": -0.6423755288124084, "logits/rejected": -0.7211891412734985, "logps/chosen": -210.1495361328125, "logps/rejected": -47.57905578613281, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": 5.362701416015625, "rewards/margins": 3.7252907752990723, "rewards/rejected": 1.6374107599258423, "step": 4766 }, { "epoch": 0.77, "learning_rate": 9.039980393412566e-07, "logits/chosen": -0.7633602619171143, "logits/rejected": -0.6565254926681519, "logps/chosen": -149.8549041748047, "logps/rejected": -93.53218841552734, "loss": 0.1606, "rewards/accuracies": 1.0, "rewards/chosen": 4.778146266937256, "rewards/margins": 1.0809850692749023, "rewards/rejected": 3.6971611976623535, "step": 4767 }, { "epoch": 0.77, "learning_rate": 9.039205911381393e-07, "logits/chosen": -0.4277181327342987, "logits/rejected": -0.35151612758636475, "logps/chosen": -54.75312805175781, "logps/rejected": -71.94552612304688, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": 1.5161141157150269, "rewards/margins": 0.8122246265411377, "rewards/rejected": 0.7038894891738892, "step": 4768 }, { "epoch": 0.77, "learning_rate": 9.038431150279311e-07, "logits/chosen": -0.4159071445465088, "logits/rejected": -0.41375648975372314, "logps/chosen": -2.184823989868164, "logps/rejected": -1.490110158920288, "loss": 0.3967, "rewards/accuracies": 0.0, "rewards/chosen": 0.19707070291042328, "rewards/margins": -0.12431557476520538, "rewards/rejected": 0.32138627767562866, "step": 4769 }, { "epoch": 0.77, "learning_rate": 9.037656110159849e-07, "logits/chosen": -0.5066038966178894, "logits/rejected": -0.4638592302799225, "logps/chosen": -59.472816467285156, "logps/rejected": -77.26866149902344, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 1.8028709888458252, "rewards/margins": 1.065331220626831, "rewards/rejected": 0.7375397086143494, "step": 4770 }, { "epoch": 0.77, "learning_rate": 9.036880791076554e-07, "logits/chosen": -0.5853483080863953, "logits/rejected": -0.39099258184432983, "logps/chosen": -133.04806518554688, "logps/rejected": -47.54022216796875, "loss": 0.1158, "rewards/accuracies": 1.0, "rewards/chosen": 4.470855712890625, "rewards/margins": 2.063483476638794, "rewards/rejected": 2.407372236251831, "step": 4771 }, { "epoch": 0.77, "learning_rate": 9.036105193082993e-07, "logits/chosen": -0.5951629281044006, "logits/rejected": -0.5552560687065125, "logps/chosen": -82.06269836425781, "logps/rejected": -96.38934326171875, "loss": 1.2765, "rewards/accuracies": 1.0, "rewards/chosen": 3.650266408920288, "rewards/margins": 0.24314045906066895, "rewards/rejected": 3.407125949859619, "step": 4772 }, { "epoch": 0.77, "learning_rate": 9.035329316232754e-07, "logits/chosen": -0.8507804274559021, "logits/rejected": -0.9138723611831665, "logps/chosen": -134.33885192871094, "logps/rejected": -196.66851806640625, "loss": 1.551, "rewards/accuracies": 0.0, "rewards/chosen": 0.26148682832717896, "rewards/margins": -1.3333587646484375, "rewards/rejected": 1.5948456525802612, "step": 4773 }, { "epoch": 0.77, "learning_rate": 9.034553160579443e-07, "logits/chosen": -0.7584452629089355, "logits/rejected": -0.769621729850769, "logps/chosen": -39.20569610595703, "logps/rejected": -28.388153076171875, "loss": 0.4158, "rewards/accuracies": 0.0, "rewards/chosen": 0.7374168634414673, "rewards/margins": -0.018511950969696045, "rewards/rejected": 0.7559288144111633, "step": 4774 }, { "epoch": 0.78, "learning_rate": 9.033776726176681e-07, "logits/chosen": -0.6551582217216492, "logits/rejected": -0.5247194766998291, "logps/chosen": -123.50651550292969, "logps/rejected": -135.02926635742188, "loss": 0.5735, "rewards/accuracies": 0.0, "rewards/chosen": 4.91811990737915, "rewards/margins": -0.7446136474609375, "rewards/rejected": 5.662733554840088, "step": 4775 }, { "epoch": 0.78, "learning_rate": 9.033000013078117e-07, "logits/chosen": -0.4309227764606476, "logits/rejected": -0.42987585067749023, "logps/chosen": -95.94888305664062, "logps/rejected": -83.680908203125, "loss": 1.0072, "rewards/accuracies": 0.0, "rewards/chosen": 1.9405968189239502, "rewards/margins": -1.0310027599334717, "rewards/rejected": 2.971599578857422, "step": 4776 }, { "epoch": 0.78, "learning_rate": 9.032223021337413e-07, "logits/chosen": -0.6225769519805908, "logits/rejected": -0.6225769519805908, "logps/chosen": -100.68724060058594, "logps/rejected": -100.68724060058594, "loss": 0.3569, "rewards/accuracies": 0.0, "rewards/chosen": 1.3197128772735596, "rewards/margins": 0.0, "rewards/rejected": 1.3197128772735596, "step": 4777 }, { "epoch": 0.78, "learning_rate": 9.031445751008251e-07, "logits/chosen": -0.8397200107574463, "logits/rejected": -0.8568306565284729, "logps/chosen": -269.8393859863281, "logps/rejected": -185.87899780273438, "loss": 2.2042, "rewards/accuracies": 0.0, "rewards/chosen": 3.7987427711486816, "rewards/margins": -3.7267885208129883, "rewards/rejected": 7.52553129196167, "step": 4778 }, { "epoch": 0.78, "learning_rate": 9.030668202144333e-07, "logits/chosen": -0.7220713496208191, "logits/rejected": -0.7339694499969482, "logps/chosen": -35.60781478881836, "logps/rejected": -79.21421813964844, "loss": 1.1258, "rewards/accuracies": 0.0, "rewards/chosen": 0.7257766723632812, "rewards/margins": -1.3220443725585938, "rewards/rejected": 2.047821044921875, "step": 4779 }, { "epoch": 0.78, "learning_rate": 9.029890374799381e-07, "logits/chosen": -0.4274630546569824, "logits/rejected": -0.4861937463283539, "logps/chosen": -79.15450286865234, "logps/rejected": -45.25416564941406, "loss": 0.9067, "rewards/accuracies": 0.0, "rewards/chosen": 1.3614647388458252, "rewards/margins": -0.41954147815704346, "rewards/rejected": 1.7810062170028687, "step": 4780 }, { "epoch": 0.78, "learning_rate": 9.029112269027135e-07, "logits/chosen": -0.3865615129470825, "logits/rejected": -0.4064841866493225, "logps/chosen": -44.29297637939453, "logps/rejected": -63.63185119628906, "loss": 0.3568, "rewards/accuracies": 1.0, "rewards/chosen": 1.6854034662246704, "rewards/margins": 0.21625518798828125, "rewards/rejected": 1.4691482782363892, "step": 4781 }, { "epoch": 0.78, "learning_rate": 9.028333884881356e-07, "logits/chosen": -0.6246543526649475, "logits/rejected": -0.5779533982276917, "logps/chosen": -173.00686645507812, "logps/rejected": -62.89080810546875, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": 3.8129518032073975, "rewards/margins": 2.124509572982788, "rewards/rejected": 1.6884422302246094, "step": 4782 }, { "epoch": 0.78, "learning_rate": 9.027555222415821e-07, "logits/chosen": -0.72002112865448, "logits/rejected": -0.7131950855255127, "logps/chosen": -81.84219360351562, "logps/rejected": -139.86203002929688, "loss": 0.5491, "rewards/accuracies": 0.0, "rewards/chosen": 1.1528472900390625, "rewards/margins": -0.32498013973236084, "rewards/rejected": 1.4778274297714233, "step": 4783 }, { "epoch": 0.78, "learning_rate": 9.026776281684329e-07, "logits/chosen": -0.7292346954345703, "logits/rejected": -0.7683294415473938, "logps/chosen": -153.83041381835938, "logps/rejected": -150.19125366210938, "loss": 2.6451, "rewards/accuracies": 0.0, "rewards/chosen": 4.592039585113525, "rewards/margins": -2.697293281555176, "rewards/rejected": 7.289332866668701, "step": 4784 }, { "epoch": 0.78, "learning_rate": 9.0259970627407e-07, "logits/chosen": -0.4846780598163605, "logits/rejected": -0.5379524230957031, "logps/chosen": -76.91432189941406, "logps/rejected": -102.88771057128906, "loss": 2.3585, "rewards/accuracies": 0.0, "rewards/chosen": 2.0880258083343506, "rewards/margins": -1.8312749862670898, "rewards/rejected": 3.9193007946014404, "step": 4785 }, { "epoch": 0.78, "learning_rate": 9.025217565638765e-07, "logits/chosen": -0.21147042512893677, "logits/rejected": -0.21147042512893677, "logps/chosen": -76.50038146972656, "logps/rejected": -76.50038146972656, "loss": 0.4387, "rewards/accuracies": 0.0, "rewards/chosen": 0.5013580322265625, "rewards/margins": 0.0, "rewards/rejected": 0.5013580322265625, "step": 4786 }, { "epoch": 0.78, "learning_rate": 9.024437790432386e-07, "logits/chosen": -0.60841304063797, "logits/rejected": -0.6213468313217163, "logps/chosen": -101.01612854003906, "logps/rejected": -87.73149108886719, "loss": 0.8634, "rewards/accuracies": 0.0, "rewards/chosen": 0.07513580471277237, "rewards/margins": -1.114924669265747, "rewards/rejected": 1.1900604963302612, "step": 4787 }, { "epoch": 0.78, "learning_rate": 9.023657737175434e-07, "logits/chosen": -0.49580734968185425, "logits/rejected": -0.43081942200660706, "logps/chosen": -54.604248046875, "logps/rejected": -53.1710090637207, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 1.9144363403320312, "rewards/margins": 0.4051342010498047, "rewards/rejected": 1.5093021392822266, "step": 4788 }, { "epoch": 0.78, "learning_rate": 9.022877405921803e-07, "logits/chosen": -0.6175450682640076, "logits/rejected": -0.6240546107292175, "logps/chosen": -40.048362731933594, "logps/rejected": -77.99044799804688, "loss": 0.7076, "rewards/accuracies": 1.0, "rewards/chosen": 1.7498314380645752, "rewards/margins": 0.6122490167617798, "rewards/rejected": 1.1375824213027954, "step": 4789 }, { "epoch": 0.78, "learning_rate": 9.022096796725412e-07, "logits/chosen": -0.714648962020874, "logits/rejected": -0.714648962020874, "logps/chosen": -38.53296661376953, "logps/rejected": -38.53296661376953, "loss": 1.0561, "rewards/accuracies": 0.0, "rewards/chosen": 0.8071083426475525, "rewards/margins": 0.0, "rewards/rejected": 0.8071083426475525, "step": 4790 }, { "epoch": 0.78, "learning_rate": 9.021315909640186e-07, "logits/chosen": -0.4630469083786011, "logits/rejected": -0.4224558472633362, "logps/chosen": -75.35570526123047, "logps/rejected": -82.85356140136719, "loss": 0.2679, "rewards/accuracies": 1.0, "rewards/chosen": 2.421985626220703, "rewards/margins": 1.9274284839630127, "rewards/rejected": 0.4945572018623352, "step": 4791 }, { "epoch": 0.78, "learning_rate": 9.020534744720083e-07, "logits/chosen": -1.2301981449127197, "logits/rejected": -1.2620080709457397, "logps/chosen": -137.73226928710938, "logps/rejected": -119.70438385009766, "loss": 1.592, "rewards/accuracies": 0.0, "rewards/chosen": 5.650323390960693, "rewards/margins": -0.7235894203186035, "rewards/rejected": 6.373912811279297, "step": 4792 }, { "epoch": 0.78, "learning_rate": 9.019753302019071e-07, "logits/chosen": 0.05426894128322601, "logits/rejected": 0.0646333247423172, "logps/chosen": -16.26698112487793, "logps/rejected": -53.68580627441406, "loss": 0.6532, "rewards/accuracies": 1.0, "rewards/chosen": 0.24631749093532562, "rewards/margins": 0.06558781862258911, "rewards/rejected": 0.1807296723127365, "step": 4793 }, { "epoch": 0.78, "learning_rate": 9.01897158159114e-07, "logits/chosen": -0.6699159741401672, "logits/rejected": -0.6416304111480713, "logps/chosen": -51.567039489746094, "logps/rejected": -86.49942016601562, "loss": 1.5138, "rewards/accuracies": 0.0, "rewards/chosen": 1.6887565851211548, "rewards/margins": -1.262386441230774, "rewards/rejected": 2.9511430263519287, "step": 4794 }, { "epoch": 0.78, "learning_rate": 9.018189583490302e-07, "logits/chosen": -1.0109484195709229, "logits/rejected": -0.7950051426887512, "logps/chosen": -159.05630493164062, "logps/rejected": -80.91832733154297, "loss": 0.0119, "rewards/accuracies": 1.0, "rewards/chosen": 6.016882419586182, "rewards/margins": 3.9006125926971436, "rewards/rejected": 2.116269826889038, "step": 4795 }, { "epoch": 0.78, "learning_rate": 9.017407307770584e-07, "logits/chosen": -0.3596566319465637, "logits/rejected": -0.5210641026496887, "logps/chosen": -80.51371002197266, "logps/rejected": -132.78036499023438, "loss": 2.4568, "rewards/accuracies": 0.0, "rewards/chosen": 2.412451982498169, "rewards/margins": -1.0829262733459473, "rewards/rejected": 3.495378255844116, "step": 4796 }, { "epoch": 0.78, "learning_rate": 9.016624754486033e-07, "logits/chosen": -0.8773308992385864, "logits/rejected": -0.7864990234375, "logps/chosen": -76.20072174072266, "logps/rejected": -112.65143585205078, "loss": 1.0729, "rewards/accuracies": 1.0, "rewards/chosen": 1.3358993530273438, "rewards/margins": 0.28675687313079834, "rewards/rejected": 1.0491424798965454, "step": 4797 }, { "epoch": 0.78, "learning_rate": 9.015841923690719e-07, "logits/chosen": -0.18928146362304688, "logits/rejected": -0.15929169952869415, "logps/chosen": -81.35151672363281, "logps/rejected": -88.57260131835938, "loss": 0.8378, "rewards/accuracies": 0.0, "rewards/chosen": 0.9551056027412415, "rewards/margins": -1.1959176063537598, "rewards/rejected": 2.1510231494903564, "step": 4798 }, { "epoch": 0.78, "learning_rate": 9.015058815438726e-07, "logits/chosen": -0.7062202095985413, "logits/rejected": -0.5641019940376282, "logps/chosen": -49.57054901123047, "logps/rejected": -29.60396957397461, "loss": 1.9539, "rewards/accuracies": 1.0, "rewards/chosen": 1.8777366876602173, "rewards/margins": 0.3352038860321045, "rewards/rejected": 1.5425328016281128, "step": 4799 }, { "epoch": 0.78, "learning_rate": 9.014275429784158e-07, "logits/chosen": -0.6201625466346741, "logits/rejected": -0.6414812207221985, "logps/chosen": -96.36109161376953, "logps/rejected": -113.0182113647461, "loss": 2.7328, "rewards/accuracies": 0.0, "rewards/chosen": 2.2141730785369873, "rewards/margins": -2.199859857559204, "rewards/rejected": 4.414032936096191, "step": 4800 }, { "epoch": 0.78, "learning_rate": 9.013491766781143e-07, "logits/chosen": -0.5810818076133728, "logits/rejected": -0.6251115202903748, "logps/chosen": -19.92144203186035, "logps/rejected": -45.365020751953125, "loss": 1.4893, "rewards/accuracies": 0.0, "rewards/chosen": 0.886256992816925, "rewards/margins": -0.7453580498695374, "rewards/rejected": 1.6316150426864624, "step": 4801 }, { "epoch": 0.78, "learning_rate": 9.012707826483822e-07, "logits/chosen": -0.676922082901001, "logits/rejected": -0.6303808689117432, "logps/chosen": -29.119524002075195, "logps/rejected": -121.78414916992188, "loss": 0.4064, "rewards/accuracies": 0.0, "rewards/chosen": 2.533538341522217, "rewards/margins": -0.21644330024719238, "rewards/rejected": 2.749981641769409, "step": 4802 }, { "epoch": 0.78, "learning_rate": 9.01192360894636e-07, "logits/chosen": -0.9755271673202515, "logits/rejected": -1.0127896070480347, "logps/chosen": -150.53948974609375, "logps/rejected": -52.24024963378906, "loss": 0.7674, "rewards/accuracies": 1.0, "rewards/chosen": 3.355185031890869, "rewards/margins": 1.7764602899551392, "rewards/rejected": 1.57872474193573, "step": 4803 }, { "epoch": 0.78, "learning_rate": 9.011139114222937e-07, "logits/chosen": -0.3916281759738922, "logits/rejected": -0.3916281759738922, "logps/chosen": -173.14688110351562, "logps/rejected": -173.14688110351562, "loss": 0.5816, "rewards/accuracies": 0.0, "rewards/chosen": 4.004666328430176, "rewards/margins": 0.0, "rewards/rejected": 4.004666328430176, "step": 4804 }, { "epoch": 0.78, "learning_rate": 9.010354342367754e-07, "logits/chosen": -0.41720137000083923, "logits/rejected": -0.41720137000083923, "logps/chosen": -55.51385498046875, "logps/rejected": -55.51385498046875, "loss": 0.3525, "rewards/accuracies": 0.0, "rewards/chosen": 2.166410207748413, "rewards/margins": 0.0, "rewards/rejected": 2.166410207748413, "step": 4805 }, { "epoch": 0.78, "learning_rate": 9.009569293435033e-07, "logits/chosen": -0.5145612955093384, "logits/rejected": -0.45848265290260315, "logps/chosen": -113.02334594726562, "logps/rejected": -76.8797607421875, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": 3.077012777328491, "rewards/margins": 0.7860612869262695, "rewards/rejected": 2.2909514904022217, "step": 4806 }, { "epoch": 0.78, "learning_rate": 9.008783967479012e-07, "logits/chosen": -0.42101341485977173, "logits/rejected": -0.594894289970398, "logps/chosen": -101.93270874023438, "logps/rejected": -114.78009033203125, "loss": 2.0232, "rewards/accuracies": 0.0, "rewards/chosen": 0.8764411807060242, "rewards/margins": -3.919597864151001, "rewards/rejected": 4.79603910446167, "step": 4807 }, { "epoch": 0.78, "learning_rate": 9.00799836455395e-07, "logits/chosen": -0.7079862952232361, "logits/rejected": -0.6941627860069275, "logps/chosen": -230.82211303710938, "logps/rejected": -161.31326293945312, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 2.8298797607421875, "rewards/margins": 2.2064666748046875, "rewards/rejected": 0.6234130859375, "step": 4808 }, { "epoch": 0.78, "learning_rate": 9.007212484714127e-07, "logits/chosen": -0.44533059000968933, "logits/rejected": -0.44533059000968933, "logps/chosen": -49.90730285644531, "logps/rejected": -49.90730285644531, "loss": 0.4408, "rewards/accuracies": 0.0, "rewards/chosen": 1.4412720203399658, "rewards/margins": 0.0, "rewards/rejected": 1.4412720203399658, "step": 4809 }, { "epoch": 0.78, "learning_rate": 9.006426328013837e-07, "logits/chosen": -0.5352964997291565, "logits/rejected": -0.5228169560432434, "logps/chosen": -85.60563659667969, "logps/rejected": -92.95695495605469, "loss": 1.3808, "rewards/accuracies": 0.0, "rewards/chosen": 0.6677482724189758, "rewards/margins": -0.8681251406669617, "rewards/rejected": 1.5358734130859375, "step": 4810 }, { "epoch": 0.78, "learning_rate": 9.005639894507397e-07, "logits/chosen": -0.9349247813224792, "logits/rejected": -0.8620664477348328, "logps/chosen": -132.24082946777344, "logps/rejected": -64.58554077148438, "loss": 0.3905, "rewards/accuracies": 1.0, "rewards/chosen": 2.5147902965545654, "rewards/margins": 0.9312865734100342, "rewards/rejected": 1.5835037231445312, "step": 4811 }, { "epoch": 0.78, "learning_rate": 9.004853184249142e-07, "logits/chosen": -0.9696722030639648, "logits/rejected": -0.8890355825424194, "logps/chosen": -120.07960510253906, "logps/rejected": -67.42230224609375, "loss": 0.6996, "rewards/accuracies": 0.0, "rewards/chosen": 0.5854110717773438, "rewards/margins": -1.0563201904296875, "rewards/rejected": 1.6417312622070312, "step": 4812 }, { "epoch": 0.78, "learning_rate": 9.004066197293428e-07, "logits/chosen": -0.554536759853363, "logits/rejected": -0.46543237566947937, "logps/chosen": -56.06629180908203, "logps/rejected": -51.16636657714844, "loss": 0.2022, "rewards/accuracies": 1.0, "rewards/chosen": 1.7996826171875, "rewards/margins": 1.0245051383972168, "rewards/rejected": 0.7751774191856384, "step": 4813 }, { "epoch": 0.78, "learning_rate": 9.003278933694624e-07, "logits/chosen": -0.32236170768737793, "logits/rejected": -0.28089645504951477, "logps/chosen": -34.875160217285156, "logps/rejected": -70.9748764038086, "loss": 0.4782, "rewards/accuracies": 0.0, "rewards/chosen": 1.177668809890747, "rewards/margins": -0.4473060369491577, "rewards/rejected": 1.6249748468399048, "step": 4814 }, { "epoch": 0.78, "learning_rate": 9.002491393507125e-07, "logits/chosen": -0.9496642351150513, "logits/rejected": -0.8644022941589355, "logps/chosen": -150.69821166992188, "logps/rejected": -29.424423217773438, "loss": 0.1262, "rewards/accuracies": 1.0, "rewards/chosen": 1.2887115478515625, "rewards/margins": 1.2726690769195557, "rewards/rejected": 0.016042519360780716, "step": 4815 }, { "epoch": 0.78, "learning_rate": 9.001703576785343e-07, "logits/chosen": -0.8186495304107666, "logits/rejected": -0.7282306551933289, "logps/chosen": -81.03138732910156, "logps/rejected": -92.80557250976562, "loss": 0.254, "rewards/accuracies": 1.0, "rewards/chosen": 2.777230978012085, "rewards/margins": 0.5871286392211914, "rewards/rejected": 2.1901023387908936, "step": 4816 }, { "epoch": 0.78, "learning_rate": 9.000915483583709e-07, "logits/chosen": -0.9232960343360901, "logits/rejected": -0.8504524827003479, "logps/chosen": -117.46766662597656, "logps/rejected": -84.4947509765625, "loss": 1.1356, "rewards/accuracies": 0.0, "rewards/chosen": 0.4099632203578949, "rewards/margins": -1.8464072942733765, "rewards/rejected": 2.2563705444335938, "step": 4817 }, { "epoch": 0.78, "learning_rate": 9.000127113956672e-07, "logits/chosen": -0.6645381450653076, "logits/rejected": -0.6407000422477722, "logps/chosen": -63.444053649902344, "logps/rejected": -107.78805541992188, "loss": 0.0877, "rewards/accuracies": 1.0, "rewards/chosen": 1.633287787437439, "rewards/margins": 1.8272422552108765, "rewards/rejected": -0.1939544677734375, "step": 4818 }, { "epoch": 0.78, "learning_rate": 8.999338467958702e-07, "logits/chosen": -0.522050142288208, "logits/rejected": -0.5189566612243652, "logps/chosen": -73.76006317138672, "logps/rejected": -104.30422973632812, "loss": 0.6029, "rewards/accuracies": 0.0, "rewards/chosen": 1.975793480873108, "rewards/margins": -0.7075775861740112, "rewards/rejected": 2.683371067047119, "step": 4819 }, { "epoch": 0.78, "learning_rate": 8.998549545644284e-07, "logits/chosen": -0.5759869813919067, "logits/rejected": -0.6185247302055359, "logps/chosen": -46.434165954589844, "logps/rejected": -49.68214797973633, "loss": 1.8403, "rewards/accuracies": 0.0, "rewards/chosen": 0.1980716735124588, "rewards/margins": -1.7484595775604248, "rewards/rejected": 1.9465312957763672, "step": 4820 }, { "epoch": 0.78, "learning_rate": 8.997760347067927e-07, "logits/chosen": -0.9308478236198425, "logits/rejected": -0.9266535043716431, "logps/chosen": -78.23577117919922, "logps/rejected": -171.96524047851562, "loss": 3.0163, "rewards/accuracies": 0.0, "rewards/chosen": 0.8636329770088196, "rewards/margins": -5.006764888763428, "rewards/rejected": 5.870398044586182, "step": 4821 }, { "epoch": 0.78, "learning_rate": 8.996970872284157e-07, "logits/chosen": -0.6444517970085144, "logits/rejected": -0.7335849404335022, "logps/chosen": -87.77153015136719, "logps/rejected": -141.30252075195312, "loss": 1.5742, "rewards/accuracies": 0.0, "rewards/chosen": 2.3311874866485596, "rewards/margins": -3.0169451236724854, "rewards/rejected": 5.348132610321045, "step": 4822 }, { "epoch": 0.78, "learning_rate": 8.996181121347521e-07, "logits/chosen": -0.5101144909858704, "logits/rejected": -0.511078417301178, "logps/chosen": -48.94013977050781, "logps/rejected": -82.16301727294922, "loss": 0.5764, "rewards/accuracies": 1.0, "rewards/chosen": 2.4493980407714844, "rewards/margins": 0.8126869201660156, "rewards/rejected": 1.6367111206054688, "step": 4823 }, { "epoch": 0.78, "learning_rate": 8.995391094312583e-07, "logits/chosen": -0.8676098585128784, "logits/rejected": -0.8972534537315369, "logps/chosen": -138.18722534179688, "logps/rejected": -79.27423095703125, "loss": 1.1416, "rewards/accuracies": 1.0, "rewards/chosen": 4.1426544189453125, "rewards/margins": 2.167771816253662, "rewards/rejected": 1.9748824834823608, "step": 4824 }, { "epoch": 0.78, "learning_rate": 8.994600791233922e-07, "logits/chosen": -0.939012885093689, "logits/rejected": -0.8527208566665649, "logps/chosen": -111.35122680664062, "logps/rejected": -15.622889518737793, "loss": 0.123, "rewards/accuracies": 1.0, "rewards/chosen": 2.2163193225860596, "rewards/margins": 1.8962783813476562, "rewards/rejected": 0.3200410008430481, "step": 4825 }, { "epoch": 0.78, "learning_rate": 8.993810212166145e-07, "logits/chosen": -0.9439713358879089, "logits/rejected": -0.949555516242981, "logps/chosen": -57.36116027832031, "logps/rejected": -64.08511352539062, "loss": 0.3443, "rewards/accuracies": 1.0, "rewards/chosen": 1.6159600019454956, "rewards/margins": 0.02521669864654541, "rewards/rejected": 1.5907433032989502, "step": 4826 }, { "epoch": 0.78, "learning_rate": 8.993019357163872e-07, "logits/chosen": -0.8414545059204102, "logits/rejected": -0.6924774646759033, "logps/chosen": -48.66774368286133, "logps/rejected": -19.46779441833496, "loss": 0.404, "rewards/accuracies": 1.0, "rewards/chosen": 1.899583101272583, "rewards/margins": 0.9296768307685852, "rewards/rejected": 0.9699062705039978, "step": 4827 }, { "epoch": 0.78, "learning_rate": 8.992228226281743e-07, "logits/chosen": -0.8623426556587219, "logits/rejected": -0.7785204648971558, "logps/chosen": -153.879638671875, "logps/rejected": -129.07510375976562, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 5.098333835601807, "rewards/margins": 1.1341156959533691, "rewards/rejected": 3.9642181396484375, "step": 4828 }, { "epoch": 0.78, "learning_rate": 8.99143681957442e-07, "logits/chosen": -1.0157554149627686, "logits/rejected": -0.74992436170578, "logps/chosen": -195.12579345703125, "logps/rejected": -187.5364990234375, "loss": 0.8069, "rewards/accuracies": 0.0, "rewards/chosen": 5.663987636566162, "rewards/margins": -0.4467639923095703, "rewards/rejected": 6.110751628875732, "step": 4829 }, { "epoch": 0.78, "learning_rate": 8.99064513709658e-07, "logits/chosen": -1.4314779043197632, "logits/rejected": -1.3303658962249756, "logps/chosen": -103.7645263671875, "logps/rejected": -167.69189453125, "loss": 1.1181, "rewards/accuracies": 0.0, "rewards/chosen": 1.6324204206466675, "rewards/margins": -2.1169204711914062, "rewards/rejected": 3.749340772628784, "step": 4830 }, { "epoch": 0.78, "learning_rate": 8.98985317890292e-07, "logits/chosen": -2.2594785690307617, "logits/rejected": -2.037405014038086, "logps/chosen": -110.5587387084961, "logps/rejected": -149.56381225585938, "loss": 2.162, "rewards/accuracies": 0.0, "rewards/chosen": 2.2608344554901123, "rewards/margins": -3.89787220954895, "rewards/rejected": 6.1587066650390625, "step": 4831 }, { "epoch": 0.78, "learning_rate": 8.989060945048157e-07, "logits/chosen": -0.7347903847694397, "logits/rejected": -0.7304105758666992, "logps/chosen": -3.234701633453369, "logps/rejected": -6.920617580413818, "loss": 1.9654, "rewards/accuracies": 1.0, "rewards/chosen": 0.5250533819198608, "rewards/margins": 0.1978074312210083, "rewards/rejected": 0.32724595069885254, "step": 4832 }, { "epoch": 0.78, "learning_rate": 8.988268435587028e-07, "logits/chosen": -0.5100025534629822, "logits/rejected": -0.533575177192688, "logps/chosen": -9.636556625366211, "logps/rejected": -32.41425704956055, "loss": 0.354, "rewards/accuracies": 1.0, "rewards/chosen": 0.11088533699512482, "rewards/margins": 0.19637155532836914, "rewards/rejected": -0.08548622578382492, "step": 4833 }, { "epoch": 0.78, "learning_rate": 8.987475650574288e-07, "logits/chosen": -0.6244959831237793, "logits/rejected": -0.6015642285346985, "logps/chosen": -132.1002960205078, "logps/rejected": -37.250030517578125, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 0.6810287833213806, "rewards/margins": 0.5155075192451477, "rewards/rejected": 0.16552124917507172, "step": 4834 }, { "epoch": 0.78, "learning_rate": 8.986682590064709e-07, "logits/chosen": -0.7855833172798157, "logits/rejected": -0.8097374439239502, "logps/chosen": -86.53697204589844, "logps/rejected": -114.82685852050781, "loss": 0.5035, "rewards/accuracies": 0.0, "rewards/chosen": 0.6511764526367188, "rewards/margins": -0.3285713195800781, "rewards/rejected": 0.9797477722167969, "step": 4835 }, { "epoch": 0.78, "learning_rate": 8.985889254113087e-07, "logits/chosen": -0.620492160320282, "logits/rejected": -0.6650989651679993, "logps/chosen": -110.74710083007812, "logps/rejected": -69.8909912109375, "loss": 1.334, "rewards/accuracies": 0.0, "rewards/chosen": 1.9737976789474487, "rewards/margins": -1.8728653192520142, "rewards/rejected": 3.846662998199463, "step": 4836 }, { "epoch": 0.79, "learning_rate": 8.985095642774232e-07, "logits/chosen": -0.982444703578949, "logits/rejected": -1.010246992111206, "logps/chosen": -183.31491088867188, "logps/rejected": -179.31192016601562, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": 7.047488689422607, "rewards/margins": 4.12681770324707, "rewards/rejected": 2.920671224594116, "step": 4837 }, { "epoch": 0.79, "learning_rate": 8.984301756102975e-07, "logits/chosen": -0.8755722641944885, "logits/rejected": -0.8340326547622681, "logps/chosen": -107.97139739990234, "logps/rejected": -64.49000549316406, "loss": 0.2669, "rewards/accuracies": 1.0, "rewards/chosen": 2.601294755935669, "rewards/margins": 0.44545674324035645, "rewards/rejected": 2.1558380126953125, "step": 4838 }, { "epoch": 0.79, "learning_rate": 8.983507594154167e-07, "logits/chosen": -0.7058670520782471, "logits/rejected": -0.6669605374336243, "logps/chosen": -44.62462615966797, "logps/rejected": -50.3941650390625, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 1.3361881971359253, "rewards/margins": -0.09130966663360596, "rewards/rejected": 1.4274978637695312, "step": 4839 }, { "epoch": 0.79, "learning_rate": 8.982713156982677e-07, "logits/chosen": -0.5278477072715759, "logits/rejected": -0.5299046039581299, "logps/chosen": -10.188704490661621, "logps/rejected": -2.950432777404785, "loss": 0.465, "rewards/accuracies": 0.0, "rewards/chosen": -0.012151146307587624, "rewards/margins": -0.40841275453567505, "rewards/rejected": 0.396261602640152, "step": 4840 }, { "epoch": 0.79, "learning_rate": 8.981918444643391e-07, "logits/chosen": -0.386708527803421, "logits/rejected": -0.3773900866508484, "logps/chosen": -3.7513484954833984, "logps/rejected": -5.862757205963135, "loss": 0.5391, "rewards/accuracies": 0.0, "rewards/chosen": 0.01863884925842285, "rewards/margins": -0.19546031951904297, "rewards/rejected": 0.21409916877746582, "step": 4841 }, { "epoch": 0.79, "learning_rate": 8.981123457191219e-07, "logits/chosen": -0.43697109818458557, "logits/rejected": -0.35355520248413086, "logps/chosen": -54.773372650146484, "logps/rejected": -35.37891387939453, "loss": 0.3643, "rewards/accuracies": 1.0, "rewards/chosen": 1.359890341758728, "rewards/margins": 0.7011821269989014, "rewards/rejected": 0.6587082147598267, "step": 4842 }, { "epoch": 0.79, "learning_rate": 8.980328194681086e-07, "logits/chosen": -0.608547568321228, "logits/rejected": -0.601243793964386, "logps/chosen": -73.59732818603516, "logps/rejected": -93.09949493408203, "loss": 1.1132, "rewards/accuracies": 0.0, "rewards/chosen": 0.23569488525390625, "rewards/margins": -1.2900642156600952, "rewards/rejected": 1.5257591009140015, "step": 4843 }, { "epoch": 0.79, "learning_rate": 8.979532657167936e-07, "logits/chosen": -0.7956925630569458, "logits/rejected": -0.6872550249099731, "logps/chosen": -238.4491729736328, "logps/rejected": -92.71418762207031, "loss": 0.1594, "rewards/accuracies": 1.0, "rewards/chosen": 3.763401746749878, "rewards/margins": 1.0282402038574219, "rewards/rejected": 2.735161542892456, "step": 4844 }, { "epoch": 0.79, "learning_rate": 8.978736844706735e-07, "logits/chosen": -0.4836995601654053, "logits/rejected": -0.28866931796073914, "logps/chosen": -50.92001724243164, "logps/rejected": -86.74026489257812, "loss": 1.4827, "rewards/accuracies": 0.0, "rewards/chosen": 2.651794195175171, "rewards/margins": -2.3965394496917725, "rewards/rejected": 5.048333644866943, "step": 4845 }, { "epoch": 0.79, "learning_rate": 8.977940757352464e-07, "logits/chosen": -0.94682776927948, "logits/rejected": -0.9681382775306702, "logps/chosen": -190.07449340820312, "logps/rejected": -127.554931640625, "loss": 1.3042, "rewards/accuracies": 0.0, "rewards/chosen": 0.7345978021621704, "rewards/margins": -2.4770402908325195, "rewards/rejected": 3.2116379737854004, "step": 4846 }, { "epoch": 0.79, "learning_rate": 8.977144395160128e-07, "logits/chosen": -0.44939857721328735, "logits/rejected": -0.30952349305152893, "logps/chosen": -69.35105895996094, "logps/rejected": -27.26910400390625, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": 1.570502519607544, "rewards/margins": 1.3945772647857666, "rewards/rejected": 0.17592525482177734, "step": 4847 }, { "epoch": 0.79, "learning_rate": 8.976347758184744e-07, "logits/chosen": -0.727155864238739, "logits/rejected": -0.7107142806053162, "logps/chosen": -46.02264404296875, "logps/rejected": -31.39263916015625, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 1.9353889226913452, "rewards/margins": 0.7778609991073608, "rewards/rejected": 1.1575279235839844, "step": 4848 }, { "epoch": 0.79, "learning_rate": 8.975550846481356e-07, "logits/chosen": -0.8434534668922424, "logits/rejected": -0.7396575808525085, "logps/chosen": -189.79049682617188, "logps/rejected": -79.68739318847656, "loss": 0.7448, "rewards/accuracies": 0.0, "rewards/chosen": 1.064601182937622, "rewards/margins": -1.091496229171753, "rewards/rejected": 2.156097412109375, "step": 4849 }, { "epoch": 0.79, "learning_rate": 8.974753660105021e-07, "logits/chosen": -0.7656896710395813, "logits/rejected": -0.7498348355293274, "logps/chosen": -32.18403625488281, "logps/rejected": -58.31371307373047, "loss": 0.661, "rewards/accuracies": 1.0, "rewards/chosen": 2.656750440597534, "rewards/margins": 0.3711555004119873, "rewards/rejected": 2.285594940185547, "step": 4850 }, { "epoch": 0.79, "learning_rate": 8.973956199110819e-07, "logits/chosen": -0.8555232882499695, "logits/rejected": -0.8626294136047363, "logps/chosen": -159.487548828125, "logps/rejected": -95.17411804199219, "loss": 0.8256, "rewards/accuracies": 0.0, "rewards/chosen": 3.678271532058716, "rewards/margins": -1.1840345859527588, "rewards/rejected": 4.862306118011475, "step": 4851 }, { "epoch": 0.79, "learning_rate": 8.973158463553845e-07, "logits/chosen": -0.7718905210494995, "logits/rejected": -0.4667436182498932, "logps/chosen": -85.92520904541016, "logps/rejected": -57.427207946777344, "loss": 1.4286, "rewards/accuracies": 1.0, "rewards/chosen": 3.848524570465088, "rewards/margins": 2.469076633453369, "rewards/rejected": 1.3794479370117188, "step": 4852 }, { "epoch": 0.79, "learning_rate": 8.972360453489214e-07, "logits/chosen": -0.7696943879127502, "logits/rejected": -0.7392832636833191, "logps/chosen": -73.78421020507812, "logps/rejected": -43.253021240234375, "loss": 0.4271, "rewards/accuracies": 0.0, "rewards/chosen": 2.5405280590057373, "rewards/margins": -0.2642533779144287, "rewards/rejected": 2.804781436920166, "step": 4853 }, { "epoch": 0.79, "learning_rate": 8.971562168972064e-07, "logits/chosen": -0.594183087348938, "logits/rejected": -0.606209397315979, "logps/chosen": -64.54269409179688, "logps/rejected": -52.72484588623047, "loss": 2.7959, "rewards/accuracies": 0.0, "rewards/chosen": 1.6997581720352173, "rewards/margins": -1.20777428150177, "rewards/rejected": 2.9075324535369873, "step": 4854 }, { "epoch": 0.79, "learning_rate": 8.970763610057546e-07, "logits/chosen": -0.4946799874305725, "logits/rejected": -0.42234694957733154, "logps/chosen": -76.79258728027344, "logps/rejected": -63.33038330078125, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": 1.2176055908203125, "rewards/margins": 1.0008407831192017, "rewards/rejected": 0.21676483750343323, "step": 4855 }, { "epoch": 0.79, "learning_rate": 8.969964776800836e-07, "logits/chosen": -0.8074364066123962, "logits/rejected": -0.7854456305503845, "logps/chosen": -76.79534912109375, "logps/rejected": -98.33488464355469, "loss": 0.8636, "rewards/accuracies": 1.0, "rewards/chosen": 1.7295547723770142, "rewards/margins": 0.8008872866630554, "rewards/rejected": 0.9286674857139587, "step": 4856 }, { "epoch": 0.79, "learning_rate": 8.969165669257122e-07, "logits/chosen": -0.8531997799873352, "logits/rejected": -0.7268980741500854, "logps/chosen": -85.85719299316406, "logps/rejected": -153.34780883789062, "loss": 0.4529, "rewards/accuracies": 0.0, "rewards/chosen": 3.021196126937866, "rewards/margins": -0.07929229736328125, "rewards/rejected": 3.1004884243011475, "step": 4857 }, { "epoch": 0.79, "learning_rate": 8.968366287481619e-07, "logits/chosen": -0.6022370457649231, "logits/rejected": -0.5964131355285645, "logps/chosen": -72.18511962890625, "logps/rejected": -89.73831176757812, "loss": 0.5331, "rewards/accuracies": 1.0, "rewards/chosen": 1.398688554763794, "rewards/margins": 0.3143554925918579, "rewards/rejected": 1.084333062171936, "step": 4858 }, { "epoch": 0.79, "learning_rate": 8.967566631529553e-07, "logits/chosen": -0.5429037809371948, "logits/rejected": -0.4139236807823181, "logps/chosen": -60.922183990478516, "logps/rejected": -80.83289337158203, "loss": 2.0177, "rewards/accuracies": 1.0, "rewards/chosen": 1.5387195348739624, "rewards/margins": 0.17385518550872803, "rewards/rejected": 1.3648643493652344, "step": 4859 }, { "epoch": 0.79, "learning_rate": 8.966766701456176e-07, "logits/chosen": -0.5455047488212585, "logits/rejected": -0.5823376774787903, "logps/chosen": -102.66142272949219, "logps/rejected": -60.600791931152344, "loss": 1.0108, "rewards/accuracies": 0.0, "rewards/chosen": 0.4053665101528168, "rewards/margins": -1.006890058517456, "rewards/rejected": 1.4122565984725952, "step": 4860 }, { "epoch": 0.79, "learning_rate": 8.965966497316753e-07, "logits/chosen": -0.9180669784545898, "logits/rejected": -0.86576908826828, "logps/chosen": -108.00371551513672, "logps/rejected": -37.46623992919922, "loss": 0.2001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5279372930526733, "rewards/margins": 1.1871615648269653, "rewards/rejected": 0.3407756984233856, "step": 4861 }, { "epoch": 0.79, "learning_rate": 8.96516601916657e-07, "logits/chosen": -0.7763552665710449, "logits/rejected": -0.7033584117889404, "logps/chosen": -73.5134048461914, "logps/rejected": -57.543975830078125, "loss": 1.2393, "rewards/accuracies": 1.0, "rewards/chosen": 2.2203216552734375, "rewards/margins": 0.6827560663223267, "rewards/rejected": 1.5375655889511108, "step": 4862 }, { "epoch": 0.79, "learning_rate": 8.964365267060935e-07, "logits/chosen": -1.2160202264785767, "logits/rejected": -1.2773905992507935, "logps/chosen": -252.23043823242188, "logps/rejected": -21.136180877685547, "loss": 0.23, "rewards/accuracies": 1.0, "rewards/chosen": 1.9525727033615112, "rewards/margins": 1.7974705696105957, "rewards/rejected": 0.15510216355323792, "step": 4863 }, { "epoch": 0.79, "learning_rate": 8.96356424105517e-07, "logits/chosen": -0.49466177821159363, "logits/rejected": -0.37438568472862244, "logps/chosen": -92.89405822753906, "logps/rejected": -81.67595672607422, "loss": 1.1283, "rewards/accuracies": 0.0, "rewards/chosen": 1.1889861822128296, "rewards/margins": -1.22594153881073, "rewards/rejected": 2.4149277210235596, "step": 4864 }, { "epoch": 0.79, "learning_rate": 8.962762941204623e-07, "logits/chosen": -0.5932771563529968, "logits/rejected": -0.5698289275169373, "logps/chosen": -150.60076904296875, "logps/rejected": -117.81082153320312, "loss": 0.9196, "rewards/accuracies": 0.0, "rewards/chosen": 2.194689989089966, "rewards/margins": -1.6571898460388184, "rewards/rejected": 3.851879835128784, "step": 4865 }, { "epoch": 0.79, "learning_rate": 8.96196136756465e-07, "logits/chosen": -0.4416666328907013, "logits/rejected": -0.4190855026245117, "logps/chosen": -232.3546600341797, "logps/rejected": -140.51168823242188, "loss": 1.8258, "rewards/accuracies": 0.0, "rewards/chosen": 3.1363112926483154, "rewards/margins": -2.8313004970550537, "rewards/rejected": 5.967611789703369, "step": 4866 }, { "epoch": 0.79, "learning_rate": 8.961159520190636e-07, "logits/chosen": -0.8727312088012695, "logits/rejected": -0.8884732723236084, "logps/chosen": -101.93708801269531, "logps/rejected": -66.0794906616211, "loss": 1.2316, "rewards/accuracies": 0.0, "rewards/chosen": 1.3760124444961548, "rewards/margins": -1.7779313325881958, "rewards/rejected": 3.1539437770843506, "step": 4867 }, { "epoch": 0.79, "learning_rate": 8.960357399137979e-07, "logits/chosen": -0.02281215973198414, "logits/rejected": -0.004965943284332752, "logps/chosen": -27.137216567993164, "logps/rejected": -16.698463439941406, "loss": 1.2497, "rewards/accuracies": 0.0, "rewards/chosen": -0.17416591942310333, "rewards/margins": -0.7790212631225586, "rewards/rejected": 0.6048553586006165, "step": 4868 }, { "epoch": 0.79, "learning_rate": 8.959555004462098e-07, "logits/chosen": -0.9209320545196533, "logits/rejected": -0.8759587407112122, "logps/chosen": -129.37965393066406, "logps/rejected": -340.11834716796875, "loss": 1.3828, "rewards/accuracies": 0.0, "rewards/chosen": 2.5565993785858154, "rewards/margins": -2.5585525035858154, "rewards/rejected": 5.115151882171631, "step": 4869 }, { "epoch": 0.79, "learning_rate": 8.958752336218433e-07, "logits/chosen": -0.7914497256278992, "logits/rejected": -0.7359544038772583, "logps/chosen": -62.041908264160156, "logps/rejected": -20.72589111328125, "loss": 0.3753, "rewards/accuracies": 1.0, "rewards/chosen": 2.0184686183929443, "rewards/margins": 0.513873815536499, "rewards/rejected": 1.5045948028564453, "step": 4870 }, { "epoch": 0.79, "learning_rate": 8.957949394462441e-07, "logits/chosen": -0.9163133502006531, "logits/rejected": -0.9288408160209656, "logps/chosen": -56.52920913696289, "logps/rejected": -80.23898315429688, "loss": 1.2041, "rewards/accuracies": 0.0, "rewards/chosen": 0.466757208108902, "rewards/margins": -0.1261955201625824, "rewards/rejected": 0.5929527282714844, "step": 4871 }, { "epoch": 0.79, "learning_rate": 8.957146179249595e-07, "logits/chosen": -0.5356889963150024, "logits/rejected": -0.4480241537094116, "logps/chosen": -66.68583679199219, "logps/rejected": -38.276092529296875, "loss": 0.6374, "rewards/accuracies": 0.0, "rewards/chosen": 1.4531258344650269, "rewards/margins": -0.5947364568710327, "rewards/rejected": 2.0478622913360596, "step": 4872 }, { "epoch": 0.79, "learning_rate": 8.95634269063539e-07, "logits/chosen": -0.9102977514266968, "logits/rejected": -0.890215277671814, "logps/chosen": -62.01921463012695, "logps/rejected": -71.57730865478516, "loss": 1.0776, "rewards/accuracies": 1.0, "rewards/chosen": 0.5015064477920532, "rewards/margins": 0.7581608295440674, "rewards/rejected": -0.2566543519496918, "step": 4873 }, { "epoch": 0.79, "learning_rate": 8.955538928675341e-07, "logits/chosen": -0.7094663381576538, "logits/rejected": -0.7094663381576538, "logps/chosen": -97.35467529296875, "logps/rejected": -97.35467529296875, "loss": 1.9465, "rewards/accuracies": 0.0, "rewards/chosen": 2.139573812484741, "rewards/margins": 0.0, "rewards/rejected": 2.139573812484741, "step": 4874 }, { "epoch": 0.79, "learning_rate": 8.95473489342498e-07, "logits/chosen": -0.37652504444122314, "logits/rejected": -0.4363458454608917, "logps/chosen": -26.000226974487305, "logps/rejected": -76.37171936035156, "loss": 0.8209, "rewards/accuracies": 0.0, "rewards/chosen": 0.4952896237373352, "rewards/margins": -0.7067596316337585, "rewards/rejected": 1.2020492553710938, "step": 4875 }, { "epoch": 0.79, "learning_rate": 8.95393058493986e-07, "logits/chosen": -0.8727544546127319, "logits/rejected": -0.821217954158783, "logps/chosen": -58.37431335449219, "logps/rejected": -77.3935546875, "loss": 0.5158, "rewards/accuracies": 0.0, "rewards/chosen": 1.352014183998108, "rewards/margins": -0.5797363519668579, "rewards/rejected": 1.9317505359649658, "step": 4876 }, { "epoch": 0.79, "learning_rate": 8.953126003275546e-07, "logits/chosen": -0.25338733196258545, "logits/rejected": 0.13889114558696747, "logps/chosen": -106.74105834960938, "logps/rejected": -142.25003051757812, "loss": 1.1015, "rewards/accuracies": 0.0, "rewards/chosen": 0.9856010675430298, "rewards/margins": -1.168831706047058, "rewards/rejected": 2.154432773590088, "step": 4877 }, { "epoch": 0.79, "learning_rate": 8.952321148487631e-07, "logits/chosen": -0.4692747890949249, "logits/rejected": -0.43330812454223633, "logps/chosen": -69.48474884033203, "logps/rejected": -97.59492492675781, "loss": 2.0497, "rewards/accuracies": 0.0, "rewards/chosen": 1.0679603815078735, "rewards/margins": -0.5626205205917358, "rewards/rejected": 1.6305809020996094, "step": 4878 }, { "epoch": 0.79, "learning_rate": 8.951516020631721e-07, "logits/chosen": -0.6104135513305664, "logits/rejected": -0.5855971574783325, "logps/chosen": -79.5467529296875, "logps/rejected": -59.45694351196289, "loss": 0.6777, "rewards/accuracies": 0.0, "rewards/chosen": 1.2319198846817017, "rewards/margins": -1.0095852613449097, "rewards/rejected": 2.2415051460266113, "step": 4879 }, { "epoch": 0.79, "learning_rate": 8.950710619763445e-07, "logits/chosen": -0.882574737071991, "logits/rejected": -0.7070819735527039, "logps/chosen": -87.97501373291016, "logps/rejected": -21.535242080688477, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 5.757094860076904, "rewards/margins": 5.351762294769287, "rewards/rejected": 0.40533238649368286, "step": 4880 }, { "epoch": 0.79, "learning_rate": 8.949904945938447e-07, "logits/chosen": -0.6130611300468445, "logits/rejected": -0.6081521511077881, "logps/chosen": -2.988006591796875, "logps/rejected": -3.869744062423706, "loss": 0.7243, "rewards/accuracies": 0.0, "rewards/chosen": 0.2928057312965393, "rewards/margins": -0.12081733345985413, "rewards/rejected": 0.41362306475639343, "step": 4881 }, { "epoch": 0.79, "learning_rate": 8.94909899921239e-07, "logits/chosen": -0.7789711356163025, "logits/rejected": -0.5147912502288818, "logps/chosen": -138.1541748046875, "logps/rejected": -30.499696731567383, "loss": 0.5427, "rewards/accuracies": 1.0, "rewards/chosen": 4.085913181304932, "rewards/margins": 3.591655731201172, "rewards/rejected": 0.49425753951072693, "step": 4882 }, { "epoch": 0.79, "learning_rate": 8.94829277964096e-07, "logits/chosen": -0.14284759759902954, "logits/rejected": -0.18965163826942444, "logps/chosen": -3.503565549850464, "logps/rejected": -67.26449584960938, "loss": 0.5632, "rewards/accuracies": 0.0, "rewards/chosen": 0.3346826732158661, "rewards/margins": -0.2734105885028839, "rewards/rejected": 0.60809326171875, "step": 4883 }, { "epoch": 0.79, "learning_rate": 8.947486287279858e-07, "logits/chosen": -0.5251315236091614, "logits/rejected": -0.430961012840271, "logps/chosen": -77.06733703613281, "logps/rejected": -62.5155029296875, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": 1.4014679193496704, "rewards/margins": 0.3127319812774658, "rewards/rejected": 1.0887359380722046, "step": 4884 }, { "epoch": 0.79, "learning_rate": 8.946679522184806e-07, "logits/chosen": -0.4315730333328247, "logits/rejected": -0.3597707450389862, "logps/chosen": -46.61173629760742, "logps/rejected": -42.083282470703125, "loss": 0.2397, "rewards/accuracies": 1.0, "rewards/chosen": 2.9481608867645264, "rewards/margins": 1.042229413986206, "rewards/rejected": 1.9059314727783203, "step": 4885 }, { "epoch": 0.79, "learning_rate": 8.945872484411541e-07, "logits/chosen": -0.5038222074508667, "logits/rejected": -0.49474599957466125, "logps/chosen": -9.854938507080078, "logps/rejected": -19.385099411010742, "loss": 0.815, "rewards/accuracies": 1.0, "rewards/chosen": 0.29424697160720825, "rewards/margins": 0.022156625986099243, "rewards/rejected": 0.272090345621109, "step": 4886 }, { "epoch": 0.79, "learning_rate": 8.945065174015824e-07, "logits/chosen": -0.5279540419578552, "logits/rejected": -0.46495065093040466, "logps/chosen": -69.04685974121094, "logps/rejected": -56.67268371582031, "loss": 1.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.7241684198379517, "rewards/margins": 0.33933642506599426, "rewards/rejected": 0.3848319947719574, "step": 4887 }, { "epoch": 0.79, "learning_rate": 8.944257591053433e-07, "logits/chosen": -0.3109172582626343, "logits/rejected": -0.2907921373844147, "logps/chosen": -23.816242218017578, "logps/rejected": -19.687347412109375, "loss": 0.6573, "rewards/accuracies": 1.0, "rewards/chosen": -0.08992176502943039, "rewards/margins": 0.12783527374267578, "rewards/rejected": -0.21775703132152557, "step": 4888 }, { "epoch": 0.79, "learning_rate": 8.943449735580162e-07, "logits/chosen": -0.852536141872406, "logits/rejected": -0.8162989020347595, "logps/chosen": -98.8914794921875, "logps/rejected": -93.59034729003906, "loss": 1.1526, "rewards/accuracies": 0.0, "rewards/chosen": 1.3261200189590454, "rewards/margins": -2.189897060394287, "rewards/rejected": 3.516017198562622, "step": 4889 }, { "epoch": 0.79, "learning_rate": 8.942641607651828e-07, "logits/chosen": -0.8100918531417847, "logits/rejected": -0.793953537940979, "logps/chosen": -55.77997589111328, "logps/rejected": -61.252220153808594, "loss": 0.8467, "rewards/accuracies": 0.0, "rewards/chosen": 2.433706760406494, "rewards/margins": -0.2640554904937744, "rewards/rejected": 2.6977622509002686, "step": 4890 }, { "epoch": 0.79, "learning_rate": 8.941833207324265e-07, "logits/chosen": -0.9572761058807373, "logits/rejected": -0.982803225517273, "logps/chosen": -113.94742584228516, "logps/rejected": -84.74915313720703, "loss": 0.5446, "rewards/accuracies": 0.0, "rewards/chosen": 1.0353072881698608, "rewards/margins": -0.6250358819961548, "rewards/rejected": 1.6603431701660156, "step": 4891 }, { "epoch": 0.79, "learning_rate": 8.941024534653325e-07, "logits/chosen": -0.2864799499511719, "logits/rejected": -0.24706293642520905, "logps/chosen": -70.37142944335938, "logps/rejected": -63.60777282714844, "loss": 0.9199, "rewards/accuracies": 0.0, "rewards/chosen": 1.8743447065353394, "rewards/margins": -1.4612518548965454, "rewards/rejected": 3.3355965614318848, "step": 4892 }, { "epoch": 0.79, "learning_rate": 8.940215589694882e-07, "logits/chosen": -0.9934247732162476, "logits/rejected": -0.9975211024284363, "logps/chosen": -105.802001953125, "logps/rejected": -26.3385009765625, "loss": 0.9325, "rewards/accuracies": 0.0, "rewards/chosen": 0.37496185302734375, "rewards/margins": -0.5073307156562805, "rewards/rejected": 0.8822925686836243, "step": 4893 }, { "epoch": 0.79, "learning_rate": 8.939406372504822e-07, "logits/chosen": -0.5729646682739258, "logits/rejected": -0.6044541001319885, "logps/chosen": -47.592445373535156, "logps/rejected": -92.75859069824219, "loss": 0.5659, "rewards/accuracies": 1.0, "rewards/chosen": 0.7095600366592407, "rewards/margins": 0.1553165316581726, "rewards/rejected": 0.5542435050010681, "step": 4894 }, { "epoch": 0.79, "learning_rate": 8.938596883139057e-07, "logits/chosen": -0.2560371160507202, "logits/rejected": -0.3364010155200958, "logps/chosen": -74.11669921875, "logps/rejected": -140.88275146484375, "loss": 0.9531, "rewards/accuracies": 0.0, "rewards/chosen": 1.3681358098983765, "rewards/margins": -0.013545989990234375, "rewards/rejected": 1.3816817998886108, "step": 4895 }, { "epoch": 0.79, "learning_rate": 8.937787121653515e-07, "logits/chosen": -0.8174876570701599, "logits/rejected": -0.6427332758903503, "logps/chosen": -110.70872497558594, "logps/rejected": -76.42086791992188, "loss": 0.1249, "rewards/accuracies": 1.0, "rewards/chosen": 5.574679851531982, "rewards/margins": 1.7563960552215576, "rewards/rejected": 3.818283796310425, "step": 4896 }, { "epoch": 0.79, "learning_rate": 8.936977088104143e-07, "logits/chosen": -0.8319457173347473, "logits/rejected": -1.180923342704773, "logps/chosen": -85.16590118408203, "logps/rejected": -35.737457275390625, "loss": 0.5472, "rewards/accuracies": 1.0, "rewards/chosen": 1.220354437828064, "rewards/margins": 0.8719886541366577, "rewards/rejected": 0.34836578369140625, "step": 4897 }, { "epoch": 0.8, "learning_rate": 8.936166782546906e-07, "logits/chosen": -0.5984780788421631, "logits/rejected": -0.6098411679267883, "logps/chosen": -89.5596923828125, "logps/rejected": -36.789527893066406, "loss": 0.5084, "rewards/accuracies": 0.0, "rewards/chosen": 1.0856307744979858, "rewards/margins": -0.5465019941329956, "rewards/rejected": 1.6321327686309814, "step": 4898 }, { "epoch": 0.8, "learning_rate": 8.935356205037787e-07, "logits/chosen": -0.9493097066879272, "logits/rejected": -0.8638295531272888, "logps/chosen": -127.41613006591797, "logps/rejected": -72.19168853759766, "loss": 1.295, "rewards/accuracies": 0.0, "rewards/chosen": 0.28750839829444885, "rewards/margins": -0.5117241144180298, "rewards/rejected": 0.7992324829101562, "step": 4899 }, { "epoch": 0.8, "learning_rate": 8.934545355632792e-07, "logits/chosen": -0.36288487911224365, "logits/rejected": -0.24406662583351135, "logps/chosen": -57.908531188964844, "logps/rejected": -97.39054870605469, "loss": 0.3318, "rewards/accuracies": 1.0, "rewards/chosen": 2.00663685798645, "rewards/margins": 0.4954894781112671, "rewards/rejected": 1.511147379875183, "step": 4900 }, { "epoch": 0.8, "learning_rate": 8.933734234387943e-07, "logits/chosen": -0.026044566184282303, "logits/rejected": -0.026044566184282303, "logps/chosen": -1.6196842193603516, "logps/rejected": -1.6196842193603516, "loss": 0.3487, "rewards/accuracies": 0.0, "rewards/chosen": 0.17537441849708557, "rewards/margins": 0.0, "rewards/rejected": 0.17537441849708557, "step": 4901 }, { "epoch": 0.8, "learning_rate": 8.93292284135928e-07, "logits/chosen": -0.2801430821418762, "logits/rejected": -0.2796293795108795, "logps/chosen": -4.74845027923584, "logps/rejected": -17.106842041015625, "loss": 0.4486, "rewards/accuracies": 1.0, "rewards/chosen": 0.627781093120575, "rewards/margins": 0.22540566325187683, "rewards/rejected": 0.4023754298686981, "step": 4902 }, { "epoch": 0.8, "learning_rate": 8.932111176602861e-07, "logits/chosen": -0.8694711923599243, "logits/rejected": -0.831423282623291, "logps/chosen": -68.54467010498047, "logps/rejected": -77.54164123535156, "loss": 0.241, "rewards/accuracies": 1.0, "rewards/chosen": 2.118406057357788, "rewards/margins": 1.2867822647094727, "rewards/rejected": 0.8316238522529602, "step": 4903 }, { "epoch": 0.8, "learning_rate": 8.931299240174765e-07, "logits/chosen": -0.5935943126678467, "logits/rejected": -0.6041663289070129, "logps/chosen": -15.85774040222168, "logps/rejected": -3.8135986328125, "loss": 0.5225, "rewards/accuracies": 0.0, "rewards/chosen": -0.03878812864422798, "rewards/margins": -0.2984786331653595, "rewards/rejected": 0.2596904933452606, "step": 4904 }, { "epoch": 0.8, "learning_rate": 8.930487032131091e-07, "logits/chosen": -0.9417003989219666, "logits/rejected": -0.5224241614341736, "logps/chosen": -87.32466125488281, "logps/rejected": -112.5513687133789, "loss": 0.9258, "rewards/accuracies": 0.0, "rewards/chosen": 3.1992738246917725, "rewards/margins": -1.396578073501587, "rewards/rejected": 4.595851898193359, "step": 4905 }, { "epoch": 0.8, "learning_rate": 8.929674552527955e-07, "logits/chosen": -0.6881153583526611, "logits/rejected": -0.5972152352333069, "logps/chosen": -58.32542037963867, "logps/rejected": -92.0166015625, "loss": 0.8348, "rewards/accuracies": 0.0, "rewards/chosen": 1.7441600561141968, "rewards/margins": -0.9570194482803345, "rewards/rejected": 2.7011795043945312, "step": 4906 }, { "epoch": 0.8, "learning_rate": 8.928861801421488e-07, "logits/chosen": -0.7206365466117859, "logits/rejected": -0.7426699995994568, "logps/chosen": -145.852294921875, "logps/rejected": -120.09800720214844, "loss": 0.7207, "rewards/accuracies": 0.0, "rewards/chosen": 3.954205274581909, "rewards/margins": -1.1089370250701904, "rewards/rejected": 5.0631422996521, "step": 4907 }, { "epoch": 0.8, "learning_rate": 8.928048778867847e-07, "logits/chosen": -0.6479450464248657, "logits/rejected": -0.6956506371498108, "logps/chosen": -73.29144287109375, "logps/rejected": -137.12142944335938, "loss": 1.8196, "rewards/accuracies": 0.0, "rewards/chosen": 1.512311577796936, "rewards/margins": -2.8141350746154785, "rewards/rejected": 4.326446533203125, "step": 4908 }, { "epoch": 0.8, "learning_rate": 8.927235484923202e-07, "logits/chosen": -0.9004908204078674, "logits/rejected": -0.8680136203765869, "logps/chosen": -104.65519714355469, "logps/rejected": -93.68975067138672, "loss": 0.6434, "rewards/accuracies": 0.0, "rewards/chosen": 0.9592735171318054, "rewards/margins": -0.4769211411476135, "rewards/rejected": 1.436194658279419, "step": 4909 }, { "epoch": 0.8, "learning_rate": 8.926421919643745e-07, "logits/chosen": -0.9449342489242554, "logits/rejected": -0.8932332992553711, "logps/chosen": -91.04305267333984, "logps/rejected": -14.223965644836426, "loss": 0.9503, "rewards/accuracies": 1.0, "rewards/chosen": 1.50261390209198, "rewards/margins": 1.208722472190857, "rewards/rejected": 0.29389142990112305, "step": 4910 }, { "epoch": 0.8, "learning_rate": 8.925608083085688e-07, "logits/chosen": -0.4115579426288605, "logits/rejected": -0.41142281889915466, "logps/chosen": -68.06007385253906, "logps/rejected": -98.30645751953125, "loss": 0.2972, "rewards/accuracies": 1.0, "rewards/chosen": 0.5411483645439148, "rewards/margins": 0.2146659791469574, "rewards/rejected": 0.3264823853969574, "step": 4911 }, { "epoch": 0.8, "learning_rate": 8.924793975305254e-07, "logits/chosen": -0.8815097808837891, "logits/rejected": -0.8228197693824768, "logps/chosen": -83.31965637207031, "logps/rejected": -17.081439971923828, "loss": 0.988, "rewards/accuracies": 1.0, "rewards/chosen": 1.040808081626892, "rewards/margins": 0.8988679647445679, "rewards/rejected": 0.14194011688232422, "step": 4912 }, { "epoch": 0.8, "learning_rate": 8.923979596358693e-07, "logits/chosen": -0.47792914509773254, "logits/rejected": -0.4012163281440735, "logps/chosen": -38.68009567260742, "logps/rejected": -6.533318042755127, "loss": 0.4279, "rewards/accuracies": 1.0, "rewards/chosen": 2.1941630840301514, "rewards/margins": 1.3505964279174805, "rewards/rejected": 0.8435667157173157, "step": 4913 }, { "epoch": 0.8, "learning_rate": 8.923164946302273e-07, "logits/chosen": -0.6660475134849548, "logits/rejected": -0.7340407967567444, "logps/chosen": -125.6091537475586, "logps/rejected": -112.49777221679688, "loss": 2.192, "rewards/accuracies": 0.0, "rewards/chosen": 1.1061439514160156, "rewards/margins": -4.172835826873779, "rewards/rejected": 5.278979778289795, "step": 4914 }, { "epoch": 0.8, "learning_rate": 8.922350025192274e-07, "logits/chosen": -0.7473894953727722, "logits/rejected": -0.7044870257377625, "logps/chosen": -93.18669128417969, "logps/rejected": -77.18574523925781, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 4.84140157699585, "rewards/margins": 3.0216565132141113, "rewards/rejected": 1.8197449445724487, "step": 4915 }, { "epoch": 0.8, "learning_rate": 8.921534833085003e-07, "logits/chosen": -0.9604735970497131, "logits/rejected": -0.9767280220985413, "logps/chosen": -127.6683349609375, "logps/rejected": -166.2154998779297, "loss": 1.6023, "rewards/accuracies": 0.0, "rewards/chosen": 4.896252632141113, "rewards/margins": -2.2100324630737305, "rewards/rejected": 7.106285095214844, "step": 4916 }, { "epoch": 0.8, "learning_rate": 8.920719370036782e-07, "logits/chosen": -0.5607972741127014, "logits/rejected": -0.5237978100776672, "logps/chosen": -102.93516540527344, "logps/rejected": -69.73629760742188, "loss": 0.9724, "rewards/accuracies": 0.0, "rewards/chosen": 0.8974205255508423, "rewards/margins": -0.7442581653594971, "rewards/rejected": 1.6416786909103394, "step": 4917 }, { "epoch": 0.8, "learning_rate": 8.91990363610395e-07, "logits/chosen": -0.6189808249473572, "logits/rejected": -0.5746216773986816, "logps/chosen": -62.901580810546875, "logps/rejected": -103.8464126586914, "loss": 0.1315, "rewards/accuracies": 1.0, "rewards/chosen": 1.9081131219863892, "rewards/margins": 1.2796311378479004, "rewards/rejected": 0.6284820437431335, "step": 4918 }, { "epoch": 0.8, "learning_rate": 8.919087631342867e-07, "logits/chosen": -0.5047559142112732, "logits/rejected": -0.49224385619163513, "logps/chosen": -28.946218490600586, "logps/rejected": -17.282747268676758, "loss": 1.8515, "rewards/accuracies": 1.0, "rewards/chosen": 1.2191259860992432, "rewards/margins": 0.8263109922409058, "rewards/rejected": 0.3928150236606598, "step": 4919 }, { "epoch": 0.8, "learning_rate": 8.918271355809912e-07, "logits/chosen": -0.6500733494758606, "logits/rejected": -0.6083323955535889, "logps/chosen": -75.9912109375, "logps/rejected": -50.200401306152344, "loss": 0.5622, "rewards/accuracies": 0.0, "rewards/chosen": 0.9080360531806946, "rewards/margins": -0.7198135256767273, "rewards/rejected": 1.6278495788574219, "step": 4920 }, { "epoch": 0.8, "learning_rate": 8.91745480956148e-07, "logits/chosen": -0.022852279245853424, "logits/rejected": 0.00456913560628891, "logps/chosen": -27.158510208129883, "logps/rejected": -1.7271268367767334, "loss": 0.5641, "rewards/accuracies": 0.0, "rewards/chosen": -0.08771438896656036, "rewards/margins": -0.3561158776283264, "rewards/rejected": 0.26840147376060486, "step": 4921 }, { "epoch": 0.8, "learning_rate": 8.916637992653991e-07, "logits/chosen": -1.1135752201080322, "logits/rejected": -1.1010043621063232, "logps/chosen": -166.6143341064453, "logps/rejected": -87.91877746582031, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 3.8985962867736816, "rewards/margins": 0.6610565185546875, "rewards/rejected": 3.237539768218994, "step": 4922 }, { "epoch": 0.8, "learning_rate": 8.915820905143872e-07, "logits/chosen": -0.10249422490596771, "logits/rejected": -0.1327664852142334, "logps/chosen": -33.82957077026367, "logps/rejected": -57.593727111816406, "loss": 0.564, "rewards/accuracies": 0.0, "rewards/chosen": 1.9872440099716187, "rewards/margins": -0.3193424940109253, "rewards/rejected": 2.306586503982544, "step": 4923 }, { "epoch": 0.8, "learning_rate": 8.915003547087583e-07, "logits/chosen": -0.5947371125221252, "logits/rejected": -0.5661386251449585, "logps/chosen": -64.4139404296875, "logps/rejected": -121.43357849121094, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 0.6973831057548523, "rewards/margins": 0.948235273361206, "rewards/rejected": -0.25085219740867615, "step": 4924 }, { "epoch": 0.8, "learning_rate": 8.914185918541592e-07, "logits/chosen": -0.8834903240203857, "logits/rejected": -0.9008318185806274, "logps/chosen": -73.55311584472656, "logps/rejected": -36.21039962768555, "loss": 0.2009, "rewards/accuracies": 1.0, "rewards/chosen": 1.3308807611465454, "rewards/margins": 1.0441310405731201, "rewards/rejected": 0.2867496609687805, "step": 4925 }, { "epoch": 0.8, "learning_rate": 8.91336801956239e-07, "logits/chosen": -0.6974055171012878, "logits/rejected": -0.6062185168266296, "logps/chosen": -67.2932357788086, "logps/rejected": -53.405601501464844, "loss": 0.5935, "rewards/accuracies": 0.0, "rewards/chosen": 2.181259870529175, "rewards/margins": -0.23386311531066895, "rewards/rejected": 2.4151229858398438, "step": 4926 }, { "epoch": 0.8, "learning_rate": 8.912549850206489e-07, "logits/chosen": -0.6247627139091492, "logits/rejected": -0.5439321398735046, "logps/chosen": -73.89288330078125, "logps/rejected": -60.360389709472656, "loss": 1.1043, "rewards/accuracies": 1.0, "rewards/chosen": 4.158596038818359, "rewards/margins": 1.6025946140289307, "rewards/rejected": 2.5560014247894287, "step": 4927 }, { "epoch": 0.8, "learning_rate": 8.911731410530412e-07, "logits/chosen": -0.536605715751648, "logits/rejected": -0.44649460911750793, "logps/chosen": -68.70307159423828, "logps/rejected": -54.35413360595703, "loss": 0.4115, "rewards/accuracies": 0.0, "rewards/chosen": 1.7600739002227783, "rewards/margins": -0.18728327751159668, "rewards/rejected": 1.947357177734375, "step": 4928 }, { "epoch": 0.8, "learning_rate": 8.910912700590709e-07, "logits/chosen": -0.4524487257003784, "logits/rejected": -0.35102298855781555, "logps/chosen": -59.93981170654297, "logps/rejected": -39.383033752441406, "loss": 0.4742, "rewards/accuracies": 0.0, "rewards/chosen": 1.3768600225448608, "rewards/margins": -0.2998455762863159, "rewards/rejected": 1.6767055988311768, "step": 4929 }, { "epoch": 0.8, "learning_rate": 8.910093720443943e-07, "logits/chosen": -0.8191598653793335, "logits/rejected": -0.6676085591316223, "logps/chosen": -56.21346664428711, "logps/rejected": -78.6198501586914, "loss": 0.6491, "rewards/accuracies": 1.0, "rewards/chosen": 2.287468433380127, "rewards/margins": 0.7326817512512207, "rewards/rejected": 1.5547866821289062, "step": 4930 }, { "epoch": 0.8, "learning_rate": 8.909274470146697e-07, "logits/chosen": -0.78033846616745, "logits/rejected": -0.17531508207321167, "logps/chosen": -141.7841033935547, "logps/rejected": -126.15092468261719, "loss": 1.034, "rewards/accuracies": 1.0, "rewards/chosen": 4.268527507781982, "rewards/margins": 0.1344013214111328, "rewards/rejected": 4.13412618637085, "step": 4931 }, { "epoch": 0.8, "learning_rate": 8.908454949755577e-07, "logits/chosen": -0.4439121186733246, "logits/rejected": -0.4439121186733246, "logps/chosen": -69.74505615234375, "logps/rejected": -69.74505615234375, "loss": 0.6683, "rewards/accuracies": 0.0, "rewards/chosen": 1.5705124139785767, "rewards/margins": 0.0, "rewards/rejected": 1.5705124139785767, "step": 4932 }, { "epoch": 0.8, "learning_rate": 8.907635159327203e-07, "logits/chosen": -0.4716147184371948, "logits/rejected": -0.41765913367271423, "logps/chosen": -67.77580261230469, "logps/rejected": -62.08461380004883, "loss": 0.6323, "rewards/accuracies": 1.0, "rewards/chosen": 1.5741561651229858, "rewards/margins": 0.2603168487548828, "rewards/rejected": 1.313839316368103, "step": 4933 }, { "epoch": 0.8, "learning_rate": 8.906815098918212e-07, "logits/chosen": -1.2312270402908325, "logits/rejected": -1.1751222610473633, "logps/chosen": -167.29891967773438, "logps/rejected": -45.50984573364258, "loss": 0.2148, "rewards/accuracies": 1.0, "rewards/chosen": 2.453456163406372, "rewards/margins": 1.2242240905761719, "rewards/rejected": 1.2292320728302002, "step": 4934 }, { "epoch": 0.8, "learning_rate": 8.905994768585265e-07, "logits/chosen": -0.8205246925354004, "logits/rejected": -0.8625975847244263, "logps/chosen": -97.91813659667969, "logps/rejected": -129.1400909423828, "loss": 2.8127, "rewards/accuracies": 0.0, "rewards/chosen": 1.3673171997070312, "rewards/margins": -4.737338542938232, "rewards/rejected": 6.104655742645264, "step": 4935 }, { "epoch": 0.8, "learning_rate": 8.905174168385038e-07, "logits/chosen": -0.4906602203845978, "logits/rejected": -0.48141637444496155, "logps/chosen": -53.860206604003906, "logps/rejected": -91.18739318847656, "loss": 0.1347, "rewards/accuracies": 1.0, "rewards/chosen": 1.6137542724609375, "rewards/margins": 1.2543007135391235, "rewards/rejected": 0.35945358872413635, "step": 4936 }, { "epoch": 0.8, "learning_rate": 8.904353298374227e-07, "logits/chosen": -1.0855119228363037, "logits/rejected": -1.0347445011138916, "logps/chosen": -102.39616394042969, "logps/rejected": -23.27591323852539, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 1.1069351434707642, "rewards/margins": 1.1897636651992798, "rewards/rejected": -0.08282852172851562, "step": 4937 }, { "epoch": 0.8, "learning_rate": 8.903532158609547e-07, "logits/chosen": -1.1788967847824097, "logits/rejected": -1.1468555927276611, "logps/chosen": -126.29681396484375, "logps/rejected": -40.713104248046875, "loss": 0.1861, "rewards/accuracies": 1.0, "rewards/chosen": 1.2794647216796875, "rewards/margins": 1.1249122619628906, "rewards/rejected": 0.15455245971679688, "step": 4938 }, { "epoch": 0.8, "learning_rate": 8.90271074914773e-07, "logits/chosen": -0.5125721096992493, "logits/rejected": -0.4305035173892975, "logps/chosen": -68.92347717285156, "logps/rejected": -46.160560607910156, "loss": 0.7714, "rewards/accuracies": 0.0, "rewards/chosen": 1.4754494428634644, "rewards/margins": -0.24407958984375, "rewards/rejected": 1.7195290327072144, "step": 4939 }, { "epoch": 0.8, "learning_rate": 8.901889070045529e-07, "logits/chosen": -0.38505974411964417, "logits/rejected": -0.4322454333305359, "logps/chosen": -235.24310302734375, "logps/rejected": -82.81018829345703, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": 3.436492919921875, "rewards/margins": 2.2001953125, "rewards/rejected": 1.236297607421875, "step": 4940 }, { "epoch": 0.8, "learning_rate": 8.901067121359711e-07, "logits/chosen": -0.6046420931816101, "logits/rejected": -0.5746994018554688, "logps/chosen": -65.62583923339844, "logps/rejected": -39.998809814453125, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": 2.019488573074341, "rewards/margins": 0.7080978155136108, "rewards/rejected": 1.31139075756073, "step": 4941 }, { "epoch": 0.8, "learning_rate": 8.900244903147069e-07, "logits/chosen": -0.7037386298179626, "logits/rejected": -0.7650662064552307, "logps/chosen": -102.54376220703125, "logps/rejected": -127.41497802734375, "loss": 0.6614, "rewards/accuracies": 0.0, "rewards/chosen": 5.06005859375, "rewards/margins": -1.007861614227295, "rewards/rejected": 6.067920207977295, "step": 4942 }, { "epoch": 0.8, "learning_rate": 8.899422415464408e-07, "logits/chosen": -0.4863213002681732, "logits/rejected": -0.4863213002681732, "logps/chosen": -47.184852600097656, "logps/rejected": -47.184852600097656, "loss": 2.0414, "rewards/accuracies": 0.0, "rewards/chosen": 0.7467452883720398, "rewards/margins": 0.0, "rewards/rejected": 0.7467452883720398, "step": 4943 }, { "epoch": 0.8, "learning_rate": 8.898599658368555e-07, "logits/chosen": -0.791012704372406, "logits/rejected": -0.7150166630744934, "logps/chosen": -89.78572082519531, "logps/rejected": -72.61703491210938, "loss": 1.8537, "rewards/accuracies": 0.0, "rewards/chosen": 0.4462753236293793, "rewards/margins": -0.5596206188201904, "rewards/rejected": 1.005895972251892, "step": 4944 }, { "epoch": 0.8, "learning_rate": 8.897776631916355e-07, "logits/chosen": -0.2788854241371155, "logits/rejected": -0.30598658323287964, "logps/chosen": -6.9169816970825195, "logps/rejected": -81.47866821289062, "loss": 0.6623, "rewards/accuracies": 1.0, "rewards/chosen": 0.17158842086791992, "rewards/margins": 0.19527845084667206, "rewards/rejected": -0.023690033704042435, "step": 4945 }, { "epoch": 0.8, "learning_rate": 8.89695333616467e-07, "logits/chosen": -0.7732512950897217, "logits/rejected": -0.7732512950897217, "logps/chosen": -70.00226593017578, "logps/rejected": -70.00226593017578, "loss": 0.4424, "rewards/accuracies": 0.0, "rewards/chosen": 2.416278839111328, "rewards/margins": 0.0, "rewards/rejected": 2.416278839111328, "step": 4946 }, { "epoch": 0.8, "learning_rate": 8.896129771170384e-07, "logits/chosen": -1.0616320371627808, "logits/rejected": -1.0430654287338257, "logps/chosen": -104.51309204101562, "logps/rejected": -113.82101440429688, "loss": 2.6829, "rewards/accuracies": 1.0, "rewards/chosen": 1.7901664972305298, "rewards/margins": 1.3595283031463623, "rewards/rejected": 0.4306381344795227, "step": 4947 }, { "epoch": 0.8, "learning_rate": 8.895305936990396e-07, "logits/chosen": -0.7454006671905518, "logits/rejected": -0.725827693939209, "logps/chosen": -125.55764770507812, "logps/rejected": -52.167999267578125, "loss": 1.1153, "rewards/accuracies": 0.0, "rewards/chosen": -0.060681916773319244, "rewards/margins": -1.304573893547058, "rewards/rejected": 1.243891954421997, "step": 4948 }, { "epoch": 0.8, "learning_rate": 8.894481833681624e-07, "logits/chosen": -0.7099075317382812, "logits/rejected": -0.698816180229187, "logps/chosen": -96.79696655273438, "logps/rejected": -74.59708404541016, "loss": 0.3546, "rewards/accuracies": 0.0, "rewards/chosen": 0.6211593747138977, "rewards/margins": -0.0009658932685852051, "rewards/rejected": 0.6221252679824829, "step": 4949 }, { "epoch": 0.8, "learning_rate": 8.893657461301008e-07, "logits/chosen": -0.6621759533882141, "logits/rejected": -0.6302967071533203, "logps/chosen": -78.7994155883789, "logps/rejected": -73.8323974609375, "loss": 1.408, "rewards/accuracies": 0.0, "rewards/chosen": 1.5184097290039062, "rewards/margins": -1.6076209545135498, "rewards/rejected": 3.126030683517456, "step": 4950 }, { "epoch": 0.8, "learning_rate": 8.892832819905504e-07, "logits/chosen": -0.5991238951683044, "logits/rejected": -0.5139564275741577, "logps/chosen": -113.71700286865234, "logps/rejected": -126.82682037353516, "loss": 1.2389, "rewards/accuracies": 0.0, "rewards/chosen": 0.758374035358429, "rewards/margins": -1.3890740871429443, "rewards/rejected": 2.1474480628967285, "step": 4951 }, { "epoch": 0.8, "learning_rate": 8.892007909552086e-07, "logits/chosen": -0.7126375436782837, "logits/rejected": -0.7700082659721375, "logps/chosen": -232.39651489257812, "logps/rejected": -108.28599548339844, "loss": 1.89, "rewards/accuracies": 0.0, "rewards/chosen": 2.7312042713165283, "rewards/margins": -3.4707534313201904, "rewards/rejected": 6.201957702636719, "step": 4952 }, { "epoch": 0.8, "learning_rate": 8.891182730297748e-07, "logits/chosen": -0.6703678369522095, "logits/rejected": -0.5934515595436096, "logps/chosen": -75.07493591308594, "logps/rejected": -105.8409423828125, "loss": 0.6323, "rewards/accuracies": 0.0, "rewards/chosen": 1.2981491088867188, "rewards/margins": -0.4612067937850952, "rewards/rejected": 1.759355902671814, "step": 4953 }, { "epoch": 0.8, "learning_rate": 8.890357282199503e-07, "logits/chosen": -0.6053453683853149, "logits/rejected": -0.5264970064163208, "logps/chosen": -130.01052856445312, "logps/rejected": -162.70025634765625, "loss": 0.83, "rewards/accuracies": 0.0, "rewards/chosen": 4.681967258453369, "rewards/margins": -1.4151291847229004, "rewards/rejected": 6.0970964431762695, "step": 4954 }, { "epoch": 0.8, "learning_rate": 8.88953156531438e-07, "logits/chosen": -0.22788895666599274, "logits/rejected": -0.1890759915113449, "logps/chosen": -93.3759765625, "logps/rejected": -139.11474609375, "loss": 0.4262, "rewards/accuracies": 1.0, "rewards/chosen": 1.2042557001113892, "rewards/margins": 1.7564010620117188, "rewards/rejected": -0.5521454215049744, "step": 4955 }, { "epoch": 0.8, "learning_rate": 8.888705579699429e-07, "logits/chosen": -0.4684213399887085, "logits/rejected": -0.46940940618515015, "logps/chosen": -103.58511352539062, "logps/rejected": -70.44339752197266, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 4.0187530517578125, "rewards/margins": 1.1140556335449219, "rewards/rejected": 2.9046974182128906, "step": 4956 }, { "epoch": 0.8, "learning_rate": 8.887879325411717e-07, "logits/chosen": -0.6745762825012207, "logits/rejected": -0.7009059190750122, "logps/chosen": -53.69688415527344, "logps/rejected": -38.39299774169922, "loss": 0.8302, "rewards/accuracies": 0.0, "rewards/chosen": 1.4363716840744019, "rewards/margins": -1.436198353767395, "rewards/rejected": 2.872570037841797, "step": 4957 }, { "epoch": 0.8, "learning_rate": 8.887052802508333e-07, "logits/chosen": -0.42718878388404846, "logits/rejected": -0.42718878388404846, "logps/chosen": -46.681583404541016, "logps/rejected": -46.681583404541016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.23239822685718536, "rewards/margins": 0.0, "rewards/rejected": 0.23239822685718536, "step": 4958 }, { "epoch": 0.8, "learning_rate": 8.886226011046377e-07, "logits/chosen": -0.6511273980140686, "logits/rejected": -0.5345824360847473, "logps/chosen": -67.43215942382812, "logps/rejected": -14.078222274780273, "loss": 1.0952, "rewards/accuracies": 1.0, "rewards/chosen": 2.2620255947113037, "rewards/margins": 1.2972888946533203, "rewards/rejected": 0.9647367596626282, "step": 4959 }, { "epoch": 0.81, "learning_rate": 8.885398951082977e-07, "logits/chosen": -0.31535205245018005, "logits/rejected": -0.33102545142173767, "logps/chosen": -18.935436248779297, "logps/rejected": -56.186981201171875, "loss": 0.3901, "rewards/accuracies": 0.0, "rewards/chosen": 0.12988929450511932, "rewards/margins": -0.11509132385253906, "rewards/rejected": 0.2449806183576584, "step": 4960 }, { "epoch": 0.81, "learning_rate": 8.884571622675274e-07, "logits/chosen": -0.9352482557296753, "logits/rejected": -0.9262773990631104, "logps/chosen": -64.89756774902344, "logps/rejected": -59.5506591796875, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 2.082127332687378, "rewards/margins": 0.22593069076538086, "rewards/rejected": 1.856196641921997, "step": 4961 }, { "epoch": 0.81, "learning_rate": 8.883744025880427e-07, "logits/chosen": -0.6238729953765869, "logits/rejected": -0.6369162201881409, "logps/chosen": -200.3421173095703, "logps/rejected": -136.27818298339844, "loss": 1.01, "rewards/accuracies": 0.0, "rewards/chosen": 3.645390272140503, "rewards/margins": -1.8339874744415283, "rewards/rejected": 5.479377746582031, "step": 4962 }, { "epoch": 0.81, "learning_rate": 8.882916160755617e-07, "logits/chosen": -0.9392294883728027, "logits/rejected": -0.8386568427085876, "logps/chosen": -58.17526626586914, "logps/rejected": -60.00197982788086, "loss": 0.6037, "rewards/accuracies": 0.0, "rewards/chosen": 2.057889938354492, "rewards/margins": -0.26039576530456543, "rewards/rejected": 2.3182857036590576, "step": 4963 }, { "epoch": 0.81, "learning_rate": 8.882088027358042e-07, "logits/chosen": -0.8481345772743225, "logits/rejected": -0.820967972278595, "logps/chosen": -146.35382080078125, "logps/rejected": -93.24763488769531, "loss": 1.0195, "rewards/accuracies": 0.0, "rewards/chosen": 0.18493805825710297, "rewards/margins": -1.805120825767517, "rewards/rejected": 1.9900588989257812, "step": 4964 }, { "epoch": 0.81, "learning_rate": 8.881259625744915e-07, "logits/chosen": -0.15607666969299316, "logits/rejected": -0.12306737899780273, "logps/chosen": -44.38424301147461, "logps/rejected": -26.421592712402344, "loss": 0.3552, "rewards/accuracies": 1.0, "rewards/chosen": 0.6070446372032166, "rewards/margins": 0.009354829788208008, "rewards/rejected": 0.5976898074150085, "step": 4965 }, { "epoch": 0.81, "learning_rate": 8.880430955973473e-07, "logits/chosen": -0.564652144908905, "logits/rejected": -0.4384879767894745, "logps/chosen": -41.630340576171875, "logps/rejected": -53.8989143371582, "loss": 1.1586, "rewards/accuracies": 0.0, "rewards/chosen": 1.3500556945800781, "rewards/margins": -0.06071889400482178, "rewards/rejected": 1.4107745885849, "step": 4966 }, { "epoch": 0.81, "learning_rate": 8.879602018100968e-07, "logits/chosen": -0.8019726276397705, "logits/rejected": -0.7589273452758789, "logps/chosen": -48.7940559387207, "logps/rejected": -46.354583740234375, "loss": 0.3155, "rewards/accuracies": 1.0, "rewards/chosen": 1.9092525243759155, "rewards/margins": 0.36393582820892334, "rewards/rejected": 1.5453166961669922, "step": 4967 }, { "epoch": 0.81, "learning_rate": 8.878772812184673e-07, "logits/chosen": -0.630861222743988, "logits/rejected": -0.591804027557373, "logps/chosen": -92.00701904296875, "logps/rejected": -51.55447006225586, "loss": 1.0608, "rewards/accuracies": 0.0, "rewards/chosen": 0.7965912222862244, "rewards/margins": -0.4207466244697571, "rewards/rejected": 1.2173378467559814, "step": 4968 }, { "epoch": 0.81, "learning_rate": 8.877943338281879e-07, "logits/chosen": -0.6726834774017334, "logits/rejected": -0.8065447211265564, "logps/chosen": -70.45065307617188, "logps/rejected": -116.24310302734375, "loss": 2.8485, "rewards/accuracies": 0.0, "rewards/chosen": 1.6227562427520752, "rewards/margins": -2.9779441356658936, "rewards/rejected": 4.600700378417969, "step": 4969 }, { "epoch": 0.81, "learning_rate": 8.877113596449894e-07, "logits/chosen": -0.43604084849357605, "logits/rejected": -0.4505678117275238, "logps/chosen": -51.93401336669922, "logps/rejected": -65.47190856933594, "loss": 0.4547, "rewards/accuracies": 0.0, "rewards/chosen": 0.5269630551338196, "rewards/margins": -0.3302467465400696, "rewards/rejected": 0.8572098016738892, "step": 4970 }, { "epoch": 0.81, "learning_rate": 8.876283586746045e-07, "logits/chosen": -0.7399082779884338, "logits/rejected": -0.6957932114601135, "logps/chosen": -54.16701126098633, "logps/rejected": -24.250885009765625, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 2.3589611053466797, "rewards/margins": 2.0270609855651855, "rewards/rejected": 0.331900030374527, "step": 4971 }, { "epoch": 0.81, "learning_rate": 8.875453309227677e-07, "logits/chosen": -0.3880506753921509, "logits/rejected": -0.42669832706451416, "logps/chosen": -81.28346252441406, "logps/rejected": -45.97517395019531, "loss": 0.5059, "rewards/accuracies": 0.0, "rewards/chosen": 1.3635787963867188, "rewards/margins": -0.5382652282714844, "rewards/rejected": 1.9018440246582031, "step": 4972 }, { "epoch": 0.81, "learning_rate": 8.874622763952155e-07, "logits/chosen": -0.6151267290115356, "logits/rejected": -0.5645315051078796, "logps/chosen": -59.50940704345703, "logps/rejected": -86.61366271972656, "loss": 0.7027, "rewards/accuracies": 1.0, "rewards/chosen": 2.0332162380218506, "rewards/margins": 0.782370924949646, "rewards/rejected": 1.2508453130722046, "step": 4973 }, { "epoch": 0.81, "learning_rate": 8.873791950976863e-07, "logits/chosen": -0.462079793214798, "logits/rejected": -0.47918200492858887, "logps/chosen": -103.51139831542969, "logps/rejected": -81.05856323242188, "loss": 1.0527, "rewards/accuracies": 1.0, "rewards/chosen": 1.544519066810608, "rewards/margins": 0.3436768054962158, "rewards/rejected": 1.200842261314392, "step": 4974 }, { "epoch": 0.81, "learning_rate": 8.872960870359202e-07, "logits/chosen": -0.539068877696991, "logits/rejected": -0.46731945872306824, "logps/chosen": -41.15364074707031, "logps/rejected": -34.14703369140625, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 1.2040703296661377, "rewards/margins": 0.22913706302642822, "rewards/rejected": 0.9749332666397095, "step": 4975 }, { "epoch": 0.81, "learning_rate": 8.87212952215659e-07, "logits/chosen": -1.1754547357559204, "logits/rejected": -1.1072053909301758, "logps/chosen": -116.4123306274414, "logps/rejected": -57.87333679199219, "loss": 0.6566, "rewards/accuracies": 0.0, "rewards/chosen": 0.636396050453186, "rewards/margins": -0.886101484298706, "rewards/rejected": 1.522497534751892, "step": 4976 }, { "epoch": 0.81, "learning_rate": 8.871297906426467e-07, "logits/chosen": -0.587151825428009, "logits/rejected": -0.5188062191009521, "logps/chosen": -74.84573364257812, "logps/rejected": -98.61734771728516, "loss": 0.7866, "rewards/accuracies": 0.0, "rewards/chosen": 1.6127815246582031, "rewards/margins": -1.3166992664337158, "rewards/rejected": 2.929480791091919, "step": 4977 }, { "epoch": 0.81, "learning_rate": 8.870466023226288e-07, "logits/chosen": -0.49364539980888367, "logits/rejected": -0.3193964958190918, "logps/chosen": -133.982666015625, "logps/rejected": -50.606712341308594, "loss": 0.061, "rewards/accuracies": 1.0, "rewards/chosen": 3.5823304653167725, "rewards/margins": 2.2111687660217285, "rewards/rejected": 1.371161699295044, "step": 4978 }, { "epoch": 0.81, "learning_rate": 8.869633872613531e-07, "logits/chosen": -0.8213898539543152, "logits/rejected": -0.7196201086044312, "logps/chosen": -77.35753631591797, "logps/rejected": -73.57733154296875, "loss": 0.661, "rewards/accuracies": 1.0, "rewards/chosen": 2.638409376144409, "rewards/margins": 0.35387563705444336, "rewards/rejected": 2.284533739089966, "step": 4979 }, { "epoch": 0.81, "learning_rate": 8.868801454645687e-07, "logits/chosen": -0.5362995862960815, "logits/rejected": -0.20831872522830963, "logps/chosen": -187.51661682128906, "logps/rejected": -28.956750869750977, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 4.403294563293457, "rewards/margins": 4.05975341796875, "rewards/rejected": 0.34354114532470703, "step": 4980 }, { "epoch": 0.81, "learning_rate": 8.86796876938027e-07, "logits/chosen": -0.583325207233429, "logits/rejected": -0.44984370470046997, "logps/chosen": -186.36581420898438, "logps/rejected": -90.10104370117188, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 5.436474800109863, "rewards/margins": 2.1605820655822754, "rewards/rejected": 3.275892734527588, "step": 4981 }, { "epoch": 0.81, "learning_rate": 8.86713581687481e-07, "logits/chosen": -0.5328864455223083, "logits/rejected": -0.5011633038520813, "logps/chosen": -44.28570556640625, "logps/rejected": -45.4256591796875, "loss": 0.4171, "rewards/accuracies": 0.0, "rewards/chosen": 1.3491379022598267, "rewards/margins": -0.2211906909942627, "rewards/rejected": 1.5703285932540894, "step": 4982 }, { "epoch": 0.81, "learning_rate": 8.866302597186856e-07, "logits/chosen": -0.2750580310821533, "logits/rejected": -0.0688549354672432, "logps/chosen": -54.12794494628906, "logps/rejected": -121.29879760742188, "loss": 1.5536, "rewards/accuracies": 0.0, "rewards/chosen": 1.3806442022323608, "rewards/margins": -1.8502365350723267, "rewards/rejected": 3.2308807373046875, "step": 4983 }, { "epoch": 0.81, "learning_rate": 8.865469110373978e-07, "logits/chosen": -0.7089822888374329, "logits/rejected": -0.6548859477043152, "logps/chosen": -83.8484878540039, "logps/rejected": -49.47691345214844, "loss": 0.4135, "rewards/accuracies": 1.0, "rewards/chosen": 1.3541358709335327, "rewards/margins": 0.2380080223083496, "rewards/rejected": 1.116127848625183, "step": 4984 }, { "epoch": 0.81, "learning_rate": 8.864635356493757e-07, "logits/chosen": -0.5621729493141174, "logits/rejected": -0.6530032157897949, "logps/chosen": -110.58755493164062, "logps/rejected": -100.12992858886719, "loss": 1.1551, "rewards/accuracies": 0.0, "rewards/chosen": 2.1106414794921875, "rewards/margins": -1.0020339488983154, "rewards/rejected": 3.112675428390503, "step": 4985 }, { "epoch": 0.81, "learning_rate": 8.863801335603801e-07, "logits/chosen": -0.6353582739830017, "logits/rejected": -0.6533865332603455, "logps/chosen": -45.71467971801758, "logps/rejected": -7.5169830322265625, "loss": 0.9571, "rewards/accuracies": 0.0, "rewards/chosen": 0.07625732570886612, "rewards/margins": -0.12196149677038193, "rewards/rejected": 0.19821882247924805, "step": 4986 }, { "epoch": 0.81, "learning_rate": 8.862967047761733e-07, "logits/chosen": -0.4374850392341614, "logits/rejected": -0.44299638271331787, "logps/chosen": -4.478924751281738, "logps/rejected": -1.6764235496520996, "loss": 0.7494, "rewards/accuracies": 0.0, "rewards/chosen": 0.10131511837244034, "rewards/margins": -0.2134667932987213, "rewards/rejected": 0.31478190422058105, "step": 4987 }, { "epoch": 0.81, "learning_rate": 8.862132493025194e-07, "logits/chosen": -0.3021842837333679, "logits/rejected": -0.3021842837333679, "logps/chosen": -22.930011749267578, "logps/rejected": -22.930011749267578, "loss": 0.9543, "rewards/accuracies": 0.0, "rewards/chosen": 0.41500282287597656, "rewards/margins": 0.0, "rewards/rejected": 0.41500282287597656, "step": 4988 }, { "epoch": 0.81, "learning_rate": 8.861297671451843e-07, "logits/chosen": -0.6007401943206787, "logits/rejected": -0.5656084418296814, "logps/chosen": -126.06944274902344, "logps/rejected": -44.9284782409668, "loss": 1.3943, "rewards/accuracies": 0.0, "rewards/chosen": 0.2677810788154602, "rewards/margins": -0.9662079215049744, "rewards/rejected": 1.2339890003204346, "step": 4989 }, { "epoch": 0.81, "learning_rate": 8.860462583099359e-07, "logits/chosen": -0.5654979944229126, "logits/rejected": -0.4586946964263916, "logps/chosen": -136.72607421875, "logps/rejected": -98.01286315917969, "loss": 1.5433, "rewards/accuracies": 1.0, "rewards/chosen": 3.77140212059021, "rewards/margins": 2.024594306945801, "rewards/rejected": 1.7468079328536987, "step": 4990 }, { "epoch": 0.81, "learning_rate": 8.859627228025438e-07, "logits/chosen": -0.7215827107429504, "logits/rejected": -0.6532360911369324, "logps/chosen": -51.0618896484375, "logps/rejected": -127.19451904296875, "loss": 1.0164, "rewards/accuracies": 0.0, "rewards/chosen": 2.619879961013794, "rewards/margins": -1.8757712841033936, "rewards/rejected": 4.4956512451171875, "step": 4991 }, { "epoch": 0.81, "learning_rate": 8.858791606287796e-07, "logits/chosen": -0.4956384599208832, "logits/rejected": -0.404758095741272, "logps/chosen": -109.50088500976562, "logps/rejected": -166.71722412109375, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 5.701028347015381, "rewards/margins": 0.3252224922180176, "rewards/rejected": 5.375805854797363, "step": 4992 }, { "epoch": 0.81, "learning_rate": 8.857955717944166e-07, "logits/chosen": -0.523346483707428, "logits/rejected": -0.49882301688194275, "logps/chosen": -37.650211334228516, "logps/rejected": -7.87915563583374, "loss": 0.1714, "rewards/accuracies": 1.0, "rewards/chosen": 2.4520745277404785, "rewards/margins": 1.4677574634552002, "rewards/rejected": 0.9843170046806335, "step": 4993 }, { "epoch": 0.81, "learning_rate": 8.8571195630523e-07, "logits/chosen": -0.7391263842582703, "logits/rejected": -0.675943911075592, "logps/chosen": -71.2095947265625, "logps/rejected": -64.65042114257812, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 3.215618848800659, "rewards/margins": 1.2846465110778809, "rewards/rejected": 1.9309723377227783, "step": 4994 }, { "epoch": 0.81, "learning_rate": 8.856283141669969e-07, "logits/chosen": -0.6566044092178345, "logits/rejected": -0.672684907913208, "logps/chosen": -90.33592224121094, "logps/rejected": -64.2073974609375, "loss": 1.2144, "rewards/accuracies": 0.0, "rewards/chosen": -0.2520461976528168, "rewards/margins": -2.1156814098358154, "rewards/rejected": 1.8636353015899658, "step": 4995 }, { "epoch": 0.81, "learning_rate": 8.855446453854962e-07, "logits/chosen": -0.6650341749191284, "logits/rejected": -0.5833301544189453, "logps/chosen": -86.57405090332031, "logps/rejected": -69.35623168945312, "loss": 2.1207, "rewards/accuracies": 1.0, "rewards/chosen": 2.2567176818847656, "rewards/margins": 0.633348822593689, "rewards/rejected": 1.6233688592910767, "step": 4996 }, { "epoch": 0.81, "learning_rate": 8.854609499665086e-07, "logits/chosen": -0.6681387424468994, "logits/rejected": -0.5992489457130432, "logps/chosen": -37.13384246826172, "logps/rejected": -8.054698944091797, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 1.8658065795898438, "rewards/margins": 1.106658935546875, "rewards/rejected": 0.759147584438324, "step": 4997 }, { "epoch": 0.81, "learning_rate": 8.853772279158165e-07, "logits/chosen": -0.8150386214256287, "logits/rejected": -0.8816933631896973, "logps/chosen": -62.725406646728516, "logps/rejected": -71.82389068603516, "loss": 0.8311, "rewards/accuracies": 0.0, "rewards/chosen": 2.023998737335205, "rewards/margins": -1.1894640922546387, "rewards/rejected": 3.2134628295898438, "step": 4998 }, { "epoch": 0.81, "learning_rate": 8.852934792392045e-07, "logits/chosen": -0.941923975944519, "logits/rejected": -0.8831928372383118, "logps/chosen": -223.8004150390625, "logps/rejected": -131.36276245117188, "loss": 1.124, "rewards/accuracies": 0.0, "rewards/chosen": 2.50714111328125, "rewards/margins": -1.8858122825622559, "rewards/rejected": 4.392953395843506, "step": 4999 }, { "epoch": 0.81, "learning_rate": 8.852097039424588e-07, "logits/chosen": -0.8507636785507202, "logits/rejected": -0.8967595100402832, "logps/chosen": -51.27069854736328, "logps/rejected": -107.79788970947266, "loss": 1.8119, "rewards/accuracies": 0.0, "rewards/chosen": 1.8550331592559814, "rewards/margins": -3.4736740589141846, "rewards/rejected": 5.328707218170166, "step": 5000 }, { "epoch": 0.81, "learning_rate": 8.851259020313673e-07, "logits/chosen": -0.7609742879867554, "logits/rejected": -0.6906108260154724, "logps/chosen": -189.61947631835938, "logps/rejected": -56.88002395629883, "loss": 0.0475, "rewards/accuracies": 1.0, "rewards/chosen": 5.4712677001953125, "rewards/margins": 5.089834213256836, "rewards/rejected": 0.38143348693847656, "step": 5001 }, { "epoch": 0.81, "learning_rate": 8.850420735117201e-07, "logits/chosen": -0.4960014820098877, "logits/rejected": -0.4960014820098877, "logps/chosen": -100.15713500976562, "logps/rejected": -100.15713500976562, "loss": 0.3524, "rewards/accuracies": 0.0, "rewards/chosen": 2.658456563949585, "rewards/margins": 0.0, "rewards/rejected": 2.658456563949585, "step": 5002 }, { "epoch": 0.81, "learning_rate": 8.84958218389309e-07, "logits/chosen": -0.84007728099823, "logits/rejected": -0.6783207654953003, "logps/chosen": -70.83221435546875, "logps/rejected": -100.50950622558594, "loss": 0.1395, "rewards/accuracies": 1.0, "rewards/chosen": 5.243550777435303, "rewards/margins": 1.735579490661621, "rewards/rejected": 3.5079712867736816, "step": 5003 }, { "epoch": 0.81, "learning_rate": 8.848743366699275e-07, "logits/chosen": -0.5618031620979309, "logits/rejected": -0.46475106477737427, "logps/chosen": -67.28714752197266, "logps/rejected": -42.52138900756836, "loss": 1.3077, "rewards/accuracies": 0.0, "rewards/chosen": 0.7173622250556946, "rewards/margins": -0.002895355224609375, "rewards/rejected": 0.720257580280304, "step": 5004 }, { "epoch": 0.81, "learning_rate": 8.84790428359371e-07, "logits/chosen": -0.5799623131752014, "logits/rejected": -0.5770272016525269, "logps/chosen": -116.15155792236328, "logps/rejected": -99.67204284667969, "loss": 0.4859, "rewards/accuracies": 0.0, "rewards/chosen": 1.2377251386642456, "rewards/margins": -0.4854689836502075, "rewards/rejected": 1.7231941223144531, "step": 5005 }, { "epoch": 0.81, "learning_rate": 8.84706493463437e-07, "logits/chosen": -0.5595676898956299, "logits/rejected": -0.6369274258613586, "logps/chosen": -129.0105438232422, "logps/rejected": -121.13658905029297, "loss": 1.0087, "rewards/accuracies": 0.0, "rewards/chosen": 0.40246278047561646, "rewards/margins": -1.554694414138794, "rewards/rejected": 1.9571571350097656, "step": 5006 }, { "epoch": 0.81, "learning_rate": 8.846225319879241e-07, "logits/chosen": -0.8678956627845764, "logits/rejected": -0.7386581897735596, "logps/chosen": -93.86690521240234, "logps/rejected": -62.97239685058594, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 5.616431713104248, "rewards/margins": 3.7269997596740723, "rewards/rejected": 1.8894318342208862, "step": 5007 }, { "epoch": 0.81, "learning_rate": 8.845385439386336e-07, "logits/chosen": -0.6791244745254517, "logits/rejected": -0.6675101518630981, "logps/chosen": -96.89605712890625, "logps/rejected": -99.65878295898438, "loss": 0.5066, "rewards/accuracies": 0.0, "rewards/chosen": 4.243936061859131, "rewards/margins": -0.5442156791687012, "rewards/rejected": 4.788151741027832, "step": 5008 }, { "epoch": 0.81, "learning_rate": 8.844545293213685e-07, "logits/chosen": -0.26474329829216003, "logits/rejected": -0.26474329829216003, "logps/chosen": -95.2014389038086, "logps/rejected": -95.2014389038086, "loss": 0.3557, "rewards/accuracies": 0.0, "rewards/chosen": 1.2714470624923706, "rewards/margins": 0.0, "rewards/rejected": 1.2714470624923706, "step": 5009 }, { "epoch": 0.81, "learning_rate": 8.843704881419331e-07, "logits/chosen": -0.3758275806903839, "logits/rejected": -0.40645432472229004, "logps/chosen": -92.65567779541016, "logps/rejected": -58.93190002441406, "loss": 0.6648, "rewards/accuracies": 0.0, "rewards/chosen": 0.7677375674247742, "rewards/margins": -0.2194313406944275, "rewards/rejected": 0.9871689081192017, "step": 5010 }, { "epoch": 0.81, "learning_rate": 8.84286420406134e-07, "logits/chosen": -0.8903379440307617, "logits/rejected": -0.7347634434700012, "logps/chosen": -109.68429565429688, "logps/rejected": -23.105438232421875, "loss": 0.0367, "rewards/accuracies": 1.0, "rewards/chosen": 3.4531280994415283, "rewards/margins": 3.204843044281006, "rewards/rejected": 0.24828509986400604, "step": 5011 }, { "epoch": 0.81, "learning_rate": 8.842023261197793e-07, "logits/chosen": -0.5684546232223511, "logits/rejected": -0.6167255640029907, "logps/chosen": -20.579355239868164, "logps/rejected": -35.14140319824219, "loss": 0.5734, "rewards/accuracies": 0.0, "rewards/chosen": 1.1230758428573608, "rewards/margins": -0.44358909130096436, "rewards/rejected": 1.5666649341583252, "step": 5012 }, { "epoch": 0.81, "learning_rate": 8.841182052886791e-07, "logits/chosen": -1.1483609676361084, "logits/rejected": -1.1311471462249756, "logps/chosen": -83.89227294921875, "logps/rejected": -85.55156707763672, "loss": 1.059, "rewards/accuracies": 0.0, "rewards/chosen": 3.71482253074646, "rewards/margins": -1.932391881942749, "rewards/rejected": 5.647214412689209, "step": 5013 }, { "epoch": 0.81, "learning_rate": 8.840340579186456e-07, "logits/chosen": -0.747028112411499, "logits/rejected": -0.75607830286026, "logps/chosen": -78.04936218261719, "logps/rejected": -126.46745300292969, "loss": 0.9239, "rewards/accuracies": 1.0, "rewards/chosen": 1.1016616821289062, "rewards/margins": 0.5561935305595398, "rewards/rejected": 0.5454681515693665, "step": 5014 }, { "epoch": 0.81, "learning_rate": 8.839498840154925e-07, "logits/chosen": -0.7036486864089966, "logits/rejected": -0.5714604258537292, "logps/chosen": -99.71009826660156, "logps/rejected": -53.694801330566406, "loss": 1.3488, "rewards/accuracies": 1.0, "rewards/chosen": 2.446674346923828, "rewards/margins": 0.5303939580917358, "rewards/rejected": 1.9162803888320923, "step": 5015 }, { "epoch": 0.81, "learning_rate": 8.838656835850353e-07, "logits/chosen": -0.4578119218349457, "logits/rejected": -0.40788233280181885, "logps/chosen": -64.08412170410156, "logps/rejected": -65.92967224121094, "loss": 0.8371, "rewards/accuracies": 1.0, "rewards/chosen": 2.5522172451019287, "rewards/margins": 0.9462021589279175, "rewards/rejected": 1.6060150861740112, "step": 5016 }, { "epoch": 0.81, "learning_rate": 8.837814566330916e-07, "logits/chosen": -0.7563418745994568, "logits/rejected": -0.6620134115219116, "logps/chosen": -73.02733612060547, "logps/rejected": -65.50565338134766, "loss": 0.303, "rewards/accuracies": 1.0, "rewards/chosen": 1.59539794921875, "rewards/margins": 0.3215904235839844, "rewards/rejected": 1.2738075256347656, "step": 5017 }, { "epoch": 0.81, "learning_rate": 8.836972031654806e-07, "logits/chosen": -0.3938711881637573, "logits/rejected": -0.4463716447353363, "logps/chosen": -62.10547637939453, "logps/rejected": -56.23182678222656, "loss": 1.0374, "rewards/accuracies": 0.0, "rewards/chosen": 0.5026909112930298, "rewards/margins": -1.4221992492675781, "rewards/rejected": 1.924890160560608, "step": 5018 }, { "epoch": 0.81, "learning_rate": 8.836129231880234e-07, "logits/chosen": -0.33217450976371765, "logits/rejected": -0.34388068318367004, "logps/chosen": -119.09246826171875, "logps/rejected": -41.89175033569336, "loss": 0.6166, "rewards/accuracies": 0.0, "rewards/chosen": 0.1078697219491005, "rewards/margins": -0.8227226138114929, "rewards/rejected": 0.9305923581123352, "step": 5019 }, { "epoch": 0.81, "learning_rate": 8.83528616706543e-07, "logits/chosen": -0.3703332841396332, "logits/rejected": -0.3703332841396332, "logps/chosen": -3.9577696323394775, "logps/rejected": -3.9577696323394775, "loss": 0.4082, "rewards/accuracies": 0.0, "rewards/chosen": 0.47572270035743713, "rewards/margins": 0.0, "rewards/rejected": 0.47572270035743713, "step": 5020 }, { "epoch": 0.81, "learning_rate": 8.834442837268641e-07, "logits/chosen": -0.6738875508308411, "logits/rejected": -0.6039978265762329, "logps/chosen": -94.07386016845703, "logps/rejected": -105.70185089111328, "loss": 0.6951, "rewards/accuracies": 1.0, "rewards/chosen": 2.661252737045288, "rewards/margins": 1.333027720451355, "rewards/rejected": 1.328225016593933, "step": 5021 }, { "epoch": 0.82, "learning_rate": 8.833599242548135e-07, "logits/chosen": -0.7437266111373901, "logits/rejected": -0.7955213785171509, "logps/chosen": -124.5521240234375, "logps/rejected": -195.015869140625, "loss": 0.7799, "rewards/accuracies": 0.0, "rewards/chosen": 5.257777690887451, "rewards/margins": -1.3050518035888672, "rewards/rejected": 6.562829494476318, "step": 5022 }, { "epoch": 0.82, "learning_rate": 8.832755382962195e-07, "logits/chosen": -0.37137648463249207, "logits/rejected": -0.3467061221599579, "logps/chosen": -40.32929992675781, "logps/rejected": -5.652756690979004, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.615142822265625, "rewards/margins": 0.08512067794799805, "rewards/rejected": 0.530022144317627, "step": 5023 }, { "epoch": 0.82, "learning_rate": 8.831911258569124e-07, "logits/chosen": -0.618238627910614, "logits/rejected": -0.4551829993724823, "logps/chosen": -80.11700439453125, "logps/rejected": -59.7302360534668, "loss": 0.585, "rewards/accuracies": 1.0, "rewards/chosen": 2.877095937728882, "rewards/margins": 1.2153416872024536, "rewards/rejected": 1.6617542505264282, "step": 5024 }, { "epoch": 0.82, "learning_rate": 8.831066869427241e-07, "logits/chosen": -0.4777848720550537, "logits/rejected": -0.47928670048713684, "logps/chosen": -5.249264240264893, "logps/rejected": -2.8074402809143066, "loss": 0.5503, "rewards/accuracies": 0.0, "rewards/chosen": 0.23809833824634552, "rewards/margins": -0.1053343266248703, "rewards/rejected": 0.3434326648712158, "step": 5025 }, { "epoch": 0.82, "learning_rate": 8.83022221559489e-07, "logits/chosen": -0.9132400751113892, "logits/rejected": -0.8629641532897949, "logps/chosen": -112.7747802734375, "logps/rejected": -64.2891845703125, "loss": 0.8353, "rewards/accuracies": 0.0, "rewards/chosen": 0.4654190242290497, "rewards/margins": -0.6828330755233765, "rewards/rejected": 1.1482521295547485, "step": 5026 }, { "epoch": 0.82, "learning_rate": 8.829377297130425e-07, "logits/chosen": -0.5442065596580505, "logits/rejected": -0.5344299674034119, "logps/chosen": -30.844404220581055, "logps/rejected": -4.627311706542969, "loss": 0.391, "rewards/accuracies": 1.0, "rewards/chosen": 0.2814466655254364, "rewards/margins": 0.14703251421451569, "rewards/rejected": 0.13441415131092072, "step": 5027 }, { "epoch": 0.82, "learning_rate": 8.828532114092222e-07, "logits/chosen": -0.3332395553588867, "logits/rejected": -0.33520033955574036, "logps/chosen": -14.692307472229004, "logps/rejected": -4.655635356903076, "loss": 0.9151, "rewards/accuracies": 0.0, "rewards/chosen": -0.24761943519115448, "rewards/margins": -0.4011843800544739, "rewards/rejected": 0.1535649299621582, "step": 5028 }, { "epoch": 0.82, "learning_rate": 8.827686666538676e-07, "logits/chosen": -0.7683338522911072, "logits/rejected": -0.7481285333633423, "logps/chosen": -85.21554565429688, "logps/rejected": -23.754817962646484, "loss": 0.7089, "rewards/accuracies": 1.0, "rewards/chosen": 1.3090591430664062, "rewards/margins": 0.9223735332489014, "rewards/rejected": 0.3866855800151825, "step": 5029 }, { "epoch": 0.82, "learning_rate": 8.826840954528198e-07, "logits/chosen": -0.7143526673316956, "logits/rejected": -0.7344022393226624, "logps/chosen": -72.03231811523438, "logps/rejected": -48.86112976074219, "loss": 0.8685, "rewards/accuracies": 1.0, "rewards/chosen": 2.2116456031799316, "rewards/margins": 0.14816975593566895, "rewards/rejected": 2.0634758472442627, "step": 5030 }, { "epoch": 0.82, "learning_rate": 8.825994978119223e-07, "logits/chosen": -0.80843585729599, "logits/rejected": -0.7200462222099304, "logps/chosen": -100.88764190673828, "logps/rejected": -63.62886047363281, "loss": 1.1462, "rewards/accuracies": 0.0, "rewards/chosen": 1.0803428888320923, "rewards/margins": -1.010537028312683, "rewards/rejected": 2.0908799171447754, "step": 5031 }, { "epoch": 0.82, "learning_rate": 8.825148737370194e-07, "logits/chosen": -0.3370099663734436, "logits/rejected": -0.3415180742740631, "logps/chosen": -4.224198341369629, "logps/rejected": -2.9523062705993652, "loss": 0.3943, "rewards/accuracies": 0.0, "rewards/chosen": 0.13100901246070862, "rewards/margins": -0.17163795232772827, "rewards/rejected": 0.3026469647884369, "step": 5032 }, { "epoch": 0.82, "learning_rate": 8.824302232339582e-07, "logits/chosen": -0.6636887192726135, "logits/rejected": -0.5531206727027893, "logps/chosen": -117.450927734375, "logps/rejected": -53.575164794921875, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 4.234694004058838, "rewards/margins": 2.5756752490997314, "rewards/rejected": 1.6590187549591064, "step": 5033 }, { "epoch": 0.82, "learning_rate": 8.823455463085873e-07, "logits/chosen": -0.7472909092903137, "logits/rejected": -0.7286720275878906, "logps/chosen": -176.57249450683594, "logps/rejected": -16.659923553466797, "loss": 1.0317, "rewards/accuracies": 1.0, "rewards/chosen": 0.9641799926757812, "rewards/margins": 0.7545154690742493, "rewards/rejected": 0.20966453850269318, "step": 5034 }, { "epoch": 0.82, "learning_rate": 8.822608429667569e-07, "logits/chosen": -0.9103043079376221, "logits/rejected": -0.8098436594009399, "logps/chosen": -67.4527816772461, "logps/rejected": -44.60210418701172, "loss": 0.3008, "rewards/accuracies": 1.0, "rewards/chosen": 2.809457540512085, "rewards/margins": 1.600469708442688, "rewards/rejected": 1.208987832069397, "step": 5035 }, { "epoch": 0.82, "learning_rate": 8.821761132143191e-07, "logits/chosen": -0.5559922456741333, "logits/rejected": -0.5559922456741333, "logps/chosen": -48.642311096191406, "logps/rejected": -48.642311096191406, "loss": 0.5504, "rewards/accuracies": 0.0, "rewards/chosen": 1.766832709312439, "rewards/margins": 0.0, "rewards/rejected": 1.766832709312439, "step": 5036 }, { "epoch": 0.82, "learning_rate": 8.820913570571281e-07, "logits/chosen": -0.47306567430496216, "logits/rejected": -0.43165910243988037, "logps/chosen": -91.35786437988281, "logps/rejected": -62.50773620605469, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": 1.8514305353164673, "rewards/margins": 0.21046829223632812, "rewards/rejected": 1.6409622430801392, "step": 5037 }, { "epoch": 0.82, "learning_rate": 8.820065745010398e-07, "logits/chosen": -0.9454962015151978, "logits/rejected": -0.8767807483673096, "logps/chosen": -161.7334747314453, "logps/rejected": -180.62628173828125, "loss": 1.8284, "rewards/accuracies": 0.0, "rewards/chosen": 3.6751160621643066, "rewards/margins": -3.571669101715088, "rewards/rejected": 7.2467851638793945, "step": 5038 }, { "epoch": 0.82, "learning_rate": 8.819217655519117e-07, "logits/chosen": -0.6978054642677307, "logits/rejected": -0.5392066240310669, "logps/chosen": -150.69876098632812, "logps/rejected": -195.20449829101562, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 5.065695285797119, "rewards/margins": 0.6239290237426758, "rewards/rejected": 4.441766262054443, "step": 5039 }, { "epoch": 0.82, "learning_rate": 8.818369302156033e-07, "logits/chosen": -0.5990845561027527, "logits/rejected": -0.5994852185249329, "logps/chosen": -52.43824768066406, "logps/rejected": -42.276451110839844, "loss": 0.5706, "rewards/accuracies": 0.0, "rewards/chosen": 0.6453613638877869, "rewards/margins": -0.0984576940536499, "rewards/rejected": 0.7438190579414368, "step": 5040 }, { "epoch": 0.82, "learning_rate": 8.817520684979761e-07, "logits/chosen": -0.845139741897583, "logits/rejected": -0.7796811461448669, "logps/chosen": -96.88160705566406, "logps/rejected": -24.145328521728516, "loss": 0.9792, "rewards/accuracies": 1.0, "rewards/chosen": 1.8558616638183594, "rewards/margins": 1.4828790426254272, "rewards/rejected": 0.37298259139060974, "step": 5041 }, { "epoch": 0.82, "learning_rate": 8.816671804048932e-07, "logits/chosen": -0.7907055020332336, "logits/rejected": -0.7418784499168396, "logps/chosen": -70.15653991699219, "logps/rejected": -93.02439880371094, "loss": 1.1436, "rewards/accuracies": 1.0, "rewards/chosen": 2.0038833618164062, "rewards/margins": 0.07361674308776855, "rewards/rejected": 1.9302666187286377, "step": 5042 }, { "epoch": 0.82, "learning_rate": 8.815822659422194e-07, "logits/chosen": -0.4768015146255493, "logits/rejected": -0.3807058036327362, "logps/chosen": -142.00735473632812, "logps/rejected": -82.39268493652344, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 4.869734287261963, "rewards/margins": 2.016979217529297, "rewards/rejected": 2.852755069732666, "step": 5043 }, { "epoch": 0.82, "learning_rate": 8.814973251158217e-07, "logits/chosen": -0.5418577790260315, "logits/rejected": -0.5595048069953918, "logps/chosen": -60.23379135131836, "logps/rejected": -100.43975830078125, "loss": 0.9634, "rewards/accuracies": 1.0, "rewards/chosen": 1.1388187408447266, "rewards/margins": 0.24333077669143677, "rewards/rejected": 0.8954879641532898, "step": 5044 }, { "epoch": 0.82, "learning_rate": 8.814123579315684e-07, "logits/chosen": -0.6150698065757751, "logits/rejected": -0.4559671878814697, "logps/chosen": -56.72565460205078, "logps/rejected": -59.40343475341797, "loss": 1.2631, "rewards/accuracies": 1.0, "rewards/chosen": 1.78729248046875, "rewards/margins": 1.5778892040252686, "rewards/rejected": 0.20940323173999786, "step": 5045 }, { "epoch": 0.82, "learning_rate": 8.813273643953303e-07, "logits/chosen": -0.3989028036594391, "logits/rejected": -0.4541231691837311, "logps/chosen": -69.14215087890625, "logps/rejected": -59.579872131347656, "loss": 0.9362, "rewards/accuracies": 1.0, "rewards/chosen": 1.4733933210372925, "rewards/margins": 0.6037041544914246, "rewards/rejected": 0.8696891665458679, "step": 5046 }, { "epoch": 0.82, "learning_rate": 8.812423445129795e-07, "logits/chosen": -0.852038562297821, "logits/rejected": -0.8532293438911438, "logps/chosen": -138.58612060546875, "logps/rejected": -67.51932525634766, "loss": 1.047, "rewards/accuracies": 0.0, "rewards/chosen": 0.8958297967910767, "rewards/margins": -1.5616294145584106, "rewards/rejected": 2.4574592113494873, "step": 5047 }, { "epoch": 0.82, "learning_rate": 8.811572982903899e-07, "logits/chosen": -0.42422670125961304, "logits/rejected": -0.18834251165390015, "logps/chosen": -97.78368377685547, "logps/rejected": -23.01720428466797, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 3.2975380420684814, "rewards/margins": 3.0675926208496094, "rewards/rejected": 0.2299453765153885, "step": 5048 }, { "epoch": 0.82, "learning_rate": 8.810722257334375e-07, "logits/chosen": -0.45854419469833374, "logits/rejected": -0.38339826464653015, "logps/chosen": -42.139984130859375, "logps/rejected": -27.097713470458984, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 1.6555442810058594, "rewards/margins": 0.4656146764755249, "rewards/rejected": 1.1899296045303345, "step": 5049 }, { "epoch": 0.82, "learning_rate": 8.809871268480003e-07, "logits/chosen": -0.5224499106407166, "logits/rejected": -0.4479682445526123, "logps/chosen": -69.81434631347656, "logps/rejected": -55.93269729614258, "loss": 0.6298, "rewards/accuracies": 1.0, "rewards/chosen": 3.1125900745391846, "rewards/margins": 0.4171290397644043, "rewards/rejected": 2.6954610347747803, "step": 5050 }, { "epoch": 0.82, "learning_rate": 8.809020016399573e-07, "logits/chosen": -1.0219581127166748, "logits/rejected": -0.9790266752243042, "logps/chosen": -309.2884216308594, "logps/rejected": -235.45977783203125, "loss": 2.3176, "rewards/accuracies": 0.0, "rewards/chosen": 3.3330445289611816, "rewards/margins": -4.560143947601318, "rewards/rejected": 7.8931884765625, "step": 5051 }, { "epoch": 0.82, "learning_rate": 8.808168501151903e-07, "logits/chosen": -0.7707440853118896, "logits/rejected": -0.7244085669517517, "logps/chosen": -53.37940216064453, "logps/rejected": -47.93522644042969, "loss": 0.4602, "rewards/accuracies": 1.0, "rewards/chosen": 1.914986491203308, "rewards/margins": 1.1358788013458252, "rewards/rejected": 0.7791076898574829, "step": 5052 }, { "epoch": 0.82, "learning_rate": 8.807316722795821e-07, "logits/chosen": -0.7291768789291382, "logits/rejected": -0.6870371699333191, "logps/chosen": -137.67950439453125, "logps/rejected": -42.94798278808594, "loss": 0.714, "rewards/accuracies": 1.0, "rewards/chosen": 4.4319658279418945, "rewards/margins": 3.200439929962158, "rewards/rejected": 1.2315257787704468, "step": 5053 }, { "epoch": 0.82, "learning_rate": 8.80646468139018e-07, "logits/chosen": -0.9311435222625732, "logits/rejected": -0.6887251138687134, "logps/chosen": -117.67191314697266, "logps/rejected": -69.76457214355469, "loss": 0.4812, "rewards/accuracies": 1.0, "rewards/chosen": 5.90607213973999, "rewards/margins": 4.02903413772583, "rewards/rejected": 1.8770378828048706, "step": 5054 }, { "epoch": 0.82, "learning_rate": 8.805612376993846e-07, "logits/chosen": -0.37755030393600464, "logits/rejected": -0.37755030393600464, "logps/chosen": -87.61026000976562, "logps/rejected": -87.61026000976562, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 1.5997971296310425, "rewards/margins": 0.0, "rewards/rejected": 1.5997971296310425, "step": 5055 }, { "epoch": 0.82, "learning_rate": 8.804759809665706e-07, "logits/chosen": -0.8250773549079895, "logits/rejected": -0.762877881526947, "logps/chosen": -96.26231384277344, "logps/rejected": -140.39785766601562, "loss": 3.298, "rewards/accuracies": 0.0, "rewards/chosen": 0.8897461295127869, "rewards/margins": -5.553204536437988, "rewards/rejected": 6.44295072555542, "step": 5056 }, { "epoch": 0.82, "learning_rate": 8.803906979464664e-07, "logits/chosen": -0.4267876148223877, "logits/rejected": -0.4576779007911682, "logps/chosen": -104.59539031982422, "logps/rejected": -143.06597900390625, "loss": 1.2589, "rewards/accuracies": 0.0, "rewards/chosen": 4.037729740142822, "rewards/margins": -2.3700079917907715, "rewards/rejected": 6.407737731933594, "step": 5057 }, { "epoch": 0.82, "learning_rate": 8.803053886449643e-07, "logits/chosen": -0.14114639163017273, "logits/rejected": -0.15421664714813232, "logps/chosen": -60.862525939941406, "logps/rejected": -61.03207778930664, "loss": 0.7459, "rewards/accuracies": 1.0, "rewards/chosen": 0.8812019228935242, "rewards/margins": 0.33055758476257324, "rewards/rejected": 0.5506443381309509, "step": 5058 }, { "epoch": 0.82, "learning_rate": 8.802200530679584e-07, "logits/chosen": -0.6391236782073975, "logits/rejected": -0.5017802119255066, "logps/chosen": -96.09668731689453, "logps/rejected": -31.35174560546875, "loss": 0.1449, "rewards/accuracies": 1.0, "rewards/chosen": 1.3911689519882202, "rewards/margins": 1.2346007823944092, "rewards/rejected": 0.15656815469264984, "step": 5059 }, { "epoch": 0.82, "learning_rate": 8.801346912213443e-07, "logits/chosen": -0.6190007925033569, "logits/rejected": -0.5839466452598572, "logps/chosen": -48.585113525390625, "logps/rejected": -60.41181945800781, "loss": 0.3956, "rewards/accuracies": 0.0, "rewards/chosen": 1.516434907913208, "rewards/margins": -0.09092438220977783, "rewards/rejected": 1.6073592901229858, "step": 5060 }, { "epoch": 0.82, "learning_rate": 8.800493031110201e-07, "logits/chosen": -0.5022808909416199, "logits/rejected": -0.4663854241371155, "logps/chosen": -46.512290954589844, "logps/rejected": -72.09230041503906, "loss": 1.948, "rewards/accuracies": 0.0, "rewards/chosen": 0.7541977167129517, "rewards/margins": -2.7167129516601562, "rewards/rejected": 3.4709107875823975, "step": 5061 }, { "epoch": 0.82, "learning_rate": 8.79963888742885e-07, "logits/chosen": -0.6275636553764343, "logits/rejected": -0.48307177424430847, "logps/chosen": -108.00288391113281, "logps/rejected": -84.26277160644531, "loss": 0.2219, "rewards/accuracies": 1.0, "rewards/chosen": 1.560113549232483, "rewards/margins": 0.7628105282783508, "rewards/rejected": 0.7973030209541321, "step": 5062 }, { "epoch": 0.82, "learning_rate": 8.798784481228404e-07, "logits/chosen": -0.9522988200187683, "logits/rejected": -0.8626033663749695, "logps/chosen": -140.71710205078125, "logps/rejected": -21.216487884521484, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 6.596493721008301, "rewards/margins": 6.188419818878174, "rewards/rejected": 0.4080738127231598, "step": 5063 }, { "epoch": 0.82, "learning_rate": 8.797929812567896e-07, "logits/chosen": -0.9632625579833984, "logits/rejected": -1.0204460620880127, "logps/chosen": -252.7945556640625, "logps/rejected": -182.09490966796875, "loss": 0.6055, "rewards/accuracies": 0.0, "rewards/chosen": 3.5848419666290283, "rewards/margins": -0.7772324085235596, "rewards/rejected": 4.362074375152588, "step": 5064 }, { "epoch": 0.82, "learning_rate": 8.797074881506374e-07, "logits/chosen": -0.7231829762458801, "logits/rejected": -0.567773699760437, "logps/chosen": -130.08392333984375, "logps/rejected": -94.62408447265625, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 3.8035004138946533, "rewards/margins": 1.7016205787658691, "rewards/rejected": 2.101879835128784, "step": 5065 }, { "epoch": 0.82, "learning_rate": 8.796219688102905e-07, "logits/chosen": -0.37773555517196655, "logits/rejected": -0.32295915484428406, "logps/chosen": -62.12239074707031, "logps/rejected": -84.20527648925781, "loss": 1.4371, "rewards/accuracies": 0.0, "rewards/chosen": 2.0874688625335693, "rewards/margins": -0.023737192153930664, "rewards/rejected": 2.1112060546875, "step": 5066 }, { "epoch": 0.82, "learning_rate": 8.795364232416576e-07, "logits/chosen": -0.6566401124000549, "logits/rejected": -0.5935919284820557, "logps/chosen": -118.79566955566406, "logps/rejected": -131.98822021484375, "loss": 1.1977, "rewards/accuracies": 0.0, "rewards/chosen": 2.652888536453247, "rewards/margins": -2.290663003921509, "rewards/rejected": 4.943551540374756, "step": 5067 }, { "epoch": 0.82, "learning_rate": 8.794508514506491e-07, "logits/chosen": -0.6988492608070374, "logits/rejected": -0.5810402631759644, "logps/chosen": -63.41693115234375, "logps/rejected": -50.006195068359375, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 2.5662949085235596, "rewards/margins": 1.5069175958633423, "rewards/rejected": 1.0593773126602173, "step": 5068 }, { "epoch": 0.82, "learning_rate": 8.793652534431772e-07, "logits/chosen": -1.0506706237792969, "logits/rejected": -1.0158917903900146, "logps/chosen": -141.4361114501953, "logps/rejected": -110.13198852539062, "loss": 0.8447, "rewards/accuracies": 0.0, "rewards/chosen": 1.1606963872909546, "rewards/margins": -0.8756715059280396, "rewards/rejected": 2.036367893218994, "step": 5069 }, { "epoch": 0.82, "learning_rate": 8.792796292251559e-07, "logits/chosen": -1.0792362689971924, "logits/rejected": -1.0315028429031372, "logps/chosen": -137.533935546875, "logps/rejected": -38.631866455078125, "loss": 0.1462, "rewards/accuracies": 1.0, "rewards/chosen": 1.4395843744277954, "rewards/margins": 1.2348591089248657, "rewards/rejected": 0.2047252655029297, "step": 5070 }, { "epoch": 0.82, "learning_rate": 8.791939788025009e-07, "logits/chosen": -0.5697917938232422, "logits/rejected": -0.5349869728088379, "logps/chosen": -52.31100082397461, "logps/rejected": -54.578773498535156, "loss": 0.3035, "rewards/accuracies": 1.0, "rewards/chosen": 0.6649090051651001, "rewards/margins": 0.18441620469093323, "rewards/rejected": 0.48049280047416687, "step": 5071 }, { "epoch": 0.82, "learning_rate": 8.7910830218113e-07, "logits/chosen": -0.27193692326545715, "logits/rejected": -0.27193692326545715, "logps/chosen": -58.13637924194336, "logps/rejected": -58.13637924194336, "loss": 0.6204, "rewards/accuracies": 0.0, "rewards/chosen": 0.5533356070518494, "rewards/margins": 0.0, "rewards/rejected": 0.5533356070518494, "step": 5072 }, { "epoch": 0.82, "learning_rate": 8.790225993669624e-07, "logits/chosen": -0.4941234588623047, "logits/rejected": -0.47901198267936707, "logps/chosen": -35.20458221435547, "logps/rejected": -55.58676528930664, "loss": 1.2667, "rewards/accuracies": 0.0, "rewards/chosen": 1.5181785821914673, "rewards/margins": -0.916016697883606, "rewards/rejected": 2.4341952800750732, "step": 5073 }, { "epoch": 0.82, "learning_rate": 8.789368703659198e-07, "logits/chosen": -0.36979272961616516, "logits/rejected": -0.4469231367111206, "logps/chosen": -68.14071655273438, "logps/rejected": -63.62433624267578, "loss": 0.5579, "rewards/accuracies": 0.0, "rewards/chosen": 1.3322190046310425, "rewards/margins": -0.6380202770233154, "rewards/rejected": 1.970239281654358, "step": 5074 }, { "epoch": 0.82, "learning_rate": 8.788511151839248e-07, "logits/chosen": -0.788622260093689, "logits/rejected": -0.813618540763855, "logps/chosen": -171.2999725341797, "logps/rejected": -149.1922149658203, "loss": 1.6225, "rewards/accuracies": 0.0, "rewards/chosen": 3.555140733718872, "rewards/margins": -3.02530837059021, "rewards/rejected": 6.580449104309082, "step": 5075 }, { "epoch": 0.82, "learning_rate": 8.787653338269026e-07, "logits/chosen": -0.3766130805015564, "logits/rejected": -0.2788918912410736, "logps/chosen": -103.87882995605469, "logps/rejected": -73.396484375, "loss": 2.7076, "rewards/accuracies": 0.0, "rewards/chosen": 1.0340737104415894, "rewards/margins": -0.9437377452850342, "rewards/rejected": 1.9778114557266235, "step": 5076 }, { "epoch": 0.82, "learning_rate": 8.786795263007796e-07, "logits/chosen": -0.6601869463920593, "logits/rejected": -0.5853754281997681, "logps/chosen": -82.01701354980469, "logps/rejected": -62.57719421386719, "loss": 0.4391, "rewards/accuracies": 0.0, "rewards/chosen": 2.157705068588257, "rewards/margins": -0.25650691986083984, "rewards/rejected": 2.4142119884490967, "step": 5077 }, { "epoch": 0.82, "learning_rate": 8.785936926114846e-07, "logits/chosen": -0.16094805300235748, "logits/rejected": -0.16094805300235748, "logps/chosen": -46.952293395996094, "logps/rejected": -46.952293395996094, "loss": 0.6643, "rewards/accuracies": 0.0, "rewards/chosen": 1.957830786705017, "rewards/margins": 0.0, "rewards/rejected": 1.957830786705017, "step": 5078 }, { "epoch": 0.82, "learning_rate": 8.785078327649476e-07, "logits/chosen": -0.8695249557495117, "logits/rejected": -0.8171244263648987, "logps/chosen": -148.66082763671875, "logps/rejected": -119.25825500488281, "loss": 1.2519, "rewards/accuracies": 0.0, "rewards/chosen": 2.2697510719299316, "rewards/margins": -2.395205497741699, "rewards/rejected": 4.664956569671631, "step": 5079 }, { "epoch": 0.82, "learning_rate": 8.784219467671008e-07, "logits/chosen": -0.4294567406177521, "logits/rejected": -0.5984180569648743, "logps/chosen": -38.06649398803711, "logps/rejected": -37.41066360473633, "loss": 0.6726, "rewards/accuracies": 0.0, "rewards/chosen": 1.152164101600647, "rewards/margins": -1.0297852754592896, "rewards/rejected": 2.1819493770599365, "step": 5080 }, { "epoch": 0.82, "learning_rate": 8.783360346238783e-07, "logits/chosen": -0.6946665048599243, "logits/rejected": -0.6497092843055725, "logps/chosen": -124.32300567626953, "logps/rejected": -54.008033752441406, "loss": 1.099, "rewards/accuracies": 0.0, "rewards/chosen": 1.4437614679336548, "rewards/margins": -0.5028121471405029, "rewards/rejected": 1.9465736150741577, "step": 5081 }, { "epoch": 0.82, "learning_rate": 8.782500963412156e-07, "logits/chosen": -0.48387765884399414, "logits/rejected": -0.3591809570789337, "logps/chosen": -88.98036193847656, "logps/rejected": -66.15789031982422, "loss": 1.6969, "rewards/accuracies": 1.0, "rewards/chosen": 2.468065023422241, "rewards/margins": 0.3617210388183594, "rewards/rejected": 2.106343984603882, "step": 5082 }, { "epoch": 0.83, "learning_rate": 8.781641319250501e-07, "logits/chosen": -0.2287181317806244, "logits/rejected": -0.285970002412796, "logps/chosen": -82.39595031738281, "logps/rejected": -63.92927932739258, "loss": 1.6074, "rewards/accuracies": 0.0, "rewards/chosen": 1.7458076477050781, "rewards/margins": -0.2353130578994751, "rewards/rejected": 1.9811207056045532, "step": 5083 }, { "epoch": 0.83, "learning_rate": 8.780781413813215e-07, "logits/chosen": -1.0964618921279907, "logits/rejected": -0.9448176026344299, "logps/chosen": -123.6804428100586, "logps/rejected": -111.13943481445312, "loss": 0.2105, "rewards/accuracies": 1.0, "rewards/chosen": 4.297829627990723, "rewards/margins": 1.3019037246704102, "rewards/rejected": 2.9959259033203125, "step": 5084 }, { "epoch": 0.83, "learning_rate": 8.779921247159707e-07, "logits/chosen": -1.0862613916397095, "logits/rejected": -1.0354502201080322, "logps/chosen": -114.06929779052734, "logps/rejected": -67.53895568847656, "loss": 1.462, "rewards/accuracies": 0.0, "rewards/chosen": 1.1271759271621704, "rewards/margins": -0.7372375726699829, "rewards/rejected": 1.8644134998321533, "step": 5085 }, { "epoch": 0.83, "learning_rate": 8.779060819349406e-07, "logits/chosen": -0.5089236497879028, "logits/rejected": -0.5734708905220032, "logps/chosen": -4.684945106506348, "logps/rejected": -57.93170166015625, "loss": 2.5765, "rewards/accuracies": 0.0, "rewards/chosen": 0.5978595614433289, "rewards/margins": -2.18894624710083, "rewards/rejected": 2.7868058681488037, "step": 5086 }, { "epoch": 0.83, "learning_rate": 8.778200130441761e-07, "logits/chosen": -0.3930470645427704, "logits/rejected": -0.3930470645427704, "logps/chosen": -73.16120910644531, "logps/rejected": -73.16120910644531, "loss": 1.8355, "rewards/accuracies": 0.0, "rewards/chosen": 0.9897255301475525, "rewards/margins": 0.0, "rewards/rejected": 0.9897255301475525, "step": 5087 }, { "epoch": 0.83, "learning_rate": 8.777339180496237e-07, "logits/chosen": -0.40587925910949707, "logits/rejected": -0.2993561327457428, "logps/chosen": -48.264251708984375, "logps/rejected": -80.04359436035156, "loss": 0.6842, "rewards/accuracies": 0.0, "rewards/chosen": 1.8989990949630737, "rewards/margins": -0.789751410484314, "rewards/rejected": 2.6887505054473877, "step": 5088 }, { "epoch": 0.83, "learning_rate": 8.776477969572315e-07, "logits/chosen": -0.17421968281269073, "logits/rejected": -0.20976899564266205, "logps/chosen": -66.83172607421875, "logps/rejected": -80.68209838867188, "loss": 0.976, "rewards/accuracies": 0.0, "rewards/chosen": 1.4339516162872314, "rewards/margins": -1.6337502002716064, "rewards/rejected": 3.067701816558838, "step": 5089 }, { "epoch": 0.83, "learning_rate": 8.775616497729502e-07, "logits/chosen": -1.0821959972381592, "logits/rejected": -0.8837949633598328, "logps/chosen": -121.55685424804688, "logps/rejected": -184.65423583984375, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 7.227672100067139, "rewards/margins": 0.33683347702026367, "rewards/rejected": 6.890838623046875, "step": 5090 }, { "epoch": 0.83, "learning_rate": 8.774754765027313e-07, "logits/chosen": -0.6965256929397583, "logits/rejected": -0.7251887321472168, "logps/chosen": -254.7539825439453, "logps/rejected": -96.10417175292969, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 3.7257003784179688, "rewards/margins": 1.6578400135040283, "rewards/rejected": 2.0678603649139404, "step": 5091 }, { "epoch": 0.83, "learning_rate": 8.773892771525284e-07, "logits/chosen": -0.22348932921886444, "logits/rejected": -0.19782744348049164, "logps/chosen": -10.402714729309082, "logps/rejected": -19.579132080078125, "loss": 0.4033, "rewards/accuracies": 0.0, "rewards/chosen": 0.020083999261260033, "rewards/margins": -0.1250530332326889, "rewards/rejected": 0.1451370269060135, "step": 5092 }, { "epoch": 0.83, "learning_rate": 8.773030517282977e-07, "logits/chosen": -0.6675097942352295, "logits/rejected": -0.5806452631950378, "logps/chosen": -65.20134735107422, "logps/rejected": -54.513362884521484, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 3.3105385303497314, "rewards/margins": 0.9144184589385986, "rewards/rejected": 2.396120071411133, "step": 5093 }, { "epoch": 0.83, "learning_rate": 8.772168002359961e-07, "logits/chosen": -0.46935972571372986, "logits/rejected": -0.45776331424713135, "logps/chosen": -32.77476501464844, "logps/rejected": -24.582088470458984, "loss": 0.6158, "rewards/accuracies": 0.0, "rewards/chosen": 0.29957276582717896, "rewards/margins": -0.34318846464157104, "rewards/rejected": 0.64276123046875, "step": 5094 }, { "epoch": 0.83, "learning_rate": 8.771305226815829e-07, "logits/chosen": -0.127248153090477, "logits/rejected": -0.127248153090477, "logps/chosen": -1.0316892862319946, "logps/rejected": -1.0316892862319946, "loss": 0.7776, "rewards/accuracies": 0.0, "rewards/chosen": 0.43742114305496216, "rewards/margins": 0.0, "rewards/rejected": 0.43742114305496216, "step": 5095 }, { "epoch": 0.83, "learning_rate": 8.770442190710188e-07, "logits/chosen": -0.8096904754638672, "logits/rejected": -0.6668899059295654, "logps/chosen": -119.4517822265625, "logps/rejected": -78.44660949707031, "loss": 0.1786, "rewards/accuracies": 1.0, "rewards/chosen": 3.8191375732421875, "rewards/margins": 1.1462371349334717, "rewards/rejected": 2.672900438308716, "step": 5096 }, { "epoch": 0.83, "learning_rate": 8.769578894102669e-07, "logits/chosen": -0.31576839089393616, "logits/rejected": -0.3883492350578308, "logps/chosen": -47.85210418701172, "logps/rejected": -137.25770568847656, "loss": 1.0801, "rewards/accuracies": 1.0, "rewards/chosen": 1.0151947736740112, "rewards/margins": 0.9316360950469971, "rewards/rejected": 0.08355865627527237, "step": 5097 }, { "epoch": 0.83, "learning_rate": 8.768715337052916e-07, "logits/chosen": -0.6471908092498779, "logits/rejected": -0.6471908092498779, "logps/chosen": -6.053499698638916, "logps/rejected": -6.053499698638916, "loss": 1.0555, "rewards/accuracies": 0.0, "rewards/chosen": 0.20737119019031525, "rewards/margins": 0.0, "rewards/rejected": 0.20737119019031525, "step": 5098 }, { "epoch": 0.83, "learning_rate": 8.767851519620595e-07, "logits/chosen": -0.5427820086479187, "logits/rejected": -0.43560591340065, "logps/chosen": -144.65611267089844, "logps/rejected": -123.43995666503906, "loss": 1.0284, "rewards/accuracies": 0.0, "rewards/chosen": 4.288948059082031, "rewards/margins": -1.87874174118042, "rewards/rejected": 6.167689800262451, "step": 5099 }, { "epoch": 0.83, "learning_rate": 8.766987441865386e-07, "logits/chosen": -0.33515244722366333, "logits/rejected": -0.33515244722366333, "logps/chosen": -48.14830017089844, "logps/rejected": -48.14830017089844, "loss": 0.8778, "rewards/accuracies": 0.0, "rewards/chosen": 0.05029335245490074, "rewards/margins": 0.0, "rewards/rejected": 0.05029335245490074, "step": 5100 }, { "epoch": 0.83, "learning_rate": 8.766123103846986e-07, "logits/chosen": -0.709073543548584, "logits/rejected": -0.8617478013038635, "logps/chosen": -101.04335021972656, "logps/rejected": -85.72730255126953, "loss": 1.8698, "rewards/accuracies": 0.0, "rewards/chosen": 0.7161354422569275, "rewards/margins": -3.331836700439453, "rewards/rejected": 4.047972202301025, "step": 5101 }, { "epoch": 0.83, "learning_rate": 8.765258505625116e-07, "logits/chosen": -0.5124975442886353, "logits/rejected": -0.5129583477973938, "logps/chosen": -21.503469467163086, "logps/rejected": -26.59066390991211, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 0.3827848434448242, "rewards/margins": 0.15783271193504333, "rewards/rejected": 0.22495213150978088, "step": 5102 }, { "epoch": 0.83, "learning_rate": 8.76439364725951e-07, "logits/chosen": -0.9975742101669312, "logits/rejected": -1.0278058052062988, "logps/chosen": -129.71987915039062, "logps/rejected": -132.8826446533203, "loss": 1.6429, "rewards/accuracies": 0.0, "rewards/chosen": 0.780963122844696, "rewards/margins": -0.6826401352882385, "rewards/rejected": 1.4636032581329346, "step": 5103 }, { "epoch": 0.83, "learning_rate": 8.763528528809922e-07, "logits/chosen": -0.678766131401062, "logits/rejected": -0.6490305066108704, "logps/chosen": -63.78559494018555, "logps/rejected": -31.925899505615234, "loss": 0.6521, "rewards/accuracies": 1.0, "rewards/chosen": 1.8630794286727905, "rewards/margins": 0.5533149242401123, "rewards/rejected": 1.3097645044326782, "step": 5104 }, { "epoch": 0.83, "learning_rate": 8.762663150336125e-07, "logits/chosen": -0.4847380220890045, "logits/rejected": -0.5007657408714294, "logps/chosen": -42.661705017089844, "logps/rejected": -61.759483337402344, "loss": 1.5041, "rewards/accuracies": 1.0, "rewards/chosen": 0.6957740783691406, "rewards/margins": 0.27361753582954407, "rewards/rejected": 0.42215654253959656, "step": 5105 }, { "epoch": 0.83, "learning_rate": 8.761797511897906e-07, "logits/chosen": -1.0860326290130615, "logits/rejected": -1.04645574092865, "logps/chosen": -113.90135192871094, "logps/rejected": -68.09194946289062, "loss": 0.9512, "rewards/accuracies": 1.0, "rewards/chosen": 4.391964912414551, "rewards/margins": 1.8084213733673096, "rewards/rejected": 2.583543539047241, "step": 5106 }, { "epoch": 0.83, "learning_rate": 8.760931613555074e-07, "logits/chosen": -0.6153140664100647, "logits/rejected": -0.556876003742218, "logps/chosen": -80.12178802490234, "logps/rejected": -51.81660079956055, "loss": 0.7185, "rewards/accuracies": 1.0, "rewards/chosen": 4.6340484619140625, "rewards/margins": 2.8524386882781982, "rewards/rejected": 1.7816097736358643, "step": 5107 }, { "epoch": 0.83, "learning_rate": 8.760065455367453e-07, "logits/chosen": -0.4709518849849701, "logits/rejected": -0.4423423707485199, "logps/chosen": -95.10221862792969, "logps/rejected": -80.60286712646484, "loss": 0.4076, "rewards/accuracies": 0.0, "rewards/chosen": 2.0925049781799316, "rewards/margins": -0.15259099006652832, "rewards/rejected": 2.24509596824646, "step": 5108 }, { "epoch": 0.83, "learning_rate": 8.759199037394886e-07, "logits/chosen": -0.1648855209350586, "logits/rejected": -0.1648855209350586, "logps/chosen": -6.395722389221191, "logps/rejected": -6.395722389221191, "loss": 0.3977, "rewards/accuracies": 0.0, "rewards/chosen": 0.23677758872509003, "rewards/margins": 0.0, "rewards/rejected": 0.23677758872509003, "step": 5109 }, { "epoch": 0.83, "learning_rate": 8.758332359697238e-07, "logits/chosen": -0.45135611295700073, "logits/rejected": -0.5587669610977173, "logps/chosen": -126.79720306396484, "logps/rejected": -97.02700805664062, "loss": 0.4901, "rewards/accuracies": 1.0, "rewards/chosen": 3.0060067176818848, "rewards/margins": 2.238636016845703, "rewards/rejected": 0.7673706412315369, "step": 5110 }, { "epoch": 0.83, "learning_rate": 8.757465422334384e-07, "logits/chosen": -0.20644502341747284, "logits/rejected": -0.21646925806999207, "logps/chosen": -23.371122360229492, "logps/rejected": -32.953758239746094, "loss": 0.6314, "rewards/accuracies": 1.0, "rewards/chosen": 0.13543950021266937, "rewards/margins": 0.26424849033355713, "rewards/rejected": -0.12880897521972656, "step": 5111 }, { "epoch": 0.83, "learning_rate": 8.756598225366222e-07, "logits/chosen": -0.5131675601005554, "logits/rejected": -0.4925430119037628, "logps/chosen": -60.84849548339844, "logps/rejected": -56.88642501831055, "loss": 0.9451, "rewards/accuracies": 0.0, "rewards/chosen": 0.6960739493370056, "rewards/margins": -0.3148731589317322, "rewards/rejected": 1.0109471082687378, "step": 5112 }, { "epoch": 0.83, "learning_rate": 8.755730768852669e-07, "logits/chosen": -0.9875249862670898, "logits/rejected": -0.9875249862670898, "logps/chosen": -69.0989990234375, "logps/rejected": -69.0989990234375, "loss": 0.5555, "rewards/accuracies": 0.0, "rewards/chosen": 1.0362099409103394, "rewards/margins": 0.0, "rewards/rejected": 1.0362099409103394, "step": 5113 }, { "epoch": 0.83, "learning_rate": 8.754863052853657e-07, "logits/chosen": -0.45313695073127747, "logits/rejected": -0.368875652551651, "logps/chosen": -59.04828643798828, "logps/rejected": -13.907919883728027, "loss": 0.5704, "rewards/accuracies": 1.0, "rewards/chosen": 2.38736891746521, "rewards/margins": 1.4142658710479736, "rewards/rejected": 0.9731030464172363, "step": 5114 }, { "epoch": 0.83, "learning_rate": 8.753995077429137e-07, "logits/chosen": -0.754585862159729, "logits/rejected": -0.6403821706771851, "logps/chosen": -128.3520050048828, "logps/rejected": -77.0406265258789, "loss": 0.5571, "rewards/accuracies": 0.0, "rewards/chosen": 3.1150665283203125, "rewards/margins": -0.5730721950531006, "rewards/rejected": 3.688138723373413, "step": 5115 }, { "epoch": 0.83, "learning_rate": 8.753126842639078e-07, "logits/chosen": -0.6692427396774292, "logits/rejected": -0.7051916122436523, "logps/chosen": -94.30561828613281, "logps/rejected": -112.13500213623047, "loss": 1.3068, "rewards/accuracies": 0.0, "rewards/chosen": 0.7424370050430298, "rewards/margins": -1.7980576753616333, "rewards/rejected": 2.540494680404663, "step": 5116 }, { "epoch": 0.83, "learning_rate": 8.752258348543466e-07, "logits/chosen": -1.0228800773620605, "logits/rejected": -0.8974198698997498, "logps/chosen": -131.3675537109375, "logps/rejected": -104.5625, "loss": 0.5523, "rewards/accuracies": 1.0, "rewards/chosen": 4.206363201141357, "rewards/margins": 0.8085055351257324, "rewards/rejected": 3.397857666015625, "step": 5117 }, { "epoch": 0.83, "learning_rate": 8.751389595202306e-07, "logits/chosen": -0.714419424533844, "logits/rejected": -0.714419424533844, "logps/chosen": -80.56925201416016, "logps/rejected": -80.56925201416016, "loss": 0.4841, "rewards/accuracies": 0.0, "rewards/chosen": 1.4356224536895752, "rewards/margins": 0.0, "rewards/rejected": 1.4356224536895752, "step": 5118 }, { "epoch": 0.83, "learning_rate": 8.75052058267562e-07, "logits/chosen": -1.021333932876587, "logits/rejected": -0.9731366038322449, "logps/chosen": -123.9527587890625, "logps/rejected": -107.52525329589844, "loss": 1.0879, "rewards/accuracies": 1.0, "rewards/chosen": 1.4529632329940796, "rewards/margins": 0.15675652027130127, "rewards/rejected": 1.2962067127227783, "step": 5119 }, { "epoch": 0.83, "learning_rate": 8.749651311023451e-07, "logits/chosen": -0.5919707417488098, "logits/rejected": -0.6075929403305054, "logps/chosen": -27.678508758544922, "logps/rejected": -117.05359649658203, "loss": 0.8685, "rewards/accuracies": 0.0, "rewards/chosen": 2.2995853424072266, "rewards/margins": -1.4265918731689453, "rewards/rejected": 3.726177215576172, "step": 5120 }, { "epoch": 0.83, "learning_rate": 8.748781780305856e-07, "logits/chosen": -0.4871249198913574, "logits/rejected": -0.48699477314949036, "logps/chosen": -77.95860290527344, "logps/rejected": -69.94277954101562, "loss": 0.9277, "rewards/accuracies": 0.0, "rewards/chosen": 1.0438873767852783, "rewards/margins": -0.6800750494003296, "rewards/rejected": 1.723962426185608, "step": 5121 }, { "epoch": 0.83, "learning_rate": 8.747911990582911e-07, "logits/chosen": -0.8239361047744751, "logits/rejected": -0.7081611156463623, "logps/chosen": -103.43206024169922, "logps/rejected": -85.84547424316406, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 6.9202494621276855, "rewards/margins": 5.287702560424805, "rewards/rejected": 1.6325470209121704, "step": 5122 }, { "epoch": 0.83, "learning_rate": 8.747041941914711e-07, "logits/chosen": -0.832503616809845, "logits/rejected": -0.9142246246337891, "logps/chosen": -161.77151489257812, "logps/rejected": -105.08744812011719, "loss": 1.6046, "rewards/accuracies": 0.0, "rewards/chosen": -0.2905441224575043, "rewards/margins": -0.9045302867889404, "rewards/rejected": 0.6139861941337585, "step": 5123 }, { "epoch": 0.83, "learning_rate": 8.746171634361368e-07, "logits/chosen": -0.4524853527545929, "logits/rejected": -0.47734227776527405, "logps/chosen": -112.84587097167969, "logps/rejected": -116.27767944335938, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.5193588137626648, "rewards/margins": 0.19021984934806824, "rewards/rejected": 0.32913896441459656, "step": 5124 }, { "epoch": 0.83, "learning_rate": 8.745301067983011e-07, "logits/chosen": -0.05458025634288788, "logits/rejected": -0.05458025634288788, "logps/chosen": -53.418182373046875, "logps/rejected": -53.418182373046875, "loss": 0.4187, "rewards/accuracies": 0.0, "rewards/chosen": 0.45279693603515625, "rewards/margins": 0.0, "rewards/rejected": 0.45279693603515625, "step": 5125 }, { "epoch": 0.83, "learning_rate": 8.744430242839788e-07, "logits/chosen": -1.0345410108566284, "logits/rejected": -0.9407000541687012, "logps/chosen": -92.9921875, "logps/rejected": -22.500385284423828, "loss": 0.0623, "rewards/accuracies": 1.0, "rewards/chosen": 2.9620347023010254, "rewards/margins": 2.5576975345611572, "rewards/rejected": 0.4043371379375458, "step": 5126 }, { "epoch": 0.83, "learning_rate": 8.743559158991866e-07, "logits/chosen": -0.21083027124404907, "logits/rejected": -0.21083027124404907, "logps/chosen": -21.45280647277832, "logps/rejected": -21.45280647277832, "loss": 0.3604, "rewards/accuracies": 0.0, "rewards/chosen": 0.1395505964756012, "rewards/margins": 0.0, "rewards/rejected": 0.1395505964756012, "step": 5127 }, { "epoch": 0.83, "learning_rate": 8.742687816499428e-07, "logits/chosen": -0.6697806715965271, "logits/rejected": -0.6837174296379089, "logps/chosen": -103.69059753417969, "logps/rejected": -139.70663452148438, "loss": 0.7371, "rewards/accuracies": 1.0, "rewards/chosen": 4.775294780731201, "rewards/margins": 0.333892822265625, "rewards/rejected": 4.441401958465576, "step": 5128 }, { "epoch": 0.83, "learning_rate": 8.741816215422675e-07, "logits/chosen": -0.9066253900527954, "logits/rejected": -0.9237834811210632, "logps/chosen": -83.63174438476562, "logps/rejected": -147.72528076171875, "loss": 2.0727, "rewards/accuracies": 0.0, "rewards/chosen": 1.1536911725997925, "rewards/margins": -4.120991230010986, "rewards/rejected": 5.274682521820068, "step": 5129 }, { "epoch": 0.83, "learning_rate": 8.740944355821826e-07, "logits/chosen": -0.6342836618423462, "logits/rejected": -0.5870872139930725, "logps/chosen": -119.80282592773438, "logps/rejected": -80.31492614746094, "loss": 1.2551, "rewards/accuracies": 1.0, "rewards/chosen": 4.083253383636475, "rewards/margins": 0.860586404800415, "rewards/rejected": 3.2226669788360596, "step": 5130 }, { "epoch": 0.83, "learning_rate": 8.740072237757121e-07, "logits/chosen": -0.9452550411224365, "logits/rejected": -0.8573906421661377, "logps/chosen": -140.8274383544922, "logps/rejected": -62.53934097290039, "loss": 0.915, "rewards/accuracies": 1.0, "rewards/chosen": 4.217898845672607, "rewards/margins": 2.4958343505859375, "rewards/rejected": 1.7220646142959595, "step": 5131 }, { "epoch": 0.83, "learning_rate": 8.739199861288813e-07, "logits/chosen": -0.9325895309448242, "logits/rejected": -0.9503901600837708, "logps/chosen": -35.35283660888672, "logps/rejected": -15.099424362182617, "loss": 0.5511, "rewards/accuracies": 0.0, "rewards/chosen": -0.02980956993997097, "rewards/margins": -0.39965954422950745, "rewards/rejected": 0.3698499798774719, "step": 5132 }, { "epoch": 0.83, "learning_rate": 8.738327226477175e-07, "logits/chosen": -0.619608461856842, "logits/rejected": -0.6289337277412415, "logps/chosen": -125.39230346679688, "logps/rejected": -108.94331359863281, "loss": 3.5999, "rewards/accuracies": 1.0, "rewards/chosen": 2.750701904296875, "rewards/margins": 0.49662327766418457, "rewards/rejected": 2.2540786266326904, "step": 5133 }, { "epoch": 0.83, "learning_rate": 8.737454333382497e-07, "logits/chosen": -0.5002498626708984, "logits/rejected": -0.5061210989952087, "logps/chosen": -72.45050048828125, "logps/rejected": -60.283416748046875, "loss": 0.9627, "rewards/accuracies": 0.0, "rewards/chosen": 0.6371246576309204, "rewards/margins": -0.5110809803009033, "rewards/rejected": 1.1482056379318237, "step": 5134 }, { "epoch": 0.83, "learning_rate": 8.736581182065091e-07, "logits/chosen": -0.8759045600891113, "logits/rejected": -0.7929018139839172, "logps/chosen": -67.79989624023438, "logps/rejected": -86.48912811279297, "loss": 0.7287, "rewards/accuracies": 0.0, "rewards/chosen": 1.6205002069473267, "rewards/margins": -0.3989783525466919, "rewards/rejected": 2.0194785594940186, "step": 5135 }, { "epoch": 0.83, "learning_rate": 8.735707772585279e-07, "logits/chosen": -0.5774873495101929, "logits/rejected": -0.7037960886955261, "logps/chosen": -75.86988067626953, "logps/rejected": -119.83939361572266, "loss": 2.117, "rewards/accuracies": 0.0, "rewards/chosen": 1.7577980756759644, "rewards/margins": -3.935410976409912, "rewards/rejected": 5.693209171295166, "step": 5136 }, { "epoch": 0.83, "learning_rate": 8.734834105003408e-07, "logits/chosen": -0.7764755487442017, "logits/rejected": -0.8216137886047363, "logps/chosen": -170.43093872070312, "logps/rejected": -108.34193420410156, "loss": 0.4281, "rewards/accuracies": 0.0, "rewards/chosen": 4.806079387664795, "rewards/margins": -0.0743560791015625, "rewards/rejected": 4.880435466766357, "step": 5137 }, { "epoch": 0.83, "learning_rate": 8.73396017937984e-07, "logits/chosen": -0.616123378276825, "logits/rejected": -0.616123378276825, "logps/chosen": -25.18266487121582, "logps/rejected": -25.18266487121582, "loss": 0.423, "rewards/accuracies": 0.0, "rewards/chosen": -0.012170219793915749, "rewards/margins": 0.0, "rewards/rejected": -0.012170219793915749, "step": 5138 }, { "epoch": 0.83, "learning_rate": 8.733085995774956e-07, "logits/chosen": -0.6458393335342407, "logits/rejected": -0.5780574679374695, "logps/chosen": -64.75410461425781, "logps/rejected": -52.94112777709961, "loss": 0.3408, "rewards/accuracies": 1.0, "rewards/chosen": 2.5636444091796875, "rewards/margins": 1.5049372911453247, "rewards/rejected": 1.0587071180343628, "step": 5139 }, { "epoch": 0.83, "learning_rate": 8.732211554249151e-07, "logits/chosen": -0.5734309554100037, "logits/rejected": -0.6265398859977722, "logps/chosen": -12.087740898132324, "logps/rejected": -92.8459701538086, "loss": 0.6097, "rewards/accuracies": 1.0, "rewards/chosen": 0.5203478932380676, "rewards/margins": 0.08544263243675232, "rewards/rejected": 0.4349052608013153, "step": 5140 }, { "epoch": 0.83, "learning_rate": 8.731336854862841e-07, "logits/chosen": -0.875776469707489, "logits/rejected": -0.8373565077781677, "logps/chosen": -68.39543914794922, "logps/rejected": -21.67544174194336, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 0.7938545346260071, "rewards/margins": 0.06204491853713989, "rewards/rejected": 0.7318096160888672, "step": 5141 }, { "epoch": 0.83, "learning_rate": 8.730461897676463e-07, "logits/chosen": -0.5578245520591736, "logits/rejected": -0.5436933636665344, "logps/chosen": -125.22826385498047, "logps/rejected": -66.12833404541016, "loss": 0.0945, "rewards/accuracies": 1.0, "rewards/chosen": 4.747707366943359, "rewards/margins": 2.7635879516601562, "rewards/rejected": 1.9841194152832031, "step": 5142 }, { "epoch": 0.83, "learning_rate": 8.729586682750464e-07, "logits/chosen": -0.4634997844696045, "logits/rejected": -0.5629526972770691, "logps/chosen": -45.13037109375, "logps/rejected": -73.74922180175781, "loss": 1.0616, "rewards/accuracies": 0.0, "rewards/chosen": 1.2284530401229858, "rewards/margins": -0.8648256063461304, "rewards/rejected": 2.093278646469116, "step": 5143 }, { "epoch": 0.83, "learning_rate": 8.728711210145315e-07, "logits/chosen": -0.5157931447029114, "logits/rejected": -0.4870466887950897, "logps/chosen": -38.53083038330078, "logps/rejected": -5.04584264755249, "loss": 0.9867, "rewards/accuracies": 0.0, "rewards/chosen": 0.23116913437843323, "rewards/margins": -0.14236503839492798, "rewards/rejected": 0.3735341727733612, "step": 5144 }, { "epoch": 0.84, "learning_rate": 8.727835479921504e-07, "logits/chosen": -0.279592365026474, "logits/rejected": -0.279592365026474, "logps/chosen": -56.24073028564453, "logps/rejected": -56.24073028564453, "loss": 0.4312, "rewards/accuracies": 0.0, "rewards/chosen": 0.387900173664093, "rewards/margins": 0.0, "rewards/rejected": 0.387900173664093, "step": 5145 }, { "epoch": 0.84, "learning_rate": 8.726959492139534e-07, "logits/chosen": -0.3952144980430603, "logits/rejected": -0.3952144980430603, "logps/chosen": -38.62013244628906, "logps/rejected": -38.62013244628906, "loss": 0.7574, "rewards/accuracies": 0.0, "rewards/chosen": 0.5299000144004822, "rewards/margins": 0.0, "rewards/rejected": 0.5299000144004822, "step": 5146 }, { "epoch": 0.84, "learning_rate": 8.726083246859928e-07, "logits/chosen": -0.5619943141937256, "logits/rejected": -0.5826194286346436, "logps/chosen": -6.771227836608887, "logps/rejected": -2.705540180206299, "loss": 0.4777, "rewards/accuracies": 0.0, "rewards/chosen": -0.010048485361039639, "rewards/margins": -0.3745874762535095, "rewards/rejected": 0.3645389974117279, "step": 5147 }, { "epoch": 0.84, "learning_rate": 8.725206744143225e-07, "logits/chosen": -0.7938500642776489, "logits/rejected": -0.7855961918830872, "logps/chosen": -168.88673400878906, "logps/rejected": -103.59233093261719, "loss": 0.662, "rewards/accuracies": 0.0, "rewards/chosen": 2.502027988433838, "rewards/margins": -0.7962493896484375, "rewards/rejected": 3.2982773780822754, "step": 5148 }, { "epoch": 0.84, "learning_rate": 8.724329984049985e-07, "logits/chosen": -0.7086929082870483, "logits/rejected": -0.5912806987762451, "logps/chosen": -147.79592895507812, "logps/rejected": -85.26103210449219, "loss": 1.0492, "rewards/accuracies": 0.0, "rewards/chosen": 1.3313324451446533, "rewards/margins": -0.04493105411529541, "rewards/rejected": 1.3762634992599487, "step": 5149 }, { "epoch": 0.84, "learning_rate": 8.723452966640783e-07, "logits/chosen": -0.5032025575637817, "logits/rejected": -0.4649348556995392, "logps/chosen": -100.04940032958984, "logps/rejected": -123.09321594238281, "loss": 0.3624, "rewards/accuracies": 1.0, "rewards/chosen": 1.0679931640625, "rewards/margins": 0.09272915124893188, "rewards/rejected": 0.9752640128135681, "step": 5150 }, { "epoch": 0.84, "learning_rate": 8.722575691976213e-07, "logits/chosen": -0.9716002941131592, "logits/rejected": -0.9397233128547668, "logps/chosen": -109.1633071899414, "logps/rejected": -101.31550598144531, "loss": 1.2557, "rewards/accuracies": 0.0, "rewards/chosen": 0.6808769106864929, "rewards/margins": -2.2104086875915527, "rewards/rejected": 2.8912856578826904, "step": 5151 }, { "epoch": 0.84, "learning_rate": 8.721698160116885e-07, "logits/chosen": -0.8607854247093201, "logits/rejected": -0.7595157623291016, "logps/chosen": -148.54318237304688, "logps/rejected": -84.05464172363281, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": 4.110232830047607, "rewards/margins": 1.9180924892425537, "rewards/rejected": 2.1921403408050537, "step": 5152 }, { "epoch": 0.84, "learning_rate": 8.72082037112343e-07, "logits/chosen": -0.529609203338623, "logits/rejected": -0.5730395317077637, "logps/chosen": -54.00331115722656, "logps/rejected": -88.65288543701172, "loss": 0.2926, "rewards/accuracies": 1.0, "rewards/chosen": 1.9901809692382812, "rewards/margins": 0.5264701843261719, "rewards/rejected": 1.4637107849121094, "step": 5153 }, { "epoch": 0.84, "learning_rate": 8.719942325056494e-07, "logits/chosen": -0.38740023970603943, "logits/rejected": -0.2650102972984314, "logps/chosen": -37.14854431152344, "logps/rejected": -8.948005676269531, "loss": 0.1835, "rewards/accuracies": 1.0, "rewards/chosen": 1.9456955194473267, "rewards/margins": 1.1802012920379639, "rewards/rejected": 0.765494167804718, "step": 5154 }, { "epoch": 0.84, "learning_rate": 8.719064021976741e-07, "logits/chosen": -0.7058225870132446, "logits/rejected": -0.6013363599777222, "logps/chosen": -206.86758422851562, "logps/rejected": -177.9476318359375, "loss": 0.6865, "rewards/accuracies": 0.0, "rewards/chosen": 6.177957057952881, "rewards/margins": -1.0079822540283203, "rewards/rejected": 7.185939311981201, "step": 5155 }, { "epoch": 0.84, "learning_rate": 8.718185461944855e-07, "logits/chosen": -0.9012682437896729, "logits/rejected": -0.8446072340011597, "logps/chosen": -84.69054412841797, "logps/rejected": -47.37749481201172, "loss": 1.107, "rewards/accuracies": 0.0, "rewards/chosen": 0.3283897340297699, "rewards/margins": -2.095851182937622, "rewards/rejected": 2.424240827560425, "step": 5156 }, { "epoch": 0.84, "learning_rate": 8.717306645021536e-07, "logits/chosen": -0.7190369963645935, "logits/rejected": -0.6040412783622742, "logps/chosen": -126.02275085449219, "logps/rejected": -118.57284545898438, "loss": 1.0723, "rewards/accuracies": 0.0, "rewards/chosen": 3.5791244506835938, "rewards/margins": -0.29788661003112793, "rewards/rejected": 3.8770110607147217, "step": 5157 }, { "epoch": 0.84, "learning_rate": 8.716427571267502e-07, "logits/chosen": -0.8254237771034241, "logits/rejected": -0.7957916855812073, "logps/chosen": -342.56878662109375, "logps/rejected": -94.14402770996094, "loss": 1.0835, "rewards/accuracies": 1.0, "rewards/chosen": 3.077239990234375, "rewards/margins": 0.24834275245666504, "rewards/rejected": 2.82889723777771, "step": 5158 }, { "epoch": 0.84, "learning_rate": 8.715548240743485e-07, "logits/chosen": -0.9284980297088623, "logits/rejected": -0.9430070519447327, "logps/chosen": -115.03230285644531, "logps/rejected": -120.49911499023438, "loss": 1.7669, "rewards/accuracies": 0.0, "rewards/chosen": 0.45101624727249146, "rewards/margins": -0.32963407039642334, "rewards/rejected": 0.7806503176689148, "step": 5159 }, { "epoch": 0.84, "learning_rate": 8.714668653510244e-07, "logits/chosen": -0.6755648255348206, "logits/rejected": -0.6427820920944214, "logps/chosen": -57.70198059082031, "logps/rejected": -12.580249786376953, "loss": 0.7983, "rewards/accuracies": 0.0, "rewards/chosen": -0.0010169983142986894, "rewards/margins": -0.38380929827690125, "rewards/rejected": 0.38279229402542114, "step": 5160 }, { "epoch": 0.84, "learning_rate": 8.713788809628546e-07, "logits/chosen": -0.24784275889396667, "logits/rejected": -0.14917251467704773, "logps/chosen": -40.518062591552734, "logps/rejected": -69.36968994140625, "loss": 0.2372, "rewards/accuracies": 1.0, "rewards/chosen": 1.4102691411972046, "rewards/margins": 0.6305663585662842, "rewards/rejected": 0.7797027826309204, "step": 5161 }, { "epoch": 0.84, "learning_rate": 8.712908709159183e-07, "logits/chosen": -0.7002923488616943, "logits/rejected": -0.6552137732505798, "logps/chosen": -62.23392105102539, "logps/rejected": -79.3692398071289, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.178629755973816, "rewards/margins": 0.618455171585083, "rewards/rejected": 0.5601745843887329, "step": 5162 }, { "epoch": 0.84, "learning_rate": 8.712028352162958e-07, "logits/chosen": -0.483705073595047, "logits/rejected": -0.3867703974246979, "logps/chosen": -60.56304168701172, "logps/rejected": -103.85530090332031, "loss": 0.6322, "rewards/accuracies": 1.0, "rewards/chosen": 3.8188042640686035, "rewards/margins": 1.1436195373535156, "rewards/rejected": 2.675184726715088, "step": 5163 }, { "epoch": 0.84, "learning_rate": 8.711147738700699e-07, "logits/chosen": -0.5200858116149902, "logits/rejected": -0.45749565958976746, "logps/chosen": -57.659149169921875, "logps/rejected": -36.159889221191406, "loss": 0.6306, "rewards/accuracies": 1.0, "rewards/chosen": 2.1261184215545654, "rewards/margins": 0.7740706205368042, "rewards/rejected": 1.3520478010177612, "step": 5164 }, { "epoch": 0.84, "learning_rate": 8.710266868833245e-07, "logits/chosen": -0.8015890121459961, "logits/rejected": -0.8486736416816711, "logps/chosen": -151.52227783203125, "logps/rejected": -115.291748046875, "loss": 0.0752, "rewards/accuracies": 1.0, "rewards/chosen": 3.2601654529571533, "rewards/margins": 2.9240386486053467, "rewards/rejected": 0.3361267149448395, "step": 5165 }, { "epoch": 0.84, "learning_rate": 8.709385742621457e-07, "logits/chosen": -0.353851854801178, "logits/rejected": -0.353851854801178, "logps/chosen": -51.20088577270508, "logps/rejected": -51.20088577270508, "loss": 1.3141, "rewards/accuracies": 0.0, "rewards/chosen": 1.8842334747314453, "rewards/margins": 0.0, "rewards/rejected": 1.8842334747314453, "step": 5166 }, { "epoch": 0.84, "learning_rate": 8.708504360126215e-07, "logits/chosen": -0.8408679366111755, "logits/rejected": -0.9153925180435181, "logps/chosen": -126.03487396240234, "logps/rejected": -114.89624786376953, "loss": 1.9137, "rewards/accuracies": 0.0, "rewards/chosen": 0.18956375122070312, "rewards/margins": -3.5729081630706787, "rewards/rejected": 3.762471914291382, "step": 5167 }, { "epoch": 0.84, "learning_rate": 8.707622721408412e-07, "logits/chosen": -1.0292637348175049, "logits/rejected": -0.9988953471183777, "logps/chosen": -97.98789978027344, "logps/rejected": -95.6329116821289, "loss": 1.0648, "rewards/accuracies": 0.0, "rewards/chosen": 0.5202728509902954, "rewards/margins": -1.9144378900527954, "rewards/rejected": 2.434710741043091, "step": 5168 }, { "epoch": 0.84, "learning_rate": 8.706740826528959e-07, "logits/chosen": -0.8365091681480408, "logits/rejected": -0.8035231828689575, "logps/chosen": -120.06806182861328, "logps/rejected": -158.94131469726562, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": 1.5031074285507202, "rewards/margins": 0.8988913893699646, "rewards/rejected": 0.6042160391807556, "step": 5169 }, { "epoch": 0.84, "learning_rate": 8.70585867554879e-07, "logits/chosen": -0.33704546093940735, "logits/rejected": -0.33428290486335754, "logps/chosen": -4.554330348968506, "logps/rejected": -2.800264835357666, "loss": 1.7438, "rewards/accuracies": 0.0, "rewards/chosen": 0.22840233147144318, "rewards/margins": -0.018571048974990845, "rewards/rejected": 0.24697338044643402, "step": 5170 }, { "epoch": 0.84, "learning_rate": 8.704976268528849e-07, "logits/chosen": -0.6791736483573914, "logits/rejected": -0.5969305038452148, "logps/chosen": -53.144649505615234, "logps/rejected": -123.62777709960938, "loss": 0.7436, "rewards/accuracies": 0.0, "rewards/chosen": 1.9656215906143188, "rewards/margins": -0.6857837438583374, "rewards/rejected": 2.6514053344726562, "step": 5171 }, { "epoch": 0.84, "learning_rate": 8.704093605530107e-07, "logits/chosen": -0.5473883152008057, "logits/rejected": -0.5344299077987671, "logps/chosen": -42.0195198059082, "logps/rejected": -58.632080078125, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 1.928982138633728, "rewards/margins": 0.3322482109069824, "rewards/rejected": 1.5967339277267456, "step": 5172 }, { "epoch": 0.84, "learning_rate": 8.703210686613545e-07, "logits/chosen": -0.5818716883659363, "logits/rejected": -0.5893389582633972, "logps/chosen": -38.2700309753418, "logps/rejected": -48.61909103393555, "loss": 0.5024, "rewards/accuracies": 0.0, "rewards/chosen": 0.9955639243125916, "rewards/margins": -0.36159592866897583, "rewards/rejected": 1.3571598529815674, "step": 5173 }, { "epoch": 0.84, "learning_rate": 8.702327511840164e-07, "logits/chosen": -0.43432363867759705, "logits/rejected": -0.42844322323799133, "logps/chosen": -2.587031602859497, "logps/rejected": -6.639530658721924, "loss": 0.4432, "rewards/accuracies": 1.0, "rewards/chosen": 0.26853978633880615, "rewards/margins": 0.25960686802864075, "rewards/rejected": 0.008932924829423428, "step": 5174 }, { "epoch": 0.84, "learning_rate": 8.701444081270984e-07, "logits/chosen": -0.7515798211097717, "logits/rejected": -0.955069363117218, "logps/chosen": -80.72964477539062, "logps/rejected": -45.37677764892578, "loss": 0.5058, "rewards/accuracies": 1.0, "rewards/chosen": 1.8676681518554688, "rewards/margins": 0.7609672546386719, "rewards/rejected": 1.1067008972167969, "step": 5175 }, { "epoch": 0.84, "learning_rate": 8.700560394967043e-07, "logits/chosen": -0.6453027129173279, "logits/rejected": -0.6095433235168457, "logps/chosen": -144.29525756835938, "logps/rejected": -99.09187316894531, "loss": 0.5938, "rewards/accuracies": 0.0, "rewards/chosen": 1.0728836059570312, "rewards/margins": -0.8160820007324219, "rewards/rejected": 1.8889656066894531, "step": 5176 }, { "epoch": 0.84, "learning_rate": 8.699676452989391e-07, "logits/chosen": -0.5985727906227112, "logits/rejected": -0.5564916133880615, "logps/chosen": -95.56266021728516, "logps/rejected": -64.4383544921875, "loss": 1.8248, "rewards/accuracies": 0.0, "rewards/chosen": 1.1107460260391235, "rewards/margins": -1.3782845735549927, "rewards/rejected": 2.489030599594116, "step": 5177 }, { "epoch": 0.84, "learning_rate": 8.698792255399103e-07, "logits/chosen": -0.4892407953739166, "logits/rejected": -0.4892407953739166, "logps/chosen": -5.50827693939209, "logps/rejected": -5.50827693939209, "loss": 0.5569, "rewards/accuracies": 0.0, "rewards/chosen": 0.4022710919380188, "rewards/margins": 0.0, "rewards/rejected": 0.4022710919380188, "step": 5178 }, { "epoch": 0.84, "learning_rate": 8.697907802257269e-07, "logits/chosen": -0.8888661861419678, "logits/rejected": -0.7268518209457397, "logps/chosen": -195.47706604003906, "logps/rejected": -234.85888671875, "loss": 1.3777, "rewards/accuracies": 0.0, "rewards/chosen": 4.371705532073975, "rewards/margins": -2.6800613403320312, "rewards/rejected": 7.051766872406006, "step": 5179 }, { "epoch": 0.84, "learning_rate": 8.697023093624997e-07, "logits/chosen": -0.7193714380264282, "logits/rejected": -0.6876136064529419, "logps/chosen": -124.75725555419922, "logps/rejected": -48.27824783325195, "loss": 0.784, "rewards/accuracies": 0.0, "rewards/chosen": 1.7712211608886719, "rewards/margins": -0.7538869380950928, "rewards/rejected": 2.5251080989837646, "step": 5180 }, { "epoch": 0.84, "learning_rate": 8.696138129563411e-07, "logits/chosen": -0.6541417241096497, "logits/rejected": -0.6600928902626038, "logps/chosen": -53.55000305175781, "logps/rejected": -53.88112258911133, "loss": 0.3313, "rewards/accuracies": 1.0, "rewards/chosen": 0.6254169344902039, "rewards/margins": 0.20126837491989136, "rewards/rejected": 0.4241485595703125, "step": 5181 }, { "epoch": 0.84, "learning_rate": 8.695252910133652e-07, "logits/chosen": -0.9104880690574646, "logits/rejected": -0.8762760162353516, "logps/chosen": -163.0252685546875, "logps/rejected": -181.8592529296875, "loss": 0.5865, "rewards/accuracies": 0.0, "rewards/chosen": 0.39067384600639343, "rewards/margins": -0.7930572032928467, "rewards/rejected": 1.1837310791015625, "step": 5182 }, { "epoch": 0.84, "learning_rate": 8.694367435396881e-07, "logits/chosen": -1.0347766876220703, "logits/rejected": -1.024839997291565, "logps/chosen": -207.5536651611328, "logps/rejected": -55.93681335449219, "loss": 0.5165, "rewards/accuracies": 0.0, "rewards/chosen": 1.2748383283615112, "rewards/margins": -0.07182228565216064, "rewards/rejected": 1.3466606140136719, "step": 5183 }, { "epoch": 0.84, "learning_rate": 8.693481705414278e-07, "logits/chosen": -0.4921393096446991, "logits/rejected": -0.503474235534668, "logps/chosen": -69.35111999511719, "logps/rejected": -70.91565704345703, "loss": 0.6824, "rewards/accuracies": 0.0, "rewards/chosen": 0.6707488894462585, "rewards/margins": -0.3347274661064148, "rewards/rejected": 1.0054763555526733, "step": 5184 }, { "epoch": 0.84, "learning_rate": 8.692595720247038e-07, "logits/chosen": -0.21295182406902313, "logits/rejected": -0.2720431983470917, "logps/chosen": -106.27755737304688, "logps/rejected": -81.11939239501953, "loss": 0.4608, "rewards/accuracies": 0.0, "rewards/chosen": 0.4564872682094574, "rewards/margins": -0.15754547715187073, "rewards/rejected": 0.6140327453613281, "step": 5185 }, { "epoch": 0.84, "learning_rate": 8.691709479956372e-07, "logits/chosen": -0.7234594821929932, "logits/rejected": -0.7427594661712646, "logps/chosen": -55.82987976074219, "logps/rejected": -72.2142562866211, "loss": 1.1062, "rewards/accuracies": 1.0, "rewards/chosen": 1.302577257156372, "rewards/margins": 0.2199249267578125, "rewards/rejected": 1.0826523303985596, "step": 5186 }, { "epoch": 0.84, "learning_rate": 8.690822984603512e-07, "logits/chosen": -0.9387940764427185, "logits/rejected": -0.846385657787323, "logps/chosen": -179.095947265625, "logps/rejected": -47.84991455078125, "loss": 0.3056, "rewards/accuracies": 1.0, "rewards/chosen": 1.0170730352401733, "rewards/margins": 0.869942843914032, "rewards/rejected": 0.14713020622730255, "step": 5187 }, { "epoch": 0.84, "learning_rate": 8.689936234249707e-07, "logits/chosen": -0.7047544121742249, "logits/rejected": -0.617301344871521, "logps/chosen": -71.05345153808594, "logps/rejected": -7.083069801330566, "loss": 0.4787, "rewards/accuracies": 1.0, "rewards/chosen": 0.5785713195800781, "rewards/margins": 0.038782477378845215, "rewards/rejected": 0.5397888422012329, "step": 5188 }, { "epoch": 0.84, "learning_rate": 8.689049228956224e-07, "logits/chosen": -1.0846900939941406, "logits/rejected": -1.0258026123046875, "logps/chosen": -112.79662322998047, "logps/rejected": -98.95465087890625, "loss": 0.3222, "rewards/accuracies": 1.0, "rewards/chosen": 4.375458717346191, "rewards/margins": 1.566746711730957, "rewards/rejected": 2.8087120056152344, "step": 5189 }, { "epoch": 0.84, "learning_rate": 8.688161968784344e-07, "logits/chosen": -0.3683800995349884, "logits/rejected": -0.3683800995349884, "logps/chosen": -60.04817581176758, "logps/rejected": -60.04817581176758, "loss": 0.4405, "rewards/accuracies": 0.0, "rewards/chosen": 0.015598297119140625, "rewards/margins": 0.0, "rewards/rejected": 0.015598297119140625, "step": 5190 }, { "epoch": 0.84, "learning_rate": 8.68727445379537e-07, "logits/chosen": -0.7691062092781067, "logits/rejected": -0.8070098161697388, "logps/chosen": -60.38518524169922, "logps/rejected": -54.87114715576172, "loss": 1.043, "rewards/accuracies": 0.0, "rewards/chosen": 1.3648391962051392, "rewards/margins": -1.2359825372695923, "rewards/rejected": 2.6008217334747314, "step": 5191 }, { "epoch": 0.84, "learning_rate": 8.68638668405062e-07, "logits/chosen": -0.65859055519104, "logits/rejected": -0.5437141060829163, "logps/chosen": -99.0724868774414, "logps/rejected": -91.53634643554688, "loss": 0.6778, "rewards/accuracies": 0.0, "rewards/chosen": 1.7767143249511719, "rewards/margins": -0.36684727668762207, "rewards/rejected": 2.143561601638794, "step": 5192 }, { "epoch": 0.84, "learning_rate": 8.685498659611432e-07, "logits/chosen": -0.7469962239265442, "logits/rejected": -0.6877283453941345, "logps/chosen": -40.6077995300293, "logps/rejected": -13.047019004821777, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": 2.3467671871185303, "rewards/margins": 1.6529722213745117, "rewards/rejected": 0.6937949061393738, "step": 5193 }, { "epoch": 0.84, "learning_rate": 8.684610380539159e-07, "logits/chosen": -0.5529868006706238, "logits/rejected": -0.6302445530891418, "logps/chosen": -82.8182601928711, "logps/rejected": -78.73667907714844, "loss": 0.6415, "rewards/accuracies": 0.0, "rewards/chosen": 0.9817261099815369, "rewards/margins": -0.8655089735984802, "rewards/rejected": 1.847235083580017, "step": 5194 }, { "epoch": 0.84, "learning_rate": 8.683721846895171e-07, "logits/chosen": -0.5415388345718384, "logits/rejected": -0.539825975894928, "logps/chosen": -112.58496856689453, "logps/rejected": -53.19474792480469, "loss": 1.6077, "rewards/accuracies": 1.0, "rewards/chosen": 1.4384605884552002, "rewards/margins": 0.4899147152900696, "rewards/rejected": 0.9485458731651306, "step": 5195 }, { "epoch": 0.84, "learning_rate": 8.682833058740861e-07, "logits/chosen": -0.5707563161849976, "logits/rejected": -0.42735305428504944, "logps/chosen": -78.10704803466797, "logps/rejected": -137.65211486816406, "loss": 0.9394, "rewards/accuracies": 0.0, "rewards/chosen": 2.098313093185425, "rewards/margins": -1.0546884536743164, "rewards/rejected": 3.153001546859741, "step": 5196 }, { "epoch": 0.84, "learning_rate": 8.681944016137635e-07, "logits/chosen": -0.6271240711212158, "logits/rejected": -0.6132650375366211, "logps/chosen": -56.110687255859375, "logps/rejected": -50.51704406738281, "loss": 0.6718, "rewards/accuracies": 1.0, "rewards/chosen": 2.460989475250244, "rewards/margins": 0.17166757583618164, "rewards/rejected": 2.2893218994140625, "step": 5197 }, { "epoch": 0.84, "learning_rate": 8.681054719146914e-07, "logits/chosen": -0.7992172241210938, "logits/rejected": -0.6310174465179443, "logps/chosen": -79.38552856445312, "logps/rejected": -35.49854278564453, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 2.5234169960021973, "rewards/margins": 2.0137109756469727, "rewards/rejected": 0.5097061395645142, "step": 5198 }, { "epoch": 0.84, "learning_rate": 8.680165167830143e-07, "logits/chosen": -0.6565824747085571, "logits/rejected": -0.6952599883079529, "logps/chosen": -79.53622436523438, "logps/rejected": -78.34263610839844, "loss": 0.6755, "rewards/accuracies": 0.0, "rewards/chosen": 1.4211304187774658, "rewards/margins": -1.0196595191955566, "rewards/rejected": 2.4407899379730225, "step": 5199 }, { "epoch": 0.84, "learning_rate": 8.679275362248782e-07, "logits/chosen": -0.6997473835945129, "logits/rejected": -0.6130659580230713, "logps/chosen": -120.39025115966797, "logps/rejected": -25.477176666259766, "loss": 0.1139, "rewards/accuracies": 1.0, "rewards/chosen": 5.318397045135498, "rewards/margins": 4.4768147468566895, "rewards/rejected": 0.8415821194648743, "step": 5200 }, { "epoch": 0.84, "learning_rate": 8.678385302464306e-07, "logits/chosen": -0.5843662619590759, "logits/rejected": -0.5445578694343567, "logps/chosen": -95.71410369873047, "logps/rejected": -64.03995513916016, "loss": 1.5829, "rewards/accuracies": 1.0, "rewards/chosen": 1.300066351890564, "rewards/margins": 0.16504669189453125, "rewards/rejected": 1.1350196599960327, "step": 5201 }, { "epoch": 0.84, "learning_rate": 8.67749498853821e-07, "logits/chosen": -0.7174935340881348, "logits/rejected": -0.7174935340881348, "logps/chosen": -53.96094512939453, "logps/rejected": -53.96094512939453, "loss": 0.3549, "rewards/accuracies": 0.0, "rewards/chosen": 2.010939836502075, "rewards/margins": 0.0, "rewards/rejected": 2.010939836502075, "step": 5202 }, { "epoch": 0.84, "learning_rate": 8.676604420532009e-07, "logits/chosen": -0.29430118203163147, "logits/rejected": -0.303689569234848, "logps/chosen": -38.19575119018555, "logps/rejected": -36.69281768798828, "loss": 0.6206, "rewards/accuracies": 0.0, "rewards/chosen": 1.0078537464141846, "rewards/margins": -0.06141626834869385, "rewards/rejected": 1.0692700147628784, "step": 5203 }, { "epoch": 0.84, "learning_rate": 8.675713598507231e-07, "logits/chosen": -0.56060391664505, "logits/rejected": -0.5859120488166809, "logps/chosen": -77.88339233398438, "logps/rejected": -168.06607055664062, "loss": 2.4055, "rewards/accuracies": 0.0, "rewards/chosen": 1.6981728076934814, "rewards/margins": -4.615255355834961, "rewards/rejected": 6.313427925109863, "step": 5204 }, { "epoch": 0.84, "learning_rate": 8.674822522525421e-07, "logits/chosen": -0.850208044052124, "logits/rejected": -0.8689815998077393, "logps/chosen": -75.96661376953125, "logps/rejected": -93.0702133178711, "loss": 1.3757, "rewards/accuracies": 0.0, "rewards/chosen": 1.4517685174942017, "rewards/margins": -2.005876064300537, "rewards/rejected": 3.4576447010040283, "step": 5205 }, { "epoch": 0.84, "learning_rate": 8.673931192648148e-07, "logits/chosen": -0.9197072982788086, "logits/rejected": -0.841962456703186, "logps/chosen": -155.7793426513672, "logps/rejected": -64.22079467773438, "loss": 0.0688, "rewards/accuracies": 1.0, "rewards/chosen": 4.603618144989014, "rewards/margins": 2.650120735168457, "rewards/rejected": 1.953497290611267, "step": 5206 }, { "epoch": 0.85, "learning_rate": 8.673039608936992e-07, "logits/chosen": -0.5528395771980286, "logits/rejected": -0.5828913450241089, "logps/chosen": -90.87384033203125, "logps/rejected": -205.17709350585938, "loss": 1.7359, "rewards/accuracies": 0.0, "rewards/chosen": 2.2984726428985596, "rewards/margins": -2.9898483753204346, "rewards/rejected": 5.288321018218994, "step": 5207 }, { "epoch": 0.85, "learning_rate": 8.672147771453553e-07, "logits/chosen": -0.6834356784820557, "logits/rejected": -0.4261389672756195, "logps/chosen": -142.51828002929688, "logps/rejected": -47.22159957885742, "loss": 0.2109, "rewards/accuracies": 1.0, "rewards/chosen": 1.2592804431915283, "rewards/margins": 0.6946033835411072, "rewards/rejected": 0.5646770596504211, "step": 5208 }, { "epoch": 0.85, "learning_rate": 8.671255680259451e-07, "logits/chosen": -1.1547198295593262, "logits/rejected": -1.0319969654083252, "logps/chosen": -134.00411987304688, "logps/rejected": -100.3084487915039, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 6.6247711181640625, "rewards/margins": 2.7278449535369873, "rewards/rejected": 3.896926164627075, "step": 5209 }, { "epoch": 0.85, "learning_rate": 8.670363335416317e-07, "logits/chosen": -0.5609054565429688, "logits/rejected": -0.5681553483009338, "logps/chosen": -91.5256118774414, "logps/rejected": -139.00477600097656, "loss": 1.6556, "rewards/accuracies": 0.0, "rewards/chosen": 4.040639400482178, "rewards/margins": -1.8502464294433594, "rewards/rejected": 5.890885829925537, "step": 5210 }, { "epoch": 0.85, "learning_rate": 8.669470736985808e-07, "logits/chosen": -0.7838582396507263, "logits/rejected": -0.7698495984077454, "logps/chosen": -65.92054748535156, "logps/rejected": -109.41409301757812, "loss": 1.3706, "rewards/accuracies": 0.0, "rewards/chosen": 1.8376693725585938, "rewards/margins": -2.3524703979492188, "rewards/rejected": 4.1901397705078125, "step": 5211 }, { "epoch": 0.85, "learning_rate": 8.66857788502959e-07, "logits/chosen": -0.3500935435295105, "logits/rejected": -0.2476274073123932, "logps/chosen": -120.51065063476562, "logps/rejected": -13.544998168945312, "loss": 0.7011, "rewards/accuracies": 1.0, "rewards/chosen": 5.271475315093994, "rewards/margins": 5.035312175750732, "rewards/rejected": 0.23616304993629456, "step": 5212 }, { "epoch": 0.85, "learning_rate": 8.667684779609355e-07, "logits/chosen": -0.7088294625282288, "logits/rejected": -0.7343457937240601, "logps/chosen": -68.93810272216797, "logps/rejected": -58.19514465332031, "loss": 0.5584, "rewards/accuracies": 0.0, "rewards/chosen": 2.4594719409942627, "rewards/margins": -0.3720238208770752, "rewards/rejected": 2.831495761871338, "step": 5213 }, { "epoch": 0.85, "learning_rate": 8.666791420786803e-07, "logits/chosen": -0.9101824164390564, "logits/rejected": -0.8481747508049011, "logps/chosen": -57.81591033935547, "logps/rejected": -36.459373474121094, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 1.2440407276153564, "rewards/margins": -0.3262920379638672, "rewards/rejected": 1.5703327655792236, "step": 5214 }, { "epoch": 0.85, "learning_rate": 8.665897808623661e-07, "logits/chosen": -0.9179816842079163, "logits/rejected": -0.8918626308441162, "logps/chosen": -66.15975189208984, "logps/rejected": -68.90242767333984, "loss": 1.2686, "rewards/accuracies": 0.0, "rewards/chosen": 2.0129172801971436, "rewards/margins": -1.1856842041015625, "rewards/rejected": 3.198601484298706, "step": 5215 }, { "epoch": 0.85, "learning_rate": 8.665003943181668e-07, "logits/chosen": -0.8537030816078186, "logits/rejected": -1.1997201442718506, "logps/chosen": -94.61205291748047, "logps/rejected": -36.045570373535156, "loss": 0.1345, "rewards/accuracies": 1.0, "rewards/chosen": 1.9272712469100952, "rewards/margins": 1.6511985063552856, "rewards/rejected": 0.2760727107524872, "step": 5216 }, { "epoch": 0.85, "learning_rate": 8.66410982452258e-07, "logits/chosen": -0.6090400815010071, "logits/rejected": -0.6090400815010071, "logps/chosen": -67.33602905273438, "logps/rejected": -67.33602905273438, "loss": 0.479, "rewards/accuracies": 0.0, "rewards/chosen": 0.8669281005859375, "rewards/margins": 0.0, "rewards/rejected": 0.8669281005859375, "step": 5217 }, { "epoch": 0.85, "learning_rate": 8.663215452708173e-07, "logits/chosen": -0.7681836485862732, "logits/rejected": -0.7067965865135193, "logps/chosen": -31.815771102905273, "logps/rejected": -73.86524963378906, "loss": 0.5845, "rewards/accuracies": 1.0, "rewards/chosen": 1.305065393447876, "rewards/margins": 0.022590041160583496, "rewards/rejected": 1.2824753522872925, "step": 5218 }, { "epoch": 0.85, "learning_rate": 8.66232082780024e-07, "logits/chosen": -0.17285825312137604, "logits/rejected": -0.18134965002536774, "logps/chosen": -10.125772476196289, "logps/rejected": -3.871411085128784, "loss": 0.8406, "rewards/accuracies": 0.0, "rewards/chosen": -0.08448686450719833, "rewards/margins": -0.39226898550987244, "rewards/rejected": 0.3077821135520935, "step": 5219 }, { "epoch": 0.85, "learning_rate": 8.661425949860591e-07, "logits/chosen": -0.7239682674407959, "logits/rejected": -0.5983996391296387, "logps/chosen": -68.09901428222656, "logps/rejected": -18.815345764160156, "loss": 0.3572, "rewards/accuracies": 1.0, "rewards/chosen": 1.7497535943984985, "rewards/margins": 1.4760351181030273, "rewards/rejected": 0.2737184464931488, "step": 5220 }, { "epoch": 0.85, "learning_rate": 8.660530818951055e-07, "logits/chosen": -0.7405584454536438, "logits/rejected": -0.5761022567749023, "logps/chosen": -132.85638427734375, "logps/rejected": -178.0291748046875, "loss": 1.3031, "rewards/accuracies": 0.0, "rewards/chosen": 0.9879013299942017, "rewards/margins": -2.3712663650512695, "rewards/rejected": 3.3591675758361816, "step": 5221 }, { "epoch": 0.85, "learning_rate": 8.659635435133474e-07, "logits/chosen": -0.5943928360939026, "logits/rejected": -0.595653772354126, "logps/chosen": -56.86067199707031, "logps/rejected": -87.33731079101562, "loss": 0.5933, "rewards/accuracies": 0.0, "rewards/chosen": 1.5417267084121704, "rewards/margins": -0.04201352596282959, "rewards/rejected": 1.583740234375, "step": 5222 }, { "epoch": 0.85, "learning_rate": 8.658739798469712e-07, "logits/chosen": -0.6525729894638062, "logits/rejected": -0.6121236085891724, "logps/chosen": -89.20704650878906, "logps/rejected": -100.6323471069336, "loss": 0.3405, "rewards/accuracies": 1.0, "rewards/chosen": 1.6594833135604858, "rewards/margins": 0.07194280624389648, "rewards/rejected": 1.5875405073165894, "step": 5223 }, { "epoch": 0.85, "learning_rate": 8.657843909021651e-07, "logits/chosen": -1.0635219812393188, "logits/rejected": -1.0314853191375732, "logps/chosen": -42.95552062988281, "logps/rejected": -115.16606903076172, "loss": 0.6305, "rewards/accuracies": 0.0, "rewards/chosen": 1.9889999628067017, "rewards/margins": -0.2609916925430298, "rewards/rejected": 2.2499916553497314, "step": 5224 }, { "epoch": 0.85, "learning_rate": 8.656947766851186e-07, "logits/chosen": -0.7933486700057983, "logits/rejected": -0.7253487706184387, "logps/chosen": -93.1839828491211, "logps/rejected": -58.66484069824219, "loss": 0.4304, "rewards/accuracies": 1.0, "rewards/chosen": 3.3113198280334473, "rewards/margins": 0.7106819152832031, "rewards/rejected": 2.600637912750244, "step": 5225 }, { "epoch": 0.85, "learning_rate": 8.656051372020232e-07, "logits/chosen": -0.7088838815689087, "logits/rejected": -0.5941128134727478, "logps/chosen": -74.89798736572266, "logps/rejected": -23.811702728271484, "loss": 1.1137, "rewards/accuracies": 1.0, "rewards/chosen": 1.0808089971542358, "rewards/margins": 1.1311131715774536, "rewards/rejected": -0.05030422285199165, "step": 5226 }, { "epoch": 0.85, "learning_rate": 8.655154724590722e-07, "logits/chosen": -0.3067421019077301, "logits/rejected": -0.24402841925621033, "logps/chosen": -106.56312561035156, "logps/rejected": -59.38768768310547, "loss": 0.8357, "rewards/accuracies": 0.0, "rewards/chosen": 0.41915589570999146, "rewards/margins": -1.2878289222717285, "rewards/rejected": 1.7069847583770752, "step": 5227 }, { "epoch": 0.85, "learning_rate": 8.654257824624607e-07, "logits/chosen": -0.46036431193351746, "logits/rejected": -0.403478741645813, "logps/chosen": -81.25281524658203, "logps/rejected": -81.31227111816406, "loss": 2.1886, "rewards/accuracies": 0.0, "rewards/chosen": 1.0605964660644531, "rewards/margins": -0.6114357709884644, "rewards/rejected": 1.6720322370529175, "step": 5228 }, { "epoch": 0.85, "learning_rate": 8.653360672183852e-07, "logits/chosen": -0.5554673671722412, "logits/rejected": -0.6055876612663269, "logps/chosen": -84.75505828857422, "logps/rejected": -85.05656433105469, "loss": 1.437, "rewards/accuracies": 0.0, "rewards/chosen": 0.8539161682128906, "rewards/margins": -0.7633827924728394, "rewards/rejected": 1.61729896068573, "step": 5229 }, { "epoch": 0.85, "learning_rate": 8.652463267330443e-07, "logits/chosen": -0.5637611150741577, "logits/rejected": -0.5637611150741577, "logps/chosen": -27.00797462463379, "logps/rejected": -27.00797462463379, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": 1.3895231485366821, "rewards/margins": 0.0, "rewards/rejected": 1.3895231485366821, "step": 5230 }, { "epoch": 0.85, "learning_rate": 8.651565610126383e-07, "logits/chosen": -0.3267815411090851, "logits/rejected": -0.19848273694515228, "logps/chosen": -55.28364944458008, "logps/rejected": -47.83081817626953, "loss": 0.4852, "rewards/accuracies": 1.0, "rewards/chosen": 1.9023128747940063, "rewards/margins": 0.8157341480255127, "rewards/rejected": 1.0865787267684937, "step": 5231 }, { "epoch": 0.85, "learning_rate": 8.650667700633691e-07, "logits/chosen": -0.48165425658226013, "logits/rejected": -0.5715295076370239, "logps/chosen": -85.35728454589844, "logps/rejected": -120.60543823242188, "loss": 1.8475, "rewards/accuracies": 0.0, "rewards/chosen": 2.0500779151916504, "rewards/margins": -2.261876106262207, "rewards/rejected": 4.311954021453857, "step": 5232 }, { "epoch": 0.85, "learning_rate": 8.649769538914405e-07, "logits/chosen": -0.9160503149032593, "logits/rejected": -0.9824215769767761, "logps/chosen": -100.8884048461914, "logps/rejected": -101.0362777709961, "loss": 1.1024, "rewards/accuracies": 0.0, "rewards/chosen": 1.6476119756698608, "rewards/margins": -1.6619499921798706, "rewards/rejected": 3.3095619678497314, "step": 5233 }, { "epoch": 0.85, "learning_rate": 8.648871125030574e-07, "logits/chosen": -0.9980381727218628, "logits/rejected": -0.9962877631187439, "logps/chosen": -105.4000473022461, "logps/rejected": -154.02197265625, "loss": 2.0935, "rewards/accuracies": 0.0, "rewards/chosen": 1.3746925592422485, "rewards/margins": -3.8200464248657227, "rewards/rejected": 5.194738864898682, "step": 5234 }, { "epoch": 0.85, "learning_rate": 8.647972459044278e-07, "logits/chosen": -0.8433145880699158, "logits/rejected": -0.6801074147224426, "logps/chosen": -306.2689208984375, "logps/rejected": -82.87429809570312, "loss": 0.3911, "rewards/accuracies": 1.0, "rewards/chosen": 4.191094875335693, "rewards/margins": 2.372882843017578, "rewards/rejected": 1.8182121515274048, "step": 5235 }, { "epoch": 0.85, "learning_rate": 8.647073541017603e-07, "logits/chosen": -0.8281293511390686, "logits/rejected": -0.7901745438575745, "logps/chosen": -88.43182373046875, "logps/rejected": -31.51348876953125, "loss": 0.2764, "rewards/accuracies": 1.0, "rewards/chosen": 0.8731674551963806, "rewards/margins": 0.4309837520122528, "rewards/rejected": 0.4421837031841278, "step": 5236 }, { "epoch": 0.85, "learning_rate": 8.646174371012652e-07, "logits/chosen": -0.8582638502120972, "logits/rejected": -0.8259085416793823, "logps/chosen": -139.18121337890625, "logps/rejected": -146.10302734375, "loss": 0.6693, "rewards/accuracies": 0.0, "rewards/chosen": 5.2106170654296875, "rewards/margins": -0.1384124755859375, "rewards/rejected": 5.349029541015625, "step": 5237 }, { "epoch": 0.85, "learning_rate": 8.645274949091554e-07, "logits/chosen": -0.7162202000617981, "logits/rejected": -0.6795183420181274, "logps/chosen": -122.0758056640625, "logps/rejected": -71.35475158691406, "loss": 0.0452, "rewards/accuracies": 1.0, "rewards/chosen": 4.015011787414551, "rewards/margins": 2.9285998344421387, "rewards/rejected": 1.0864120721817017, "step": 5238 }, { "epoch": 0.85, "learning_rate": 8.644375275316449e-07, "logits/chosen": -0.5849468111991882, "logits/rejected": -0.5700366497039795, "logps/chosen": -117.18241882324219, "logps/rejected": -182.74710083007812, "loss": 0.2915, "rewards/accuracies": 1.0, "rewards/chosen": 2.0202088356018066, "rewards/margins": 1.1299378871917725, "rewards/rejected": 0.890271008014679, "step": 5239 }, { "epoch": 0.85, "learning_rate": 8.643475349749496e-07, "logits/chosen": -0.5752274394035339, "logits/rejected": -0.538141667842865, "logps/chosen": -84.87318420410156, "logps/rejected": -6.7152419090271, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 2.505596876144409, "rewards/margins": 1.6392651796340942, "rewards/rejected": 0.8663316965103149, "step": 5240 }, { "epoch": 0.85, "learning_rate": 8.642575172452871e-07, "logits/chosen": -0.6000978946685791, "logits/rejected": -0.5883206129074097, "logps/chosen": -116.43526458740234, "logps/rejected": -125.11524963378906, "loss": 0.8022, "rewards/accuracies": 0.0, "rewards/chosen": 2.6358940601348877, "rewards/margins": -1.245948076248169, "rewards/rejected": 3.8818421363830566, "step": 5241 }, { "epoch": 0.85, "learning_rate": 8.641674743488769e-07, "logits/chosen": -0.8570443391799927, "logits/rejected": -0.8570443391799927, "logps/chosen": -83.72662353515625, "logps/rejected": -83.72662353515625, "loss": 0.4298, "rewards/accuracies": 0.0, "rewards/chosen": 3.5570099353790283, "rewards/margins": 0.0, "rewards/rejected": 3.5570099353790283, "step": 5242 }, { "epoch": 0.85, "learning_rate": 8.640774062919399e-07, "logits/chosen": -0.34031081199645996, "logits/rejected": -0.24734455347061157, "logps/chosen": -56.356040954589844, "logps/rejected": -5.674834728240967, "loss": 1.3507, "rewards/accuracies": 1.0, "rewards/chosen": 1.3215073347091675, "rewards/margins": 0.5660671591758728, "rewards/rejected": 0.7554401755332947, "step": 5243 }, { "epoch": 0.85, "learning_rate": 8.63987313080699e-07, "logits/chosen": -1.1576987504959106, "logits/rejected": -1.1427671909332275, "logps/chosen": -136.86703491210938, "logps/rejected": -72.68170928955078, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 4.689845561981201, "rewards/margins": 2.6115870475769043, "rewards/rejected": 2.078258514404297, "step": 5244 }, { "epoch": 0.85, "learning_rate": 8.638971947213789e-07, "logits/chosen": -0.610942006111145, "logits/rejected": -0.6651890277862549, "logps/chosen": -58.02922058105469, "logps/rejected": -127.04412841796875, "loss": 0.6849, "rewards/accuracies": 0.0, "rewards/chosen": 1.8246482610702515, "rewards/margins": -0.9186500310897827, "rewards/rejected": 2.743298292160034, "step": 5245 }, { "epoch": 0.85, "learning_rate": 8.638070512202059e-07, "logits/chosen": -0.7792425155639648, "logits/rejected": -0.6800054311752319, "logps/chosen": -179.63992309570312, "logps/rejected": -211.16490173339844, "loss": 2.5758, "rewards/accuracies": 0.0, "rewards/chosen": 4.98714017868042, "rewards/margins": -3.0548691749572754, "rewards/rejected": 8.042009353637695, "step": 5246 }, { "epoch": 0.85, "learning_rate": 8.63716882583408e-07, "logits/chosen": -0.35745999217033386, "logits/rejected": -0.3525850772857666, "logps/chosen": -43.36734390258789, "logps/rejected": -51.84612274169922, "loss": 0.7742, "rewards/accuracies": 0.0, "rewards/chosen": 1.8172531127929688, "rewards/margins": -0.11350476741790771, "rewards/rejected": 1.9307578802108765, "step": 5247 }, { "epoch": 0.85, "learning_rate": 8.63626688817215e-07, "logits/chosen": -0.1312878280878067, "logits/rejected": -0.1339966356754303, "logps/chosen": -3.3328373432159424, "logps/rejected": -1.8862531185150146, "loss": 0.699, "rewards/accuracies": 1.0, "rewards/chosen": 0.2515520751476288, "rewards/margins": 0.00815381109714508, "rewards/rejected": 0.2433982640504837, "step": 5248 }, { "epoch": 0.85, "learning_rate": 8.635364699278585e-07, "logits/chosen": -0.9133489727973938, "logits/rejected": -0.8290987014770508, "logps/chosen": -77.00467681884766, "logps/rejected": -12.643113136291504, "loss": 0.5758, "rewards/accuracies": 1.0, "rewards/chosen": 1.5107063055038452, "rewards/margins": 1.14410400390625, "rewards/rejected": 0.36660224199295044, "step": 5249 }, { "epoch": 0.85, "learning_rate": 8.634462259215718e-07, "logits/chosen": -0.5380246043205261, "logits/rejected": -0.5923379063606262, "logps/chosen": -74.69122314453125, "logps/rejected": -50.05569076538086, "loss": 0.8771, "rewards/accuracies": 0.0, "rewards/chosen": 1.209285020828247, "rewards/margins": -1.2106671333312988, "rewards/rejected": 2.419952154159546, "step": 5250 }, { "epoch": 0.85, "learning_rate": 8.633559568045898e-07, "logits/chosen": -0.8573489785194397, "logits/rejected": -0.8338238596916199, "logps/chosen": -81.3009033203125, "logps/rejected": -56.911224365234375, "loss": 0.4933, "rewards/accuracies": 0.0, "rewards/chosen": 1.9537209272384644, "rewards/margins": -0.12413251399993896, "rewards/rejected": 2.0778534412384033, "step": 5251 }, { "epoch": 0.85, "learning_rate": 8.632656625831494e-07, "logits/chosen": -0.5010104179382324, "logits/rejected": -0.25239092111587524, "logps/chosen": -127.79512786865234, "logps/rejected": -36.86702346801758, "loss": 0.167, "rewards/accuracies": 1.0, "rewards/chosen": 3.2605767250061035, "rewards/margins": 3.1218578815460205, "rewards/rejected": 0.13871879875659943, "step": 5252 }, { "epoch": 0.85, "learning_rate": 8.631753432634888e-07, "logits/chosen": -0.9843815565109253, "logits/rejected": -0.9521129131317139, "logps/chosen": -52.534568786621094, "logps/rejected": -20.046077728271484, "loss": 0.2572, "rewards/accuracies": 1.0, "rewards/chosen": 1.6374351978302002, "rewards/margins": 1.3361256122589111, "rewards/rejected": 0.30130958557128906, "step": 5253 }, { "epoch": 0.85, "learning_rate": 8.630849988518485e-07, "logits/chosen": -0.5197164416313171, "logits/rejected": -0.480965256690979, "logps/chosen": -58.84053039550781, "logps/rejected": -39.773162841796875, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 0.6901779174804688, "rewards/margins": 0.08449441194534302, "rewards/rejected": 0.6056835055351257, "step": 5254 }, { "epoch": 0.85, "learning_rate": 8.629946293544703e-07, "logits/chosen": -0.5689060091972351, "logits/rejected": -0.5058098435401917, "logps/chosen": -46.33122634887695, "logps/rejected": -18.12417984008789, "loss": 0.5701, "rewards/accuracies": 1.0, "rewards/chosen": 0.7121181488037109, "rewards/margins": 0.29583778977394104, "rewards/rejected": 0.4162803590297699, "step": 5255 }, { "epoch": 0.85, "learning_rate": 8.629042347775979e-07, "logits/chosen": -0.7981060147285461, "logits/rejected": -0.7173805832862854, "logps/chosen": -105.13380432128906, "logps/rejected": -18.988557815551758, "loss": 0.3279, "rewards/accuracies": 1.0, "rewards/chosen": 0.6139251589775085, "rewards/margins": 0.3372838795185089, "rewards/rejected": 0.27664127945899963, "step": 5256 }, { "epoch": 0.85, "learning_rate": 8.628138151274766e-07, "logits/chosen": -0.566041886806488, "logits/rejected": -0.2421291023492813, "logps/chosen": -183.49093627929688, "logps/rejected": -90.3416976928711, "loss": 0.3609, "rewards/accuracies": 1.0, "rewards/chosen": 3.5567474365234375, "rewards/margins": 0.15514373779296875, "rewards/rejected": 3.4016036987304688, "step": 5257 }, { "epoch": 0.85, "learning_rate": 8.627233704103537e-07, "logits/chosen": -0.3359888195991516, "logits/rejected": -0.4057784676551819, "logps/chosen": -89.64019775390625, "logps/rejected": -92.28591918945312, "loss": 0.8507, "rewards/accuracies": 0.0, "rewards/chosen": 0.9561347961425781, "rewards/margins": -0.19354021549224854, "rewards/rejected": 1.1496750116348267, "step": 5258 }, { "epoch": 0.85, "learning_rate": 8.62632900632478e-07, "logits/chosen": -0.9070210456848145, "logits/rejected": -0.9090564846992493, "logps/chosen": -104.32670593261719, "logps/rejected": -98.48472595214844, "loss": 0.2739, "rewards/accuracies": 1.0, "rewards/chosen": 1.746865153312683, "rewards/margins": 0.35540318489074707, "rewards/rejected": 1.391461968421936, "step": 5259 }, { "epoch": 0.85, "learning_rate": 8.625424058001003e-07, "logits/chosen": -0.4333938956260681, "logits/rejected": -0.31882259249687195, "logps/chosen": -65.70662689208984, "logps/rejected": -6.964911937713623, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 2.264556884765625, "rewards/margins": 1.7506675720214844, "rewards/rejected": 0.5138893723487854, "step": 5260 }, { "epoch": 0.85, "learning_rate": 8.624518859194726e-07, "logits/chosen": -0.6113063097000122, "logits/rejected": -0.6340588331222534, "logps/chosen": -54.65837478637695, "logps/rejected": -68.27592468261719, "loss": 0.8674, "rewards/accuracies": 0.0, "rewards/chosen": 1.7495311498641968, "rewards/margins": -0.0034519433975219727, "rewards/rejected": 1.7529830932617188, "step": 5261 }, { "epoch": 0.85, "learning_rate": 8.623613409968491e-07, "logits/chosen": -0.6432130932807922, "logits/rejected": -0.6394885182380676, "logps/chosen": -45.109134674072266, "logps/rejected": -142.18324279785156, "loss": 0.5684, "rewards/accuracies": 1.0, "rewards/chosen": 1.9934123754501343, "rewards/margins": 1.4638454914093018, "rewards/rejected": 0.5295669436454773, "step": 5262 }, { "epoch": 0.85, "learning_rate": 8.622707710384857e-07, "logits/chosen": -0.8209388256072998, "logits/rejected": -0.8547759056091309, "logps/chosen": -96.8200454711914, "logps/rejected": -111.90190887451172, "loss": 0.9727, "rewards/accuracies": 0.0, "rewards/chosen": 1.3442176580429077, "rewards/margins": -0.2716156244277954, "rewards/rejected": 1.6158332824707031, "step": 5263 }, { "epoch": 0.85, "learning_rate": 8.621801760506399e-07, "logits/chosen": -0.3930773138999939, "logits/rejected": -0.4133386015892029, "logps/chosen": -71.98857879638672, "logps/rejected": -57.17302703857422, "loss": 1.1852, "rewards/accuracies": 0.0, "rewards/chosen": 0.8640152215957642, "rewards/margins": -1.833666205406189, "rewards/rejected": 2.697681427001953, "step": 5264 }, { "epoch": 0.85, "learning_rate": 8.62089556039571e-07, "logits/chosen": -0.7449696063995361, "logits/rejected": -0.6872117519378662, "logps/chosen": -67.65336608886719, "logps/rejected": -44.858097076416016, "loss": 0.6183, "rewards/accuracies": 0.0, "rewards/chosen": 1.2824302911758423, "rewards/margins": -0.7584201097488403, "rewards/rejected": 2.0408504009246826, "step": 5265 }, { "epoch": 0.85, "learning_rate": 8.619989110115398e-07, "logits/chosen": -0.863929271697998, "logits/rejected": -0.8764048218727112, "logps/chosen": -75.77727508544922, "logps/rejected": -83.66996765136719, "loss": 0.609, "rewards/accuracies": 0.0, "rewards/chosen": 2.3383920192718506, "rewards/margins": -0.10739350318908691, "rewards/rejected": 2.4457855224609375, "step": 5266 }, { "epoch": 0.85, "learning_rate": 8.619082409728091e-07, "logits/chosen": -0.3692912757396698, "logits/rejected": -0.34215569496154785, "logps/chosen": -48.93152618408203, "logps/rejected": -19.625524520874023, "loss": 0.4218, "rewards/accuracies": 0.0, "rewards/chosen": 0.2991745173931122, "rewards/margins": -0.1737830936908722, "rewards/rejected": 0.4729576110839844, "step": 5267 }, { "epoch": 0.86, "learning_rate": 8.618175459296433e-07, "logits/chosen": -0.4811421036720276, "logits/rejected": -0.3564959764480591, "logps/chosen": -64.81391906738281, "logps/rejected": -90.87108612060547, "loss": 0.5971, "rewards/accuracies": 1.0, "rewards/chosen": 2.1925034523010254, "rewards/margins": 0.09333038330078125, "rewards/rejected": 2.099173069000244, "step": 5268 }, { "epoch": 0.86, "learning_rate": 8.617268258883088e-07, "logits/chosen": -0.5330022573471069, "logits/rejected": -0.39818790555000305, "logps/chosen": -119.9729995727539, "logps/rejected": -66.15634155273438, "loss": 0.2011, "rewards/accuracies": 1.0, "rewards/chosen": 2.994560956954956, "rewards/margins": 0.7194464206695557, "rewards/rejected": 2.2751145362854004, "step": 5269 }, { "epoch": 0.86, "learning_rate": 8.616360808550731e-07, "logits/chosen": -0.5561515688896179, "logits/rejected": -0.5306019186973572, "logps/chosen": -116.7741470336914, "logps/rejected": -96.46662902832031, "loss": 0.8258, "rewards/accuracies": 0.0, "rewards/chosen": 1.4368736743927002, "rewards/margins": -0.31095731258392334, "rewards/rejected": 1.7478309869766235, "step": 5270 }, { "epoch": 0.86, "learning_rate": 8.615453108362063e-07, "logits/chosen": -0.6411500573158264, "logits/rejected": -0.6056186556816101, "logps/chosen": -70.97300720214844, "logps/rejected": -124.19380187988281, "loss": 0.3508, "rewards/accuracies": 1.0, "rewards/chosen": 1.5421829223632812, "rewards/margins": 0.7082534432411194, "rewards/rejected": 0.8339294791221619, "step": 5271 }, { "epoch": 0.86, "learning_rate": 8.614545158379792e-07, "logits/chosen": -0.8074954152107239, "logits/rejected": -0.7897719144821167, "logps/chosen": -75.89325714111328, "logps/rejected": -86.73953247070312, "loss": 0.1689, "rewards/accuracies": 1.0, "rewards/chosen": 1.4859421253204346, "rewards/margins": 0.9654815793037415, "rewards/rejected": 0.5204605460166931, "step": 5272 }, { "epoch": 0.86, "learning_rate": 8.613636958666653e-07, "logits/chosen": -0.7827582359313965, "logits/rejected": -0.8323636054992676, "logps/chosen": -79.33056640625, "logps/rejected": -181.20919799804688, "loss": 2.4379, "rewards/accuracies": 0.0, "rewards/chosen": 1.9812653064727783, "rewards/margins": -3.958305597305298, "rewards/rejected": 5.939570903778076, "step": 5273 }, { "epoch": 0.86, "learning_rate": 8.612728509285394e-07, "logits/chosen": -0.9250276684761047, "logits/rejected": -0.9802561402320862, "logps/chosen": -109.7746353149414, "logps/rejected": -114.00240325927734, "loss": 3.0961, "rewards/accuracies": 0.0, "rewards/chosen": 2.2781760692596436, "rewards/margins": -4.221107482910156, "rewards/rejected": 6.499283790588379, "step": 5274 }, { "epoch": 0.86, "learning_rate": 8.611819810298777e-07, "logits/chosen": -0.804994523525238, "logits/rejected": -0.8162356615066528, "logps/chosen": -201.10086059570312, "logps/rejected": -203.22312927246094, "loss": 0.6448, "rewards/accuracies": 0.0, "rewards/chosen": 4.028832912445068, "rewards/margins": -0.5917940139770508, "rewards/rejected": 4.620626926422119, "step": 5275 }, { "epoch": 0.86, "learning_rate": 8.610910861769588e-07, "logits/chosen": -0.6394565105438232, "logits/rejected": -0.6855877041816711, "logps/chosen": -152.62001037597656, "logps/rejected": -132.70050048828125, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": 2.783451795578003, "rewards/margins": 2.070185661315918, "rewards/rejected": 0.7132660150527954, "step": 5276 }, { "epoch": 0.86, "learning_rate": 8.610001663760625e-07, "logits/chosen": -0.5370798707008362, "logits/rejected": -0.4917536675930023, "logps/chosen": -44.38814926147461, "logps/rejected": -9.248480796813965, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 1.4948681592941284, "rewards/margins": 0.6576656103134155, "rewards/rejected": 0.8372025489807129, "step": 5277 }, { "epoch": 0.86, "learning_rate": 8.609092216334704e-07, "logits/chosen": -0.8965833187103271, "logits/rejected": -0.87433260679245, "logps/chosen": -42.842628479003906, "logps/rejected": -20.232666015625, "loss": 0.8233, "rewards/accuracies": 1.0, "rewards/chosen": 1.1099700927734375, "rewards/margins": 0.7544761896133423, "rewards/rejected": 0.3554939329624176, "step": 5278 }, { "epoch": 0.86, "learning_rate": 8.608182519554661e-07, "logits/chosen": -0.5236578583717346, "logits/rejected": -0.5374467968940735, "logps/chosen": -58.72789001464844, "logps/rejected": -38.80724334716797, "loss": 0.7825, "rewards/accuracies": 0.0, "rewards/chosen": 1.6406738758087158, "rewards/margins": -1.0457854270935059, "rewards/rejected": 2.6864593029022217, "step": 5279 }, { "epoch": 0.86, "learning_rate": 8.607272573483347e-07, "logits/chosen": -0.7280599474906921, "logits/rejected": -0.7339015007019043, "logps/chosen": -107.32777404785156, "logps/rejected": -53.241294860839844, "loss": 0.5727, "rewards/accuracies": 0.0, "rewards/chosen": 0.975909411907196, "rewards/margins": -0.7602348923683167, "rewards/rejected": 1.7361443042755127, "step": 5280 }, { "epoch": 0.86, "learning_rate": 8.606362378183631e-07, "logits/chosen": -0.749592661857605, "logits/rejected": -0.6886864900588989, "logps/chosen": -59.428348541259766, "logps/rejected": -33.42135238647461, "loss": 1.1053, "rewards/accuracies": 0.0, "rewards/chosen": 1.059355616569519, "rewards/margins": -0.5319151878356934, "rewards/rejected": 1.5912708044052124, "step": 5281 }, { "epoch": 0.86, "learning_rate": 8.605451933718397e-07, "logits/chosen": -0.3550596237182617, "logits/rejected": -0.3384188711643219, "logps/chosen": -14.682814598083496, "logps/rejected": -19.106536865234375, "loss": 0.3261, "rewards/accuracies": 1.0, "rewards/chosen": 0.6402302980422974, "rewards/margins": 0.4160231947898865, "rewards/rejected": 0.22420711815357208, "step": 5282 }, { "epoch": 0.86, "learning_rate": 8.604541240150551e-07, "logits/chosen": -0.6098260879516602, "logits/rejected": -0.5815427899360657, "logps/chosen": -2.4591457843780518, "logps/rejected": -40.11415481567383, "loss": 0.6276, "rewards/accuracies": 1.0, "rewards/chosen": 0.4785288870334625, "rewards/margins": 0.47839614748954773, "rewards/rejected": 0.00013275146193336695, "step": 5283 }, { "epoch": 0.86, "learning_rate": 8.603630297543011e-07, "logits/chosen": -0.490565687417984, "logits/rejected": -0.5235247611999512, "logps/chosen": -54.59889221191406, "logps/rejected": -32.22718811035156, "loss": 1.4487, "rewards/accuracies": 1.0, "rewards/chosen": 0.771405041217804, "rewards/margins": 0.04834061861038208, "rewards/rejected": 0.7230644226074219, "step": 5284 }, { "epoch": 0.86, "learning_rate": 8.602719105958716e-07, "logits/chosen": -0.138437420129776, "logits/rejected": -0.138437420129776, "logps/chosen": -72.18885040283203, "logps/rejected": -72.18885040283203, "loss": 0.583, "rewards/accuracies": 0.0, "rewards/chosen": 0.6191818118095398, "rewards/margins": 0.0, "rewards/rejected": 0.6191818118095398, "step": 5285 }, { "epoch": 0.86, "learning_rate": 8.601807665460619e-07, "logits/chosen": -0.28058603405952454, "logits/rejected": -0.3574922978878021, "logps/chosen": -116.10263061523438, "logps/rejected": -87.36125183105469, "loss": 0.5856, "rewards/accuracies": 1.0, "rewards/chosen": 1.7557815313339233, "rewards/margins": 1.1164238452911377, "rewards/rejected": 0.6393577456474304, "step": 5286 }, { "epoch": 0.86, "learning_rate": 8.600895976111694e-07, "logits/chosen": -0.7404811382293701, "logits/rejected": -0.754082202911377, "logps/chosen": -57.031776428222656, "logps/rejected": -119.5224380493164, "loss": 0.9563, "rewards/accuracies": 1.0, "rewards/chosen": 2.810422658920288, "rewards/margins": 1.2810624837875366, "rewards/rejected": 1.5293601751327515, "step": 5287 }, { "epoch": 0.86, "learning_rate": 8.599984037974928e-07, "logits/chosen": -0.7738829851150513, "logits/rejected": -1.1886440515518188, "logps/chosen": -70.59935760498047, "logps/rejected": -36.778358459472656, "loss": 0.7467, "rewards/accuracies": 1.0, "rewards/chosen": 1.325679063796997, "rewards/margins": 1.0354244709014893, "rewards/rejected": 0.2902545928955078, "step": 5288 }, { "epoch": 0.86, "learning_rate": 8.599071851113329e-07, "logits/chosen": -0.4322184920310974, "logits/rejected": -0.4322184920310974, "logps/chosen": -80.18944549560547, "logps/rejected": -80.18944549560547, "loss": 0.6804, "rewards/accuracies": 0.0, "rewards/chosen": 1.848345160484314, "rewards/margins": 0.0, "rewards/rejected": 1.848345160484314, "step": 5289 }, { "epoch": 0.86, "learning_rate": 8.598159415589919e-07, "logits/chosen": -0.6369786858558655, "logits/rejected": -0.49792489409446716, "logps/chosen": -98.1038818359375, "logps/rejected": -52.8929443359375, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 0.4352912902832031, "rewards/margins": 0.27425307035446167, "rewards/rejected": 0.16103820502758026, "step": 5290 }, { "epoch": 0.86, "learning_rate": 8.59724673146774e-07, "logits/chosen": -0.8557021021842957, "logits/rejected": -1.2037196159362793, "logps/chosen": -99.29486083984375, "logps/rejected": -33.688018798828125, "loss": 1.2262, "rewards/accuracies": 1.0, "rewards/chosen": 1.4710403680801392, "rewards/margins": 1.233567476272583, "rewards/rejected": 0.23747292160987854, "step": 5291 }, { "epoch": 0.86, "learning_rate": 8.59633379880985e-07, "logits/chosen": -0.6720677018165588, "logits/rejected": -0.6488562226295471, "logps/chosen": -81.32901000976562, "logps/rejected": -94.02742004394531, "loss": 0.4188, "rewards/accuracies": 1.0, "rewards/chosen": 2.5075995922088623, "rewards/margins": 0.4584357738494873, "rewards/rejected": 2.049163818359375, "step": 5292 }, { "epoch": 0.86, "learning_rate": 8.595420617679323e-07, "logits/chosen": -0.6753042936325073, "logits/rejected": -0.6753042936325073, "logps/chosen": -70.48387145996094, "logps/rejected": -70.48387145996094, "loss": 0.4279, "rewards/accuracies": 0.0, "rewards/chosen": 1.8024123907089233, "rewards/margins": 0.0, "rewards/rejected": 1.8024123907089233, "step": 5293 }, { "epoch": 0.86, "learning_rate": 8.59450718813925e-07, "logits/chosen": -0.751582145690918, "logits/rejected": -0.6121399998664856, "logps/chosen": -168.5843963623047, "logps/rejected": -97.43858337402344, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 5.521356105804443, "rewards/margins": 0.9005246162414551, "rewards/rejected": 4.620831489562988, "step": 5294 }, { "epoch": 0.86, "learning_rate": 8.593593510252744e-07, "logits/chosen": -0.8823960423469543, "logits/rejected": -0.7198048830032349, "logps/chosen": -145.6865234375, "logps/rejected": -83.66065979003906, "loss": 0.0696, "rewards/accuracies": 1.0, "rewards/chosen": 5.616302490234375, "rewards/margins": 2.143874406814575, "rewards/rejected": 3.4724280834198, "step": 5295 }, { "epoch": 0.86, "learning_rate": 8.59267958408293e-07, "logits/chosen": -0.6729986667633057, "logits/rejected": -0.6375029683113098, "logps/chosen": -156.91171264648438, "logps/rejected": -59.33910369873047, "loss": 0.9544, "rewards/accuracies": 0.0, "rewards/chosen": 0.11295624077320099, "rewards/margins": -1.6307563781738281, "rewards/rejected": 1.7437126636505127, "step": 5296 }, { "epoch": 0.86, "learning_rate": 8.591765409692948e-07, "logits/chosen": -0.8407966494560242, "logits/rejected": -0.8407966494560242, "logps/chosen": -71.6771240234375, "logps/rejected": -71.6771240234375, "loss": 1.8381, "rewards/accuracies": 0.0, "rewards/chosen": 1.2010787725448608, "rewards/margins": 0.0, "rewards/rejected": 1.2010787725448608, "step": 5297 }, { "epoch": 0.86, "learning_rate": 8.590850987145964e-07, "logits/chosen": -0.7458480596542358, "logits/rejected": -0.7286396026611328, "logps/chosen": -56.55049133300781, "logps/rejected": -99.29103088378906, "loss": 0.359, "rewards/accuracies": 1.0, "rewards/chosen": 1.5032669305801392, "rewards/margins": 0.9248520135879517, "rewards/rejected": 0.5784149169921875, "step": 5298 }, { "epoch": 0.86, "learning_rate": 8.589936316505153e-07, "logits/chosen": -0.9627741575241089, "logits/rejected": -0.3210005462169647, "logps/chosen": -54.262969970703125, "logps/rejected": -69.93092346191406, "loss": 1.2327, "rewards/accuracies": 0.0, "rewards/chosen": 2.112319231033325, "rewards/margins": -1.9344208240509033, "rewards/rejected": 4.0467400550842285, "step": 5299 }, { "epoch": 0.86, "learning_rate": 8.589021397833711e-07, "logits/chosen": -0.7740428447723389, "logits/rejected": -0.6997621059417725, "logps/chosen": -143.55909729003906, "logps/rejected": -44.7429313659668, "loss": 0.1687, "rewards/accuracies": 1.0, "rewards/chosen": 1.3091232776641846, "rewards/margins": 1.0541714429855347, "rewards/rejected": 0.2549518644809723, "step": 5300 }, { "epoch": 0.86, "learning_rate": 8.588106231194849e-07, "logits/chosen": -0.46447962522506714, "logits/rejected": -0.4346000552177429, "logps/chosen": -64.30680847167969, "logps/rejected": -100.67120361328125, "loss": 0.6148, "rewards/accuracies": 0.0, "rewards/chosen": 0.32922670245170593, "rewards/margins": -0.8751403093338013, "rewards/rejected": 1.2043670415878296, "step": 5301 }, { "epoch": 0.86, "learning_rate": 8.5871908166518e-07, "logits/chosen": -0.7133513689041138, "logits/rejected": -0.7250279188156128, "logps/chosen": -5.225543975830078, "logps/rejected": -1.4175175428390503, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 0.15386973321437836, "rewards/margins": -0.24777217209339142, "rewards/rejected": 0.4016419053077698, "step": 5302 }, { "epoch": 0.86, "learning_rate": 8.586275154267805e-07, "logits/chosen": -0.7879065871238708, "logits/rejected": -0.7095571756362915, "logps/chosen": -86.68595886230469, "logps/rejected": -86.21375274658203, "loss": 0.7274, "rewards/accuracies": 0.0, "rewards/chosen": 1.6432678699493408, "rewards/margins": -0.09350049495697021, "rewards/rejected": 1.736768364906311, "step": 5303 }, { "epoch": 0.86, "learning_rate": 8.585359244106131e-07, "logits/chosen": -0.5222119092941284, "logits/rejected": -0.5186212062835693, "logps/chosen": -11.803537368774414, "logps/rejected": -19.74329948425293, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 0.6749847531318665, "rewards/margins": 0.39307841658592224, "rewards/rejected": 0.2819063365459442, "step": 5304 }, { "epoch": 0.86, "learning_rate": 8.584443086230059e-07, "logits/chosen": -0.7797009348869324, "logits/rejected": -0.7045045495033264, "logps/chosen": -128.6711883544922, "logps/rejected": -61.484100341796875, "loss": 0.407, "rewards/accuracies": 0.0, "rewards/chosen": 1.5230239629745483, "rewards/margins": -0.21302568912506104, "rewards/rejected": 1.7360496520996094, "step": 5305 }, { "epoch": 0.86, "learning_rate": 8.583526680702886e-07, "logits/chosen": -0.8806314468383789, "logits/rejected": -0.7702022790908813, "logps/chosen": -87.09283447265625, "logps/rejected": -33.73367691040039, "loss": 0.3337, "rewards/accuracies": 1.0, "rewards/chosen": 1.26421058177948, "rewards/margins": 0.10477066040039062, "rewards/rejected": 1.1594399213790894, "step": 5306 }, { "epoch": 0.86, "learning_rate": 8.582610027587926e-07, "logits/chosen": -1.054431438446045, "logits/rejected": -0.9965324997901917, "logps/chosen": -101.07620239257812, "logps/rejected": -89.51808166503906, "loss": 0.467, "rewards/accuracies": 0.0, "rewards/chosen": 1.4691040515899658, "rewards/margins": -0.4295135736465454, "rewards/rejected": 1.8986176252365112, "step": 5307 }, { "epoch": 0.86, "learning_rate": 8.581693126948513e-07, "logits/chosen": -0.793910801410675, "logits/rejected": -0.7700042128562927, "logps/chosen": -86.68637084960938, "logps/rejected": -110.65901184082031, "loss": 0.8734, "rewards/accuracies": 0.0, "rewards/chosen": 2.1402480602264404, "rewards/margins": -1.23130202293396, "rewards/rejected": 3.3715500831604004, "step": 5308 }, { "epoch": 0.86, "learning_rate": 8.580775978847996e-07, "logits/chosen": -1.642478108406067, "logits/rejected": -1.4559561014175415, "logps/chosen": -86.85346984863281, "logps/rejected": -195.0477294921875, "loss": 1.9633, "rewards/accuracies": 0.0, "rewards/chosen": 2.40580153465271, "rewards/margins": -2.4257614612579346, "rewards/rejected": 4.8315629959106445, "step": 5309 }, { "epoch": 0.86, "learning_rate": 8.579858583349739e-07, "logits/chosen": -0.692533016204834, "logits/rejected": -0.596852719783783, "logps/chosen": -103.9476318359375, "logps/rejected": -57.89301300048828, "loss": 0.6889, "rewards/accuracies": 0.0, "rewards/chosen": 1.4597870111465454, "rewards/margins": -0.9059349298477173, "rewards/rejected": 2.3657219409942627, "step": 5310 }, { "epoch": 0.86, "learning_rate": 8.578940940517128e-07, "logits/chosen": -0.7941668629646301, "logits/rejected": -0.7409205436706543, "logps/chosen": -93.13980102539062, "logps/rejected": -63.36300277709961, "loss": 1.2583, "rewards/accuracies": 0.0, "rewards/chosen": 1.339599609375, "rewards/margins": -1.081827163696289, "rewards/rejected": 2.421426773071289, "step": 5311 }, { "epoch": 0.86, "learning_rate": 8.578023050413561e-07, "logits/chosen": -0.9618138670921326, "logits/rejected": -0.9185134172439575, "logps/chosen": -53.12430953979492, "logps/rejected": -73.90644836425781, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 2.4151723384857178, "rewards/margins": 0.36455273628234863, "rewards/rejected": 2.050619602203369, "step": 5312 }, { "epoch": 0.86, "learning_rate": 8.577104913102457e-07, "logits/chosen": -0.6897863745689392, "logits/rejected": -0.6633854508399963, "logps/chosen": -85.63690185546875, "logps/rejected": -70.7281723022461, "loss": 0.2901, "rewards/accuracies": 1.0, "rewards/chosen": 3.3156845569610596, "rewards/margins": 0.5665695667266846, "rewards/rejected": 2.749114990234375, "step": 5313 }, { "epoch": 0.86, "learning_rate": 8.576186528647252e-07, "logits/chosen": -0.6208271384239197, "logits/rejected": -0.6293493509292603, "logps/chosen": -121.2166519165039, "logps/rejected": -51.34019088745117, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 1.3926597833633423, "rewards/margins": 0.04375958442687988, "rewards/rejected": 1.3489001989364624, "step": 5314 }, { "epoch": 0.86, "learning_rate": 8.575267897111396e-07, "logits/chosen": -0.456522136926651, "logits/rejected": -0.4253320097923279, "logps/chosen": -299.2268371582031, "logps/rejected": -40.560157775878906, "loss": 0.3194, "rewards/accuracies": 1.0, "rewards/chosen": 2.560373067855835, "rewards/margins": 1.0170925855636597, "rewards/rejected": 1.5432804822921753, "step": 5315 }, { "epoch": 0.86, "learning_rate": 8.574349018558355e-07, "logits/chosen": -0.7319825291633606, "logits/rejected": -0.6867315173149109, "logps/chosen": -78.58895111083984, "logps/rejected": -18.72775650024414, "loss": 0.4613, "rewards/accuracies": 1.0, "rewards/chosen": 1.0705947875976562, "rewards/margins": 0.9553912878036499, "rewards/rejected": 0.11520347744226456, "step": 5316 }, { "epoch": 0.86, "learning_rate": 8.573429893051621e-07, "logits/chosen": -0.7905938625335693, "logits/rejected": -0.7475926280021667, "logps/chosen": -99.7008056640625, "logps/rejected": -117.7451400756836, "loss": 0.6632, "rewards/accuracies": 1.0, "rewards/chosen": 2.0539932250976562, "rewards/margins": 2.637326717376709, "rewards/rejected": -0.5833336114883423, "step": 5317 }, { "epoch": 0.86, "learning_rate": 8.572510520654691e-07, "logits/chosen": -0.617148220539093, "logits/rejected": -0.57206130027771, "logps/chosen": -77.81338500976562, "logps/rejected": -143.50723266601562, "loss": 1.0422, "rewards/accuracies": 0.0, "rewards/chosen": 2.836839437484741, "rewards/margins": -1.4529478549957275, "rewards/rejected": 4.289787292480469, "step": 5318 }, { "epoch": 0.86, "learning_rate": 8.571590901431089e-07, "logits/chosen": -0.7214798927307129, "logits/rejected": -0.7624989151954651, "logps/chosen": -101.17337036132812, "logps/rejected": -112.23389434814453, "loss": 0.8724, "rewards/accuracies": 1.0, "rewards/chosen": 0.17844466865062714, "rewards/margins": 0.39345091581344604, "rewards/rejected": -0.2150062620639801, "step": 5319 }, { "epoch": 0.86, "learning_rate": 8.57067103544435e-07, "logits/chosen": -0.8847667574882507, "logits/rejected": -0.8077260851860046, "logps/chosen": -113.62067413330078, "logps/rejected": -83.04310607910156, "loss": 0.9214, "rewards/accuracies": 0.0, "rewards/chosen": 1.9211968183517456, "rewards/margins": -0.19579851627349854, "rewards/rejected": 2.116995334625244, "step": 5320 }, { "epoch": 0.86, "learning_rate": 8.569750922758028e-07, "logits/chosen": -0.5802399516105652, "logits/rejected": -0.5901471376419067, "logps/chosen": -26.868276596069336, "logps/rejected": -25.921001434326172, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.30461961030960083, "rewards/margins": -0.2095733880996704, "rewards/rejected": 0.5141929984092712, "step": 5321 }, { "epoch": 0.86, "learning_rate": 8.568830563435694e-07, "logits/chosen": -0.7601569294929504, "logits/rejected": -0.7496477365493774, "logps/chosen": -129.2796630859375, "logps/rejected": -155.2671661376953, "loss": 0.2614, "rewards/accuracies": 1.0, "rewards/chosen": 3.89202880859375, "rewards/margins": 2.746420383453369, "rewards/rejected": 1.1456085443496704, "step": 5322 }, { "epoch": 0.86, "learning_rate": 8.567909957540938e-07, "logits/chosen": 0.002313733333721757, "logits/rejected": 0.002313733333721757, "logps/chosen": -6.5459208488464355, "logps/rejected": -6.5459208488464355, "loss": 0.7503, "rewards/accuracies": 0.0, "rewards/chosen": 0.14580416679382324, "rewards/margins": 0.0, "rewards/rejected": 0.14580416679382324, "step": 5323 }, { "epoch": 0.86, "learning_rate": 8.566989105137363e-07, "logits/chosen": -0.8257032632827759, "logits/rejected": -0.7798962593078613, "logps/chosen": -118.64149475097656, "logps/rejected": -215.82363891601562, "loss": 0.9352, "rewards/accuracies": 0.0, "rewards/chosen": 4.184378147125244, "rewards/margins": -1.1797089576721191, "rewards/rejected": 5.364087104797363, "step": 5324 }, { "epoch": 0.86, "learning_rate": 8.566068006288592e-07, "logits/chosen": -0.706783652305603, "logits/rejected": -0.667396605014801, "logps/chosen": -40.68906021118164, "logps/rejected": -17.072853088378906, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 0.6310993432998657, "rewards/margins": 0.191867858171463, "rewards/rejected": 0.4392314851284027, "step": 5325 }, { "epoch": 0.86, "learning_rate": 8.565146661058264e-07, "logits/chosen": -0.5244043469429016, "logits/rejected": -0.47760993242263794, "logps/chosen": -48.83720397949219, "logps/rejected": -146.17276000976562, "loss": 0.1146, "rewards/accuracies": 1.0, "rewards/chosen": 1.6271148920059204, "rewards/margins": 1.4114700555801392, "rewards/rejected": 0.21564483642578125, "step": 5326 }, { "epoch": 0.86, "learning_rate": 8.564225069510036e-07, "logits/chosen": -0.6102621555328369, "logits/rejected": -0.5515122413635254, "logps/chosen": -43.260902404785156, "logps/rejected": -100.44213104248047, "loss": 0.5781, "rewards/accuracies": 0.0, "rewards/chosen": 2.0654563903808594, "rewards/margins": -0.20342421531677246, "rewards/rejected": 2.268880605697632, "step": 5327 }, { "epoch": 0.86, "learning_rate": 8.563303231707582e-07, "logits/chosen": -0.6579280495643616, "logits/rejected": -0.6018546223640442, "logps/chosen": -92.12937927246094, "logps/rejected": -46.71954345703125, "loss": 0.2378, "rewards/accuracies": 1.0, "rewards/chosen": 0.8994255065917969, "rewards/margins": 0.7414695620536804, "rewards/rejected": 0.15795592963695526, "step": 5328 }, { "epoch": 0.86, "learning_rate": 8.562381147714588e-07, "logits/chosen": -0.7074919939041138, "logits/rejected": -0.6765870451927185, "logps/chosen": -165.24737548828125, "logps/rejected": -154.89132690429688, "loss": 1.7766, "rewards/accuracies": 0.0, "rewards/chosen": 3.190173387527466, "rewards/margins": -2.471696615219116, "rewards/rejected": 5.661870002746582, "step": 5329 }, { "epoch": 0.87, "learning_rate": 8.561458817594767e-07, "logits/chosen": -0.9173550009727478, "logits/rejected": -0.8059900999069214, "logps/chosen": -105.20906066894531, "logps/rejected": -66.67485046386719, "loss": 1.5309, "rewards/accuracies": 0.0, "rewards/chosen": 0.4832214415073395, "rewards/margins": -0.9936302900314331, "rewards/rejected": 1.4768517017364502, "step": 5330 }, { "epoch": 0.87, "learning_rate": 8.560536241411838e-07, "logits/chosen": -0.7881993055343628, "logits/rejected": -0.7024562954902649, "logps/chosen": -91.03404235839844, "logps/rejected": -110.38310241699219, "loss": 0.1091, "rewards/accuracies": 1.0, "rewards/chosen": 3.3865585327148438, "rewards/margins": 1.4986785650253296, "rewards/rejected": 1.8878799676895142, "step": 5331 }, { "epoch": 0.87, "learning_rate": 8.559613419229547e-07, "logits/chosen": -0.6896489262580872, "logits/rejected": -0.7512449622154236, "logps/chosen": -85.50870513916016, "logps/rejected": -110.4730224609375, "loss": 2.0332, "rewards/accuracies": 0.0, "rewards/chosen": 2.095876455307007, "rewards/margins": -3.0593421459198, "rewards/rejected": 5.155218601226807, "step": 5332 }, { "epoch": 0.87, "learning_rate": 8.55869035111165e-07, "logits/chosen": -0.6259944438934326, "logits/rejected": -0.47165751457214355, "logps/chosen": -91.32244873046875, "logps/rejected": -86.8138656616211, "loss": 0.4628, "rewards/accuracies": 1.0, "rewards/chosen": 1.3234176635742188, "rewards/margins": 0.3244636654853821, "rewards/rejected": 0.9989539980888367, "step": 5333 }, { "epoch": 0.87, "learning_rate": 8.557767037121921e-07, "logits/chosen": -0.3638029992580414, "logits/rejected": -0.4150001108646393, "logps/chosen": -68.79586029052734, "logps/rejected": -150.38223266601562, "loss": 0.8243, "rewards/accuracies": 1.0, "rewards/chosen": 0.9950904846191406, "rewards/margins": 0.13579785823822021, "rewards/rejected": 0.8592926263809204, "step": 5334 }, { "epoch": 0.87, "learning_rate": 8.556843477324154e-07, "logits/chosen": -0.9180962443351746, "logits/rejected": -0.9728335738182068, "logps/chosen": -171.29299926757812, "logps/rejected": -106.38848876953125, "loss": 1.0131, "rewards/accuracies": 0.0, "rewards/chosen": 2.6772706508636475, "rewards/margins": -1.2990446090698242, "rewards/rejected": 3.9763152599334717, "step": 5335 }, { "epoch": 0.87, "learning_rate": 8.555919671782159e-07, "logits/chosen": -0.1255345344543457, "logits/rejected": -0.18044035136699677, "logps/chosen": -71.59345245361328, "logps/rejected": -117.53327941894531, "loss": 0.9474, "rewards/accuracies": 1.0, "rewards/chosen": -0.034458160400390625, "rewards/margins": 0.1053466796875, "rewards/rejected": -0.13980484008789062, "step": 5336 }, { "epoch": 0.87, "learning_rate": 8.55499562055976e-07, "logits/chosen": -0.5312909483909607, "logits/rejected": -0.5276418924331665, "logps/chosen": -99.60771179199219, "logps/rejected": -58.61960983276367, "loss": 0.7744, "rewards/accuracies": 0.0, "rewards/chosen": 1.06097412109375, "rewards/margins": -1.2018795013427734, "rewards/rejected": 2.2628536224365234, "step": 5337 }, { "epoch": 0.87, "learning_rate": 8.554071323720802e-07, "logits/chosen": -0.2075500637292862, "logits/rejected": -0.21951861679553986, "logps/chosen": -70.31770324707031, "logps/rejected": -70.38656616210938, "loss": 0.7974, "rewards/accuracies": 0.0, "rewards/chosen": 1.2313750982284546, "rewards/margins": -0.26373910903930664, "rewards/rejected": 1.4951142072677612, "step": 5338 }, { "epoch": 0.87, "learning_rate": 8.553146781329144e-07, "logits/chosen": -0.4920831024646759, "logits/rejected": -0.4246356785297394, "logps/chosen": -79.76651000976562, "logps/rejected": -65.8777847290039, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": 1.4660691022872925, "rewards/margins": 0.6761650443077087, "rewards/rejected": 0.7899040579795837, "step": 5339 }, { "epoch": 0.87, "learning_rate": 8.552221993448663e-07, "logits/chosen": -0.6087902784347534, "logits/rejected": -0.5475093722343445, "logps/chosen": -51.406673431396484, "logps/rejected": -67.13304138183594, "loss": 0.4107, "rewards/accuracies": 1.0, "rewards/chosen": 1.5975910425186157, "rewards/margins": 0.7432575225830078, "rewards/rejected": 0.8543335199356079, "step": 5340 }, { "epoch": 0.87, "learning_rate": 8.551296960143255e-07, "logits/chosen": -0.5810094475746155, "logits/rejected": -0.5547971725463867, "logps/chosen": -108.17988586425781, "logps/rejected": -119.73681640625, "loss": 1.0232, "rewards/accuracies": 1.0, "rewards/chosen": 1.0552581548690796, "rewards/margins": 0.5691405534744263, "rewards/rejected": 0.48611757159233093, "step": 5341 }, { "epoch": 0.87, "learning_rate": 8.550371681476829e-07, "logits/chosen": -0.04352155327796936, "logits/rejected": -0.051743242889642715, "logps/chosen": -5.533833026885986, "logps/rejected": -1.1081331968307495, "loss": 1.3378, "rewards/accuracies": 0.0, "rewards/chosen": 0.20531049370765686, "rewards/margins": -0.05166143178939819, "rewards/rejected": 0.25697192549705505, "step": 5342 }, { "epoch": 0.87, "learning_rate": 8.549446157513314e-07, "logits/chosen": -0.6832929849624634, "logits/rejected": -0.12661336362361908, "logps/chosen": -98.25723266601562, "logps/rejected": -84.41926574707031, "loss": 0.6485, "rewards/accuracies": 0.0, "rewards/chosen": 1.2428909540176392, "rewards/margins": -0.7495849132537842, "rewards/rejected": 1.9924758672714233, "step": 5343 }, { "epoch": 0.87, "learning_rate": 8.548520388316654e-07, "logits/chosen": -0.5657644271850586, "logits/rejected": -0.5574235916137695, "logps/chosen": -137.7726287841797, "logps/rejected": -78.12405395507812, "loss": 1.5169, "rewards/accuracies": 1.0, "rewards/chosen": 3.430128574371338, "rewards/margins": 0.99395751953125, "rewards/rejected": 2.436171054840088, "step": 5344 }, { "epoch": 0.87, "learning_rate": 8.547594373950812e-07, "logits/chosen": -0.516606867313385, "logits/rejected": -0.48025333881378174, "logps/chosen": -55.45458984375, "logps/rejected": -52.9228515625, "loss": 0.8631, "rewards/accuracies": 1.0, "rewards/chosen": 1.3368866443634033, "rewards/margins": 0.23165631294250488, "rewards/rejected": 1.1052303314208984, "step": 5345 }, { "epoch": 0.87, "learning_rate": 8.546668114479767e-07, "logits/chosen": -0.8187195658683777, "logits/rejected": -0.7460362911224365, "logps/chosen": -78.67618560791016, "logps/rejected": -85.51949310302734, "loss": 1.089, "rewards/accuracies": 0.0, "rewards/chosen": 1.3005386590957642, "rewards/margins": -1.227857232093811, "rewards/rejected": 2.528395891189575, "step": 5346 }, { "epoch": 0.87, "learning_rate": 8.545741609967514e-07, "logits/chosen": -0.9439184069633484, "logits/rejected": -0.92708420753479, "logps/chosen": -55.88444137573242, "logps/rejected": -60.81214141845703, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 0.6124340295791626, "rewards/margins": 0.3230537474155426, "rewards/rejected": 0.28938028216362, "step": 5347 }, { "epoch": 0.87, "learning_rate": 8.544814860478064e-07, "logits/chosen": -0.6346178650856018, "logits/rejected": -0.6711943745613098, "logps/chosen": -41.22222900390625, "logps/rejected": -140.34661865234375, "loss": 1.8015, "rewards/accuracies": 0.0, "rewards/chosen": 1.5643341541290283, "rewards/margins": -3.5103609561920166, "rewards/rejected": 5.074695110321045, "step": 5348 }, { "epoch": 0.87, "learning_rate": 8.543887866075451e-07, "logits/chosen": -0.5423294305801392, "logits/rejected": -0.48491647839546204, "logps/chosen": -74.77713012695312, "logps/rejected": -82.29151916503906, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 0.8999771475791931, "rewards/margins": 0.10355758666992188, "rewards/rejected": 0.7964195609092712, "step": 5349 }, { "epoch": 0.87, "learning_rate": 8.542960626823719e-07, "logits/chosen": -1.0065045356750488, "logits/rejected": -0.909182071685791, "logps/chosen": -81.53738403320312, "logps/rejected": -96.46855926513672, "loss": 0.9373, "rewards/accuracies": 0.0, "rewards/chosen": 3.4360427856445312, "rewards/margins": -1.602689266204834, "rewards/rejected": 5.038732051849365, "step": 5350 }, { "epoch": 0.87, "learning_rate": 8.54203314278693e-07, "logits/chosen": -0.5884824991226196, "logits/rejected": -0.5355722904205322, "logps/chosen": -75.49649810791016, "logps/rejected": -65.52447509765625, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 2.1545181274414062, "rewards/margins": 0.4581352472305298, "rewards/rejected": 1.6963828802108765, "step": 5351 }, { "epoch": 0.87, "learning_rate": 8.541105414029167e-07, "logits/chosen": -0.529686689376831, "logits/rejected": -0.5036787390708923, "logps/chosen": -102.73338317871094, "logps/rejected": -104.26502990722656, "loss": 0.5227, "rewards/accuracies": 1.0, "rewards/chosen": 2.939012289047241, "rewards/margins": 0.8913865089416504, "rewards/rejected": 2.047625780105591, "step": 5352 }, { "epoch": 0.87, "learning_rate": 8.540177440614524e-07, "logits/chosen": -0.8429690003395081, "logits/rejected": -0.8104357123374939, "logps/chosen": -60.42683410644531, "logps/rejected": -23.4056453704834, "loss": 0.5208, "rewards/accuracies": 0.0, "rewards/chosen": 0.32030031085014343, "rewards/margins": -0.16540944576263428, "rewards/rejected": 0.4857097566127777, "step": 5353 }, { "epoch": 0.87, "learning_rate": 8.539249222607118e-07, "logits/chosen": -0.8692446947097778, "logits/rejected": -0.6799120306968689, "logps/chosen": -93.28296661376953, "logps/rejected": -75.03295135498047, "loss": 0.7515, "rewards/accuracies": 1.0, "rewards/chosen": 1.8097076416015625, "rewards/margins": 0.2906752824783325, "rewards/rejected": 1.51903235912323, "step": 5354 }, { "epoch": 0.87, "learning_rate": 8.538320760071082e-07, "logits/chosen": -0.10524995625019073, "logits/rejected": -0.07170005142688751, "logps/chosen": -43.20246124267578, "logps/rejected": -12.645973205566406, "loss": 0.9959, "rewards/accuracies": 1.0, "rewards/chosen": 1.390283226966858, "rewards/margins": 0.7083431482315063, "rewards/rejected": 0.6819400787353516, "step": 5355 }, { "epoch": 0.87, "learning_rate": 8.53739205307056e-07, "logits/chosen": -0.527396023273468, "logits/rejected": -0.49962183833122253, "logps/chosen": -140.83859252929688, "logps/rejected": -113.43600463867188, "loss": 1.7409, "rewards/accuracies": 0.0, "rewards/chosen": 1.3555710315704346, "rewards/margins": -3.4352705478668213, "rewards/rejected": 4.790841579437256, "step": 5356 }, { "epoch": 0.87, "learning_rate": 8.536463101669717e-07, "logits/chosen": -1.0666788816452026, "logits/rejected": -1.041203498840332, "logps/chosen": -96.048095703125, "logps/rejected": -123.84414672851562, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 1.1791619062423706, "rewards/margins": 0.11191487312316895, "rewards/rejected": 1.0672470331192017, "step": 5357 }, { "epoch": 0.87, "learning_rate": 8.535533905932737e-07, "logits/chosen": -0.6017662882804871, "logits/rejected": -0.617641806602478, "logps/chosen": -153.15554809570312, "logps/rejected": -101.9434814453125, "loss": 1.3303, "rewards/accuracies": 0.0, "rewards/chosen": 1.696783423423767, "rewards/margins": -0.06691217422485352, "rewards/rejected": 1.7636955976486206, "step": 5358 }, { "epoch": 0.87, "learning_rate": 8.534604465923819e-07, "logits/chosen": -0.1489107459783554, "logits/rejected": -0.15322241187095642, "logps/chosen": -2.394582748413086, "logps/rejected": -33.00730895996094, "loss": 0.386, "rewards/accuracies": 0.0, "rewards/chosen": 0.13816924393177032, "rewards/margins": -0.06166978180408478, "rewards/rejected": 0.1998390257358551, "step": 5359 }, { "epoch": 0.87, "learning_rate": 8.533674781707174e-07, "logits/chosen": -0.29633957147598267, "logits/rejected": -0.29633957147598267, "logps/chosen": -36.859703063964844, "logps/rejected": -36.859703063964844, "loss": 1.0619, "rewards/accuracies": 0.0, "rewards/chosen": 0.908502995967865, "rewards/margins": 0.0, "rewards/rejected": 0.908502995967865, "step": 5360 }, { "epoch": 0.87, "learning_rate": 8.53274485334704e-07, "logits/chosen": -0.7184022665023804, "logits/rejected": -0.6968209147453308, "logps/chosen": -64.2500228881836, "logps/rejected": -101.16740417480469, "loss": 3.1372, "rewards/accuracies": 0.0, "rewards/chosen": 2.182421922683716, "rewards/margins": -0.21187281608581543, "rewards/rejected": 2.3942947387695312, "step": 5361 }, { "epoch": 0.87, "learning_rate": 8.531814680907663e-07, "logits/chosen": -0.6984775066375732, "logits/rejected": -0.6385866403579712, "logps/chosen": -211.28575134277344, "logps/rejected": -114.72749328613281, "loss": 0.3388, "rewards/accuracies": 1.0, "rewards/chosen": 5.479719638824463, "rewards/margins": 0.8954453468322754, "rewards/rejected": 4.5842742919921875, "step": 5362 }, { "epoch": 0.87, "learning_rate": 8.530884264453309e-07, "logits/chosen": -0.49809736013412476, "logits/rejected": -0.4765488803386688, "logps/chosen": -88.688720703125, "logps/rejected": -51.23128128051758, "loss": 0.6316, "rewards/accuracies": 0.0, "rewards/chosen": 2.5765862464904785, "rewards/margins": -0.5842769145965576, "rewards/rejected": 3.160863161087036, "step": 5363 }, { "epoch": 0.87, "learning_rate": 8.529953604048263e-07, "logits/chosen": -0.6841424107551575, "logits/rejected": -0.608241081237793, "logps/chosen": -101.92231750488281, "logps/rejected": -167.61151123046875, "loss": 0.6928, "rewards/accuracies": 0.0, "rewards/chosen": 1.3864631652832031, "rewards/margins": -0.5828789472579956, "rewards/rejected": 1.9693421125411987, "step": 5364 }, { "epoch": 0.87, "learning_rate": 8.529022699756825e-07, "logits/chosen": -0.3474096953868866, "logits/rejected": -0.5377217531204224, "logps/chosen": -67.85161590576172, "logps/rejected": -96.41341400146484, "loss": 2.2573, "rewards/accuracies": 0.0, "rewards/chosen": 0.8630569577217102, "rewards/margins": -3.3406870365142822, "rewards/rejected": 4.203743934631348, "step": 5365 }, { "epoch": 0.87, "learning_rate": 8.528091551643308e-07, "logits/chosen": -0.7396233677864075, "logits/rejected": -0.7416204214096069, "logps/chosen": -87.78821563720703, "logps/rejected": -60.96748352050781, "loss": 0.6236, "rewards/accuracies": 1.0, "rewards/chosen": 2.0855071544647217, "rewards/margins": 0.19549548625946045, "rewards/rejected": 1.8900116682052612, "step": 5366 }, { "epoch": 0.87, "learning_rate": 8.527160159772048e-07, "logits/chosen": -0.7320246696472168, "logits/rejected": -0.6323938369750977, "logps/chosen": -91.72541809082031, "logps/rejected": -61.15765380859375, "loss": 0.3563, "rewards/accuracies": 1.0, "rewards/chosen": 5.068572998046875, "rewards/margins": 3.765106201171875, "rewards/rejected": 1.303466796875, "step": 5367 }, { "epoch": 0.87, "learning_rate": 8.526228524207396e-07, "logits/chosen": -0.7517744898796082, "logits/rejected": -0.7484531998634338, "logps/chosen": -84.43645477294922, "logps/rejected": -52.441986083984375, "loss": 1.4668, "rewards/accuracies": 0.0, "rewards/chosen": 0.987567126750946, "rewards/margins": -0.862322986125946, "rewards/rejected": 1.849890112876892, "step": 5368 }, { "epoch": 0.87, "learning_rate": 8.525296645013718e-07, "logits/chosen": -0.46835801005363464, "logits/rejected": -0.46835801005363464, "logps/chosen": -30.43307113647461, "logps/rejected": -30.43307113647461, "loss": 0.7831, "rewards/accuracies": 0.0, "rewards/chosen": 1.3956245183944702, "rewards/margins": 0.0, "rewards/rejected": 1.3956245183944702, "step": 5369 }, { "epoch": 0.87, "learning_rate": 8.5243645222554e-07, "logits/chosen": -0.8883786797523499, "logits/rejected": -0.861314058303833, "logps/chosen": -213.361083984375, "logps/rejected": -52.15650939941406, "loss": 0.2506, "rewards/accuracies": 1.0, "rewards/chosen": 4.0538649559021, "rewards/margins": 1.6762464046478271, "rewards/rejected": 2.3776185512542725, "step": 5370 }, { "epoch": 0.87, "learning_rate": 8.523432155996839e-07, "logits/chosen": -0.5186131000518799, "logits/rejected": -0.41600289940834045, "logps/chosen": -58.38706970214844, "logps/rejected": -52.43168640136719, "loss": 0.3833, "rewards/accuracies": 0.0, "rewards/chosen": 1.3816741704940796, "rewards/margins": -0.024411797523498535, "rewards/rejected": 1.4060859680175781, "step": 5371 }, { "epoch": 0.87, "learning_rate": 8.522499546302458e-07, "logits/chosen": -0.8089120388031006, "logits/rejected": -0.8165579438209534, "logps/chosen": -56.08283233642578, "logps/rejected": -108.05239868164062, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 1.714678168296814, "rewards/margins": 1.7355964183807373, "rewards/rejected": -0.02091827429831028, "step": 5372 }, { "epoch": 0.87, "learning_rate": 8.521566693236686e-07, "logits/chosen": -0.6883517503738403, "logits/rejected": -0.6309478282928467, "logps/chosen": -128.46612548828125, "logps/rejected": -73.98507690429688, "loss": 0.5129, "rewards/accuracies": 1.0, "rewards/chosen": 1.6411163806915283, "rewards/margins": 0.25957417488098145, "rewards/rejected": 1.3815422058105469, "step": 5373 }, { "epoch": 0.87, "learning_rate": 8.520633596863977e-07, "logits/chosen": -0.16825507581233978, "logits/rejected": -0.1760530322790146, "logps/chosen": -37.91520309448242, "logps/rejected": -17.45441436767578, "loss": 0.5699, "rewards/accuracies": 1.0, "rewards/chosen": 0.3630050718784332, "rewards/margins": 0.3603973388671875, "rewards/rejected": 0.002607727190479636, "step": 5374 }, { "epoch": 0.87, "learning_rate": 8.519700257248801e-07, "logits/chosen": -0.6006345748901367, "logits/rejected": -0.5186365246772766, "logps/chosen": -88.90013885498047, "logps/rejected": -40.15345764160156, "loss": 0.5701, "rewards/accuracies": 1.0, "rewards/chosen": 0.9839454889297485, "rewards/margins": 0.8758819699287415, "rewards/rejected": 0.10806351155042648, "step": 5375 }, { "epoch": 0.87, "learning_rate": 8.51876667445564e-07, "logits/chosen": -0.7062026262283325, "logits/rejected": -0.6428342461585999, "logps/chosen": -43.01862335205078, "logps/rejected": -108.66299438476562, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 2.6549980640411377, "rewards/margins": 0.25604772567749023, "rewards/rejected": 2.3989503383636475, "step": 5376 }, { "epoch": 0.87, "learning_rate": 8.517832848548996e-07, "logits/chosen": -0.6483217477798462, "logits/rejected": -0.6625648736953735, "logps/chosen": -91.81456756591797, "logps/rejected": -133.8785400390625, "loss": 0.6898, "rewards/accuracies": 0.0, "rewards/chosen": 1.6960258483886719, "rewards/margins": -0.18592000007629395, "rewards/rejected": 1.8819458484649658, "step": 5377 }, { "epoch": 0.87, "learning_rate": 8.516898779593389e-07, "logits/chosen": -0.4465731680393219, "logits/rejected": -0.4465731680393219, "logps/chosen": -13.341715812683105, "logps/rejected": -13.341715812683105, "loss": 0.3581, "rewards/accuracies": 0.0, "rewards/chosen": 0.46852657198905945, "rewards/margins": 0.0, "rewards/rejected": 0.46852657198905945, "step": 5378 }, { "epoch": 0.87, "learning_rate": 8.515964467653353e-07, "logits/chosen": -0.580438494682312, "logits/rejected": -0.5409891605377197, "logps/chosen": -40.93737030029297, "logps/rejected": -63.568111419677734, "loss": 0.5718, "rewards/accuracies": 1.0, "rewards/chosen": 1.6572974920272827, "rewards/margins": 0.03387022018432617, "rewards/rejected": 1.6234272718429565, "step": 5379 }, { "epoch": 0.87, "learning_rate": 8.515029912793441e-07, "logits/chosen": -0.23277956247329712, "logits/rejected": -0.23583610355854034, "logps/chosen": -11.613690376281738, "logps/rejected": -2.396390199661255, "loss": 0.7582, "rewards/accuracies": 0.0, "rewards/chosen": -0.05975313112139702, "rewards/margins": -0.3614385426044464, "rewards/rejected": 0.3016854226589203, "step": 5380 }, { "epoch": 0.87, "learning_rate": 8.514095115078222e-07, "logits/chosen": -0.6663712859153748, "logits/rejected": -0.6707223057746887, "logps/chosen": -104.87065124511719, "logps/rejected": -84.27891540527344, "loss": 1.0085, "rewards/accuracies": 0.0, "rewards/chosen": 0.9852356314659119, "rewards/margins": -1.7298681735992432, "rewards/rejected": 2.7151038646698, "step": 5381 }, { "epoch": 0.87, "learning_rate": 8.513160074572279e-07, "logits/chosen": -0.5781019926071167, "logits/rejected": -0.30968502163887024, "logps/chosen": -121.08834838867188, "logps/rejected": -35.347442626953125, "loss": 0.6657, "rewards/accuracies": 1.0, "rewards/chosen": 4.364802837371826, "rewards/margins": 3.371302366256714, "rewards/rejected": 0.9935005307197571, "step": 5382 }, { "epoch": 0.87, "learning_rate": 8.512224791340219e-07, "logits/chosen": -1.1669936180114746, "logits/rejected": -0.9604730010032654, "logps/chosen": -159.37380981445312, "logps/rejected": -110.21148681640625, "loss": 0.086, "rewards/accuracies": 1.0, "rewards/chosen": 5.1404709815979, "rewards/margins": 1.79819655418396, "rewards/rejected": 3.3422744274139404, "step": 5383 }, { "epoch": 0.87, "learning_rate": 8.511289265446657e-07, "logits/chosen": -0.7091510891914368, "logits/rejected": -0.7387747168540955, "logps/chosen": -130.624267578125, "logps/rejected": -138.07887268066406, "loss": 0.6143, "rewards/accuracies": 0.0, "rewards/chosen": 5.608769416809082, "rewards/margins": -0.10437297821044922, "rewards/rejected": 5.713142395019531, "step": 5384 }, { "epoch": 0.87, "learning_rate": 8.510353496956233e-07, "logits/chosen": -0.7044998407363892, "logits/rejected": -0.6354609727859497, "logps/chosen": -97.32351684570312, "logps/rejected": -91.76469421386719, "loss": 0.4285, "rewards/accuracies": 0.0, "rewards/chosen": 1.8379043340682983, "rewards/margins": -0.282368540763855, "rewards/rejected": 2.1202728748321533, "step": 5385 }, { "epoch": 0.87, "learning_rate": 8.509417485933596e-07, "logits/chosen": -0.8630544543266296, "logits/rejected": -0.7576080560684204, "logps/chosen": -146.31210327148438, "logps/rejected": -74.33889770507812, "loss": 1.0033, "rewards/accuracies": 0.0, "rewards/chosen": 1.0919373035430908, "rewards/margins": -0.8544524908065796, "rewards/rejected": 1.9463897943496704, "step": 5386 }, { "epoch": 0.87, "learning_rate": 8.508481232443418e-07, "logits/chosen": -0.9389362335205078, "logits/rejected": -0.8455124497413635, "logps/chosen": -180.205322265625, "logps/rejected": -113.99404907226562, "loss": 0.1289, "rewards/accuracies": 1.0, "rewards/chosen": 6.578210353851318, "rewards/margins": 1.2421202659606934, "rewards/rejected": 5.336090087890625, "step": 5387 }, { "epoch": 0.87, "learning_rate": 8.507544736550385e-07, "logits/chosen": -0.6500735282897949, "logits/rejected": -0.7088717222213745, "logps/chosen": -59.819496154785156, "logps/rejected": -90.99928283691406, "loss": 1.5439, "rewards/accuracies": 0.0, "rewards/chosen": 0.790130615234375, "rewards/margins": -2.1331100463867188, "rewards/rejected": 2.9232406616210938, "step": 5388 }, { "epoch": 0.87, "learning_rate": 8.506607998319197e-07, "logits/chosen": -0.5407500863075256, "logits/rejected": -0.5093182325363159, "logps/chosen": -69.00160217285156, "logps/rejected": -58.24641799926758, "loss": 0.8284, "rewards/accuracies": 0.0, "rewards/chosen": 1.482934594154358, "rewards/margins": -1.1548961400985718, "rewards/rejected": 2.6378307342529297, "step": 5389 }, { "epoch": 0.87, "learning_rate": 8.505671017814579e-07, "logits/chosen": -0.9632672667503357, "logits/rejected": -0.9633117914199829, "logps/chosen": -66.36431884765625, "logps/rejected": -66.69767761230469, "loss": 0.5316, "rewards/accuracies": 0.0, "rewards/chosen": 1.4021835327148438, "rewards/margins": -0.27366483211517334, "rewards/rejected": 1.675848364830017, "step": 5390 }, { "epoch": 0.88, "learning_rate": 8.504733795101263e-07, "logits/chosen": -0.9384174346923828, "logits/rejected": -1.2279497385025024, "logps/chosen": -113.32855987548828, "logps/rejected": -35.371612548828125, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 1.9704277515411377, "rewards/margins": 1.643170952796936, "rewards/rejected": 0.3272567689418793, "step": 5391 }, { "epoch": 0.88, "learning_rate": 8.503796330244004e-07, "logits/chosen": -0.5907899737358093, "logits/rejected": -0.5166576504707336, "logps/chosen": -133.03814697265625, "logps/rejected": -122.07282257080078, "loss": 0.5011, "rewards/accuracies": 0.0, "rewards/chosen": 3.1667847633361816, "rewards/margins": -0.4194984436035156, "rewards/rejected": 3.5862832069396973, "step": 5392 }, { "epoch": 0.88, "learning_rate": 8.502858623307573e-07, "logits/chosen": -0.3697117865085602, "logits/rejected": -0.3548964858055115, "logps/chosen": -85.12879180908203, "logps/rejected": -48.09482192993164, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 1.1818519830703735, "rewards/margins": -0.6943302154541016, "rewards/rejected": 1.876182198524475, "step": 5393 }, { "epoch": 0.88, "learning_rate": 8.501920674356754e-07, "logits/chosen": -0.5124500393867493, "logits/rejected": -0.3961709141731262, "logps/chosen": -63.32168197631836, "logps/rejected": -44.756553649902344, "loss": 0.9924, "rewards/accuracies": 0.0, "rewards/chosen": 1.262755274772644, "rewards/margins": -1.4284473657608032, "rewards/rejected": 2.6912026405334473, "step": 5394 }, { "epoch": 0.88, "learning_rate": 8.500982483456352e-07, "logits/chosen": -0.8546194434165955, "logits/rejected": -0.8809264898300171, "logps/chosen": -162.781005859375, "logps/rejected": -148.023681640625, "loss": 0.9808, "rewards/accuracies": 1.0, "rewards/chosen": 3.6329009532928467, "rewards/margins": 0.2206573486328125, "rewards/rejected": 3.412243604660034, "step": 5395 }, { "epoch": 0.88, "learning_rate": 8.500044050671187e-07, "logits/chosen": -0.5910775661468506, "logits/rejected": -0.6319929957389832, "logps/chosen": -88.51054382324219, "logps/rejected": -94.6500244140625, "loss": 1.0289, "rewards/accuracies": 0.0, "rewards/chosen": -0.3495895564556122, "rewards/margins": -1.5876785516738892, "rewards/rejected": 1.2380889654159546, "step": 5396 }, { "epoch": 0.88, "learning_rate": 8.499105376066096e-07, "logits/chosen": -0.857812225818634, "logits/rejected": -0.5469016432762146, "logps/chosen": -72.37828826904297, "logps/rejected": -237.40966796875, "loss": 2.6929, "rewards/accuracies": 0.0, "rewards/chosen": 0.7200683951377869, "rewards/margins": -3.8868408203125, "rewards/rejected": 4.606909275054932, "step": 5397 }, { "epoch": 0.88, "learning_rate": 8.498166459705934e-07, "logits/chosen": -0.5392111539840698, "logits/rejected": -0.3865295350551605, "logps/chosen": -75.75540161132812, "logps/rejected": -16.309268951416016, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 2.8756134510040283, "rewards/margins": 2.7392852306365967, "rewards/rejected": 0.13632832467556, "step": 5398 }, { "epoch": 0.88, "learning_rate": 8.497227301655567e-07, "logits/chosen": -0.8362840414047241, "logits/rejected": -0.6201229691505432, "logps/chosen": -102.74798583984375, "logps/rejected": -94.67326354980469, "loss": 0.6898, "rewards/accuracies": 0.0, "rewards/chosen": 3.9237823486328125, "rewards/margins": -0.39908456802368164, "rewards/rejected": 4.322866916656494, "step": 5399 }, { "epoch": 0.88, "learning_rate": 8.496287901979886e-07, "logits/chosen": -0.6784198880195618, "logits/rejected": -0.700474202632904, "logps/chosen": -155.6063690185547, "logps/rejected": -54.59757614135742, "loss": 0.7328, "rewards/accuracies": 0.0, "rewards/chosen": 1.5488296747207642, "rewards/margins": -0.1527881622314453, "rewards/rejected": 1.7016178369522095, "step": 5400 }, { "epoch": 0.88, "learning_rate": 8.495348260743793e-07, "logits/chosen": -0.24494585394859314, "logits/rejected": -0.14363905787467957, "logps/chosen": -60.41217041015625, "logps/rejected": -66.43379974365234, "loss": 0.2791, "rewards/accuracies": 1.0, "rewards/chosen": 2.3675262928009033, "rewards/margins": 0.987396240234375, "rewards/rejected": 1.3801300525665283, "step": 5401 }, { "epoch": 0.88, "learning_rate": 8.494408378012207e-07, "logits/chosen": -0.5090531706809998, "logits/rejected": -0.4496375322341919, "logps/chosen": -36.84960174560547, "logps/rejected": -44.326045989990234, "loss": 0.2744, "rewards/accuracies": 1.0, "rewards/chosen": 1.1316193342208862, "rewards/margins": 0.36082690954208374, "rewards/rejected": 0.7707924246788025, "step": 5402 }, { "epoch": 0.88, "learning_rate": 8.49346825385007e-07, "logits/chosen": -0.4569675624370575, "logits/rejected": -0.41144704818725586, "logps/chosen": -103.6832504272461, "logps/rejected": -133.78329467773438, "loss": 0.449, "rewards/accuracies": 0.0, "rewards/chosen": 0.6955215334892273, "rewards/margins": -0.044102489948272705, "rewards/rejected": 0.7396240234375, "step": 5403 }, { "epoch": 0.88, "learning_rate": 8.49252788832233e-07, "logits/chosen": -0.6057631373405457, "logits/rejected": -0.5502293109893799, "logps/chosen": -80.76178741455078, "logps/rejected": -59.7205810546875, "loss": 0.2953, "rewards/accuracies": 1.0, "rewards/chosen": 1.777227759361267, "rewards/margins": 1.1820625066757202, "rewards/rejected": 0.5951652526855469, "step": 5404 }, { "epoch": 0.88, "learning_rate": 8.491587281493959e-07, "logits/chosen": -0.45395174622535706, "logits/rejected": -0.4499571621417999, "logps/chosen": -90.5106201171875, "logps/rejected": -66.88613891601562, "loss": 0.5527, "rewards/accuracies": 0.0, "rewards/chosen": 0.3406212031841278, "rewards/margins": -0.4387046992778778, "rewards/rejected": 0.7793259024620056, "step": 5405 }, { "epoch": 0.88, "learning_rate": 8.490646433429945e-07, "logits/chosen": -0.6422744393348694, "logits/rejected": -0.48079466819763184, "logps/chosen": -140.62364196777344, "logps/rejected": -43.1743278503418, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 4.597941875457764, "rewards/margins": 2.4924123287200928, "rewards/rejected": 2.105529546737671, "step": 5406 }, { "epoch": 0.88, "learning_rate": 8.489705344195291e-07, "logits/chosen": -0.5292526483535767, "logits/rejected": -0.6229236125946045, "logps/chosen": -50.81877136230469, "logps/rejected": -91.12359619140625, "loss": 0.5833, "rewards/accuracies": 0.0, "rewards/chosen": 1.5818451642990112, "rewards/margins": -0.7793716192245483, "rewards/rejected": 2.3612167835235596, "step": 5407 }, { "epoch": 0.88, "learning_rate": 8.488764013855018e-07, "logits/chosen": -0.5096881985664368, "logits/rejected": -0.5311230421066284, "logps/chosen": -89.41614532470703, "logps/rejected": -50.923057556152344, "loss": 0.4294, "rewards/accuracies": 1.0, "rewards/chosen": 2.236598253250122, "rewards/margins": 0.30160295963287354, "rewards/rejected": 1.9349952936172485, "step": 5408 }, { "epoch": 0.88, "learning_rate": 8.487822442474162e-07, "logits/chosen": -0.8279446363449097, "logits/rejected": -0.6881506443023682, "logps/chosen": -118.12105560302734, "logps/rejected": -60.636024475097656, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 5.33461856842041, "rewards/margins": 4.614591598510742, "rewards/rejected": 0.7200271487236023, "step": 5409 }, { "epoch": 0.88, "learning_rate": 8.486880630117779e-07, "logits/chosen": -0.2754637598991394, "logits/rejected": -0.2754637598991394, "logps/chosen": -45.442203521728516, "logps/rejected": -45.442203521728516, "loss": 0.3959, "rewards/accuracies": 0.0, "rewards/chosen": 1.5433815717697144, "rewards/margins": 0.0, "rewards/rejected": 1.5433815717697144, "step": 5410 }, { "epoch": 0.88, "learning_rate": 8.485938576850936e-07, "logits/chosen": -0.79440838098526, "logits/rejected": -0.8370364904403687, "logps/chosen": -67.53495788574219, "logps/rejected": -107.3460693359375, "loss": 0.3245, "rewards/accuracies": 1.0, "rewards/chosen": 1.0210922956466675, "rewards/margins": 0.18325048685073853, "rewards/rejected": 0.837841808795929, "step": 5411 }, { "epoch": 0.88, "learning_rate": 8.484996282738722e-07, "logits/chosen": -0.5551645159721375, "logits/rejected": -0.43109193444252014, "logps/chosen": -88.72491455078125, "logps/rejected": -38.1417350769043, "loss": 0.2199, "rewards/accuracies": 1.0, "rewards/chosen": 1.540810465812683, "rewards/margins": 1.3972523212432861, "rewards/rejected": 0.14355812966823578, "step": 5412 }, { "epoch": 0.88, "learning_rate": 8.48405374784624e-07, "logits/chosen": -0.662449061870575, "logits/rejected": -0.49293896555900574, "logps/chosen": -109.28837585449219, "logps/rejected": -86.40032958984375, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 3.4484314918518066, "rewards/margins": 0.685305118560791, "rewards/rejected": 2.7631263732910156, "step": 5413 }, { "epoch": 0.88, "learning_rate": 8.483110972238611e-07, "logits/chosen": -0.5266554355621338, "logits/rejected": -0.488896906375885, "logps/chosen": -114.875732421875, "logps/rejected": -83.99847412109375, "loss": 0.7255, "rewards/accuracies": 0.0, "rewards/chosen": 2.3183228969573975, "rewards/margins": -1.130911111831665, "rewards/rejected": 3.4492340087890625, "step": 5414 }, { "epoch": 0.88, "learning_rate": 8.48216795598097e-07, "logits/chosen": -1.0122535228729248, "logits/rejected": -0.6156727075576782, "logps/chosen": -147.22825622558594, "logps/rejected": -69.26522827148438, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 5.743037700653076, "rewards/margins": 2.378875255584717, "rewards/rejected": 3.3641624450683594, "step": 5415 }, { "epoch": 0.88, "learning_rate": 8.481224699138474e-07, "logits/chosen": -0.7726028561592102, "logits/rejected": -0.5831420421600342, "logps/chosen": -116.81085205078125, "logps/rejected": -135.3263702392578, "loss": 0.087, "rewards/accuracies": 1.0, "rewards/chosen": 8.600624084472656, "rewards/margins": 2.5944104194641113, "rewards/rejected": 6.006213665008545, "step": 5416 }, { "epoch": 0.88, "learning_rate": 8.48028120177629e-07, "logits/chosen": -1.0517256259918213, "logits/rejected": -1.1096385717391968, "logps/chosen": -260.3558349609375, "logps/rejected": -105.11970520019531, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": 5.302114963531494, "rewards/margins": 3.5929627418518066, "rewards/rejected": 1.7091522216796875, "step": 5417 }, { "epoch": 0.88, "learning_rate": 8.479337463959605e-07, "logits/chosen": -0.7173210978507996, "logits/rejected": -0.713545560836792, "logps/chosen": -51.90282440185547, "logps/rejected": -53.32561111450195, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7902233004570007, "rewards/margins": -0.9332504868507385, "rewards/rejected": 1.7234737873077393, "step": 5418 }, { "epoch": 0.88, "learning_rate": 8.478393485753623e-07, "logits/chosen": -0.9216546416282654, "logits/rejected": -0.7868316173553467, "logps/chosen": -101.8558349609375, "logps/rejected": -195.9734344482422, "loss": 0.6086, "rewards/accuracies": 0.0, "rewards/chosen": 4.235327243804932, "rewards/margins": -0.4636979103088379, "rewards/rejected": 4.6990251541137695, "step": 5419 }, { "epoch": 0.88, "learning_rate": 8.477449267223564e-07, "logits/chosen": -0.8946994543075562, "logits/rejected": -0.8943827152252197, "logps/chosen": -244.9063720703125, "logps/rejected": -121.48040008544922, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 4.689160346984863, "rewards/margins": 0.029571056365966797, "rewards/rejected": 4.6595892906188965, "step": 5420 }, { "epoch": 0.88, "learning_rate": 8.476504808434666e-07, "logits/chosen": -0.4374009072780609, "logits/rejected": -0.4549768567085266, "logps/chosen": -31.6562442779541, "logps/rejected": -2.0823841094970703, "loss": 1.4133, "rewards/accuracies": 0.0, "rewards/chosen": -0.152302548289299, "rewards/margins": -0.6389477849006653, "rewards/rejected": 0.4866452217102051, "step": 5421 }, { "epoch": 0.88, "learning_rate": 8.47556010945218e-07, "logits/chosen": -0.6812431812286377, "logits/rejected": -0.5072639584541321, "logps/chosen": -161.6005859375, "logps/rejected": -96.59893798828125, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 5.208396911621094, "rewards/margins": 2.013929605484009, "rewards/rejected": 3.194467306137085, "step": 5422 }, { "epoch": 0.88, "learning_rate": 8.474615170341377e-07, "logits/chosen": -0.765498161315918, "logits/rejected": -0.4138994812965393, "logps/chosen": -141.32110595703125, "logps/rejected": -27.935630798339844, "loss": 0.2418, "rewards/accuracies": 1.0, "rewards/chosen": 3.8131394386291504, "rewards/margins": 2.645500659942627, "rewards/rejected": 1.1676387786865234, "step": 5423 }, { "epoch": 0.88, "learning_rate": 8.473669991167541e-07, "logits/chosen": -0.5970883369445801, "logits/rejected": -0.5305588245391846, "logps/chosen": -114.76888275146484, "logps/rejected": -66.66485595703125, "loss": 0.8211, "rewards/accuracies": 0.0, "rewards/chosen": 0.4561973512172699, "rewards/margins": -1.4241478443145752, "rewards/rejected": 1.8803452253341675, "step": 5424 }, { "epoch": 0.88, "learning_rate": 8.472724571995979e-07, "logits/chosen": -0.6958587765693665, "logits/rejected": -0.6237654685974121, "logps/chosen": -67.410888671875, "logps/rejected": -80.71825408935547, "loss": 0.9617, "rewards/accuracies": 0.0, "rewards/chosen": 2.447552442550659, "rewards/margins": -1.0042107105255127, "rewards/rejected": 3.451763153076172, "step": 5425 }, { "epoch": 0.88, "learning_rate": 8.471778912892007e-07, "logits/chosen": -0.49768224358558655, "logits/rejected": -0.4777635633945465, "logps/chosen": -97.55691528320312, "logps/rejected": -42.274940490722656, "loss": 0.5063, "rewards/accuracies": 0.0, "rewards/chosen": 1.4542831182479858, "rewards/margins": -0.032495975494384766, "rewards/rejected": 1.4867790937423706, "step": 5426 }, { "epoch": 0.88, "learning_rate": 8.470833013920962e-07, "logits/chosen": -0.407798171043396, "logits/rejected": -0.39390695095062256, "logps/chosen": -43.893638610839844, "logps/rejected": -40.993534088134766, "loss": 1.373, "rewards/accuracies": 0.0, "rewards/chosen": 1.4261143207550049, "rewards/margins": -0.8972649574279785, "rewards/rejected": 2.3233792781829834, "step": 5427 }, { "epoch": 0.88, "learning_rate": 8.469886875148198e-07, "logits/chosen": -0.6353464722633362, "logits/rejected": -0.5436434149742126, "logps/chosen": -62.80133819580078, "logps/rejected": -57.69389343261719, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.4968407154083252, "rewards/margins": 0.47045445442199707, "rewards/rejected": 1.0263862609863281, "step": 5428 }, { "epoch": 0.88, "learning_rate": 8.468940496639083e-07, "logits/chosen": -0.46141108870506287, "logits/rejected": -0.4482594132423401, "logps/chosen": -23.240684509277344, "logps/rejected": -3.6793808937072754, "loss": 0.3787, "rewards/accuracies": 0.0, "rewards/chosen": 0.3032693862915039, "rewards/margins": -0.07446393370628357, "rewards/rejected": 0.3777333199977875, "step": 5429 }, { "epoch": 0.88, "learning_rate": 8.467993878459003e-07, "logits/chosen": -0.7402008175849915, "logits/rejected": -0.7043790221214294, "logps/chosen": -46.4329948425293, "logps/rejected": -23.877819061279297, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 1.9938640594482422, "rewards/margins": 1.4429874420166016, "rewards/rejected": 0.5508766174316406, "step": 5430 }, { "epoch": 0.88, "learning_rate": 8.46704702067336e-07, "logits/chosen": -0.42691105604171753, "logits/rejected": -0.33038485050201416, "logps/chosen": -114.62893676757812, "logps/rejected": -89.12882995605469, "loss": 0.0891, "rewards/accuracies": 1.0, "rewards/chosen": 4.944613933563232, "rewards/margins": 2.804436683654785, "rewards/rejected": 2.1401772499084473, "step": 5431 }, { "epoch": 0.88, "learning_rate": 8.466099923347574e-07, "logits/chosen": -0.5579382181167603, "logits/rejected": -0.19912254810333252, "logps/chosen": -92.22508239746094, "logps/rejected": -51.639808654785156, "loss": 3.3922, "rewards/accuracies": 0.0, "rewards/chosen": 0.628980278968811, "rewards/margins": -1.1990478038787842, "rewards/rejected": 1.8280280828475952, "step": 5432 }, { "epoch": 0.88, "learning_rate": 8.465152586547079e-07, "logits/chosen": -0.7988113164901733, "logits/rejected": -0.8393915891647339, "logps/chosen": -54.744773864746094, "logps/rejected": -62.990943908691406, "loss": 0.7978, "rewards/accuracies": 0.0, "rewards/chosen": 1.8471611738204956, "rewards/margins": -0.5419410467147827, "rewards/rejected": 2.3891022205352783, "step": 5433 }, { "epoch": 0.88, "learning_rate": 8.46420501033733e-07, "logits/chosen": -0.7186214923858643, "logits/rejected": -0.6996457576751709, "logps/chosen": -61.67099380493164, "logps/rejected": -68.74771881103516, "loss": 1.6923, "rewards/accuracies": 0.0, "rewards/chosen": 0.42887917160987854, "rewards/margins": -2.1912028789520264, "rewards/rejected": 2.620082139968872, "step": 5434 }, { "epoch": 0.88, "learning_rate": 8.463257194783792e-07, "logits/chosen": -0.9094008207321167, "logits/rejected": -0.9021501541137695, "logps/chosen": -63.547943115234375, "logps/rejected": -67.10813903808594, "loss": 0.7915, "rewards/accuracies": 0.0, "rewards/chosen": 0.752288818359375, "rewards/margins": -0.29560697078704834, "rewards/rejected": 1.0478957891464233, "step": 5435 }, { "epoch": 0.88, "learning_rate": 8.462309139951951e-07, "logits/chosen": -0.3825933337211609, "logits/rejected": -0.15907573699951172, "logps/chosen": -136.40225219726562, "logps/rejected": -25.737043380737305, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 5.378991603851318, "rewards/margins": 5.008376598358154, "rewards/rejected": 0.37061482667922974, "step": 5436 }, { "epoch": 0.88, "learning_rate": 8.46136084590731e-07, "logits/chosen": -0.842523992061615, "logits/rejected": -0.23416933417320251, "logps/chosen": -113.56178283691406, "logps/rejected": -118.63108825683594, "loss": 0.117, "rewards/accuracies": 1.0, "rewards/chosen": 4.496641635894775, "rewards/margins": 1.69736647605896, "rewards/rejected": 2.7992751598358154, "step": 5437 }, { "epoch": 0.88, "learning_rate": 8.460412312715386e-07, "logits/chosen": -0.23274533450603485, "logits/rejected": -0.23274533450603485, "logps/chosen": -39.957332611083984, "logps/rejected": -39.957332611083984, "loss": 0.5535, "rewards/accuracies": 0.0, "rewards/chosen": -0.10408782958984375, "rewards/margins": 0.0, "rewards/rejected": -0.10408782958984375, "step": 5438 }, { "epoch": 0.88, "learning_rate": 8.459463540441714e-07, "logits/chosen": -0.7696466445922852, "logits/rejected": -0.6474496126174927, "logps/chosen": -76.63771057128906, "logps/rejected": -68.28555297851562, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 3.780397891998291, "rewards/margins": 1.890889048576355, "rewards/rejected": 1.889508843421936, "step": 5439 }, { "epoch": 0.88, "learning_rate": 8.458514529151846e-07, "logits/chosen": -0.7054169178009033, "logits/rejected": -0.7773971557617188, "logps/chosen": -94.33363342285156, "logps/rejected": -117.0242691040039, "loss": 1.1924, "rewards/accuracies": 0.0, "rewards/chosen": 2.5572502613067627, "rewards/margins": -2.2656843662261963, "rewards/rejected": 4.822934627532959, "step": 5440 }, { "epoch": 0.88, "learning_rate": 8.457565278911347e-07, "logits/chosen": -0.30416154861450195, "logits/rejected": -0.3159601092338562, "logps/chosen": -19.158414840698242, "logps/rejected": -21.635601043701172, "loss": 1.7502, "rewards/accuracies": 1.0, "rewards/chosen": 0.2794721722602844, "rewards/margins": 0.29718267917633057, "rewards/rejected": -0.017710495740175247, "step": 5441 }, { "epoch": 0.88, "learning_rate": 8.456615789785804e-07, "logits/chosen": -0.47268861532211304, "logits/rejected": -0.5269772410392761, "logps/chosen": -40.219879150390625, "logps/rejected": -91.43463897705078, "loss": 2.9939, "rewards/accuracies": 0.0, "rewards/chosen": 2.4584884643554688, "rewards/margins": -1.8302803039550781, "rewards/rejected": 4.288768768310547, "step": 5442 }, { "epoch": 0.88, "learning_rate": 8.455666061840816e-07, "logits/chosen": -0.36809366941452026, "logits/rejected": -0.32149583101272583, "logps/chosen": -139.58970642089844, "logps/rejected": -59.49494934082031, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 1.5333389043807983, "rewards/margins": 0.39271080493927, "rewards/rejected": 1.1406280994415283, "step": 5443 }, { "epoch": 0.88, "learning_rate": 8.454716095142001e-07, "logits/chosen": -0.9363350868225098, "logits/rejected": -0.9098665714263916, "logps/chosen": -75.24021911621094, "logps/rejected": -36.272865295410156, "loss": 0.4519, "rewards/accuracies": 1.0, "rewards/chosen": 3.023193359375, "rewards/margins": 2.803170680999756, "rewards/rejected": 0.22002258896827698, "step": 5444 }, { "epoch": 0.88, "learning_rate": 8.453765889754993e-07, "logits/chosen": -0.644102156162262, "logits/rejected": -0.6238494515419006, "logps/chosen": -27.062347412109375, "logps/rejected": -35.232078552246094, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 1.200801134109497, "rewards/margins": 0.09384846687316895, "rewards/rejected": 1.1069526672363281, "step": 5445 }, { "epoch": 0.88, "learning_rate": 8.452815445745443e-07, "logits/chosen": -0.8668808341026306, "logits/rejected": -0.7283366322517395, "logps/chosen": -147.774658203125, "logps/rejected": -67.83645629882812, "loss": 0.1748, "rewards/accuracies": 1.0, "rewards/chosen": 3.001507520675659, "rewards/margins": 0.9235014915466309, "rewards/rejected": 2.0780060291290283, "step": 5446 }, { "epoch": 0.88, "learning_rate": 8.451864763179016e-07, "logits/chosen": -0.23350298404693604, "logits/rejected": -0.23374608159065247, "logps/chosen": -1.5079176425933838, "logps/rejected": -3.1369435787200928, "loss": 2.4002, "rewards/accuracies": 0.0, "rewards/chosen": 0.4626050889492035, "rewards/margins": -0.12861809134483337, "rewards/rejected": 0.5912231802940369, "step": 5447 }, { "epoch": 0.88, "learning_rate": 8.450913842121395e-07, "logits/chosen": -0.43898090720176697, "logits/rejected": -0.43898090720176697, "logps/chosen": -43.24342727661133, "logps/rejected": -43.24342727661133, "loss": 0.6189, "rewards/accuracies": 0.0, "rewards/chosen": 1.4898815155029297, "rewards/margins": 0.0, "rewards/rejected": 1.4898815155029297, "step": 5448 }, { "epoch": 0.88, "learning_rate": 8.449962682638279e-07, "logits/chosen": -0.7764222025871277, "logits/rejected": -0.5666243433952332, "logps/chosen": -140.14138793945312, "logps/rejected": -70.84759521484375, "loss": 0.1181, "rewards/accuracies": 1.0, "rewards/chosen": 4.072511196136475, "rewards/margins": 1.3314878940582275, "rewards/rejected": 2.741023302078247, "step": 5449 }, { "epoch": 0.88, "learning_rate": 8.449011284795387e-07, "logits/chosen": -0.6330141425132751, "logits/rejected": -0.5915341973304749, "logps/chosen": -66.17074584960938, "logps/rejected": -16.856201171875, "loss": 0.3266, "rewards/accuracies": 1.0, "rewards/chosen": 2.0683236122131348, "rewards/margins": 1.8844412565231323, "rewards/rejected": 0.18388234078884125, "step": 5450 }, { "epoch": 0.88, "learning_rate": 8.44805964865845e-07, "logits/chosen": -0.8178955912590027, "logits/rejected": -0.5054652690887451, "logps/chosen": -169.82192993164062, "logps/rejected": -25.35707664489746, "loss": 0.2343, "rewards/accuracies": 1.0, "rewards/chosen": 5.891114711761475, "rewards/margins": 5.44081974029541, "rewards/rejected": 0.4502948820590973, "step": 5451 }, { "epoch": 0.88, "learning_rate": 8.447107774293219e-07, "logits/chosen": -0.5120128393173218, "logits/rejected": -0.5120128393173218, "logps/chosen": -0.9519801139831543, "logps/rejected": -0.9519801139831543, "loss": 0.6371, "rewards/accuracies": 0.0, "rewards/chosen": 0.23502908647060394, "rewards/margins": 0.0, "rewards/rejected": 0.23502908647060394, "step": 5452 }, { "epoch": 0.89, "learning_rate": 8.446155661765456e-07, "logits/chosen": -0.2936706840991974, "logits/rejected": -0.2936706840991974, "logps/chosen": -36.059810638427734, "logps/rejected": -36.059810638427734, "loss": 1.2589, "rewards/accuracies": 0.0, "rewards/chosen": 0.1432979553937912, "rewards/margins": 0.0, "rewards/rejected": 0.1432979553937912, "step": 5453 }, { "epoch": 0.89, "learning_rate": 8.445203311140943e-07, "logits/chosen": -0.3255261778831482, "logits/rejected": -0.14362047612667084, "logps/chosen": -81.17423248291016, "logps/rejected": -24.468050003051758, "loss": 0.2475, "rewards/accuracies": 1.0, "rewards/chosen": 1.298858642578125, "rewards/margins": 0.8257777690887451, "rewards/rejected": 0.4730808436870575, "step": 5454 }, { "epoch": 0.89, "learning_rate": 8.444250722485482e-07, "logits/chosen": -0.8300097584724426, "logits/rejected": -0.7848020792007446, "logps/chosen": -128.29278564453125, "logps/rejected": -152.90457153320312, "loss": 1.28, "rewards/accuracies": 0.0, "rewards/chosen": 3.457240343093872, "rewards/margins": -2.4743425846099854, "rewards/rejected": 5.931582927703857, "step": 5455 }, { "epoch": 0.89, "learning_rate": 8.443297895864884e-07, "logits/chosen": -0.6662973165512085, "logits/rejected": -0.6562591195106506, "logps/chosen": -31.0367374420166, "logps/rejected": -32.03096389770508, "loss": 1.6373, "rewards/accuracies": 1.0, "rewards/chosen": 0.7553121447563171, "rewards/margins": 0.30645081400871277, "rewards/rejected": 0.44886133074760437, "step": 5456 }, { "epoch": 0.89, "learning_rate": 8.442344831344984e-07, "logits/chosen": -0.19697555899620056, "logits/rejected": -0.19697555899620056, "logps/chosen": -108.74897003173828, "logps/rejected": -108.74897003173828, "loss": 0.3944, "rewards/accuracies": 0.0, "rewards/chosen": 0.7031013369560242, "rewards/margins": 0.0, "rewards/rejected": 0.7031013369560242, "step": 5457 }, { "epoch": 0.89, "learning_rate": 8.441391528991628e-07, "logits/chosen": -0.5025074481964111, "logits/rejected": -0.4292218089103699, "logps/chosen": -34.123260498046875, "logps/rejected": -31.683032989501953, "loss": 0.6469, "rewards/accuracies": 1.0, "rewards/chosen": 1.6368759870529175, "rewards/margins": 0.7990353107452393, "rewards/rejected": 0.8378406763076782, "step": 5458 }, { "epoch": 0.89, "learning_rate": 8.440437988870679e-07, "logits/chosen": -0.8837557435035706, "logits/rejected": -0.658217191696167, "logps/chosen": -171.94375610351562, "logps/rejected": -65.26002502441406, "loss": 0.0476, "rewards/accuracies": 1.0, "rewards/chosen": 4.6799774169921875, "rewards/margins": 2.6424689292907715, "rewards/rejected": 2.037508487701416, "step": 5459 }, { "epoch": 0.89, "learning_rate": 8.439484211048018e-07, "logits/chosen": -0.391100138425827, "logits/rejected": -0.25741463899612427, "logps/chosen": -121.3440170288086, "logps/rejected": -27.104196548461914, "loss": 0.2044, "rewards/accuracies": 1.0, "rewards/chosen": 6.036744117736816, "rewards/margins": 5.374300956726074, "rewards/rejected": 0.6624433398246765, "step": 5460 }, { "epoch": 0.89, "learning_rate": 8.438530195589545e-07, "logits/chosen": -0.15369844436645508, "logits/rejected": -0.15369844436645508, "logps/chosen": -55.19252014160156, "logps/rejected": -55.19252014160156, "loss": 0.5188, "rewards/accuracies": 0.0, "rewards/chosen": 0.9535003900527954, "rewards/margins": 0.0, "rewards/rejected": 0.9535003900527954, "step": 5461 }, { "epoch": 0.89, "learning_rate": 8.43757594256117e-07, "logits/chosen": -0.5187996029853821, "logits/rejected": -0.4789946973323822, "logps/chosen": -62.774757385253906, "logps/rejected": -96.75927734375, "loss": 1.627, "rewards/accuracies": 0.0, "rewards/chosen": 1.7397384643554688, "rewards/margins": -2.7327589988708496, "rewards/rejected": 4.472497463226318, "step": 5462 }, { "epoch": 0.89, "learning_rate": 8.436621452028824e-07, "logits/chosen": -0.6273345947265625, "logits/rejected": -0.5914263725280762, "logps/chosen": -47.117855072021484, "logps/rejected": -69.05735778808594, "loss": 1.9544, "rewards/accuracies": 1.0, "rewards/chosen": 2.4405384063720703, "rewards/margins": 0.6571750640869141, "rewards/rejected": 1.7833633422851562, "step": 5463 }, { "epoch": 0.89, "learning_rate": 8.435666724058453e-07, "logits/chosen": -1.1560207605361938, "logits/rejected": -1.1881086826324463, "logps/chosen": -257.4733581542969, "logps/rejected": -36.439579010009766, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 1.2363739013671875, "rewards/margins": 0.9905773401260376, "rewards/rejected": 0.2457965910434723, "step": 5464 }, { "epoch": 0.89, "learning_rate": 8.43471175871602e-07, "logits/chosen": -0.4837442636489868, "logits/rejected": -0.5303804874420166, "logps/chosen": -51.28504180908203, "logps/rejected": -103.47805786132812, "loss": 0.6334, "rewards/accuracies": 1.0, "rewards/chosen": 1.6905784606933594, "rewards/margins": 0.4079231023788452, "rewards/rejected": 1.2826553583145142, "step": 5465 }, { "epoch": 0.89, "learning_rate": 8.433756556067505e-07, "logits/chosen": -0.6510530114173889, "logits/rejected": -0.6540699005126953, "logps/chosen": -96.84677124023438, "logps/rejected": -60.13866424560547, "loss": 0.8304, "rewards/accuracies": 0.0, "rewards/chosen": 1.992645263671875, "rewards/margins": -0.27274250984191895, "rewards/rejected": 2.265387773513794, "step": 5466 }, { "epoch": 0.89, "learning_rate": 8.432801116178902e-07, "logits/chosen": -0.33315935730934143, "logits/rejected": -0.3299824297428131, "logps/chosen": -2.4442267417907715, "logps/rejected": -14.280054092407227, "loss": 0.3929, "rewards/accuracies": 0.0, "rewards/chosen": 0.2709619998931885, "rewards/margins": -0.049755632877349854, "rewards/rejected": 0.32071763277053833, "step": 5467 }, { "epoch": 0.89, "learning_rate": 8.431845439116223e-07, "logits/chosen": -0.24622024595737457, "logits/rejected": -0.24622024595737457, "logps/chosen": -24.502849578857422, "logps/rejected": -24.502849578857422, "loss": 0.855, "rewards/accuracies": 0.0, "rewards/chosen": 1.1905678510665894, "rewards/margins": 0.0, "rewards/rejected": 1.1905678510665894, "step": 5468 }, { "epoch": 0.89, "learning_rate": 8.430889524945497e-07, "logits/chosen": -0.9315841794013977, "logits/rejected": -0.8845910429954529, "logps/chosen": -33.559471130371094, "logps/rejected": -83.84907531738281, "loss": 1.128, "rewards/accuracies": 1.0, "rewards/chosen": 0.8498512506484985, "rewards/margins": 0.6932228207588196, "rewards/rejected": 0.15662841498851776, "step": 5469 }, { "epoch": 0.89, "learning_rate": 8.429933373732767e-07, "logits/chosen": -0.7074881196022034, "logits/rejected": -1.110450029373169, "logps/chosen": -58.28658676147461, "logps/rejected": -55.70895767211914, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": 0.5748004913330078, "rewards/margins": 0.3116222321987152, "rewards/rejected": 0.2631782591342926, "step": 5470 }, { "epoch": 0.89, "learning_rate": 8.428976985544097e-07, "logits/chosen": -0.9495240449905396, "logits/rejected": -0.8383007049560547, "logps/chosen": -203.07469177246094, "logps/rejected": -93.88056182861328, "loss": 0.0986, "rewards/accuracies": 1.0, "rewards/chosen": 4.125199794769287, "rewards/margins": 1.8335134983062744, "rewards/rejected": 2.2916862964630127, "step": 5471 }, { "epoch": 0.89, "learning_rate": 8.428020360445562e-07, "logits/chosen": -0.7301167249679565, "logits/rejected": -0.7044633626937866, "logps/chosen": -72.10888671875, "logps/rejected": -14.121153831481934, "loss": 0.3665, "rewards/accuracies": 1.0, "rewards/chosen": 0.9478897452354431, "rewards/margins": 0.44089269638061523, "rewards/rejected": 0.5069970488548279, "step": 5472 }, { "epoch": 0.89, "learning_rate": 8.427063498503255e-07, "logits/chosen": -0.6680186986923218, "logits/rejected": -0.5662753582000732, "logps/chosen": -55.29095458984375, "logps/rejected": -71.29801940917969, "loss": 0.9133, "rewards/accuracies": 0.0, "rewards/chosen": 1.7436096668243408, "rewards/margins": -0.7566390037536621, "rewards/rejected": 2.500248670578003, "step": 5473 }, { "epoch": 0.89, "learning_rate": 8.426106399783289e-07, "logits/chosen": -0.7405675649642944, "logits/rejected": -0.629113495349884, "logps/chosen": -70.18260192871094, "logps/rejected": -41.02088928222656, "loss": 1.0972, "rewards/accuracies": 0.0, "rewards/chosen": 1.3409416675567627, "rewards/margins": -0.9390239715576172, "rewards/rejected": 2.27996563911438, "step": 5474 }, { "epoch": 0.89, "learning_rate": 8.425149064351789e-07, "logits/chosen": -0.5523332953453064, "logits/rejected": -0.5428968667984009, "logps/chosen": -57.3753776550293, "logps/rejected": -4.353375434875488, "loss": 0.6571, "rewards/accuracies": 0.0, "rewards/chosen": -0.02958526648581028, "rewards/margins": -0.35768404603004456, "rewards/rejected": 0.32809877395629883, "step": 5475 }, { "epoch": 0.89, "learning_rate": 8.424191492274897e-07, "logits/chosen": -0.7292706370353699, "logits/rejected": -0.7471514940261841, "logps/chosen": -79.16050720214844, "logps/rejected": -102.61051940917969, "loss": 0.7699, "rewards/accuracies": 0.0, "rewards/chosen": 0.8356208801269531, "rewards/margins": -0.16296160221099854, "rewards/rejected": 0.9985824823379517, "step": 5476 }, { "epoch": 0.89, "learning_rate": 8.423233683618773e-07, "logits/chosen": -0.4116500914096832, "logits/rejected": -0.4043009579181671, "logps/chosen": -68.74258422851562, "logps/rejected": -86.20573425292969, "loss": 0.7582, "rewards/accuracies": 0.0, "rewards/chosen": 2.154104709625244, "rewards/margins": -1.0795013904571533, "rewards/rejected": 3.2336061000823975, "step": 5477 }, { "epoch": 0.89, "learning_rate": 8.422275638449592e-07, "logits/chosen": -0.5964638590812683, "logits/rejected": -0.5723132491111755, "logps/chosen": -20.314476013183594, "logps/rejected": -9.36655330657959, "loss": 0.6395, "rewards/accuracies": 1.0, "rewards/chosen": 0.667653501033783, "rewards/margins": 0.22765830159187317, "rewards/rejected": 0.4399951994419098, "step": 5478 }, { "epoch": 0.89, "learning_rate": 8.421317356833546e-07, "logits/chosen": 0.02463855966925621, "logits/rejected": 0.02463855966925621, "logps/chosen": -18.261188507080078, "logps/rejected": -18.261188507080078, "loss": 0.4578, "rewards/accuracies": 0.0, "rewards/chosen": 0.25365695357322693, "rewards/margins": 0.0, "rewards/rejected": 0.25365695357322693, "step": 5479 }, { "epoch": 0.89, "learning_rate": 8.420358838836845e-07, "logits/chosen": -0.6204478740692139, "logits/rejected": -0.6422038674354553, "logps/chosen": -168.37478637695312, "logps/rejected": -97.54823303222656, "loss": 1.8583, "rewards/accuracies": 0.0, "rewards/chosen": 0.34825897216796875, "rewards/margins": -2.2820732593536377, "rewards/rejected": 2.6303322315216064, "step": 5480 }, { "epoch": 0.89, "learning_rate": 8.419400084525711e-07, "logits/chosen": -0.8081232905387878, "logits/rejected": -0.383866548538208, "logps/chosen": -117.78238677978516, "logps/rejected": -65.53304290771484, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 5.257265567779541, "rewards/margins": 3.114976644515991, "rewards/rejected": 2.14228892326355, "step": 5481 }, { "epoch": 0.89, "learning_rate": 8.418441093966385e-07, "logits/chosen": -0.5035818219184875, "logits/rejected": -0.5019775629043579, "logps/chosen": -96.03994750976562, "logps/rejected": -63.52500534057617, "loss": 0.5068, "rewards/accuracies": 0.0, "rewards/chosen": 0.939013659954071, "rewards/margins": -0.30530208349227905, "rewards/rejected": 1.24431574344635, "step": 5482 }, { "epoch": 0.89, "learning_rate": 8.417481867225127e-07, "logits/chosen": -0.6664960980415344, "logits/rejected": -0.6342642307281494, "logps/chosen": -54.65904235839844, "logps/rejected": -59.14322280883789, "loss": 0.8113, "rewards/accuracies": 0.0, "rewards/chosen": 1.7105454206466675, "rewards/margins": -0.9004276990890503, "rewards/rejected": 2.6109731197357178, "step": 5483 }, { "epoch": 0.89, "learning_rate": 8.416522404368207e-07, "logits/chosen": -0.4951777458190918, "logits/rejected": -0.44264769554138184, "logps/chosen": -54.171302795410156, "logps/rejected": -40.47673034667969, "loss": 0.332, "rewards/accuracies": 1.0, "rewards/chosen": 1.6326531171798706, "rewards/margins": 0.7743610739707947, "rewards/rejected": 0.8582920432090759, "step": 5484 }, { "epoch": 0.89, "learning_rate": 8.415562705461916e-07, "logits/chosen": -0.47661301493644714, "logits/rejected": -0.386545330286026, "logps/chosen": -59.1035270690918, "logps/rejected": -9.310806274414062, "loss": 0.8092, "rewards/accuracies": 1.0, "rewards/chosen": 1.823740005493164, "rewards/margins": 1.0686566829681396, "rewards/rejected": 0.7550832629203796, "step": 5485 }, { "epoch": 0.89, "learning_rate": 8.414602770572561e-07, "logits/chosen": -0.6174650192260742, "logits/rejected": -0.6174650192260742, "logps/chosen": -97.7965316772461, "logps/rejected": -97.7965316772461, "loss": 0.9043, "rewards/accuracies": 0.0, "rewards/chosen": 3.1071510314941406, "rewards/margins": 0.0, "rewards/rejected": 3.1071510314941406, "step": 5486 }, { "epoch": 0.89, "learning_rate": 8.413642599766464e-07, "logits/chosen": -0.6967503428459167, "logits/rejected": -0.7379553318023682, "logps/chosen": -92.76453399658203, "logps/rejected": -116.74864959716797, "loss": 1.1193, "rewards/accuracies": 0.0, "rewards/chosen": 0.6256187558174133, "rewards/margins": -1.4506242275238037, "rewards/rejected": 2.0762429237365723, "step": 5487 }, { "epoch": 0.89, "learning_rate": 8.412682193109963e-07, "logits/chosen": -0.8571979403495789, "logits/rejected": -0.7133529186248779, "logps/chosen": -168.05726623535156, "logps/rejected": -142.53443908691406, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 5.2572922706604, "rewards/margins": 0.19555950164794922, "rewards/rejected": 5.061732769012451, "step": 5488 }, { "epoch": 0.89, "learning_rate": 8.411721550669415e-07, "logits/chosen": -0.7591361403465271, "logits/rejected": -0.7604936361312866, "logps/chosen": -212.72097778320312, "logps/rejected": -167.56369018554688, "loss": 0.5451, "rewards/accuracies": 1.0, "rewards/chosen": 6.871099948883057, "rewards/margins": 1.2318449020385742, "rewards/rejected": 5.639255046844482, "step": 5489 }, { "epoch": 0.89, "learning_rate": 8.410760672511188e-07, "logits/chosen": -0.12929707765579224, "logits/rejected": -0.18144118785858154, "logps/chosen": -3.98795223236084, "logps/rejected": -49.43389892578125, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 0.584648609161377, "rewards/margins": 0.13424080610275269, "rewards/rejected": 0.45040780305862427, "step": 5490 }, { "epoch": 0.89, "learning_rate": 8.409799558701673e-07, "logits/chosen": -0.3250524401664734, "logits/rejected": -0.3250524401664734, "logps/chosen": -28.498149871826172, "logps/rejected": -28.498149871826172, "loss": 0.7539, "rewards/accuracies": 0.0, "rewards/chosen": 1.9006813764572144, "rewards/margins": 0.0, "rewards/rejected": 1.9006813764572144, "step": 5491 }, { "epoch": 0.89, "learning_rate": 8.408838209307271e-07, "logits/chosen": -0.9303568601608276, "logits/rejected": -0.9430000185966492, "logps/chosen": -75.3062744140625, "logps/rejected": -168.35989379882812, "loss": 0.7625, "rewards/accuracies": 1.0, "rewards/chosen": 1.803350806236267, "rewards/margins": 1.481791615486145, "rewards/rejected": 0.3215591609477997, "step": 5492 }, { "epoch": 0.89, "learning_rate": 8.407876624394406e-07, "logits/chosen": -0.7441967725753784, "logits/rejected": -0.7083390951156616, "logps/chosen": -54.23528289794922, "logps/rejected": -65.15499114990234, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.741075873374939, "rewards/margins": 0.3140937089920044, "rewards/rejected": 1.4269821643829346, "step": 5493 }, { "epoch": 0.89, "learning_rate": 8.406914804029509e-07, "logits/chosen": -0.7291918396949768, "logits/rejected": -0.6843374371528625, "logps/chosen": -94.02980041503906, "logps/rejected": -62.01513671875, "loss": 0.9258, "rewards/accuracies": 0.0, "rewards/chosen": 1.2153915166854858, "rewards/margins": -0.42293858528137207, "rewards/rejected": 1.638330101966858, "step": 5494 }, { "epoch": 0.89, "learning_rate": 8.405952748279037e-07, "logits/chosen": -0.5007380843162537, "logits/rejected": -0.3820781409740448, "logps/chosen": -60.70951843261719, "logps/rejected": -51.57421875, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": 2.824253797531128, "rewards/margins": 0.6226470470428467, "rewards/rejected": 2.2016067504882812, "step": 5495 }, { "epoch": 0.89, "learning_rate": 8.404990457209457e-07, "logits/chosen": -0.5302116274833679, "logits/rejected": -0.5243167281150818, "logps/chosen": -18.839853286743164, "logps/rejected": -3.788094997406006, "loss": 0.3592, "rewards/accuracies": 1.0, "rewards/chosen": 0.6140222549438477, "rewards/margins": 0.07673591375350952, "rewards/rejected": 0.5372863411903381, "step": 5496 }, { "epoch": 0.89, "learning_rate": 8.404027930887254e-07, "logits/chosen": -0.6512888669967651, "logits/rejected": -0.4161967933177948, "logps/chosen": -161.4811248779297, "logps/rejected": -33.84871292114258, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": 1.995387315750122, "rewards/margins": 1.9188454151153564, "rewards/rejected": 0.07654190063476562, "step": 5497 }, { "epoch": 0.89, "learning_rate": 8.403065169378932e-07, "logits/chosen": -0.283189058303833, "logits/rejected": -0.356375515460968, "logps/chosen": -103.88330078125, "logps/rejected": -54.50879669189453, "loss": 1.1398, "rewards/accuracies": 0.0, "rewards/chosen": 0.761364758014679, "rewards/margins": -2.010457754135132, "rewards/rejected": 2.771822452545166, "step": 5498 }, { "epoch": 0.89, "learning_rate": 8.402102172751004e-07, "logits/chosen": -0.5529137849807739, "logits/rejected": -0.4509871304035187, "logps/chosen": -139.02923583984375, "logps/rejected": -83.35002136230469, "loss": 1.0998, "rewards/accuracies": 0.0, "rewards/chosen": 0.03068847768008709, "rewards/margins": -1.6117430925369263, "rewards/rejected": 1.642431616783142, "step": 5499 }, { "epoch": 0.89, "learning_rate": 8.401138941070009e-07, "logits/chosen": -0.7002496123313904, "logits/rejected": -0.627214789390564, "logps/chosen": -82.69851684570312, "logps/rejected": -78.16313171386719, "loss": 0.9847, "rewards/accuracies": 1.0, "rewards/chosen": 2.0383262634277344, "rewards/margins": 0.11783981323242188, "rewards/rejected": 1.9204864501953125, "step": 5500 }, { "epoch": 0.89, "learning_rate": 8.400175474402495e-07, "logits/chosen": -0.8035085201263428, "logits/rejected": -0.7625154852867126, "logps/chosen": -211.21713256835938, "logps/rejected": -80.58226013183594, "loss": 0.4235, "rewards/accuracies": 1.0, "rewards/chosen": 4.35176420211792, "rewards/margins": 1.229506254196167, "rewards/rejected": 3.122257947921753, "step": 5501 }, { "epoch": 0.89, "learning_rate": 8.399211772815029e-07, "logits/chosen": -0.5349981784820557, "logits/rejected": -0.49476519227027893, "logps/chosen": -41.64072036743164, "logps/rejected": -57.58622741699219, "loss": 0.6757, "rewards/accuracies": 0.0, "rewards/chosen": 1.878501534461975, "rewards/margins": -0.5090092420578003, "rewards/rejected": 2.3875107765197754, "step": 5502 }, { "epoch": 0.89, "learning_rate": 8.398247836374193e-07, "logits/chosen": -0.6260823607444763, "logits/rejected": -0.6978715062141418, "logps/chosen": -77.52113342285156, "logps/rejected": -69.55003356933594, "loss": 1.2594, "rewards/accuracies": 0.0, "rewards/chosen": 1.2413361072540283, "rewards/margins": -0.9583907127380371, "rewards/rejected": 2.1997268199920654, "step": 5503 }, { "epoch": 0.89, "learning_rate": 8.397283665146583e-07, "logits/chosen": -0.6122185587882996, "logits/rejected": -0.5534944534301758, "logps/chosen": -70.04427337646484, "logps/rejected": -76.44970703125, "loss": 1.7529, "rewards/accuracies": 1.0, "rewards/chosen": 1.3854950666427612, "rewards/margins": 0.4452866315841675, "rewards/rejected": 0.9402084350585938, "step": 5504 }, { "epoch": 0.89, "learning_rate": 8.396319259198821e-07, "logits/chosen": -0.14599382877349854, "logits/rejected": -0.1193801760673523, "logps/chosen": -60.057220458984375, "logps/rejected": -53.31817626953125, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.8036583065986633, "rewards/margins": -0.3125045895576477, "rewards/rejected": 1.116162896156311, "step": 5505 }, { "epoch": 0.89, "learning_rate": 8.395354618597532e-07, "logits/chosen": -0.6722210049629211, "logits/rejected": -0.6811696290969849, "logps/chosen": -70.49478912353516, "logps/rejected": -97.21072387695312, "loss": 0.7142, "rewards/accuracies": 0.0, "rewards/chosen": 0.8202491998672485, "rewards/margins": -0.4274475574493408, "rewards/rejected": 1.2476967573165894, "step": 5506 }, { "epoch": 0.89, "learning_rate": 8.394389743409368e-07, "logits/chosen": -0.9707307815551758, "logits/rejected": -0.959225058555603, "logps/chosen": -134.84939575195312, "logps/rejected": -67.12503051757812, "loss": 0.3922, "rewards/accuracies": 1.0, "rewards/chosen": 4.298259258270264, "rewards/margins": 2.3484439849853516, "rewards/rejected": 1.9498153924942017, "step": 5507 }, { "epoch": 0.89, "learning_rate": 8.393424633700989e-07, "logits/chosen": -0.6318225860595703, "logits/rejected": -0.6161670684814453, "logps/chosen": -120.41290283203125, "logps/rejected": -179.06846618652344, "loss": 0.5033, "rewards/accuracies": 0.0, "rewards/chosen": 4.879415988922119, "rewards/margins": -0.5159010887145996, "rewards/rejected": 5.395317077636719, "step": 5508 }, { "epoch": 0.89, "learning_rate": 8.392459289539078e-07, "logits/chosen": -0.45125147700309753, "logits/rejected": -0.5736944079399109, "logps/chosen": -75.89315795898438, "logps/rejected": -122.20246887207031, "loss": 3.2815, "rewards/accuracies": 0.0, "rewards/chosen": 1.072224497795105, "rewards/margins": -3.5394225120544434, "rewards/rejected": 4.611647129058838, "step": 5509 }, { "epoch": 0.89, "learning_rate": 8.39149371099033e-07, "logits/chosen": -1.2532562017440796, "logits/rejected": -1.2263684272766113, "logps/chosen": -186.31019592285156, "logps/rejected": -68.84223937988281, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 5.8315019607543945, "rewards/margins": 3.130122423171997, "rewards/rejected": 2.7013795375823975, "step": 5510 }, { "epoch": 0.89, "learning_rate": 8.390527898121456e-07, "logits/chosen": -0.909910261631012, "logits/rejected": -0.8059020042419434, "logps/chosen": -139.95108032226562, "logps/rejected": -104.81043243408203, "loss": 0.1843, "rewards/accuracies": 1.0, "rewards/chosen": 5.112677097320557, "rewards/margins": 0.9215326309204102, "rewards/rejected": 4.1911444664001465, "step": 5511 }, { "epoch": 0.89, "learning_rate": 8.389561850999186e-07, "logits/chosen": -0.46728426218032837, "logits/rejected": -0.47575944662094116, "logps/chosen": -72.98483276367188, "logps/rejected": -61.89148712158203, "loss": 0.9827, "rewards/accuracies": 0.0, "rewards/chosen": 0.7735931277275085, "rewards/margins": -0.5701972842216492, "rewards/rejected": 1.3437904119491577, "step": 5512 }, { "epoch": 0.89, "learning_rate": 8.388595569690264e-07, "logits/chosen": -0.8658407330513, "logits/rejected": -0.8060113191604614, "logps/chosen": -86.47767639160156, "logps/rejected": -66.42385864257812, "loss": 0.3187, "rewards/accuracies": 1.0, "rewards/chosen": 1.314936876296997, "rewards/margins": 0.26006925106048584, "rewards/rejected": 1.0548676252365112, "step": 5513 }, { "epoch": 0.89, "learning_rate": 8.387629054261453e-07, "logits/chosen": -0.4817201793193817, "logits/rejected": -0.4609523117542267, "logps/chosen": -98.4930419921875, "logps/rejected": -47.40510940551758, "loss": 1.1026, "rewards/accuracies": 1.0, "rewards/chosen": 1.3459640741348267, "rewards/margins": 0.25728535652160645, "rewards/rejected": 1.0886787176132202, "step": 5514 }, { "epoch": 0.9, "learning_rate": 8.386662304779528e-07, "logits/chosen": -0.8782959580421448, "logits/rejected": -0.652611494064331, "logps/chosen": -133.291748046875, "logps/rejected": -82.44432067871094, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 3.732426404953003, "rewards/margins": 1.2258071899414062, "rewards/rejected": 2.5066192150115967, "step": 5515 }, { "epoch": 0.9, "learning_rate": 8.385695321311281e-07, "logits/chosen": -1.027688980102539, "logits/rejected": -0.9749070405960083, "logps/chosen": -125.30227661132812, "logps/rejected": -137.92921447753906, "loss": 1.2224, "rewards/accuracies": 1.0, "rewards/chosen": 4.819507122039795, "rewards/margins": 0.3938617706298828, "rewards/rejected": 4.425645351409912, "step": 5516 }, { "epoch": 0.9, "learning_rate": 8.384728103923524e-07, "logits/chosen": -0.5968337655067444, "logits/rejected": -0.5801534652709961, "logps/chosen": -94.94976806640625, "logps/rejected": -111.74484252929688, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": 1.494179606437683, "rewards/margins": 0.6782211661338806, "rewards/rejected": 0.8159584403038025, "step": 5517 }, { "epoch": 0.9, "learning_rate": 8.38376065268308e-07, "logits/chosen": -0.4660206437110901, "logits/rejected": -0.5557135939598083, "logps/chosen": -122.69393157958984, "logps/rejected": -115.65643310546875, "loss": 1.439, "rewards/accuracies": 1.0, "rewards/chosen": 3.789203643798828, "rewards/margins": 0.35358643531799316, "rewards/rejected": 3.435617208480835, "step": 5518 }, { "epoch": 0.9, "learning_rate": 8.382792967656795e-07, "logits/chosen": -0.5890719890594482, "logits/rejected": -0.5871455073356628, "logps/chosen": -60.16811752319336, "logps/rejected": -47.417694091796875, "loss": 0.8082, "rewards/accuracies": 0.0, "rewards/chosen": 0.8164669275283813, "rewards/margins": -0.6884181499481201, "rewards/rejected": 1.5048850774765015, "step": 5519 }, { "epoch": 0.9, "learning_rate": 8.381825048911524e-07, "logits/chosen": -0.2413252890110016, "logits/rejected": -0.2413252890110016, "logps/chosen": -39.034854888916016, "logps/rejected": -39.034854888916016, "loss": 0.9357, "rewards/accuracies": 0.0, "rewards/chosen": 1.57249915599823, "rewards/margins": 0.0, "rewards/rejected": 1.57249915599823, "step": 5520 }, { "epoch": 0.9, "learning_rate": 8.38085689651414e-07, "logits/chosen": -0.555372416973114, "logits/rejected": -0.5593991875648499, "logps/chosen": -47.61427307128906, "logps/rejected": -38.127159118652344, "loss": 0.4988, "rewards/accuracies": 1.0, "rewards/chosen": 1.0975300073623657, "rewards/margins": 0.4539543390274048, "rewards/rejected": 0.6435756683349609, "step": 5521 }, { "epoch": 0.9, "learning_rate": 8.379888510531534e-07, "logits/chosen": -0.7959238886833191, "logits/rejected": -1.1824673414230347, "logps/chosen": -95.0713119506836, "logps/rejected": -34.91542434692383, "loss": 1.1359, "rewards/accuracies": 1.0, "rewards/chosen": 0.5491882562637329, "rewards/margins": 0.32064783573150635, "rewards/rejected": 0.22854042053222656, "step": 5522 }, { "epoch": 0.9, "learning_rate": 8.378919891030614e-07, "logits/chosen": -0.4484493136405945, "logits/rejected": -0.4484493136405945, "logps/chosen": -0.9715157747268677, "logps/rejected": -0.9715157747268677, "loss": 0.7952, "rewards/accuracies": 0.0, "rewards/chosen": 0.365908145904541, "rewards/margins": 0.0, "rewards/rejected": 0.365908145904541, "step": 5523 }, { "epoch": 0.9, "learning_rate": 8.377951038078301e-07, "logits/chosen": -0.9563060998916626, "logits/rejected": -0.9727979302406311, "logps/chosen": -42.253326416015625, "logps/rejected": -44.9723014831543, "loss": 0.6705, "rewards/accuracies": 0.0, "rewards/chosen": 2.295126438140869, "rewards/margins": -0.39435768127441406, "rewards/rejected": 2.689484119415283, "step": 5524 }, { "epoch": 0.9, "learning_rate": 8.376981951741533e-07, "logits/chosen": -0.6211013793945312, "logits/rejected": -0.5163062214851379, "logps/chosen": -61.858192443847656, "logps/rejected": -87.1312484741211, "loss": 1.7929, "rewards/accuracies": 0.0, "rewards/chosen": 1.5705422163009644, "rewards/margins": -1.784814476966858, "rewards/rejected": 3.3553566932678223, "step": 5525 }, { "epoch": 0.9, "learning_rate": 8.376012632087266e-07, "logits/chosen": -0.7371087074279785, "logits/rejected": -0.7773438096046448, "logps/chosen": -56.67637252807617, "logps/rejected": -64.98568725585938, "loss": 0.8833, "rewards/accuracies": 0.0, "rewards/chosen": 2.154602527618408, "rewards/margins": -0.3242816925048828, "rewards/rejected": 2.478884220123291, "step": 5526 }, { "epoch": 0.9, "learning_rate": 8.37504307918247e-07, "logits/chosen": -0.25032901763916016, "logits/rejected": -0.28443440794944763, "logps/chosen": -70.46245574951172, "logps/rejected": -44.42985153198242, "loss": 1.0683, "rewards/accuracies": 0.0, "rewards/chosen": 0.1280677765607834, "rewards/margins": -1.5273582935333252, "rewards/rejected": 1.655426025390625, "step": 5527 }, { "epoch": 0.9, "learning_rate": 8.374073293094132e-07, "logits/chosen": -0.35744166374206543, "logits/rejected": -0.3551274538040161, "logps/chosen": -43.5655517578125, "logps/rejected": -44.56755828857422, "loss": 0.4466, "rewards/accuracies": 1.0, "rewards/chosen": 1.447262167930603, "rewards/margins": 0.08870887756347656, "rewards/rejected": 1.3585532903671265, "step": 5528 }, { "epoch": 0.9, "learning_rate": 8.373103273889255e-07, "logits/chosen": -0.5320099592208862, "logits/rejected": -0.49431249499320984, "logps/chosen": -60.3714485168457, "logps/rejected": -63.62062072753906, "loss": 0.3758, "rewards/accuracies": 1.0, "rewards/chosen": 1.535763144493103, "rewards/margins": 0.1665431261062622, "rewards/rejected": 1.3692200183868408, "step": 5529 }, { "epoch": 0.9, "learning_rate": 8.372133021634859e-07, "logits/chosen": -0.5689288973808289, "logits/rejected": -0.5644667148590088, "logps/chosen": -82.10757446289062, "logps/rejected": -67.8590316772461, "loss": 0.504, "rewards/accuracies": 1.0, "rewards/chosen": 3.2856171131134033, "rewards/margins": 1.257875919342041, "rewards/rejected": 2.0277411937713623, "step": 5530 }, { "epoch": 0.9, "learning_rate": 8.37116253639798e-07, "logits/chosen": -0.4286091923713684, "logits/rejected": -0.4190559983253479, "logps/chosen": -73.01968383789062, "logps/rejected": -54.880157470703125, "loss": 0.5041, "rewards/accuracies": 0.0, "rewards/chosen": 2.0086090564727783, "rewards/margins": -0.17919015884399414, "rewards/rejected": 2.1877992153167725, "step": 5531 }, { "epoch": 0.9, "learning_rate": 8.370191818245667e-07, "logits/chosen": -0.5746833682060242, "logits/rejected": -0.532038152217865, "logps/chosen": -172.01194763183594, "logps/rejected": -58.70697021484375, "loss": 0.3834, "rewards/accuracies": 0.0, "rewards/chosen": 2.985154867172241, "rewards/margins": -0.11769700050354004, "rewards/rejected": 3.1028518676757812, "step": 5532 }, { "epoch": 0.9, "learning_rate": 8.369220867244987e-07, "logits/chosen": -0.3959246873855591, "logits/rejected": -0.35149168968200684, "logps/chosen": -65.06031799316406, "logps/rejected": -55.64714050292969, "loss": 0.4941, "rewards/accuracies": 0.0, "rewards/chosen": 1.7367066144943237, "rewards/margins": -0.09419095516204834, "rewards/rejected": 1.830897569656372, "step": 5533 }, { "epoch": 0.9, "learning_rate": 8.368249683463027e-07, "logits/chosen": -0.8262709379196167, "logits/rejected": -0.8370115160942078, "logps/chosen": -93.26094055175781, "logps/rejected": -134.52847290039062, "loss": 0.4567, "rewards/accuracies": 0.0, "rewards/chosen": 1.4555526971817017, "rewards/margins": -0.3258330821990967, "rewards/rejected": 1.7813857793807983, "step": 5534 }, { "epoch": 0.9, "learning_rate": 8.367278266966882e-07, "logits/chosen": -0.9988089203834534, "logits/rejected": -0.9702674150466919, "logps/chosen": -84.11663818359375, "logps/rejected": -133.732666015625, "loss": 0.3161, "rewards/accuracies": 1.0, "rewards/chosen": 5.276004314422607, "rewards/margins": 0.2990083694458008, "rewards/rejected": 4.976995944976807, "step": 5535 }, { "epoch": 0.9, "learning_rate": 8.366306617823672e-07, "logits/chosen": -0.8338054418563843, "logits/rejected": -0.7413277626037598, "logps/chosen": -87.11576843261719, "logps/rejected": -47.16387176513672, "loss": 0.4519, "rewards/accuracies": 0.0, "rewards/chosen": 1.0956611633300781, "rewards/margins": -0.11191976070404053, "rewards/rejected": 1.2075809240341187, "step": 5536 }, { "epoch": 0.9, "learning_rate": 8.365334736100526e-07, "logits/chosen": -0.8184409737586975, "logits/rejected": -0.8917279839515686, "logps/chosen": -48.257808685302734, "logps/rejected": -85.4904556274414, "loss": 1.1118, "rewards/accuracies": 0.0, "rewards/chosen": 1.4020988941192627, "rewards/margins": -2.095611572265625, "rewards/rejected": 3.4977104663848877, "step": 5537 }, { "epoch": 0.9, "learning_rate": 8.364362621864594e-07, "logits/chosen": -0.30753785371780396, "logits/rejected": -0.3228858411312103, "logps/chosen": -54.67087173461914, "logps/rejected": -76.84721374511719, "loss": 0.2569, "rewards/accuracies": 1.0, "rewards/chosen": 2.3562984466552734, "rewards/margins": 0.5618083477020264, "rewards/rejected": 1.794490098953247, "step": 5538 }, { "epoch": 0.9, "learning_rate": 8.363390275183039e-07, "logits/chosen": -0.5180072784423828, "logits/rejected": -0.5306776762008667, "logps/chosen": -30.837318420410156, "logps/rejected": -51.71424102783203, "loss": 0.5134, "rewards/accuracies": 0.0, "rewards/chosen": 0.23032760620117188, "rewards/margins": -0.4292808771133423, "rewards/rejected": 0.6596084833145142, "step": 5539 }, { "epoch": 0.9, "learning_rate": 8.362417696123039e-07, "logits/chosen": -0.46917015314102173, "logits/rejected": -0.4818364083766937, "logps/chosen": -19.074220657348633, "logps/rejected": -6.06442928314209, "loss": 1.4694, "rewards/accuracies": 0.0, "rewards/chosen": -0.03464832529425621, "rewards/margins": -0.3144180476665497, "rewards/rejected": 0.2797697186470032, "step": 5540 }, { "epoch": 0.9, "learning_rate": 8.361444884751793e-07, "logits/chosen": -0.8174414038658142, "logits/rejected": -0.8197199702262878, "logps/chosen": -112.43365478515625, "logps/rejected": -81.0550537109375, "loss": 0.5577, "rewards/accuracies": 1.0, "rewards/chosen": 2.5358307361602783, "rewards/margins": 0.6437476873397827, "rewards/rejected": 1.8920830488204956, "step": 5541 }, { "epoch": 0.9, "learning_rate": 8.360471841136511e-07, "logits/chosen": -1.0246713161468506, "logits/rejected": -0.9300302863121033, "logps/chosen": -159.15530395507812, "logps/rejected": -68.47168731689453, "loss": 0.9141, "rewards/accuracies": 0.0, "rewards/chosen": 1.0980545282363892, "rewards/margins": -1.156603217124939, "rewards/rejected": 2.254657745361328, "step": 5542 }, { "epoch": 0.9, "learning_rate": 8.359498565344422e-07, "logits/chosen": -0.6587657928466797, "logits/rejected": -0.6676728129386902, "logps/chosen": -50.666847229003906, "logps/rejected": -47.54337692260742, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.8731743097305298, "rewards/margins": 0.04741251468658447, "rewards/rejected": 1.8257617950439453, "step": 5543 }, { "epoch": 0.9, "learning_rate": 8.35852505744277e-07, "logits/chosen": -0.8472462892532349, "logits/rejected": -0.7477962374687195, "logps/chosen": -91.50373840332031, "logps/rejected": -67.88616943359375, "loss": 0.7428, "rewards/accuracies": 0.0, "rewards/chosen": 2.2930145263671875, "rewards/margins": -0.25548553466796875, "rewards/rejected": 2.5485000610351562, "step": 5544 }, { "epoch": 0.9, "learning_rate": 8.357551317498816e-07, "logits/chosen": -0.3829875886440277, "logits/rejected": -0.3255440890789032, "logps/chosen": -54.87104034423828, "logps/rejected": -64.96038055419922, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 1.5059928894042969, "rewards/margins": 1.0445899963378906, "rewards/rejected": 0.46140289306640625, "step": 5545 }, { "epoch": 0.9, "learning_rate": 8.356577345579836e-07, "logits/chosen": -0.9700143933296204, "logits/rejected": -1.0189721584320068, "logps/chosen": -309.8822326660156, "logps/rejected": -65.0931396484375, "loss": 1.2259, "rewards/accuracies": 1.0, "rewards/chosen": 5.656576633453369, "rewards/margins": 2.672853946685791, "rewards/rejected": 2.983722686767578, "step": 5546 }, { "epoch": 0.9, "learning_rate": 8.35560314175312e-07, "logits/chosen": -0.6817173361778259, "logits/rejected": -0.6817173361778259, "logps/chosen": -49.48426818847656, "logps/rejected": -49.48426818847656, "loss": 1.3234, "rewards/accuracies": 0.0, "rewards/chosen": 2.9633376598358154, "rewards/margins": 0.0, "rewards/rejected": 2.9633376598358154, "step": 5547 }, { "epoch": 0.9, "learning_rate": 8.354628706085978e-07, "logits/chosen": -0.5276585817337036, "logits/rejected": -0.5172088146209717, "logps/chosen": -29.446521759033203, "logps/rejected": -28.94527816772461, "loss": 1.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.3041702210903168, "rewards/margins": 0.1797511875629425, "rewards/rejected": 0.12441902607679367, "step": 5548 }, { "epoch": 0.9, "learning_rate": 8.353654038645735e-07, "logits/chosen": -0.4804243743419647, "logits/rejected": -0.34477701783180237, "logps/chosen": -112.2995834350586, "logps/rejected": -87.73218536376953, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 5.430495738983154, "rewards/margins": 2.8287065029144287, "rewards/rejected": 2.6017892360687256, "step": 5549 }, { "epoch": 0.9, "learning_rate": 8.35267913949973e-07, "logits/chosen": -1.252242088317871, "logits/rejected": -1.2333736419677734, "logps/chosen": -101.0963363647461, "logps/rejected": -123.45579528808594, "loss": 1.6314, "rewards/accuracies": 0.0, "rewards/chosen": 1.0270347595214844, "rewards/margins": -3.2217249870300293, "rewards/rejected": 4.248759746551514, "step": 5550 }, { "epoch": 0.9, "learning_rate": 8.35170400871532e-07, "logits/chosen": -0.6529393792152405, "logits/rejected": -0.5429926514625549, "logps/chosen": -104.33607482910156, "logps/rejected": -81.93609619140625, "loss": 0.4527, "rewards/accuracies": 0.0, "rewards/chosen": 1.0427414178848267, "rewards/margins": -0.37459254264831543, "rewards/rejected": 1.417333960533142, "step": 5551 }, { "epoch": 0.9, "learning_rate": 8.350728646359877e-07, "logits/chosen": -1.434136152267456, "logits/rejected": -0.8314412832260132, "logps/chosen": -105.6204833984375, "logps/rejected": -128.5416717529297, "loss": 1.2084, "rewards/accuracies": 0.0, "rewards/chosen": 1.5605720281600952, "rewards/margins": -0.14810717105865479, "rewards/rejected": 1.70867919921875, "step": 5552 }, { "epoch": 0.9, "learning_rate": 8.349753052500789e-07, "logits/chosen": -0.8157958388328552, "logits/rejected": -0.7277767062187195, "logps/chosen": -93.11488342285156, "logps/rejected": -31.289535522460938, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": 0.9975387454032898, "rewards/margins": 1.0719127655029297, "rewards/rejected": -0.0743740126490593, "step": 5553 }, { "epoch": 0.9, "learning_rate": 8.348777227205462e-07, "logits/chosen": -0.5100323557853699, "logits/rejected": -0.48688024282455444, "logps/chosen": -57.23268127441406, "logps/rejected": -81.98767852783203, "loss": 0.3311, "rewards/accuracies": 1.0, "rewards/chosen": 1.809700846672058, "rewards/margins": 0.35423743724823, "rewards/rejected": 1.4554634094238281, "step": 5554 }, { "epoch": 0.9, "learning_rate": 8.347801170541313e-07, "logits/chosen": -0.4243755340576172, "logits/rejected": -0.4119059443473816, "logps/chosen": -93.749755859375, "logps/rejected": -106.7269058227539, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.5450714230537415, "rewards/margins": 0.0525718629360199, "rewards/rejected": 0.49249956011772156, "step": 5555 }, { "epoch": 0.9, "learning_rate": 8.346824882575781e-07, "logits/chosen": -0.6574277281761169, "logits/rejected": -0.676859974861145, "logps/chosen": -93.50155639648438, "logps/rejected": -62.83960723876953, "loss": 1.1772, "rewards/accuracies": 0.0, "rewards/chosen": 1.1692123413085938, "rewards/margins": -2.174710988998413, "rewards/rejected": 3.343923330307007, "step": 5556 }, { "epoch": 0.9, "learning_rate": 8.345848363376317e-07, "logits/chosen": -0.15998342633247375, "logits/rejected": -0.15998342633247375, "logps/chosen": -42.58708190917969, "logps/rejected": -42.58708190917969, "loss": 1.7343, "rewards/accuracies": 0.0, "rewards/chosen": 0.9075164794921875, "rewards/margins": 0.0, "rewards/rejected": 0.9075164794921875, "step": 5557 }, { "epoch": 0.9, "learning_rate": 8.344871613010392e-07, "logits/chosen": -0.9956190586090088, "logits/rejected": -1.0178216695785522, "logps/chosen": -109.65229797363281, "logps/rejected": -109.32427215576172, "loss": 0.8343, "rewards/accuracies": 0.0, "rewards/chosen": 0.8076431155204773, "rewards/margins": -1.3820395469665527, "rewards/rejected": 2.189682722091675, "step": 5558 }, { "epoch": 0.9, "learning_rate": 8.343894631545487e-07, "logits/chosen": -0.6458343863487244, "logits/rejected": -0.641592800617218, "logps/chosen": -35.82981491088867, "logps/rejected": -43.87017822265625, "loss": 0.5954, "rewards/accuracies": 1.0, "rewards/chosen": 1.6155627965927124, "rewards/margins": 0.10046875476837158, "rewards/rejected": 1.5150940418243408, "step": 5559 }, { "epoch": 0.9, "learning_rate": 8.342917419049103e-07, "logits/chosen": -0.292558491230011, "logits/rejected": -0.292558491230011, "logps/chosen": -37.13959503173828, "logps/rejected": -37.13959503173828, "loss": 0.8922, "rewards/accuracies": 0.0, "rewards/chosen": 1.124578833580017, "rewards/margins": 0.0, "rewards/rejected": 1.124578833580017, "step": 5560 }, { "epoch": 0.9, "learning_rate": 8.341939975588756e-07, "logits/chosen": -0.9407438039779663, "logits/rejected": -0.911047637462616, "logps/chosen": -95.89486694335938, "logps/rejected": -85.99551391601562, "loss": 1.3959, "rewards/accuracies": 0.0, "rewards/chosen": 0.26431503891944885, "rewards/margins": -1.6756858825683594, "rewards/rejected": 1.9400008916854858, "step": 5561 }, { "epoch": 0.9, "learning_rate": 8.340962301231979e-07, "logits/chosen": -0.5615924596786499, "logits/rejected": -0.5514234900474548, "logps/chosen": -64.50703430175781, "logps/rejected": -20.218353271484375, "loss": 0.7191, "rewards/accuracies": 1.0, "rewards/chosen": 0.7148048281669617, "rewards/margins": 0.46915340423583984, "rewards/rejected": 0.24565143883228302, "step": 5562 }, { "epoch": 0.9, "learning_rate": 8.339984396046321e-07, "logits/chosen": -0.43531954288482666, "logits/rejected": -0.44886767864227295, "logps/chosen": -85.78282928466797, "logps/rejected": -52.144325256347656, "loss": 0.5251, "rewards/accuracies": 0.0, "rewards/chosen": 0.774030327796936, "rewards/margins": -0.3982360363006592, "rewards/rejected": 1.1722663640975952, "step": 5563 }, { "epoch": 0.9, "learning_rate": 8.339006260099341e-07, "logits/chosen": -0.4838505685329437, "logits/rejected": -0.5639411211013794, "logps/chosen": -68.77279663085938, "logps/rejected": -59.09870147705078, "loss": 1.0963, "rewards/accuracies": 1.0, "rewards/chosen": 1.918365478515625, "rewards/margins": 0.3996391296386719, "rewards/rejected": 1.5187263488769531, "step": 5564 }, { "epoch": 0.9, "learning_rate": 8.338027893458623e-07, "logits/chosen": -0.7968247532844543, "logits/rejected": -0.6927821040153503, "logps/chosen": -47.41753005981445, "logps/rejected": -59.769248962402344, "loss": 0.657, "rewards/accuracies": 0.0, "rewards/chosen": 1.830532431602478, "rewards/margins": -0.5265392065048218, "rewards/rejected": 2.3570716381073, "step": 5565 }, { "epoch": 0.9, "learning_rate": 8.337049296191765e-07, "logits/chosen": -1.0386563539505005, "logits/rejected": -0.9694790840148926, "logps/chosen": -124.3902816772461, "logps/rejected": -40.18156051635742, "loss": 0.4127, "rewards/accuracies": 1.0, "rewards/chosen": 1.0620231628417969, "rewards/margins": 0.8284305334091187, "rewards/rejected": 0.23359261453151703, "step": 5566 }, { "epoch": 0.9, "learning_rate": 8.336070468366374e-07, "logits/chosen": -0.42014163732528687, "logits/rejected": -0.44078803062438965, "logps/chosen": -127.0854263305664, "logps/rejected": -57.874534606933594, "loss": 0.6867, "rewards/accuracies": 0.0, "rewards/chosen": 1.0550529956817627, "rewards/margins": -0.9732780456542969, "rewards/rejected": 2.0283310413360596, "step": 5567 }, { "epoch": 0.9, "learning_rate": 8.33509141005008e-07, "logits/chosen": -0.4965338408946991, "logits/rejected": -0.31407463550567627, "logps/chosen": -88.78756713867188, "logps/rejected": -65.0854721069336, "loss": 0.2129, "rewards/accuracies": 1.0, "rewards/chosen": 6.27632474899292, "rewards/margins": 3.393697500228882, "rewards/rejected": 2.882627248764038, "step": 5568 }, { "epoch": 0.9, "learning_rate": 8.334112121310526e-07, "logits/chosen": -0.8818251490592957, "logits/rejected": -0.8737823367118835, "logps/chosen": -212.16522216796875, "logps/rejected": -186.909423828125, "loss": 0.8869, "rewards/accuracies": 0.0, "rewards/chosen": 4.329530239105225, "rewards/margins": -1.4806690216064453, "rewards/rejected": 5.81019926071167, "step": 5569 }, { "epoch": 0.9, "learning_rate": 8.333132602215374e-07, "logits/chosen": -0.7462953925132751, "logits/rejected": -0.6761311292648315, "logps/chosen": -66.7714614868164, "logps/rejected": -71.05137634277344, "loss": 0.8325, "rewards/accuracies": 0.0, "rewards/chosen": 2.155197858810425, "rewards/margins": -0.06891870498657227, "rewards/rejected": 2.224116563796997, "step": 5570 }, { "epoch": 0.9, "learning_rate": 8.332152852832296e-07, "logits/chosen": -0.7719724178314209, "logits/rejected": -0.7194503545761108, "logps/chosen": -142.1088104248047, "logps/rejected": -114.7321548461914, "loss": 0.2112, "rewards/accuracies": 1.0, "rewards/chosen": 1.1544967889785767, "rewards/margins": 0.7891311645507812, "rewards/rejected": 0.365365594625473, "step": 5571 }, { "epoch": 0.9, "learning_rate": 8.331172873228985e-07, "logits/chosen": -0.8193566799163818, "logits/rejected": -0.7883692383766174, "logps/chosen": -40.274513244628906, "logps/rejected": -83.20321655273438, "loss": 0.7741, "rewards/accuracies": 1.0, "rewards/chosen": 2.1480941772460938, "rewards/margins": 1.176672339439392, "rewards/rejected": 0.9714218378067017, "step": 5572 }, { "epoch": 0.9, "learning_rate": 8.330192663473149e-07, "logits/chosen": -0.8434346318244934, "logits/rejected": -0.7622320055961609, "logps/chosen": -74.4613265991211, "logps/rejected": -145.03953552246094, "loss": 0.1539, "rewards/accuracies": 1.0, "rewards/chosen": 5.947052955627441, "rewards/margins": 1.3905177116394043, "rewards/rejected": 4.556535243988037, "step": 5573 }, { "epoch": 0.9, "learning_rate": 8.32921222363251e-07, "logits/chosen": -0.6899403929710388, "logits/rejected": -0.6914669871330261, "logps/chosen": -58.70616912841797, "logps/rejected": -53.697113037109375, "loss": 0.4905, "rewards/accuracies": 0.0, "rewards/chosen": 1.1324493885040283, "rewards/margins": -0.43743133544921875, "rewards/rejected": 1.569880723953247, "step": 5574 }, { "epoch": 0.9, "learning_rate": 8.328231553774808e-07, "logits/chosen": -0.8130666017532349, "logits/rejected": -0.6782600283622742, "logps/chosen": -51.37110137939453, "logps/rejected": -20.74714469909668, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 2.579127550125122, "rewards/margins": 2.28851580619812, "rewards/rejected": 0.2906116545200348, "step": 5575 }, { "epoch": 0.91, "learning_rate": 8.327250653967797e-07, "logits/chosen": -0.726967990398407, "logits/rejected": -0.6672766208648682, "logps/chosen": -56.1820068359375, "logps/rejected": -12.804759979248047, "loss": 0.2157, "rewards/accuracies": 1.0, "rewards/chosen": 1.3869918584823608, "rewards/margins": 0.7151885628700256, "rewards/rejected": 0.6718032956123352, "step": 5576 }, { "epoch": 0.91, "learning_rate": 8.32626952427925e-07, "logits/chosen": -0.615935206413269, "logits/rejected": -0.600429356098175, "logps/chosen": -49.66459274291992, "logps/rejected": -56.32305908203125, "loss": 0.3865, "rewards/accuracies": 1.0, "rewards/chosen": 1.943698525428772, "rewards/margins": 0.07110476493835449, "rewards/rejected": 1.8725937604904175, "step": 5577 }, { "epoch": 0.91, "learning_rate": 8.325288164776951e-07, "logits/chosen": -0.4839150607585907, "logits/rejected": -0.5350850224494934, "logps/chosen": -125.7669677734375, "logps/rejected": -64.11166381835938, "loss": 1.6333, "rewards/accuracies": 0.0, "rewards/chosen": 0.38797304034233093, "rewards/margins": -0.777567982673645, "rewards/rejected": 1.1655410528182983, "step": 5578 }, { "epoch": 0.91, "learning_rate": 8.324306575528706e-07, "logits/chosen": -0.5796747207641602, "logits/rejected": -0.5172226428985596, "logps/chosen": -91.84537506103516, "logps/rejected": -59.72697067260742, "loss": 0.9913, "rewards/accuracies": 0.0, "rewards/chosen": 0.9032821655273438, "rewards/margins": -0.7322415113449097, "rewards/rejected": 1.6355236768722534, "step": 5579 }, { "epoch": 0.91, "learning_rate": 8.323324756602329e-07, "logits/chosen": -0.35896021127700806, "logits/rejected": -0.2436690777540207, "logps/chosen": -46.43739318847656, "logps/rejected": -34.63243865966797, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 1.1644108295440674, "rewards/margins": 1.236113429069519, "rewards/rejected": -0.07170257717370987, "step": 5580 }, { "epoch": 0.91, "learning_rate": 8.322342708065657e-07, "logits/chosen": -0.6592510342597961, "logits/rejected": -0.6020559668540955, "logps/chosen": -65.25723266601562, "logps/rejected": -7.323220252990723, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 1.4588425159454346, "rewards/margins": 0.8375437259674072, "rewards/rejected": 0.6212987899780273, "step": 5581 }, { "epoch": 0.91, "learning_rate": 8.321360429986541e-07, "logits/chosen": -0.34870198369026184, "logits/rejected": -0.34597498178482056, "logps/chosen": -60.68575668334961, "logps/rejected": -62.098175048828125, "loss": 0.9053, "rewards/accuracies": 1.0, "rewards/chosen": 2.6429455280303955, "rewards/margins": 0.10634565353393555, "rewards/rejected": 2.53659987449646, "step": 5582 }, { "epoch": 0.91, "learning_rate": 8.320377922432846e-07, "logits/chosen": -1.055648922920227, "logits/rejected": -0.8008800745010376, "logps/chosen": -132.59420776367188, "logps/rejected": -25.125591278076172, "loss": 0.8151, "rewards/accuracies": 1.0, "rewards/chosen": 5.389471530914307, "rewards/margins": 4.67087984085083, "rewards/rejected": 0.7185916900634766, "step": 5583 }, { "epoch": 0.91, "learning_rate": 8.319395185472456e-07, "logits/chosen": -0.5972115397453308, "logits/rejected": -0.5740141868591309, "logps/chosen": -55.94407653808594, "logps/rejected": -88.06867980957031, "loss": 0.3443, "rewards/accuracies": 1.0, "rewards/chosen": 2.117077589035034, "rewards/margins": 0.6491286754608154, "rewards/rejected": 1.4679489135742188, "step": 5584 }, { "epoch": 0.91, "learning_rate": 8.318412219173266e-07, "logits/chosen": 0.01974831335246563, "logits/rejected": 0.01974831335246563, "logps/chosen": -44.895790100097656, "logps/rejected": -44.895790100097656, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 0.43126335740089417, "rewards/margins": 0.0, "rewards/rejected": 0.43126335740089417, "step": 5585 }, { "epoch": 0.91, "learning_rate": 8.317429023603188e-07, "logits/chosen": -0.5525270700454712, "logits/rejected": -0.45632198452949524, "logps/chosen": -56.6005744934082, "logps/rejected": -100.28221130371094, "loss": 0.7574, "rewards/accuracies": 1.0, "rewards/chosen": 2.212095260620117, "rewards/margins": 0.9686931371688843, "rewards/rejected": 1.243402123451233, "step": 5586 }, { "epoch": 0.91, "learning_rate": 8.316445598830157e-07, "logits/chosen": -0.6085970401763916, "logits/rejected": -0.6057291626930237, "logps/chosen": -73.62602233886719, "logps/rejected": -131.98544311523438, "loss": 1.1084, "rewards/accuracies": 0.0, "rewards/chosen": 1.9663909673690796, "rewards/margins": -1.6913942098617554, "rewards/rejected": 3.657785177230835, "step": 5587 }, { "epoch": 0.91, "learning_rate": 8.315461944922117e-07, "logits/chosen": -0.6240792274475098, "logits/rejected": -0.6210518479347229, "logps/chosen": -3.0493357181549072, "logps/rejected": -20.127004623413086, "loss": 0.6513, "rewards/accuracies": 0.0, "rewards/chosen": 0.39777520298957825, "rewards/margins": -0.03308627009391785, "rewards/rejected": 0.4308614730834961, "step": 5588 }, { "epoch": 0.91, "learning_rate": 8.314478061947026e-07, "logits/chosen": -0.6147014498710632, "logits/rejected": -0.4365285634994507, "logps/chosen": -287.21197509765625, "logps/rejected": -51.99037551879883, "loss": 0.374, "rewards/accuracies": 1.0, "rewards/chosen": 4.715661525726318, "rewards/margins": 3.8078646659851074, "rewards/rejected": 0.9077968597412109, "step": 5589 }, { "epoch": 0.91, "learning_rate": 8.313493949972863e-07, "logits/chosen": -1.0476412773132324, "logits/rejected": -1.0378092527389526, "logps/chosen": -182.1080780029297, "logps/rejected": -92.82919311523438, "loss": 0.6599, "rewards/accuracies": 0.0, "rewards/chosen": 1.1858657598495483, "rewards/margins": -0.9912155866622925, "rewards/rejected": 2.177081346511841, "step": 5590 }, { "epoch": 0.91, "learning_rate": 8.31250960906762e-07, "logits/chosen": -0.8532108664512634, "logits/rejected": -0.761053204536438, "logps/chosen": -111.05066680908203, "logps/rejected": -187.7151641845703, "loss": 0.6588, "rewards/accuracies": 0.0, "rewards/chosen": 5.084015846252441, "rewards/margins": -0.9912910461425781, "rewards/rejected": 6.0753068923950195, "step": 5591 }, { "epoch": 0.91, "learning_rate": 8.311525039299307e-07, "logits/chosen": -1.0373953580856323, "logits/rejected": -1.0236711502075195, "logps/chosen": -94.28306579589844, "logps/rejected": -163.38949584960938, "loss": 0.2664, "rewards/accuracies": 1.0, "rewards/chosen": 4.481056213378906, "rewards/margins": 0.35485076904296875, "rewards/rejected": 4.1262054443359375, "step": 5592 }, { "epoch": 0.91, "learning_rate": 8.310540240735947e-07, "logits/chosen": -0.7801750302314758, "logits/rejected": -0.8319079279899597, "logps/chosen": -199.98858642578125, "logps/rejected": -130.0998077392578, "loss": 1.2346, "rewards/accuracies": 0.0, "rewards/chosen": 2.695587158203125, "rewards/margins": -2.3699569702148438, "rewards/rejected": 5.065544128417969, "step": 5593 }, { "epoch": 0.91, "learning_rate": 8.309555213445583e-07, "logits/chosen": -0.9529551863670349, "logits/rejected": -0.8893795013427734, "logps/chosen": -61.34269714355469, "logps/rejected": -20.952369689941406, "loss": 0.1222, "rewards/accuracies": 1.0, "rewards/chosen": 2.2465531826019287, "rewards/margins": 1.8694602251052856, "rewards/rejected": 0.3770929276943207, "step": 5594 }, { "epoch": 0.91, "learning_rate": 8.308569957496266e-07, "logits/chosen": -0.16867326200008392, "logits/rejected": -0.1624114215373993, "logps/chosen": -4.103194236755371, "logps/rejected": -2.799503803253174, "loss": 0.5583, "rewards/accuracies": 1.0, "rewards/chosen": 0.17125844955444336, "rewards/margins": 0.0067479610443115234, "rewards/rejected": 0.16451048851013184, "step": 5595 }, { "epoch": 0.91, "learning_rate": 8.307584472956073e-07, "logits/chosen": -1.0335216522216797, "logits/rejected": -0.9703976511955261, "logps/chosen": -115.27787780761719, "logps/rejected": -158.87347412109375, "loss": 0.467, "rewards/accuracies": 0.0, "rewards/chosen": 4.103582859039307, "rewards/margins": -0.36467599868774414, "rewards/rejected": 4.468258857727051, "step": 5596 }, { "epoch": 0.91, "learning_rate": 8.306598759893088e-07, "logits/chosen": -0.4316163957118988, "logits/rejected": -0.4316163957118988, "logps/chosen": -39.846126556396484, "logps/rejected": -39.846126556396484, "loss": 0.5114, "rewards/accuracies": 0.0, "rewards/chosen": 0.5463027954101562, "rewards/margins": 0.0, "rewards/rejected": 0.5463027954101562, "step": 5597 }, { "epoch": 0.91, "learning_rate": 8.305612818375418e-07, "logits/chosen": -0.8184951543807983, "logits/rejected": -0.725709080696106, "logps/chosen": -78.44731140136719, "logps/rejected": -40.9658203125, "loss": 0.2946, "rewards/accuracies": 1.0, "rewards/chosen": 0.8839516043663025, "rewards/margins": 0.4075706899166107, "rewards/rejected": 0.4763809144496918, "step": 5598 }, { "epoch": 0.91, "learning_rate": 8.30462664847118e-07, "logits/chosen": -0.6117721796035767, "logits/rejected": -0.567775547504425, "logps/chosen": -52.01258087158203, "logps/rejected": -40.14645767211914, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 2.107434034347534, "rewards/margins": 0.7775362730026245, "rewards/rejected": 1.3298977613449097, "step": 5599 }, { "epoch": 0.91, "learning_rate": 8.30364025024851e-07, "logits/chosen": -0.5869085192680359, "logits/rejected": -0.6117777228355408, "logps/chosen": -65.34307861328125, "logps/rejected": -41.81645584106445, "loss": 0.8045, "rewards/accuracies": 0.0, "rewards/chosen": 1.5451287031173706, "rewards/margins": -0.3781161308288574, "rewards/rejected": 1.923244833946228, "step": 5600 }, { "epoch": 0.91, "learning_rate": 8.302653623775556e-07, "logits/chosen": -0.5396726131439209, "logits/rejected": -0.28207314014434814, "logps/chosen": -68.07218933105469, "logps/rejected": -51.75217056274414, "loss": 0.9321, "rewards/accuracies": 0.0, "rewards/chosen": 0.29012298583984375, "rewards/margins": -0.9564151763916016, "rewards/rejected": 1.2465381622314453, "step": 5601 }, { "epoch": 0.91, "learning_rate": 8.301666769120487e-07, "logits/chosen": -0.1806088387966156, "logits/rejected": -0.1806088387966156, "logps/chosen": -46.728336334228516, "logps/rejected": -46.728336334228516, "loss": 0.5411, "rewards/accuracies": 0.0, "rewards/chosen": 0.48820915818214417, "rewards/margins": 0.0, "rewards/rejected": 0.48820915818214417, "step": 5602 }, { "epoch": 0.91, "learning_rate": 8.300679686351484e-07, "logits/chosen": -0.8673487901687622, "logits/rejected": -0.9162833094596863, "logps/chosen": -85.53900146484375, "logps/rejected": -99.23774719238281, "loss": 1.6818, "rewards/accuracies": 0.0, "rewards/chosen": 1.4887886047363281, "rewards/margins": -1.7863075733184814, "rewards/rejected": 3.2750961780548096, "step": 5603 }, { "epoch": 0.91, "learning_rate": 8.299692375536747e-07, "logits/chosen": -0.6811392903327942, "logits/rejected": -0.7738785147666931, "logps/chosen": -64.43204498291016, "logps/rejected": -85.53483581542969, "loss": 1.3545, "rewards/accuracies": 0.0, "rewards/chosen": 0.9303092956542969, "rewards/margins": -2.490010976791382, "rewards/rejected": 3.4203202724456787, "step": 5604 }, { "epoch": 0.91, "learning_rate": 8.29870483674449e-07, "logits/chosen": -0.42169103026390076, "logits/rejected": -0.42001596093177795, "logps/chosen": -2.273104667663574, "logps/rejected": -9.566155433654785, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 0.21610751748085022, "rewards/margins": -0.020630016922950745, "rewards/rejected": 0.23673753440380096, "step": 5605 }, { "epoch": 0.91, "learning_rate": 8.29771707004294e-07, "logits/chosen": -0.5196187496185303, "logits/rejected": -0.3902696967124939, "logps/chosen": -79.61167907714844, "logps/rejected": -13.656341552734375, "loss": 0.1732, "rewards/accuracies": 1.0, "rewards/chosen": 1.436425805091858, "rewards/margins": 1.0164697170257568, "rewards/rejected": 0.4199560284614563, "step": 5606 }, { "epoch": 0.91, "learning_rate": 8.296729075500343e-07, "logits/chosen": -0.21928511559963226, "logits/rejected": -0.20669786632061005, "logps/chosen": -29.47528076171875, "logps/rejected": -1.6132508516311646, "loss": 1.5514, "rewards/accuracies": 0.0, "rewards/chosen": -0.012369346804916859, "rewards/margins": -0.3938678801059723, "rewards/rejected": 0.3814985454082489, "step": 5607 }, { "epoch": 0.91, "learning_rate": 8.295740853184962e-07, "logits/chosen": -0.9547151327133179, "logits/rejected": -0.9567146301269531, "logps/chosen": -104.80061340332031, "logps/rejected": -66.02938842773438, "loss": 0.7102, "rewards/accuracies": 0.0, "rewards/chosen": 2.2876617908477783, "rewards/margins": -1.0407013893127441, "rewards/rejected": 3.3283631801605225, "step": 5608 }, { "epoch": 0.91, "learning_rate": 8.294752403165074e-07, "logits/chosen": -0.5163635015487671, "logits/rejected": -0.515518844127655, "logps/chosen": -76.98992919921875, "logps/rejected": -62.87910842895508, "loss": 1.0485, "rewards/accuracies": 0.0, "rewards/chosen": 1.4982903003692627, "rewards/margins": -0.06377601623535156, "rewards/rejected": 1.5620663166046143, "step": 5609 }, { "epoch": 0.91, "learning_rate": 8.293763725508969e-07, "logits/chosen": -0.8467335104942322, "logits/rejected": -0.8552218079566956, "logps/chosen": -42.31206130981445, "logps/rejected": -124.37992095947266, "loss": 2.6261, "rewards/accuracies": 0.0, "rewards/chosen": 2.338439702987671, "rewards/margins": -3.3213651180267334, "rewards/rejected": 5.659804821014404, "step": 5610 }, { "epoch": 0.91, "learning_rate": 8.292774820284956e-07, "logits/chosen": -0.7683063745498657, "logits/rejected": -0.7769626975059509, "logps/chosen": -79.034912109375, "logps/rejected": -41.965087890625, "loss": 0.399, "rewards/accuracies": 0.0, "rewards/chosen": 0.9133338928222656, "rewards/margins": -0.14731216430664062, "rewards/rejected": 1.0606460571289062, "step": 5611 }, { "epoch": 0.91, "learning_rate": 8.291785687561359e-07, "logits/chosen": -0.7265235185623169, "logits/rejected": -0.7513306140899658, "logps/chosen": -95.90680694580078, "logps/rejected": -125.230224609375, "loss": 0.498, "rewards/accuracies": 1.0, "rewards/chosen": 1.656023383140564, "rewards/margins": 0.062477827072143555, "rewards/rejected": 1.5935455560684204, "step": 5612 }, { "epoch": 0.91, "learning_rate": 8.290796327406519e-07, "logits/chosen": -0.9510303735733032, "logits/rejected": -0.9390691518783569, "logps/chosen": -235.96746826171875, "logps/rejected": -131.36221313476562, "loss": 0.4954, "rewards/accuracies": 0.0, "rewards/chosen": 0.5847061276435852, "rewards/margins": -0.4218490719795227, "rewards/rejected": 1.006555199623108, "step": 5613 }, { "epoch": 0.91, "learning_rate": 8.289806739888791e-07, "logits/chosen": -0.8241721391677856, "logits/rejected": -0.7828127145767212, "logps/chosen": -123.01962280273438, "logps/rejected": -135.94825744628906, "loss": 1.3253, "rewards/accuracies": 0.0, "rewards/chosen": 4.187291145324707, "rewards/margins": -2.3953351974487305, "rewards/rejected": 6.5826263427734375, "step": 5614 }, { "epoch": 0.91, "learning_rate": 8.288816925076545e-07, "logits/chosen": -1.0137344598770142, "logits/rejected": -0.9985402822494507, "logps/chosen": -106.55191802978516, "logps/rejected": -57.465816497802734, "loss": 0.1902, "rewards/accuracies": 1.0, "rewards/chosen": 3.6806817054748535, "rewards/margins": 2.056201457977295, "rewards/rejected": 1.624480128288269, "step": 5615 }, { "epoch": 0.91, "learning_rate": 8.287826883038168e-07, "logits/chosen": -0.6368831396102905, "logits/rejected": -0.7451356053352356, "logps/chosen": -57.597900390625, "logps/rejected": -131.31040954589844, "loss": 1.2733, "rewards/accuracies": 0.0, "rewards/chosen": 1.6288727521896362, "rewards/margins": -1.68297278881073, "rewards/rejected": 3.311845541000366, "step": 5616 }, { "epoch": 0.91, "learning_rate": 8.286836613842064e-07, "logits/chosen": -0.7872714400291443, "logits/rejected": -0.7119435667991638, "logps/chosen": -170.88516235351562, "logps/rejected": -109.3231430053711, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 5.58914041519165, "rewards/margins": 0.7561101913452148, "rewards/rejected": 4.8330302238464355, "step": 5617 }, { "epoch": 0.91, "learning_rate": 8.28584611755665e-07, "logits/chosen": -0.718879759311676, "logits/rejected": -0.6984928250312805, "logps/chosen": -49.12964630126953, "logps/rejected": -6.887899875640869, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": 1.2203041315078735, "rewards/margins": 0.3723863959312439, "rewards/rejected": 0.8479177355766296, "step": 5618 }, { "epoch": 0.91, "learning_rate": 8.284855394250361e-07, "logits/chosen": -0.6840859651565552, "logits/rejected": -0.725942075252533, "logps/chosen": -118.03264617919922, "logps/rejected": -50.603511810302734, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.9912864565849304, "rewards/margins": -0.49038583040237427, "rewards/rejected": 1.4816722869873047, "step": 5619 }, { "epoch": 0.91, "learning_rate": 8.283864443991645e-07, "logits/chosen": -0.484698623418808, "logits/rejected": -0.484698623418808, "logps/chosen": -0.9469274878501892, "logps/rejected": -0.9469274878501892, "loss": 0.5493, "rewards/accuracies": 0.0, "rewards/chosen": 0.2977088987827301, "rewards/margins": 0.0, "rewards/rejected": 0.2977088987827301, "step": 5620 }, { "epoch": 0.91, "learning_rate": 8.282873266848969e-07, "logits/chosen": -0.5484274625778198, "logits/rejected": -0.543342649936676, "logps/chosen": -58.44994354248047, "logps/rejected": -53.998619079589844, "loss": 1.4061, "rewards/accuracies": 0.0, "rewards/chosen": 1.8697693347930908, "rewards/margins": -0.11125564575195312, "rewards/rejected": 1.981024980545044, "step": 5621 }, { "epoch": 0.91, "learning_rate": 8.281881862890811e-07, "logits/chosen": -0.6331930756568909, "logits/rejected": -0.6331930756568909, "logps/chosen": -79.19947052001953, "logps/rejected": -79.19947052001953, "loss": 0.3843, "rewards/accuracies": 0.0, "rewards/chosen": 1.8745644092559814, "rewards/margins": 0.0, "rewards/rejected": 1.8745644092559814, "step": 5622 }, { "epoch": 0.91, "learning_rate": 8.280890232185671e-07, "logits/chosen": -0.7866244316101074, "logits/rejected": -0.8125447630882263, "logps/chosen": -110.02267456054688, "logps/rejected": -78.09635162353516, "loss": 1.4639, "rewards/accuracies": 0.0, "rewards/chosen": 0.5544754266738892, "rewards/margins": -2.8573083877563477, "rewards/rejected": 3.4117836952209473, "step": 5623 }, { "epoch": 0.91, "learning_rate": 8.279898374802061e-07, "logits/chosen": -0.21676039695739746, "logits/rejected": -0.21021690964698792, "logps/chosen": -19.82818603515625, "logps/rejected": -2.062120199203491, "loss": 0.64, "rewards/accuracies": 0.0, "rewards/chosen": -0.057619668543338776, "rewards/margins": -0.44513627886772156, "rewards/rejected": 0.3875166177749634, "step": 5624 }, { "epoch": 0.91, "learning_rate": 8.278906290808507e-07, "logits/chosen": -0.24127018451690674, "logits/rejected": -0.24127018451690674, "logps/chosen": -81.88711547851562, "logps/rejected": -81.88711547851562, "loss": 0.3742, "rewards/accuracies": 0.0, "rewards/chosen": -0.12068786472082138, "rewards/margins": 0.0, "rewards/rejected": -0.12068786472082138, "step": 5625 }, { "epoch": 0.91, "learning_rate": 8.277913980273554e-07, "logits/chosen": -1.1191712617874146, "logits/rejected": -0.9812462329864502, "logps/chosen": -144.65565490722656, "logps/rejected": -95.7669448852539, "loss": 0.3738, "rewards/accuracies": 1.0, "rewards/chosen": 3.691999912261963, "rewards/margins": 0.09658598899841309, "rewards/rejected": 3.59541392326355, "step": 5626 }, { "epoch": 0.91, "learning_rate": 8.276921443265761e-07, "logits/chosen": -0.5872101783752441, "logits/rejected": -0.5892110466957092, "logps/chosen": -69.91020202636719, "logps/rejected": -172.81471252441406, "loss": 0.6115, "rewards/accuracies": 0.0, "rewards/chosen": 2.7713334560394287, "rewards/margins": -0.5467422008514404, "rewards/rejected": 3.318075656890869, "step": 5627 }, { "epoch": 0.91, "learning_rate": 8.275928679853702e-07, "logits/chosen": -0.5813406109809875, "logits/rejected": -0.596572995185852, "logps/chosen": -71.49990844726562, "logps/rejected": -136.0458221435547, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": 1.4516953229904175, "rewards/margins": 1.5721908807754517, "rewards/rejected": -0.12049560993909836, "step": 5628 }, { "epoch": 0.91, "learning_rate": 8.274935690105969e-07, "logits/chosen": -0.8489285707473755, "logits/rejected": -0.7938181757926941, "logps/chosen": -121.99177551269531, "logps/rejected": -163.03225708007812, "loss": 2.5236, "rewards/accuracies": 0.0, "rewards/chosen": 1.396235704421997, "rewards/margins": -3.7679641246795654, "rewards/rejected": 5.1641998291015625, "step": 5629 }, { "epoch": 0.91, "learning_rate": 8.273942474091167e-07, "logits/chosen": -0.862898588180542, "logits/rejected": -0.8059626817703247, "logps/chosen": -104.38017272949219, "logps/rejected": -103.627685546875, "loss": 0.3605, "rewards/accuracies": 1.0, "rewards/chosen": 2.6798202991485596, "rewards/margins": 0.04287266731262207, "rewards/rejected": 2.6369476318359375, "step": 5630 }, { "epoch": 0.91, "learning_rate": 8.272949031877918e-07, "logits/chosen": -0.5627052783966064, "logits/rejected": -0.43084847927093506, "logps/chosen": -61.786834716796875, "logps/rejected": -19.036325454711914, "loss": 0.3115, "rewards/accuracies": 1.0, "rewards/chosen": 1.6067520380020142, "rewards/margins": 1.3190298080444336, "rewards/rejected": 0.2877222001552582, "step": 5631 }, { "epoch": 0.91, "learning_rate": 8.27195536353486e-07, "logits/chosen": -0.4972970485687256, "logits/rejected": -0.47036561369895935, "logps/chosen": -70.7655029296875, "logps/rejected": -56.321868896484375, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 2.003344774246216, "rewards/margins": 0.8526229858398438, "rewards/rejected": 1.150721788406372, "step": 5632 }, { "epoch": 0.91, "learning_rate": 8.270961469130648e-07, "logits/chosen": -0.3281501233577728, "logits/rejected": -0.4171737730503082, "logps/chosen": -87.89191436767578, "logps/rejected": -97.98284149169922, "loss": 0.7635, "rewards/accuracies": 0.0, "rewards/chosen": 1.2097877264022827, "rewards/margins": -1.22649085521698, "rewards/rejected": 2.4362785816192627, "step": 5633 }, { "epoch": 0.91, "learning_rate": 8.269967348733946e-07, "logits/chosen": -0.4207952320575714, "logits/rejected": -0.42997145652770996, "logps/chosen": -1.9884297847747803, "logps/rejected": -1.9960395097732544, "loss": 1.4559, "rewards/accuracies": 1.0, "rewards/chosen": 0.24392831325531006, "rewards/margins": 0.013625353574752808, "rewards/rejected": 0.23030295968055725, "step": 5634 }, { "epoch": 0.91, "learning_rate": 8.268973002413442e-07, "logits/chosen": -0.7904501557350159, "logits/rejected": -0.7416874170303345, "logps/chosen": -108.79121398925781, "logps/rejected": -65.39677429199219, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": 4.768455505371094, "rewards/margins": 2.221026659011841, "rewards/rejected": 2.547428846359253, "step": 5635 }, { "epoch": 0.91, "learning_rate": 8.267978430237834e-07, "logits/chosen": -1.090879201889038, "logits/rejected": -1.018465518951416, "logps/chosen": -97.03144836425781, "logps/rejected": -192.50750732421875, "loss": 1.48, "rewards/accuracies": 0.0, "rewards/chosen": 1.212188720703125, "rewards/margins": -2.075822591781616, "rewards/rejected": 3.288011312484741, "step": 5636 }, { "epoch": 0.91, "learning_rate": 8.26698363227584e-07, "logits/chosen": -0.3527281582355499, "logits/rejected": -0.34197327494621277, "logps/chosen": -17.78464698791504, "logps/rejected": -8.241142272949219, "loss": 0.5846, "rewards/accuracies": 1.0, "rewards/chosen": 0.7376871109008789, "rewards/margins": 0.49485549330711365, "rewards/rejected": 0.24283161759376526, "step": 5637 }, { "epoch": 0.92, "learning_rate": 8.265988608596188e-07, "logits/chosen": -0.4419333338737488, "logits/rejected": -0.4489552676677704, "logps/chosen": -91.51570129394531, "logps/rejected": -97.12297058105469, "loss": 1.4129, "rewards/accuracies": 0.0, "rewards/chosen": 1.2355438470840454, "rewards/margins": -2.7463254928588867, "rewards/rejected": 3.9818694591522217, "step": 5638 }, { "epoch": 0.92, "learning_rate": 8.264993359267626e-07, "logits/chosen": -0.8420851230621338, "logits/rejected": -0.8420851230621338, "logps/chosen": -43.14781951904297, "logps/rejected": -43.14781951904297, "loss": 0.3839, "rewards/accuracies": 0.0, "rewards/chosen": 1.5618599653244019, "rewards/margins": 0.0, "rewards/rejected": 1.5618599653244019, "step": 5639 }, { "epoch": 0.92, "learning_rate": 8.26399788435892e-07, "logits/chosen": -0.9671577215194702, "logits/rejected": -0.8839550018310547, "logps/chosen": -140.0672607421875, "logps/rejected": -84.40382385253906, "loss": 0.1293, "rewards/accuracies": 1.0, "rewards/chosen": 4.504101753234863, "rewards/margins": 1.414520502090454, "rewards/rejected": 3.089581251144409, "step": 5640 }, { "epoch": 0.92, "learning_rate": 8.263002183938841e-07, "logits/chosen": -0.6627151370048523, "logits/rejected": -0.6742234230041504, "logps/chosen": -88.14730834960938, "logps/rejected": -128.30703735351562, "loss": 1.2297, "rewards/accuracies": 0.0, "rewards/chosen": 1.0419120788574219, "rewards/margins": -1.4526803493499756, "rewards/rejected": 2.4945924282073975, "step": 5641 }, { "epoch": 0.92, "learning_rate": 8.262006258076187e-07, "logits/chosen": -0.3670148253440857, "logits/rejected": -0.35007375478744507, "logps/chosen": -78.19149017333984, "logps/rejected": -112.52059173583984, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 1.4587043523788452, "rewards/margins": 0.539232611656189, "rewards/rejected": 0.9194717407226562, "step": 5642 }, { "epoch": 0.92, "learning_rate": 8.261010106839765e-07, "logits/chosen": -0.49153587222099304, "logits/rejected": -0.4729972183704376, "logps/chosen": -30.182891845703125, "logps/rejected": -92.72584533691406, "loss": 0.9041, "rewards/accuracies": 0.0, "rewards/chosen": 0.908050537109375, "rewards/margins": -0.5773323774337769, "rewards/rejected": 1.4853829145431519, "step": 5643 }, { "epoch": 0.92, "learning_rate": 8.260013730298401e-07, "logits/chosen": -0.8732290267944336, "logits/rejected": -0.670162558555603, "logps/chosen": -108.29508209228516, "logps/rejected": -88.66410064697266, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": 5.837010860443115, "rewards/margins": 3.5644171237945557, "rewards/rejected": 2.2725937366485596, "step": 5644 }, { "epoch": 0.92, "learning_rate": 8.259017128520935e-07, "logits/chosen": -0.5233725309371948, "logits/rejected": -0.5596317052841187, "logps/chosen": -93.4857177734375, "logps/rejected": -121.67337036132812, "loss": 0.8644, "rewards/accuracies": 0.0, "rewards/chosen": 1.2638763189315796, "rewards/margins": -0.9606736898422241, "rewards/rejected": 2.2245500087738037, "step": 5645 }, { "epoch": 0.92, "learning_rate": 8.258020301576223e-07, "logits/chosen": -0.5253328084945679, "logits/rejected": -0.5253328084945679, "logps/chosen": -77.63162994384766, "logps/rejected": -77.63162994384766, "loss": 0.6412, "rewards/accuracies": 0.0, "rewards/chosen": 1.8720260858535767, "rewards/margins": 0.0, "rewards/rejected": 1.8720260858535767, "step": 5646 }, { "epoch": 0.92, "learning_rate": 8.257023249533135e-07, "logits/chosen": -0.6156796216964722, "logits/rejected": -0.5434814095497131, "logps/chosen": -74.04794311523438, "logps/rejected": -115.21609497070312, "loss": 1.8693, "rewards/accuracies": 1.0, "rewards/chosen": 1.5550979375839233, "rewards/margins": 0.49473869800567627, "rewards/rejected": 1.060359239578247, "step": 5647 }, { "epoch": 0.92, "learning_rate": 8.25602597246056e-07, "logits/chosen": -0.29420673847198486, "logits/rejected": -0.44727447628974915, "logps/chosen": -42.086822509765625, "logps/rejected": -257.318115234375, "loss": 1.1434, "rewards/accuracies": 1.0, "rewards/chosen": 2.0269298553466797, "rewards/margins": 0.3288310766220093, "rewards/rejected": 1.6980987787246704, "step": 5648 }, { "epoch": 0.92, "learning_rate": 8.255028470427397e-07, "logits/chosen": -1.1811057329177856, "logits/rejected": -1.0473588705062866, "logps/chosen": -121.07112884521484, "logps/rejected": -115.87236022949219, "loss": 0.2375, "rewards/accuracies": 1.0, "rewards/chosen": 1.2357505559921265, "rewards/margins": 0.7926146984100342, "rewards/rejected": 0.4431358277797699, "step": 5649 }, { "epoch": 0.92, "learning_rate": 8.254030743502568e-07, "logits/chosen": -0.19599051773548126, "logits/rejected": -0.1972518265247345, "logps/chosen": -26.719724655151367, "logps/rejected": -20.65617561340332, "loss": 0.7309, "rewards/accuracies": 0.0, "rewards/chosen": -0.05291767045855522, "rewards/margins": -0.39586982131004333, "rewards/rejected": 0.342952162027359, "step": 5650 }, { "epoch": 0.92, "learning_rate": 8.253032791755004e-07, "logits/chosen": -0.5183353424072266, "logits/rejected": -0.5183353424072266, "logps/chosen": -132.1429443359375, "logps/rejected": -132.1429443359375, "loss": 0.3661, "rewards/accuracies": 0.0, "rewards/chosen": 1.169061303138733, "rewards/margins": 0.0, "rewards/rejected": 1.169061303138733, "step": 5651 }, { "epoch": 0.92, "learning_rate": 8.252034615253655e-07, "logits/chosen": -0.8009398579597473, "logits/rejected": -0.7326028347015381, "logps/chosen": -75.5156021118164, "logps/rejected": -98.70315551757812, "loss": 0.348, "rewards/accuracies": 1.0, "rewards/chosen": 1.0763756036758423, "rewards/margins": 0.17867356538772583, "rewards/rejected": 0.8977020382881165, "step": 5652 }, { "epoch": 0.92, "learning_rate": 8.251036214067483e-07, "logits/chosen": -0.7318256497383118, "logits/rejected": -0.6629441380500793, "logps/chosen": -144.2252655029297, "logps/rejected": -148.580810546875, "loss": 0.3539, "rewards/accuracies": 1.0, "rewards/chosen": 3.759831190109253, "rewards/margins": 0.12097001075744629, "rewards/rejected": 3.6388611793518066, "step": 5653 }, { "epoch": 0.92, "learning_rate": 8.250037588265473e-07, "logits/chosen": -0.6347163915634155, "logits/rejected": -0.5929210186004639, "logps/chosen": -88.86300659179688, "logps/rejected": -58.616153717041016, "loss": 0.4439, "rewards/accuracies": 1.0, "rewards/chosen": 1.2605491876602173, "rewards/margins": 0.221024751663208, "rewards/rejected": 1.0395244359970093, "step": 5654 }, { "epoch": 0.92, "learning_rate": 8.249038737916616e-07, "logits/chosen": -0.732330322265625, "logits/rejected": -0.7259543538093567, "logps/chosen": -54.509742736816406, "logps/rejected": -103.0770263671875, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 0.5161632895469666, "rewards/margins": 0.6437069177627563, "rewards/rejected": -0.127543643116951, "step": 5655 }, { "epoch": 0.92, "learning_rate": 8.248039663089927e-07, "logits/chosen": -0.62450110912323, "logits/rejected": -0.5893166065216064, "logps/chosen": -72.32543182373047, "logps/rejected": -122.69762420654297, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": 3.372666120529175, "rewards/margins": 2.798867702484131, "rewards/rejected": 0.5737983584403992, "step": 5656 }, { "epoch": 0.92, "learning_rate": 8.247040363854427e-07, "logits/chosen": -0.23360519111156464, "logits/rejected": -0.22752954065799713, "logps/chosen": -56.084781646728516, "logps/rejected": -43.06685256958008, "loss": 0.9468, "rewards/accuracies": 0.0, "rewards/chosen": 0.8689846396446228, "rewards/margins": -0.449227511882782, "rewards/rejected": 1.3182121515274048, "step": 5657 }, { "epoch": 0.92, "learning_rate": 8.246040840279165e-07, "logits/chosen": -0.4578784704208374, "logits/rejected": -0.43673086166381836, "logps/chosen": -52.534461975097656, "logps/rejected": -61.05552673339844, "loss": 0.9161, "rewards/accuracies": 0.0, "rewards/chosen": 2.3427276611328125, "rewards/margins": -0.05047464370727539, "rewards/rejected": 2.393202304840088, "step": 5658 }, { "epoch": 0.92, "learning_rate": 8.245041092433194e-07, "logits/chosen": -0.9871271848678589, "logits/rejected": -0.9252400994300842, "logps/chosen": -97.52149963378906, "logps/rejected": -53.46893310546875, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 3.812361240386963, "rewards/margins": 1.7497222423553467, "rewards/rejected": 2.062638998031616, "step": 5659 }, { "epoch": 0.92, "learning_rate": 8.244041120385587e-07, "logits/chosen": -0.8094667792320251, "logits/rejected": -0.7235853672027588, "logps/chosen": -98.26731872558594, "logps/rejected": -94.20269775390625, "loss": 1.4173, "rewards/accuracies": 0.0, "rewards/chosen": 1.1679260730743408, "rewards/margins": -0.49457621574401855, "rewards/rejected": 1.6625022888183594, "step": 5660 }, { "epoch": 0.92, "learning_rate": 8.243040924205435e-07, "logits/chosen": -0.6368687152862549, "logits/rejected": -0.5857462882995605, "logps/chosen": -59.611244201660156, "logps/rejected": -68.32119750976562, "loss": 0.8752, "rewards/accuracies": 0.0, "rewards/chosen": 1.2585678100585938, "rewards/margins": -0.16013336181640625, "rewards/rejected": 1.418701171875, "step": 5661 }, { "epoch": 0.92, "learning_rate": 8.242040503961843e-07, "logits/chosen": -1.1542996168136597, "logits/rejected": -1.1171538829803467, "logps/chosen": -171.821044921875, "logps/rejected": -80.42169952392578, "loss": 0.066, "rewards/accuracies": 1.0, "rewards/chosen": 4.413525581359863, "rewards/margins": 2.2517664432525635, "rewards/rejected": 2.1617591381073, "step": 5662 }, { "epoch": 0.92, "learning_rate": 8.241039859723926e-07, "logits/chosen": -1.088051438331604, "logits/rejected": -1.088051438331604, "logps/chosen": -55.08963394165039, "logps/rejected": -55.08963394165039, "loss": 0.4327, "rewards/accuracies": 0.0, "rewards/chosen": 3.372894048690796, "rewards/margins": 0.0, "rewards/rejected": 3.372894048690796, "step": 5663 }, { "epoch": 0.92, "learning_rate": 8.240038991560823e-07, "logits/chosen": -0.9710304737091064, "logits/rejected": -0.9217939972877502, "logps/chosen": -86.40925598144531, "logps/rejected": -69.35188293457031, "loss": 1.0149, "rewards/accuracies": 0.0, "rewards/chosen": 2.088465929031372, "rewards/margins": -0.8234801292419434, "rewards/rejected": 2.9119460582733154, "step": 5664 }, { "epoch": 0.92, "learning_rate": 8.239037899541682e-07, "logits/chosen": -0.45116180181503296, "logits/rejected": -0.36772051453590393, "logps/chosen": -78.74005889892578, "logps/rejected": -75.79133605957031, "loss": 0.1045, "rewards/accuracies": 1.0, "rewards/chosen": 4.690742015838623, "rewards/margins": 2.6163363456726074, "rewards/rejected": 2.0744056701660156, "step": 5665 }, { "epoch": 0.92, "learning_rate": 8.238036583735671e-07, "logits/chosen": -0.5525726079940796, "logits/rejected": -0.5525726079940796, "logps/chosen": -60.48228073120117, "logps/rejected": -60.48228073120117, "loss": 0.3571, "rewards/accuracies": 0.0, "rewards/chosen": 1.1835224628448486, "rewards/margins": 0.0, "rewards/rejected": 1.1835224628448486, "step": 5666 }, { "epoch": 0.92, "learning_rate": 8.23703504421197e-07, "logits/chosen": -0.6616649627685547, "logits/rejected": -0.6616649627685547, "logps/chosen": -121.20120239257812, "logps/rejected": -121.20120239257812, "loss": 0.3927, "rewards/accuracies": 0.0, "rewards/chosen": 1.7409225702285767, "rewards/margins": 0.0, "rewards/rejected": 1.7409225702285767, "step": 5667 }, { "epoch": 0.92, "learning_rate": 8.236033281039778e-07, "logits/chosen": -0.43933331966400146, "logits/rejected": -0.3915436267852783, "logps/chosen": -46.84837341308594, "logps/rejected": -54.955997467041016, "loss": 0.832, "rewards/accuracies": 0.0, "rewards/chosen": 1.0570907592773438, "rewards/margins": -0.4183460474014282, "rewards/rejected": 1.475436806678772, "step": 5668 }, { "epoch": 0.92, "learning_rate": 8.235031294288305e-07, "logits/chosen": -0.645045280456543, "logits/rejected": -0.5536047220230103, "logps/chosen": -100.98545837402344, "logps/rejected": -37.14387893676758, "loss": 0.4815, "rewards/accuracies": 0.0, "rewards/chosen": 1.1038788557052612, "rewards/margins": -0.021679282188415527, "rewards/rejected": 1.1255581378936768, "step": 5669 }, { "epoch": 0.92, "learning_rate": 8.23402908402678e-07, "logits/chosen": -0.9080259203910828, "logits/rejected": -0.47589173913002014, "logps/chosen": -7.301644325256348, "logps/rejected": -193.93724060058594, "loss": 1.4748, "rewards/accuracies": 0.0, "rewards/chosen": 0.17501698434352875, "rewards/margins": -2.847991943359375, "rewards/rejected": 3.0230088233947754, "step": 5670 }, { "epoch": 0.92, "learning_rate": 8.233026650324446e-07, "logits/chosen": -0.5620229840278625, "logits/rejected": -0.6049315929412842, "logps/chosen": -62.29738235473633, "logps/rejected": -84.92189025878906, "loss": 1.2899, "rewards/accuracies": 0.0, "rewards/chosen": 1.5557796955108643, "rewards/margins": -0.1764606237411499, "rewards/rejected": 1.7322403192520142, "step": 5671 }, { "epoch": 0.92, "learning_rate": 8.232023993250561e-07, "logits/chosen": -0.6670881509780884, "logits/rejected": -0.6670881509780884, "logps/chosen": -95.28065490722656, "logps/rejected": -95.28065490722656, "loss": 0.4202, "rewards/accuracies": 0.0, "rewards/chosen": 1.1341522932052612, "rewards/margins": 0.0, "rewards/rejected": 1.1341522932052612, "step": 5672 }, { "epoch": 0.92, "learning_rate": 8.231021112874401e-07, "logits/chosen": -0.5297436714172363, "logits/rejected": -0.5388864874839783, "logps/chosen": -67.26686096191406, "logps/rejected": -67.13091278076172, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": 0.8202926516532898, "rewards/margins": 0.8596404790878296, "rewards/rejected": -0.03934783861041069, "step": 5673 }, { "epoch": 0.92, "learning_rate": 8.230018009265254e-07, "logits/chosen": -0.592923641204834, "logits/rejected": -0.592923641204834, "logps/chosen": -63.713714599609375, "logps/rejected": -63.713714599609375, "loss": 0.6669, "rewards/accuracies": 0.0, "rewards/chosen": 1.5897636413574219, "rewards/margins": 0.0, "rewards/rejected": 1.5897636413574219, "step": 5674 }, { "epoch": 0.92, "learning_rate": 8.229014682492423e-07, "logits/chosen": -0.5197173953056335, "logits/rejected": -0.4993765354156494, "logps/chosen": -62.27629089355469, "logps/rejected": -65.46121215820312, "loss": 0.9016, "rewards/accuracies": 0.0, "rewards/chosen": 0.7618247866630554, "rewards/margins": -1.223707675933838, "rewards/rejected": 1.9855324029922485, "step": 5675 }, { "epoch": 0.92, "learning_rate": 8.228011132625234e-07, "logits/chosen": -0.5905526876449585, "logits/rejected": -0.5519647598266602, "logps/chosen": -58.469539642333984, "logps/rejected": -28.203256607055664, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": 2.0660998821258545, "rewards/margins": 0.8979023694992065, "rewards/rejected": 1.168197512626648, "step": 5676 }, { "epoch": 0.92, "learning_rate": 8.227007359733017e-07, "logits/chosen": -0.8092498183250427, "logits/rejected": -0.7786930799484253, "logps/chosen": -51.259647369384766, "logps/rejected": -47.932498931884766, "loss": 0.1257, "rewards/accuracies": 1.0, "rewards/chosen": 1.5534298419952393, "rewards/margins": 1.355671763420105, "rewards/rejected": 0.19775810837745667, "step": 5677 }, { "epoch": 0.92, "learning_rate": 8.226003363885126e-07, "logits/chosen": -0.9137020111083984, "logits/rejected": -0.921694815158844, "logps/chosen": -109.2707290649414, "logps/rejected": -48.341697692871094, "loss": 0.9121, "rewards/accuracies": 0.0, "rewards/chosen": -0.19694900512695312, "rewards/margins": -1.571624755859375, "rewards/rejected": 1.3746757507324219, "step": 5678 }, { "epoch": 0.92, "learning_rate": 8.224999145150928e-07, "logits/chosen": -0.82940274477005, "logits/rejected": -0.830781102180481, "logps/chosen": -110.85110473632812, "logps/rejected": -121.90377044677734, "loss": 0.591, "rewards/accuracies": 0.0, "rewards/chosen": 0.8419235348701477, "rewards/margins": -0.33260875940322876, "rewards/rejected": 1.1745322942733765, "step": 5679 }, { "epoch": 0.92, "learning_rate": 8.223994703599805e-07, "logits/chosen": -0.28157171607017517, "logits/rejected": -0.28157171607017517, "logps/chosen": -0.9413666725158691, "logps/rejected": -0.9413666725158691, "loss": 0.6335, "rewards/accuracies": 0.0, "rewards/chosen": 0.4694153368473053, "rewards/margins": 0.0, "rewards/rejected": 0.4694153368473053, "step": 5680 }, { "epoch": 0.92, "learning_rate": 8.222990039301152e-07, "logits/chosen": -1.022597074508667, "logits/rejected": -0.9611743688583374, "logps/chosen": -166.53697204589844, "logps/rejected": -153.87094116210938, "loss": 1.5133, "rewards/accuracies": 0.0, "rewards/chosen": 5.6143693923950195, "rewards/margins": -1.7438626289367676, "rewards/rejected": 7.358232021331787, "step": 5681 }, { "epoch": 0.92, "learning_rate": 8.221985152324384e-07, "logits/chosen": -0.20740346610546112, "logits/rejected": -0.20740346610546112, "logps/chosen": -99.1114501953125, "logps/rejected": -99.1114501953125, "loss": 2.0163, "rewards/accuracies": 0.0, "rewards/chosen": 0.6593368649482727, "rewards/margins": 0.0, "rewards/rejected": 0.6593368649482727, "step": 5682 }, { "epoch": 0.92, "learning_rate": 8.22098004273893e-07, "logits/chosen": -0.36670389771461487, "logits/rejected": -0.36670389771461487, "logps/chosen": -64.45661926269531, "logps/rejected": -64.45661926269531, "loss": 0.8548, "rewards/accuracies": 0.0, "rewards/chosen": 0.9495865106582642, "rewards/margins": 0.0, "rewards/rejected": 0.9495865106582642, "step": 5683 }, { "epoch": 0.92, "learning_rate": 8.219974710614232e-07, "logits/chosen": -0.6247420310974121, "logits/rejected": -0.6402168869972229, "logps/chosen": -83.26803588867188, "logps/rejected": -117.40138244628906, "loss": 1.6654, "rewards/accuracies": 0.0, "rewards/chosen": 1.8073714971542358, "rewards/margins": -1.4078248739242554, "rewards/rejected": 3.215196371078491, "step": 5684 }, { "epoch": 0.92, "learning_rate": 8.218969156019748e-07, "logits/chosen": -0.9839379191398621, "logits/rejected": -0.7813647389411926, "logps/chosen": -95.2787094116211, "logps/rejected": -88.08268737792969, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": 5.149263858795166, "rewards/margins": 1.6223275661468506, "rewards/rejected": 3.5269362926483154, "step": 5685 }, { "epoch": 0.92, "learning_rate": 8.217963379024954e-07, "logits/chosen": -0.38893935084342957, "logits/rejected": -0.38810330629348755, "logps/chosen": -85.1949462890625, "logps/rejected": -79.84624481201172, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 0.9011871218681335, "rewards/margins": 0.11018139123916626, "rewards/rejected": 0.7910057306289673, "step": 5686 }, { "epoch": 0.92, "learning_rate": 8.216957379699338e-07, "logits/chosen": -0.4652130603790283, "logits/rejected": -0.4970534145832062, "logps/chosen": -103.30933380126953, "logps/rejected": -74.74714660644531, "loss": 0.9442, "rewards/accuracies": 1.0, "rewards/chosen": 1.5620307922363281, "rewards/margins": 0.0797950029373169, "rewards/rejected": 1.4822357892990112, "step": 5687 }, { "epoch": 0.92, "learning_rate": 8.215951158112409e-07, "logits/chosen": -0.6286455392837524, "logits/rejected": -0.573505163192749, "logps/chosen": -86.93991088867188, "logps/rejected": -66.96282958984375, "loss": 0.2697, "rewards/accuracies": 1.0, "rewards/chosen": 2.675640821456909, "rewards/margins": 0.356870174407959, "rewards/rejected": 2.31877064704895, "step": 5688 }, { "epoch": 0.92, "learning_rate": 8.214944714333683e-07, "logits/chosen": -0.46286770701408386, "logits/rejected": -0.45590928196907043, "logps/chosen": -59.72290802001953, "logps/rejected": -51.75564193725586, "loss": 0.7585, "rewards/accuracies": 0.0, "rewards/chosen": 2.2181527614593506, "rewards/margins": -0.4201853275299072, "rewards/rejected": 2.638338088989258, "step": 5689 }, { "epoch": 0.92, "learning_rate": 8.213938048432696e-07, "logits/chosen": -1.2709019184112549, "logits/rejected": -1.2363228797912598, "logps/chosen": -103.2406234741211, "logps/rejected": -126.5790023803711, "loss": 2.239, "rewards/accuracies": 0.0, "rewards/chosen": 2.9045448303222656, "rewards/margins": -3.3304476737976074, "rewards/rejected": 6.234992504119873, "step": 5690 }, { "epoch": 0.92, "learning_rate": 8.212931160479002e-07, "logits/chosen": -0.6450625061988831, "logits/rejected": -0.6556954979896545, "logps/chosen": -66.07024383544922, "logps/rejected": -71.88404846191406, "loss": 1.0914, "rewards/accuracies": 0.0, "rewards/chosen": 0.587476372718811, "rewards/margins": -0.8957252502441406, "rewards/rejected": 1.4832016229629517, "step": 5691 }, { "epoch": 0.92, "learning_rate": 8.211924050542165e-07, "logits/chosen": -0.774204671382904, "logits/rejected": -0.7603150606155396, "logps/chosen": -29.134414672851562, "logps/rejected": -5.430647850036621, "loss": 0.9681, "rewards/accuracies": 0.0, "rewards/chosen": 0.03477821499109268, "rewards/margins": -0.26975613832473755, "rewards/rejected": 0.30453434586524963, "step": 5692 }, { "epoch": 0.92, "learning_rate": 8.21091671869177e-07, "logits/chosen": -0.8999872803688049, "logits/rejected": -0.8767849206924438, "logps/chosen": -131.62062072753906, "logps/rejected": -85.09083557128906, "loss": 1.4014, "rewards/accuracies": 0.0, "rewards/chosen": 1.6846145391464233, "rewards/margins": -1.2849229574203491, "rewards/rejected": 2.9695374965667725, "step": 5693 }, { "epoch": 0.92, "learning_rate": 8.209909164997408e-07, "logits/chosen": -0.7353657484054565, "logits/rejected": -0.7438143491744995, "logps/chosen": -78.23973083496094, "logps/rejected": -63.54976272583008, "loss": 1.0437, "rewards/accuracies": 0.0, "rewards/chosen": 1.1706726551055908, "rewards/margins": -0.11686968803405762, "rewards/rejected": 1.2875423431396484, "step": 5694 }, { "epoch": 0.92, "learning_rate": 8.208901389528698e-07, "logits/chosen": -0.4794236421585083, "logits/rejected": -0.33069273829460144, "logps/chosen": -69.71646881103516, "logps/rejected": -83.53768920898438, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 5.495115756988525, "rewards/margins": 3.0860612392425537, "rewards/rejected": 2.4090545177459717, "step": 5695 }, { "epoch": 0.92, "learning_rate": 8.207893392355263e-07, "logits/chosen": -0.4000007212162018, "logits/rejected": -0.3511166572570801, "logps/chosen": -97.89518737792969, "logps/rejected": -39.70355987548828, "loss": 0.5418, "rewards/accuracies": 0.0, "rewards/chosen": 1.3104171752929688, "rewards/margins": -0.6298729181289673, "rewards/rejected": 1.940290093421936, "step": 5696 }, { "epoch": 0.92, "learning_rate": 8.206885173546749e-07, "logits/chosen": -0.439929336309433, "logits/rejected": -0.4397593140602112, "logps/chosen": -94.06632995605469, "logps/rejected": -78.98170471191406, "loss": 0.9252, "rewards/accuracies": 0.0, "rewards/chosen": 0.8476669192314148, "rewards/margins": -1.6623497009277344, "rewards/rejected": 2.510016679763794, "step": 5697 }, { "epoch": 0.92, "learning_rate": 8.205876733172813e-07, "logits/chosen": -0.48638486862182617, "logits/rejected": -0.4320085942745209, "logps/chosen": -69.03990173339844, "logps/rejected": -91.72188568115234, "loss": 0.4591, "rewards/accuracies": 0.0, "rewards/chosen": 2.3006227016448975, "rewards/margins": -0.12220597267150879, "rewards/rejected": 2.4228286743164062, "step": 5698 }, { "epoch": 0.93, "learning_rate": 8.20486807130313e-07, "logits/chosen": -0.6443497538566589, "logits/rejected": -0.6438168883323669, "logps/chosen": -102.37875366210938, "logps/rejected": -68.87307739257812, "loss": 0.5161, "rewards/accuracies": 0.0, "rewards/chosen": 1.7119117975234985, "rewards/margins": -0.09863054752349854, "rewards/rejected": 1.810542345046997, "step": 5699 }, { "epoch": 0.93, "learning_rate": 8.203859188007388e-07, "logits/chosen": -0.6693254709243774, "logits/rejected": -0.6687645316123962, "logps/chosen": -31.312475204467773, "logps/rejected": -4.821190357208252, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": 0.1575281172990799, "rewards/margins": -0.13200820982456207, "rewards/rejected": 0.28953632712364197, "step": 5700 }, { "epoch": 0.93, "learning_rate": 8.202850083355289e-07, "logits/chosen": -0.0971747487783432, "logits/rejected": -0.09119467437267303, "logps/chosen": -1.3527029752731323, "logps/rejected": -4.359983444213867, "loss": 0.5288, "rewards/accuracies": 1.0, "rewards/chosen": 0.29364320635795593, "rewards/margins": 0.0177004337310791, "rewards/rejected": 0.27594277262687683, "step": 5701 }, { "epoch": 0.93, "learning_rate": 8.201840757416556e-07, "logits/chosen": -0.5581658482551575, "logits/rejected": -0.572496771812439, "logps/chosen": -0.4814092218875885, "logps/rejected": -20.416749954223633, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 0.21356259286403656, "rewards/margins": 0.554443359375, "rewards/rejected": -0.34088078141212463, "step": 5702 }, { "epoch": 0.93, "learning_rate": 8.200831210260924e-07, "logits/chosen": -0.6356031894683838, "logits/rejected": -0.6756356358528137, "logps/chosen": -62.760860443115234, "logps/rejected": -64.00749206542969, "loss": 1.5883, "rewards/accuracies": 0.0, "rewards/chosen": 1.6470493078231812, "rewards/margins": -1.6434048414230347, "rewards/rejected": 3.290454149246216, "step": 5703 }, { "epoch": 0.93, "learning_rate": 8.199821441958141e-07, "logits/chosen": -0.7859706282615662, "logits/rejected": -0.9018368124961853, "logps/chosen": -132.74639892578125, "logps/rejected": -149.50027465820312, "loss": 1.4297, "rewards/accuracies": 0.0, "rewards/chosen": 2.7970855236053467, "rewards/margins": -2.6214053630828857, "rewards/rejected": 5.418490886688232, "step": 5704 }, { "epoch": 0.93, "learning_rate": 8.198811452577974e-07, "logits/chosen": -1.0529617071151733, "logits/rejected": -0.9982245564460754, "logps/chosen": -185.4903564453125, "logps/rejected": -112.21574401855469, "loss": 0.4969, "rewards/accuracies": 0.0, "rewards/chosen": 0.744384765625, "rewards/margins": -0.5023895502090454, "rewards/rejected": 1.2467743158340454, "step": 5705 }, { "epoch": 0.93, "learning_rate": 8.197801242190202e-07, "logits/chosen": -1.0250028371810913, "logits/rejected": -0.9715030193328857, "logps/chosen": -85.39924621582031, "logps/rejected": -78.81031799316406, "loss": 3.0322, "rewards/accuracies": 0.0, "rewards/chosen": 0.8378990292549133, "rewards/margins": -2.509969472885132, "rewards/rejected": 3.3478684425354004, "step": 5706 }, { "epoch": 0.93, "learning_rate": 8.196790810864623e-07, "logits/chosen": -0.6926997303962708, "logits/rejected": -0.6878243088722229, "logps/chosen": -50.67560577392578, "logps/rejected": -66.32769775390625, "loss": 0.1821, "rewards/accuracies": 1.0, "rewards/chosen": 2.2335457801818848, "rewards/margins": 1.40024733543396, "rewards/rejected": 0.8332985043525696, "step": 5707 }, { "epoch": 0.93, "learning_rate": 8.195780158671046e-07, "logits/chosen": -0.771112859249115, "logits/rejected": -0.7107961773872375, "logps/chosen": -249.2935791015625, "logps/rejected": -75.09764099121094, "loss": 0.7089, "rewards/accuracies": 1.0, "rewards/chosen": 5.583099365234375, "rewards/margins": 2.124457597732544, "rewards/rejected": 3.458641767501831, "step": 5708 }, { "epoch": 0.93, "learning_rate": 8.1947692856793e-07, "logits/chosen": -1.0086015462875366, "logits/rejected": -0.9657762050628662, "logps/chosen": -182.27291870117188, "logps/rejected": -130.41677856445312, "loss": 1.05, "rewards/accuracies": 0.0, "rewards/chosen": 4.236151218414307, "rewards/margins": -1.9242310523986816, "rewards/rejected": 6.160382270812988, "step": 5709 }, { "epoch": 0.93, "learning_rate": 8.193758191959225e-07, "logits/chosen": -0.5461903810501099, "logits/rejected": -0.54941725730896, "logps/chosen": -53.70220947265625, "logps/rejected": -110.78367614746094, "loss": 0.5796, "rewards/accuracies": 1.0, "rewards/chosen": 0.8446346521377563, "rewards/margins": 0.23298609256744385, "rewards/rejected": 0.6116485595703125, "step": 5710 }, { "epoch": 0.93, "learning_rate": 8.192746877580678e-07, "logits/chosen": -0.8138436079025269, "logits/rejected": -0.9051918387413025, "logps/chosen": -126.0853271484375, "logps/rejected": -70.53605651855469, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": 2.4570281505584717, "rewards/margins": 0.6984885931015015, "rewards/rejected": 1.7585395574569702, "step": 5711 }, { "epoch": 0.93, "learning_rate": 8.191735342613532e-07, "logits/chosen": -0.7395220398902893, "logits/rejected": -0.40502095222473145, "logps/chosen": -97.77352142333984, "logps/rejected": -33.5020866394043, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 3.4766762256622314, "rewards/margins": 3.0443780422210693, "rewards/rejected": 0.4322982728481293, "step": 5712 }, { "epoch": 0.93, "learning_rate": 8.190723587127677e-07, "logits/chosen": -0.5563575625419617, "logits/rejected": -0.6976523995399475, "logps/chosen": -63.25724792480469, "logps/rejected": -96.44046020507812, "loss": 1.9083, "rewards/accuracies": 0.0, "rewards/chosen": 0.8928985595703125, "rewards/margins": -3.787506103515625, "rewards/rejected": 4.6804046630859375, "step": 5713 }, { "epoch": 0.93, "learning_rate": 8.189711611193011e-07, "logits/chosen": -0.14914485812187195, "logits/rejected": -0.14914485812187195, "logps/chosen": -49.238521575927734, "logps/rejected": -49.238521575927734, "loss": 0.379, "rewards/accuracies": 0.0, "rewards/chosen": 1.5562671422958374, "rewards/margins": 0.0, "rewards/rejected": 1.5562671422958374, "step": 5714 }, { "epoch": 0.93, "learning_rate": 8.188699414879453e-07, "logits/chosen": -0.7837666273117065, "logits/rejected": -0.6964799165725708, "logps/chosen": -62.91539001464844, "logps/rejected": -18.247379302978516, "loss": 0.2464, "rewards/accuracies": 1.0, "rewards/chosen": 2.035687208175659, "rewards/margins": 1.5418505668640137, "rewards/rejected": 0.4938366115093231, "step": 5715 }, { "epoch": 0.93, "learning_rate": 8.187686998256937e-07, "logits/chosen": -0.49653756618499756, "logits/rejected": -0.5066692233085632, "logps/chosen": -172.463623046875, "logps/rejected": -104.12716674804688, "loss": 1.4212, "rewards/accuracies": 0.0, "rewards/chosen": 2.7752349376678467, "rewards/margins": -2.3143084049224854, "rewards/rejected": 5.089543342590332, "step": 5716 }, { "epoch": 0.93, "learning_rate": 8.186674361395414e-07, "logits/chosen": -0.7151890993118286, "logits/rejected": -0.2793785035610199, "logps/chosen": -49.752105712890625, "logps/rejected": -89.82054138183594, "loss": 0.8146, "rewards/accuracies": 0.0, "rewards/chosen": 1.5237587690353394, "rewards/margins": -0.9562400579452515, "rewards/rejected": 2.479998826980591, "step": 5717 }, { "epoch": 0.93, "learning_rate": 8.185661504364844e-07, "logits/chosen": -0.6710401773452759, "logits/rejected": -0.684354305267334, "logps/chosen": -131.50050354003906, "logps/rejected": -158.5758056640625, "loss": 1.6721, "rewards/accuracies": 0.0, "rewards/chosen": 0.16800843179225922, "rewards/margins": -2.776756525039673, "rewards/rejected": 2.9447648525238037, "step": 5718 }, { "epoch": 0.93, "learning_rate": 8.184648427235206e-07, "logits/chosen": -0.8532420992851257, "logits/rejected": -1.0133315324783325, "logps/chosen": -90.09307861328125, "logps/rejected": -202.506591796875, "loss": 2.6491, "rewards/accuracies": 0.0, "rewards/chosen": 0.566388726234436, "rewards/margins": -3.322068214416504, "rewards/rejected": 3.8884568214416504, "step": 5719 }, { "epoch": 0.93, "learning_rate": 8.183635130076496e-07, "logits/chosen": -0.4675174653530121, "logits/rejected": -0.45518749952316284, "logps/chosen": -94.45445251464844, "logps/rejected": -72.25259399414062, "loss": 0.6299, "rewards/accuracies": 0.0, "rewards/chosen": 1.1611664295196533, "rewards/margins": -0.5505492687225342, "rewards/rejected": 1.7117156982421875, "step": 5720 }, { "epoch": 0.93, "learning_rate": 8.182621612958722e-07, "logits/chosen": -0.9394584894180298, "logits/rejected": -0.8575941920280457, "logps/chosen": -48.89735794067383, "logps/rejected": -49.38331985473633, "loss": 0.5914, "rewards/accuracies": 0.0, "rewards/chosen": 1.7875995635986328, "rewards/margins": -0.08692324161529541, "rewards/rejected": 1.8745228052139282, "step": 5721 }, { "epoch": 0.93, "learning_rate": 8.18160787595191e-07, "logits/chosen": -0.34224697947502136, "logits/rejected": -0.3349398970603943, "logps/chosen": -8.424711227416992, "logps/rejected": -5.666782379150391, "loss": 1.0702, "rewards/accuracies": 0.0, "rewards/chosen": 0.5312265753746033, "rewards/margins": -0.25281625986099243, "rewards/rejected": 0.7840428352355957, "step": 5722 }, { "epoch": 0.93, "learning_rate": 8.180593919126097e-07, "logits/chosen": -0.622268795967102, "logits/rejected": -0.46158546209335327, "logps/chosen": -187.6896209716797, "logps/rejected": -177.14913940429688, "loss": 1.1829, "rewards/accuracies": 0.0, "rewards/chosen": 4.18049955368042, "rewards/margins": -1.4645934104919434, "rewards/rejected": 5.645092964172363, "step": 5723 }, { "epoch": 0.93, "learning_rate": 8.17957974255134e-07, "logits/chosen": -0.9046698808670044, "logits/rejected": -0.7470100522041321, "logps/chosen": -124.23353576660156, "logps/rejected": -182.30162048339844, "loss": 0.5828, "rewards/accuracies": 0.0, "rewards/chosen": 6.244285583496094, "rewards/margins": -0.7533721923828125, "rewards/rejected": 6.997657775878906, "step": 5724 }, { "epoch": 0.93, "learning_rate": 8.178565346297708e-07, "logits/chosen": -0.5665030479431152, "logits/rejected": -0.4512646496295929, "logps/chosen": -55.01741409301758, "logps/rejected": -61.59796142578125, "loss": 0.631, "rewards/accuracies": 0.0, "rewards/chosen": 1.525107979774475, "rewards/margins": -0.6835986375808716, "rewards/rejected": 2.2087066173553467, "step": 5725 }, { "epoch": 0.93, "learning_rate": 8.177550730435287e-07, "logits/chosen": -0.5674841403961182, "logits/rejected": -0.5702878832817078, "logps/chosen": -75.82054138183594, "logps/rejected": -96.14219665527344, "loss": 0.7162, "rewards/accuracies": 0.0, "rewards/chosen": 1.5411109924316406, "rewards/margins": -0.850884199142456, "rewards/rejected": 2.3919951915740967, "step": 5726 }, { "epoch": 0.93, "learning_rate": 8.176535895034175e-07, "logits/chosen": -0.6408861875534058, "logits/rejected": -0.5933887958526611, "logps/chosen": -75.50103759765625, "logps/rejected": -38.709556579589844, "loss": 0.6227, "rewards/accuracies": 1.0, "rewards/chosen": 0.8379272818565369, "rewards/margins": 0.7320290207862854, "rewards/rejected": 0.10589828342199326, "step": 5727 }, { "epoch": 0.93, "learning_rate": 8.175520840164491e-07, "logits/chosen": -0.7172194719314575, "logits/rejected": -0.7047123908996582, "logps/chosen": -68.38610076904297, "logps/rejected": -68.43942260742188, "loss": 2.5297, "rewards/accuracies": 0.0, "rewards/chosen": 0.34832078218460083, "rewards/margins": -1.325188398361206, "rewards/rejected": 1.6735092401504517, "step": 5728 }, { "epoch": 0.93, "learning_rate": 8.174505565896364e-07, "logits/chosen": -0.7794076204299927, "logits/rejected": -0.7768718004226685, "logps/chosen": -68.65818786621094, "logps/rejected": -60.63704299926758, "loss": 0.3502, "rewards/accuracies": 1.0, "rewards/chosen": 1.0625107288360596, "rewards/margins": 0.420962929725647, "rewards/rejected": 0.6415477991104126, "step": 5729 }, { "epoch": 0.93, "learning_rate": 8.17349007229994e-07, "logits/chosen": -0.5436457991600037, "logits/rejected": -0.5274567008018494, "logps/chosen": -132.0830078125, "logps/rejected": -89.61553955078125, "loss": 0.4102, "rewards/accuracies": 0.0, "rewards/chosen": 4.45050048828125, "rewards/margins": -0.17408466339111328, "rewards/rejected": 4.624585151672363, "step": 5730 }, { "epoch": 0.93, "learning_rate": 8.17247435944538e-07, "logits/chosen": -0.05534439906477928, "logits/rejected": -0.05534439906477928, "logps/chosen": -4.3170485496521, "logps/rejected": -4.3170485496521, "loss": 0.4414, "rewards/accuracies": 0.0, "rewards/chosen": 0.5793293714523315, "rewards/margins": 0.0, "rewards/rejected": 0.5793293714523315, "step": 5731 }, { "epoch": 0.93, "learning_rate": 8.171458427402859e-07, "logits/chosen": -0.84414142370224, "logits/rejected": -0.7127185463905334, "logps/chosen": -87.25330352783203, "logps/rejected": -20.364707946777344, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 2.5219247341156006, "rewards/margins": 2.0484719276428223, "rewards/rejected": 0.47345277667045593, "step": 5732 }, { "epoch": 0.93, "learning_rate": 8.170442276242568e-07, "logits/chosen": -0.8210535049438477, "logits/rejected": -0.7670495510101318, "logps/chosen": -112.90902709960938, "logps/rejected": -126.2723388671875, "loss": 0.5857, "rewards/accuracies": 0.0, "rewards/chosen": 4.213874816894531, "rewards/margins": -0.5066757202148438, "rewards/rejected": 4.720550537109375, "step": 5733 }, { "epoch": 0.93, "learning_rate": 8.169425906034717e-07, "logits/chosen": -0.5005323886871338, "logits/rejected": -0.5038373470306396, "logps/chosen": -84.30696105957031, "logps/rejected": -75.02230834960938, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 2.1338372230529785, "rewards/margins": -1.661588191986084, "rewards/rejected": 3.7954254150390625, "step": 5734 }, { "epoch": 0.93, "learning_rate": 8.168409316849525e-07, "logits/chosen": -0.31273818016052246, "logits/rejected": -0.31273818016052246, "logps/chosen": -1.146218180656433, "logps/rejected": -1.146218180656433, "loss": 0.4473, "rewards/accuracies": 0.0, "rewards/chosen": 0.16222938895225525, "rewards/margins": 0.0, "rewards/rejected": 0.16222938895225525, "step": 5735 }, { "epoch": 0.93, "learning_rate": 8.167392508757229e-07, "logits/chosen": -0.8778544068336487, "logits/rejected": -0.9739678502082825, "logps/chosen": -156.85659790039062, "logps/rejected": -143.0623779296875, "loss": 1.6432, "rewards/accuracies": 0.0, "rewards/chosen": 2.925811767578125, "rewards/margins": -3.24696683883667, "rewards/rejected": 6.172778606414795, "step": 5736 }, { "epoch": 0.93, "learning_rate": 8.16637548182808e-07, "logits/chosen": -0.2614215016365051, "logits/rejected": -0.30053451657295227, "logps/chosen": -59.51127624511719, "logps/rejected": -60.75736999511719, "loss": 1.1609, "rewards/accuracies": 0.0, "rewards/chosen": 0.803234875202179, "rewards/margins": -1.592721700668335, "rewards/rejected": 2.395956516265869, "step": 5737 }, { "epoch": 0.93, "learning_rate": 8.165358236132346e-07, "logits/chosen": -0.5055107474327087, "logits/rejected": -0.505964994430542, "logps/chosen": -2.040081262588501, "logps/rejected": -21.08924102783203, "loss": 1.5616, "rewards/accuracies": 1.0, "rewards/chosen": 0.2733358144760132, "rewards/margins": 0.21160772442817688, "rewards/rejected": 0.0617280974984169, "step": 5738 }, { "epoch": 0.93, "learning_rate": 8.164340771740309e-07, "logits/chosen": -0.6781179308891296, "logits/rejected": -0.8456158638000488, "logps/chosen": -64.80537414550781, "logps/rejected": -1995.70849609375, "loss": 0.029, "rewards/accuracies": 1.0, "rewards/chosen": 3.3414580821990967, "rewards/margins": 6.822561740875244, "rewards/rejected": -3.4811036586761475, "step": 5739 }, { "epoch": 0.93, "learning_rate": 8.163323088722266e-07, "logits/chosen": -0.958465039730072, "logits/rejected": -0.9188461303710938, "logps/chosen": -34.49614334106445, "logps/rejected": -13.494667053222656, "loss": 0.7798, "rewards/accuracies": 0.0, "rewards/chosen": 0.4890727996826172, "rewards/margins": -0.35073739290237427, "rewards/rejected": 0.8398101925849915, "step": 5740 }, { "epoch": 0.93, "learning_rate": 8.162305187148528e-07, "logits/chosen": -0.3459014594554901, "logits/rejected": -0.3158554434776306, "logps/chosen": -49.572933197021484, "logps/rejected": -57.09389877319336, "loss": 0.2733, "rewards/accuracies": 1.0, "rewards/chosen": 2.3150112628936768, "rewards/margins": 0.4393981695175171, "rewards/rejected": 1.8756130933761597, "step": 5741 }, { "epoch": 0.93, "learning_rate": 8.161287067089425e-07, "logits/chosen": -0.3815402686595917, "logits/rejected": -0.39766091108322144, "logps/chosen": -75.01619720458984, "logps/rejected": -89.86123657226562, "loss": 0.4756, "rewards/accuracies": 1.0, "rewards/chosen": 0.5251823663711548, "rewards/margins": 0.2848396301269531, "rewards/rejected": 0.24034272134304047, "step": 5742 }, { "epoch": 0.93, "learning_rate": 8.160268728615298e-07, "logits/chosen": -1.000008225440979, "logits/rejected": -0.9685748219490051, "logps/chosen": -62.64231491088867, "logps/rejected": -55.32490539550781, "loss": 0.4177, "rewards/accuracies": 0.0, "rewards/chosen": 0.3655223846435547, "rewards/margins": -0.22335320711135864, "rewards/rejected": 0.5888755917549133, "step": 5743 }, { "epoch": 0.93, "learning_rate": 8.159250171796503e-07, "logits/chosen": -0.06006919965147972, "logits/rejected": -0.06296151876449585, "logps/chosen": -2.6869819164276123, "logps/rejected": -9.677946090698242, "loss": 1.1327, "rewards/accuracies": 1.0, "rewards/chosen": 0.2457127869129181, "rewards/margins": 0.3153729736804962, "rewards/rejected": -0.06966018676757812, "step": 5744 }, { "epoch": 0.93, "learning_rate": 8.158231396703416e-07, "logits/chosen": -0.08611796051263809, "logits/rejected": -0.08206668496131897, "logps/chosen": -2.9917376041412354, "logps/rejected": -21.241588592529297, "loss": 1.0786, "rewards/accuracies": 1.0, "rewards/chosen": 0.27052295207977295, "rewards/margins": 0.15718790888786316, "rewards/rejected": 0.1133350357413292, "step": 5745 }, { "epoch": 0.93, "learning_rate": 8.157212403406423e-07, "logits/chosen": -0.7448344826698303, "logits/rejected": -0.7395627498626709, "logps/chosen": -59.278564453125, "logps/rejected": -65.26339721679688, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 2.1415703296661377, "rewards/margins": 1.4600250720977783, "rewards/rejected": 0.6815452575683594, "step": 5746 }, { "epoch": 0.93, "learning_rate": 8.156193191975926e-07, "logits/chosen": -0.6193442940711975, "logits/rejected": -0.34979385137557983, "logps/chosen": -124.14512634277344, "logps/rejected": -31.195613861083984, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 4.758533000946045, "rewards/margins": 4.460779666900635, "rewards/rejected": 0.29775354266166687, "step": 5747 }, { "epoch": 0.93, "learning_rate": 8.155173762482343e-07, "logits/chosen": -0.9342797994613647, "logits/rejected": -1.0228215456008911, "logps/chosen": -225.26174926757812, "logps/rejected": -89.15885925292969, "loss": 0.7602, "rewards/accuracies": 1.0, "rewards/chosen": 3.768026828765869, "rewards/margins": 2.6020736694335938, "rewards/rejected": 1.1659530401229858, "step": 5748 }, { "epoch": 0.93, "learning_rate": 8.154154114996109e-07, "logits/chosen": -0.8417221307754517, "logits/rejected": -0.8094788193702698, "logps/chosen": -104.91574096679688, "logps/rejected": -227.62127685546875, "loss": 4.7315, "rewards/accuracies": 0.0, "rewards/chosen": 0.8797012567520142, "rewards/margins": -4.8191819190979, "rewards/rejected": 5.698883056640625, "step": 5749 }, { "epoch": 0.93, "learning_rate": 8.15313424958767e-07, "logits/chosen": -0.2555878162384033, "logits/rejected": -0.24571511149406433, "logps/chosen": -1.0890858173370361, "logps/rejected": -16.219043731689453, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 0.3954612612724304, "rewards/margins": 0.045453041791915894, "rewards/rejected": 0.3500082194805145, "step": 5750 }, { "epoch": 0.93, "learning_rate": 8.152114166327489e-07, "logits/chosen": -0.4517727494239807, "logits/rejected": -0.44688037037849426, "logps/chosen": -6.949218273162842, "logps/rejected": -6.084383010864258, "loss": 1.6042, "rewards/accuracies": 0.0, "rewards/chosen": -0.1291717141866684, "rewards/margins": -0.21271654963493347, "rewards/rejected": 0.08354482799768448, "step": 5751 }, { "epoch": 0.93, "learning_rate": 8.151093865286045e-07, "logits/chosen": -0.8228042721748352, "logits/rejected": -0.7295567989349365, "logps/chosen": -103.65988159179688, "logps/rejected": -64.3511734008789, "loss": 0.4109, "rewards/accuracies": 0.0, "rewards/chosen": 1.383876085281372, "rewards/margins": -0.20753860473632812, "rewards/rejected": 1.5914146900177002, "step": 5752 }, { "epoch": 0.93, "learning_rate": 8.150073346533833e-07, "logits/chosen": -0.8018859624862671, "logits/rejected": -0.8873655200004578, "logps/chosen": -197.69384765625, "logps/rejected": -69.40579223632812, "loss": 1.2972, "rewards/accuracies": 1.0, "rewards/chosen": 3.8833465576171875, "rewards/margins": 1.5567817687988281, "rewards/rejected": 2.3265647888183594, "step": 5753 }, { "epoch": 0.93, "learning_rate": 8.149052610141355e-07, "logits/chosen": -0.36544835567474365, "logits/rejected": -0.2939046621322632, "logps/chosen": -85.32102966308594, "logps/rejected": -88.24842834472656, "loss": 1.0305, "rewards/accuracies": 0.0, "rewards/chosen": 3.3379647731781006, "rewards/margins": -0.07790827751159668, "rewards/rejected": 3.4158730506896973, "step": 5754 }, { "epoch": 0.93, "learning_rate": 8.148031656179141e-07, "logits/chosen": -0.2880970537662506, "logits/rejected": -0.2362671047449112, "logps/chosen": -106.47785949707031, "logps/rejected": -70.75163269042969, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": 1.568947672843933, "rewards/margins": 0.15820395946502686, "rewards/rejected": 1.4107437133789062, "step": 5755 }, { "epoch": 0.93, "learning_rate": 8.147010484717726e-07, "logits/chosen": -0.48955392837524414, "logits/rejected": -0.3460554778575897, "logps/chosen": -111.14846801757812, "logps/rejected": -29.759967803955078, "loss": 0.203, "rewards/accuracies": 1.0, "rewards/chosen": 0.9034210443496704, "rewards/margins": 0.7558311820030212, "rewards/rejected": 0.14758987724781036, "step": 5756 }, { "epoch": 0.93, "learning_rate": 8.145989095827664e-07, "logits/chosen": -0.8460899591445923, "logits/rejected": -0.8194526433944702, "logps/chosen": -78.59053039550781, "logps/rejected": -126.72344970703125, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 1.819416880607605, "rewards/margins": 0.01954042911529541, "rewards/rejected": 1.7998764514923096, "step": 5757 }, { "epoch": 0.93, "learning_rate": 8.144967489579522e-07, "logits/chosen": -0.5385159254074097, "logits/rejected": -0.37549155950546265, "logps/chosen": -72.31043243408203, "logps/rejected": -20.327390670776367, "loss": 0.8517, "rewards/accuracies": 1.0, "rewards/chosen": 1.9200248718261719, "rewards/margins": 1.6569433212280273, "rewards/rejected": 0.26308155059814453, "step": 5758 }, { "epoch": 0.93, "learning_rate": 8.143945666043886e-07, "logits/chosen": -0.3908655345439911, "logits/rejected": -0.38395392894744873, "logps/chosen": -127.0309066772461, "logps/rejected": -215.5250244140625, "loss": 0.3036, "rewards/accuracies": 1.0, "rewards/chosen": 0.7160911560058594, "rewards/margins": 0.7382911443710327, "rewards/rejected": -0.02220001257956028, "step": 5759 }, { "epoch": 0.93, "learning_rate": 8.142923625291351e-07, "logits/chosen": -0.9862111210823059, "logits/rejected": -0.964879035949707, "logps/chosen": -129.44189453125, "logps/rejected": -101.39958190917969, "loss": 1.6271, "rewards/accuracies": 0.0, "rewards/chosen": 1.5643310546875, "rewards/margins": -1.4387619495391846, "rewards/rejected": 3.0030930042266846, "step": 5760 }, { "epoch": 0.94, "learning_rate": 8.141901367392534e-07, "logits/chosen": -0.5163882970809937, "logits/rejected": -0.5163882970809937, "logps/chosen": -19.141357421875, "logps/rejected": -19.141357421875, "loss": 0.3659, "rewards/accuracies": 0.0, "rewards/chosen": 1.5883667469024658, "rewards/margins": 0.0, "rewards/rejected": 1.5883667469024658, "step": 5761 }, { "epoch": 0.94, "learning_rate": 8.14087889241806e-07, "logits/chosen": -0.5276728868484497, "logits/rejected": -0.4717063903808594, "logps/chosen": -64.55095672607422, "logps/rejected": -59.673065185546875, "loss": 0.5204, "rewards/accuracies": 0.0, "rewards/chosen": 2.032468557357788, "rewards/margins": -0.12277889251708984, "rewards/rejected": 2.155247449874878, "step": 5762 }, { "epoch": 0.94, "learning_rate": 8.139856200438575e-07, "logits/chosen": -0.8237175941467285, "logits/rejected": -0.6848971247673035, "logps/chosen": -110.45521545410156, "logps/rejected": -71.66323852539062, "loss": 0.2798, "rewards/accuracies": 1.0, "rewards/chosen": 2.42291259765625, "rewards/margins": 0.300933837890625, "rewards/rejected": 2.121978759765625, "step": 5763 }, { "epoch": 0.94, "learning_rate": 8.138833291524734e-07, "logits/chosen": -0.7019805908203125, "logits/rejected": -0.6746141910552979, "logps/chosen": -30.741594314575195, "logps/rejected": -34.091346740722656, "loss": 0.3879, "rewards/accuracies": 0.0, "rewards/chosen": 1.7750898599624634, "rewards/margins": -0.047040700912475586, "rewards/rejected": 1.822130560874939, "step": 5764 }, { "epoch": 0.94, "learning_rate": 8.137810165747214e-07, "logits/chosen": -0.7747920751571655, "logits/rejected": -0.7280049324035645, "logps/chosen": -104.6114730834961, "logps/rejected": -133.48304748535156, "loss": 0.9409, "rewards/accuracies": 0.0, "rewards/chosen": 1.022469401359558, "rewards/margins": -0.6778602600097656, "rewards/rejected": 1.7003296613693237, "step": 5765 }, { "epoch": 0.94, "learning_rate": 8.136786823176702e-07, "logits/chosen": -0.6942040920257568, "logits/rejected": -0.6948160529136658, "logps/chosen": -36.85310363769531, "logps/rejected": -52.63676452636719, "loss": 0.4823, "rewards/accuracies": 0.0, "rewards/chosen": 1.2696961164474487, "rewards/margins": -0.4786292314529419, "rewards/rejected": 1.7483253479003906, "step": 5766 }, { "epoch": 0.94, "learning_rate": 8.135763263883901e-07, "logits/chosen": -0.8452334403991699, "logits/rejected": -0.734604001045227, "logps/chosen": -95.72911071777344, "logps/rejected": -88.25465393066406, "loss": 1.502, "rewards/accuracies": 0.0, "rewards/chosen": -0.4059188961982727, "rewards/margins": -2.202918291091919, "rewards/rejected": 1.7969993352890015, "step": 5767 }, { "epoch": 0.94, "learning_rate": 8.134739487939529e-07, "logits/chosen": -0.6318913698196411, "logits/rejected": -0.7180871367454529, "logps/chosen": -90.97486877441406, "logps/rejected": -109.05642700195312, "loss": 1.3535, "rewards/accuracies": 0.0, "rewards/chosen": 1.2670990228652954, "rewards/margins": -2.508509635925293, "rewards/rejected": 3.775608777999878, "step": 5768 }, { "epoch": 0.94, "learning_rate": 8.13371549541432e-07, "logits/chosen": -0.8307635188102722, "logits/rejected": -0.7513757348060608, "logps/chosen": -132.56124877929688, "logps/rejected": -136.355224609375, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 5.0611724853515625, "rewards/margins": 1.1798202991485596, "rewards/rejected": 3.881352186203003, "step": 5769 }, { "epoch": 0.94, "learning_rate": 8.132691286379021e-07, "logits/chosen": -0.7319403886795044, "logits/rejected": -0.8213537931442261, "logps/chosen": -188.81253051757812, "logps/rejected": -240.7105712890625, "loss": 2.1625, "rewards/accuracies": 0.0, "rewards/chosen": 3.043133497238159, "rewards/margins": -4.224429130554199, "rewards/rejected": 7.2675628662109375, "step": 5770 }, { "epoch": 0.94, "learning_rate": 8.131666860904396e-07, "logits/chosen": -0.7461665272712708, "logits/rejected": -0.7023419737815857, "logps/chosen": -173.58932495117188, "logps/rejected": -65.75346374511719, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": 3.639129638671875, "rewards/margins": 2.101635694503784, "rewards/rejected": 1.5374939441680908, "step": 5771 }, { "epoch": 0.94, "learning_rate": 8.130642219061223e-07, "logits/chosen": -0.7537139058113098, "logits/rejected": -0.5925682187080383, "logps/chosen": -152.67138671875, "logps/rejected": -32.908294677734375, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": 4.3157958984375, "rewards/margins": 3.777024745941162, "rewards/rejected": 0.5387710928916931, "step": 5772 }, { "epoch": 0.94, "learning_rate": 8.129617360920296e-07, "logits/chosen": -0.6174202561378479, "logits/rejected": -0.6050395965576172, "logps/chosen": -65.65299987792969, "logps/rejected": -80.27658081054688, "loss": 0.8341, "rewards/accuracies": 0.0, "rewards/chosen": 2.507389783859253, "rewards/margins": -1.3484864234924316, "rewards/rejected": 3.8558762073516846, "step": 5773 }, { "epoch": 0.94, "learning_rate": 8.12859228655242e-07, "logits/chosen": -0.48527270555496216, "logits/rejected": -0.4891095757484436, "logps/chosen": -74.38296508789062, "logps/rejected": -66.03340148925781, "loss": 0.4828, "rewards/accuracies": 0.0, "rewards/chosen": 1.7042953968048096, "rewards/margins": -0.06644737720489502, "rewards/rejected": 1.7707427740097046, "step": 5774 }, { "epoch": 0.94, "learning_rate": 8.127566996028422e-07, "logits/chosen": -0.5148798823356628, "logits/rejected": -0.5218095183372498, "logps/chosen": -71.52558898925781, "logps/rejected": -54.36964416503906, "loss": 0.2432, "rewards/accuracies": 1.0, "rewards/chosen": 2.522512197494507, "rewards/margins": 0.5702340602874756, "rewards/rejected": 1.9522781372070312, "step": 5775 }, { "epoch": 0.94, "learning_rate": 8.126541489419137e-07, "logits/chosen": -0.9606350064277649, "logits/rejected": -0.9932200908660889, "logps/chosen": -133.56105041503906, "logps/rejected": -116.14241790771484, "loss": 2.1191, "rewards/accuracies": 0.0, "rewards/chosen": 1.0109084844589233, "rewards/margins": -1.1763709783554077, "rewards/rejected": 2.187279462814331, "step": 5776 }, { "epoch": 0.94, "learning_rate": 8.125515766795419e-07, "logits/chosen": -0.8091973066329956, "logits/rejected": -0.7704558372497559, "logps/chosen": -100.01904296875, "logps/rejected": -134.26870727539062, "loss": 0.3452, "rewards/accuracies": 1.0, "rewards/chosen": 5.442132472991943, "rewards/margins": 0.04870748519897461, "rewards/rejected": 5.393424987792969, "step": 5777 }, { "epoch": 0.94, "learning_rate": 8.124489828228136e-07, "logits/chosen": -0.5479677319526672, "logits/rejected": -0.4188246428966522, "logps/chosen": -39.779136657714844, "logps/rejected": -9.631956100463867, "loss": 0.9723, "rewards/accuracies": 1.0, "rewards/chosen": 1.5983269214630127, "rewards/margins": 0.98887038230896, "rewards/rejected": 0.6094565391540527, "step": 5778 }, { "epoch": 0.94, "learning_rate": 8.12346367378817e-07, "logits/chosen": -0.9777162671089172, "logits/rejected": -0.7736322283744812, "logps/chosen": -81.25611114501953, "logps/rejected": -74.47335815429688, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 1.5450432300567627, "rewards/margins": 0.5156532526016235, "rewards/rejected": 1.0293899774551392, "step": 5779 }, { "epoch": 0.94, "learning_rate": 8.122437303546417e-07, "logits/chosen": -0.8195879459381104, "logits/rejected": -0.7881410121917725, "logps/chosen": -79.60061645507812, "logps/rejected": -138.88194274902344, "loss": 0.159, "rewards/accuracies": 1.0, "rewards/chosen": 1.0054444074630737, "rewards/margins": 1.2695221900939941, "rewards/rejected": -0.264077752828598, "step": 5780 }, { "epoch": 0.94, "learning_rate": 8.121410717573793e-07, "logits/chosen": -1.0174121856689453, "logits/rejected": -0.9120600819587708, "logps/chosen": -96.89630126953125, "logps/rejected": -55.47303771972656, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 3.684439182281494, "rewards/margins": 2.590176582336426, "rewards/rejected": 1.094262719154358, "step": 5781 }, { "epoch": 0.94, "learning_rate": 8.120383915941222e-07, "logits/chosen": -1.001953125, "logits/rejected": -1.0142934322357178, "logps/chosen": -76.22479248046875, "logps/rejected": -161.3434295654297, "loss": 1.2814, "rewards/accuracies": 0.0, "rewards/chosen": 4.9253106117248535, "rewards/margins": -2.3933463096618652, "rewards/rejected": 7.318656921386719, "step": 5782 }, { "epoch": 0.94, "learning_rate": 8.119356898719649e-07, "logits/chosen": -0.7834607362747192, "logits/rejected": -0.6934292912483215, "logps/chosen": -142.69320678710938, "logps/rejected": -135.7149200439453, "loss": 0.2589, "rewards/accuracies": 1.0, "rewards/chosen": 3.4391555786132812, "rewards/margins": 0.4359419345855713, "rewards/rejected": 3.00321364402771, "step": 5783 }, { "epoch": 0.94, "learning_rate": 8.11832966598003e-07, "logits/chosen": -0.8429065942764282, "logits/rejected": -0.8544535636901855, "logps/chosen": -280.94036865234375, "logps/rejected": -110.43565368652344, "loss": 0.4126, "rewards/accuracies": 0.0, "rewards/chosen": 4.222405910491943, "rewards/margins": -0.24630165100097656, "rewards/rejected": 4.46870756149292, "step": 5784 }, { "epoch": 0.94, "learning_rate": 8.117302217793336e-07, "logits/chosen": -0.532799243927002, "logits/rejected": -0.5137549638748169, "logps/chosen": -27.784759521484375, "logps/rejected": -64.39507293701172, "loss": 0.5229, "rewards/accuracies": 1.0, "rewards/chosen": 1.8874043226242065, "rewards/margins": 0.4409458637237549, "rewards/rejected": 1.4464584589004517, "step": 5785 }, { "epoch": 0.94, "learning_rate": 8.116274554230555e-07, "logits/chosen": -0.7029207348823547, "logits/rejected": -0.7605178356170654, "logps/chosen": -76.7853775024414, "logps/rejected": -69.15540313720703, "loss": 0.5054, "rewards/accuracies": 0.0, "rewards/chosen": 0.9109092950820923, "rewards/margins": -0.38012540340423584, "rewards/rejected": 1.2910346984863281, "step": 5786 }, { "epoch": 0.94, "learning_rate": 8.115246675362689e-07, "logits/chosen": -0.7826419472694397, "logits/rejected": -0.7665387988090515, "logps/chosen": -67.35130310058594, "logps/rejected": -31.821365356445312, "loss": 0.1086, "rewards/accuracies": 1.0, "rewards/chosen": 2.738203525543213, "rewards/margins": 2.3340299129486084, "rewards/rejected": 0.40417367219924927, "step": 5787 }, { "epoch": 0.94, "learning_rate": 8.114218581260754e-07, "logits/chosen": -0.46347418427467346, "logits/rejected": -0.3961382806301117, "logps/chosen": -53.85053634643555, "logps/rejected": -76.81610870361328, "loss": 0.441, "rewards/accuracies": 1.0, "rewards/chosen": 2.0449206829071045, "rewards/margins": 0.49508464336395264, "rewards/rejected": 1.5498360395431519, "step": 5788 }, { "epoch": 0.94, "learning_rate": 8.113190271995784e-07, "logits/chosen": -0.5441059470176697, "logits/rejected": -0.5176418423652649, "logps/chosen": -87.01531982421875, "logps/rejected": -44.925838470458984, "loss": 0.7831, "rewards/accuracies": 0.0, "rewards/chosen": 1.1492462158203125, "rewards/margins": -0.9529941082000732, "rewards/rejected": 2.1022403240203857, "step": 5789 }, { "epoch": 0.94, "learning_rate": 8.112161747638821e-07, "logits/chosen": -0.5328479409217834, "logits/rejected": -0.4668051600456238, "logps/chosen": -73.33088684082031, "logps/rejected": -48.37312316894531, "loss": 0.3523, "rewards/accuracies": 1.0, "rewards/chosen": 1.5569175481796265, "rewards/margins": 0.3901221752166748, "rewards/rejected": 1.1667953729629517, "step": 5790 }, { "epoch": 0.94, "learning_rate": 8.11113300826093e-07, "logits/chosen": -0.6565284132957458, "logits/rejected": -0.622635006904602, "logps/chosen": -67.21954345703125, "logps/rejected": -44.89196014404297, "loss": 0.51, "rewards/accuracies": 0.0, "rewards/chosen": 1.3344337940216064, "rewards/margins": -0.4174255132675171, "rewards/rejected": 1.7518593072891235, "step": 5791 }, { "epoch": 0.94, "learning_rate": 8.110104053933187e-07, "logits/chosen": -0.4943697154521942, "logits/rejected": -0.5939129590988159, "logps/chosen": -192.97012329101562, "logps/rejected": -111.52572631835938, "loss": 0.7457, "rewards/accuracies": 0.0, "rewards/chosen": 3.2654876708984375, "rewards/margins": -1.16572904586792, "rewards/rejected": 4.431216716766357, "step": 5792 }, { "epoch": 0.94, "learning_rate": 8.10907488472668e-07, "logits/chosen": -0.8486263751983643, "logits/rejected": -0.698445200920105, "logps/chosen": -58.641685485839844, "logps/rejected": -34.91317367553711, "loss": 0.1828, "rewards/accuracies": 1.0, "rewards/chosen": 2.1108977794647217, "rewards/margins": 1.8268474340438843, "rewards/rejected": 0.2840503752231598, "step": 5793 }, { "epoch": 0.94, "learning_rate": 8.108045500712517e-07, "logits/chosen": -0.5609394907951355, "logits/rejected": -0.6396851539611816, "logps/chosen": -52.68585205078125, "logps/rejected": -119.97400665283203, "loss": 4.2508, "rewards/accuracies": 0.0, "rewards/chosen": 1.4501060247421265, "rewards/margins": -4.729403495788574, "rewards/rejected": 6.17950963973999, "step": 5794 }, { "epoch": 0.94, "learning_rate": 8.10701590196182e-07, "logits/chosen": -0.8869776725769043, "logits/rejected": -1.1873384714126587, "logps/chosen": -114.99647521972656, "logps/rejected": -35.091129302978516, "loss": 0.421, "rewards/accuracies": 1.0, "rewards/chosen": 2.144461154937744, "rewards/margins": 1.8879921436309814, "rewards/rejected": 0.2564689815044403, "step": 5795 }, { "epoch": 0.94, "learning_rate": 8.105986088545722e-07, "logits/chosen": -0.5489394068717957, "logits/rejected": -0.5430898070335388, "logps/chosen": -61.09320831298828, "logps/rejected": -56.47319030761719, "loss": 0.6193, "rewards/accuracies": 1.0, "rewards/chosen": 2.3938820362091064, "rewards/margins": 0.5928001403808594, "rewards/rejected": 1.801081895828247, "step": 5796 }, { "epoch": 0.94, "learning_rate": 8.104956060535375e-07, "logits/chosen": -0.634065568447113, "logits/rejected": -0.5999133586883545, "logps/chosen": -89.18463134765625, "logps/rejected": -64.6400146484375, "loss": 0.5152, "rewards/accuracies": 0.0, "rewards/chosen": 1.7044013738632202, "rewards/margins": -0.13376390933990479, "rewards/rejected": 1.838165283203125, "step": 5797 }, { "epoch": 0.94, "learning_rate": 8.103925818001942e-07, "logits/chosen": -1.0192912817001343, "logits/rejected": -0.9238227605819702, "logps/chosen": -89.65373229980469, "logps/rejected": -88.30891418457031, "loss": 0.2948, "rewards/accuracies": 1.0, "rewards/chosen": 2.330005645751953, "rewards/margins": 0.23769593238830566, "rewards/rejected": 2.0923097133636475, "step": 5798 }, { "epoch": 0.94, "learning_rate": 8.102895361016606e-07, "logits/chosen": -0.3233003616333008, "logits/rejected": -0.32999375462532043, "logps/chosen": -13.722999572753906, "logps/rejected": -1.6667269468307495, "loss": 0.5981, "rewards/accuracies": 0.0, "rewards/chosen": -0.1846552938222885, "rewards/margins": -0.39187857508659363, "rewards/rejected": 0.20722328126430511, "step": 5799 }, { "epoch": 0.94, "learning_rate": 8.101864689650559e-07, "logits/chosen": -0.7588769793510437, "logits/rejected": -0.774422287940979, "logps/chosen": -67.56602478027344, "logps/rejected": -99.48189544677734, "loss": 2.0767, "rewards/accuracies": 0.0, "rewards/chosen": 1.6696373224258423, "rewards/margins": -2.2619261741638184, "rewards/rejected": 3.93156361579895, "step": 5800 }, { "epoch": 0.94, "learning_rate": 8.100833803975015e-07, "logits/chosen": -1.169796347618103, "logits/rejected": -1.10390305519104, "logps/chosen": -266.15509033203125, "logps/rejected": -128.0662841796875, "loss": 0.4002, "rewards/accuracies": 0.0, "rewards/chosen": 2.2958741188049316, "rewards/margins": -0.08119797706604004, "rewards/rejected": 2.3770720958709717, "step": 5801 }, { "epoch": 0.94, "learning_rate": 8.099802704061193e-07, "logits/chosen": -0.28254610300064087, "logits/rejected": -0.2943976819515228, "logps/chosen": -55.73902130126953, "logps/rejected": -105.50991821289062, "loss": 0.243, "rewards/accuracies": 1.0, "rewards/chosen": 1.5432747602462769, "rewards/margins": 0.4804832935333252, "rewards/rejected": 1.0627914667129517, "step": 5802 }, { "epoch": 0.94, "learning_rate": 8.098771389980336e-07, "logits/chosen": -0.33231163024902344, "logits/rejected": -0.33263686299324036, "logps/chosen": -99.05522155761719, "logps/rejected": -53.321205139160156, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 1.1130599975585938, "rewards/margins": -0.3220818042755127, "rewards/rejected": 1.4351418018341064, "step": 5803 }, { "epoch": 0.94, "learning_rate": 8.097739861803696e-07, "logits/chosen": -0.6663451194763184, "logits/rejected": -0.5951226353645325, "logps/chosen": -165.81381225585938, "logps/rejected": -57.44603729248047, "loss": 1.0658, "rewards/accuracies": 1.0, "rewards/chosen": 3.46451735496521, "rewards/margins": 1.6765869855880737, "rewards/rejected": 1.7879303693771362, "step": 5804 }, { "epoch": 0.94, "learning_rate": 8.096708119602542e-07, "logits/chosen": -0.523041307926178, "logits/rejected": -0.5422429442405701, "logps/chosen": -62.535362243652344, "logps/rejected": -36.191627502441406, "loss": 0.9053, "rewards/accuracies": 0.0, "rewards/chosen": 0.6905860900878906, "rewards/margins": -0.79156494140625, "rewards/rejected": 1.4821510314941406, "step": 5805 }, { "epoch": 0.94, "learning_rate": 8.09567616344816e-07, "logits/chosen": -0.629318356513977, "logits/rejected": -0.5296556353569031, "logps/chosen": -51.02610778808594, "logps/rejected": -84.22957611083984, "loss": 0.6219, "rewards/accuracies": 0.0, "rewards/chosen": 2.2735671997070312, "rewards/margins": -0.3823540210723877, "rewards/rejected": 2.655921220779419, "step": 5806 }, { "epoch": 0.94, "learning_rate": 8.094643993411845e-07, "logits/chosen": -0.6764846444129944, "logits/rejected": -0.6072578430175781, "logps/chosen": -175.29498291015625, "logps/rejected": -141.72772216796875, "loss": 0.46, "rewards/accuracies": 0.0, "rewards/chosen": 5.041326999664307, "rewards/margins": -0.3101530075073242, "rewards/rejected": 5.351480007171631, "step": 5807 }, { "epoch": 0.94, "learning_rate": 8.093611609564912e-07, "logits/chosen": -0.8147344589233398, "logits/rejected": -0.9142598509788513, "logps/chosen": -90.82713317871094, "logps/rejected": -96.66625213623047, "loss": 2.389, "rewards/accuracies": 0.0, "rewards/chosen": 1.404761552810669, "rewards/margins": -3.6198012828826904, "rewards/rejected": 5.024562835693359, "step": 5808 }, { "epoch": 0.94, "learning_rate": 8.092579011978691e-07, "logits/chosen": -0.6441691517829895, "logits/rejected": -0.6610568761825562, "logps/chosen": -88.98457336425781, "logps/rejected": -143.86221313476562, "loss": 0.7271, "rewards/accuracies": 1.0, "rewards/chosen": 1.375335693359375, "rewards/margins": 0.870343029499054, "rewards/rejected": 0.504992663860321, "step": 5809 }, { "epoch": 0.94, "learning_rate": 8.091546200724521e-07, "logits/chosen": -0.486573725938797, "logits/rejected": -0.486573725938797, "logps/chosen": -3.248572587966919, "logps/rejected": -3.248572587966919, "loss": 0.3711, "rewards/accuracies": 0.0, "rewards/chosen": 1.358268141746521, "rewards/margins": 0.0, "rewards/rejected": 1.358268141746521, "step": 5810 }, { "epoch": 0.94, "learning_rate": 8.090513175873761e-07, "logits/chosen": -0.63727205991745, "logits/rejected": -0.15524718165397644, "logps/chosen": -176.39974975585938, "logps/rejected": -131.1737823486328, "loss": 1.0852, "rewards/accuracies": 0.0, "rewards/chosen": 0.3820388913154602, "rewards/margins": -1.769871711730957, "rewards/rejected": 2.1519105434417725, "step": 5811 }, { "epoch": 0.94, "learning_rate": 8.089479937497782e-07, "logits/chosen": -0.41146543622016907, "logits/rejected": -0.3466659486293793, "logps/chosen": -78.63737487792969, "logps/rejected": -13.797880172729492, "loss": 0.8529, "rewards/accuracies": 1.0, "rewards/chosen": 0.4988868832588196, "rewards/margins": 0.7837927937507629, "rewards/rejected": -0.28490591049194336, "step": 5812 }, { "epoch": 0.94, "learning_rate": 8.088446485667975e-07, "logits/chosen": -0.7681319713592529, "logits/rejected": -0.6132826805114746, "logps/chosen": -140.271728515625, "logps/rejected": -56.27369689941406, "loss": 0.2764, "rewards/accuracies": 1.0, "rewards/chosen": 3.7902328968048096, "rewards/margins": 0.39894628524780273, "rewards/rejected": 3.391286611557007, "step": 5813 }, { "epoch": 0.94, "learning_rate": 8.087412820455737e-07, "logits/chosen": -0.8430830240249634, "logits/rejected": -0.7969918847084045, "logps/chosen": -51.874420166015625, "logps/rejected": -63.956417083740234, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 2.6029365062713623, "rewards/margins": 0.23128461837768555, "rewards/rejected": 2.3716518878936768, "step": 5814 }, { "epoch": 0.94, "learning_rate": 8.086378941932487e-07, "logits/chosen": -0.6593050360679626, "logits/rejected": -0.6833887100219727, "logps/chosen": -50.21337127685547, "logps/rejected": -51.08429718017578, "loss": 0.7979, "rewards/accuracies": 0.0, "rewards/chosen": 1.498023271560669, "rewards/margins": -0.8057608604431152, "rewards/rejected": 2.303784132003784, "step": 5815 }, { "epoch": 0.94, "learning_rate": 8.085344850169656e-07, "logits/chosen": -0.7670865058898926, "logits/rejected": -0.7271630167961121, "logps/chosen": -131.26864624023438, "logps/rejected": -128.40158081054688, "loss": 0.7739, "rewards/accuracies": 1.0, "rewards/chosen": 2.1027863025665283, "rewards/margins": 0.7396911382675171, "rewards/rejected": 1.3630951642990112, "step": 5816 }, { "epoch": 0.94, "learning_rate": 8.084310545238689e-07, "logits/chosen": -0.8157579898834229, "logits/rejected": -0.8102815747261047, "logps/chosen": -62.546600341796875, "logps/rejected": -63.74562072753906, "loss": 0.9445, "rewards/accuracies": 0.0, "rewards/chosen": 1.385279893875122, "rewards/margins": -1.5963058471679688, "rewards/rejected": 2.981585741043091, "step": 5817 }, { "epoch": 0.94, "learning_rate": 8.083276027211048e-07, "logits/chosen": -0.8404800891876221, "logits/rejected": -0.7910661697387695, "logps/chosen": -145.18075561523438, "logps/rejected": -91.51887512207031, "loss": 0.7369, "rewards/accuracies": 0.0, "rewards/chosen": 1.3231903314590454, "rewards/margins": -1.105265736579895, "rewards/rejected": 2.4284560680389404, "step": 5818 }, { "epoch": 0.94, "learning_rate": 8.082241296158207e-07, "logits/chosen": -0.612263560295105, "logits/rejected": -0.5896404385566711, "logps/chosen": -53.988670349121094, "logps/rejected": -83.32707214355469, "loss": 0.4046, "rewards/accuracies": 1.0, "rewards/chosen": 1.1117165088653564, "rewards/margins": 1.0053261518478394, "rewards/rejected": 0.10639037936925888, "step": 5819 }, { "epoch": 0.94, "learning_rate": 8.081206352151657e-07, "logits/chosen": -0.5634949207305908, "logits/rejected": -0.5017106533050537, "logps/chosen": -53.66368865966797, "logps/rejected": -41.19355010986328, "loss": 1.2385, "rewards/accuracies": 1.0, "rewards/chosen": 1.5864174365997314, "rewards/margins": 0.2199791669845581, "rewards/rejected": 1.3664382696151733, "step": 5820 }, { "epoch": 0.94, "learning_rate": 8.080171195262904e-07, "logits/chosen": -0.39904701709747314, "logits/rejected": -0.3968167006969452, "logps/chosen": -124.25394439697266, "logps/rejected": -123.99470520019531, "loss": 2.1665, "rewards/accuracies": 0.0, "rewards/chosen": 0.8746604919433594, "rewards/margins": -3.229729652404785, "rewards/rejected": 4.1043901443481445, "step": 5821 }, { "epoch": 0.94, "learning_rate": 8.079135825563465e-07, "logits/chosen": -0.5976772904396057, "logits/rejected": -0.3827733099460602, "logps/chosen": -96.52186584472656, "logps/rejected": -51.071380615234375, "loss": 2.1582, "rewards/accuracies": 1.0, "rewards/chosen": 1.5042670965194702, "rewards/margins": 0.26793205738067627, "rewards/rejected": 1.236335039138794, "step": 5822 }, { "epoch": 0.95, "learning_rate": 8.078100243124876e-07, "logits/chosen": -0.7498555779457092, "logits/rejected": -0.7084304094314575, "logps/chosen": -116.05459594726562, "logps/rejected": -79.02445983886719, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 1.3386077880859375, "rewards/margins": 0.7350295782089233, "rewards/rejected": 0.6035782098770142, "step": 5823 }, { "epoch": 0.95, "learning_rate": 8.077064448018684e-07, "logits/chosen": -0.6117838621139526, "logits/rejected": -0.5718966126441956, "logps/chosen": -70.38929748535156, "logps/rejected": -45.44914245605469, "loss": 0.6498, "rewards/accuracies": 0.0, "rewards/chosen": 2.2318642139434814, "rewards/margins": -0.968787431716919, "rewards/rejected": 3.2006516456604004, "step": 5824 }, { "epoch": 0.95, "learning_rate": 8.076028440316456e-07, "logits/chosen": -0.7465372085571289, "logits/rejected": -0.6161328554153442, "logps/chosen": -141.19854736328125, "logps/rejected": -42.33579635620117, "loss": 0.2207, "rewards/accuracies": 1.0, "rewards/chosen": 4.57088041305542, "rewards/margins": 2.1118459701538086, "rewards/rejected": 2.4590344429016113, "step": 5825 }, { "epoch": 0.95, "learning_rate": 8.074992220089768e-07, "logits/chosen": -0.1741807460784912, "logits/rejected": -0.1303236186504364, "logps/chosen": -22.142337799072266, "logps/rejected": -17.47001075744629, "loss": 0.7983, "rewards/accuracies": 1.0, "rewards/chosen": 0.6097492575645447, "rewards/margins": 0.34345516562461853, "rewards/rejected": 0.26629409193992615, "step": 5826 }, { "epoch": 0.95, "learning_rate": 8.073955787410214e-07, "logits/chosen": -0.7248321175575256, "logits/rejected": -0.6933164000511169, "logps/chosen": -50.8223991394043, "logps/rejected": -18.456539154052734, "loss": 0.1961, "rewards/accuracies": 1.0, "rewards/chosen": 1.8184254169464111, "rewards/margins": 0.760978102684021, "rewards/rejected": 1.0574473142623901, "step": 5827 }, { "epoch": 0.95, "learning_rate": 8.072919142349399e-07, "logits/chosen": -0.51861971616745, "logits/rejected": -0.5257840752601624, "logps/chosen": -3.9768381118774414, "logps/rejected": -10.219564437866211, "loss": 0.9359, "rewards/accuracies": 1.0, "rewards/chosen": 0.2940020263195038, "rewards/margins": 0.27933698892593384, "rewards/rejected": 0.014665031805634499, "step": 5828 }, { "epoch": 0.95, "learning_rate": 8.071882284978949e-07, "logits/chosen": -0.7864981293678284, "logits/rejected": -0.792317271232605, "logps/chosen": -72.46466827392578, "logps/rejected": -54.992218017578125, "loss": 0.7543, "rewards/accuracies": 0.0, "rewards/chosen": 1.0803947448730469, "rewards/margins": -0.6903847455978394, "rewards/rejected": 1.7707794904708862, "step": 5829 }, { "epoch": 0.95, "learning_rate": 8.0708452153705e-07, "logits/chosen": -0.8364785313606262, "logits/rejected": -0.8466553688049316, "logps/chosen": -57.2943115234375, "logps/rejected": -59.737796783447266, "loss": 0.9918, "rewards/accuracies": 0.0, "rewards/chosen": 1.4998680353164673, "rewards/margins": -0.7442890405654907, "rewards/rejected": 2.244157075881958, "step": 5830 }, { "epoch": 0.95, "learning_rate": 8.069807933595702e-07, "logits/chosen": -0.40982887148857117, "logits/rejected": -0.4009541869163513, "logps/chosen": -18.04949951171875, "logps/rejected": -19.00093650817871, "loss": 1.5824, "rewards/accuracies": 1.0, "rewards/chosen": 0.4535192549228668, "rewards/margins": 0.11383017897605896, "rewards/rejected": 0.33968907594680786, "step": 5831 }, { "epoch": 0.95, "learning_rate": 8.068770439726222e-07, "logits/chosen": -0.6130891442298889, "logits/rejected": -0.6073401570320129, "logps/chosen": -108.51026916503906, "logps/rejected": -59.872283935546875, "loss": 3.6801, "rewards/accuracies": 0.0, "rewards/chosen": 1.0817168951034546, "rewards/margins": -0.9300416707992554, "rewards/rejected": 2.01175856590271, "step": 5832 }, { "epoch": 0.95, "learning_rate": 8.067732733833743e-07, "logits/chosen": -0.6668176651000977, "logits/rejected": -0.6443478465080261, "logps/chosen": -80.45256805419922, "logps/rejected": -88.83332061767578, "loss": 0.4632, "rewards/accuracies": 0.0, "rewards/chosen": 0.9148117303848267, "rewards/margins": -0.36374735832214355, "rewards/rejected": 1.2785590887069702, "step": 5833 }, { "epoch": 0.95, "learning_rate": 8.06669481598996e-07, "logits/chosen": -0.5045301914215088, "logits/rejected": -0.46074995398521423, "logps/chosen": -25.286197662353516, "logps/rejected": -9.676847457885742, "loss": 0.9905, "rewards/accuracies": 1.0, "rewards/chosen": 0.24862270057201385, "rewards/margins": 0.059522807598114014, "rewards/rejected": 0.18909989297389984, "step": 5834 }, { "epoch": 0.95, "learning_rate": 8.065656686266582e-07, "logits/chosen": -0.8151919841766357, "logits/rejected": -0.8067596554756165, "logps/chosen": -83.81316375732422, "logps/rejected": -203.118896484375, "loss": 0.5281, "rewards/accuracies": 0.0, "rewards/chosen": 3.9926888942718506, "rewards/margins": -0.19817566871643066, "rewards/rejected": 4.190864562988281, "step": 5835 }, { "epoch": 0.95, "learning_rate": 8.064618344735333e-07, "logits/chosen": -0.8331895470619202, "logits/rejected": -0.6773082613945007, "logps/chosen": -91.44853210449219, "logps/rejected": -134.49546813964844, "loss": 0.1157, "rewards/accuracies": 1.0, "rewards/chosen": 6.507531642913818, "rewards/margins": 1.3572111129760742, "rewards/rejected": 5.150320529937744, "step": 5836 }, { "epoch": 0.95, "learning_rate": 8.063579791467956e-07, "logits/chosen": -0.831149697303772, "logits/rejected": -0.7602253556251526, "logps/chosen": -61.23783874511719, "logps/rejected": -65.05996704101562, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 2.185784101486206, "rewards/margins": 0.5445067882537842, "rewards/rejected": 1.6412773132324219, "step": 5837 }, { "epoch": 0.95, "learning_rate": 8.062541026536202e-07, "logits/chosen": -0.5349633693695068, "logits/rejected": -0.5833031535148621, "logps/chosen": -112.1158218383789, "logps/rejected": -114.00755310058594, "loss": 1.2636, "rewards/accuracies": 0.0, "rewards/chosen": 1.321051001548767, "rewards/margins": -1.2106965780258179, "rewards/rejected": 2.531747579574585, "step": 5838 }, { "epoch": 0.95, "learning_rate": 8.061502050011842e-07, "logits/chosen": -0.6159464716911316, "logits/rejected": -0.6218755841255188, "logps/chosen": -139.2865447998047, "logps/rejected": -37.727901458740234, "loss": 1.0588, "rewards/accuracies": 0.0, "rewards/chosen": 0.0535125732421875, "rewards/margins": -1.3810951709747314, "rewards/rejected": 1.434607744216919, "step": 5839 }, { "epoch": 0.95, "learning_rate": 8.060462861966657e-07, "logits/chosen": -0.6185633540153503, "logits/rejected": -0.3931509852409363, "logps/chosen": -109.50375366210938, "logps/rejected": -66.70590209960938, "loss": 0.077, "rewards/accuracies": 1.0, "rewards/chosen": 4.082376003265381, "rewards/margins": 1.9678418636322021, "rewards/rejected": 2.1145341396331787, "step": 5840 }, { "epoch": 0.95, "learning_rate": 8.059423462472448e-07, "logits/chosen": -0.7730664610862732, "logits/rejected": -0.7535846829414368, "logps/chosen": -93.99749755859375, "logps/rejected": -118.11726379394531, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 0.9112335443496704, "rewards/margins": 0.8070251941680908, "rewards/rejected": 0.10420837253332138, "step": 5841 }, { "epoch": 0.95, "learning_rate": 8.058383851601027e-07, "logits/chosen": -0.9405387043952942, "logits/rejected": -0.6321211457252502, "logps/chosen": -105.17384338378906, "logps/rejected": -67.33661651611328, "loss": 0.6844, "rewards/accuracies": 0.0, "rewards/chosen": 0.8305351138114929, "rewards/margins": -0.9946960806846619, "rewards/rejected": 1.8252311944961548, "step": 5842 }, { "epoch": 0.95, "learning_rate": 8.057344029424218e-07, "logits/chosen": -0.8776688575744629, "logits/rejected": -1.0078495740890503, "logps/chosen": -395.8924560546875, "logps/rejected": -56.13270950317383, "loss": 2.289, "rewards/accuracies": 1.0, "rewards/chosen": 4.722033977508545, "rewards/margins": 1.683093786239624, "rewards/rejected": 3.038940191268921, "step": 5843 }, { "epoch": 0.95, "learning_rate": 8.056303996013866e-07, "logits/chosen": -0.6706488132476807, "logits/rejected": -0.6563359498977661, "logps/chosen": -41.96564483642578, "logps/rejected": -39.381370544433594, "loss": 1.0692, "rewards/accuracies": 1.0, "rewards/chosen": 1.6378334760665894, "rewards/margins": 0.08516085147857666, "rewards/rejected": 1.5526726245880127, "step": 5844 }, { "epoch": 0.95, "learning_rate": 8.05526375144183e-07, "logits/chosen": -0.4945859909057617, "logits/rejected": -0.38472020626068115, "logps/chosen": -81.67991638183594, "logps/rejected": -16.979467391967773, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.6662476062774658, "rewards/margins": 1.522312045097351, "rewards/rejected": 0.14393559098243713, "step": 5845 }, { "epoch": 0.95, "learning_rate": 8.054223295779974e-07, "logits/chosen": -0.5610337257385254, "logits/rejected": -0.5209847092628479, "logps/chosen": -74.2021484375, "logps/rejected": -133.480224609375, "loss": 1.4136, "rewards/accuracies": 0.0, "rewards/chosen": 2.199849843978882, "rewards/margins": -2.4857428073883057, "rewards/rejected": 4.6855926513671875, "step": 5846 }, { "epoch": 0.95, "learning_rate": 8.053182629100191e-07, "logits/chosen": -0.6419718861579895, "logits/rejected": -0.573755145072937, "logps/chosen": -56.92483901977539, "logps/rejected": -53.61045455932617, "loss": 0.3129, "rewards/accuracies": 1.0, "rewards/chosen": 1.8595638275146484, "rewards/margins": 0.21756362915039062, "rewards/rejected": 1.6420001983642578, "step": 5847 }, { "epoch": 0.95, "learning_rate": 8.052141751474375e-07, "logits/chosen": -0.9607991576194763, "logits/rejected": -0.9200353026390076, "logps/chosen": -91.57139587402344, "logps/rejected": -81.46076202392578, "loss": 0.8402, "rewards/accuracies": 0.0, "rewards/chosen": 1.2399475574493408, "rewards/margins": -0.8460502624511719, "rewards/rejected": 2.0859978199005127, "step": 5848 }, { "epoch": 0.95, "learning_rate": 8.051100662974445e-07, "logits/chosen": -0.3002852499485016, "logits/rejected": -0.3597653806209564, "logps/chosen": -107.90617370605469, "logps/rejected": -63.49711608886719, "loss": 0.6688, "rewards/accuracies": 0.0, "rewards/chosen": 1.5962997674942017, "rewards/margins": -0.9576188325881958, "rewards/rejected": 2.5539186000823975, "step": 5849 }, { "epoch": 0.95, "learning_rate": 8.050059363672328e-07, "logits/chosen": -0.441638708114624, "logits/rejected": -0.4124484360218048, "logps/chosen": -116.364013671875, "logps/rejected": -57.39036178588867, "loss": 0.9667, "rewards/accuracies": 1.0, "rewards/chosen": 3.5744385719299316, "rewards/margins": 1.2080631256103516, "rewards/rejected": 2.36637544631958, "step": 5850 }, { "epoch": 0.95, "learning_rate": 8.04901785363997e-07, "logits/chosen": -0.7089139819145203, "logits/rejected": -0.38341715931892395, "logps/chosen": -135.2149658203125, "logps/rejected": -46.2288818359375, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 4.735095500946045, "rewards/margins": 3.030118942260742, "rewards/rejected": 1.7049766778945923, "step": 5851 }, { "epoch": 0.95, "learning_rate": 8.047976132949328e-07, "logits/chosen": -0.648716390132904, "logits/rejected": -0.5861581563949585, "logps/chosen": -28.386924743652344, "logps/rejected": -53.56610107421875, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 1.632258653640747, "rewards/margins": 0.862997829914093, "rewards/rejected": 0.769260823726654, "step": 5852 }, { "epoch": 0.95, "learning_rate": 8.046934201672375e-07, "logits/chosen": -0.5904701948165894, "logits/rejected": -0.4723346531391144, "logps/chosen": -52.82197189331055, "logps/rejected": -10.977051734924316, "loss": 0.5769, "rewards/accuracies": 1.0, "rewards/chosen": 1.015554428100586, "rewards/margins": 0.5358444452285767, "rewards/rejected": 0.47971001267433167, "step": 5853 }, { "epoch": 0.95, "learning_rate": 8.0458920598811e-07, "logits/chosen": -0.9225132465362549, "logits/rejected": -1.0474501848220825, "logps/chosen": -88.87081909179688, "logps/rejected": -108.37564849853516, "loss": 0.9718, "rewards/accuracies": 0.0, "rewards/chosen": 3.27176833152771, "rewards/margins": -1.7495887279510498, "rewards/rejected": 5.02135705947876, "step": 5854 }, { "epoch": 0.95, "learning_rate": 8.044849707647503e-07, "logits/chosen": -0.6909670829772949, "logits/rejected": -0.6234116554260254, "logps/chosen": -51.337852478027344, "logps/rejected": -50.149635314941406, "loss": 0.2814, "rewards/accuracies": 1.0, "rewards/chosen": 2.265789031982422, "rewards/margins": 0.29524528980255127, "rewards/rejected": 1.9705437421798706, "step": 5855 }, { "epoch": 0.95, "learning_rate": 8.043807145043603e-07, "logits/chosen": -0.7164375185966492, "logits/rejected": -0.5576854944229126, "logps/chosen": -93.10012817382812, "logps/rejected": -77.65480041503906, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 1.9501770734786987, "rewards/margins": 0.17303931713104248, "rewards/rejected": 1.7771377563476562, "step": 5856 }, { "epoch": 0.95, "learning_rate": 8.042764372141429e-07, "logits/chosen": -1.2596863508224487, "logits/rejected": -1.2080713510513306, "logps/chosen": -161.57041931152344, "logps/rejected": -177.97207641601562, "loss": 0.1649, "rewards/accuracies": 1.0, "rewards/chosen": 5.807173252105713, "rewards/margins": 0.979924201965332, "rewards/rejected": 4.827249050140381, "step": 5857 }, { "epoch": 0.95, "learning_rate": 8.041721389013029e-07, "logits/chosen": -0.8883541226387024, "logits/rejected": -0.9439482092857361, "logps/chosen": -333.6083068847656, "logps/rejected": -75.10281372070312, "loss": 1.3936, "rewards/accuracies": 0.0, "rewards/chosen": 3.501120090484619, "rewards/margins": -2.6947813034057617, "rewards/rejected": 6.195901393890381, "step": 5858 }, { "epoch": 0.95, "learning_rate": 8.040678195730462e-07, "logits/chosen": -0.5936465859413147, "logits/rejected": -0.5091755986213684, "logps/chosen": -71.67378997802734, "logps/rejected": -47.338829040527344, "loss": 0.5388, "rewards/accuracies": 0.0, "rewards/chosen": 1.5673828125, "rewards/margins": -0.40065693855285645, "rewards/rejected": 1.9680397510528564, "step": 5859 }, { "epoch": 0.95, "learning_rate": 8.039634792365801e-07, "logits/chosen": -0.7353875041007996, "logits/rejected": -0.7169937491416931, "logps/chosen": -54.6828727722168, "logps/rejected": -63.82260513305664, "loss": 1.6455, "rewards/accuracies": 0.0, "rewards/chosen": 2.0619161128997803, "rewards/margins": -0.01563262939453125, "rewards/rejected": 2.0775487422943115, "step": 5860 }, { "epoch": 0.95, "learning_rate": 8.038591178991139e-07, "logits/chosen": -0.8374124765396118, "logits/rejected": -0.7217799425125122, "logps/chosen": -51.50270462036133, "logps/rejected": -54.67075729370117, "loss": 0.4711, "rewards/accuracies": 1.0, "rewards/chosen": 2.016148805618286, "rewards/margins": 0.7595901489257812, "rewards/rejected": 1.2565586566925049, "step": 5861 }, { "epoch": 0.95, "learning_rate": 8.037547355678576e-07, "logits/chosen": -0.6171520948410034, "logits/rejected": -0.7869396209716797, "logps/chosen": -57.76897048950195, "logps/rejected": -121.33150482177734, "loss": 0.7174, "rewards/accuracies": 0.0, "rewards/chosen": 2.5098743438720703, "rewards/margins": -0.3806874752044678, "rewards/rejected": 2.890561819076538, "step": 5862 }, { "epoch": 0.95, "learning_rate": 8.036503322500234e-07, "logits/chosen": -0.4861806631088257, "logits/rejected": -0.4967687726020813, "logps/chosen": -68.05130004882812, "logps/rejected": -45.44783401489258, "loss": 0.9313, "rewards/accuracies": 0.0, "rewards/chosen": 1.2790336608886719, "rewards/margins": -1.0439229011535645, "rewards/rejected": 2.3229565620422363, "step": 5863 }, { "epoch": 0.95, "learning_rate": 8.035459079528244e-07, "logits/chosen": -0.5104610323905945, "logits/rejected": -0.5334199070930481, "logps/chosen": -77.2494125366211, "logps/rejected": -84.36515045166016, "loss": 0.7489, "rewards/accuracies": 0.0, "rewards/chosen": 0.2135566771030426, "rewards/margins": -0.2117668092250824, "rewards/rejected": 0.425323486328125, "step": 5864 }, { "epoch": 0.95, "learning_rate": 8.034414626834754e-07, "logits/chosen": -0.6748689413070679, "logits/rejected": -0.6367047429084778, "logps/chosen": -91.76557922363281, "logps/rejected": -35.957679748535156, "loss": 1.0544, "rewards/accuracies": 0.0, "rewards/chosen": 0.6393135190010071, "rewards/margins": -0.8728581070899963, "rewards/rejected": 1.5121716260910034, "step": 5865 }, { "epoch": 0.95, "learning_rate": 8.033369964491923e-07, "logits/chosen": -0.5523700714111328, "logits/rejected": -0.5080553293228149, "logps/chosen": -46.01273727416992, "logps/rejected": -42.290924072265625, "loss": 0.4728, "rewards/accuracies": 1.0, "rewards/chosen": 1.9458293914794922, "rewards/margins": 0.1616199016571045, "rewards/rejected": 1.7842094898223877, "step": 5866 }, { "epoch": 0.95, "learning_rate": 8.032325092571931e-07, "logits/chosen": -0.2378554344177246, "logits/rejected": -0.44976159930229187, "logps/chosen": -102.34718322753906, "logps/rejected": -162.79986572265625, "loss": 1.3158, "rewards/accuracies": 0.0, "rewards/chosen": 2.3394792079925537, "rewards/margins": -1.5975141525268555, "rewards/rejected": 3.936993360519409, "step": 5867 }, { "epoch": 0.95, "learning_rate": 8.031280011146966e-07, "logits/chosen": -0.5312846302986145, "logits/rejected": -0.46413224935531616, "logps/chosen": -60.57862854003906, "logps/rejected": -77.45508575439453, "loss": 0.5475, "rewards/accuracies": 1.0, "rewards/chosen": 3.0128448009490967, "rewards/margins": 1.4308959245681763, "rewards/rejected": 1.5819488763809204, "step": 5868 }, { "epoch": 0.95, "learning_rate": 8.030234720289236e-07, "logits/chosen": -0.21487317979335785, "logits/rejected": -0.12991388142108917, "logps/chosen": -51.39786148071289, "logps/rejected": -63.52037048339844, "loss": 0.2179, "rewards/accuracies": 1.0, "rewards/chosen": 2.168800115585327, "rewards/margins": 0.8639234304428101, "rewards/rejected": 1.304876685142517, "step": 5869 }, { "epoch": 0.95, "learning_rate": 8.029189220070959e-07, "logits/chosen": -0.6611228585243225, "logits/rejected": -0.5955836176872253, "logps/chosen": -49.97078323364258, "logps/rejected": -58.56103515625, "loss": 0.4128, "rewards/accuracies": 0.0, "rewards/chosen": 1.2344502210617065, "rewards/margins": -0.0923914909362793, "rewards/rejected": 1.3268417119979858, "step": 5870 }, { "epoch": 0.95, "learning_rate": 8.028143510564369e-07, "logits/chosen": -0.6191038489341736, "logits/rejected": -0.5675104260444641, "logps/chosen": -40.834617614746094, "logps/rejected": -17.444211959838867, "loss": 0.635, "rewards/accuracies": 1.0, "rewards/chosen": 0.6553714871406555, "rewards/margins": 0.33364659547805786, "rewards/rejected": 0.32172489166259766, "step": 5871 }, { "epoch": 0.95, "learning_rate": 8.027097591841714e-07, "logits/chosen": -0.9232596755027771, "logits/rejected": -0.8708298802375793, "logps/chosen": -114.34878540039062, "logps/rejected": -101.8594970703125, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 3.0638396739959717, "rewards/margins": 0.4986724853515625, "rewards/rejected": 2.565167188644409, "step": 5872 }, { "epoch": 0.95, "learning_rate": 8.026051463975259e-07, "logits/chosen": -0.6999308466911316, "logits/rejected": -0.7771116495132446, "logps/chosen": -43.59815216064453, "logps/rejected": -163.62808227539062, "loss": 0.6495, "rewards/accuracies": 0.0, "rewards/chosen": 1.761738657951355, "rewards/margins": -0.23009181022644043, "rewards/rejected": 1.9918304681777954, "step": 5873 }, { "epoch": 0.95, "learning_rate": 8.025005127037281e-07, "logits/chosen": -0.6834244132041931, "logits/rejected": -0.7184197306632996, "logps/chosen": -82.64925384521484, "logps/rejected": -65.06748962402344, "loss": 1.2248, "rewards/accuracies": 0.0, "rewards/chosen": 0.4345085322856903, "rewards/margins": -2.0498223304748535, "rewards/rejected": 2.484330892562866, "step": 5874 }, { "epoch": 0.95, "learning_rate": 8.023958581100071e-07, "logits/chosen": -0.9378312230110168, "logits/rejected": -1.030033826828003, "logps/chosen": -109.12109375, "logps/rejected": -104.62513732910156, "loss": 1.8641, "rewards/accuracies": 0.0, "rewards/chosen": 1.0116668939590454, "rewards/margins": -3.66945219039917, "rewards/rejected": 4.681118965148926, "step": 5875 }, { "epoch": 0.95, "learning_rate": 8.022911826235936e-07, "logits/chosen": -0.7308705449104309, "logits/rejected": -0.678335428237915, "logps/chosen": -139.18707275390625, "logps/rejected": -81.74800109863281, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 4.101632595062256, "rewards/margins": 2.149101734161377, "rewards/rejected": 1.9525307416915894, "step": 5876 }, { "epoch": 0.95, "learning_rate": 8.021864862517197e-07, "logits/chosen": -0.5054640173912048, "logits/rejected": -0.5762127041816711, "logps/chosen": -101.40994262695312, "logps/rejected": -47.535743713378906, "loss": 0.7682, "rewards/accuracies": 0.0, "rewards/chosen": 1.5713318586349487, "rewards/margins": -0.8585196733474731, "rewards/rejected": 2.429851531982422, "step": 5877 }, { "epoch": 0.95, "learning_rate": 8.020817690016188e-07, "logits/chosen": -0.8599262237548828, "logits/rejected": -0.5606122016906738, "logps/chosen": -114.01812744140625, "logps/rejected": -38.008941650390625, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 3.258104085922241, "rewards/margins": 2.394644260406494, "rewards/rejected": 0.8634597659111023, "step": 5878 }, { "epoch": 0.95, "learning_rate": 8.019770308805261e-07, "logits/chosen": -0.6158074736595154, "logits/rejected": -0.575357973575592, "logps/chosen": -73.72413635253906, "logps/rejected": -78.95201873779297, "loss": 0.316, "rewards/accuracies": 1.0, "rewards/chosen": 1.1966339349746704, "rewards/margins": 0.13207781314849854, "rewards/rejected": 1.0645561218261719, "step": 5879 }, { "epoch": 0.95, "learning_rate": 8.018722718956779e-07, "logits/chosen": -0.8888855576515198, "logits/rejected": -0.860525906085968, "logps/chosen": -110.30094146728516, "logps/rejected": -65.38629150390625, "loss": 0.5011, "rewards/accuracies": 0.0, "rewards/chosen": 2.065666913986206, "rewards/margins": -0.4841134548187256, "rewards/rejected": 2.5497803688049316, "step": 5880 }, { "epoch": 0.95, "learning_rate": 8.017674920543121e-07, "logits/chosen": -0.7100903987884521, "logits/rejected": -0.6734949350357056, "logps/chosen": -84.56298065185547, "logps/rejected": -68.57588195800781, "loss": 0.3133, "rewards/accuracies": 1.0, "rewards/chosen": 3.5774972438812256, "rewards/margins": 2.852924346923828, "rewards/rejected": 0.7245727777481079, "step": 5881 }, { "epoch": 0.95, "learning_rate": 8.016626913636679e-07, "logits/chosen": -0.5638517737388611, "logits/rejected": -0.6768249869346619, "logps/chosen": -154.19119262695312, "logps/rejected": -111.12142181396484, "loss": 1.0076, "rewards/accuracies": 0.0, "rewards/chosen": 3.585888624191284, "rewards/margins": -0.7700936794281006, "rewards/rejected": 4.355982303619385, "step": 5882 }, { "epoch": 0.95, "learning_rate": 8.015578698309862e-07, "logits/chosen": -1.143129825592041, "logits/rejected": -1.205536961555481, "logps/chosen": -181.87896728515625, "logps/rejected": -75.97300720214844, "loss": 0.0627, "rewards/accuracies": 1.0, "rewards/chosen": 4.511741638183594, "rewards/margins": 2.8220458030700684, "rewards/rejected": 1.6896957159042358, "step": 5883 }, { "epoch": 0.96, "learning_rate": 8.014530274635089e-07, "logits/chosen": -1.0101828575134277, "logits/rejected": -0.8810752034187317, "logps/chosen": -119.7119369506836, "logps/rejected": -119.94127655029297, "loss": 0.3748, "rewards/accuracies": 1.0, "rewards/chosen": 4.3737311363220215, "rewards/margins": 1.12575364112854, "rewards/rejected": 3.2479774951934814, "step": 5884 }, { "epoch": 0.96, "learning_rate": 8.013481642684799e-07, "logits/chosen": -0.323464572429657, "logits/rejected": -0.3512333333492279, "logps/chosen": -65.69303131103516, "logps/rejected": -101.89983367919922, "loss": 1.1169, "rewards/accuracies": 0.0, "rewards/chosen": 1.2257393598556519, "rewards/margins": -0.8077422380447388, "rewards/rejected": 2.0334815979003906, "step": 5885 }, { "epoch": 0.96, "learning_rate": 8.012432802531439e-07, "logits/chosen": -0.11196763813495636, "logits/rejected": -0.11196763813495636, "logps/chosen": -0.5636696815490723, "logps/rejected": -0.5636696815490723, "loss": 0.4609, "rewards/accuracies": 0.0, "rewards/chosen": 0.12656866014003754, "rewards/margins": 0.0, "rewards/rejected": 0.12656866014003754, "step": 5886 }, { "epoch": 0.96, "learning_rate": 8.011383754247479e-07, "logits/chosen": -0.6135578155517578, "logits/rejected": -0.5069617033004761, "logps/chosen": -87.38748168945312, "logps/rejected": -35.32286834716797, "loss": 2.3974, "rewards/accuracies": 1.0, "rewards/chosen": 0.9603973627090454, "rewards/margins": 0.8855011463165283, "rewards/rejected": 0.07489623874425888, "step": 5887 }, { "epoch": 0.96, "learning_rate": 8.010334497905394e-07, "logits/chosen": -0.6783040165901184, "logits/rejected": -0.6502459049224854, "logps/chosen": -76.24449157714844, "logps/rejected": -47.104225158691406, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 0.9852920770645142, "rewards/margins": 0.14391100406646729, "rewards/rejected": 0.8413810729980469, "step": 5888 }, { "epoch": 0.96, "learning_rate": 8.00928503357768e-07, "logits/chosen": -0.6217213273048401, "logits/rejected": -0.6240353584289551, "logps/chosen": -4.365078449249268, "logps/rejected": -1.9712687730789185, "loss": 0.414, "rewards/accuracies": 0.0, "rewards/chosen": 0.05460476875305176, "rewards/margins": -0.18982934951782227, "rewards/rejected": 0.24443411827087402, "step": 5889 }, { "epoch": 0.96, "learning_rate": 8.008235361336844e-07, "logits/chosen": -0.7013985514640808, "logits/rejected": -0.7013985514640808, "logps/chosen": -77.74609375, "logps/rejected": -77.74609375, "loss": 0.3599, "rewards/accuracies": 0.0, "rewards/chosen": 2.460566759109497, "rewards/margins": 0.0, "rewards/rejected": 2.460566759109497, "step": 5890 }, { "epoch": 0.96, "learning_rate": 8.007185481255407e-07, "logits/chosen": -0.8180999159812927, "logits/rejected": -0.8167638778686523, "logps/chosen": -193.18429565429688, "logps/rejected": -73.53776550292969, "loss": 0.2922, "rewards/accuracies": 1.0, "rewards/chosen": 3.6977295875549316, "rewards/margins": 0.3736703395843506, "rewards/rejected": 3.324059247970581, "step": 5891 }, { "epoch": 0.96, "learning_rate": 8.006135393405911e-07, "logits/chosen": -0.6860589385032654, "logits/rejected": -0.7073689699172974, "logps/chosen": -132.15194702148438, "logps/rejected": -114.54107666015625, "loss": 0.6199, "rewards/accuracies": 0.0, "rewards/chosen": 0.934490978717804, "rewards/margins": -0.7263595461845398, "rewards/rejected": 1.6608505249023438, "step": 5892 }, { "epoch": 0.96, "learning_rate": 8.0050850978609e-07, "logits/chosen": -0.6667723059654236, "logits/rejected": -0.5607370138168335, "logps/chosen": -164.48147583007812, "logps/rejected": -51.909996032714844, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 0.6048187613487244, "rewards/margins": 0.581885576248169, "rewards/rejected": 0.022933198139071465, "step": 5893 }, { "epoch": 0.96, "learning_rate": 8.004034594692945e-07, "logits/chosen": -0.5997787714004517, "logits/rejected": -0.5627566576004028, "logps/chosen": -67.44483947753906, "logps/rejected": -91.13859558105469, "loss": 0.6032, "rewards/accuracies": 0.0, "rewards/chosen": 1.320504069328308, "rewards/margins": -0.8308838605880737, "rewards/rejected": 2.151387929916382, "step": 5894 }, { "epoch": 0.96, "learning_rate": 8.002983883974624e-07, "logits/chosen": -0.39781635999679565, "logits/rejected": -0.4256902039051056, "logps/chosen": -72.24140930175781, "logps/rejected": -105.50635528564453, "loss": 0.7515, "rewards/accuracies": 1.0, "rewards/chosen": 2.089630126953125, "rewards/margins": 0.8180633783340454, "rewards/rejected": 1.2715667486190796, "step": 5895 }, { "epoch": 0.96, "learning_rate": 8.001932965778531e-07, "logits/chosen": -0.38192102313041687, "logits/rejected": -0.3439021408557892, "logps/chosen": -43.03977966308594, "logps/rejected": -87.07975769042969, "loss": 0.4616, "rewards/accuracies": 0.0, "rewards/chosen": 1.8523765802383423, "rewards/margins": -0.4124351739883423, "rewards/rejected": 2.2648117542266846, "step": 5896 }, { "epoch": 0.96, "learning_rate": 8.000881840177274e-07, "logits/chosen": -0.6532649993896484, "logits/rejected": -0.6326256990432739, "logps/chosen": -81.0021743774414, "logps/rejected": -67.95540618896484, "loss": 1.2269, "rewards/accuracies": 0.0, "rewards/chosen": 1.2250686883926392, "rewards/margins": -0.6839348077774048, "rewards/rejected": 1.909003496170044, "step": 5897 }, { "epoch": 0.96, "learning_rate": 7.999830507243477e-07, "logits/chosen": -0.7440751194953918, "logits/rejected": -0.7540977597236633, "logps/chosen": -81.5516357421875, "logps/rejected": -115.42167663574219, "loss": 0.7474, "rewards/accuracies": 0.0, "rewards/chosen": 1.404680609703064, "rewards/margins": -0.22997665405273438, "rewards/rejected": 1.6346572637557983, "step": 5898 }, { "epoch": 0.96, "learning_rate": 7.998778967049777e-07, "logits/chosen": -0.4269104301929474, "logits/rejected": -0.4048348367214203, "logps/chosen": -51.00752639770508, "logps/rejected": -19.138696670532227, "loss": 2.2994, "rewards/accuracies": 1.0, "rewards/chosen": 2.005160093307495, "rewards/margins": 1.4557890892028809, "rewards/rejected": 0.5493709444999695, "step": 5899 }, { "epoch": 0.96, "learning_rate": 7.997727219668825e-07, "logits/chosen": -0.6699166297912598, "logits/rejected": -0.5731077790260315, "logps/chosen": -109.94974517822266, "logps/rejected": -95.4809799194336, "loss": 0.5715, "rewards/accuracies": 1.0, "rewards/chosen": 1.3562500476837158, "rewards/margins": 0.922515869140625, "rewards/rejected": 0.43373414874076843, "step": 5900 }, { "epoch": 0.96, "learning_rate": 7.996675265173287e-07, "logits/chosen": -0.6140562891960144, "logits/rejected": -0.5682899951934814, "logps/chosen": -54.80484390258789, "logps/rejected": -109.52810668945312, "loss": 1.1416, "rewards/accuracies": 0.0, "rewards/chosen": 1.916733980178833, "rewards/margins": -0.5291142463684082, "rewards/rejected": 2.445848226547241, "step": 5901 }, { "epoch": 0.96, "learning_rate": 7.995623103635842e-07, "logits/chosen": -1.0044894218444824, "logits/rejected": -1.0733565092086792, "logps/chosen": -141.72836303710938, "logps/rejected": -118.0427017211914, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 5.356224060058594, "rewards/margins": -0.6170511245727539, "rewards/rejected": 5.973275184631348, "step": 5902 }, { "epoch": 0.96, "learning_rate": 7.994570735129186e-07, "logits/chosen": -0.39758652448654175, "logits/rejected": -0.3896962106227875, "logps/chosen": -48.7779541015625, "logps/rejected": -97.50008392333984, "loss": 0.2411, "rewards/accuracies": 1.0, "rewards/chosen": 2.2493393421173096, "rewards/margins": 0.576433539390564, "rewards/rejected": 1.6729058027267456, "step": 5903 }, { "epoch": 0.96, "learning_rate": 7.993518159726027e-07, "logits/chosen": -0.7860842347145081, "logits/rejected": -0.7003344297409058, "logps/chosen": -118.35608673095703, "logps/rejected": -115.62434387207031, "loss": 0.2337, "rewards/accuracies": 1.0, "rewards/chosen": 3.7663261890411377, "rewards/margins": 1.0668647289276123, "rewards/rejected": 2.6994614601135254, "step": 5904 }, { "epoch": 0.96, "learning_rate": 7.992465377499089e-07, "logits/chosen": -0.5692057609558105, "logits/rejected": -0.5473769307136536, "logps/chosen": -59.05402374267578, "logps/rejected": -81.08297729492188, "loss": 0.7425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9747375845909119, "rewards/margins": 0.5494560599327087, "rewards/rejected": 0.4252815246582031, "step": 5905 }, { "epoch": 0.96, "learning_rate": 7.991412388521107e-07, "logits/chosen": -0.6569039821624756, "logits/rejected": -0.6873852610588074, "logps/chosen": -74.53826904296875, "logps/rejected": -87.80778503417969, "loss": 0.8116, "rewards/accuracies": 0.0, "rewards/chosen": 2.0398850440979004, "rewards/margins": -0.860450029373169, "rewards/rejected": 2.9003350734710693, "step": 5906 }, { "epoch": 0.96, "learning_rate": 7.990359192864835e-07, "logits/chosen": -0.8043412566184998, "logits/rejected": -0.7051538825035095, "logps/chosen": -115.85317993164062, "logps/rejected": -74.77656555175781, "loss": 0.1237, "rewards/accuracies": 1.0, "rewards/chosen": 2.831310987472534, "rewards/margins": 1.2859466075897217, "rewards/rejected": 1.5453643798828125, "step": 5907 }, { "epoch": 0.96, "learning_rate": 7.989305790603037e-07, "logits/chosen": -0.8383924961090088, "logits/rejected": -0.7255088686943054, "logps/chosen": -183.805908203125, "logps/rejected": -84.3609390258789, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 5.234256267547607, "rewards/margins": 1.6530237197875977, "rewards/rejected": 3.5812325477600098, "step": 5908 }, { "epoch": 0.96, "learning_rate": 7.988252181808495e-07, "logits/chosen": -0.8202800154685974, "logits/rejected": -0.6385207772254944, "logps/chosen": -149.52252197265625, "logps/rejected": -97.724853515625, "loss": 0.6351, "rewards/accuracies": 0.0, "rewards/chosen": 2.288128614425659, "rewards/margins": -0.8822555541992188, "rewards/rejected": 3.170384168624878, "step": 5909 }, { "epoch": 0.96, "learning_rate": 7.987198366554001e-07, "logits/chosen": -0.2135942131280899, "logits/rejected": -0.12671561539173126, "logps/chosen": -50.72752380371094, "logps/rejected": -46.53057098388672, "loss": 0.4035, "rewards/accuracies": 1.0, "rewards/chosen": 1.4784096479415894, "rewards/margins": 0.16864705085754395, "rewards/rejected": 1.3097625970840454, "step": 5910 }, { "epoch": 0.96, "learning_rate": 7.986144344912366e-07, "logits/chosen": -0.8651191592216492, "logits/rejected": -0.8299902677536011, "logps/chosen": -99.21897888183594, "logps/rejected": -89.3599853515625, "loss": 2.7221, "rewards/accuracies": 0.0, "rewards/chosen": 0.7038009762763977, "rewards/margins": -0.8238860964775085, "rewards/rejected": 1.5276870727539062, "step": 5911 }, { "epoch": 0.96, "learning_rate": 7.985090116956411e-07, "logits/chosen": -0.6628762483596802, "logits/rejected": -0.633366048336029, "logps/chosen": -83.81407165527344, "logps/rejected": -64.28355407714844, "loss": 1.2691, "rewards/accuracies": 1.0, "rewards/chosen": 1.073799967765808, "rewards/margins": 0.9319161176681519, "rewards/rejected": 0.14188385009765625, "step": 5912 }, { "epoch": 0.96, "learning_rate": 7.984035682758973e-07, "logits/chosen": -0.8327157497406006, "logits/rejected": -0.7964822053909302, "logps/chosen": -100.45286560058594, "logps/rejected": -93.90350341796875, "loss": 0.4463, "rewards/accuracies": 1.0, "rewards/chosen": 3.627394199371338, "rewards/margins": 0.4847702980041504, "rewards/rejected": 3.1426239013671875, "step": 5913 }, { "epoch": 0.96, "learning_rate": 7.982981042392907e-07, "logits/chosen": -0.37594401836395264, "logits/rejected": -0.39283305406570435, "logps/chosen": -86.19078826904297, "logps/rejected": -36.81122589111328, "loss": 1.3198, "rewards/accuracies": 0.0, "rewards/chosen": 1.3483940362930298, "rewards/margins": -0.6765938997268677, "rewards/rejected": 2.0249879360198975, "step": 5914 }, { "epoch": 0.96, "learning_rate": 7.981926195931076e-07, "logits/chosen": -0.5914894342422485, "logits/rejected": -0.5914894342422485, "logps/chosen": -73.2554931640625, "logps/rejected": -73.2554931640625, "loss": 0.5311, "rewards/accuracies": 0.0, "rewards/chosen": 1.5019172430038452, "rewards/margins": 0.0, "rewards/rejected": 1.5019172430038452, "step": 5915 }, { "epoch": 0.96, "learning_rate": 7.98087114344636e-07, "logits/chosen": -0.8084363341331482, "logits/rejected": -0.7324312329292297, "logps/chosen": -163.79031372070312, "logps/rejected": -72.41954040527344, "loss": 0.1556, "rewards/accuracies": 1.0, "rewards/chosen": 4.40353536605835, "rewards/margins": 1.308525800704956, "rewards/rejected": 3.0950095653533936, "step": 5916 }, { "epoch": 0.96, "learning_rate": 7.979815885011652e-07, "logits/chosen": -0.43264874815940857, "logits/rejected": -0.6138922572135925, "logps/chosen": -81.11803436279297, "logps/rejected": -164.57894897460938, "loss": 2.5453, "rewards/accuracies": 0.0, "rewards/chosen": 0.6127555966377258, "rewards/margins": -3.5272562503814697, "rewards/rejected": 4.140011787414551, "step": 5917 }, { "epoch": 0.96, "learning_rate": 7.978760420699861e-07, "logits/chosen": -0.2546701431274414, "logits/rejected": -0.2491672933101654, "logps/chosen": -58.862979888916016, "logps/rejected": -99.31926727294922, "loss": 0.2206, "rewards/accuracies": 1.0, "rewards/chosen": 0.5790554285049438, "rewards/margins": 0.8302150964736938, "rewards/rejected": -0.25115966796875, "step": 5918 }, { "epoch": 0.96, "learning_rate": 7.977704750583913e-07, "logits/chosen": -0.6977422833442688, "logits/rejected": -0.6069809198379517, "logps/chosen": -150.40037536621094, "logps/rejected": -59.162784576416016, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 3.3427703380584717, "rewards/margins": 0.2761576175689697, "rewards/rejected": 3.066612720489502, "step": 5919 }, { "epoch": 0.96, "learning_rate": 7.976648874736742e-07, "logits/chosen": -0.317585289478302, "logits/rejected": -0.30178743600845337, "logps/chosen": -79.710693359375, "logps/rejected": -20.17547607421875, "loss": 0.5294, "rewards/accuracies": 0.0, "rewards/chosen": 0.11872940510511398, "rewards/margins": -0.24420490860939026, "rewards/rejected": 0.36293432116508484, "step": 5920 }, { "epoch": 0.96, "learning_rate": 7.975592793231297e-07, "logits/chosen": -0.7104889750480652, "logits/rejected": -0.7132338285446167, "logps/chosen": -89.59039306640625, "logps/rejected": -124.94035339355469, "loss": 1.0436, "rewards/accuracies": 0.0, "rewards/chosen": 1.5096648931503296, "rewards/margins": -1.4518340826034546, "rewards/rejected": 2.961498975753784, "step": 5921 }, { "epoch": 0.96, "learning_rate": 7.974536506140545e-07, "logits/chosen": -0.7917489409446716, "logits/rejected": -0.6710178852081299, "logps/chosen": -79.42575073242188, "logps/rejected": -64.56318664550781, "loss": 1.4218, "rewards/accuracies": 0.0, "rewards/chosen": 2.0178451538085938, "rewards/margins": -0.7713487148284912, "rewards/rejected": 2.789193868637085, "step": 5922 }, { "epoch": 0.96, "learning_rate": 7.973480013537468e-07, "logits/chosen": -0.8539707064628601, "logits/rejected": -0.8277835249900818, "logps/chosen": -80.31904602050781, "logps/rejected": -45.45265579223633, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": 2.4551239013671875, "rewards/margins": 0.5492732524871826, "rewards/rejected": 1.9058506488800049, "step": 5923 }, { "epoch": 0.96, "learning_rate": 7.972423315495056e-07, "logits/chosen": -0.5260739326477051, "logits/rejected": -0.5149049758911133, "logps/chosen": -62.079994201660156, "logps/rejected": -76.94229125976562, "loss": 1.0254, "rewards/accuracies": 1.0, "rewards/chosen": 1.5700759887695312, "rewards/margins": 0.4731178283691406, "rewards/rejected": 1.0969581604003906, "step": 5924 }, { "epoch": 0.96, "learning_rate": 7.971366412086317e-07, "logits/chosen": -0.3152969777584076, "logits/rejected": -0.2881803810596466, "logps/chosen": -62.90021896362305, "logps/rejected": -111.44386291503906, "loss": 1.6556, "rewards/accuracies": 1.0, "rewards/chosen": 1.236593246459961, "rewards/margins": 1.3333454132080078, "rewards/rejected": -0.09675216674804688, "step": 5925 }, { "epoch": 0.96, "learning_rate": 7.970309303384277e-07, "logits/chosen": -0.7155097723007202, "logits/rejected": -1.09831702709198, "logps/chosen": -106.642578125, "logps/rejected": -35.916114807128906, "loss": 1.4135, "rewards/accuracies": 1.0, "rewards/chosen": 1.4967339038848877, "rewards/margins": 1.2156403064727783, "rewards/rejected": 0.2810935974121094, "step": 5926 }, { "epoch": 0.96, "learning_rate": 7.969251989461967e-07, "logits/chosen": -0.45447391271591187, "logits/rejected": -0.5000872611999512, "logps/chosen": -124.0379638671875, "logps/rejected": -134.33131408691406, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": 1.3988479375839233, "rewards/margins": 0.7707870006561279, "rewards/rejected": 0.6280609369277954, "step": 5927 }, { "epoch": 0.96, "learning_rate": 7.968194470392442e-07, "logits/chosen": -0.6435202956199646, "logits/rejected": -0.7241955399513245, "logps/chosen": -87.64865112304688, "logps/rejected": -119.4583969116211, "loss": 0.7552, "rewards/accuracies": 0.0, "rewards/chosen": 1.2637923955917358, "rewards/margins": -0.24612200260162354, "rewards/rejected": 1.5099143981933594, "step": 5928 }, { "epoch": 0.96, "learning_rate": 7.967136746248764e-07, "logits/chosen": -0.6364349722862244, "logits/rejected": -0.622269868850708, "logps/chosen": -138.77842712402344, "logps/rejected": -75.98712158203125, "loss": 0.2229, "rewards/accuracies": 1.0, "rewards/chosen": 2.3997743129730225, "rewards/margins": 0.634554386138916, "rewards/rejected": 1.7652199268341064, "step": 5929 }, { "epoch": 0.96, "learning_rate": 7.966078817104011e-07, "logits/chosen": -0.7552504539489746, "logits/rejected": -0.7688948512077332, "logps/chosen": -151.19520568847656, "logps/rejected": -40.281471252441406, "loss": 1.3212, "rewards/accuracies": 0.0, "rewards/chosen": 1.3291672468185425, "rewards/margins": -0.6607059240341187, "rewards/rejected": 1.9898731708526611, "step": 5930 }, { "epoch": 0.96, "learning_rate": 7.965020683031277e-07, "logits/chosen": -0.4940858483314514, "logits/rejected": -0.4322635233402252, "logps/chosen": -58.299720764160156, "logps/rejected": -81.33836364746094, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": 0.9932792782783508, "rewards/margins": 0.08776932954788208, "rewards/rejected": 0.9055099487304688, "step": 5931 }, { "epoch": 0.96, "learning_rate": 7.963962344103669e-07, "logits/chosen": -0.7620238065719604, "logits/rejected": -0.5551503300666809, "logps/chosen": -73.62770080566406, "logps/rejected": -23.01404571533203, "loss": 0.9447, "rewards/accuracies": 1.0, "rewards/chosen": 2.524733781814575, "rewards/margins": 2.4200809001922607, "rewards/rejected": 0.1046527847647667, "step": 5932 }, { "epoch": 0.96, "learning_rate": 7.962903800394309e-07, "logits/chosen": -0.7491163015365601, "logits/rejected": -0.6483025550842285, "logps/chosen": -48.400386810302734, "logps/rejected": -54.003990173339844, "loss": 1.0387, "rewards/accuracies": 0.0, "rewards/chosen": 2.127080202102661, "rewards/margins": -0.3337719440460205, "rewards/rejected": 2.4608521461486816, "step": 5933 }, { "epoch": 0.96, "learning_rate": 7.961845051976332e-07, "logits/chosen": -0.3628460168838501, "logits/rejected": -0.3013302683830261, "logps/chosen": -49.764060974121094, "logps/rejected": -40.462921142578125, "loss": 0.501, "rewards/accuracies": 1.0, "rewards/chosen": 1.871437907218933, "rewards/margins": 0.22163856029510498, "rewards/rejected": 1.6497993469238281, "step": 5934 }, { "epoch": 0.96, "learning_rate": 7.960786098922886e-07, "logits/chosen": -0.9284718036651611, "logits/rejected": -0.770488440990448, "logps/chosen": -109.6861801147461, "logps/rejected": -63.73863220214844, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 4.049352169036865, "rewards/margins": 2.131838083267212, "rewards/rejected": 1.9175140857696533, "step": 5935 }, { "epoch": 0.96, "learning_rate": 7.959726941307136e-07, "logits/chosen": -0.5927475094795227, "logits/rejected": -0.550186038017273, "logps/chosen": -100.46810150146484, "logps/rejected": -37.310638427734375, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 2.476452589035034, "rewards/margins": 2.1536598205566406, "rewards/rejected": 0.32279282808303833, "step": 5936 }, { "epoch": 0.96, "learning_rate": 7.958667579202261e-07, "logits/chosen": -0.5233722925186157, "logits/rejected": -0.5952980518341064, "logps/chosen": -64.39703369140625, "logps/rejected": -68.16903686523438, "loss": 0.5581, "rewards/accuracies": 0.0, "rewards/chosen": 2.3420944213867188, "rewards/margins": -0.008476972579956055, "rewards/rejected": 2.350571393966675, "step": 5937 }, { "epoch": 0.96, "learning_rate": 7.957608012681452e-07, "logits/chosen": -1.0010316371917725, "logits/rejected": -0.920650064945221, "logps/chosen": -86.3216552734375, "logps/rejected": -24.03367805480957, "loss": 1.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.3241890072822571, "rewards/margins": 0.3211904764175415, "rewards/rejected": 0.0029985427390784025, "step": 5938 }, { "epoch": 0.96, "learning_rate": 7.956548241817911e-07, "logits/chosen": -0.4670848250389099, "logits/rejected": -0.4670848250389099, "logps/chosen": -50.01396179199219, "logps/rejected": -50.01396179199219, "loss": 0.6564, "rewards/accuracies": 0.0, "rewards/chosen": 2.04046630859375, "rewards/margins": 0.0, "rewards/rejected": 2.04046630859375, "step": 5939 }, { "epoch": 0.96, "learning_rate": 7.955488266684865e-07, "logits/chosen": -0.3652065694332123, "logits/rejected": -0.34650516510009766, "logps/chosen": -81.18623352050781, "logps/rejected": -45.38484573364258, "loss": 0.5006, "rewards/accuracies": 0.0, "rewards/chosen": 0.498818963766098, "rewards/margins": -0.4494190514087677, "rewards/rejected": 0.9482380151748657, "step": 5940 }, { "epoch": 0.96, "learning_rate": 7.954428087355542e-07, "logits/chosen": -0.6995962262153625, "logits/rejected": -0.6189820170402527, "logps/chosen": -138.1783447265625, "logps/rejected": -109.86760711669922, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": 5.013580322265625, "rewards/margins": 2.9428367614746094, "rewards/rejected": 2.0707435607910156, "step": 5941 }, { "epoch": 0.96, "learning_rate": 7.953367703903196e-07, "logits/chosen": -0.5796265602111816, "logits/rejected": -0.5207563042640686, "logps/chosen": -75.3757095336914, "logps/rejected": -122.16946411132812, "loss": 0.3215, "rewards/accuracies": 1.0, "rewards/chosen": 4.427257537841797, "rewards/margins": 0.27710771560668945, "rewards/rejected": 4.150149822235107, "step": 5942 }, { "epoch": 0.96, "learning_rate": 7.952307116401085e-07, "logits/chosen": -0.7869579195976257, "logits/rejected": -0.7522667646408081, "logps/chosen": -74.25222778320312, "logps/rejected": -78.43228912353516, "loss": 1.9946, "rewards/accuracies": 0.0, "rewards/chosen": 1.0804016590118408, "rewards/margins": -0.38275063037872314, "rewards/rejected": 1.463152289390564, "step": 5943 }, { "epoch": 0.96, "learning_rate": 7.951246324922487e-07, "logits/chosen": -0.3996923565864563, "logits/rejected": -0.4270709455013275, "logps/chosen": -68.45975494384766, "logps/rejected": -88.20906066894531, "loss": 0.5326, "rewards/accuracies": 0.0, "rewards/chosen": 2.2220940589904785, "rewards/margins": -0.42312073707580566, "rewards/rejected": 2.645214796066284, "step": 5944 }, { "epoch": 0.96, "learning_rate": 7.950185329540693e-07, "logits/chosen": -0.8174620866775513, "logits/rejected": -0.7822257280349731, "logps/chosen": -155.39161682128906, "logps/rejected": -64.13800811767578, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 4.2301926612854, "rewards/margins": 3.211956024169922, "rewards/rejected": 1.018236517906189, "step": 5945 }, { "epoch": 0.97, "learning_rate": 7.949124130329008e-07, "logits/chosen": -0.394042044878006, "logits/rejected": -0.42352715134620667, "logps/chosen": -72.87955474853516, "logps/rejected": -137.47061157226562, "loss": 1.028, "rewards/accuracies": 1.0, "rewards/chosen": 1.9790191650390625, "rewards/margins": 1.369532823562622, "rewards/rejected": 0.6094864010810852, "step": 5946 }, { "epoch": 0.97, "learning_rate": 7.948062727360751e-07, "logits/chosen": -0.8958616852760315, "logits/rejected": -0.8736555576324463, "logps/chosen": -251.93020629882812, "logps/rejected": -59.18254852294922, "loss": 1.4889, "rewards/accuracies": 0.0, "rewards/chosen": 0.08878173679113388, "rewards/margins": -2.3814589977264404, "rewards/rejected": 2.470240831375122, "step": 5947 }, { "epoch": 0.97, "learning_rate": 7.947001120709253e-07, "logits/chosen": -0.5643694400787354, "logits/rejected": -0.5459943413734436, "logps/chosen": -48.650146484375, "logps/rejected": -141.84176635742188, "loss": 0.2422, "rewards/accuracies": 1.0, "rewards/chosen": 1.2974846363067627, "rewards/margins": 1.0062416791915894, "rewards/rejected": 0.2912429869174957, "step": 5948 }, { "epoch": 0.97, "learning_rate": 7.945939310447864e-07, "logits/chosen": -0.5978949666023254, "logits/rejected": -0.6407864689826965, "logps/chosen": -153.13800048828125, "logps/rejected": -82.6573715209961, "loss": 1.0805, "rewards/accuracies": 0.0, "rewards/chosen": 1.5324554443359375, "rewards/margins": -1.899200439453125, "rewards/rejected": 3.4316558837890625, "step": 5949 }, { "epoch": 0.97, "learning_rate": 7.944877296649944e-07, "logits/chosen": -0.8160083889961243, "logits/rejected": -0.8160083889961243, "logps/chosen": -49.859954833984375, "logps/rejected": -49.859954833984375, "loss": 0.3537, "rewards/accuracies": 0.0, "rewards/chosen": 2.330730438232422, "rewards/margins": 0.0, "rewards/rejected": 2.330730438232422, "step": 5950 }, { "epoch": 0.97, "learning_rate": 7.943815079388866e-07, "logits/chosen": -0.28517210483551025, "logits/rejected": -0.28432485461235046, "logps/chosen": -5.469057559967041, "logps/rejected": -0.7450801730155945, "loss": 0.7835, "rewards/accuracies": 0.0, "rewards/chosen": -0.03977098688483238, "rewards/margins": -0.18787449598312378, "rewards/rejected": 0.1481035053730011, "step": 5951 }, { "epoch": 0.97, "learning_rate": 7.942752658738021e-07, "logits/chosen": -0.5487847328186035, "logits/rejected": -0.5260255336761475, "logps/chosen": -92.31265258789062, "logps/rejected": -87.87535095214844, "loss": 0.8017, "rewards/accuracies": 0.0, "rewards/chosen": 1.088452935218811, "rewards/margins": -0.6725883483886719, "rewards/rejected": 1.761041283607483, "step": 5952 }, { "epoch": 0.97, "learning_rate": 7.941690034770812e-07, "logits/chosen": -0.43884673714637756, "logits/rejected": -0.4217532277107239, "logps/chosen": -100.44945526123047, "logps/rejected": -56.15052032470703, "loss": 0.5541, "rewards/accuracies": 0.0, "rewards/chosen": 1.306464433670044, "rewards/margins": -0.3228919506072998, "rewards/rejected": 1.6293563842773438, "step": 5953 }, { "epoch": 0.97, "learning_rate": 7.940627207560656e-07, "logits/chosen": -0.13040529191493988, "logits/rejected": -0.08578811585903168, "logps/chosen": -51.44671630859375, "logps/rejected": -76.17324829101562, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 2.5372910499572754, "rewards/margins": 1.128786563873291, "rewards/rejected": 1.4085044860839844, "step": 5954 }, { "epoch": 0.97, "learning_rate": 7.939564177180984e-07, "logits/chosen": -0.47877052426338196, "logits/rejected": -0.49001505970954895, "logps/chosen": -7.937621593475342, "logps/rejected": -0.8909477591514587, "loss": 0.5678, "rewards/accuracies": 0.0, "rewards/chosen": -0.22228732705116272, "rewards/margins": -0.4773634672164917, "rewards/rejected": 0.255076140165329, "step": 5955 }, { "epoch": 0.97, "learning_rate": 7.938500943705242e-07, "logits/chosen": -0.5773898959159851, "logits/rejected": -0.6428197026252747, "logps/chosen": -192.5598602294922, "logps/rejected": -143.35211181640625, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": 2.8874619007110596, "rewards/margins": 1.1094894409179688, "rewards/rejected": 1.7779724597930908, "step": 5956 }, { "epoch": 0.97, "learning_rate": 7.937437507206888e-07, "logits/chosen": -1.0121914148330688, "logits/rejected": -0.9850935339927673, "logps/chosen": -114.644287109375, "logps/rejected": -93.47386169433594, "loss": 1.5596, "rewards/accuracies": 0.0, "rewards/chosen": 0.9097488522529602, "rewards/margins": -1.4281067848205566, "rewards/rejected": 2.337855577468872, "step": 5957 }, { "epoch": 0.97, "learning_rate": 7.936373867759399e-07, "logits/chosen": -0.5701216459274292, "logits/rejected": -0.5386752486228943, "logps/chosen": -59.96202087402344, "logps/rejected": -58.891971588134766, "loss": 1.2108, "rewards/accuracies": 0.0, "rewards/chosen": 1.1773452758789062, "rewards/margins": -1.4599246978759766, "rewards/rejected": 2.637269973754883, "step": 5958 }, { "epoch": 0.97, "learning_rate": 7.935310025436257e-07, "logits/chosen": -0.8468085527420044, "logits/rejected": -0.8093105554580688, "logps/chosen": -95.91230773925781, "logps/rejected": -97.38131713867188, "loss": 0.6789, "rewards/accuracies": 0.0, "rewards/chosen": 0.8369079828262329, "rewards/margins": -0.04444122314453125, "rewards/rejected": 0.8813492059707642, "step": 5959 }, { "epoch": 0.97, "learning_rate": 7.934245980310969e-07, "logits/chosen": -1.0587180852890015, "logits/rejected": -0.9740852117538452, "logps/chosen": -70.1580581665039, "logps/rejected": -87.93173217773438, "loss": 0.3195, "rewards/accuracies": 1.0, "rewards/chosen": 3.7487106323242188, "rewards/margins": 0.1306319236755371, "rewards/rejected": 3.6180787086486816, "step": 5960 }, { "epoch": 0.97, "learning_rate": 7.933181732457046e-07, "logits/chosen": -0.48428645730018616, "logits/rejected": -0.5242030620574951, "logps/chosen": -78.48684692382812, "logps/rejected": -126.42337036132812, "loss": 0.8356, "rewards/accuracies": 0.0, "rewards/chosen": 0.7175262570381165, "rewards/margins": -0.2571045160293579, "rewards/rejected": 0.9746307730674744, "step": 5961 }, { "epoch": 0.97, "learning_rate": 7.932117281948021e-07, "logits/chosen": -0.8282221555709839, "logits/rejected": -1.146126627922058, "logps/chosen": -125.74853515625, "logps/rejected": -37.21148681640625, "loss": 0.7046, "rewards/accuracies": 1.0, "rewards/chosen": 0.7752701044082642, "rewards/margins": 0.6014969348907471, "rewards/rejected": 0.17377319931983948, "step": 5962 }, { "epoch": 0.97, "learning_rate": 7.931052628857435e-07, "logits/chosen": -0.30065470933914185, "logits/rejected": -0.6173065900802612, "logps/chosen": -72.50908660888672, "logps/rejected": -61.127079010009766, "loss": 0.8243, "rewards/accuracies": 0.0, "rewards/chosen": 1.0967124700546265, "rewards/margins": -0.7598240375518799, "rewards/rejected": 1.8565365076065063, "step": 5963 }, { "epoch": 0.97, "learning_rate": 7.929987773258846e-07, "logits/chosen": -0.8806843161582947, "logits/rejected": -0.9126670956611633, "logps/chosen": -107.81293487548828, "logps/rejected": -81.13858032226562, "loss": 0.5886, "rewards/accuracies": 0.0, "rewards/chosen": 0.6877945065498352, "rewards/margins": -0.7029083371162415, "rewards/rejected": 1.3907028436660767, "step": 5964 }, { "epoch": 0.97, "learning_rate": 7.928922715225826e-07, "logits/chosen": -0.8531185984611511, "logits/rejected": -0.8425326347351074, "logps/chosen": -121.00161743164062, "logps/rejected": -48.133079528808594, "loss": 0.874, "rewards/accuracies": 0.0, "rewards/chosen": 0.3750961422920227, "rewards/margins": -1.4713523387908936, "rewards/rejected": 1.846448540687561, "step": 5965 }, { "epoch": 0.97, "learning_rate": 7.92785745483196e-07, "logits/chosen": -0.5794451236724854, "logits/rejected": -0.6484608054161072, "logps/chosen": -109.18487548828125, "logps/rejected": -109.31948852539062, "loss": 0.5975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9336555600166321, "rewards/margins": -0.8118926882743835, "rewards/rejected": 1.7455482482910156, "step": 5966 }, { "epoch": 0.97, "learning_rate": 7.926791992150847e-07, "logits/chosen": -0.557643473148346, "logits/rejected": -0.5548083186149597, "logps/chosen": -21.509654998779297, "logps/rejected": -33.86848449707031, "loss": 0.8357, "rewards/accuracies": 1.0, "rewards/chosen": 0.10089512169361115, "rewards/margins": 0.051349833607673645, "rewards/rejected": 0.0495452880859375, "step": 5967 }, { "epoch": 0.97, "learning_rate": 7.925726327256101e-07, "logits/chosen": -0.4539242088794708, "logits/rejected": -0.4490237832069397, "logps/chosen": -48.01337432861328, "logps/rejected": -47.060150146484375, "loss": 0.3834, "rewards/accuracies": 1.0, "rewards/chosen": 1.3478504419326782, "rewards/margins": 0.178558349609375, "rewards/rejected": 1.1692920923233032, "step": 5968 }, { "epoch": 0.97, "learning_rate": 7.924660460221351e-07, "logits/chosen": -0.5280419588088989, "logits/rejected": -0.5635668039321899, "logps/chosen": -64.86537170410156, "logps/rejected": -72.90116119384766, "loss": 0.9821, "rewards/accuracies": 1.0, "rewards/chosen": 0.797393798828125, "rewards/margins": 0.11177289485931396, "rewards/rejected": 0.685620903968811, "step": 5969 }, { "epoch": 0.97, "learning_rate": 7.923594391120236e-07, "logits/chosen": -0.6533975005149841, "logits/rejected": -0.6154752969741821, "logps/chosen": -52.311458587646484, "logps/rejected": -55.586402893066406, "loss": 1.4393, "rewards/accuracies": 0.0, "rewards/chosen": 0.8275501132011414, "rewards/margins": -0.9470470547676086, "rewards/rejected": 1.77459716796875, "step": 5970 }, { "epoch": 0.97, "learning_rate": 7.92252812002641e-07, "logits/chosen": -0.2124582976102829, "logits/rejected": -0.22938987612724304, "logps/chosen": -110.05580139160156, "logps/rejected": -53.71394729614258, "loss": 0.8111, "rewards/accuracies": 0.0, "rewards/chosen": 1.4892876148223877, "rewards/margins": -0.5899882316589355, "rewards/rejected": 2.0792758464813232, "step": 5971 }, { "epoch": 0.97, "learning_rate": 7.921461647013546e-07, "logits/chosen": -0.6709396243095398, "logits/rejected": -0.6431462168693542, "logps/chosen": -89.22183990478516, "logps/rejected": -126.26530456542969, "loss": 0.7644, "rewards/accuracies": 0.0, "rewards/chosen": 2.2438530921936035, "rewards/margins": -1.1922996044158936, "rewards/rejected": 3.436152696609497, "step": 5972 }, { "epoch": 0.97, "learning_rate": 7.920394972155324e-07, "logits/chosen": -0.5106766819953918, "logits/rejected": -0.5847810506820679, "logps/chosen": -81.64409637451172, "logps/rejected": -92.00650024414062, "loss": 0.9982, "rewards/accuracies": 0.0, "rewards/chosen": 1.5209931135177612, "rewards/margins": -1.7554367780685425, "rewards/rejected": 3.2764298915863037, "step": 5973 }, { "epoch": 0.97, "learning_rate": 7.919328095525444e-07, "logits/chosen": -0.5048211216926575, "logits/rejected": -0.5060390830039978, "logps/chosen": -28.038761138916016, "logps/rejected": -52.690879821777344, "loss": 0.541, "rewards/accuracies": 1.0, "rewards/chosen": 1.6188381910324097, "rewards/margins": 0.1178966760635376, "rewards/rejected": 1.500941514968872, "step": 5974 }, { "epoch": 0.97, "learning_rate": 7.918261017197614e-07, "logits/chosen": -0.8758875131607056, "logits/rejected": -0.8747400641441345, "logps/chosen": -62.257728576660156, "logps/rejected": -78.79256439208984, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 2.6475021839141846, "rewards/margins": 0.6698082685470581, "rewards/rejected": 1.9776939153671265, "step": 5975 }, { "epoch": 0.97, "learning_rate": 7.917193737245562e-07, "logits/chosen": -0.7074010968208313, "logits/rejected": -0.4325409233570099, "logps/chosen": -131.65460205078125, "logps/rejected": -50.90916061401367, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 5.531547546386719, "rewards/margins": 4.6986565589904785, "rewards/rejected": 0.8328911066055298, "step": 5976 }, { "epoch": 0.97, "learning_rate": 7.916126255743024e-07, "logits/chosen": -0.8161885142326355, "logits/rejected": -0.7364746928215027, "logps/chosen": -102.25514221191406, "logps/rejected": -130.92459106445312, "loss": 0.1859, "rewards/accuracies": 1.0, "rewards/chosen": 3.787449598312378, "rewards/margins": 1.2535598278045654, "rewards/rejected": 2.5338897705078125, "step": 5977 }, { "epoch": 0.97, "learning_rate": 7.915058572763756e-07, "logits/chosen": -0.5405439138412476, "logits/rejected": -0.5266421437263489, "logps/chosen": -260.24798583984375, "logps/rejected": -101.0224609375, "loss": 0.3447, "rewards/accuracies": 1.0, "rewards/chosen": 5.166861057281494, "rewards/margins": 2.5795249938964844, "rewards/rejected": 2.5873360633850098, "step": 5978 }, { "epoch": 0.97, "learning_rate": 7.913990688381522e-07, "logits/chosen": -1.114344596862793, "logits/rejected": -1.0793555974960327, "logps/chosen": -83.79859924316406, "logps/rejected": -89.38992309570312, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 2.6796631813049316, "rewards/margins": 1.7746925354003906, "rewards/rejected": 0.9049705862998962, "step": 5979 }, { "epoch": 0.97, "learning_rate": 7.912922602670103e-07, "logits/chosen": -0.6257875561714172, "logits/rejected": -0.6491501331329346, "logps/chosen": -52.49509048461914, "logps/rejected": -92.77108764648438, "loss": 0.4854, "rewards/accuracies": 0.0, "rewards/chosen": 0.4558204710483551, "rewards/margins": -0.19997939467430115, "rewards/rejected": 0.6557998657226562, "step": 5980 }, { "epoch": 0.97, "learning_rate": 7.911854315703297e-07, "logits/chosen": -1.2755359411239624, "logits/rejected": -1.2056430578231812, "logps/chosen": -140.89122009277344, "logps/rejected": -162.47528076171875, "loss": 1.3463, "rewards/accuracies": 0.0, "rewards/chosen": 0.3913620114326477, "rewards/margins": -2.5484743118286133, "rewards/rejected": 2.939836263656616, "step": 5981 }, { "epoch": 0.97, "learning_rate": 7.910785827554908e-07, "logits/chosen": -0.7536048293113708, "logits/rejected": -0.7773340344429016, "logps/chosen": -94.02397155761719, "logps/rejected": -128.15985107421875, "loss": 2.0805, "rewards/accuracies": 0.0, "rewards/chosen": 0.4940536618232727, "rewards/margins": -1.4739608764648438, "rewards/rejected": 1.9680145978927612, "step": 5982 }, { "epoch": 0.97, "learning_rate": 7.909717138298762e-07, "logits/chosen": -0.24744611978530884, "logits/rejected": -0.30045798420906067, "logps/chosen": -122.19625854492188, "logps/rejected": -85.32994842529297, "loss": 1.1563, "rewards/accuracies": 0.0, "rewards/chosen": 1.5681930780410767, "rewards/margins": -1.2418831586837769, "rewards/rejected": 2.8100762367248535, "step": 5983 }, { "epoch": 0.97, "learning_rate": 7.908648248008691e-07, "logits/chosen": -0.9344719648361206, "logits/rejected": -0.7961843013763428, "logps/chosen": -152.05532836914062, "logps/rejected": -136.72171020507812, "loss": 0.5162, "rewards/accuracies": 0.0, "rewards/chosen": 5.712991237640381, "rewards/margins": -0.48076820373535156, "rewards/rejected": 6.193759441375732, "step": 5984 }, { "epoch": 0.97, "learning_rate": 7.90757915675855e-07, "logits/chosen": -0.8139150142669678, "logits/rejected": -0.6981330513954163, "logps/chosen": -67.026611328125, "logps/rejected": -122.75068664550781, "loss": 2.5668, "rewards/accuracies": 0.0, "rewards/chosen": 2.1728456020355225, "rewards/margins": -5.087801933288574, "rewards/rejected": 7.260647773742676, "step": 5985 }, { "epoch": 0.97, "learning_rate": 7.906509864622201e-07, "logits/chosen": -1.0047119855880737, "logits/rejected": -0.9843722581863403, "logps/chosen": -84.41554260253906, "logps/rejected": -74.13902282714844, "loss": 1.9433, "rewards/accuracies": 0.0, "rewards/chosen": 1.2324516773223877, "rewards/margins": -1.335961103439331, "rewards/rejected": 2.5684127807617188, "step": 5986 }, { "epoch": 0.97, "learning_rate": 7.905440371673522e-07, "logits/chosen": -0.45732277631759644, "logits/rejected": -0.42487165331840515, "logps/chosen": -54.49391555786133, "logps/rejected": -1.880581021308899, "loss": 1.0961, "rewards/accuracies": 0.0, "rewards/chosen": 0.4551410675048828, "rewards/margins": -0.12765711545944214, "rewards/rejected": 0.582798182964325, "step": 5987 }, { "epoch": 0.97, "learning_rate": 7.904370677986403e-07, "logits/chosen": -1.1210439205169678, "logits/rejected": -1.090563416481018, "logps/chosen": -169.85861206054688, "logps/rejected": -120.37520599365234, "loss": 1.7707, "rewards/accuracies": 0.0, "rewards/chosen": 2.8161118030548096, "rewards/margins": -2.6102426052093506, "rewards/rejected": 5.42635440826416, "step": 5988 }, { "epoch": 0.97, "learning_rate": 7.903300783634754e-07, "logits/chosen": 0.06214369088411331, "logits/rejected": 0.06214369088411331, "logps/chosen": -9.370349884033203, "logps/rejected": -9.370349884033203, "loss": 0.7211, "rewards/accuracies": 0.0, "rewards/chosen": 0.3172919452190399, "rewards/margins": 0.0, "rewards/rejected": 0.3172919452190399, "step": 5989 }, { "epoch": 0.97, "learning_rate": 7.90223068869249e-07, "logits/chosen": -0.5829223394393921, "logits/rejected": -0.5096240043640137, "logps/chosen": -64.85403442382812, "logps/rejected": -74.92691802978516, "loss": 0.2408, "rewards/accuracies": 1.0, "rewards/chosen": 2.3533523082733154, "rewards/margins": 0.5482001304626465, "rewards/rejected": 1.805152177810669, "step": 5990 }, { "epoch": 0.97, "learning_rate": 7.90116039323355e-07, "logits/chosen": -0.5556908845901489, "logits/rejected": -0.4723483622074127, "logps/chosen": -35.956459045410156, "logps/rejected": -9.913601875305176, "loss": 0.1299, "rewards/accuracies": 1.0, "rewards/chosen": 1.9016224145889282, "rewards/margins": 1.2725306749343872, "rewards/rejected": 0.629091739654541, "step": 5991 }, { "epoch": 0.97, "learning_rate": 7.900089897331874e-07, "logits/chosen": -0.7771406769752502, "logits/rejected": -0.6570599675178528, "logps/chosen": -119.26426696777344, "logps/rejected": -64.02481079101562, "loss": 0.5969, "rewards/accuracies": 1.0, "rewards/chosen": 4.369275093078613, "rewards/margins": 2.0331246852874756, "rewards/rejected": 2.3361504077911377, "step": 5992 }, { "epoch": 0.97, "learning_rate": 7.89901920106143e-07, "logits/chosen": -0.6779735684394836, "logits/rejected": -0.5748369097709656, "logps/chosen": -109.0903091430664, "logps/rejected": -107.65458679199219, "loss": 1.6333, "rewards/accuracies": 0.0, "rewards/chosen": 0.4074257016181946, "rewards/margins": -2.6009743213653564, "rewards/rejected": 3.0083999633789062, "step": 5993 }, { "epoch": 0.97, "learning_rate": 7.897948304496188e-07, "logits/chosen": -0.4265505373477936, "logits/rejected": -0.34377944469451904, "logps/chosen": -38.578826904296875, "logps/rejected": -29.93702507019043, "loss": 0.4228, "rewards/accuracies": 0.0, "rewards/chosen": 0.685925304889679, "rewards/margins": -0.2138635516166687, "rewards/rejected": 0.8997888565063477, "step": 5994 }, { "epoch": 0.97, "learning_rate": 7.896877207710139e-07, "logits/chosen": -0.634040117263794, "logits/rejected": -0.61662358045578, "logps/chosen": -115.65858459472656, "logps/rejected": -78.34834289550781, "loss": 1.148, "rewards/accuracies": 0.0, "rewards/chosen": 0.5928459167480469, "rewards/margins": -1.9345276355743408, "rewards/rejected": 2.5273735523223877, "step": 5995 }, { "epoch": 0.97, "learning_rate": 7.895805910777286e-07, "logits/chosen": -0.5721690058708191, "logits/rejected": -0.6018588542938232, "logps/chosen": -79.55499267578125, "logps/rejected": -65.19888305664062, "loss": 0.5663, "rewards/accuracies": 0.0, "rewards/chosen": 1.2117111682891846, "rewards/margins": -0.6904418468475342, "rewards/rejected": 1.9021530151367188, "step": 5996 }, { "epoch": 0.97, "learning_rate": 7.894734413771645e-07, "logits/chosen": -0.47053393721580505, "logits/rejected": -0.47369059920310974, "logps/chosen": -2.8578224182128906, "logps/rejected": -1.4635872840881348, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 0.353492796421051, "rewards/margins": 0.08525082468986511, "rewards/rejected": 0.2682419717311859, "step": 5997 }, { "epoch": 0.97, "learning_rate": 7.893662716767246e-07, "logits/chosen": -0.5853433012962341, "logits/rejected": -0.5145816802978516, "logps/chosen": -118.59555053710938, "logps/rejected": -62.74169158935547, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 1.054901123046875, "rewards/margins": 0.6204169988632202, "rewards/rejected": 0.4344840943813324, "step": 5998 }, { "epoch": 0.97, "learning_rate": 7.892590819838134e-07, "logits/chosen": -1.1066983938217163, "logits/rejected": -1.0625181198120117, "logps/chosen": -81.05143737792969, "logps/rejected": -275.9842529296875, "loss": 1.3551, "rewards/accuracies": 0.0, "rewards/chosen": 0.4980575740337372, "rewards/margins": -2.0003280639648438, "rewards/rejected": 2.4983856678009033, "step": 5999 }, { "epoch": 0.97, "learning_rate": 7.891518723058366e-07, "logits/chosen": -0.6370460391044617, "logits/rejected": -0.671550452709198, "logps/chosen": -55.52252197265625, "logps/rejected": -57.61436462402344, "loss": 1.2007, "rewards/accuracies": 0.0, "rewards/chosen": 0.8822822570800781, "rewards/margins": -1.2074241638183594, "rewards/rejected": 2.0897064208984375, "step": 6000 }, { "epoch": 0.97, "learning_rate": 7.890446426502014e-07, "logits/chosen": -0.6291772127151489, "logits/rejected": -0.6020772457122803, "logps/chosen": -57.424896240234375, "logps/rejected": -70.94580078125, "loss": 1.7753, "rewards/accuracies": 0.0, "rewards/chosen": 1.5023964643478394, "rewards/margins": -0.19218063354492188, "rewards/rejected": 1.6945770978927612, "step": 6001 }, { "epoch": 0.97, "learning_rate": 7.889373930243164e-07, "logits/chosen": -0.5276565551757812, "logits/rejected": -0.5091198086738586, "logps/chosen": -80.41973114013672, "logps/rejected": -43.2586669921875, "loss": 1.4267, "rewards/accuracies": 0.0, "rewards/chosen": 1.3985618352890015, "rewards/margins": -0.6263543367385864, "rewards/rejected": 2.024916172027588, "step": 6002 }, { "epoch": 0.97, "learning_rate": 7.888301234355914e-07, "logits/chosen": -0.7213847637176514, "logits/rejected": -0.5236448049545288, "logps/chosen": -183.5916748046875, "logps/rejected": -223.94488525390625, "loss": 0.4046, "rewards/accuracies": 0.0, "rewards/chosen": 4.20778226852417, "rewards/margins": -0.13998985290527344, "rewards/rejected": 4.347772121429443, "step": 6003 }, { "epoch": 0.97, "learning_rate": 7.887228338914378e-07, "logits/chosen": -0.7756687998771667, "logits/rejected": -0.7203055620193481, "logps/chosen": -176.37307739257812, "logps/rejected": -48.60205078125, "loss": 0.1225, "rewards/accuracies": 1.0, "rewards/chosen": 5.018838405609131, "rewards/margins": 2.700256109237671, "rewards/rejected": 2.31858229637146, "step": 6004 }, { "epoch": 0.97, "learning_rate": 7.886155243992683e-07, "logits/chosen": -0.5918712615966797, "logits/rejected": -0.5776358246803284, "logps/chosen": -51.640228271484375, "logps/rejected": -46.920318603515625, "loss": 0.8061, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978805541992188, "rewards/margins": 0.3119773864746094, "rewards/rejected": 0.5859031677246094, "step": 6005 }, { "epoch": 0.97, "learning_rate": 7.88508194966497e-07, "logits/chosen": -0.8977841138839722, "logits/rejected": -0.8548246026039124, "logps/chosen": -98.075439453125, "logps/rejected": -39.67646789550781, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 2.026175022125244, "rewards/margins": 1.7265019416809082, "rewards/rejected": 0.29967308044433594, "step": 6006 }, { "epoch": 0.98, "learning_rate": 7.884008456005393e-07, "logits/chosen": -0.5878233313560486, "logits/rejected": -0.5871185660362244, "logps/chosen": -86.7419662475586, "logps/rejected": -120.33740234375, "loss": 1.1573, "rewards/accuracies": 0.0, "rewards/chosen": 1.3344993591308594, "rewards/margins": -1.3451576232910156, "rewards/rejected": 2.679656982421875, "step": 6007 }, { "epoch": 0.98, "learning_rate": 7.88293476308812e-07, "logits/chosen": -0.6392614245414734, "logits/rejected": -0.5907402634620667, "logps/chosen": -94.732666015625, "logps/rejected": -94.76036071777344, "loss": 0.9387, "rewards/accuracies": 0.0, "rewards/chosen": 0.9365066885948181, "rewards/margins": -1.6286513805389404, "rewards/rejected": 2.5651581287384033, "step": 6008 }, { "epoch": 0.98, "learning_rate": 7.881860870987336e-07, "logits/chosen": -0.6369599103927612, "logits/rejected": -0.6267456412315369, "logps/chosen": -108.325927734375, "logps/rejected": -62.888790130615234, "loss": 1.1537, "rewards/accuracies": 0.0, "rewards/chosen": -0.16773377358913422, "rewards/margins": -0.09134217351675034, "rewards/rejected": -0.07639160007238388, "step": 6009 }, { "epoch": 0.98, "learning_rate": 7.880786779777231e-07, "logits/chosen": -0.5694903135299683, "logits/rejected": -0.5732383131980896, "logps/chosen": -60.412254333496094, "logps/rejected": -52.30831527709961, "loss": 0.2936, "rewards/accuracies": 1.0, "rewards/chosen": 2.0757272243499756, "rewards/margins": 0.38212788105010986, "rewards/rejected": 1.6935993432998657, "step": 6010 }, { "epoch": 0.98, "learning_rate": 7.87971248953202e-07, "logits/chosen": -0.3621618449687958, "logits/rejected": -0.3621618449687958, "logps/chosen": -93.31340789794922, "logps/rejected": -93.31340789794922, "loss": 0.5639, "rewards/accuracies": 0.0, "rewards/chosen": 2.0205421447753906, "rewards/margins": 0.0, "rewards/rejected": 2.0205421447753906, "step": 6011 }, { "epoch": 0.98, "learning_rate": 7.878638000325923e-07, "logits/chosen": -0.7233253121376038, "logits/rejected": -0.5950868129730225, "logps/chosen": -73.13660430908203, "logps/rejected": -60.53215026855469, "loss": 0.0938, "rewards/accuracies": 1.0, "rewards/chosen": 2.678786516189575, "rewards/margins": 3.0182061195373535, "rewards/rejected": -0.33941957354545593, "step": 6012 }, { "epoch": 0.98, "learning_rate": 7.877563312233179e-07, "logits/chosen": -0.4028069078922272, "logits/rejected": -0.35138121247291565, "logps/chosen": -42.62087631225586, "logps/rejected": -70.09378051757812, "loss": 0.7914, "rewards/accuracies": 0.0, "rewards/chosen": 1.6660457849502563, "rewards/margins": -0.3606175184249878, "rewards/rejected": 2.026663303375244, "step": 6013 }, { "epoch": 0.98, "learning_rate": 7.876488425328037e-07, "logits/chosen": -0.623954176902771, "logits/rejected": -0.5920833349227905, "logps/chosen": -102.65821075439453, "logps/rejected": -63.20640563964844, "loss": 0.1211, "rewards/accuracies": 1.0, "rewards/chosen": 5.360622406005859, "rewards/margins": 3.5205893516540527, "rewards/rejected": 1.840032935142517, "step": 6014 }, { "epoch": 0.98, "learning_rate": 7.875413339684762e-07, "logits/chosen": -0.8769247531890869, "logits/rejected": -0.7307322025299072, "logps/chosen": -68.2790756225586, "logps/rejected": -17.96623992919922, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 3.4363296031951904, "rewards/margins": 2.9458627700805664, "rewards/rejected": 0.4904668927192688, "step": 6015 }, { "epoch": 0.98, "learning_rate": 7.874338055377634e-07, "logits/chosen": -0.6433239579200745, "logits/rejected": -0.6452235579490662, "logps/chosen": -203.1009521484375, "logps/rejected": -77.09394836425781, "loss": 0.0529, "rewards/accuracies": 1.0, "rewards/chosen": 3.8141114711761475, "rewards/margins": 2.838130235671997, "rewards/rejected": 0.9759811758995056, "step": 6016 }, { "epoch": 0.98, "learning_rate": 7.873262572480943e-07, "logits/chosen": -1.0551773309707642, "logits/rejected": -1.00041663646698, "logps/chosen": -105.8120346069336, "logps/rejected": -28.31856918334961, "loss": 0.7864, "rewards/accuracies": 1.0, "rewards/chosen": 1.4662201404571533, "rewards/margins": 0.9952487945556641, "rewards/rejected": 0.47097131609916687, "step": 6017 }, { "epoch": 0.98, "learning_rate": 7.872186891068995e-07, "logits/chosen": -0.5085724592208862, "logits/rejected": -0.4678012430667877, "logps/chosen": -77.30726623535156, "logps/rejected": -24.93765640258789, "loss": 0.9496, "rewards/accuracies": 0.0, "rewards/chosen": 1.6698821783065796, "rewards/margins": -0.4567059278488159, "rewards/rejected": 2.1265881061553955, "step": 6018 }, { "epoch": 0.98, "learning_rate": 7.871111011216109e-07, "logits/chosen": -0.6575573086738586, "logits/rejected": -0.6010036468505859, "logps/chosen": -52.75349807739258, "logps/rejected": -57.12218475341797, "loss": 0.5995, "rewards/accuracies": 1.0, "rewards/chosen": 1.5804821252822876, "rewards/margins": 0.6119266748428345, "rewards/rejected": 0.9685554504394531, "step": 6019 }, { "epoch": 0.98, "learning_rate": 7.870034932996621e-07, "logits/chosen": -0.674226701259613, "logits/rejected": -0.719005286693573, "logps/chosen": -59.32314682006836, "logps/rejected": -84.67472839355469, "loss": 1.2055, "rewards/accuracies": 0.0, "rewards/chosen": 0.7449024319648743, "rewards/margins": -1.7041332721710205, "rewards/rejected": 2.44903564453125, "step": 6020 }, { "epoch": 0.98, "learning_rate": 7.868958656484874e-07, "logits/chosen": -0.8486887216567993, "logits/rejected": -0.47468096017837524, "logps/chosen": -202.58023071289062, "logps/rejected": -97.34799194335938, "loss": 1.2368, "rewards/accuracies": 1.0, "rewards/chosen": 3.2893922328948975, "rewards/margins": 1.0644943714141846, "rewards/rejected": 2.224897861480713, "step": 6021 }, { "epoch": 0.98, "learning_rate": 7.86788218175523e-07, "logits/chosen": -0.5953947901725769, "logits/rejected": -0.5571495294570923, "logps/chosen": -141.6878662109375, "logps/rejected": -97.3912353515625, "loss": 0.8596, "rewards/accuracies": 0.0, "rewards/chosen": 0.7551590204238892, "rewards/margins": -1.299401879310608, "rewards/rejected": 2.054560899734497, "step": 6022 }, { "epoch": 0.98, "learning_rate": 7.866805508882064e-07, "logits/chosen": -1.0873878002166748, "logits/rejected": -1.0602338314056396, "logps/chosen": -74.45032501220703, "logps/rejected": -92.54734802246094, "loss": 0.7234, "rewards/accuracies": 0.0, "rewards/chosen": 2.247077226638794, "rewards/margins": -0.9861717224121094, "rewards/rejected": 3.2332489490509033, "step": 6023 }, { "epoch": 0.98, "learning_rate": 7.865728637939763e-07, "logits/chosen": -0.7625793814659119, "logits/rejected": -0.722531795501709, "logps/chosen": -270.1015625, "logps/rejected": -73.35071563720703, "loss": 1.1054, "rewards/accuracies": 1.0, "rewards/chosen": 2.3572633266448975, "rewards/margins": 0.9155663251876831, "rewards/rejected": 1.4416970014572144, "step": 6024 }, { "epoch": 0.98, "learning_rate": 7.864651569002729e-07, "logits/chosen": -0.5033558011054993, "logits/rejected": -0.5845891833305359, "logps/chosen": -98.62120056152344, "logps/rejected": -130.5699462890625, "loss": 1.2127, "rewards/accuracies": 0.0, "rewards/chosen": 1.3984383344650269, "rewards/margins": -1.8328741788864136, "rewards/rejected": 3.2313125133514404, "step": 6025 }, { "epoch": 0.98, "learning_rate": 7.863574302145378e-07, "logits/chosen": -0.4897652566432953, "logits/rejected": -0.4897652566432953, "logps/chosen": -98.45556640625, "logps/rejected": -98.45556640625, "loss": 0.4136, "rewards/accuracies": 0.0, "rewards/chosen": 1.7755447626113892, "rewards/margins": 0.0, "rewards/rejected": 1.7755447626113892, "step": 6026 }, { "epoch": 0.98, "learning_rate": 7.862496837442138e-07, "logits/chosen": -0.4579450488090515, "logits/rejected": -0.47392603754997253, "logps/chosen": -70.71593475341797, "logps/rejected": -89.8521499633789, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 1.7489632368087769, "rewards/margins": 0.10092854499816895, "rewards/rejected": 1.648034691810608, "step": 6027 }, { "epoch": 0.98, "learning_rate": 7.861419174967451e-07, "logits/chosen": -0.38501647114753723, "logits/rejected": -0.38734927773475647, "logps/chosen": -5.055344104766846, "logps/rejected": -6.578424453735352, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 0.21490469574928284, "rewards/margins": 0.19301076233386993, "rewards/rejected": 0.021893931552767754, "step": 6028 }, { "epoch": 0.98, "learning_rate": 7.860341314795775e-07, "logits/chosen": -0.9216092228889465, "logits/rejected": -0.9174534678459167, "logps/chosen": -86.28131103515625, "logps/rejected": -43.652854919433594, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": 1.2737091779708862, "rewards/margins": 0.1771073341369629, "rewards/rejected": 1.0966018438339233, "step": 6029 }, { "epoch": 0.98, "learning_rate": 7.859263257001577e-07, "logits/chosen": -0.5967868566513062, "logits/rejected": -0.5470763444900513, "logps/chosen": -62.66826629638672, "logps/rejected": -76.80964660644531, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 1.7394218444824219, "rewards/margins": 0.03499603271484375, "rewards/rejected": 1.7044258117675781, "step": 6030 }, { "epoch": 0.98, "learning_rate": 7.858185001659343e-07, "logits/chosen": -0.9118443131446838, "logits/rejected": -0.6954723000526428, "logps/chosen": -77.76622009277344, "logps/rejected": -52.370277404785156, "loss": 0.4449, "rewards/accuracies": 1.0, "rewards/chosen": 1.8884963989257812, "rewards/margins": 0.6047431230545044, "rewards/rejected": 1.2837532758712769, "step": 6031 }, { "epoch": 0.98, "learning_rate": 7.857106548843571e-07, "logits/chosen": -0.7195372581481934, "logits/rejected": -0.6334415078163147, "logps/chosen": -77.05146026611328, "logps/rejected": -22.85849380493164, "loss": 1.1022, "rewards/accuracies": 1.0, "rewards/chosen": 4.230929851531982, "rewards/margins": 3.3438501358032227, "rewards/rejected": 0.8870798349380493, "step": 6032 }, { "epoch": 0.98, "learning_rate": 7.85602789862877e-07, "logits/chosen": -0.5624560713768005, "logits/rejected": -0.6029834151268005, "logps/chosen": -53.224403381347656, "logps/rejected": -59.6163330078125, "loss": 0.3712, "rewards/accuracies": 0.0, "rewards/chosen": 2.193105459213257, "rewards/margins": -0.0780782699584961, "rewards/rejected": 2.271183729171753, "step": 6033 }, { "epoch": 0.98, "learning_rate": 7.854949051089465e-07, "logits/chosen": -0.27093595266342163, "logits/rejected": -0.2999603748321533, "logps/chosen": -99.13530731201172, "logps/rejected": -52.59269714355469, "loss": 1.4939, "rewards/accuracies": 0.0, "rewards/chosen": 0.044689178466796875, "rewards/margins": -2.2869338989257812, "rewards/rejected": 2.331623077392578, "step": 6034 }, { "epoch": 0.98, "learning_rate": 7.853870006300195e-07, "logits/chosen": -0.5006147027015686, "logits/rejected": -0.3970430791378021, "logps/chosen": -87.4033432006836, "logps/rejected": -84.2127685546875, "loss": 0.3794, "rewards/accuracies": 1.0, "rewards/chosen": 2.7712998390197754, "rewards/margins": 0.37671899795532227, "rewards/rejected": 2.394580841064453, "step": 6035 }, { "epoch": 0.98, "learning_rate": 7.852790764335511e-07, "logits/chosen": -0.5036982297897339, "logits/rejected": -0.45345696806907654, "logps/chosen": -86.30111694335938, "logps/rejected": -64.5096664428711, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 2.6187827587127686, "rewards/margins": 1.6040939092636108, "rewards/rejected": 1.0146888494491577, "step": 6036 }, { "epoch": 0.98, "learning_rate": 7.851711325269979e-07, "logits/chosen": -0.5627520680427551, "logits/rejected": -0.5756657123565674, "logps/chosen": -53.74674987792969, "logps/rejected": -124.26500701904297, "loss": 0.4804, "rewards/accuracies": 1.0, "rewards/chosen": 1.2373169660568237, "rewards/margins": 0.6967552900314331, "rewards/rejected": 0.5405616760253906, "step": 6037 }, { "epoch": 0.98, "learning_rate": 7.850631689178175e-07, "logits/chosen": -0.8257214426994324, "logits/rejected": -0.8390609622001648, "logps/chosen": -105.66755676269531, "logps/rejected": -178.10574340820312, "loss": 2.3663, "rewards/accuracies": 0.0, "rewards/chosen": 2.767047166824341, "rewards/margins": -4.412759780883789, "rewards/rejected": 7.179806709289551, "step": 6038 }, { "epoch": 0.98, "learning_rate": 7.849551856134697e-07, "logits/chosen": -0.62965327501297, "logits/rejected": -0.5339362025260925, "logps/chosen": -69.90997314453125, "logps/rejected": -77.26058959960938, "loss": 0.1765, "rewards/accuracies": 1.0, "rewards/chosen": 1.8097915649414062, "rewards/margins": 0.9534896612167358, "rewards/rejected": 0.8563019037246704, "step": 6039 }, { "epoch": 0.98, "learning_rate": 7.848471826214146e-07, "logits/chosen": -0.6679308414459229, "logits/rejected": -0.6756949424743652, "logps/chosen": -125.96363067626953, "logps/rejected": -117.96104431152344, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": 0.7926079034805298, "rewards/margins": 0.20893782377243042, "rewards/rejected": 0.5836700797080994, "step": 6040 }, { "epoch": 0.98, "learning_rate": 7.847391599491146e-07, "logits/chosen": 0.0032668658532202244, "logits/rejected": 0.0032668658532202244, "logps/chosen": -1.5667846202850342, "logps/rejected": -1.5667846202850342, "loss": 0.4704, "rewards/accuracies": 0.0, "rewards/chosen": 0.32182490825653076, "rewards/margins": 0.0, "rewards/rejected": 0.32182490825653076, "step": 6041 }, { "epoch": 0.98, "learning_rate": 7.84631117604033e-07, "logits/chosen": -0.7301875948905945, "logits/rejected": -0.7159328460693359, "logps/chosen": -85.33480834960938, "logps/rejected": -110.28813171386719, "loss": 1.9233, "rewards/accuracies": 1.0, "rewards/chosen": 0.9733414053916931, "rewards/margins": 0.3251304626464844, "rewards/rejected": 0.6482109427452087, "step": 6042 }, { "epoch": 0.98, "learning_rate": 7.845230555936342e-07, "logits/chosen": -0.424646258354187, "logits/rejected": -0.5807053446769714, "logps/chosen": -134.1439208984375, "logps/rejected": -120.3810043334961, "loss": 1.6779, "rewards/accuracies": 0.0, "rewards/chosen": 1.073693871498108, "rewards/margins": -2.7019262313842773, "rewards/rejected": 3.775620222091675, "step": 6043 }, { "epoch": 0.98, "learning_rate": 7.844149739253845e-07, "logits/chosen": -0.5255617499351501, "logits/rejected": -0.527938961982727, "logps/chosen": -83.99729919433594, "logps/rejected": -82.4618911743164, "loss": 0.589, "rewards/accuracies": 1.0, "rewards/chosen": 1.166235327720642, "rewards/margins": 0.09837484359741211, "rewards/rejected": 1.06786048412323, "step": 6044 }, { "epoch": 0.98, "learning_rate": 7.843068726067512e-07, "logits/chosen": -0.574617326259613, "logits/rejected": -0.5242756605148315, "logps/chosen": -72.98309326171875, "logps/rejected": -56.78296661376953, "loss": 0.5069, "rewards/accuracies": 0.0, "rewards/chosen": 1.8962326049804688, "rewards/margins": -0.13522648811340332, "rewards/rejected": 2.031459093093872, "step": 6045 }, { "epoch": 0.98, "learning_rate": 7.841987516452032e-07, "logits/chosen": -0.7051824331283569, "logits/rejected": -0.6298040151596069, "logps/chosen": -125.7015380859375, "logps/rejected": -32.043434143066406, "loss": 0.3001, "rewards/accuracies": 1.0, "rewards/chosen": 1.0824264287948608, "rewards/margins": 0.908404529094696, "rewards/rejected": 0.174021914601326, "step": 6046 }, { "epoch": 0.98, "learning_rate": 7.840906110482106e-07, "logits/chosen": -0.6815928816795349, "logits/rejected": -0.7010571956634521, "logps/chosen": -65.10659790039062, "logps/rejected": -111.89265441894531, "loss": 0.4574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8696533441543579, "rewards/margins": 0.13024139404296875, "rewards/rejected": 0.7394119501113892, "step": 6047 }, { "epoch": 0.98, "learning_rate": 7.839824508232448e-07, "logits/chosen": -0.7910014390945435, "logits/rejected": -0.6108272075653076, "logps/chosen": -157.32888793945312, "logps/rejected": -58.341365814208984, "loss": 0.433, "rewards/accuracies": 0.0, "rewards/chosen": 1.7686493396759033, "rewards/margins": -0.23727154731750488, "rewards/rejected": 2.005920886993408, "step": 6048 }, { "epoch": 0.98, "learning_rate": 7.838742709777788e-07, "logits/chosen": -0.7685444355010986, "logits/rejected": -0.76896071434021, "logps/chosen": -90.76594543457031, "logps/rejected": -50.24166488647461, "loss": 1.057, "rewards/accuracies": 0.0, "rewards/chosen": 2.5465989112854004, "rewards/margins": -0.05514788627624512, "rewards/rejected": 2.6017467975616455, "step": 6049 }, { "epoch": 0.98, "learning_rate": 7.837660715192866e-07, "logits/chosen": -0.47036778926849365, "logits/rejected": -0.47036778926849365, "logps/chosen": -40.77870559692383, "logps/rejected": -40.77870559692383, "loss": 0.3908, "rewards/accuracies": 0.0, "rewards/chosen": 1.695616602897644, "rewards/margins": 0.0, "rewards/rejected": 1.695616602897644, "step": 6050 }, { "epoch": 0.98, "learning_rate": 7.836578524552439e-07, "logits/chosen": -0.46237534284591675, "logits/rejected": -0.43962791562080383, "logps/chosen": -49.10059356689453, "logps/rejected": -50.611175537109375, "loss": 0.1855, "rewards/accuracies": 1.0, "rewards/chosen": 3.034855604171753, "rewards/margins": 0.9970283508300781, "rewards/rejected": 2.037827253341675, "step": 6051 }, { "epoch": 0.98, "learning_rate": 7.835496137931276e-07, "logits/chosen": -0.8929947018623352, "logits/rejected": -0.8349841833114624, "logps/chosen": -74.83126831054688, "logps/rejected": -11.672147750854492, "loss": 0.6564, "rewards/accuracies": 1.0, "rewards/chosen": 1.2912529706954956, "rewards/margins": 0.6799734830856323, "rewards/rejected": 0.6112794876098633, "step": 6052 }, { "epoch": 0.98, "learning_rate": 7.83441355540416e-07, "logits/chosen": -0.8048393130302429, "logits/rejected": -0.6645587086677551, "logps/chosen": -122.74559020996094, "logps/rejected": -85.39335632324219, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 4.812382698059082, "rewards/margins": 2.778825521469116, "rewards/rejected": 2.033557176589966, "step": 6053 }, { "epoch": 0.98, "learning_rate": 7.833330777045886e-07, "logits/chosen": -0.9782501459121704, "logits/rejected": -1.005979061126709, "logps/chosen": -126.95880889892578, "logps/rejected": -78.72186279296875, "loss": 0.2821, "rewards/accuracies": 1.0, "rewards/chosen": 2.981377363204956, "rewards/margins": 1.2937750816345215, "rewards/rejected": 1.6876022815704346, "step": 6054 }, { "epoch": 0.98, "learning_rate": 7.832247802931265e-07, "logits/chosen": -0.40457212924957275, "logits/rejected": -0.3818608522415161, "logps/chosen": -70.60005187988281, "logps/rejected": -59.13852310180664, "loss": 0.6492, "rewards/accuracies": 1.0, "rewards/chosen": 0.6116699576377869, "rewards/margins": 0.1842346489429474, "rewards/rejected": 0.4274353086948395, "step": 6055 }, { "epoch": 0.98, "learning_rate": 7.831164633135121e-07, "logits/chosen": -0.45303401350975037, "logits/rejected": -0.5186401605606079, "logps/chosen": -57.13508224487305, "logps/rejected": -60.50555419921875, "loss": 1.739, "rewards/accuracies": 0.0, "rewards/chosen": 0.7229068875312805, "rewards/margins": -1.6270503997802734, "rewards/rejected": 2.349957227706909, "step": 6056 }, { "epoch": 0.98, "learning_rate": 7.830081267732288e-07, "logits/chosen": -0.7075359225273132, "logits/rejected": -0.6650173664093018, "logps/chosen": -115.76331329345703, "logps/rejected": -60.55876159667969, "loss": 0.9678, "rewards/accuracies": 0.0, "rewards/chosen": 1.330178141593933, "rewards/margins": -0.9652992486953735, "rewards/rejected": 2.2954773902893066, "step": 6057 }, { "epoch": 0.98, "learning_rate": 7.828997706797618e-07, "logits/chosen": -0.4025854170322418, "logits/rejected": -0.4046255052089691, "logps/chosen": -3.8763856887817383, "logps/rejected": -0.7493481636047363, "loss": 0.9743, "rewards/accuracies": 0.0, "rewards/chosen": 0.16222719848155975, "rewards/margins": -0.09129290282726288, "rewards/rejected": 0.25352010130882263, "step": 6058 }, { "epoch": 0.98, "learning_rate": 7.827913950405975e-07, "logits/chosen": -0.41457828879356384, "logits/rejected": -0.3156910538673401, "logps/chosen": -58.39088439941406, "logps/rejected": -16.127105712890625, "loss": 1.2717, "rewards/accuracies": 1.0, "rewards/chosen": 1.9576904773712158, "rewards/margins": 1.4154446125030518, "rewards/rejected": 0.5422458648681641, "step": 6059 }, { "epoch": 0.98, "learning_rate": 7.826829998632237e-07, "logits/chosen": -0.5997589230537415, "logits/rejected": -0.5997589230537415, "logps/chosen": -57.228023529052734, "logps/rejected": -57.228023529052734, "loss": 1.9776, "rewards/accuracies": 0.0, "rewards/chosen": 2.2140958309173584, "rewards/margins": 0.0, "rewards/rejected": 2.2140958309173584, "step": 6060 }, { "epoch": 0.98, "learning_rate": 7.825745851551293e-07, "logits/chosen": -0.6984378695487976, "logits/rejected": -0.6988528370857239, "logps/chosen": -108.46678161621094, "logps/rejected": -166.33551025390625, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 3.8804001808166504, "rewards/margins": 0.9377763271331787, "rewards/rejected": 2.9426238536834717, "step": 6061 }, { "epoch": 0.98, "learning_rate": 7.824661509238048e-07, "logits/chosen": -0.1090533658862114, "logits/rejected": -0.1454085409641266, "logps/chosen": -19.771047592163086, "logps/rejected": -47.805419921875, "loss": 0.7048, "rewards/accuracies": 1.0, "rewards/chosen": -0.02325878106057644, "rewards/margins": 0.20310574769973755, "rewards/rejected": -0.22636452317237854, "step": 6062 }, { "epoch": 0.98, "learning_rate": 7.82357697176742e-07, "logits/chosen": -0.6668799519538879, "logits/rejected": -0.46790581941604614, "logps/chosen": -119.39183807373047, "logps/rejected": -53.591712951660156, "loss": 0.2379, "rewards/accuracies": 1.0, "rewards/chosen": 5.368262767791748, "rewards/margins": 2.0441019535064697, "rewards/rejected": 3.3241608142852783, "step": 6063 }, { "epoch": 0.98, "learning_rate": 7.82249223921434e-07, "logits/chosen": -0.7556968331336975, "logits/rejected": -0.6798568367958069, "logps/chosen": -121.28797149658203, "logps/rejected": -90.76751708984375, "loss": 0.9455, "rewards/accuracies": 1.0, "rewards/chosen": 4.8844780921936035, "rewards/margins": 2.6197776794433594, "rewards/rejected": 2.264700412750244, "step": 6064 }, { "epoch": 0.98, "learning_rate": 7.821407311653752e-07, "logits/chosen": -0.6938955187797546, "logits/rejected": -0.6699146628379822, "logps/chosen": -71.35993957519531, "logps/rejected": -40.80516052246094, "loss": 1.0297, "rewards/accuracies": 0.0, "rewards/chosen": 1.2026313543319702, "rewards/margins": -0.39303481578826904, "rewards/rejected": 1.5956661701202393, "step": 6065 }, { "epoch": 0.98, "learning_rate": 7.820322189160617e-07, "logits/chosen": -0.7252657413482666, "logits/rejected": -0.6318453550338745, "logps/chosen": -88.06904602050781, "logps/rejected": -61.65729522705078, "loss": 0.2067, "rewards/accuracies": 1.0, "rewards/chosen": 3.4489364624023438, "rewards/margins": 0.9557287693023682, "rewards/rejected": 2.4932076930999756, "step": 6066 }, { "epoch": 0.98, "learning_rate": 7.819236871809903e-07, "logits/chosen": -0.721513569355011, "logits/rejected": -0.7191702127456665, "logps/chosen": -42.36436080932617, "logps/rejected": -54.34034729003906, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": 1.436483383178711, "rewards/margins": 0.06098973751068115, "rewards/rejected": 1.3754936456680298, "step": 6067 }, { "epoch": 0.98, "learning_rate": 7.818151359676599e-07, "logits/chosen": -0.5928862690925598, "logits/rejected": -0.6647244691848755, "logps/chosen": -62.3873176574707, "logps/rejected": -117.69412231445312, "loss": 0.8766, "rewards/accuracies": 0.0, "rewards/chosen": 0.3509548306465149, "rewards/margins": -0.4373706579208374, "rewards/rejected": 0.7883254885673523, "step": 6068 }, { "epoch": 0.99, "learning_rate": 7.8170656528357e-07, "logits/chosen": -0.647874653339386, "logits/rejected": -0.6506539583206177, "logps/chosen": -89.89987182617188, "logps/rejected": -82.06658172607422, "loss": 0.9936, "rewards/accuracies": 0.0, "rewards/chosen": 1.5368156433105469, "rewards/margins": -1.6408729553222656, "rewards/rejected": 3.1776885986328125, "step": 6069 }, { "epoch": 0.99, "learning_rate": 7.815979751362221e-07, "logits/chosen": -0.6449651122093201, "logits/rejected": -0.6581882834434509, "logps/chosen": -26.08690643310547, "logps/rejected": -7.533121109008789, "loss": 0.381, "rewards/accuracies": 0.0, "rewards/chosen": 0.10988884419202805, "rewards/margins": -0.1089087501168251, "rewards/rejected": 0.21879759430885315, "step": 6070 }, { "epoch": 0.99, "learning_rate": 7.814893655331185e-07, "logits/chosen": -0.8309701085090637, "logits/rejected": -0.8309701085090637, "logps/chosen": -70.58708190917969, "logps/rejected": -70.58708190917969, "loss": 1.3759, "rewards/accuracies": 0.0, "rewards/chosen": 1.3539718389511108, "rewards/margins": 0.0, "rewards/rejected": 1.3539718389511108, "step": 6071 }, { "epoch": 0.99, "learning_rate": 7.813807364817633e-07, "logits/chosen": -1.1668150424957275, "logits/rejected": -1.1977399587631226, "logps/chosen": -66.8436508178711, "logps/rejected": -89.75513458251953, "loss": 1.2484, "rewards/accuracies": 0.0, "rewards/chosen": 1.2532570362091064, "rewards/margins": -1.1172103881835938, "rewards/rejected": 2.3704674243927, "step": 6072 }, { "epoch": 0.99, "learning_rate": 7.812720879896616e-07, "logits/chosen": -0.617725133895874, "logits/rejected": -0.5686054825782776, "logps/chosen": -47.390052795410156, "logps/rejected": -15.147529602050781, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 1.6387344598770142, "rewards/margins": 0.7311790585517883, "rewards/rejected": 0.9075554013252258, "step": 6073 }, { "epoch": 0.99, "learning_rate": 7.811634200643201e-07, "logits/chosen": -0.44235309958457947, "logits/rejected": -0.238063782453537, "logps/chosen": -99.49915313720703, "logps/rejected": -21.99486541748047, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 3.271871328353882, "rewards/margins": 3.0012543201446533, "rewards/rejected": 0.2706170976161957, "step": 6074 }, { "epoch": 0.99, "learning_rate": 7.810547327132467e-07, "logits/chosen": -0.6070376038551331, "logits/rejected": -0.6251903176307678, "logps/chosen": -140.7792205810547, "logps/rejected": -78.4530029296875, "loss": 1.8273, "rewards/accuracies": 0.0, "rewards/chosen": 0.5541931390762329, "rewards/margins": -2.0779190063476562, "rewards/rejected": 2.6321122646331787, "step": 6075 }, { "epoch": 0.99, "learning_rate": 7.809460259439506e-07, "logits/chosen": -0.5026901960372925, "logits/rejected": -0.5486552119255066, "logps/chosen": -111.63803100585938, "logps/rejected": -91.76254272460938, "loss": 1.9869, "rewards/accuracies": 0.0, "rewards/chosen": 1.211085557937622, "rewards/margins": -2.381580352783203, "rewards/rejected": 3.592665910720825, "step": 6076 }, { "epoch": 0.99, "learning_rate": 7.808372997639423e-07, "logits/chosen": -0.9777165055274963, "logits/rejected": -0.964205801486969, "logps/chosen": -134.8836669921875, "logps/rejected": -87.38487243652344, "loss": 0.6062, "rewards/accuracies": 0.0, "rewards/chosen": 3.5423035621643066, "rewards/margins": -0.7767438888549805, "rewards/rejected": 4.319047451019287, "step": 6077 }, { "epoch": 0.99, "learning_rate": 7.80728554180734e-07, "logits/chosen": -0.6652429699897766, "logits/rejected": -0.6083773970603943, "logps/chosen": -55.67402648925781, "logps/rejected": -51.47119140625, "loss": 0.8152, "rewards/accuracies": 1.0, "rewards/chosen": 2.480181932449341, "rewards/margins": 0.03421926498413086, "rewards/rejected": 2.44596266746521, "step": 6078 }, { "epoch": 0.99, "learning_rate": 7.80619789201839e-07, "logits/chosen": -0.4982597231864929, "logits/rejected": -0.5061483383178711, "logps/chosen": -86.20926666259766, "logps/rejected": -42.61017608642578, "loss": 0.9412, "rewards/accuracies": 0.0, "rewards/chosen": -0.048780061304569244, "rewards/margins": -1.702904224395752, "rewards/rejected": 1.654124140739441, "step": 6079 }, { "epoch": 0.99, "learning_rate": 7.805110048347718e-07, "logits/chosen": -0.4090745449066162, "logits/rejected": -0.40715983510017395, "logps/chosen": -48.80567169189453, "logps/rejected": -81.37904357910156, "loss": 0.72, "rewards/accuracies": 1.0, "rewards/chosen": 1.2293685674667358, "rewards/margins": 0.5211212038993835, "rewards/rejected": 0.7082473635673523, "step": 6080 }, { "epoch": 0.99, "learning_rate": 7.804022010870483e-07, "logits/chosen": -0.27322715520858765, "logits/rejected": -0.2593494653701782, "logps/chosen": -3.562163829803467, "logps/rejected": -34.827430725097656, "loss": 0.2798, "rewards/accuracies": 1.0, "rewards/chosen": 0.36369481682777405, "rewards/margins": 0.5880333185195923, "rewards/rejected": -0.22433853149414062, "step": 6081 }, { "epoch": 0.99, "learning_rate": 7.802933779661859e-07, "logits/chosen": -1.3580658435821533, "logits/rejected": -1.3316372632980347, "logps/chosen": -63.51673126220703, "logps/rejected": -84.355224609375, "loss": 0.6283, "rewards/accuracies": 0.0, "rewards/chosen": 2.2522530555725098, "rewards/margins": -0.3525810241699219, "rewards/rejected": 2.6048340797424316, "step": 6082 }, { "epoch": 0.99, "learning_rate": 7.801845354797032e-07, "logits/chosen": -0.5608907341957092, "logits/rejected": -0.5506808757781982, "logps/chosen": -120.19315338134766, "logps/rejected": -116.80839538574219, "loss": 0.6438, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570030212402344, "rewards/margins": 0.8451118469238281, "rewards/rejected": 0.21189117431640625, "step": 6083 }, { "epoch": 0.99, "learning_rate": 7.800756736351202e-07, "logits/chosen": -0.6111529469490051, "logits/rejected": -0.5295138359069824, "logps/chosen": -118.15907287597656, "logps/rejected": -61.88850402832031, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": 4.6822710037231445, "rewards/margins": 3.098221778869629, "rewards/rejected": 1.5840492248535156, "step": 6084 }, { "epoch": 0.99, "learning_rate": 7.799667924399584e-07, "logits/chosen": -0.6435942053794861, "logits/rejected": -0.5145096182823181, "logps/chosen": -61.906742095947266, "logps/rejected": -55.36420822143555, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 2.0446438789367676, "rewards/margins": 0.054209232330322266, "rewards/rejected": 1.9904346466064453, "step": 6085 }, { "epoch": 0.99, "learning_rate": 7.798578919017402e-07, "logits/chosen": -0.8264632821083069, "logits/rejected": -0.8691927790641785, "logps/chosen": -250.9614715576172, "logps/rejected": -52.172279357910156, "loss": 0.1095, "rewards/accuracies": 1.0, "rewards/chosen": 3.6103897094726562, "rewards/margins": 1.724565863609314, "rewards/rejected": 1.8858238458633423, "step": 6086 }, { "epoch": 0.99, "learning_rate": 7.797489720279898e-07, "logits/chosen": -0.4416298270225525, "logits/rejected": -0.4510457217693329, "logps/chosen": -101.07522583007812, "logps/rejected": -64.25423431396484, "loss": 0.4564, "rewards/accuracies": 0.0, "rewards/chosen": 0.8270263671875, "rewards/margins": -0.054354846477508545, "rewards/rejected": 0.8813812136650085, "step": 6087 }, { "epoch": 0.99, "learning_rate": 7.796400328262324e-07, "logits/chosen": -0.7564566135406494, "logits/rejected": -0.6674492955207825, "logps/chosen": -152.01097106933594, "logps/rejected": -75.89958953857422, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": 6.372941493988037, "rewards/margins": 2.978447675704956, "rewards/rejected": 3.394493818283081, "step": 6088 }, { "epoch": 0.99, "learning_rate": 7.795310743039947e-07, "logits/chosen": -0.7102295756340027, "logits/rejected": -0.6942546963691711, "logps/chosen": -56.534000396728516, "logps/rejected": -33.30430603027344, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": 1.0859692096710205, "rewards/margins": 0.20455247163772583, "rewards/rejected": 0.8814167380332947, "step": 6089 }, { "epoch": 0.99, "learning_rate": 7.794220964688048e-07, "logits/chosen": -0.8667934536933899, "logits/rejected": -0.8212531208992004, "logps/chosen": -85.72653198242188, "logps/rejected": -73.5771255493164, "loss": 0.7212, "rewards/accuracies": 0.0, "rewards/chosen": 0.365438848733902, "rewards/margins": -1.1673859357833862, "rewards/rejected": 1.5328247547149658, "step": 6090 }, { "epoch": 0.99, "learning_rate": 7.793130993281918e-07, "logits/chosen": -0.44265615940093994, "logits/rejected": -0.3428053557872772, "logps/chosen": -48.913692474365234, "logps/rejected": -82.73089599609375, "loss": 1.4688, "rewards/accuracies": 1.0, "rewards/chosen": 2.2863941192626953, "rewards/margins": 1.094349980354309, "rewards/rejected": 1.1920441389083862, "step": 6091 }, { "epoch": 0.99, "learning_rate": 7.792040828896866e-07, "logits/chosen": -0.7369456887245178, "logits/rejected": -0.6794956922531128, "logps/chosen": -130.4467010498047, "logps/rejected": -65.59200286865234, "loss": 0.126, "rewards/accuracies": 1.0, "rewards/chosen": 2.934744358062744, "rewards/margins": 1.3108360767364502, "rewards/rejected": 1.623908281326294, "step": 6092 }, { "epoch": 0.99, "learning_rate": 7.790950471608211e-07, "logits/chosen": -0.5288418531417847, "logits/rejected": -0.5072408318519592, "logps/chosen": -121.56295013427734, "logps/rejected": -111.00076293945312, "loss": 0.371, "rewards/accuracies": 1.0, "rewards/chosen": 1.3109687566757202, "rewards/margins": 0.20341944694519043, "rewards/rejected": 1.1075493097305298, "step": 6093 }, { "epoch": 0.99, "learning_rate": 7.789859921491286e-07, "logits/chosen": -0.5645623803138733, "logits/rejected": -0.4403111934661865, "logps/chosen": -159.12257385253906, "logps/rejected": -40.73944854736328, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 4.836255073547363, "rewards/margins": 3.620492696762085, "rewards/rejected": 1.2157623767852783, "step": 6094 }, { "epoch": 0.99, "learning_rate": 7.788769178621439e-07, "logits/chosen": -0.8269751071929932, "logits/rejected": -0.632073700428009, "logps/chosen": -144.65545654296875, "logps/rejected": -45.41133117675781, "loss": 0.6008, "rewards/accuracies": 1.0, "rewards/chosen": 4.509710788726807, "rewards/margins": 2.899493455886841, "rewards/rejected": 1.6102173328399658, "step": 6095 }, { "epoch": 0.99, "learning_rate": 7.78767824307403e-07, "logits/chosen": -0.48536473512649536, "logits/rejected": -0.46824416518211365, "logps/chosen": -74.39521789550781, "logps/rejected": -63.99948501586914, "loss": 0.5789, "rewards/accuracies": 0.0, "rewards/chosen": 1.494903564453125, "rewards/margins": -0.3763836622238159, "rewards/rejected": 1.871287226676941, "step": 6096 }, { "epoch": 0.99, "learning_rate": 7.78658711492443e-07, "logits/chosen": -0.7944760322570801, "logits/rejected": -0.8337051868438721, "logps/chosen": -210.62307739257812, "logps/rejected": -157.43447875976562, "loss": 1.3935, "rewards/accuracies": 0.0, "rewards/chosen": 1.7556167840957642, "rewards/margins": -2.253950595855713, "rewards/rejected": 4.0095672607421875, "step": 6097 }, { "epoch": 0.99, "learning_rate": 7.785495794248029e-07, "logits/chosen": -0.35416004061698914, "logits/rejected": -0.3560138940811157, "logps/chosen": -83.20932006835938, "logps/rejected": -43.449012756347656, "loss": 0.8572, "rewards/accuracies": 0.0, "rewards/chosen": 1.2376434803009033, "rewards/margins": -0.903592586517334, "rewards/rejected": 2.1412360668182373, "step": 6098 }, { "epoch": 0.99, "learning_rate": 7.784404281120225e-07, "logits/chosen": -0.21089646220207214, "logits/rejected": -0.1827065646648407, "logps/chosen": -12.597310066223145, "logps/rejected": -32.00339126586914, "loss": 0.5915, "rewards/accuracies": 0.0, "rewards/chosen": 0.6476077437400818, "rewards/margins": -0.7184517979621887, "rewards/rejected": 1.3660595417022705, "step": 6099 }, { "epoch": 0.99, "learning_rate": 7.783312575616431e-07, "logits/chosen": -0.5063315629959106, "logits/rejected": -0.529018223285675, "logps/chosen": -103.28910827636719, "logps/rejected": -55.3748779296875, "loss": 1.3512, "rewards/accuracies": 0.0, "rewards/chosen": 0.52926105260849, "rewards/margins": -0.17093008756637573, "rewards/rejected": 0.7001911401748657, "step": 6100 }, { "epoch": 0.99, "learning_rate": 7.782220677812074e-07, "logits/chosen": -0.3323613703250885, "logits/rejected": -0.3981707990169525, "logps/chosen": -25.727558135986328, "logps/rejected": -103.9793701171875, "loss": 1.5527, "rewards/accuracies": 0.0, "rewards/chosen": 0.8942531943321228, "rewards/margins": -1.807464838027954, "rewards/rejected": 2.7017180919647217, "step": 6101 }, { "epoch": 0.99, "learning_rate": 7.781128587782593e-07, "logits/chosen": -0.38602080941200256, "logits/rejected": -0.4239639639854431, "logps/chosen": -86.11124420166016, "logps/rejected": -150.09170532226562, "loss": 2.4295, "rewards/accuracies": 0.0, "rewards/chosen": 0.7412711977958679, "rewards/margins": -4.150236129760742, "rewards/rejected": 4.891507148742676, "step": 6102 }, { "epoch": 0.99, "learning_rate": 7.780036305603443e-07, "logits/chosen": -0.8030388355255127, "logits/rejected": -0.754357099533081, "logps/chosen": -132.11900329589844, "logps/rejected": -178.1175537109375, "loss": 0.6645, "rewards/accuracies": 0.0, "rewards/chosen": 5.3937530517578125, "rewards/margins": -0.9909043312072754, "rewards/rejected": 6.384657382965088, "step": 6103 }, { "epoch": 0.99, "learning_rate": 7.778943831350088e-07, "logits/chosen": -0.07182952761650085, "logits/rejected": -0.06558424234390259, "logps/chosen": -5.5477399826049805, "logps/rejected": -3.1328322887420654, "loss": 0.7216, "rewards/accuracies": 0.0, "rewards/chosen": 0.13961100578308105, "rewards/margins": -0.0675586462020874, "rewards/rejected": 0.20716965198516846, "step": 6104 }, { "epoch": 0.99, "learning_rate": 7.777851165098011e-07, "logits/chosen": -0.5448049902915955, "logits/rejected": -0.43079352378845215, "logps/chosen": -46.549869537353516, "logps/rejected": -6.833166122436523, "loss": 0.3156, "rewards/accuracies": 1.0, "rewards/chosen": 2.086183547973633, "rewards/margins": 1.1100599765777588, "rewards/rejected": 0.9761236310005188, "step": 6105 }, { "epoch": 0.99, "learning_rate": 7.776758306922702e-07, "logits/chosen": -0.5491591095924377, "logits/rejected": -0.5491591095924377, "logps/chosen": -91.34963989257812, "logps/rejected": -91.34963989257812, "loss": 0.3532, "rewards/accuracies": 0.0, "rewards/chosen": 1.4481391906738281, "rewards/margins": 0.0, "rewards/rejected": 1.4481391906738281, "step": 6106 }, { "epoch": 0.99, "learning_rate": 7.775665256899667e-07, "logits/chosen": -0.868838906288147, "logits/rejected": -1.182624340057373, "logps/chosen": -74.61380767822266, "logps/rejected": -36.95942306518555, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 2.0693252086639404, "rewards/margins": 1.7838294506072998, "rewards/rejected": 0.2854957580566406, "step": 6107 }, { "epoch": 0.99, "learning_rate": 7.774572015104427e-07, "logits/chosen": -0.8443451523780823, "logits/rejected": -0.8205269575119019, "logps/chosen": -95.5849838256836, "logps/rejected": -79.15386962890625, "loss": 1.0861, "rewards/accuracies": 0.0, "rewards/chosen": 1.173254370689392, "rewards/margins": -0.9956284761428833, "rewards/rejected": 2.1688828468322754, "step": 6108 }, { "epoch": 0.99, "learning_rate": 7.773478581612514e-07, "logits/chosen": -0.6959282755851746, "logits/rejected": -0.5903859734535217, "logps/chosen": -66.82402038574219, "logps/rejected": -192.91806030273438, "loss": 2.6871, "rewards/accuracies": 0.0, "rewards/chosen": 2.5721404552459717, "rewards/margins": -5.118060111999512, "rewards/rejected": 7.6902008056640625, "step": 6109 }, { "epoch": 0.99, "learning_rate": 7.772384956499474e-07, "logits/chosen": -0.3435879647731781, "logits/rejected": -0.3435879647731781, "logps/chosen": -31.56096839904785, "logps/rejected": -31.56096839904785, "loss": 0.5299, "rewards/accuracies": 0.0, "rewards/chosen": 1.2587560415267944, "rewards/margins": 0.0, "rewards/rejected": 1.2587560415267944, "step": 6110 }, { "epoch": 0.99, "learning_rate": 7.771291139840866e-07, "logits/chosen": -0.7169399261474609, "logits/rejected": -0.7607959508895874, "logps/chosen": -118.69261932373047, "logps/rejected": -117.11775970458984, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 4.770824432373047, "rewards/margins": 1.8060057163238525, "rewards/rejected": 2.9648187160491943, "step": 6111 }, { "epoch": 0.99, "learning_rate": 7.770197131712262e-07, "logits/chosen": -0.35307934880256653, "logits/rejected": -0.35307934880256653, "logps/chosen": -0.32500192523002625, "logps/rejected": -0.32500192523002625, "loss": 0.568, "rewards/accuracies": 0.0, "rewards/chosen": 0.0729975625872612, "rewards/margins": 0.0, "rewards/rejected": 0.0729975625872612, "step": 6112 }, { "epoch": 0.99, "learning_rate": 7.769102932189248e-07, "logits/chosen": -0.7305993437767029, "logits/rejected": -0.6454088687896729, "logps/chosen": -104.38726806640625, "logps/rejected": -61.49225616455078, "loss": 1.0936, "rewards/accuracies": 0.0, "rewards/chosen": 0.31584474444389343, "rewards/margins": -0.6815849542617798, "rewards/rejected": 0.9974296689033508, "step": 6113 }, { "epoch": 0.99, "learning_rate": 7.768008541347421e-07, "logits/chosen": -0.8997368812561035, "logits/rejected": -0.8811466693878174, "logps/chosen": -62.38772201538086, "logps/rejected": -49.799686431884766, "loss": 0.5745, "rewards/accuracies": 0.0, "rewards/chosen": 2.020212173461914, "rewards/margins": -0.17857050895690918, "rewards/rejected": 2.1987826824188232, "step": 6114 }, { "epoch": 0.99, "learning_rate": 7.766913959262398e-07, "logits/chosen": -0.5405282974243164, "logits/rejected": -0.5405282974243164, "logps/chosen": -62.04476547241211, "logps/rejected": -62.04476547241211, "loss": 0.9241, "rewards/accuracies": 0.0, "rewards/chosen": 2.0104215145111084, "rewards/margins": 0.0, "rewards/rejected": 2.0104215145111084, "step": 6115 }, { "epoch": 0.99, "learning_rate": 7.765819186009801e-07, "logits/chosen": -0.8706666827201843, "logits/rejected": -0.8309326171875, "logps/chosen": -66.63843536376953, "logps/rejected": -113.82180786132812, "loss": 1.0722, "rewards/accuracies": 1.0, "rewards/chosen": 1.0667030811309814, "rewards/margins": 0.2636719346046448, "rewards/rejected": 0.8030311465263367, "step": 6116 }, { "epoch": 0.99, "learning_rate": 7.764724221665268e-07, "logits/chosen": -0.4840763807296753, "logits/rejected": -0.517612874507904, "logps/chosen": -80.89744567871094, "logps/rejected": -128.8571014404297, "loss": 0.8288, "rewards/accuracies": 0.0, "rewards/chosen": 1.014369249343872, "rewards/margins": -1.407501220703125, "rewards/rejected": 2.421870470046997, "step": 6117 }, { "epoch": 0.99, "learning_rate": 7.763629066304451e-07, "logits/chosen": -0.5512661933898926, "logits/rejected": -0.6078004837036133, "logps/chosen": -66.6658935546875, "logps/rejected": -112.32901000976562, "loss": 2.7129, "rewards/accuracies": 0.0, "rewards/chosen": 0.8405159115791321, "rewards/margins": -4.798120975494385, "rewards/rejected": 5.638637065887451, "step": 6118 }, { "epoch": 0.99, "learning_rate": 7.762533720003015e-07, "logits/chosen": -0.7473284006118774, "logits/rejected": -0.6560125350952148, "logps/chosen": -137.28314208984375, "logps/rejected": -59.867271423339844, "loss": 0.3255, "rewards/accuracies": 1.0, "rewards/chosen": 2.4418106079101562, "rewards/margins": 0.34355688095092773, "rewards/rejected": 2.0982537269592285, "step": 6119 }, { "epoch": 0.99, "learning_rate": 7.761438182836639e-07, "logits/chosen": -0.7485087513923645, "logits/rejected": -0.7450340986251831, "logps/chosen": -7.273116111755371, "logps/rejected": -4.759698867797852, "loss": 0.8316, "rewards/accuracies": 0.0, "rewards/chosen": 0.12531976401805878, "rewards/margins": -0.10337543487548828, "rewards/rejected": 0.22869519889354706, "step": 6120 }, { "epoch": 0.99, "learning_rate": 7.760342454881012e-07, "logits/chosen": -0.47655627131462097, "logits/rejected": -0.4919673800468445, "logps/chosen": -3.9706361293792725, "logps/rejected": -1.6505014896392822, "loss": 0.9994, "rewards/accuracies": 0.0, "rewards/chosen": 0.22135484218597412, "rewards/margins": -0.14512109756469727, "rewards/rejected": 0.3664759397506714, "step": 6121 }, { "epoch": 0.99, "learning_rate": 7.759246536211843e-07, "logits/chosen": -0.6932470202445984, "logits/rejected": -0.6848293542861938, "logps/chosen": -110.67259979248047, "logps/rejected": -78.0828628540039, "loss": 0.8341, "rewards/accuracies": 0.0, "rewards/chosen": 1.3670845031738281, "rewards/margins": -0.7153191566467285, "rewards/rejected": 2.0824036598205566, "step": 6122 }, { "epoch": 0.99, "learning_rate": 7.758150426904844e-07, "logits/chosen": -0.37551426887512207, "logits/rejected": -0.40561020374298096, "logps/chosen": -78.56735229492188, "logps/rejected": -74.27397155761719, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 1.337548851966858, "rewards/margins": 0.0498504638671875, "rewards/rejected": 1.2876983880996704, "step": 6123 }, { "epoch": 0.99, "learning_rate": 7.757054127035749e-07, "logits/chosen": -0.8370714783668518, "logits/rejected": -0.7514511942863464, "logps/chosen": -247.304443359375, "logps/rejected": -126.69092559814453, "loss": 2.6385, "rewards/accuracies": 0.0, "rewards/chosen": 2.3711183071136475, "rewards/margins": -3.519484758377075, "rewards/rejected": 5.890603065490723, "step": 6124 }, { "epoch": 0.99, "learning_rate": 7.755957636680301e-07, "logits/chosen": -0.725857138633728, "logits/rejected": -0.7294113636016846, "logps/chosen": -92.61473083496094, "logps/rejected": -93.52798461914062, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 3.868220567703247, "rewards/margins": 0.8952269554138184, "rewards/rejected": 2.9729936122894287, "step": 6125 }, { "epoch": 0.99, "learning_rate": 7.754860955914259e-07, "logits/chosen": -0.5226802229881287, "logits/rejected": -0.5098704099655151, "logps/chosen": -32.015708923339844, "logps/rejected": -6.676889419555664, "loss": 0.54, "rewards/accuracies": 0.0, "rewards/chosen": 0.3368053436279297, "rewards/margins": -0.1986461877822876, "rewards/rejected": 0.5354515314102173, "step": 6126 }, { "epoch": 0.99, "learning_rate": 7.75376408481339e-07, "logits/chosen": -0.39185553789138794, "logits/rejected": -0.48771870136260986, "logps/chosen": -95.3267822265625, "logps/rejected": -123.30841064453125, "loss": 1.7995, "rewards/accuracies": 0.0, "rewards/chosen": 1.5865509510040283, "rewards/margins": -2.526707410812378, "rewards/rejected": 4.113258361816406, "step": 6127 }, { "epoch": 0.99, "learning_rate": 7.752667023453481e-07, "logits/chosen": -0.3876117169857025, "logits/rejected": -0.33331573009490967, "logps/chosen": -35.137916564941406, "logps/rejected": -93.20172119140625, "loss": 1.3941, "rewards/accuracies": 0.0, "rewards/chosen": 1.0291759967803955, "rewards/margins": -2.355011463165283, "rewards/rejected": 3.3841874599456787, "step": 6128 }, { "epoch": 0.99, "learning_rate": 7.751569771910324e-07, "logits/chosen": -0.8262133002281189, "logits/rejected": -0.7708740234375, "logps/chosen": -197.38624572753906, "logps/rejected": -155.72406005859375, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 7.1069536209106445, "rewards/margins": 0.15326404571533203, "rewards/rejected": 6.9536895751953125, "step": 6129 }, { "epoch": 0.99, "learning_rate": 7.750472330259734e-07, "logits/chosen": -0.6241734623908997, "logits/rejected": -0.6082041263580322, "logps/chosen": -68.03653717041016, "logps/rejected": -72.10730743408203, "loss": 0.6266, "rewards/accuracies": 0.0, "rewards/chosen": 2.6482322216033936, "rewards/margins": -0.815861701965332, "rewards/rejected": 3.4640939235687256, "step": 6130 }, { "epoch": 1.0, "learning_rate": 7.749374698577528e-07, "logits/chosen": -0.4806985557079315, "logits/rejected": -0.3667272627353668, "logps/chosen": -70.08180236816406, "logps/rejected": -54.435569763183594, "loss": 0.3503, "rewards/accuracies": 1.0, "rewards/chosen": 2.973543643951416, "rewards/margins": 2.143289566040039, "rewards/rejected": 0.8302540183067322, "step": 6131 }, { "epoch": 1.0, "learning_rate": 7.748276876939549e-07, "logits/chosen": -0.7066382169723511, "logits/rejected": -0.7396609783172607, "logps/chosen": -56.55725860595703, "logps/rejected": -49.266178131103516, "loss": 0.9312, "rewards/accuracies": 0.0, "rewards/chosen": 1.3102165460586548, "rewards/margins": -0.36705970764160156, "rewards/rejected": 1.6772762537002563, "step": 6132 }, { "epoch": 1.0, "learning_rate": 7.747178865421639e-07, "logits/chosen": -0.8080960512161255, "logits/rejected": -0.8670426607131958, "logps/chosen": -113.3111572265625, "logps/rejected": -127.6741714477539, "loss": 0.4807, "rewards/accuracies": 0.0, "rewards/chosen": 0.940380871295929, "rewards/margins": -0.20291978120803833, "rewards/rejected": 1.1433006525039673, "step": 6133 }, { "epoch": 1.0, "learning_rate": 7.746080664099666e-07, "logits/chosen": -0.726667046546936, "logits/rejected": -0.7279161214828491, "logps/chosen": -1.7681186199188232, "logps/rejected": -8.685161590576172, "loss": 1.6984, "rewards/accuracies": 1.0, "rewards/chosen": 0.30515214800834656, "rewards/margins": 0.3140343725681305, "rewards/rejected": -0.008882236666977406, "step": 6134 }, { "epoch": 1.0, "learning_rate": 7.744982273049502e-07, "logits/chosen": -1.1284562349319458, "logits/rejected": -1.1124374866485596, "logps/chosen": -75.24098205566406, "logps/rejected": -166.8948974609375, "loss": 3.6877, "rewards/accuracies": 0.0, "rewards/chosen": 1.9881645441055298, "rewards/margins": -5.5994791984558105, "rewards/rejected": 7.587643623352051, "step": 6135 }, { "epoch": 1.0, "learning_rate": 7.743883692347035e-07, "logits/chosen": -0.6477594375610352, "logits/rejected": -0.6466790437698364, "logps/chosen": -45.95954513549805, "logps/rejected": -25.5113468170166, "loss": 0.4492, "rewards/accuracies": 0.0, "rewards/chosen": 0.16825905442237854, "rewards/margins": -0.24051150679588318, "rewards/rejected": 0.4087705612182617, "step": 6136 }, { "epoch": 1.0, "learning_rate": 7.742784922068168e-07, "logits/chosen": -0.7030335664749146, "logits/rejected": -0.7030335664749146, "logps/chosen": -58.06984329223633, "logps/rejected": -58.06984329223633, "loss": 0.8113, "rewards/accuracies": 0.0, "rewards/chosen": 2.284956693649292, "rewards/margins": 0.0, "rewards/rejected": 2.284956693649292, "step": 6137 }, { "epoch": 1.0, "learning_rate": 7.741685962288817e-07, "logits/chosen": -0.5478853583335876, "logits/rejected": -0.589143693447113, "logps/chosen": -17.339380264282227, "logps/rejected": -56.340389251708984, "loss": 1.1242, "rewards/accuracies": 0.0, "rewards/chosen": 0.7859567999839783, "rewards/margins": -1.8120675086975098, "rewards/rejected": 2.598024368286133, "step": 6138 }, { "epoch": 1.0, "learning_rate": 7.740586813084906e-07, "logits/chosen": -0.707652747631073, "logits/rejected": -0.48324039578437805, "logps/chosen": -224.8618621826172, "logps/rejected": -82.85700225830078, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 4.291358947753906, "rewards/margins": 1.7796881198883057, "rewards/rejected": 2.5116708278656006, "step": 6139 }, { "epoch": 1.0, "learning_rate": 7.73948747453238e-07, "logits/chosen": -0.576651930809021, "logits/rejected": -0.5842230916023254, "logps/chosen": -29.99099349975586, "logps/rejected": -26.8487491607666, "loss": 0.3524, "rewards/accuracies": 1.0, "rewards/chosen": 0.5722618103027344, "rewards/margins": 0.047728896141052246, "rewards/rejected": 0.5245329141616821, "step": 6140 }, { "epoch": 1.0, "learning_rate": 7.738387946707189e-07, "logits/chosen": -0.6042222380638123, "logits/rejected": -0.5485540628433228, "logps/chosen": -46.96511459350586, "logps/rejected": -48.555625915527344, "loss": 0.4978, "rewards/accuracies": 0.0, "rewards/chosen": 1.5171021223068237, "rewards/margins": -0.382080078125, "rewards/rejected": 1.8991822004318237, "step": 6141 }, { "epoch": 1.0, "learning_rate": 7.737288229685301e-07, "logits/chosen": -0.5465705394744873, "logits/rejected": -0.49721893668174744, "logps/chosen": -64.50544738769531, "logps/rejected": -94.14410400390625, "loss": 2.1274, "rewards/accuracies": 1.0, "rewards/chosen": 2.452291965484619, "rewards/margins": 0.2818572521209717, "rewards/rejected": 2.1704347133636475, "step": 6142 }, { "epoch": 1.0, "learning_rate": 7.736188323542698e-07, "logits/chosen": -0.9128790497779846, "logits/rejected": -0.8725288510322571, "logps/chosen": -113.332275390625, "logps/rejected": -46.1100959777832, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": 4.553070068359375, "rewards/margins": 4.334649085998535, "rewards/rejected": 0.21842117607593536, "step": 6143 }, { "epoch": 1.0, "learning_rate": 7.735088228355373e-07, "logits/chosen": -0.7518608570098877, "logits/rejected": -0.7723015546798706, "logps/chosen": -75.60093688964844, "logps/rejected": -57.77653503417969, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": 1.2788574695587158, "rewards/margins": 0.11694943904876709, "rewards/rejected": 1.1619080305099487, "step": 6144 }, { "epoch": 1.0, "learning_rate": 7.733987944199329e-07, "logits/chosen": -0.9872390031814575, "logits/rejected": -0.8811345100402832, "logps/chosen": -120.96651458740234, "logps/rejected": -30.066692352294922, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 0.9412285089492798, "rewards/margins": 0.7802200317382812, "rewards/rejected": 0.16100846230983734, "step": 6145 }, { "epoch": 1.0, "learning_rate": 7.732887471150589e-07, "logits/chosen": -0.7585382461547852, "logits/rejected": -0.7136399149894714, "logps/chosen": -87.02861022949219, "logps/rejected": -154.31040954589844, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 1.0459381341934204, "rewards/margins": 0.3324081301689148, "rewards/rejected": 0.7135300040245056, "step": 6146 }, { "epoch": 1.0, "learning_rate": 7.731786809285184e-07, "logits/chosen": -0.8348082304000854, "logits/rejected": -0.7940959334373474, "logps/chosen": -158.25108337402344, "logps/rejected": -129.65792846679688, "loss": 1.4, "rewards/accuracies": 1.0, "rewards/chosen": 5.332258701324463, "rewards/margins": 0.8938827514648438, "rewards/rejected": 4.438375949859619, "step": 6147 }, { "epoch": 1.0, "learning_rate": 7.730685958679158e-07, "logits/chosen": -0.5822973251342773, "logits/rejected": -0.5766485929489136, "logps/chosen": -109.3967514038086, "logps/rejected": -146.2355499267578, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": 1.9001320600509644, "rewards/margins": 2.1551613807678223, "rewards/rejected": -0.2550292909145355, "step": 6148 }, { "epoch": 1.0, "learning_rate": 7.729584919408569e-07, "logits/chosen": -0.4552841782569885, "logits/rejected": -0.4331150949001312, "logps/chosen": -37.26293182373047, "logps/rejected": -1.9238828420639038, "loss": 0.628, "rewards/accuracies": 0.0, "rewards/chosen": 0.09269943088293076, "rewards/margins": -0.09974629431962967, "rewards/rejected": 0.19244572520256042, "step": 6149 }, { "epoch": 1.0, "learning_rate": 7.728483691549491e-07, "logits/chosen": -0.627787709236145, "logits/rejected": -0.5653539896011353, "logps/chosen": -75.64996337890625, "logps/rejected": -14.784128189086914, "loss": 0.636, "rewards/accuracies": 1.0, "rewards/chosen": 3.0084457397460938, "rewards/margins": 1.8981597423553467, "rewards/rejected": 1.110285997390747, "step": 6150 }, { "epoch": 1.0, "learning_rate": 7.727382275178007e-07, "logits/chosen": -0.9155417084693909, "logits/rejected": -0.9193013310432434, "logps/chosen": -201.9398956298828, "logps/rejected": -35.95057678222656, "loss": 0.2225, "rewards/accuracies": 1.0, "rewards/chosen": 4.380812168121338, "rewards/margins": 1.2462882995605469, "rewards/rejected": 3.134523868560791, "step": 6151 }, { "epoch": 1.0, "learning_rate": 7.726280670370214e-07, "logits/chosen": -0.8347976207733154, "logits/rejected": -0.8909246325492859, "logps/chosen": -121.90731048583984, "logps/rejected": -87.63688659667969, "loss": 0.9194, "rewards/accuracies": 0.0, "rewards/chosen": 5.647950172424316, "rewards/margins": -0.9659295082092285, "rewards/rejected": 6.613879680633545, "step": 6152 }, { "epoch": 1.0, "learning_rate": 7.725178877202223e-07, "logits/chosen": -0.7351426482200623, "logits/rejected": -0.6416733264923096, "logps/chosen": -89.88262939453125, "logps/rejected": -75.45419311523438, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": 5.024711608886719, "rewards/margins": 1.4819884300231934, "rewards/rejected": 3.5427231788635254, "step": 6153 }, { "epoch": 1.0, "learning_rate": 7.724076895750158e-07, "logits/chosen": -0.6578570008277893, "logits/rejected": -0.5555498600006104, "logps/chosen": -111.691650390625, "logps/rejected": -60.41199493408203, "loss": 0.5754, "rewards/accuracies": 1.0, "rewards/chosen": 0.9688804745674133, "rewards/margins": 0.9167167544364929, "rewards/rejected": 0.05216369777917862, "step": 6154 }, { "epoch": 1.0, "learning_rate": 7.722974726090155e-07, "logits/chosen": -0.9470848441123962, "logits/rejected": -0.8978784680366516, "logps/chosen": -22.148258209228516, "logps/rejected": -66.84004974365234, "loss": 2.4005, "rewards/accuracies": 0.0, "rewards/chosen": 0.6616941690444946, "rewards/margins": -1.162552833557129, "rewards/rejected": 1.8242470026016235, "step": 6155 }, { "epoch": 1.0, "learning_rate": 7.721872368298363e-07, "logits/chosen": -0.7689655423164368, "logits/rejected": -0.8387427926063538, "logps/chosen": -66.42227172851562, "logps/rejected": -106.14599609375, "loss": 1.1189, "rewards/accuracies": 0.0, "rewards/chosen": 1.3929214477539062, "rewards/margins": -2.0704376697540283, "rewards/rejected": 3.4633591175079346, "step": 6156 }, { "epoch": 1.0, "learning_rate": 7.720769822450945e-07, "logits/chosen": -0.6598291397094727, "logits/rejected": -0.699697732925415, "logps/chosen": -66.31241607666016, "logps/rejected": -132.22145080566406, "loss": 1.9926, "rewards/accuracies": 0.0, "rewards/chosen": 2.0242927074432373, "rewards/margins": -2.818161725997925, "rewards/rejected": 4.842454433441162, "step": 6157 }, { "epoch": 1.0, "learning_rate": 7.719667088624077e-07, "logits/chosen": -0.8827931880950928, "logits/rejected": -0.8489591479301453, "logps/chosen": -119.9638671875, "logps/rejected": -155.0267333984375, "loss": 1.5638, "rewards/accuracies": 0.0, "rewards/chosen": 1.717901587486267, "rewards/margins": -3.076413154602051, "rewards/rejected": 4.794314861297607, "step": 6158 }, { "epoch": 1.0, "learning_rate": 7.718564166893946e-07, "logits/chosen": -0.23551997542381287, "logits/rejected": -0.22411572933197021, "logps/chosen": -31.415102005004883, "logps/rejected": -88.27174377441406, "loss": 1.6669, "rewards/accuracies": 0.0, "rewards/chosen": 1.510877013206482, "rewards/margins": -0.7690118551254272, "rewards/rejected": 2.279888868331909, "step": 6159 }, { "epoch": 1.0, "learning_rate": 7.717461057336754e-07, "logits/chosen": -0.7075457572937012, "logits/rejected": -0.64747554063797, "logps/chosen": -102.4747314453125, "logps/rejected": -87.1867446899414, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 2.8458251953125, "rewards/margins": 0.43393397331237793, "rewards/rejected": 2.411891222000122, "step": 6160 }, { "epoch": 1.0, "learning_rate": 7.716357760028717e-07, "logits/chosen": -0.7308763265609741, "logits/rejected": -0.7254858613014221, "logps/chosen": -128.1472930908203, "logps/rejected": -144.13284301757812, "loss": 2.1586, "rewards/accuracies": 0.0, "rewards/chosen": 5.512669563293457, "rewards/margins": -1.3511872291564941, "rewards/rejected": 6.863856792449951, "step": 6161 }, { "epoch": 1.0, "learning_rate": 7.715254275046062e-07, "logits/chosen": -0.5184933543205261, "logits/rejected": -0.5400461554527283, "logps/chosen": -61.93840789794922, "logps/rejected": -139.13519287109375, "loss": 0.2443, "rewards/accuracies": 1.0, "rewards/chosen": 0.8866340517997742, "rewards/margins": 0.5511375665664673, "rewards/rejected": 0.3354965150356293, "step": 6162 }, { "epoch": 1.0, "learning_rate": 7.714150602465026e-07, "logits/chosen": -0.6211647987365723, "logits/rejected": -0.5506455302238464, "logps/chosen": -51.93505096435547, "logps/rejected": -92.63426208496094, "loss": 0.8746, "rewards/accuracies": 0.0, "rewards/chosen": 1.2943611145019531, "rewards/margins": -0.8912239074707031, "rewards/rejected": 2.1855850219726562, "step": 6163 }, { "epoch": 1.0, "learning_rate": 7.713046742361866e-07, "logits/chosen": -0.899394154548645, "logits/rejected": -0.8319750428199768, "logps/chosen": -46.01144790649414, "logps/rejected": -87.28703308105469, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 2.4825565814971924, "rewards/margins": 0.31319379806518555, "rewards/rejected": 2.169362783432007, "step": 6164 }, { "epoch": 1.0, "learning_rate": 7.711942694812847e-07, "logits/chosen": -1.0015089511871338, "logits/rejected": -0.936506986618042, "logps/chosen": -101.43402099609375, "logps/rejected": -26.627893447875977, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 0.3853759765625, "rewards/margins": 0.26123619079589844, "rewards/rejected": 0.12413978576660156, "step": 6165 }, { "epoch": 1.0, "learning_rate": 7.710838459894249e-07, "logits/chosen": -0.30139726400375366, "logits/rejected": -0.40882569551467896, "logps/chosen": -89.34435272216797, "logps/rejected": -103.63983917236328, "loss": 1.4004, "rewards/accuracies": 0.0, "rewards/chosen": 0.8757820129394531, "rewards/margins": -1.792668104171753, "rewards/rejected": 2.668450117111206, "step": 6166 }, { "epoch": 1.0, "learning_rate": 7.709734037682364e-07, "logits/chosen": -0.9068276286125183, "logits/rejected": -0.8484337329864502, "logps/chosen": -81.80379486083984, "logps/rejected": -17.078767776489258, "loss": 0.2067, "rewards/accuracies": 1.0, "rewards/chosen": 1.1923942565917969, "rewards/margins": 1.0501868724822998, "rewards/rejected": 0.1422073394060135, "step": 6167 }, { "epoch": 1.0, "learning_rate": 7.708629428253497e-07, "logits/chosen": -0.6323317885398865, "logits/rejected": -0.6985350251197815, "logps/chosen": -57.37876892089844, "logps/rejected": -125.25692749023438, "loss": 1.3445, "rewards/accuracies": 0.0, "rewards/chosen": 1.8896934986114502, "rewards/margins": -1.032325029373169, "rewards/rejected": 2.922018527984619, "step": 6168 }, { "epoch": 1.0, "learning_rate": 7.707524631683963e-07, "logits/chosen": -0.4782877266407013, "logits/rejected": -0.5014758110046387, "logps/chosen": -63.47846984863281, "logps/rejected": -150.04397583007812, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 1.38250732421875, "rewards/margins": 1.086248755455017, "rewards/rejected": 0.2962585389614105, "step": 6169 }, { "epoch": 1.0, "learning_rate": 7.7064196480501e-07, "logits/chosen": -0.33909323811531067, "logits/rejected": -0.40964576601982117, "logps/chosen": -47.080257415771484, "logps/rejected": -136.63632202148438, "loss": 0.5826, "rewards/accuracies": 1.0, "rewards/chosen": 1.0923794507980347, "rewards/margins": 0.9466824531555176, "rewards/rejected": 0.14569702744483948, "step": 6170 }, { "epoch": 1.0, "learning_rate": 7.705314477428244e-07, "logits/chosen": -0.6684168577194214, "logits/rejected": -0.6487249135971069, "logps/chosen": -113.38074493408203, "logps/rejected": -45.38533020019531, "loss": 0.4824, "rewards/accuracies": 0.0, "rewards/chosen": 0.7227226495742798, "rewards/margins": -0.3998314142227173, "rewards/rejected": 1.122554063796997, "step": 6171 }, { "epoch": 1.0, "learning_rate": 7.704209119894757e-07, "logits/chosen": -1.0425728559494019, "logits/rejected": -1.064962387084961, "logps/chosen": -286.3389892578125, "logps/rejected": -106.01591491699219, "loss": 0.8765, "rewards/accuracies": 0.0, "rewards/chosen": 3.184771776199341, "rewards/margins": -1.5452682971954346, "rewards/rejected": 4.730040073394775, "step": 6172 }, { "epoch": 1.0, "learning_rate": 7.703103575526008e-07, "logits/chosen": -0.6544704437255859, "logits/rejected": -0.611589789390564, "logps/chosen": -64.79188537597656, "logps/rejected": -95.84188079833984, "loss": 0.1438, "rewards/accuracies": 1.0, "rewards/chosen": 1.4793137311935425, "rewards/margins": 1.1648476123809814, "rewards/rejected": 0.31446608901023865, "step": 6173 }, { "epoch": 1.0, "learning_rate": 7.701997844398378e-07, "logits/chosen": -0.8630504012107849, "logits/rejected": -0.8707236051559448, "logps/chosen": -197.37417602539062, "logps/rejected": -186.33953857421875, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 5.787411689758301, "rewards/margins": 0.019683837890625, "rewards/rejected": 5.767727851867676, "step": 6174 }, { "epoch": 1.0, "learning_rate": 7.700891926588265e-07, "logits/chosen": -0.552865743637085, "logits/rejected": -0.522983968257904, "logps/chosen": -44.177791595458984, "logps/rejected": -44.24201965332031, "loss": 0.7069, "rewards/accuracies": 0.0, "rewards/chosen": 1.3599293231964111, "rewards/margins": -0.32876312732696533, "rewards/rejected": 1.6886924505233765, "step": 6175 }, { "epoch": 1.0, "learning_rate": 7.699785822172074e-07, "logits/chosen": -0.9935566186904907, "logits/rejected": -0.8308000564575195, "logps/chosen": -133.24365234375, "logps/rejected": -58.17503356933594, "loss": 0.6183, "rewards/accuracies": 0.0, "rewards/chosen": 1.8210662603378296, "rewards/margins": -0.8706048727035522, "rewards/rejected": 2.691671133041382, "step": 6176 }, { "epoch": 1.0, "learning_rate": 7.698679531226229e-07, "logits/chosen": -0.4053713381290436, "logits/rejected": -0.33777081966400146, "logps/chosen": -95.3121337890625, "logps/rejected": -96.82073974609375, "loss": 0.3248, "rewards/accuracies": 1.0, "rewards/chosen": 3.4070587158203125, "rewards/margins": 0.4953901767730713, "rewards/rejected": 2.911668539047241, "step": 6177 }, { "epoch": 1.0, "learning_rate": 7.697573053827163e-07, "logits/chosen": -0.395355224609375, "logits/rejected": -0.39330098032951355, "logps/chosen": -2.0526528358459473, "logps/rejected": -40.214866638183594, "loss": 0.3627, "rewards/accuracies": 1.0, "rewards/chosen": 0.3060784935951233, "rewards/margins": 0.3166913688182831, "rewards/rejected": -0.010612869635224342, "step": 6178 }, { "epoch": 1.0, "learning_rate": 7.696466390051324e-07, "logits/chosen": -0.6684912443161011, "logits/rejected": -0.6744644641876221, "logps/chosen": -50.37779998779297, "logps/rejected": -46.94102478027344, "loss": 1.8863, "rewards/accuracies": 1.0, "rewards/chosen": 1.9020789861679077, "rewards/margins": 0.016081929206848145, "rewards/rejected": 1.8859970569610596, "step": 6179 }, { "epoch": 1.0, "learning_rate": 7.695359539975172e-07, "logits/chosen": -0.5483559370040894, "logits/rejected": -0.5879372358322144, "logps/chosen": -259.3536071777344, "logps/rejected": -67.86627960205078, "loss": 0.1289, "rewards/accuracies": 1.0, "rewards/chosen": 3.9974944591522217, "rewards/margins": 2.9771080017089844, "rewards/rejected": 1.0203865766525269, "step": 6180 }, { "epoch": 1.0, "learning_rate": 7.694252503675181e-07, "logits/chosen": -0.6460956931114197, "logits/rejected": -0.6508892178535461, "logps/chosen": -66.14787292480469, "logps/rejected": -79.50521850585938, "loss": 0.8728, "rewards/accuracies": 0.0, "rewards/chosen": 1.1644585132598877, "rewards/margins": -1.4178519248962402, "rewards/rejected": 2.582310438156128, "step": 6181 }, { "epoch": 1.0, "learning_rate": 7.693145281227833e-07, "logits/chosen": -0.8277862071990967, "logits/rejected": -0.7735284566879272, "logps/chosen": -197.5710906982422, "logps/rejected": -155.25503540039062, "loss": 0.3348, "rewards/accuracies": 1.0, "rewards/chosen": 7.0884690284729, "rewards/margins": 0.08787679672241211, "rewards/rejected": 7.000592231750488, "step": 6182 }, { "epoch": 1.0, "learning_rate": 7.692037872709629e-07, "logits/chosen": -0.44549304246902466, "logits/rejected": -0.4433441758155823, "logps/chosen": -2.1381051540374756, "logps/rejected": -1.5733492374420166, "loss": 0.7692, "rewards/accuracies": 0.0, "rewards/chosen": 0.20174257457256317, "rewards/margins": -0.11131979525089264, "rewards/rejected": 0.3130623698234558, "step": 6183 }, { "epoch": 1.0, "learning_rate": 7.690930278197081e-07, "logits/chosen": -0.7401416897773743, "logits/rejected": -0.8649731278419495, "logps/chosen": -149.41281127929688, "logps/rejected": -111.21299743652344, "loss": 1.3879, "rewards/accuracies": 0.0, "rewards/chosen": 0.9795730710029602, "rewards/margins": -2.546351671218872, "rewards/rejected": 3.5259246826171875, "step": 6184 }, { "epoch": 1.0, "learning_rate": 7.689822497766711e-07, "logits/chosen": -1.152410626411438, "logits/rejected": -1.1242802143096924, "logps/chosen": -57.18798065185547, "logps/rejected": -14.003849983215332, "loss": 0.1029, "rewards/accuracies": 1.0, "rewards/chosen": 2.2048797607421875, "rewards/margins": 1.9777247905731201, "rewards/rejected": 0.22715501487255096, "step": 6185 }, { "epoch": 1.0, "learning_rate": 7.688714531495059e-07, "logits/chosen": -0.6302074193954468, "logits/rejected": -0.6242685914039612, "logps/chosen": -19.278888702392578, "logps/rejected": -5.688138008117676, "loss": 0.5095, "rewards/accuracies": 0.0, "rewards/chosen": 0.5160591006278992, "rewards/margins": -0.16822433471679688, "rewards/rejected": 0.684283435344696, "step": 6186 }, { "epoch": 1.0, "learning_rate": 7.687606379458675e-07, "logits/chosen": -0.6446738839149475, "logits/rejected": -0.6286137104034424, "logps/chosen": -187.90223693847656, "logps/rejected": -168.44940185546875, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 3.4466140270233154, "rewards/margins": 2.314784049987793, "rewards/rejected": 1.131829857826233, "step": 6187 }, { "epoch": 1.0, "learning_rate": 7.68649804173412e-07, "logits/chosen": -0.8059420585632324, "logits/rejected": -0.7458767890930176, "logps/chosen": -93.02793884277344, "logps/rejected": -107.62283325195312, "loss": 0.2361, "rewards/accuracies": 1.0, "rewards/chosen": 1.318623423576355, "rewards/margins": 1.306593418121338, "rewards/rejected": 0.01203002966940403, "step": 6188 }, { "epoch": 1.0, "learning_rate": 7.685389518397969e-07, "logits/chosen": -0.5671932101249695, "logits/rejected": -0.7019377946853638, "logps/chosen": -104.63335418701172, "logps/rejected": -134.8462371826172, "loss": 1.1441, "rewards/accuracies": 0.0, "rewards/chosen": 1.879205346107483, "rewards/margins": -1.9605027437210083, "rewards/rejected": 3.839708089828491, "step": 6189 }, { "epoch": 1.0, "learning_rate": 7.684280809526812e-07, "logits/chosen": -0.6963170170783997, "logits/rejected": -0.7006974220275879, "logps/chosen": -58.327884674072266, "logps/rejected": -53.52533721923828, "loss": 1.8396, "rewards/accuracies": 0.0, "rewards/chosen": 1.1702778339385986, "rewards/margins": -0.4167804718017578, "rewards/rejected": 1.5870583057403564, "step": 6190 }, { "epoch": 1.0, "learning_rate": 7.68317191519725e-07, "logits/chosen": -0.4988709092140198, "logits/rejected": -0.4606972634792328, "logps/chosen": -191.154052734375, "logps/rejected": -48.10786437988281, "loss": 0.0649, "rewards/accuracies": 1.0, "rewards/chosen": 4.360655307769775, "rewards/margins": 2.0881364345550537, "rewards/rejected": 2.2725188732147217, "step": 6191 }, { "epoch": 1.01, "learning_rate": 7.682062835485897e-07, "logits/chosen": -0.6270405054092407, "logits/rejected": -0.6801964640617371, "logps/chosen": -210.85028076171875, "logps/rejected": -126.7906723022461, "loss": 1.3505, "rewards/accuracies": 0.0, "rewards/chosen": 5.041540622711182, "rewards/margins": -1.557143211364746, "rewards/rejected": 6.598683834075928, "step": 6192 }, { "epoch": 1.01, "learning_rate": 7.680953570469381e-07, "logits/chosen": -0.7971448302268982, "logits/rejected": -0.6771822571754456, "logps/chosen": -78.47569274902344, "logps/rejected": -65.32622528076172, "loss": 1.0198, "rewards/accuracies": 0.0, "rewards/chosen": 2.1128509044647217, "rewards/margins": -0.600039005279541, "rewards/rejected": 2.7128899097442627, "step": 6193 }, { "epoch": 1.01, "learning_rate": 7.679844120224339e-07, "logits/chosen": -0.8298662900924683, "logits/rejected": -0.6317153573036194, "logps/chosen": -144.70965576171875, "logps/rejected": -44.95115661621094, "loss": 0.6664, "rewards/accuracies": 1.0, "rewards/chosen": 4.50429105758667, "rewards/margins": 2.8480563163757324, "rewards/rejected": 1.6562347412109375, "step": 6194 }, { "epoch": 1.01, "learning_rate": 7.678734484827427e-07, "logits/chosen": -0.6535354852676392, "logits/rejected": -0.6521202325820923, "logps/chosen": -93.52215576171875, "logps/rejected": -129.19467163085938, "loss": 0.5155, "rewards/accuracies": 1.0, "rewards/chosen": 1.5764297246932983, "rewards/margins": 0.9115920662879944, "rewards/rejected": 0.664837658405304, "step": 6195 }, { "epoch": 1.01, "learning_rate": 7.677624664355307e-07, "logits/chosen": -0.5446627736091614, "logits/rejected": -0.34008273482322693, "logps/chosen": -107.88247680664062, "logps/rejected": -91.37614440917969, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 3.6171326637268066, "rewards/margins": 2.4092354774475098, "rewards/rejected": 1.2078971862792969, "step": 6196 }, { "epoch": 1.01, "learning_rate": 7.67651465888466e-07, "logits/chosen": -0.7070192098617554, "logits/rejected": -0.7022209167480469, "logps/chosen": -0.7798792719841003, "logps/rejected": -4.556509971618652, "loss": 0.4274, "rewards/accuracies": 0.0, "rewards/chosen": 0.23280183970928192, "rewards/margins": -0.05967502295970917, "rewards/rejected": 0.2924768626689911, "step": 6197 }, { "epoch": 1.01, "learning_rate": 7.675404468492175e-07, "logits/chosen": -0.6399945020675659, "logits/rejected": -0.4522080421447754, "logps/chosen": -142.01658630371094, "logps/rejected": -44.31430435180664, "loss": 0.5027, "rewards/accuracies": 1.0, "rewards/chosen": 4.767155647277832, "rewards/margins": 2.517368793487549, "rewards/rejected": 2.249786853790283, "step": 6198 }, { "epoch": 1.01, "learning_rate": 7.674294093254555e-07, "logits/chosen": -0.5682464241981506, "logits/rejected": -0.5682464241981506, "logps/chosen": -117.11459350585938, "logps/rejected": -117.11459350585938, "loss": 0.4283, "rewards/accuracies": 0.0, "rewards/chosen": 1.7597259283065796, "rewards/margins": 0.0, "rewards/rejected": 1.7597259283065796, "step": 6199 }, { "epoch": 1.01, "learning_rate": 7.67318353324852e-07, "logits/chosen": -0.7022472023963928, "logits/rejected": -0.7022472023963928, "logps/chosen": -63.89649963378906, "logps/rejected": -63.89649963378906, "loss": 0.3496, "rewards/accuracies": 0.0, "rewards/chosen": 1.9023971557617188, "rewards/margins": 0.0, "rewards/rejected": 1.9023971557617188, "step": 6200 }, { "epoch": 1.01, "learning_rate": 7.672072788550794e-07, "logits/chosen": -0.46621811389923096, "logits/rejected": -0.44114190340042114, "logps/chosen": -49.326934814453125, "logps/rejected": -50.41883850097656, "loss": 0.5459, "rewards/accuracies": 1.0, "rewards/chosen": 3.012221574783325, "rewards/margins": 0.9551606178283691, "rewards/rejected": 2.057060956954956, "step": 6201 }, { "epoch": 1.01, "learning_rate": 7.670961859238123e-07, "logits/chosen": -0.7443550229072571, "logits/rejected": -0.970539391040802, "logps/chosen": -53.66352081298828, "logps/rejected": -33.50030517578125, "loss": 1.7451, "rewards/accuracies": 1.0, "rewards/chosen": 2.514455556869507, "rewards/margins": 0.6234349012374878, "rewards/rejected": 1.891020655632019, "step": 6202 }, { "epoch": 1.01, "learning_rate": 7.669850745387261e-07, "logits/chosen": -0.8459900617599487, "logits/rejected": -0.7993270754814148, "logps/chosen": -125.34355926513672, "logps/rejected": -54.690269470214844, "loss": 0.7765, "rewards/accuracies": 1.0, "rewards/chosen": 4.400573253631592, "rewards/margins": 2.656148910522461, "rewards/rejected": 1.7444244623184204, "step": 6203 }, { "epoch": 1.01, "learning_rate": 7.668739447074973e-07, "logits/chosen": -0.7903228402137756, "logits/rejected": -0.7156599164009094, "logps/chosen": -150.01641845703125, "logps/rejected": -47.97138595581055, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 4.2575836181640625, "rewards/margins": 2.2740259170532227, "rewards/rejected": 1.9835575819015503, "step": 6204 }, { "epoch": 1.01, "learning_rate": 7.667627964378043e-07, "logits/chosen": -0.5007017850875854, "logits/rejected": -0.4094884991645813, "logps/chosen": -67.343017578125, "logps/rejected": -62.533348083496094, "loss": 0.9343, "rewards/accuracies": 0.0, "rewards/chosen": 2.2554383277893066, "rewards/margins": -1.4443473815917969, "rewards/rejected": 3.6997857093811035, "step": 6205 }, { "epoch": 1.01, "learning_rate": 7.666516297373261e-07, "logits/chosen": -0.5691754817962646, "logits/rejected": -0.11385378241539001, "logps/chosen": -141.42276000976562, "logps/rejected": -37.46910095214844, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": 5.82403564453125, "rewards/margins": 4.903009414672852, "rewards/rejected": 0.9210262298583984, "step": 6206 }, { "epoch": 1.01, "learning_rate": 7.665404446137433e-07, "logits/chosen": -0.7800747156143188, "logits/rejected": -0.7343031764030457, "logps/chosen": -181.9776153564453, "logps/rejected": -65.37065887451172, "loss": 0.4463, "rewards/accuracies": 1.0, "rewards/chosen": 4.817018032073975, "rewards/margins": 2.092819929122925, "rewards/rejected": 2.72419810295105, "step": 6207 }, { "epoch": 1.01, "learning_rate": 7.664292410747379e-07, "logits/chosen": -0.7934921979904175, "logits/rejected": -0.8600932359695435, "logps/chosen": -44.71015167236328, "logps/rejected": -88.1578598022461, "loss": 1.6991, "rewards/accuracies": 0.0, "rewards/chosen": 1.4720710515975952, "rewards/margins": -1.2471596002578735, "rewards/rejected": 2.7192306518554688, "step": 6208 }, { "epoch": 1.01, "learning_rate": 7.663180191279929e-07, "logits/chosen": -0.7930799126625061, "logits/rejected": -0.5479933619499207, "logps/chosen": -134.3326873779297, "logps/rejected": -80.04220581054688, "loss": 0.0884, "rewards/accuracies": 1.0, "rewards/chosen": 4.467785835266113, "rewards/margins": 2.1415445804595947, "rewards/rejected": 2.3262412548065186, "step": 6209 }, { "epoch": 1.01, "learning_rate": 7.662067787811927e-07, "logits/chosen": -0.8599869608879089, "logits/rejected": -0.6982344388961792, "logps/chosen": -113.8068618774414, "logps/rejected": -175.00257873535156, "loss": 0.498, "rewards/accuracies": 0.0, "rewards/chosen": 5.271311283111572, "rewards/margins": -0.5148916244506836, "rewards/rejected": 5.786202907562256, "step": 6210 }, { "epoch": 1.01, "learning_rate": 7.660955200420231e-07, "logits/chosen": -0.8851914405822754, "logits/rejected": -0.8492923378944397, "logps/chosen": -119.79896545410156, "logps/rejected": -155.0680694580078, "loss": 1.684, "rewards/accuracies": 0.0, "rewards/chosen": 1.7343918085098267, "rewards/margins": -3.0557894706726074, "rewards/rejected": 4.7901811599731445, "step": 6211 }, { "epoch": 1.01, "learning_rate": 7.659842429181708e-07, "logits/chosen": -1.0189560651779175, "logits/rejected": -0.6841506958007812, "logps/chosen": -150.19998168945312, "logps/rejected": -131.19955444335938, "loss": 1.5191, "rewards/accuracies": 0.0, "rewards/chosen": 2.8176515102386475, "rewards/margins": -1.7894041538238525, "rewards/rejected": 4.6070556640625, "step": 6212 }, { "epoch": 1.01, "learning_rate": 7.65872947417324e-07, "logits/chosen": -0.9177621603012085, "logits/rejected": -0.8086180090904236, "logps/chosen": -86.83145904541016, "logps/rejected": -60.71480178833008, "loss": 1.2004, "rewards/accuracies": 0.0, "rewards/chosen": 0.7677101492881775, "rewards/margins": -1.6597301959991455, "rewards/rejected": 2.4274404048919678, "step": 6213 }, { "epoch": 1.01, "learning_rate": 7.657616335471723e-07, "logits/chosen": -0.5976729393005371, "logits/rejected": -0.5829585194587708, "logps/chosen": -125.19595336914062, "logps/rejected": -50.801841735839844, "loss": 0.6684, "rewards/accuracies": 0.0, "rewards/chosen": 1.6543854475021362, "rewards/margins": -0.7273825407028198, "rewards/rejected": 2.381767988204956, "step": 6214 }, { "epoch": 1.01, "learning_rate": 7.656503013154063e-07, "logits/chosen": -0.5201929807662964, "logits/rejected": -0.5166743993759155, "logps/chosen": -68.97329711914062, "logps/rejected": -105.83541870117188, "loss": 0.7543, "rewards/accuracies": 0.0, "rewards/chosen": 1.628503441810608, "rewards/margins": -0.7605973482131958, "rewards/rejected": 2.3891007900238037, "step": 6215 }, { "epoch": 1.01, "learning_rate": 7.655389507297181e-07, "logits/chosen": -1.033653974533081, "logits/rejected": -1.032985806465149, "logps/chosen": -96.4876480102539, "logps/rejected": -54.98628234863281, "loss": 0.9949, "rewards/accuracies": 0.0, "rewards/chosen": 0.5213562250137329, "rewards/margins": -0.5036910772323608, "rewards/rejected": 1.0250473022460938, "step": 6216 }, { "epoch": 1.01, "learning_rate": 7.65427581797801e-07, "logits/chosen": -0.6455026865005493, "logits/rejected": -0.6408105492591858, "logps/chosen": -21.52694320678711, "logps/rejected": -65.29235076904297, "loss": 0.9571, "rewards/accuracies": 1.0, "rewards/chosen": 0.03101673163473606, "rewards/margins": 0.38492071628570557, "rewards/rejected": -0.35390397906303406, "step": 6217 }, { "epoch": 1.01, "learning_rate": 7.653161945273496e-07, "logits/chosen": -0.6482725143432617, "logits/rejected": -0.661712110042572, "logps/chosen": -80.16852569580078, "logps/rejected": -199.50735473632812, "loss": 0.7769, "rewards/accuracies": 1.0, "rewards/chosen": 1.7490532398223877, "rewards/margins": 1.2996727228164673, "rewards/rejected": 0.449380487203598, "step": 6218 }, { "epoch": 1.01, "learning_rate": 7.652047889260594e-07, "logits/chosen": -0.6515079140663147, "logits/rejected": -0.5050578117370605, "logps/chosen": -58.15553283691406, "logps/rejected": -91.150390625, "loss": 1.5922, "rewards/accuracies": 0.0, "rewards/chosen": 1.5486878156661987, "rewards/margins": -3.040313720703125, "rewards/rejected": 4.589001655578613, "step": 6219 }, { "epoch": 1.01, "learning_rate": 7.650933650016278e-07, "logits/chosen": -0.729438841342926, "logits/rejected": -0.7330606579780579, "logps/chosen": -175.42025756835938, "logps/rejected": -112.46490478515625, "loss": 0.6432, "rewards/accuracies": 0.0, "rewards/chosen": 5.997766017913818, "rewards/margins": -0.4742417335510254, "rewards/rejected": 6.472007751464844, "step": 6220 }, { "epoch": 1.01, "learning_rate": 7.64981922761753e-07, "logits/chosen": -0.850595235824585, "logits/rejected": -0.850595235824585, "logps/chosen": -120.06086730957031, "logps/rejected": -120.06086730957031, "loss": 0.6153, "rewards/accuracies": 0.0, "rewards/chosen": 1.7107346057891846, "rewards/margins": 0.0, "rewards/rejected": 1.7107346057891846, "step": 6221 }, { "epoch": 1.01, "learning_rate": 7.648704622141347e-07, "logits/chosen": -0.5745662450790405, "logits/rejected": -0.5745662450790405, "logps/chosen": -69.85707092285156, "logps/rejected": -69.85707092285156, "loss": 0.5119, "rewards/accuracies": 0.0, "rewards/chosen": 2.065688371658325, "rewards/margins": 0.0, "rewards/rejected": 2.065688371658325, "step": 6222 }, { "epoch": 1.01, "learning_rate": 7.647589833664736e-07, "logits/chosen": -0.6729221940040588, "logits/rejected": -0.6295334100723267, "logps/chosen": -51.089935302734375, "logps/rejected": -98.93682861328125, "loss": 0.4546, "rewards/accuracies": 0.0, "rewards/chosen": 1.0596011877059937, "rewards/margins": -0.3577228784561157, "rewards/rejected": 1.4173240661621094, "step": 6223 }, { "epoch": 1.01, "learning_rate": 7.646474862264719e-07, "logits/chosen": -0.36408504843711853, "logits/rejected": -0.36408504843711853, "logps/chosen": -27.968481063842773, "logps/rejected": -27.968481063842773, "loss": 0.985, "rewards/accuracies": 0.0, "rewards/chosen": 0.12003765255212784, "rewards/margins": 0.0, "rewards/rejected": 0.12003765255212784, "step": 6224 }, { "epoch": 1.01, "learning_rate": 7.64535970801833e-07, "logits/chosen": -0.749344527721405, "logits/rejected": -0.8675225377082825, "logps/chosen": -147.92529296875, "logps/rejected": -111.1744384765625, "loss": 0.4015, "rewards/accuracies": 0.0, "rewards/chosen": 5.0917510986328125, "rewards/margins": -0.20311450958251953, "rewards/rejected": 5.294865608215332, "step": 6225 }, { "epoch": 1.01, "learning_rate": 7.644244371002618e-07, "logits/chosen": -0.6578677892684937, "logits/rejected": -0.6651427745819092, "logps/chosen": -13.10151195526123, "logps/rejected": -3.068267345428467, "loss": 0.6406, "rewards/accuracies": 0.0, "rewards/chosen": 0.2026442587375641, "rewards/margins": -0.27087730169296265, "rewards/rejected": 0.47352156043052673, "step": 6226 }, { "epoch": 1.01, "learning_rate": 7.643128851294637e-07, "logits/chosen": -0.5800546407699585, "logits/rejected": -0.5728016495704651, "logps/chosen": -88.17320251464844, "logps/rejected": -126.92517852783203, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.3147369623184204, "rewards/margins": 1.006615400314331, "rewards/rejected": 0.3081215023994446, "step": 6227 }, { "epoch": 1.01, "learning_rate": 7.642013148971463e-07, "logits/chosen": -0.8436352014541626, "logits/rejected": -0.8436352014541626, "logps/chosen": -98.26318359375, "logps/rejected": -98.26318359375, "loss": 0.3737, "rewards/accuracies": 0.0, "rewards/chosen": 4.988107204437256, "rewards/margins": 0.0, "rewards/rejected": 4.988107204437256, "step": 6228 }, { "epoch": 1.01, "learning_rate": 7.64089726411018e-07, "logits/chosen": -0.6794013381004333, "logits/rejected": -0.7143180966377258, "logps/chosen": -79.7756576538086, "logps/rejected": -93.16111755371094, "loss": 0.7965, "rewards/accuracies": 0.0, "rewards/chosen": 1.3445907831192017, "rewards/margins": -0.3778388500213623, "rewards/rejected": 1.722429633140564, "step": 6229 }, { "epoch": 1.01, "learning_rate": 7.639781196787884e-07, "logits/chosen": -0.5536379814147949, "logits/rejected": -0.43129727244377136, "logps/chosen": -104.61874389648438, "logps/rejected": -85.46736145019531, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 4.1968674659729, "rewards/margins": 1.106794834136963, "rewards/rejected": 3.0900726318359375, "step": 6230 }, { "epoch": 1.01, "learning_rate": 7.638664947081686e-07, "logits/chosen": -0.6845018267631531, "logits/rejected": -0.60307776927948, "logps/chosen": -191.87158203125, "logps/rejected": -124.76358795166016, "loss": 0.3504, "rewards/accuracies": 1.0, "rewards/chosen": 1.0263214111328125, "rewards/margins": 0.05252760648727417, "rewards/rejected": 0.9737938046455383, "step": 6231 }, { "epoch": 1.01, "learning_rate": 7.637548515068706e-07, "logits/chosen": -0.48408228158950806, "logits/rejected": -0.4559619426727295, "logps/chosen": -43.798458099365234, "logps/rejected": -68.0538330078125, "loss": 0.8651, "rewards/accuracies": 1.0, "rewards/chosen": 1.8620823621749878, "rewards/margins": 0.7503689527511597, "rewards/rejected": 1.1117134094238281, "step": 6232 }, { "epoch": 1.01, "learning_rate": 7.636431900826081e-07, "logits/chosen": -0.7952033877372742, "logits/rejected": -0.6919417381286621, "logps/chosen": -122.22128295898438, "logps/rejected": -58.607322692871094, "loss": 0.3711, "rewards/accuracies": 0.0, "rewards/chosen": 1.3839629888534546, "rewards/margins": -0.08306503295898438, "rewards/rejected": 1.467028021812439, "step": 6233 }, { "epoch": 1.01, "learning_rate": 7.635315104430957e-07, "logits/chosen": -1.123923420906067, "logits/rejected": -0.9120349884033203, "logps/chosen": -114.4100341796875, "logps/rejected": -56.98271179199219, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 4.646328926086426, "rewards/margins": 2.669851064682007, "rewards/rejected": 1.976477861404419, "step": 6234 }, { "epoch": 1.01, "learning_rate": 7.634198125960496e-07, "logits/chosen": -0.5220696926116943, "logits/rejected": -0.564141571521759, "logps/chosen": -47.76853561401367, "logps/rejected": -102.23641204833984, "loss": 1.2822, "rewards/accuracies": 0.0, "rewards/chosen": 2.3026599884033203, "rewards/margins": -1.045093297958374, "rewards/rejected": 3.3477532863616943, "step": 6235 }, { "epoch": 1.01, "learning_rate": 7.633080965491869e-07, "logits/chosen": -0.5243837237358093, "logits/rejected": -0.5243837237358093, "logps/chosen": -99.43316650390625, "logps/rejected": -99.43316650390625, "loss": 0.6733, "rewards/accuracies": 0.0, "rewards/chosen": 2.730853319168091, "rewards/margins": 0.0, "rewards/rejected": 2.730853319168091, "step": 6236 }, { "epoch": 1.01, "learning_rate": 7.631963623102262e-07, "logits/chosen": -1.155239224433899, "logits/rejected": -1.0623353719711304, "logps/chosen": -98.801025390625, "logps/rejected": -214.50949096679688, "loss": 0.8066, "rewards/accuracies": 0.0, "rewards/chosen": 1.817090630531311, "rewards/margins": -1.3836570978164673, "rewards/rejected": 3.2007477283477783, "step": 6237 }, { "epoch": 1.01, "learning_rate": 7.630846098868872e-07, "logits/chosen": -0.17693191766738892, "logits/rejected": -0.07774531096220016, "logps/chosen": -44.36104202270508, "logps/rejected": -55.27635192871094, "loss": 0.9587, "rewards/accuracies": 1.0, "rewards/chosen": 2.1952779293060303, "rewards/margins": 0.6902302503585815, "rewards/rejected": 1.5050476789474487, "step": 6238 }, { "epoch": 1.01, "learning_rate": 7.629728392868913e-07, "logits/chosen": -0.6019995212554932, "logits/rejected": -0.4310489296913147, "logps/chosen": -95.6878662109375, "logps/rejected": -51.72613525390625, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 3.507659912109375, "rewards/margins": 0.7761147022247314, "rewards/rejected": 2.7315452098846436, "step": 6239 }, { "epoch": 1.01, "learning_rate": 7.628610505179601e-07, "logits/chosen": -1.187560796737671, "logits/rejected": -1.0536689758300781, "logps/chosen": -160.93618774414062, "logps/rejected": -97.65304565429688, "loss": 0.4285, "rewards/accuracies": 1.0, "rewards/chosen": 4.330206394195557, "rewards/margins": 1.563443899154663, "rewards/rejected": 2.7667624950408936, "step": 6240 }, { "epoch": 1.01, "learning_rate": 7.627492435878176e-07, "logits/chosen": -0.6274636387825012, "logits/rejected": -0.6257392764091492, "logps/chosen": -112.46212768554688, "logps/rejected": -135.75653076171875, "loss": 2.1392, "rewards/accuracies": 0.0, "rewards/chosen": 1.2693703174591064, "rewards/margins": -4.223855018615723, "rewards/rejected": 5.49322509765625, "step": 6241 }, { "epoch": 1.01, "learning_rate": 7.626374185041886e-07, "logits/chosen": -0.3685760200023651, "logits/rejected": -0.3739098608493805, "logps/chosen": -56.82862091064453, "logps/rejected": -19.543521881103516, "loss": 1.1314, "rewards/accuracies": 0.0, "rewards/chosen": 0.13998031616210938, "rewards/margins": -0.15188026428222656, "rewards/rejected": 0.29186058044433594, "step": 6242 }, { "epoch": 1.01, "learning_rate": 7.62525575274799e-07, "logits/chosen": -0.6440595388412476, "logits/rejected": -0.6578142046928406, "logps/chosen": -189.04214477539062, "logps/rejected": -126.32687377929688, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 3.840606689453125, "rewards/margins": 2.543652296066284, "rewards/rejected": 1.2969543933868408, "step": 6243 }, { "epoch": 1.01, "learning_rate": 7.624137139073761e-07, "logits/chosen": -1.0330394506454468, "logits/rejected": -1.1231074333190918, "logps/chosen": -197.56356811523438, "logps/rejected": -72.34390258789062, "loss": 0.3037, "rewards/accuracies": 1.0, "rewards/chosen": 5.057651042938232, "rewards/margins": 0.20879840850830078, "rewards/rejected": 4.848852634429932, "step": 6244 }, { "epoch": 1.01, "learning_rate": 7.623018344096488e-07, "logits/chosen": -0.6475691199302673, "logits/rejected": -0.6434559226036072, "logps/chosen": -1.7928779125213623, "logps/rejected": -9.28018569946289, "loss": 0.3951, "rewards/accuracies": 1.0, "rewards/chosen": 0.25108635425567627, "rewards/margins": 0.17008563876152039, "rewards/rejected": 0.08100070804357529, "step": 6245 }, { "epoch": 1.01, "learning_rate": 7.621899367893465e-07, "logits/chosen": -1.042109489440918, "logits/rejected": -0.6605983376502991, "logps/chosen": -99.5855484008789, "logps/rejected": -63.53305435180664, "loss": 0.395, "rewards/accuracies": 1.0, "rewards/chosen": 3.4452781677246094, "rewards/margins": 1.5235408544540405, "rewards/rejected": 1.9217373132705688, "step": 6246 }, { "epoch": 1.01, "learning_rate": 7.620780210542003e-07, "logits/chosen": -0.6154937148094177, "logits/rejected": -0.6394379734992981, "logps/chosen": -85.386962890625, "logps/rejected": -85.56008911132812, "loss": 0.1053, "rewards/accuracies": 1.0, "rewards/chosen": 2.9549126625061035, "rewards/margins": 1.4758073091506958, "rewards/rejected": 1.4791053533554077, "step": 6247 }, { "epoch": 1.01, "learning_rate": 7.619660872119429e-07, "logits/chosen": -0.828681230545044, "logits/rejected": -0.7330677509307861, "logps/chosen": -102.51434326171875, "logps/rejected": -63.834442138671875, "loss": 0.4575, "rewards/accuracies": 0.0, "rewards/chosen": 1.4984298944473267, "rewards/margins": -0.14465785026550293, "rewards/rejected": 1.6430877447128296, "step": 6248 }, { "epoch": 1.01, "learning_rate": 7.618541352703076e-07, "logits/chosen": -1.1136234998703003, "logits/rejected": -1.0549296140670776, "logps/chosen": -131.24134826660156, "logps/rejected": -102.27367401123047, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 5.586375713348389, "rewards/margins": 0.8389124870300293, "rewards/rejected": 4.747463226318359, "step": 6249 }, { "epoch": 1.01, "learning_rate": 7.617421652370292e-07, "logits/chosen": -0.6902292966842651, "logits/rejected": -0.7351056933403015, "logps/chosen": -118.16121673583984, "logps/rejected": -50.303524017333984, "loss": 1.1148, "rewards/accuracies": 0.0, "rewards/chosen": 0.9784294366836548, "rewards/margins": -0.5332416296005249, "rewards/rejected": 1.5116710662841797, "step": 6250 }, { "epoch": 1.01, "learning_rate": 7.616301771198437e-07, "logits/chosen": -0.462515264749527, "logits/rejected": -0.3872213661670685, "logps/chosen": -44.7181396484375, "logps/rejected": -41.65862274169922, "loss": 0.5497, "rewards/accuracies": 1.0, "rewards/chosen": 3.1375205516815186, "rewards/margins": 1.1891230344772339, "rewards/rejected": 1.9483975172042847, "step": 6251 }, { "epoch": 1.01, "learning_rate": 7.615181709264886e-07, "logits/chosen": -0.4622594118118286, "logits/rejected": -0.4622594118118286, "logps/chosen": -51.51915740966797, "logps/rejected": -51.51915740966797, "loss": 0.7578, "rewards/accuracies": 0.0, "rewards/chosen": 2.1809632778167725, "rewards/margins": 0.0, "rewards/rejected": 2.1809632778167725, "step": 6252 }, { "epoch": 1.01, "learning_rate": 7.614061466647025e-07, "logits/chosen": -0.5950436592102051, "logits/rejected": -0.5741854906082153, "logps/chosen": -29.919414520263672, "logps/rejected": -51.04753875732422, "loss": 0.8979, "rewards/accuracies": 0.0, "rewards/chosen": 1.3303883075714111, "rewards/margins": -0.7007229328155518, "rewards/rejected": 2.031111240386963, "step": 6253 }, { "epoch": 1.02, "learning_rate": 7.612941043422249e-07, "logits/chosen": -0.6180550456047058, "logits/rejected": -0.6501370668411255, "logps/chosen": -72.99642944335938, "logps/rejected": -110.59361267089844, "loss": 1.1791, "rewards/accuracies": 0.0, "rewards/chosen": 1.60503089427948, "rewards/margins": -1.8519104719161987, "rewards/rejected": 3.4569413661956787, "step": 6254 }, { "epoch": 1.02, "learning_rate": 7.611820439667973e-07, "logits/chosen": -0.8989685773849487, "logits/rejected": -0.6660382151603699, "logps/chosen": -131.71075439453125, "logps/rejected": -82.20426177978516, "loss": 0.5351, "rewards/accuracies": 1.0, "rewards/chosen": 3.8905258178710938, "rewards/margins": 1.359900712966919, "rewards/rejected": 2.530625104904175, "step": 6255 }, { "epoch": 1.02, "learning_rate": 7.610699655461617e-07, "logits/chosen": -0.448976993560791, "logits/rejected": -0.42128148674964905, "logps/chosen": -96.37442016601562, "logps/rejected": -53.809017181396484, "loss": 0.3649, "rewards/accuracies": 0.0, "rewards/chosen": 1.312780737876892, "rewards/margins": -0.0574650764465332, "rewards/rejected": 1.3702458143234253, "step": 6256 }, { "epoch": 1.02, "learning_rate": 7.609578690880618e-07, "logits/chosen": -0.5776456594467163, "logits/rejected": -0.3975304663181305, "logps/chosen": -150.29393005371094, "logps/rejected": -15.93709945678711, "loss": 0.2269, "rewards/accuracies": 1.0, "rewards/chosen": 0.9818984866142273, "rewards/margins": 0.748884379863739, "rewards/rejected": 0.23301410675048828, "step": 6257 }, { "epoch": 1.02, "learning_rate": 7.608457546002422e-07, "logits/chosen": -0.7669878602027893, "logits/rejected": -0.6491497755050659, "logps/chosen": -103.31199645996094, "logps/rejected": -84.91964721679688, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 5.252937316894531, "rewards/margins": 3.293703556060791, "rewards/rejected": 1.9592338800430298, "step": 6258 }, { "epoch": 1.02, "learning_rate": 7.607336220904492e-07, "logits/chosen": -0.7234150171279907, "logits/rejected": -0.6311962604522705, "logps/chosen": -56.35797882080078, "logps/rejected": -41.98046875, "loss": 0.7174, "rewards/accuracies": 1.0, "rewards/chosen": 3.1253325939178467, "rewards/margins": 1.5506259202957153, "rewards/rejected": 1.5747066736221313, "step": 6259 }, { "epoch": 1.02, "learning_rate": 7.606214715664301e-07, "logits/chosen": -0.7543030381202698, "logits/rejected": -0.6745795011520386, "logps/chosen": -72.02733612060547, "logps/rejected": -22.440959930419922, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 1.554437279701233, "rewards/margins": 1.4633114337921143, "rewards/rejected": 0.09112586826086044, "step": 6260 }, { "epoch": 1.02, "learning_rate": 7.605093030359332e-07, "logits/chosen": -0.4499775171279907, "logits/rejected": -0.42868536710739136, "logps/chosen": -71.49555969238281, "logps/rejected": -8.345098495483398, "loss": 0.9552, "rewards/accuracies": 0.0, "rewards/chosen": 1.3115936517715454, "rewards/margins": -0.2463134527206421, "rewards/rejected": 1.5579071044921875, "step": 6261 }, { "epoch": 1.02, "learning_rate": 7.603971165067084e-07, "logits/chosen": -0.39256614446640015, "logits/rejected": -0.370450496673584, "logps/chosen": -40.530330657958984, "logps/rejected": -47.26622009277344, "loss": 0.7912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0535000562667847, "rewards/margins": 0.37169843912124634, "rewards/rejected": 0.6818016171455383, "step": 6262 }, { "epoch": 1.02, "learning_rate": 7.602849119865069e-07, "logits/chosen": -0.500778079032898, "logits/rejected": -0.38194739818573, "logps/chosen": -63.89537048339844, "logps/rejected": -91.17849731445312, "loss": 1.5211, "rewards/accuracies": 1.0, "rewards/chosen": 2.284358263015747, "rewards/margins": 0.2159264087677002, "rewards/rejected": 2.068431854248047, "step": 6263 }, { "epoch": 1.02, "learning_rate": 7.601726894830806e-07, "logits/chosen": -0.7470186352729797, "logits/rejected": -0.38441282510757446, "logps/chosen": -158.2872314453125, "logps/rejected": -25.970666885375977, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 4.095573425292969, "rewards/margins": 3.6954216957092285, "rewards/rejected": 0.40015164017677307, "step": 6264 }, { "epoch": 1.02, "learning_rate": 7.600604490041834e-07, "logits/chosen": -0.6993882656097412, "logits/rejected": -0.6130518913269043, "logps/chosen": -86.08064270019531, "logps/rejected": -68.3769760131836, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 2.306058645248413, "rewards/margins": 0.5847642421722412, "rewards/rejected": 1.7212944030761719, "step": 6265 }, { "epoch": 1.02, "learning_rate": 7.599481905575698e-07, "logits/chosen": -0.7371764779090881, "logits/rejected": -0.7109509110450745, "logps/chosen": -49.841285705566406, "logps/rejected": -51.21876525878906, "loss": 0.5698, "rewards/accuracies": 0.0, "rewards/chosen": 1.1089508533477783, "rewards/margins": -0.2462470531463623, "rewards/rejected": 1.3551979064941406, "step": 6266 }, { "epoch": 1.02, "learning_rate": 7.59835914150996e-07, "logits/chosen": -0.6244838833808899, "logits/rejected": -0.5190161466598511, "logps/chosen": -42.54009246826172, "logps/rejected": -42.698081970214844, "loss": 1.2722, "rewards/accuracies": 1.0, "rewards/chosen": 1.6540073156356812, "rewards/margins": 0.08269226551055908, "rewards/rejected": 1.571315050125122, "step": 6267 }, { "epoch": 1.02, "learning_rate": 7.59723619792219e-07, "logits/chosen": -1.0602662563323975, "logits/rejected": -1.0530427694320679, "logps/chosen": -84.89362335205078, "logps/rejected": -58.592079162597656, "loss": 1.1584, "rewards/accuracies": 0.0, "rewards/chosen": 2.197777509689331, "rewards/margins": -1.436091661453247, "rewards/rejected": 3.633869171142578, "step": 6268 }, { "epoch": 1.02, "learning_rate": 7.596113074889975e-07, "logits/chosen": -0.24141690135002136, "logits/rejected": -0.15667499601840973, "logps/chosen": -46.51692199707031, "logps/rejected": -62.2059211730957, "loss": 0.5602, "rewards/accuracies": 0.0, "rewards/chosen": 1.6096481084823608, "rewards/margins": -0.5586124658584595, "rewards/rejected": 2.1682605743408203, "step": 6269 }, { "epoch": 1.02, "learning_rate": 7.594989772490911e-07, "logits/chosen": -0.886324942111969, "logits/rejected": -0.8474531173706055, "logps/chosen": -48.07328796386719, "logps/rejected": -102.63665771484375, "loss": 1.0972, "rewards/accuracies": 0.0, "rewards/chosen": 1.9878120422363281, "rewards/margins": -1.2491874694824219, "rewards/rejected": 3.23699951171875, "step": 6270 }, { "epoch": 1.02, "learning_rate": 7.593866290802608e-07, "logits/chosen": -0.4380572736263275, "logits/rejected": -0.3821522891521454, "logps/chosen": -80.3309555053711, "logps/rejected": -99.29478454589844, "loss": 1.2565, "rewards/accuracies": 0.0, "rewards/chosen": 0.5300552248954773, "rewards/margins": -0.03331148624420166, "rewards/rejected": 0.563366711139679, "step": 6271 }, { "epoch": 1.02, "learning_rate": 7.592742629902687e-07, "logits/chosen": -0.5683325529098511, "logits/rejected": -0.3812497556209564, "logps/chosen": -86.4418716430664, "logps/rejected": -80.64794921875, "loss": 0.4837, "rewards/accuracies": 0.0, "rewards/chosen": 1.6842613220214844, "rewards/margins": -0.4705338478088379, "rewards/rejected": 2.1547951698303223, "step": 6272 }, { "epoch": 1.02, "learning_rate": 7.591618789868784e-07, "logits/chosen": -0.6005026698112488, "logits/rejected": -0.6005026698112488, "logps/chosen": -17.965885162353516, "logps/rejected": -17.965885162353516, "loss": 1.1791, "rewards/accuracies": 0.0, "rewards/chosen": 1.5769916772842407, "rewards/margins": 0.0, "rewards/rejected": 1.5769916772842407, "step": 6273 }, { "epoch": 1.02, "learning_rate": 7.590494770778543e-07, "logits/chosen": -0.6539438366889954, "logits/rejected": -0.5702720284461975, "logps/chosen": -88.21174621582031, "logps/rejected": -114.4354019165039, "loss": 0.5363, "rewards/accuracies": 0.0, "rewards/chosen": 2.3635413646698, "rewards/margins": -0.39032745361328125, "rewards/rejected": 2.753868818283081, "step": 6274 }, { "epoch": 1.02, "learning_rate": 7.589370572709626e-07, "logits/chosen": -0.7834051251411438, "logits/rejected": -0.7796880006790161, "logps/chosen": -66.30131530761719, "logps/rejected": -63.80447769165039, "loss": 1.0858, "rewards/accuracies": 0.0, "rewards/chosen": 1.2871201038360596, "rewards/margins": -1.6656162738800049, "rewards/rejected": 2.9527363777160645, "step": 6275 }, { "epoch": 1.02, "learning_rate": 7.588246195739703e-07, "logits/chosen": -0.6705200672149658, "logits/rejected": -0.6100813746452332, "logps/chosen": -50.542423248291016, "logps/rejected": -64.95681762695312, "loss": 0.1616, "rewards/accuracies": 1.0, "rewards/chosen": 1.618289589881897, "rewards/margins": 1.2182834148406982, "rewards/rejected": 0.40000611543655396, "step": 6276 }, { "epoch": 1.02, "learning_rate": 7.587121639946458e-07, "logits/chosen": -0.732832670211792, "logits/rejected": -0.732832670211792, "logps/chosen": -38.59861373901367, "logps/rejected": -38.59861373901367, "loss": 0.4995, "rewards/accuracies": 0.0, "rewards/chosen": 0.8005436062812805, "rewards/margins": 0.0, "rewards/rejected": 0.8005436062812805, "step": 6277 }, { "epoch": 1.02, "learning_rate": 7.585996905407586e-07, "logits/chosen": -0.5104425549507141, "logits/rejected": -0.4677468538284302, "logps/chosen": -24.937973022460938, "logps/rejected": -9.619911193847656, "loss": 0.6395, "rewards/accuracies": 1.0, "rewards/chosen": 0.28344517946243286, "rewards/margins": 0.08865167200565338, "rewards/rejected": 0.19479350745677948, "step": 6278 }, { "epoch": 1.02, "learning_rate": 7.584871992200799e-07, "logits/chosen": -0.834858775138855, "logits/rejected": -0.6135109066963196, "logps/chosen": -246.49227905273438, "logps/rejected": -155.34930419921875, "loss": 0.1778, "rewards/accuracies": 1.0, "rewards/chosen": 5.182214260101318, "rewards/margins": 1.2567718029022217, "rewards/rejected": 3.9254424571990967, "step": 6279 }, { "epoch": 1.02, "learning_rate": 7.583746900403813e-07, "logits/chosen": -0.8378557562828064, "logits/rejected": -1.13274347782135, "logps/chosen": -114.05635070800781, "logps/rejected": -36.97149658203125, "loss": 0.4192, "rewards/accuracies": 1.0, "rewards/chosen": 0.4983200132846832, "rewards/margins": 0.31271934509277344, "rewards/rejected": 0.1856006681919098, "step": 6280 }, { "epoch": 1.02, "learning_rate": 7.582621630094367e-07, "logits/chosen": -0.9650343060493469, "logits/rejected": -0.6057060360908508, "logps/chosen": -111.3310546875, "logps/rejected": -81.36662292480469, "loss": 0.7842, "rewards/accuracies": 0.0, "rewards/chosen": 0.5550605654716492, "rewards/margins": -1.0862336158752441, "rewards/rejected": 1.6412941217422485, "step": 6281 }, { "epoch": 1.02, "learning_rate": 7.581496181350202e-07, "logits/chosen": -0.8527451753616333, "logits/rejected": -0.8746795654296875, "logps/chosen": -115.42515563964844, "logps/rejected": -61.347938537597656, "loss": 0.4391, "rewards/accuracies": 1.0, "rewards/chosen": 2.080296277999878, "rewards/margins": 0.5793951749801636, "rewards/rejected": 1.5009011030197144, "step": 6282 }, { "epoch": 1.02, "learning_rate": 7.580370554249076e-07, "logits/chosen": -0.6369537115097046, "logits/rejected": -0.6871879696846008, "logps/chosen": -91.65308380126953, "logps/rejected": -98.33086395263672, "loss": 2.7232, "rewards/accuracies": 0.0, "rewards/chosen": 1.1924431324005127, "rewards/margins": -5.435914993286133, "rewards/rejected": 6.628357887268066, "step": 6283 }, { "epoch": 1.02, "learning_rate": 7.579244748868762e-07, "logits/chosen": -0.6213053464889526, "logits/rejected": -0.6300915479660034, "logps/chosen": -35.76129150390625, "logps/rejected": -55.32265090942383, "loss": 0.6057, "rewards/accuracies": 1.0, "rewards/chosen": 1.4872322082519531, "rewards/margins": 0.05126678943634033, "rewards/rejected": 1.4359654188156128, "step": 6284 }, { "epoch": 1.02, "learning_rate": 7.57811876528704e-07, "logits/chosen": -0.7471843361854553, "logits/rejected": -0.7510864734649658, "logps/chosen": -76.42198181152344, "logps/rejected": -89.2542495727539, "loss": 1.17, "rewards/accuracies": 0.0, "rewards/chosen": 1.6081078052520752, "rewards/margins": -1.4056639671325684, "rewards/rejected": 3.0137717723846436, "step": 6285 }, { "epoch": 1.02, "learning_rate": 7.576992603581707e-07, "logits/chosen": -0.48945239186286926, "logits/rejected": -0.4177543818950653, "logps/chosen": -89.57316589355469, "logps/rejected": -56.586517333984375, "loss": 0.8412, "rewards/accuracies": 1.0, "rewards/chosen": 1.9897568225860596, "rewards/margins": 0.36551666259765625, "rewards/rejected": 1.6242401599884033, "step": 6286 }, { "epoch": 1.02, "learning_rate": 7.575866263830568e-07, "logits/chosen": -0.8900706171989441, "logits/rejected": -0.6239897608757019, "logps/chosen": -84.60018920898438, "logps/rejected": -60.712181091308594, "loss": 0.765, "rewards/accuracies": 1.0, "rewards/chosen": 1.4272613525390625, "rewards/margins": 0.7038596868515015, "rewards/rejected": 0.723401665687561, "step": 6287 }, { "epoch": 1.02, "learning_rate": 7.574739746111443e-07, "logits/chosen": -0.4656786620616913, "logits/rejected": -0.4079747796058655, "logps/chosen": -49.16900634765625, "logps/rejected": -47.37128448486328, "loss": 0.6128, "rewards/accuracies": 1.0, "rewards/chosen": 2.1594741344451904, "rewards/margins": 0.8137786388397217, "rewards/rejected": 1.3456954956054688, "step": 6288 }, { "epoch": 1.02, "learning_rate": 7.573613050502164e-07, "logits/chosen": -0.5389654636383057, "logits/rejected": -0.5389654636383057, "logps/chosen": -51.96159362792969, "logps/rejected": -51.96159362792969, "loss": 0.9424, "rewards/accuracies": 0.0, "rewards/chosen": 1.321393609046936, "rewards/margins": 0.0, "rewards/rejected": 1.321393609046936, "step": 6289 }, { "epoch": 1.02, "learning_rate": 7.572486177080575e-07, "logits/chosen": -1.102123498916626, "logits/rejected": -0.9747598767280579, "logps/chosen": -134.4596710205078, "logps/rejected": -59.49628448486328, "loss": 1.0649, "rewards/accuracies": 0.0, "rewards/chosen": 1.3484848737716675, "rewards/margins": -0.20638728141784668, "rewards/rejected": 1.5548721551895142, "step": 6290 }, { "epoch": 1.02, "learning_rate": 7.571359125924532e-07, "logits/chosen": -0.5958311557769775, "logits/rejected": -0.5522467494010925, "logps/chosen": -76.0139389038086, "logps/rejected": -44.904048919677734, "loss": 0.8954, "rewards/accuracies": 0.0, "rewards/chosen": 0.038939666002988815, "rewards/margins": -0.9547573328018188, "rewards/rejected": 0.9936969876289368, "step": 6291 }, { "epoch": 1.02, "learning_rate": 7.570231897111906e-07, "logits/chosen": -0.5258913636207581, "logits/rejected": -0.608993411064148, "logps/chosen": -120.93463134765625, "logps/rejected": -52.040008544921875, "loss": 0.9415, "rewards/accuracies": 1.0, "rewards/chosen": 3.381239414215088, "rewards/margins": 1.7832825183868408, "rewards/rejected": 1.597956895828247, "step": 6292 }, { "epoch": 1.02, "learning_rate": 7.569104490720574e-07, "logits/chosen": -0.39189767837524414, "logits/rejected": -0.4010170102119446, "logps/chosen": -61.06208038330078, "logps/rejected": -78.17167663574219, "loss": 0.6886, "rewards/accuracies": 0.0, "rewards/chosen": 1.5971473455429077, "rewards/margins": -0.8416672945022583, "rewards/rejected": 2.438814640045166, "step": 6293 }, { "epoch": 1.02, "learning_rate": 7.56797690682843e-07, "logits/chosen": -0.5599962472915649, "logits/rejected": -0.33201706409454346, "logps/chosen": -91.59260559082031, "logps/rejected": -32.725616455078125, "loss": 1.3606, "rewards/accuracies": 1.0, "rewards/chosen": 3.58345103263855, "rewards/margins": 2.769867181777954, "rewards/rejected": 0.8135837912559509, "step": 6294 }, { "epoch": 1.02, "learning_rate": 7.566849145513381e-07, "logits/chosen": -0.709821343421936, "logits/rejected": -0.7389965057373047, "logps/chosen": -52.176849365234375, "logps/rejected": -52.391761779785156, "loss": 0.2518, "rewards/accuracies": 1.0, "rewards/chosen": 1.8002052307128906, "rewards/margins": 0.4490104913711548, "rewards/rejected": 1.3511947393417358, "step": 6295 }, { "epoch": 1.02, "learning_rate": 7.565721206853344e-07, "logits/chosen": -0.4883284866809845, "logits/rejected": -0.4737483561038971, "logps/chosen": -77.81665802001953, "logps/rejected": -54.21642303466797, "loss": 1.404, "rewards/accuracies": 0.0, "rewards/chosen": 1.0412887334823608, "rewards/margins": -0.6506690979003906, "rewards/rejected": 1.6919578313827515, "step": 6296 }, { "epoch": 1.02, "learning_rate": 7.564593090926248e-07, "logits/chosen": -0.8058741092681885, "logits/rejected": -0.8052693605422974, "logps/chosen": -116.51836395263672, "logps/rejected": -106.74455261230469, "loss": 0.9465, "rewards/accuracies": 0.0, "rewards/chosen": 1.2612892389297485, "rewards/margins": -1.591814398765564, "rewards/rejected": 2.8531036376953125, "step": 6297 }, { "epoch": 1.02, "learning_rate": 7.563464797810037e-07, "logits/chosen": -0.29622864723205566, "logits/rejected": -0.2717335522174835, "logps/chosen": -46.168800354003906, "logps/rejected": -117.16937255859375, "loss": 0.2253, "rewards/accuracies": 1.0, "rewards/chosen": 1.640238642692566, "rewards/margins": 1.6288098096847534, "rewards/rejected": 0.0114288330078125, "step": 6298 }, { "epoch": 1.02, "learning_rate": 7.562336327582663e-07, "logits/chosen": -0.6866641640663147, "logits/rejected": -0.7275497317314148, "logps/chosen": -59.3819580078125, "logps/rejected": -83.728515625, "loss": 1.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.7390213012695312, "rewards/margins": -1.804635763168335, "rewards/rejected": 2.543657064437866, "step": 6299 }, { "epoch": 1.02, "learning_rate": 7.561207680322095e-07, "logits/chosen": -0.6015467643737793, "logits/rejected": -0.47281596064567566, "logps/chosen": -76.98472595214844, "logps/rejected": -135.66143798828125, "loss": 1.3077, "rewards/accuracies": 0.0, "rewards/chosen": 2.210545301437378, "rewards/margins": -1.141523838043213, "rewards/rejected": 3.352069139480591, "step": 6300 }, { "epoch": 1.02, "learning_rate": 7.560078856106309e-07, "logits/chosen": -0.7831212282180786, "logits/rejected": -0.7277717590332031, "logps/chosen": -236.9542236328125, "logps/rejected": -139.7516326904297, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 1.4131561517715454, "rewards/margins": -0.9531844854354858, "rewards/rejected": 2.3663406372070312, "step": 6301 }, { "epoch": 1.02, "learning_rate": 7.5589498550133e-07, "logits/chosen": -0.9778866767883301, "logits/rejected": -0.9435974955558777, "logps/chosen": -53.765193939208984, "logps/rejected": -54.143531799316406, "loss": 1.3702, "rewards/accuracies": 1.0, "rewards/chosen": 1.775359034538269, "rewards/margins": 0.017712831497192383, "rewards/rejected": 1.7576462030410767, "step": 6302 }, { "epoch": 1.02, "learning_rate": 7.557820677121066e-07, "logits/chosen": -0.6100594997406006, "logits/rejected": -0.6285483241081238, "logps/chosen": -98.06759643554688, "logps/rejected": -99.32769775390625, "loss": 1.6286, "rewards/accuracies": 0.0, "rewards/chosen": 0.7605544924736023, "rewards/margins": -3.1805100440979004, "rewards/rejected": 3.9410645961761475, "step": 6303 }, { "epoch": 1.02, "learning_rate": 7.556691322507627e-07, "logits/chosen": -0.710698664188385, "logits/rejected": -0.7015419602394104, "logps/chosen": -278.6895751953125, "logps/rejected": -52.99311065673828, "loss": 0.1641, "rewards/accuracies": 1.0, "rewards/chosen": 3.5920989513397217, "rewards/margins": 1.312929391860962, "rewards/rejected": 2.2791695594787598, "step": 6304 }, { "epoch": 1.02, "learning_rate": 7.555561791251009e-07, "logits/chosen": -0.793668806552887, "logits/rejected": -0.802264392375946, "logps/chosen": -38.04240417480469, "logps/rejected": -28.534317016601562, "loss": 0.3714, "rewards/accuracies": 1.0, "rewards/chosen": 0.8537460565567017, "rewards/margins": 0.11243361234664917, "rewards/rejected": 0.7413124442100525, "step": 6305 }, { "epoch": 1.02, "learning_rate": 7.554432083429251e-07, "logits/chosen": -0.9680143594741821, "logits/rejected": -0.983604371547699, "logps/chosen": -44.734291076660156, "logps/rejected": -59.72925567626953, "loss": 1.5535, "rewards/accuracies": 0.0, "rewards/chosen": 1.5801349878311157, "rewards/margins": -0.6835469007492065, "rewards/rejected": 2.2636818885803223, "step": 6306 }, { "epoch": 1.02, "learning_rate": 7.553302199120408e-07, "logits/chosen": -0.8287537097930908, "logits/rejected": -0.652841329574585, "logps/chosen": -157.87655639648438, "logps/rejected": -48.41679000854492, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 5.475497722625732, "rewards/margins": 3.8122549057006836, "rewards/rejected": 1.6632426977157593, "step": 6307 }, { "epoch": 1.02, "learning_rate": 7.552172138402544e-07, "logits/chosen": -0.5861873030662537, "logits/rejected": -0.6241623163223267, "logps/chosen": -104.20613098144531, "logps/rejected": -118.25279998779297, "loss": 1.1218, "rewards/accuracies": 0.0, "rewards/chosen": 2.38224196434021, "rewards/margins": -1.8903448581695557, "rewards/rejected": 4.272586822509766, "step": 6308 }, { "epoch": 1.02, "learning_rate": 7.551041901353732e-07, "logits/chosen": 0.048843953758478165, "logits/rejected": 0.06574217230081558, "logps/chosen": -5.258509635925293, "logps/rejected": -5.041985988616943, "loss": 1.6973, "rewards/accuracies": 0.0, "rewards/chosen": -0.038542747497558594, "rewards/margins": -0.40841421484947205, "rewards/rejected": 0.36987146735191345, "step": 6309 }, { "epoch": 1.02, "learning_rate": 7.549911488052063e-07, "logits/chosen": -0.4514116644859314, "logits/rejected": -0.4514116644859314, "logps/chosen": -1.825703740119934, "logps/rejected": -1.825703740119934, "loss": 0.4877, "rewards/accuracies": 0.0, "rewards/chosen": 0.2328287661075592, "rewards/margins": 0.0, "rewards/rejected": 0.2328287661075592, "step": 6310 }, { "epoch": 1.02, "learning_rate": 7.548780898575639e-07, "logits/chosen": -0.7477807998657227, "logits/rejected": -0.7477807998657227, "logps/chosen": -69.51950073242188, "logps/rejected": -69.51950073242188, "loss": 1.0458, "rewards/accuracies": 0.0, "rewards/chosen": 1.611944556236267, "rewards/margins": 0.0, "rewards/rejected": 1.611944556236267, "step": 6311 }, { "epoch": 1.02, "learning_rate": 7.54765013300257e-07, "logits/chosen": -0.7872373461723328, "logits/rejected": -0.6574985384941101, "logps/chosen": -162.56561279296875, "logps/rejected": -51.3407096862793, "loss": 0.0531, "rewards/accuracies": 1.0, "rewards/chosen": 4.245706081390381, "rewards/margins": 2.3378992080688477, "rewards/rejected": 1.9078067541122437, "step": 6312 }, { "epoch": 1.02, "learning_rate": 7.546519191410983e-07, "logits/chosen": -0.7807378172874451, "logits/rejected": -0.7077154517173767, "logps/chosen": -100.28272247314453, "logps/rejected": -120.23701477050781, "loss": 0.7091, "rewards/accuracies": 0.0, "rewards/chosen": 3.3086998462677, "rewards/margins": -1.0759971141815186, "rewards/rejected": 4.384696960449219, "step": 6313 }, { "epoch": 1.02, "learning_rate": 7.545388073879017e-07, "logits/chosen": -0.2597619295120239, "logits/rejected": -0.28149205446243286, "logps/chosen": -101.09178161621094, "logps/rejected": -70.40953063964844, "loss": 1.0805, "rewards/accuracies": 1.0, "rewards/chosen": 1.325356364250183, "rewards/margins": 0.006723880767822266, "rewards/rejected": 1.3186324834823608, "step": 6314 }, { "epoch": 1.02, "learning_rate": 7.54425678048482e-07, "logits/chosen": -0.6568508148193359, "logits/rejected": -0.6978861689567566, "logps/chosen": -102.99527740478516, "logps/rejected": -53.56486511230469, "loss": 1.3354, "rewards/accuracies": 0.0, "rewards/chosen": 0.7940117120742798, "rewards/margins": -0.3341789245605469, "rewards/rejected": 1.1281906366348267, "step": 6315 }, { "epoch": 1.03, "learning_rate": 7.543125311306551e-07, "logits/chosen": -0.9433163404464722, "logits/rejected": -1.0582994222640991, "logps/chosen": -88.5144271850586, "logps/rejected": -103.93399047851562, "loss": 0.5481, "rewards/accuracies": 0.0, "rewards/chosen": 1.9901145696640015, "rewards/margins": -0.08856141567230225, "rewards/rejected": 2.0786759853363037, "step": 6316 }, { "epoch": 1.03, "learning_rate": 7.541993666422387e-07, "logits/chosen": -0.9262611865997314, "logits/rejected": -0.6875730156898499, "logps/chosen": -126.57382202148438, "logps/rejected": -42.190345764160156, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": 3.932403564453125, "rewards/margins": 3.662656784057617, "rewards/rejected": 0.2697467803955078, "step": 6317 }, { "epoch": 1.03, "learning_rate": 7.540861845910513e-07, "logits/chosen": -0.6912459135055542, "logits/rejected": -0.6106027960777283, "logps/chosen": -73.0132064819336, "logps/rejected": -65.17556762695312, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 1.9751685857772827, "rewards/margins": 0.6864745616912842, "rewards/rejected": 1.2886940240859985, "step": 6318 }, { "epoch": 1.03, "learning_rate": 7.539729849849128e-07, "logits/chosen": -0.6710186004638672, "logits/rejected": -0.6710186004638672, "logps/chosen": -60.253944396972656, "logps/rejected": -60.253944396972656, "loss": 0.3967, "rewards/accuracies": 0.0, "rewards/chosen": 1.6744499206542969, "rewards/margins": 0.0, "rewards/rejected": 1.6744499206542969, "step": 6319 }, { "epoch": 1.03, "learning_rate": 7.538597678316442e-07, "logits/chosen": -1.0223973989486694, "logits/rejected": -1.0208072662353516, "logps/chosen": -174.70217895507812, "logps/rejected": -100.49873352050781, "loss": 0.0419, "rewards/accuracies": 1.0, "rewards/chosen": 4.4319610595703125, "rewards/margins": 3.5322768688201904, "rewards/rejected": 0.8996841311454773, "step": 6320 }, { "epoch": 1.03, "learning_rate": 7.537465331390676e-07, "logits/chosen": -0.8856300115585327, "logits/rejected": -0.696725070476532, "logps/chosen": -132.28054809570312, "logps/rejected": -75.70610046386719, "loss": 0.4681, "rewards/accuracies": 0.0, "rewards/chosen": 1.7214173078536987, "rewards/margins": -0.00067901611328125, "rewards/rejected": 1.72209632396698, "step": 6321 }, { "epoch": 1.03, "learning_rate": 7.536332809150066e-07, "logits/chosen": -0.5769016742706299, "logits/rejected": -0.5809944868087769, "logps/chosen": -55.463714599609375, "logps/rejected": -82.87399291992188, "loss": 0.5354, "rewards/accuracies": 1.0, "rewards/chosen": 1.6274795532226562, "rewards/margins": 0.9899673461914062, "rewards/rejected": 0.63751220703125, "step": 6322 }, { "epoch": 1.03, "learning_rate": 7.535200111672856e-07, "logits/chosen": -0.7373144626617432, "logits/rejected": -0.8920193910598755, "logps/chosen": -99.36570739746094, "logps/rejected": -84.82715606689453, "loss": 1.8933, "rewards/accuracies": 0.0, "rewards/chosen": 0.8838996887207031, "rewards/margins": -3.254086971282959, "rewards/rejected": 4.137986660003662, "step": 6323 }, { "epoch": 1.03, "learning_rate": 7.53406723903731e-07, "logits/chosen": -0.5447301864624023, "logits/rejected": -0.5337493419647217, "logps/chosen": -69.16426086425781, "logps/rejected": -78.39521789550781, "loss": 0.9949, "rewards/accuracies": 0.0, "rewards/chosen": 1.8595093488693237, "rewards/margins": -0.15991747379302979, "rewards/rejected": 2.0194268226623535, "step": 6324 }, { "epoch": 1.03, "learning_rate": 7.532934191321692e-07, "logits/chosen": -0.9392358660697937, "logits/rejected": -0.872529149055481, "logps/chosen": -57.04315185546875, "logps/rejected": -36.33808898925781, "loss": 0.6508, "rewards/accuracies": 0.0, "rewards/chosen": 1.3213165998458862, "rewards/margins": -0.26114463806152344, "rewards/rejected": 1.5824612379074097, "step": 6325 }, { "epoch": 1.03, "learning_rate": 7.53180096860429e-07, "logits/chosen": -0.5404008626937866, "logits/rejected": -0.5266716480255127, "logps/chosen": -55.84370040893555, "logps/rejected": -78.08124542236328, "loss": 1.0375, "rewards/accuracies": 1.0, "rewards/chosen": 1.5677708387374878, "rewards/margins": 0.252658486366272, "rewards/rejected": 1.3151123523712158, "step": 6326 }, { "epoch": 1.03, "learning_rate": 7.530667570963397e-07, "logits/chosen": -0.7421334385871887, "logits/rejected": -0.6732391715049744, "logps/chosen": -73.05429077148438, "logps/rejected": -33.70941162109375, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 1.7490348815917969, "rewards/margins": 1.566033959388733, "rewards/rejected": 0.18300095200538635, "step": 6327 }, { "epoch": 1.03, "learning_rate": 7.529533998477319e-07, "logits/chosen": -0.6760983467102051, "logits/rejected": -0.6992814540863037, "logps/chosen": -57.8155517578125, "logps/rejected": -51.45626449584961, "loss": 0.3869, "rewards/accuracies": 0.0, "rewards/chosen": 1.3244065046310425, "rewards/margins": -0.09892153739929199, "rewards/rejected": 1.4233280420303345, "step": 6328 }, { "epoch": 1.03, "learning_rate": 7.528400251224378e-07, "logits/chosen": -0.5208228230476379, "logits/rejected": -0.5299978256225586, "logps/chosen": -64.1116943359375, "logps/rejected": -105.10437774658203, "loss": 0.4683, "rewards/accuracies": 0.0, "rewards/chosen": 2.6844024658203125, "rewards/margins": -0.4125022888183594, "rewards/rejected": 3.096904754638672, "step": 6329 }, { "epoch": 1.03, "learning_rate": 7.527266329282905e-07, "logits/chosen": -0.9030490517616272, "logits/rejected": -0.9614506363868713, "logps/chosen": -69.60012817382812, "logps/rejected": -115.49905395507812, "loss": 0.9674, "rewards/accuracies": 0.0, "rewards/chosen": 1.7982162237167358, "rewards/margins": -1.7721940279006958, "rewards/rejected": 3.5704102516174316, "step": 6330 }, { "epoch": 1.03, "learning_rate": 7.52613223273124e-07, "logits/chosen": -0.7648680806159973, "logits/rejected": -0.8338927030563354, "logps/chosen": -73.50732421875, "logps/rejected": -104.2104721069336, "loss": 1.2268, "rewards/accuracies": 0.0, "rewards/chosen": 1.8018783330917358, "rewards/margins": -2.2862725257873535, "rewards/rejected": 4.088150978088379, "step": 6331 }, { "epoch": 1.03, "learning_rate": 7.524997961647738e-07, "logits/chosen": -0.013032934628427029, "logits/rejected": -0.060639478266239166, "logps/chosen": -4.8752851486206055, "logps/rejected": -76.49946594238281, "loss": 1.3072, "rewards/accuracies": 0.0, "rewards/chosen": 0.011729431338608265, "rewards/margins": -0.5046676993370056, "rewards/rejected": 0.5163971185684204, "step": 6332 }, { "epoch": 1.03, "learning_rate": 7.523863516110771e-07, "logits/chosen": -0.7437136769294739, "logits/rejected": -0.7513222694396973, "logps/chosen": -9.702544212341309, "logps/rejected": -2.5205276012420654, "loss": 0.4352, "rewards/accuracies": 0.0, "rewards/chosen": 0.17701616883277893, "rewards/margins": -0.28060683608055115, "rewards/rejected": 0.4576230049133301, "step": 6333 }, { "epoch": 1.03, "learning_rate": 7.522728896198716e-07, "logits/chosen": -0.4504704773426056, "logits/rejected": -0.44684234261512756, "logps/chosen": -86.83393096923828, "logps/rejected": -89.58442687988281, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 1.2347573041915894, "rewards/margins": 0.9946289658546448, "rewards/rejected": 0.2401283234357834, "step": 6334 }, { "epoch": 1.03, "learning_rate": 7.521594101989965e-07, "logits/chosen": -0.5600578784942627, "logits/rejected": -0.5831871628761292, "logps/chosen": -69.11202239990234, "logps/rejected": -61.80787658691406, "loss": 2.0447, "rewards/accuracies": 0.0, "rewards/chosen": 0.933941662311554, "rewards/margins": -1.2041184902191162, "rewards/rejected": 2.1380600929260254, "step": 6335 }, { "epoch": 1.03, "learning_rate": 7.52045913356292e-07, "logits/chosen": -0.48074159026145935, "logits/rejected": -0.5200623273849487, "logps/chosen": -81.60172271728516, "logps/rejected": -103.93479919433594, "loss": 0.257, "rewards/accuracies": 1.0, "rewards/chosen": 1.0665420293807983, "rewards/margins": 0.4103240370750427, "rewards/rejected": 0.6562179923057556, "step": 6336 }, { "epoch": 1.03, "learning_rate": 7.519323990995998e-07, "logits/chosen": -0.6624056100845337, "logits/rejected": -0.6735607981681824, "logps/chosen": -54.901432037353516, "logps/rejected": -91.1807632446289, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 1.9458812475204468, "rewards/margins": 0.5672725439071655, "rewards/rejected": 1.3786087036132812, "step": 6337 }, { "epoch": 1.03, "learning_rate": 7.518188674367627e-07, "logits/chosen": -0.7981401085853577, "logits/rejected": -1.1490057706832886, "logps/chosen": -106.04336547851562, "logps/rejected": -33.577423095703125, "loss": 0.4271, "rewards/accuracies": 1.0, "rewards/chosen": 0.6138321161270142, "rewards/margins": 0.2769886255264282, "rewards/rejected": 0.33684349060058594, "step": 6338 }, { "epoch": 1.03, "learning_rate": 7.517053183756246e-07, "logits/chosen": -0.9062691926956177, "logits/rejected": -0.6241994500160217, "logps/chosen": -124.88803100585938, "logps/rejected": -24.92303466796875, "loss": 0.0372, "rewards/accuracies": 1.0, "rewards/chosen": 5.1170196533203125, "rewards/margins": 4.545592784881592, "rewards/rejected": 0.5714268088340759, "step": 6339 }, { "epoch": 1.03, "learning_rate": 7.515917519240304e-07, "logits/chosen": -0.6493205428123474, "logits/rejected": -0.6421349048614502, "logps/chosen": -102.39994049072266, "logps/rejected": -44.25398254394531, "loss": 0.5134, "rewards/accuracies": 0.0, "rewards/chosen": 0.9755393862724304, "rewards/margins": -0.5135185122489929, "rewards/rejected": 1.4890578985214233, "step": 6340 }, { "epoch": 1.03, "learning_rate": 7.51478168089827e-07, "logits/chosen": -0.803690493106842, "logits/rejected": -0.7607226371765137, "logps/chosen": -107.61092376708984, "logps/rejected": -69.98143005371094, "loss": 1.0621, "rewards/accuracies": 0.0, "rewards/chosen": 1.0789566040039062, "rewards/margins": -0.22002112865447998, "rewards/rejected": 1.2989777326583862, "step": 6341 }, { "epoch": 1.03, "learning_rate": 7.513645668808615e-07, "logits/chosen": -0.7620946764945984, "logits/rejected": -0.7130839824676514, "logps/chosen": -82.9263916015625, "logps/rejected": -91.20221710205078, "loss": 0.6072, "rewards/accuracies": 0.0, "rewards/chosen": 2.3748795986175537, "rewards/margins": -0.8165290355682373, "rewards/rejected": 3.191408634185791, "step": 6342 }, { "epoch": 1.03, "learning_rate": 7.51250948304983e-07, "logits/chosen": -0.470028817653656, "logits/rejected": -0.42597001791000366, "logps/chosen": -75.48705291748047, "logps/rejected": -84.36730194091797, "loss": 2.5266, "rewards/accuracies": 1.0, "rewards/chosen": 2.8487038612365723, "rewards/margins": 0.13320016860961914, "rewards/rejected": 2.715503692626953, "step": 6343 }, { "epoch": 1.03, "learning_rate": 7.511373123700413e-07, "logits/chosen": -0.8701663613319397, "logits/rejected": -0.781651496887207, "logps/chosen": -110.79191589355469, "logps/rejected": -184.5694580078125, "loss": 0.7771, "rewards/accuracies": 0.0, "rewards/chosen": 5.109890937805176, "rewards/margins": -1.2799863815307617, "rewards/rejected": 6.3898773193359375, "step": 6344 }, { "epoch": 1.03, "learning_rate": 7.510236590838877e-07, "logits/chosen": -0.4794047772884369, "logits/rejected": -0.4116571545600891, "logps/chosen": -59.50445556640625, "logps/rejected": -67.24736022949219, "loss": 0.3894, "rewards/accuracies": 1.0, "rewards/chosen": 1.2197052240371704, "rewards/margins": 0.06788098812103271, "rewards/rejected": 1.1518242359161377, "step": 6345 }, { "epoch": 1.03, "learning_rate": 7.509099884543744e-07, "logits/chosen": -0.9955959320068359, "logits/rejected": -0.852068305015564, "logps/chosen": -101.69306945800781, "logps/rejected": -34.98117446899414, "loss": 0.295, "rewards/accuracies": 1.0, "rewards/chosen": 0.6248695254325867, "rewards/margins": 0.4974491000175476, "rewards/rejected": 0.12742042541503906, "step": 6346 }, { "epoch": 1.03, "learning_rate": 7.50796300489355e-07, "logits/chosen": -0.37790781259536743, "logits/rejected": -0.4060055613517761, "logps/chosen": -53.36774444580078, "logps/rejected": -120.53208923339844, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 1.8634750843048096, "rewards/margins": 1.8757164478302002, "rewards/rejected": -0.012241363525390625, "step": 6347 }, { "epoch": 1.03, "learning_rate": 7.506825951966842e-07, "logits/chosen": -0.9585335850715637, "logits/rejected": -0.9734249711036682, "logps/chosen": -34.89107894897461, "logps/rejected": -15.071375846862793, "loss": 0.6051, "rewards/accuracies": 0.0, "rewards/chosen": 0.016366196796298027, "rewards/margins": -0.3562886416912079, "rewards/rejected": 0.37265482544898987, "step": 6348 }, { "epoch": 1.03, "learning_rate": 7.505688725842182e-07, "logits/chosen": -0.8635136485099792, "logits/rejected": -0.8714655637741089, "logps/chosen": -79.03060150146484, "logps/rejected": -129.21566772460938, "loss": 0.44, "rewards/accuracies": 0.0, "rewards/chosen": 1.7340439558029175, "rewards/margins": -0.19829559326171875, "rewards/rejected": 1.9323395490646362, "step": 6349 }, { "epoch": 1.03, "learning_rate": 7.504551326598138e-07, "logits/chosen": -0.6475086808204651, "logits/rejected": -0.6823261976242065, "logps/chosen": -67.86448669433594, "logps/rejected": -145.6221466064453, "loss": 1.6136, "rewards/accuracies": 0.0, "rewards/chosen": 2.3721985816955566, "rewards/margins": -3.176025390625, "rewards/rejected": 5.548223972320557, "step": 6350 }, { "epoch": 1.03, "learning_rate": 7.503413754313299e-07, "logits/chosen": -0.7410553097724915, "logits/rejected": -0.5102633833885193, "logps/chosen": -95.03797149658203, "logps/rejected": -77.14010620117188, "loss": 0.0748, "rewards/accuracies": 1.0, "rewards/chosen": 5.8462138175964355, "rewards/margins": 3.435283899307251, "rewards/rejected": 2.4109299182891846, "step": 6351 }, { "epoch": 1.03, "learning_rate": 7.502276009066255e-07, "logits/chosen": -1.1505999565124512, "logits/rejected": -1.0827816724777222, "logps/chosen": -75.53636932373047, "logps/rejected": -94.40283203125, "loss": 1.0243, "rewards/accuracies": 0.0, "rewards/chosen": 1.3779312372207642, "rewards/margins": -1.6941345930099487, "rewards/rejected": 3.072065830230713, "step": 6352 }, { "epoch": 1.03, "learning_rate": 7.501138090935615e-07, "logits/chosen": -0.5254020094871521, "logits/rejected": -0.5088868141174316, "logps/chosen": -47.367462158203125, "logps/rejected": -100.15736389160156, "loss": 0.4573, "rewards/accuracies": 1.0, "rewards/chosen": 1.7201827764511108, "rewards/margins": 0.6064895391464233, "rewards/rejected": 1.1136932373046875, "step": 6353 }, { "epoch": 1.03, "learning_rate": 7.5e-07, "logits/chosen": -1.0052539110183716, "logits/rejected": -0.9724612832069397, "logps/chosen": -108.42889404296875, "logps/rejected": -100.15985107421875, "loss": 1.2254, "rewards/accuracies": 0.0, "rewards/chosen": 0.7543182373046875, "rewards/margins": -2.252532958984375, "rewards/rejected": 3.0068511962890625, "step": 6354 }, { "epoch": 1.03, "learning_rate": 7.49886173633804e-07, "logits/chosen": -0.3981642723083496, "logits/rejected": -0.3462800979614258, "logps/chosen": -41.296531677246094, "logps/rejected": -63.66058349609375, "loss": 0.1928, "rewards/accuracies": 1.0, "rewards/chosen": 2.476943254470825, "rewards/margins": 1.0296226739883423, "rewards/rejected": 1.447320580482483, "step": 6355 }, { "epoch": 1.03, "learning_rate": 7.497723300028378e-07, "logits/chosen": -0.4645936191082001, "logits/rejected": -0.4565463960170746, "logps/chosen": -34.652671813964844, "logps/rejected": -108.54741668701172, "loss": 0.4905, "rewards/accuracies": 0.0, "rewards/chosen": 0.20329056680202484, "rewards/margins": -0.4754142761230469, "rewards/rejected": 0.6787048578262329, "step": 6356 }, { "epoch": 1.03, "learning_rate": 7.496584691149669e-07, "logits/chosen": -0.7460725903511047, "logits/rejected": -0.7025883793830872, "logps/chosen": -92.67347717285156, "logps/rejected": -61.214752197265625, "loss": 0.5122, "rewards/accuracies": 0.0, "rewards/chosen": 1.3510239124298096, "rewards/margins": -0.36734461784362793, "rewards/rejected": 1.7183685302734375, "step": 6357 }, { "epoch": 1.03, "learning_rate": 7.495445909780583e-07, "logits/chosen": -0.6754636764526367, "logits/rejected": -0.6822425723075867, "logps/chosen": -87.31376647949219, "logps/rejected": -67.46427917480469, "loss": 1.2094, "rewards/accuracies": 0.0, "rewards/chosen": 1.4887794256210327, "rewards/margins": -0.8028815984725952, "rewards/rejected": 2.291661024093628, "step": 6358 }, { "epoch": 1.03, "learning_rate": 7.494306955999796e-07, "logits/chosen": -0.8513581156730652, "logits/rejected": -0.8630918264389038, "logps/chosen": -68.80767822265625, "logps/rejected": -85.31871032714844, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 1.6798232793807983, "rewards/margins": 0.23699486255645752, "rewards/rejected": 1.4428284168243408, "step": 6359 }, { "epoch": 1.03, "learning_rate": 7.493167829885999e-07, "logits/chosen": -0.8837129473686218, "logits/rejected": -1.0032970905303955, "logps/chosen": -83.07920837402344, "logps/rejected": -70.39002227783203, "loss": 0.5676, "rewards/accuracies": 0.0, "rewards/chosen": 2.9667725563049316, "rewards/margins": -0.600151777267456, "rewards/rejected": 3.5669243335723877, "step": 6360 }, { "epoch": 1.03, "learning_rate": 7.492028531517895e-07, "logits/chosen": -0.8484466075897217, "logits/rejected": -0.7089875340461731, "logps/chosen": -118.80036926269531, "logps/rejected": -144.75338745117188, "loss": 0.3802, "rewards/accuracies": 1.0, "rewards/chosen": 6.1182146072387695, "rewards/margins": 1.0127577781677246, "rewards/rejected": 5.105456829071045, "step": 6361 }, { "epoch": 1.03, "learning_rate": 7.490889060974201e-07, "logits/chosen": -0.9351840615272522, "logits/rejected": -0.742723822593689, "logps/chosen": -89.65103149414062, "logps/rejected": -34.447059631347656, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 4.332623481750488, "rewards/margins": 3.9153833389282227, "rewards/rejected": 0.4172401428222656, "step": 6362 }, { "epoch": 1.03, "learning_rate": 7.48974941833364e-07, "logits/chosen": -0.32916077971458435, "logits/rejected": -0.27036184072494507, "logps/chosen": -104.1982421875, "logps/rejected": -58.552764892578125, "loss": 0.881, "rewards/accuracies": 0.0, "rewards/chosen": 0.6556442379951477, "rewards/margins": -1.1348328590393066, "rewards/rejected": 1.7904770374298096, "step": 6363 }, { "epoch": 1.03, "learning_rate": 7.488609603674954e-07, "logits/chosen": -1.2766574621200562, "logits/rejected": -1.2284979820251465, "logps/chosen": -78.11205291748047, "logps/rejected": -82.04415893554688, "loss": 0.6365, "rewards/accuracies": 1.0, "rewards/chosen": 1.3948379755020142, "rewards/margins": 0.7493156790733337, "rewards/rejected": 0.6455222964286804, "step": 6364 }, { "epoch": 1.03, "learning_rate": 7.487469617076892e-07, "logits/chosen": -1.120164394378662, "logits/rejected": -1.1217470169067383, "logps/chosen": -59.40102005004883, "logps/rejected": -83.83676147460938, "loss": 2.2835, "rewards/accuracies": 0.0, "rewards/chosen": 1.6669368743896484, "rewards/margins": -1.8535733222961426, "rewards/rejected": 3.520510196685791, "step": 6365 }, { "epoch": 1.03, "learning_rate": 7.486329458618215e-07, "logits/chosen": -0.5514474511146545, "logits/rejected": -0.3849925100803375, "logps/chosen": -71.58551788330078, "logps/rejected": -20.57655143737793, "loss": 0.2695, "rewards/accuracies": 1.0, "rewards/chosen": 1.9925163984298706, "rewards/margins": 1.7543509006500244, "rewards/rejected": 0.238165482878685, "step": 6366 }, { "epoch": 1.03, "learning_rate": 7.485189128377698e-07, "logits/chosen": -0.7903954982757568, "logits/rejected": -0.7827180624008179, "logps/chosen": -80.7161865234375, "logps/rejected": -113.84529113769531, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 1.2572907209396362, "rewards/margins": 0.23291254043579102, "rewards/rejected": 1.0243781805038452, "step": 6367 }, { "epoch": 1.03, "learning_rate": 7.484048626434128e-07, "logits/chosen": -0.5495706796646118, "logits/rejected": -0.3943820893764496, "logps/chosen": -55.30744171142578, "logps/rejected": -16.10930061340332, "loss": 0.7542, "rewards/accuracies": 1.0, "rewards/chosen": 2.175060272216797, "rewards/margins": 1.45832097530365, "rewards/rejected": 0.716739296913147, "step": 6368 }, { "epoch": 1.03, "learning_rate": 7.482907952866302e-07, "logits/chosen": -0.8146768808364868, "logits/rejected": -0.8040410280227661, "logps/chosen": -53.075218200683594, "logps/rejected": -59.407249450683594, "loss": 1.0919, "rewards/accuracies": 0.0, "rewards/chosen": 1.0730431079864502, "rewards/margins": -1.0599091053009033, "rewards/rejected": 2.1329522132873535, "step": 6369 }, { "epoch": 1.03, "learning_rate": 7.481767107753029e-07, "logits/chosen": -0.6760631203651428, "logits/rejected": -0.5150636434555054, "logps/chosen": -100.38291931152344, "logps/rejected": -17.580245971679688, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": 2.7369155883789062, "rewards/margins": 2.4630699157714844, "rewards/rejected": 0.2738456726074219, "step": 6370 }, { "epoch": 1.03, "learning_rate": 7.480626091173132e-07, "logits/chosen": -0.6446614861488342, "logits/rejected": -0.5969154834747314, "logps/chosen": -61.562469482421875, "logps/rejected": -103.50860595703125, "loss": 0.2497, "rewards/accuracies": 1.0, "rewards/chosen": 2.0420243740081787, "rewards/margins": 1.3797616958618164, "rewards/rejected": 0.6622627377510071, "step": 6371 }, { "epoch": 1.03, "learning_rate": 7.479484903205444e-07, "logits/chosen": -0.692680299282074, "logits/rejected": -0.524085521697998, "logps/chosen": -100.76652526855469, "logps/rejected": -19.05575942993164, "loss": 1.3707, "rewards/accuracies": 1.0, "rewards/chosen": 1.6883087158203125, "rewards/margins": 1.4181463718414307, "rewards/rejected": 0.2701624035835266, "step": 6372 }, { "epoch": 1.03, "learning_rate": 7.478343543928812e-07, "logits/chosen": -0.6229351758956909, "logits/rejected": -0.6229351758956909, "logps/chosen": -86.5917739868164, "logps/rejected": -86.5917739868164, "loss": 0.3931, "rewards/accuracies": 0.0, "rewards/chosen": 0.8357963562011719, "rewards/margins": 0.0, "rewards/rejected": 0.8357963562011719, "step": 6373 }, { "epoch": 1.03, "learning_rate": 7.477202013422089e-07, "logits/chosen": -0.39676737785339355, "logits/rejected": -0.3796822428703308, "logps/chosen": -84.41351318359375, "logps/rejected": -47.98374938964844, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 1.2533798217773438, "rewards/margins": -0.6339095830917358, "rewards/rejected": 1.8872894048690796, "step": 6374 }, { "epoch": 1.03, "learning_rate": 7.476060311764148e-07, "logits/chosen": -0.28805801272392273, "logits/rejected": -0.3074527978897095, "logps/chosen": -55.06542205810547, "logps/rejected": -105.04071044921875, "loss": 0.7121, "rewards/accuracies": 1.0, "rewards/chosen": 1.610634684562683, "rewards/margins": 0.500922441482544, "rewards/rejected": 1.1097122430801392, "step": 6375 }, { "epoch": 1.03, "learning_rate": 7.474918439033868e-07, "logits/chosen": -0.6642339825630188, "logits/rejected": -0.6638065576553345, "logps/chosen": -133.81378173828125, "logps/rejected": -130.96067810058594, "loss": 0.6674, "rewards/accuracies": 0.0, "rewards/chosen": 1.1440811157226562, "rewards/margins": -1.0278778076171875, "rewards/rejected": 2.1719589233398438, "step": 6376 }, { "epoch": 1.04, "learning_rate": 7.473776395310142e-07, "logits/chosen": -0.6132321357727051, "logits/rejected": -0.48877352476119995, "logps/chosen": -104.10646057128906, "logps/rejected": -81.61141967773438, "loss": 0.3702, "rewards/accuracies": 1.0, "rewards/chosen": 2.6155335903167725, "rewards/margins": 0.30476927757263184, "rewards/rejected": 2.3107643127441406, "step": 6377 }, { "epoch": 1.04, "learning_rate": 7.472634180671873e-07, "logits/chosen": -0.42290163040161133, "logits/rejected": -0.4224424362182617, "logps/chosen": -6.085303783416748, "logps/rejected": -17.237438201904297, "loss": 0.7121, "rewards/accuracies": 1.0, "rewards/chosen": 0.07102465629577637, "rewards/margins": 0.23753905296325684, "rewards/rejected": -0.16651439666748047, "step": 6378 }, { "epoch": 1.04, "learning_rate": 7.47149179519798e-07, "logits/chosen": -0.7751091122627258, "logits/rejected": -0.7800890803337097, "logps/chosen": -68.62602233886719, "logps/rejected": -83.20696258544922, "loss": 0.7611, "rewards/accuracies": 0.0, "rewards/chosen": 1.9710861444473267, "rewards/margins": -0.46777260303497314, "rewards/rejected": 2.4388587474823, "step": 6379 }, { "epoch": 1.04, "learning_rate": 7.470349238967388e-07, "logits/chosen": -0.8097964525222778, "logits/rejected": -0.6790568828582764, "logps/chosen": -70.60209655761719, "logps/rejected": -233.9018096923828, "loss": 3.394, "rewards/accuracies": 0.0, "rewards/chosen": 3.2926528453826904, "rewards/margins": -4.653106689453125, "rewards/rejected": 7.9457597732543945, "step": 6380 }, { "epoch": 1.04, "learning_rate": 7.469206512059038e-07, "logits/chosen": -0.542890727519989, "logits/rejected": -0.5218287706375122, "logps/chosen": -56.01968002319336, "logps/rejected": -45.38261032104492, "loss": 0.9779, "rewards/accuracies": 0.0, "rewards/chosen": 0.7388687133789062, "rewards/margins": -1.0728000402450562, "rewards/rejected": 1.8116687536239624, "step": 6381 }, { "epoch": 1.04, "learning_rate": 7.468063614551883e-07, "logits/chosen": -0.4298362135887146, "logits/rejected": -0.42968815565109253, "logps/chosen": -5.798168182373047, "logps/rejected": -25.147624969482422, "loss": 1.7996, "rewards/accuracies": 1.0, "rewards/chosen": 0.2417333573102951, "rewards/margins": 0.3604412078857422, "rewards/rejected": -0.11870785057544708, "step": 6382 }, { "epoch": 1.04, "learning_rate": 7.466920546524885e-07, "logits/chosen": -0.6801120042800903, "logits/rejected": -0.676463782787323, "logps/chosen": -48.967742919921875, "logps/rejected": -66.58073425292969, "loss": 0.562, "rewards/accuracies": 1.0, "rewards/chosen": 1.3493813276290894, "rewards/margins": 0.17564702033996582, "rewards/rejected": 1.1737343072891235, "step": 6383 }, { "epoch": 1.04, "learning_rate": 7.46577730805702e-07, "logits/chosen": -0.3102712035179138, "logits/rejected": -0.28885191679000854, "logps/chosen": -74.44596862792969, "logps/rejected": -102.22354888916016, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 2.187303304672241, "rewards/margins": 0.2540414333343506, "rewards/rejected": 1.9332618713378906, "step": 6384 }, { "epoch": 1.04, "learning_rate": 7.464633899227272e-07, "logits/chosen": -0.7923809885978699, "logits/rejected": -0.8216886520385742, "logps/chosen": -122.23796844482422, "logps/rejected": -163.52894592285156, "loss": 0.885, "rewards/accuracies": 0.0, "rewards/chosen": 0.3876609802246094, "rewards/margins": -1.455828070640564, "rewards/rejected": 1.8434890508651733, "step": 6385 }, { "epoch": 1.04, "learning_rate": 7.463490320114645e-07, "logits/chosen": -0.596460223197937, "logits/rejected": -0.596460223197937, "logps/chosen": -63.01713562011719, "logps/rejected": -63.01713562011719, "loss": 0.6842, "rewards/accuracies": 0.0, "rewards/chosen": 1.8238228559494019, "rewards/margins": 0.0, "rewards/rejected": 1.8238228559494019, "step": 6386 }, { "epoch": 1.04, "learning_rate": 7.462346570798146e-07, "logits/chosen": -0.5704487562179565, "logits/rejected": -0.6857699155807495, "logps/chosen": -44.806312561035156, "logps/rejected": -113.90381622314453, "loss": 1.3203, "rewards/accuracies": 0.0, "rewards/chosen": 2.0397286415100098, "rewards/margins": -0.17151951789855957, "rewards/rejected": 2.2112481594085693, "step": 6387 }, { "epoch": 1.04, "learning_rate": 7.461202651356797e-07, "logits/chosen": -0.6027364134788513, "logits/rejected": -0.6159672737121582, "logps/chosen": -87.09638977050781, "logps/rejected": -84.83839416503906, "loss": 0.5715, "rewards/accuracies": 0.0, "rewards/chosen": 0.9641219973564148, "rewards/margins": -0.02252429723739624, "rewards/rejected": 0.986646294593811, "step": 6388 }, { "epoch": 1.04, "learning_rate": 7.460058561869634e-07, "logits/chosen": -0.47503843903541565, "logits/rejected": -0.47988376021385193, "logps/chosen": -2.8090405464172363, "logps/rejected": -1.2568926811218262, "loss": 0.4727, "rewards/accuracies": 0.0, "rewards/chosen": 0.09679090976715088, "rewards/margins": -0.10633242130279541, "rewards/rejected": 0.2031233310699463, "step": 6389 }, { "epoch": 1.04, "learning_rate": 7.458914302415701e-07, "logits/chosen": -0.08509808778762817, "logits/rejected": -0.08509808778762817, "logps/chosen": -1.8261737823486328, "logps/rejected": -1.8261737823486328, "loss": 0.5605, "rewards/accuracies": 0.0, "rewards/chosen": 0.17053385078907013, "rewards/margins": 0.0, "rewards/rejected": 0.17053385078907013, "step": 6390 }, { "epoch": 1.04, "learning_rate": 7.457769873074056e-07, "logits/chosen": -0.7120839357376099, "logits/rejected": -0.6370808482170105, "logps/chosen": -101.26960754394531, "logps/rejected": -166.26211547851562, "loss": 0.5553, "rewards/accuracies": 0.0, "rewards/chosen": 1.451734185218811, "rewards/margins": -0.6525474786758423, "rewards/rejected": 2.1042816638946533, "step": 6391 }, { "epoch": 1.04, "learning_rate": 7.456625273923769e-07, "logits/chosen": -0.2130924016237259, "logits/rejected": -0.13478955626487732, "logps/chosen": -79.4714584350586, "logps/rejected": -86.29635620117188, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": 2.360427141189575, "rewards/margins": 0.09286642074584961, "rewards/rejected": 2.2675607204437256, "step": 6392 }, { "epoch": 1.04, "learning_rate": 7.455480505043919e-07, "logits/chosen": -0.9769790172576904, "logits/rejected": -0.7309265732765198, "logps/chosen": -115.11138916015625, "logps/rejected": -68.21742248535156, "loss": 0.2704, "rewards/accuracies": 1.0, "rewards/chosen": 6.1621246337890625, "rewards/margins": 4.130372047424316, "rewards/rejected": 2.031752824783325, "step": 6393 }, { "epoch": 1.04, "learning_rate": 7.454335566513602e-07, "logits/chosen": -0.9218780398368835, "logits/rejected": -0.921332061290741, "logps/chosen": -103.3589859008789, "logps/rejected": -96.76716613769531, "loss": 0.8478, "rewards/accuracies": 1.0, "rewards/chosen": 1.8436371088027954, "rewards/margins": 0.28041911125183105, "rewards/rejected": 1.5632179975509644, "step": 6394 }, { "epoch": 1.04, "learning_rate": 7.45319045841192e-07, "logits/chosen": -0.8454092741012573, "logits/rejected": -0.7890502214431763, "logps/chosen": -38.81551742553711, "logps/rejected": -57.511634826660156, "loss": 0.5028, "rewards/accuracies": 0.0, "rewards/chosen": 1.4121555089950562, "rewards/margins": -0.35387086868286133, "rewards/rejected": 1.7660263776779175, "step": 6395 }, { "epoch": 1.04, "learning_rate": 7.452045180817989e-07, "logits/chosen": -0.9184952974319458, "logits/rejected": -0.9820871353149414, "logps/chosen": -100.22920227050781, "logps/rejected": -98.77021789550781, "loss": 0.3562, "rewards/accuracies": 1.0, "rewards/chosen": 5.266154766082764, "rewards/margins": 0.8759326934814453, "rewards/rejected": 4.390222072601318, "step": 6396 }, { "epoch": 1.04, "learning_rate": 7.450899733810937e-07, "logits/chosen": -0.9929563999176025, "logits/rejected": -0.8901425004005432, "logps/chosen": -95.51702117919922, "logps/rejected": -31.99445343017578, "loss": 1.3529, "rewards/accuracies": 1.0, "rewards/chosen": 0.6091529726982117, "rewards/margins": 0.6838355660438538, "rewards/rejected": -0.07468261569738388, "step": 6397 }, { "epoch": 1.04, "learning_rate": 7.449754117469904e-07, "logits/chosen": -0.8329678773880005, "logits/rejected": -0.7706392407417297, "logps/chosen": -83.45446014404297, "logps/rejected": -72.39801788330078, "loss": 0.6123, "rewards/accuracies": 0.0, "rewards/chosen": 1.7373298406600952, "rewards/margins": -0.8123277425765991, "rewards/rejected": 2.5496575832366943, "step": 6398 }, { "epoch": 1.04, "learning_rate": 7.448608331874042e-07, "logits/chosen": -0.6682800650596619, "logits/rejected": -0.6903741955757141, "logps/chosen": -53.20963668823242, "logps/rejected": -48.84583282470703, "loss": 0.6609, "rewards/accuracies": 0.0, "rewards/chosen": 1.7862614393234253, "rewards/margins": -0.4550570249557495, "rewards/rejected": 2.241318464279175, "step": 6399 }, { "epoch": 1.04, "learning_rate": 7.447462377102514e-07, "logits/chosen": -0.19764316082000732, "logits/rejected": -0.20452113449573517, "logps/chosen": -10.055647850036621, "logps/rejected": -3.91491436958313, "loss": 0.8388, "rewards/accuracies": 0.0, "rewards/chosen": -0.07747440785169601, "rewards/margins": -0.38090619444847107, "rewards/rejected": 0.30343177914619446, "step": 6400 }, { "epoch": 1.04, "learning_rate": 7.446316253234493e-07, "logits/chosen": -0.8235064148902893, "logits/rejected": -0.8284500241279602, "logps/chosen": -92.61312103271484, "logps/rejected": -128.26736450195312, "loss": 0.4255, "rewards/accuracies": 0.0, "rewards/chosen": 1.449439287185669, "rewards/margins": -0.27696144580841064, "rewards/rejected": 1.7264007329940796, "step": 6401 }, { "epoch": 1.04, "learning_rate": 7.445169960349166e-07, "logits/chosen": -0.8590577840805054, "logits/rejected": -0.7320399880409241, "logps/chosen": -97.82609558105469, "logps/rejected": -37.82461166381836, "loss": 0.4581, "rewards/accuracies": 0.0, "rewards/chosen": 1.1699364185333252, "rewards/margins": -0.20203626155853271, "rewards/rejected": 1.371972680091858, "step": 6402 }, { "epoch": 1.04, "learning_rate": 7.444023498525731e-07, "logits/chosen": -0.6095642447471619, "logits/rejected": -0.4788082540035248, "logps/chosen": -134.46803283691406, "logps/rejected": -83.66541290283203, "loss": 0.1579, "rewards/accuracies": 1.0, "rewards/chosen": 4.8465776443481445, "rewards/margins": 2.203690528869629, "rewards/rejected": 2.6428871154785156, "step": 6403 }, { "epoch": 1.04, "learning_rate": 7.442876867843398e-07, "logits/chosen": -0.8298590183258057, "logits/rejected": -0.773015022277832, "logps/chosen": -56.74134826660156, "logps/rejected": -73.11697387695312, "loss": 0.4549, "rewards/accuracies": 1.0, "rewards/chosen": 1.6384719610214233, "rewards/margins": 0.018982648849487305, "rewards/rejected": 1.619489312171936, "step": 6404 }, { "epoch": 1.04, "learning_rate": 7.441730068381389e-07, "logits/chosen": -0.5963831543922424, "logits/rejected": -0.6092291474342346, "logps/chosen": -56.98303985595703, "logps/rejected": -89.10336303710938, "loss": 0.3355, "rewards/accuracies": 1.0, "rewards/chosen": 0.748883843421936, "rewards/margins": 0.23428577184677124, "rewards/rejected": 0.5145980715751648, "step": 6405 }, { "epoch": 1.04, "learning_rate": 7.440583100218935e-07, "logits/chosen": -0.7353899478912354, "logits/rejected": -0.6875357627868652, "logps/chosen": -105.06824493408203, "logps/rejected": -52.802955627441406, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": 4.570076942443848, "rewards/margins": 1.4692597389221191, "rewards/rejected": 3.1008172035217285, "step": 6406 }, { "epoch": 1.04, "learning_rate": 7.439435963435283e-07, "logits/chosen": -0.37268102169036865, "logits/rejected": -0.3562301695346832, "logps/chosen": -14.304198265075684, "logps/rejected": -19.085844039916992, "loss": 2.0342, "rewards/accuracies": 1.0, "rewards/chosen": 0.6780919432640076, "rewards/margins": 0.45181554555892944, "rewards/rejected": 0.22627639770507812, "step": 6407 }, { "epoch": 1.04, "learning_rate": 7.438288658109687e-07, "logits/chosen": -0.5912667512893677, "logits/rejected": -0.6505515575408936, "logps/chosen": -152.1826171875, "logps/rejected": -94.7220687866211, "loss": 0.0663, "rewards/accuracies": 1.0, "rewards/chosen": 2.982440233230591, "rewards/margins": 1.9769386053085327, "rewards/rejected": 1.005501627922058, "step": 6408 }, { "epoch": 1.04, "learning_rate": 7.437141184321416e-07, "logits/chosen": -0.7868590354919434, "logits/rejected": -0.5896843671798706, "logps/chosen": -60.013118743896484, "logps/rejected": -21.286108016967773, "loss": 0.0821, "rewards/accuracies": 1.0, "rewards/chosen": 2.88999605178833, "rewards/margins": 2.571178436279297, "rewards/rejected": 0.31881770491600037, "step": 6409 }, { "epoch": 1.04, "learning_rate": 7.43599354214975e-07, "logits/chosen": -0.2546195983886719, "logits/rejected": -0.1991925835609436, "logps/chosen": -67.81672668457031, "logps/rejected": -49.120914459228516, "loss": 0.2372, "rewards/accuracies": 1.0, "rewards/chosen": 1.928052544593811, "rewards/margins": 0.890533447265625, "rewards/rejected": 1.037519097328186, "step": 6410 }, { "epoch": 1.04, "learning_rate": 7.43484573167398e-07, "logits/chosen": -0.7098594307899475, "logits/rejected": -0.6678978800773621, "logps/chosen": -81.63330078125, "logps/rejected": -71.93911743164062, "loss": 1.1574, "rewards/accuracies": 0.0, "rewards/chosen": 1.43828284740448, "rewards/margins": -1.98552405834198, "rewards/rejected": 3.42380690574646, "step": 6411 }, { "epoch": 1.04, "learning_rate": 7.433697752973407e-07, "logits/chosen": -1.2293128967285156, "logits/rejected": -1.2317826747894287, "logps/chosen": -54.40599822998047, "logps/rejected": -17.887039184570312, "loss": 0.3932, "rewards/accuracies": 1.0, "rewards/chosen": 1.6508255004882812, "rewards/margins": 0.2754288911819458, "rewards/rejected": 1.3753966093063354, "step": 6412 }, { "epoch": 1.04, "learning_rate": 7.43254960612735e-07, "logits/chosen": -0.42746293544769287, "logits/rejected": -0.2593894600868225, "logps/chosen": -118.51699829101562, "logps/rejected": -73.86210632324219, "loss": 0.2175, "rewards/accuracies": 1.0, "rewards/chosen": 4.280728340148926, "rewards/margins": 2.2964320182800293, "rewards/rejected": 1.984296441078186, "step": 6413 }, { "epoch": 1.04, "learning_rate": 7.43140129121513e-07, "logits/chosen": -0.8792037963867188, "logits/rejected": -0.9032029509544373, "logps/chosen": -115.08245849609375, "logps/rejected": -137.05653381347656, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": 5.4312744140625, "rewards/margins": 0.9475522041320801, "rewards/rejected": 4.48372220993042, "step": 6414 }, { "epoch": 1.04, "learning_rate": 7.430252808316088e-07, "logits/chosen": -0.8013454079627991, "logits/rejected": -0.8952101469039917, "logps/chosen": -173.20973205566406, "logps/rejected": -209.2095947265625, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 9.039384841918945, "rewards/margins": 3.088047981262207, "rewards/rejected": 5.951336860656738, "step": 6415 }, { "epoch": 1.04, "learning_rate": 7.429104157509573e-07, "logits/chosen": -0.6665811538696289, "logits/rejected": -0.7434557676315308, "logps/chosen": -206.45452880859375, "logps/rejected": -47.488006591796875, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": 5.732202053070068, "rewards/margins": 4.085686206817627, "rewards/rejected": 1.6465157270431519, "step": 6416 }, { "epoch": 1.04, "learning_rate": 7.427955338874942e-07, "logits/chosen": -0.7030638456344604, "logits/rejected": -0.596198558807373, "logps/chosen": -60.59858322143555, "logps/rejected": -66.59452819824219, "loss": 1.5664, "rewards/accuracies": 1.0, "rewards/chosen": 2.4740657806396484, "rewards/margins": 1.5619258880615234, "rewards/rejected": 0.912139892578125, "step": 6417 }, { "epoch": 1.04, "learning_rate": 7.426806352491573e-07, "logits/chosen": -0.7393778562545776, "logits/rejected": -0.47403404116630554, "logps/chosen": -195.68478393554688, "logps/rejected": -141.7991180419922, "loss": 0.2776, "rewards/accuracies": 1.0, "rewards/chosen": 4.534114360809326, "rewards/margins": 0.5393373966217041, "rewards/rejected": 3.994776964187622, "step": 6418 }, { "epoch": 1.04, "learning_rate": 7.425657198438848e-07, "logits/chosen": -0.9233951568603516, "logits/rejected": -0.8866878747940063, "logps/chosen": -88.39496612548828, "logps/rejected": -81.70036315917969, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 2.845740556716919, "rewards/margins": 1.7284049987792969, "rewards/rejected": 1.117335557937622, "step": 6419 }, { "epoch": 1.04, "learning_rate": 7.424507876796162e-07, "logits/chosen": -0.5121632814407349, "logits/rejected": -0.505583643913269, "logps/chosen": -62.94690704345703, "logps/rejected": -62.90003204345703, "loss": 0.6504, "rewards/accuracies": 0.0, "rewards/chosen": 1.3018531799316406, "rewards/margins": -0.6792175769805908, "rewards/rejected": 1.9810707569122314, "step": 6420 }, { "epoch": 1.04, "learning_rate": 7.423358387642922e-07, "logits/chosen": -0.6072207689285278, "logits/rejected": -0.5991325378417969, "logps/chosen": -73.2614974975586, "logps/rejected": -76.82445526123047, "loss": 1.5255, "rewards/accuracies": 0.0, "rewards/chosen": 1.4628394842147827, "rewards/margins": -0.13005375862121582, "rewards/rejected": 1.5928932428359985, "step": 6421 }, { "epoch": 1.04, "learning_rate": 7.422208731058547e-07, "logits/chosen": -0.7729998826980591, "logits/rejected": -0.743485152721405, "logps/chosen": -57.74589157104492, "logps/rejected": -19.72979736328125, "loss": 0.3455, "rewards/accuracies": 1.0, "rewards/chosen": 0.8237361907958984, "rewards/margins": 0.5772247314453125, "rewards/rejected": 0.24651145935058594, "step": 6422 }, { "epoch": 1.04, "learning_rate": 7.42105890712247e-07, "logits/chosen": -0.663195013999939, "logits/rejected": -0.663195013999939, "logps/chosen": -76.4688949584961, "logps/rejected": -76.4688949584961, "loss": 0.3915, "rewards/accuracies": 0.0, "rewards/chosen": 3.0645484924316406, "rewards/margins": 0.0, "rewards/rejected": 3.0645484924316406, "step": 6423 }, { "epoch": 1.04, "learning_rate": 7.41990891591413e-07, "logits/chosen": -1.0211719274520874, "logits/rejected": -0.9027846455574036, "logps/chosen": -131.08969116210938, "logps/rejected": -49.77261734008789, "loss": 0.7948, "rewards/accuracies": 0.0, "rewards/chosen": 0.19935913383960724, "rewards/margins": -1.2932285070419312, "rewards/rejected": 1.492587685585022, "step": 6424 }, { "epoch": 1.04, "learning_rate": 7.41875875751298e-07, "logits/chosen": -1.0010039806365967, "logits/rejected": -1.004532814025879, "logps/chosen": -57.539894104003906, "logps/rejected": -59.9724006652832, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 3.3565680980682373, "rewards/margins": 0.7384395599365234, "rewards/rejected": 2.618128538131714, "step": 6425 }, { "epoch": 1.04, "learning_rate": 7.417608431998486e-07, "logits/chosen": -0.39476755261421204, "logits/rejected": -0.3714962899684906, "logps/chosen": -60.43315887451172, "logps/rejected": -84.01129150390625, "loss": 0.954, "rewards/accuracies": 0.0, "rewards/chosen": 0.9536041617393494, "rewards/margins": -1.030888319015503, "rewards/rejected": 1.984492540359497, "step": 6426 }, { "epoch": 1.04, "learning_rate": 7.416457939450126e-07, "logits/chosen": -0.7778583765029907, "logits/rejected": -0.7323752045631409, "logps/chosen": -133.8673095703125, "logps/rejected": -202.62120056152344, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": 5.478585720062256, "rewards/margins": 3.80830979347229, "rewards/rejected": 1.6702759265899658, "step": 6427 }, { "epoch": 1.04, "learning_rate": 7.415307279947389e-07, "logits/chosen": -0.330191433429718, "logits/rejected": -0.33408254384994507, "logps/chosen": -178.80087280273438, "logps/rejected": -111.88712310791016, "loss": 0.5224, "rewards/accuracies": 0.0, "rewards/chosen": 2.5134644508361816, "rewards/margins": -0.5110876560211182, "rewards/rejected": 3.0245521068573, "step": 6428 }, { "epoch": 1.04, "learning_rate": 7.41415645356977e-07, "logits/chosen": -0.5908761620521545, "logits/rejected": -0.5569514036178589, "logps/chosen": -101.2118911743164, "logps/rejected": -68.31245422363281, "loss": 1.1166, "rewards/accuracies": 0.0, "rewards/chosen": 1.0697479248046875, "rewards/margins": -0.7143150568008423, "rewards/rejected": 1.7840629816055298, "step": 6429 }, { "epoch": 1.04, "learning_rate": 7.413005460396784e-07, "logits/chosen": -0.7576590180397034, "logits/rejected": -0.7425323724746704, "logps/chosen": -161.50234985351562, "logps/rejected": -149.57382202148438, "loss": 0.4816, "rewards/accuracies": 0.0, "rewards/chosen": 5.125509738922119, "rewards/margins": -0.04645681381225586, "rewards/rejected": 5.171966552734375, "step": 6430 }, { "epoch": 1.04, "learning_rate": 7.411854300507953e-07, "logits/chosen": -0.3706935942173004, "logits/rejected": -0.5711138844490051, "logps/chosen": -66.68809509277344, "logps/rejected": -95.75447845458984, "loss": 1.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9794090390205383, "rewards/margins": -3.2902286052703857, "rewards/rejected": 4.269637584686279, "step": 6431 }, { "epoch": 1.04, "learning_rate": 7.41070297398281e-07, "logits/chosen": -0.26145491003990173, "logits/rejected": -0.3446016311645508, "logps/chosen": -79.17536926269531, "logps/rejected": -104.29147338867188, "loss": 1.8554, "rewards/accuracies": 0.0, "rewards/chosen": 1.547217607498169, "rewards/margins": -3.6524713039398193, "rewards/rejected": 5.199688911437988, "step": 6432 }, { "epoch": 1.04, "learning_rate": 7.409551480900902e-07, "logits/chosen": -0.6605798602104187, "logits/rejected": -0.6550816297531128, "logps/chosen": -45.7797737121582, "logps/rejected": -25.19337272644043, "loss": 0.5022, "rewards/accuracies": 0.0, "rewards/chosen": 0.1862361878156662, "rewards/margins": -0.2543317675590515, "rewards/rejected": 0.4405679702758789, "step": 6433 }, { "epoch": 1.04, "learning_rate": 7.408399821341786e-07, "logits/chosen": -0.8626803755760193, "logits/rejected": -0.9388912916183472, "logps/chosen": -77.26341247558594, "logps/rejected": -125.74340057373047, "loss": 2.1418, "rewards/accuracies": 0.0, "rewards/chosen": 1.4785140752792358, "rewards/margins": -2.834761619567871, "rewards/rejected": 4.3132758140563965, "step": 6434 }, { "epoch": 1.04, "learning_rate": 7.407247995385032e-07, "logits/chosen": -0.8540858030319214, "logits/rejected": -0.7611387968063354, "logps/chosen": -85.89033508300781, "logps/rejected": -46.97998046875, "loss": 0.3924, "rewards/accuracies": 0.0, "rewards/chosen": 1.2182044982910156, "rewards/margins": -0.007765650749206543, "rewards/rejected": 1.2259701490402222, "step": 6435 }, { "epoch": 1.04, "learning_rate": 7.406096003110219e-07, "logits/chosen": -0.9513811469078064, "logits/rejected": -1.0400416851043701, "logps/chosen": -224.2861328125, "logps/rejected": -88.75384521484375, "loss": 0.0935, "rewards/accuracies": 1.0, "rewards/chosen": 3.865588426589966, "rewards/margins": 2.6591339111328125, "rewards/rejected": 1.2064545154571533, "step": 6436 }, { "epoch": 1.04, "learning_rate": 7.404943844596938e-07, "logits/chosen": -1.193864107131958, "logits/rejected": -1.1944998502731323, "logps/chosen": -117.64335632324219, "logps/rejected": -90.42930603027344, "loss": 1.332, "rewards/accuracies": 0.0, "rewards/chosen": 0.9582862854003906, "rewards/margins": -0.8123992681503296, "rewards/rejected": 1.7706855535507202, "step": 6437 }, { "epoch": 1.04, "learning_rate": 7.403791519924793e-07, "logits/chosen": -0.352107971906662, "logits/rejected": -0.3621101677417755, "logps/chosen": -3.8817896842956543, "logps/rejected": -1.7769715785980225, "loss": 0.5563, "rewards/accuracies": 0.0, "rewards/chosen": 0.19626465439796448, "rewards/margins": -0.20306354761123657, "rewards/rejected": 0.39932820200920105, "step": 6438 }, { "epoch": 1.05, "learning_rate": 7.4026390291734e-07, "logits/chosen": -0.9496371746063232, "logits/rejected": -0.9217709302902222, "logps/chosen": -141.78057861328125, "logps/rejected": -56.13331604003906, "loss": 0.7117, "rewards/accuracies": 1.0, "rewards/chosen": 4.72210693359375, "rewards/margins": 4.674332618713379, "rewards/rejected": 0.04777412489056587, "step": 6439 }, { "epoch": 1.05, "learning_rate": 7.401486372422383e-07, "logits/chosen": -0.8648943305015564, "logits/rejected": -0.8648943305015564, "logps/chosen": -71.7044677734375, "logps/rejected": -71.7044677734375, "loss": 0.6145, "rewards/accuracies": 0.0, "rewards/chosen": 1.1983444690704346, "rewards/margins": 0.0, "rewards/rejected": 1.1983444690704346, "step": 6440 }, { "epoch": 1.05, "learning_rate": 7.400333549751381e-07, "logits/chosen": -0.8007624745368958, "logits/rejected": -0.643059253692627, "logps/chosen": -116.32183074951172, "logps/rejected": -118.24820709228516, "loss": 0.0406, "rewards/accuracies": 1.0, "rewards/chosen": 6.595316410064697, "rewards/margins": 5.944995880126953, "rewards/rejected": 0.6503204703330994, "step": 6441 }, { "epoch": 1.05, "learning_rate": 7.399180561240044e-07, "logits/chosen": -0.5105182528495789, "logits/rejected": -0.4636026918888092, "logps/chosen": -175.158447265625, "logps/rejected": -75.01995086669922, "loss": 0.3923, "rewards/accuracies": 1.0, "rewards/chosen": 3.9574615955352783, "rewards/margins": 2.1976051330566406, "rewards/rejected": 1.7598564624786377, "step": 6442 }, { "epoch": 1.05, "learning_rate": 7.39802740696803e-07, "logits/chosen": -0.993017315864563, "logits/rejected": -0.946460485458374, "logps/chosen": -36.81154251098633, "logps/rejected": -62.72400665283203, "loss": 0.4591, "rewards/accuracies": 1.0, "rewards/chosen": 3.363541841506958, "rewards/margins": 1.3868656158447266, "rewards/rejected": 1.9766762256622314, "step": 6443 }, { "epoch": 1.05, "learning_rate": 7.396874087015013e-07, "logits/chosen": -0.37849161028862, "logits/rejected": -0.37849161028862, "logps/chosen": -74.32640838623047, "logps/rejected": -74.32640838623047, "loss": 0.3778, "rewards/accuracies": 0.0, "rewards/chosen": 1.978951334953308, "rewards/margins": 0.0, "rewards/rejected": 1.978951334953308, "step": 6444 }, { "epoch": 1.05, "learning_rate": 7.395720601460675e-07, "logits/chosen": -0.7924202084541321, "logits/rejected": -0.7545422315597534, "logps/chosen": -58.989994049072266, "logps/rejected": -10.09228801727295, "loss": 0.5385, "rewards/accuracies": 1.0, "rewards/chosen": 1.4918025732040405, "rewards/margins": 0.7591354846954346, "rewards/rejected": 0.732667088508606, "step": 6445 }, { "epoch": 1.05, "learning_rate": 7.394566950384714e-07, "logits/chosen": -0.6350897550582886, "logits/rejected": -0.6636298298835754, "logps/chosen": -53.949920654296875, "logps/rejected": -67.99644470214844, "loss": 0.5963, "rewards/accuracies": 1.0, "rewards/chosen": 1.8203766345977783, "rewards/margins": 0.03944551944732666, "rewards/rejected": 1.7809311151504517, "step": 6446 }, { "epoch": 1.05, "learning_rate": 7.393413133866832e-07, "logits/chosen": -1.0010509490966797, "logits/rejected": -0.8901661038398743, "logps/chosen": -148.0744171142578, "logps/rejected": -298.0865783691406, "loss": 1.0371, "rewards/accuracies": 0.0, "rewards/chosen": 1.3784774541854858, "rewards/margins": -0.8541673421859741, "rewards/rejected": 2.23264479637146, "step": 6447 }, { "epoch": 1.05, "learning_rate": 7.392259151986751e-07, "logits/chosen": -1.0381217002868652, "logits/rejected": -1.0559321641921997, "logps/chosen": -114.87342834472656, "logps/rejected": -138.27281188964844, "loss": 1.3842, "rewards/accuracies": 0.0, "rewards/chosen": 3.5167603492736816, "rewards/margins": -2.6186141967773438, "rewards/rejected": 6.135374546051025, "step": 6448 }, { "epoch": 1.05, "learning_rate": 7.391105004824198e-07, "logits/chosen": -0.8327787518501282, "logits/rejected": -0.7339514493942261, "logps/chosen": -77.41522979736328, "logps/rejected": -40.4345817565918, "loss": 0.5753, "rewards/accuracies": 1.0, "rewards/chosen": 0.9871597290039062, "rewards/margins": 0.4576549530029297, "rewards/rejected": 0.5295047760009766, "step": 6449 }, { "epoch": 1.05, "learning_rate": 7.389950692458915e-07, "logits/chosen": -0.5104207396507263, "logits/rejected": -0.507297694683075, "logps/chosen": -84.52767944335938, "logps/rejected": -92.32893371582031, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9358474612236023, "rewards/margins": 0.33412015438079834, "rewards/rejected": 0.601727306842804, "step": 6450 }, { "epoch": 1.05, "learning_rate": 7.388796214970652e-07, "logits/chosen": -0.5792404413223267, "logits/rejected": -0.5792404413223267, "logps/chosen": -78.4502182006836, "logps/rejected": -78.4502182006836, "loss": 0.3708, "rewards/accuracies": 0.0, "rewards/chosen": 2.7642829418182373, "rewards/margins": 0.0, "rewards/rejected": 2.7642829418182373, "step": 6451 }, { "epoch": 1.05, "learning_rate": 7.387641572439173e-07, "logits/chosen": -1.047822117805481, "logits/rejected": -0.9713817834854126, "logps/chosen": -77.63343811035156, "logps/rejected": -119.05207824707031, "loss": 0.4537, "rewards/accuracies": 1.0, "rewards/chosen": 5.500695705413818, "rewards/margins": 0.5208449363708496, "rewards/rejected": 4.979850769042969, "step": 6452 }, { "epoch": 1.05, "learning_rate": 7.386486764944255e-07, "logits/chosen": -0.7676751017570496, "logits/rejected": -0.6486770510673523, "logps/chosen": -130.5786895751953, "logps/rejected": -55.311073303222656, "loss": 0.1879, "rewards/accuracies": 1.0, "rewards/chosen": 3.869307041168213, "rewards/margins": 2.058307647705078, "rewards/rejected": 1.8109992742538452, "step": 6453 }, { "epoch": 1.05, "learning_rate": 7.385331792565681e-07, "logits/chosen": -0.8670814037322998, "logits/rejected": -0.656850278377533, "logps/chosen": -101.13543701171875, "logps/rejected": -93.50302124023438, "loss": 0.4562, "rewards/accuracies": 0.0, "rewards/chosen": 4.0850372314453125, "rewards/margins": -0.35485410690307617, "rewards/rejected": 4.439891338348389, "step": 6454 }, { "epoch": 1.05, "learning_rate": 7.384176655383251e-07, "logits/chosen": -0.800695538520813, "logits/rejected": -0.5888293385505676, "logps/chosen": -138.9246063232422, "logps/rejected": -70.42962646484375, "loss": 0.3123, "rewards/accuracies": 1.0, "rewards/chosen": 4.194189548492432, "rewards/margins": 1.4113693237304688, "rewards/rejected": 2.782820224761963, "step": 6455 }, { "epoch": 1.05, "learning_rate": 7.383021353476774e-07, "logits/chosen": -0.6814080476760864, "logits/rejected": -0.582850456237793, "logps/chosen": -109.93856048583984, "logps/rejected": -96.56896209716797, "loss": 0.7994, "rewards/accuracies": 1.0, "rewards/chosen": 1.3573684692382812, "rewards/margins": 1.0324325561523438, "rewards/rejected": 0.3249359130859375, "step": 6456 }, { "epoch": 1.05, "learning_rate": 7.381865886926069e-07, "logits/chosen": -0.8293471336364746, "logits/rejected": -0.7495298981666565, "logps/chosen": -100.86326599121094, "logps/rejected": -129.89437866210938, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": 3.9266374111175537, "rewards/margins": 1.2897262573242188, "rewards/rejected": 2.636911153793335, "step": 6457 }, { "epoch": 1.05, "learning_rate": 7.380710255810969e-07, "logits/chosen": -0.7641194462776184, "logits/rejected": -0.6225060820579529, "logps/chosen": -193.1990966796875, "logps/rejected": -120.68287658691406, "loss": 1.6307, "rewards/accuracies": 0.0, "rewards/chosen": 0.6851135492324829, "rewards/margins": -1.8761612176895142, "rewards/rejected": 2.561274766921997, "step": 6458 }, { "epoch": 1.05, "learning_rate": 7.379554460211317e-07, "logits/chosen": -0.8874463438987732, "logits/rejected": -0.8419328331947327, "logps/chosen": -64.6134033203125, "logps/rejected": -88.09529876708984, "loss": 0.5259, "rewards/accuracies": 1.0, "rewards/chosen": 0.6735283136367798, "rewards/margins": 0.00503462553024292, "rewards/rejected": 0.6684936881065369, "step": 6459 }, { "epoch": 1.05, "learning_rate": 7.378398500206966e-07, "logits/chosen": -0.5829383730888367, "logits/rejected": -0.5671409964561462, "logps/chosen": -79.77556610107422, "logps/rejected": -79.5097885131836, "loss": 1.1066, "rewards/accuracies": 0.0, "rewards/chosen": 1.1864722967147827, "rewards/margins": -1.5230058431625366, "rewards/rejected": 2.7094781398773193, "step": 6460 }, { "epoch": 1.05, "learning_rate": 7.377242375877784e-07, "logits/chosen": -0.6922217011451721, "logits/rejected": -0.7030677795410156, "logps/chosen": -69.94207000732422, "logps/rejected": -95.890869140625, "loss": 0.9073, "rewards/accuracies": 0.0, "rewards/chosen": 0.8755211234092712, "rewards/margins": -0.5041610598564148, "rewards/rejected": 1.379682183265686, "step": 6461 }, { "epoch": 1.05, "learning_rate": 7.376086087303648e-07, "logits/chosen": -1.0668134689331055, "logits/rejected": -0.9068659543991089, "logps/chosen": -148.354736328125, "logps/rejected": -78.98109436035156, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 5.842332363128662, "rewards/margins": 3.23689866065979, "rewards/rejected": 2.605433702468872, "step": 6462 }, { "epoch": 1.05, "learning_rate": 7.374929634564445e-07, "logits/chosen": -1.0429563522338867, "logits/rejected": -0.9734360575675964, "logps/chosen": -93.60981750488281, "logps/rejected": -151.50784301757812, "loss": 2.4973, "rewards/accuracies": 0.0, "rewards/chosen": 2.0929481983184814, "rewards/margins": -4.750779151916504, "rewards/rejected": 6.843727111816406, "step": 6463 }, { "epoch": 1.05, "learning_rate": 7.373773017740076e-07, "logits/chosen": -0.5710268616676331, "logits/rejected": -0.4840269386768341, "logps/chosen": -25.549272537231445, "logps/rejected": -74.50293731689453, "loss": 0.4459, "rewards/accuracies": 0.0, "rewards/chosen": 1.4414983987808228, "rewards/margins": -0.3180391788482666, "rewards/rejected": 1.7595375776290894, "step": 6464 }, { "epoch": 1.05, "learning_rate": 7.372616236910455e-07, "logits/chosen": -0.5863907933235168, "logits/rejected": -0.44460415840148926, "logps/chosen": -70.59991455078125, "logps/rejected": -131.36036682128906, "loss": 0.9648, "rewards/accuracies": 1.0, "rewards/chosen": 2.2767257690429688, "rewards/margins": 0.13840317726135254, "rewards/rejected": 2.138322591781616, "step": 6465 }, { "epoch": 1.05, "learning_rate": 7.371459292155499e-07, "logits/chosen": -0.5450283885002136, "logits/rejected": -0.47950413823127747, "logps/chosen": -59.98786926269531, "logps/rejected": -77.11695861816406, "loss": 0.1393, "rewards/accuracies": 1.0, "rewards/chosen": 3.071920871734619, "rewards/margins": 1.456159234046936, "rewards/rejected": 1.615761637687683, "step": 6466 }, { "epoch": 1.05, "learning_rate": 7.370302183555147e-07, "logits/chosen": -0.8542928695678711, "logits/rejected": -0.5866112112998962, "logps/chosen": -117.44495391845703, "logps/rejected": -57.160247802734375, "loss": 0.8691, "rewards/accuracies": 1.0, "rewards/chosen": 5.384011268615723, "rewards/margins": 4.181228160858154, "rewards/rejected": 1.202783226966858, "step": 6467 }, { "epoch": 1.05, "learning_rate": 7.369144911189342e-07, "logits/chosen": -0.6240605115890503, "logits/rejected": -0.63086998462677, "logps/chosen": -15.789833068847656, "logps/rejected": -3.81854248046875, "loss": 0.7642, "rewards/accuracies": 0.0, "rewards/chosen": -0.03199739381670952, "rewards/margins": -0.29119348526000977, "rewards/rejected": 0.25919610261917114, "step": 6468 }, { "epoch": 1.05, "learning_rate": 7.367987475138039e-07, "logits/chosen": -0.43829652667045593, "logits/rejected": -0.43829652667045593, "logps/chosen": -4.366004943847656, "logps/rejected": -4.366004943847656, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.0518579483032227, "rewards/margins": 0.0, "rewards/rejected": 1.0518579483032227, "step": 6469 }, { "epoch": 1.05, "learning_rate": 7.366829875481208e-07, "logits/chosen": -0.31007835268974304, "logits/rejected": -0.2951585054397583, "logps/chosen": -51.66141891479492, "logps/rejected": -84.57090759277344, "loss": 0.8766, "rewards/accuracies": 0.0, "rewards/chosen": 2.0212597846984863, "rewards/margins": -0.6259920597076416, "rewards/rejected": 2.647251844406128, "step": 6470 }, { "epoch": 1.05, "learning_rate": 7.365672112298829e-07, "logits/chosen": -0.703626811504364, "logits/rejected": -0.6940615773200989, "logps/chosen": -46.379554748535156, "logps/rejected": -102.41722106933594, "loss": 0.8276, "rewards/accuracies": 0.0, "rewards/chosen": 1.1136788129806519, "rewards/margins": -0.38728559017181396, "rewards/rejected": 1.5009644031524658, "step": 6471 }, { "epoch": 1.05, "learning_rate": 7.364514185670889e-07, "logits/chosen": -0.6709877848625183, "logits/rejected": -0.7189534306526184, "logps/chosen": -62.82194519042969, "logps/rejected": -90.10997772216797, "loss": 0.8808, "rewards/accuracies": 0.0, "rewards/chosen": 0.9916549921035767, "rewards/margins": -1.2852288484573364, "rewards/rejected": 2.276883840560913, "step": 6472 }, { "epoch": 1.05, "learning_rate": 7.363356095677394e-07, "logits/chosen": -0.5999199151992798, "logits/rejected": -0.5682944655418396, "logps/chosen": -51.41636657714844, "logps/rejected": -54.04182052612305, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.7543724179267883, "rewards/margins": 0.220184326171875, "rewards/rejected": 0.5341880917549133, "step": 6473 }, { "epoch": 1.05, "learning_rate": 7.362197842398354e-07, "logits/chosen": -0.7124495506286621, "logits/rejected": -0.6995797157287598, "logps/chosen": -56.37574768066406, "logps/rejected": -33.24335861206055, "loss": 1.4667, "rewards/accuracies": 1.0, "rewards/chosen": 1.1017944812774658, "rewards/margins": 0.2142830491065979, "rewards/rejected": 0.8875114321708679, "step": 6474 }, { "epoch": 1.05, "learning_rate": 7.361039425913795e-07, "logits/chosen": -1.0591027736663818, "logits/rejected": -0.987494707107544, "logps/chosen": -216.56521606445312, "logps/rejected": -47.765987396240234, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 1.0131256580352783, "rewards/margins": 0.8609157800674438, "rewards/rejected": 0.15220986306667328, "step": 6475 }, { "epoch": 1.05, "learning_rate": 7.359880846303752e-07, "logits/chosen": -0.6517807841300964, "logits/rejected": -0.6517807841300964, "logps/chosen": -123.59271240234375, "logps/rejected": -123.59271240234375, "loss": 0.375, "rewards/accuracies": 0.0, "rewards/chosen": 2.4728362560272217, "rewards/margins": 0.0, "rewards/rejected": 2.4728362560272217, "step": 6476 }, { "epoch": 1.05, "learning_rate": 7.358722103648273e-07, "logits/chosen": -0.7819168567657471, "logits/rejected": -0.8739627003669739, "logps/chosen": -70.64295196533203, "logps/rejected": -117.88456726074219, "loss": 1.4808, "rewards/accuracies": 0.0, "rewards/chosen": 1.8217766284942627, "rewards/margins": -2.520153284072876, "rewards/rejected": 4.341929912567139, "step": 6477 }, { "epoch": 1.05, "learning_rate": 7.357563198027413e-07, "logits/chosen": -0.5415753722190857, "logits/rejected": -0.552222490310669, "logps/chosen": -95.66996002197266, "logps/rejected": -73.95779418945312, "loss": 0.9137, "rewards/accuracies": 0.0, "rewards/chosen": 0.8600212335586548, "rewards/margins": -0.7012641429901123, "rewards/rejected": 1.561285376548767, "step": 6478 }, { "epoch": 1.05, "learning_rate": 7.356404129521245e-07, "logits/chosen": -1.193030595779419, "logits/rejected": -1.165561556816101, "logps/chosen": -76.12881469726562, "logps/rejected": -52.99143981933594, "loss": 0.1396, "rewards/accuracies": 1.0, "rewards/chosen": 1.8021820783615112, "rewards/margins": 1.3520851135253906, "rewards/rejected": 0.45009690523147583, "step": 6479 }, { "epoch": 1.05, "learning_rate": 7.355244898209847e-07, "logits/chosen": -0.7260118126869202, "logits/rejected": -0.659290075302124, "logps/chosen": -77.27297973632812, "logps/rejected": -78.794921875, "loss": 1.1692, "rewards/accuracies": 0.0, "rewards/chosen": 1.981343150138855, "rewards/margins": -2.067938804626465, "rewards/rejected": 4.049282073974609, "step": 6480 }, { "epoch": 1.05, "learning_rate": 7.354085504173313e-07, "logits/chosen": -0.6510534882545471, "logits/rejected": -0.5674394369125366, "logps/chosen": -65.98698425292969, "logps/rejected": -19.53540802001953, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 2.7065742015838623, "rewards/margins": 2.380690097808838, "rewards/rejected": 0.325884073972702, "step": 6481 }, { "epoch": 1.05, "learning_rate": 7.352925947491745e-07, "logits/chosen": -0.41563287377357483, "logits/rejected": -0.3088993728160858, "logps/chosen": -107.94951629638672, "logps/rejected": -24.633869171142578, "loss": 0.386, "rewards/accuracies": 1.0, "rewards/chosen": 1.6881401538848877, "rewards/margins": 0.8989613056182861, "rewards/rejected": 0.7891788482666016, "step": 6482 }, { "epoch": 1.05, "learning_rate": 7.351766228245258e-07, "logits/chosen": -0.25923776626586914, "logits/rejected": -0.2975538671016693, "logps/chosen": -18.961978912353516, "logps/rejected": -37.79225540161133, "loss": 0.5082, "rewards/accuracies": 0.0, "rewards/chosen": 0.3353603482246399, "rewards/margins": -0.15644073486328125, "rewards/rejected": 0.49180108308792114, "step": 6483 }, { "epoch": 1.05, "learning_rate": 7.350606346513977e-07, "logits/chosen": -0.7017138600349426, "logits/rejected": -0.7381530404090881, "logps/chosen": -65.54752349853516, "logps/rejected": -82.08952331542969, "loss": 0.4872, "rewards/accuracies": 0.0, "rewards/chosen": 2.3807075023651123, "rewards/margins": -0.4841775894165039, "rewards/rejected": 2.864885091781616, "step": 6484 }, { "epoch": 1.05, "learning_rate": 7.349446302378038e-07, "logits/chosen": -0.5348632335662842, "logits/rejected": -0.43575018644332886, "logps/chosen": -96.98323059082031, "logps/rejected": -39.78753662109375, "loss": 0.2002, "rewards/accuracies": 1.0, "rewards/chosen": 1.3690766096115112, "rewards/margins": 0.978348970413208, "rewards/rejected": 0.39072760939598083, "step": 6485 }, { "epoch": 1.05, "learning_rate": 7.348286095917591e-07, "logits/chosen": -0.7670094966888428, "logits/rejected": -0.7283490300178528, "logps/chosen": -70.26837158203125, "logps/rejected": -85.99017333984375, "loss": 0.4539, "rewards/accuracies": 1.0, "rewards/chosen": 1.8366501331329346, "rewards/margins": 0.401336669921875, "rewards/rejected": 1.4353134632110596, "step": 6486 }, { "epoch": 1.05, "learning_rate": 7.347125727212795e-07, "logits/chosen": -0.26045799255371094, "logits/rejected": -0.3429087698459625, "logps/chosen": -126.28921508789062, "logps/rejected": -73.70452117919922, "loss": 0.77, "rewards/accuracies": 1.0, "rewards/chosen": 2.4986114501953125, "rewards/margins": 0.8122917413711548, "rewards/rejected": 1.6863197088241577, "step": 6487 }, { "epoch": 1.05, "learning_rate": 7.345965196343819e-07, "logits/chosen": -0.7170432806015015, "logits/rejected": -0.7568666338920593, "logps/chosen": -105.17317199707031, "logps/rejected": -113.03902435302734, "loss": 0.3198, "rewards/accuracies": 1.0, "rewards/chosen": 0.7670982480049133, "rewards/margins": 0.17673718929290771, "rewards/rejected": 0.5903610587120056, "step": 6488 }, { "epoch": 1.05, "learning_rate": 7.344804503390847e-07, "logits/chosen": -0.6746049523353577, "logits/rejected": -0.6060986518859863, "logps/chosen": -49.11922836303711, "logps/rejected": -76.13323974609375, "loss": 0.6555, "rewards/accuracies": 0.0, "rewards/chosen": 2.0600788593292236, "rewards/margins": -0.6403110027313232, "rewards/rejected": 2.700389862060547, "step": 6489 }, { "epoch": 1.05, "learning_rate": 7.343643648434069e-07, "logits/chosen": -0.7533543705940247, "logits/rejected": -0.7382606267929077, "logps/chosen": -61.119842529296875, "logps/rejected": -35.12894821166992, "loss": 0.3619, "rewards/accuracies": 1.0, "rewards/chosen": 2.01096510887146, "rewards/margins": 0.24133241176605225, "rewards/rejected": 1.7696326971054077, "step": 6490 }, { "epoch": 1.05, "learning_rate": 7.342482631553692e-07, "logits/chosen": -0.6711063385009766, "logits/rejected": -0.7080379724502563, "logps/chosen": -100.43281555175781, "logps/rejected": -82.52818298339844, "loss": 0.4524, "rewards/accuracies": 0.0, "rewards/chosen": 1.1507370471954346, "rewards/margins": -0.25474846363067627, "rewards/rejected": 1.4054855108261108, "step": 6491 }, { "epoch": 1.05, "learning_rate": 7.341321452829929e-07, "logits/chosen": -0.8277087807655334, "logits/rejected": -0.6876426935195923, "logps/chosen": -198.1942901611328, "logps/rejected": -175.2406463623047, "loss": 0.1514, "rewards/accuracies": 1.0, "rewards/chosen": 6.985780239105225, "rewards/margins": 1.0514540672302246, "rewards/rejected": 5.934326171875, "step": 6492 }, { "epoch": 1.05, "learning_rate": 7.340160112343007e-07, "logits/chosen": -0.5020303130149841, "logits/rejected": -0.42855215072631836, "logps/chosen": -234.41830444335938, "logps/rejected": -98.32333374023438, "loss": 1.1247, "rewards/accuracies": 1.0, "rewards/chosen": 3.329663038253784, "rewards/margins": 0.8296117782592773, "rewards/rejected": 2.500051259994507, "step": 6493 }, { "epoch": 1.05, "learning_rate": 7.338998610173165e-07, "logits/chosen": -0.8981060981750488, "logits/rejected": -0.6950375437736511, "logps/chosen": -138.97389221191406, "logps/rejected": -61.441009521484375, "loss": 0.0317, "rewards/accuracies": 1.0, "rewards/chosen": 4.990931987762451, "rewards/margins": 2.977215051651001, "rewards/rejected": 2.01371693611145, "step": 6494 }, { "epoch": 1.05, "learning_rate": 7.337836946400651e-07, "logits/chosen": -0.26227521896362305, "logits/rejected": -0.2343679815530777, "logps/chosen": -95.91545867919922, "logps/rejected": -47.131439208984375, "loss": 0.3956, "rewards/accuracies": 0.0, "rewards/chosen": 0.6601570248603821, "rewards/margins": -0.1601409912109375, "rewards/rejected": 0.8202980160713196, "step": 6495 }, { "epoch": 1.05, "learning_rate": 7.336675121105725e-07, "logits/chosen": -0.3152885138988495, "logits/rejected": -0.3152885138988495, "logps/chosen": -37.824317932128906, "logps/rejected": -37.824317932128906, "loss": 0.4202, "rewards/accuracies": 0.0, "rewards/chosen": -0.009044647216796875, "rewards/margins": 0.0, "rewards/rejected": -0.009044647216796875, "step": 6496 }, { "epoch": 1.05, "learning_rate": 7.335513134368654e-07, "logits/chosen": -0.7480767965316772, "logits/rejected": -0.29151374101638794, "logps/chosen": -110.9990234375, "logps/rejected": -61.30866241455078, "loss": 1.5631, "rewards/accuracies": 0.0, "rewards/chosen": 0.6849716305732727, "rewards/margins": -2.7625961303710938, "rewards/rejected": 3.4475677013397217, "step": 6497 }, { "epoch": 1.05, "learning_rate": 7.334350986269728e-07, "logits/chosen": -1.1295684576034546, "logits/rejected": -0.95955491065979, "logps/chosen": -151.68936157226562, "logps/rejected": -45.4481201171875, "loss": 0.0435, "rewards/accuracies": 1.0, "rewards/chosen": 4.155801296234131, "rewards/margins": 3.0305216312408447, "rewards/rejected": 1.1252796649932861, "step": 6498 }, { "epoch": 1.05, "learning_rate": 7.333188676889237e-07, "logits/chosen": -0.2756612002849579, "logits/rejected": -0.32004907727241516, "logps/chosen": -59.46924591064453, "logps/rejected": -60.76447677612305, "loss": 1.773, "rewards/accuracies": 0.0, "rewards/chosen": 0.8074378967285156, "rewards/margins": -1.5878078937530518, "rewards/rejected": 2.3952457904815674, "step": 6499 }, { "epoch": 1.06, "learning_rate": 7.332026206307485e-07, "logits/chosen": -0.839705228805542, "logits/rejected": -0.7963603734970093, "logps/chosen": -76.78997802734375, "logps/rejected": -39.422454833984375, "loss": 0.2425, "rewards/accuracies": 1.0, "rewards/chosen": 1.9563950300216675, "rewards/margins": 0.7618851661682129, "rewards/rejected": 1.1945098638534546, "step": 6500 }, { "epoch": 1.06, "learning_rate": 7.330863574604786e-07, "logits/chosen": -0.7128550410270691, "logits/rejected": -0.31352394819259644, "logps/chosen": -150.079345703125, "logps/rejected": -107.36146545410156, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 6.139570713043213, "rewards/margins": 3.3725221157073975, "rewards/rejected": 2.7670485973358154, "step": 6501 }, { "epoch": 1.06, "learning_rate": 7.329700781861471e-07, "logits/chosen": -0.7472676634788513, "logits/rejected": -0.7678095698356628, "logps/chosen": -95.33695983886719, "logps/rejected": -125.01649475097656, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 1.713008165359497, "rewards/margins": 0.09808957576751709, "rewards/rejected": 1.61491858959198, "step": 6502 }, { "epoch": 1.06, "learning_rate": 7.328537828157874e-07, "logits/chosen": -0.7958968877792358, "logits/rejected": -0.7002748250961304, "logps/chosen": -80.47911071777344, "logps/rejected": -77.61085510253906, "loss": 0.4501, "rewards/accuracies": 0.0, "rewards/chosen": 1.768225073814392, "rewards/margins": -0.2688194513320923, "rewards/rejected": 2.0370445251464844, "step": 6503 }, { "epoch": 1.06, "learning_rate": 7.327374713574348e-07, "logits/chosen": -0.43049156665802, "logits/rejected": -0.3275418281555176, "logps/chosen": -74.55386352539062, "logps/rejected": -108.33958435058594, "loss": 0.3613, "rewards/accuracies": 1.0, "rewards/chosen": 1.903845191001892, "rewards/margins": 0.10846328735351562, "rewards/rejected": 1.7953819036483765, "step": 6504 }, { "epoch": 1.06, "learning_rate": 7.32621143819125e-07, "logits/chosen": -0.5981273055076599, "logits/rejected": -0.5915554165840149, "logps/chosen": -49.19453048706055, "logps/rejected": -42.29524612426758, "loss": 0.9196, "rewards/accuracies": 1.0, "rewards/chosen": 1.565764307975769, "rewards/margins": 0.5862877368927002, "rewards/rejected": 0.9794765710830688, "step": 6505 }, { "epoch": 1.06, "learning_rate": 7.325048002088954e-07, "logits/chosen": -0.41758280992507935, "logits/rejected": -0.40212908387184143, "logps/chosen": -4.845988750457764, "logps/rejected": -31.7133731842041, "loss": 1.907, "rewards/accuracies": 0.0, "rewards/chosen": 0.43208739161491394, "rewards/margins": -0.3289785087108612, "rewards/rejected": 0.7610659003257751, "step": 6506 }, { "epoch": 1.06, "learning_rate": 7.32388440534784e-07, "logits/chosen": -0.6935674548149109, "logits/rejected": -0.7781044840812683, "logps/chosen": -78.50254821777344, "logps/rejected": -95.32763671875, "loss": 1.0177, "rewards/accuracies": 0.0, "rewards/chosen": 1.3936203718185425, "rewards/margins": -1.509381890296936, "rewards/rejected": 2.9030022621154785, "step": 6507 }, { "epoch": 1.06, "learning_rate": 7.322720648048302e-07, "logits/chosen": -0.5278390049934387, "logits/rejected": -0.48601484298706055, "logps/chosen": -61.959999084472656, "logps/rejected": -77.75049591064453, "loss": 0.8496, "rewards/accuracies": 0.0, "rewards/chosen": 2.4877045154571533, "rewards/margins": -0.8247718811035156, "rewards/rejected": 3.312476396560669, "step": 6508 }, { "epoch": 1.06, "learning_rate": 7.321556730270743e-07, "logits/chosen": -0.7462387681007385, "logits/rejected": -0.6448383927345276, "logps/chosen": -50.45873260498047, "logps/rejected": -156.33615112304688, "loss": 0.755, "rewards/accuracies": 0.0, "rewards/chosen": 3.9130141735076904, "rewards/margins": -1.2417223453521729, "rewards/rejected": 5.154736518859863, "step": 6509 }, { "epoch": 1.06, "learning_rate": 7.320392652095583e-07, "logits/chosen": -0.3451041281223297, "logits/rejected": -0.38617077469825745, "logps/chosen": -59.517337799072266, "logps/rejected": -40.492218017578125, "loss": 0.7036, "rewards/accuracies": 0.0, "rewards/chosen": 2.01263165473938, "rewards/margins": -0.21136021614074707, "rewards/rejected": 2.223991870880127, "step": 6510 }, { "epoch": 1.06, "learning_rate": 7.319228413603246e-07, "logits/chosen": -0.5488056540489197, "logits/rejected": -0.5424309372901917, "logps/chosen": -97.25965118408203, "logps/rejected": -45.150630950927734, "loss": 1.1544, "rewards/accuracies": 0.0, "rewards/chosen": 0.7841522097587585, "rewards/margins": -0.9966278672218323, "rewards/rejected": 1.7807800769805908, "step": 6511 }, { "epoch": 1.06, "learning_rate": 7.318064014874171e-07, "logits/chosen": -0.6957798004150391, "logits/rejected": -0.6816864013671875, "logps/chosen": -79.31336212158203, "logps/rejected": -46.967445373535156, "loss": 0.9159, "rewards/accuracies": 0.0, "rewards/chosen": 0.9602424502372742, "rewards/margins": -1.186051845550537, "rewards/rejected": 2.146294355392456, "step": 6512 }, { "epoch": 1.06, "learning_rate": 7.316899455988805e-07, "logits/chosen": -0.4826313853263855, "logits/rejected": -0.4376738965511322, "logps/chosen": -47.91106414794922, "logps/rejected": -77.2841567993164, "loss": 1.1777, "rewards/accuracies": 0.0, "rewards/chosen": 1.784217119216919, "rewards/margins": -0.18405067920684814, "rewards/rejected": 1.968267798423767, "step": 6513 }, { "epoch": 1.06, "learning_rate": 7.315734737027611e-07, "logits/chosen": -1.1237117052078247, "logits/rejected": -1.0129427909851074, "logps/chosen": -105.49866485595703, "logps/rejected": -30.515701293945312, "loss": 0.2173, "rewards/accuracies": 1.0, "rewards/chosen": 1.6095787286758423, "rewards/margins": 1.5832782983779907, "rewards/rejected": 0.026300430297851562, "step": 6514 }, { "epoch": 1.06, "learning_rate": 7.314569858071056e-07, "logits/chosen": -1.0390732288360596, "logits/rejected": -1.0206875801086426, "logps/chosen": -48.060916900634766, "logps/rejected": -107.96610260009766, "loss": 0.4888, "rewards/accuracies": 1.0, "rewards/chosen": 1.044192910194397, "rewards/margins": 0.4113689661026001, "rewards/rejected": 0.6328239440917969, "step": 6515 }, { "epoch": 1.06, "learning_rate": 7.313404819199627e-07, "logits/chosen": -0.34565097093582153, "logits/rejected": -0.2807636559009552, "logps/chosen": -102.09928894042969, "logps/rejected": -54.37521743774414, "loss": 0.6491, "rewards/accuracies": 0.0, "rewards/chosen": 1.7728561162948608, "rewards/margins": -0.5740262269973755, "rewards/rejected": 2.3468823432922363, "step": 6516 }, { "epoch": 1.06, "learning_rate": 7.312239620493814e-07, "logits/chosen": -0.7020911574363708, "logits/rejected": -0.6303253769874573, "logps/chosen": -109.4949722290039, "logps/rejected": -51.53212356567383, "loss": 0.636, "rewards/accuracies": 0.0, "rewards/chosen": 0.5594795346260071, "rewards/margins": -0.4304126501083374, "rewards/rejected": 0.9898921847343445, "step": 6517 }, { "epoch": 1.06, "learning_rate": 7.31107426203412e-07, "logits/chosen": -0.8147445321083069, "logits/rejected": -0.8106703758239746, "logps/chosen": -50.4134635925293, "logps/rejected": -57.01673126220703, "loss": 1.3961, "rewards/accuracies": 0.0, "rewards/chosen": 0.6198566555976868, "rewards/margins": -1.3032405376434326, "rewards/rejected": 1.9230972528457642, "step": 6518 }, { "epoch": 1.06, "learning_rate": 7.309908743901064e-07, "logits/chosen": -0.7339059114456177, "logits/rejected": -0.7057641744613647, "logps/chosen": -69.70033264160156, "logps/rejected": -87.12373352050781, "loss": 0.3193, "rewards/accuracies": 1.0, "rewards/chosen": 2.704904317855835, "rewards/margins": 0.2534456253051758, "rewards/rejected": 2.451458692550659, "step": 6519 }, { "epoch": 1.06, "learning_rate": 7.30874306617517e-07, "logits/chosen": -0.7227686643600464, "logits/rejected": -0.804855465888977, "logps/chosen": -55.02619552612305, "logps/rejected": -36.481849670410156, "loss": 1.0954, "rewards/accuracies": 0.0, "rewards/chosen": 1.5221515893936157, "rewards/margins": -0.5414947271347046, "rewards/rejected": 2.0636463165283203, "step": 6520 }, { "epoch": 1.06, "learning_rate": 7.307577228936975e-07, "logits/chosen": -0.723518967628479, "logits/rejected": -0.7183077931404114, "logps/chosen": -83.41192626953125, "logps/rejected": -123.1972427368164, "loss": 0.8049, "rewards/accuracies": 1.0, "rewards/chosen": 1.5368026494979858, "rewards/margins": 0.6911460757255554, "rewards/rejected": 0.8456565737724304, "step": 6521 }, { "epoch": 1.06, "learning_rate": 7.306411232267029e-07, "logits/chosen": -0.4667913615703583, "logits/rejected": -0.5324379801750183, "logps/chosen": -107.8342514038086, "logps/rejected": -142.8531494140625, "loss": 1.7346, "rewards/accuracies": 0.0, "rewards/chosen": 1.4917305707931519, "rewards/margins": -3.326605796813965, "rewards/rejected": 4.818336486816406, "step": 6522 }, { "epoch": 1.06, "learning_rate": 7.30524507624589e-07, "logits/chosen": -0.6865977048873901, "logits/rejected": -0.6333329081535339, "logps/chosen": -118.21511840820312, "logps/rejected": -128.67662048339844, "loss": 1.9382, "rewards/accuracies": 0.0, "rewards/chosen": 2.7109436988830566, "rewards/margins": -2.563767910003662, "rewards/rejected": 5.274711608886719, "step": 6523 }, { "epoch": 1.06, "learning_rate": 7.304078760954128e-07, "logits/chosen": -0.3703250288963318, "logits/rejected": -0.34802430868148804, "logps/chosen": -48.907875061035156, "logps/rejected": -79.84163665771484, "loss": 0.5519, "rewards/accuracies": 0.0, "rewards/chosen": 1.725633978843689, "rewards/margins": -0.5702522993087769, "rewards/rejected": 2.295886278152466, "step": 6524 }, { "epoch": 1.06, "learning_rate": 7.302912286472325e-07, "logits/chosen": -0.7347689867019653, "logits/rejected": -0.7347689867019653, "logps/chosen": -41.61056137084961, "logps/rejected": -41.61056137084961, "loss": 0.4046, "rewards/accuracies": 0.0, "rewards/chosen": 1.0374943017959595, "rewards/margins": 0.0, "rewards/rejected": 1.0374943017959595, "step": 6525 }, { "epoch": 1.06, "learning_rate": 7.301745652881073e-07, "logits/chosen": -0.5323609113693237, "logits/rejected": -0.5313336849212646, "logps/chosen": -15.753578186035156, "logps/rejected": -17.46175765991211, "loss": 0.749, "rewards/accuracies": 0.0, "rewards/chosen": -0.059159185737371445, "rewards/margins": -0.1951281577348709, "rewards/rejected": 0.13596896827220917, "step": 6526 }, { "epoch": 1.06, "learning_rate": 7.300578860260976e-07, "logits/chosen": -0.5218423008918762, "logits/rejected": -0.5338560938835144, "logps/chosen": -129.49249267578125, "logps/rejected": -82.1480712890625, "loss": 0.6712, "rewards/accuracies": 0.0, "rewards/chosen": 0.6274628043174744, "rewards/margins": -0.9631499648094177, "rewards/rejected": 1.590612769126892, "step": 6527 }, { "epoch": 1.06, "learning_rate": 7.299411908692648e-07, "logits/chosen": -0.42487847805023193, "logits/rejected": -0.42487847805023193, "logps/chosen": -25.934452056884766, "logps/rejected": -25.934452056884766, "loss": 0.5885, "rewards/accuracies": 0.0, "rewards/chosen": 1.6415462493896484, "rewards/margins": 0.0, "rewards/rejected": 1.6415462493896484, "step": 6528 }, { "epoch": 1.06, "learning_rate": 7.298244798256717e-07, "logits/chosen": -0.5227342247962952, "logits/rejected": -0.31484347581863403, "logps/chosen": -123.13145446777344, "logps/rejected": -18.338016510009766, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 3.8091142177581787, "rewards/margins": 3.741788148880005, "rewards/rejected": 0.06732597202062607, "step": 6529 }, { "epoch": 1.06, "learning_rate": 7.297077529033812e-07, "logits/chosen": -0.853512704372406, "logits/rejected": -0.8764471411705017, "logps/chosen": -74.29261016845703, "logps/rejected": -62.780029296875, "loss": 0.442, "rewards/accuracies": 0.0, "rewards/chosen": 1.1846626996994019, "rewards/margins": -0.2386077642440796, "rewards/rejected": 1.4232704639434814, "step": 6530 }, { "epoch": 1.06, "learning_rate": 7.295910101104589e-07, "logits/chosen": -0.34126922488212585, "logits/rejected": -0.30406177043914795, "logps/chosen": -84.75688934326172, "logps/rejected": -102.1732406616211, "loss": 1.0276, "rewards/accuracies": 0.0, "rewards/chosen": 2.179851531982422, "rewards/margins": -1.915846347808838, "rewards/rejected": 4.09569787979126, "step": 6531 }, { "epoch": 1.06, "learning_rate": 7.2947425145497e-07, "logits/chosen": -0.7887658476829529, "logits/rejected": -0.6606199145317078, "logps/chosen": -132.6875762939453, "logps/rejected": -109.609619140625, "loss": 1.0991, "rewards/accuracies": 0.0, "rewards/chosen": 3.697474718093872, "rewards/margins": -2.050144910812378, "rewards/rejected": 5.74761962890625, "step": 6532 }, { "epoch": 1.06, "learning_rate": 7.293574769449817e-07, "logits/chosen": -0.7614182233810425, "logits/rejected": -0.6821452379226685, "logps/chosen": -116.14312744140625, "logps/rejected": -41.27766418457031, "loss": 1.2021, "rewards/accuracies": 1.0, "rewards/chosen": 1.592596411705017, "rewards/margins": 1.323665976524353, "rewards/rejected": 0.26893043518066406, "step": 6533 }, { "epoch": 1.06, "learning_rate": 7.292406865885618e-07, "logits/chosen": -0.5065605044364929, "logits/rejected": -0.4530894160270691, "logps/chosen": -48.4130859375, "logps/rejected": -65.56669616699219, "loss": 0.5989, "rewards/accuracies": 0.0, "rewards/chosen": 0.842472493648529, "rewards/margins": -0.6792903542518616, "rewards/rejected": 1.5217628479003906, "step": 6534 }, { "epoch": 1.06, "learning_rate": 7.291238803937798e-07, "logits/chosen": -0.5051459670066833, "logits/rejected": -0.5366350412368774, "logps/chosen": -115.13226318359375, "logps/rejected": -105.30702209472656, "loss": 1.9917, "rewards/accuracies": 0.0, "rewards/chosen": 2.2240676879882812, "rewards/margins": -2.948760986328125, "rewards/rejected": 5.172828674316406, "step": 6535 }, { "epoch": 1.06, "learning_rate": 7.290070583687056e-07, "logits/chosen": -0.8477160930633545, "logits/rejected": -0.771488606929779, "logps/chosen": -71.3748779296875, "logps/rejected": -17.9636287689209, "loss": 0.8481, "rewards/accuracies": 1.0, "rewards/chosen": 1.355004906654358, "rewards/margins": 1.099813461303711, "rewards/rejected": 0.2551914155483246, "step": 6536 }, { "epoch": 1.06, "learning_rate": 7.288902205214103e-07, "logits/chosen": -0.8364951610565186, "logits/rejected": -0.7592011094093323, "logps/chosen": -145.21168518066406, "logps/rejected": -93.52772521972656, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": 4.319786071777344, "rewards/margins": 2.1372947692871094, "rewards/rejected": 2.1824913024902344, "step": 6537 }, { "epoch": 1.06, "learning_rate": 7.287733668599668e-07, "logits/chosen": -0.6731693744659424, "logits/rejected": -0.7242827415466309, "logps/chosen": -49.756248474121094, "logps/rejected": -75.2732162475586, "loss": 0.5152, "rewards/accuracies": 0.0, "rewards/chosen": 1.9663231372833252, "rewards/margins": -0.5612196922302246, "rewards/rejected": 2.52754282951355, "step": 6538 }, { "epoch": 1.06, "learning_rate": 7.286564973924483e-07, "logits/chosen": -0.7166310548782349, "logits/rejected": -0.7961331009864807, "logps/chosen": -42.890132904052734, "logps/rejected": -163.66506958007812, "loss": 0.6204, "rewards/accuracies": 0.0, "rewards/chosen": 1.832540512084961, "rewards/margins": -0.15559124946594238, "rewards/rejected": 1.9881317615509033, "step": 6539 }, { "epoch": 1.06, "learning_rate": 7.285396121269293e-07, "logits/chosen": -0.7989761829376221, "logits/rejected": -0.6565044522285461, "logps/chosen": -121.63887023925781, "logps/rejected": -106.13986206054688, "loss": 0.4429, "rewards/accuracies": 0.0, "rewards/chosen": 1.9243866205215454, "rewards/margins": -0.3329223394393921, "rewards/rejected": 2.2573089599609375, "step": 6540 }, { "epoch": 1.06, "learning_rate": 7.284227110714856e-07, "logits/chosen": -0.8326372504234314, "logits/rejected": -0.77313631772995, "logps/chosen": -62.18701171875, "logps/rejected": -20.862884521484375, "loss": 0.6227, "rewards/accuracies": 1.0, "rewards/chosen": 2.003958225250244, "rewards/margins": 0.5130627155303955, "rewards/rejected": 1.4908955097198486, "step": 6541 }, { "epoch": 1.06, "learning_rate": 7.283057942341939e-07, "logits/chosen": -0.5153716206550598, "logits/rejected": -0.49707475304603577, "logps/chosen": -97.05101013183594, "logps/rejected": -42.56214141845703, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 1.5048736333847046, "rewards/margins": 0.046814680099487305, "rewards/rejected": 1.4580589532852173, "step": 6542 }, { "epoch": 1.06, "learning_rate": 7.281888616231322e-07, "logits/chosen": -1.1204833984375, "logits/rejected": -1.0289386510849, "logps/chosen": -102.30010223388672, "logps/rejected": -40.28851318359375, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 1.5213021039962769, "rewards/margins": 0.8696896433830261, "rewards/rejected": 0.6516124606132507, "step": 6543 }, { "epoch": 1.06, "learning_rate": 7.280719132463792e-07, "logits/chosen": -0.2838161885738373, "logits/rejected": -0.2838161885738373, "logps/chosen": -70.40054321289062, "logps/rejected": -70.40054321289062, "loss": 0.9148, "rewards/accuracies": 0.0, "rewards/chosen": 1.8168060779571533, "rewards/margins": 0.0, "rewards/rejected": 1.8168060779571533, "step": 6544 }, { "epoch": 1.06, "learning_rate": 7.279549491120149e-07, "logits/chosen": -0.6521942615509033, "logits/rejected": -0.5610432624816895, "logps/chosen": -65.50128936767578, "logps/rejected": -74.3756332397461, "loss": 0.3594, "rewards/accuracies": 0.0, "rewards/chosen": 1.5536705255508423, "rewards/margins": -0.00698089599609375, "rewards/rejected": 1.560651421546936, "step": 6545 }, { "epoch": 1.06, "learning_rate": 7.278379692281208e-07, "logits/chosen": -0.8834622502326965, "logits/rejected": -0.5862319469451904, "logps/chosen": -142.31689453125, "logps/rejected": -72.39688873291016, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 7.437631130218506, "rewards/margins": 5.272665023803711, "rewards/rejected": 2.164965867996216, "step": 6546 }, { "epoch": 1.06, "learning_rate": 7.277209736027787e-07, "logits/chosen": -0.44931453466415405, "logits/rejected": -0.44931453466415405, "logps/chosen": -112.03366088867188, "logps/rejected": -112.03366088867188, "loss": 0.3985, "rewards/accuracies": 0.0, "rewards/chosen": 3.0742766857147217, "rewards/margins": 0.0, "rewards/rejected": 3.0742766857147217, "step": 6547 }, { "epoch": 1.06, "learning_rate": 7.27603962244072e-07, "logits/chosen": -0.5939471125602722, "logits/rejected": -0.5360646843910217, "logps/chosen": -53.39170837402344, "logps/rejected": -41.78333282470703, "loss": 0.5186, "rewards/accuracies": 1.0, "rewards/chosen": 1.6653320789337158, "rewards/margins": 0.781719982624054, "rewards/rejected": 0.8836120963096619, "step": 6548 }, { "epoch": 1.06, "learning_rate": 7.274869351600852e-07, "logits/chosen": -0.9917904138565063, "logits/rejected": -0.7790274024009705, "logps/chosen": -97.67613983154297, "logps/rejected": -19.080934524536133, "loss": 0.5211, "rewards/accuracies": 1.0, "rewards/chosen": 4.598205089569092, "rewards/margins": 4.170830249786377, "rewards/rejected": 0.42737483978271484, "step": 6549 }, { "epoch": 1.06, "learning_rate": 7.273698923589037e-07, "logits/chosen": -0.5555225610733032, "logits/rejected": -0.40197739005088806, "logps/chosen": -69.65866088867188, "logps/rejected": -64.78385162353516, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 1.9683440923690796, "rewards/margins": 0.29970693588256836, "rewards/rejected": 1.6686371564865112, "step": 6550 }, { "epoch": 1.06, "learning_rate": 7.27252833848614e-07, "logits/chosen": -0.47293299436569214, "logits/rejected": -0.3716086745262146, "logps/chosen": -92.29681396484375, "logps/rejected": -55.014442443847656, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 2.3186256885528564, "rewards/margins": 1.555046796798706, "rewards/rejected": 0.7635788321495056, "step": 6551 }, { "epoch": 1.06, "learning_rate": 7.271357596373038e-07, "logits/chosen": -0.4023769497871399, "logits/rejected": -0.20846417546272278, "logps/chosen": -55.558197021484375, "logps/rejected": -105.98197937011719, "loss": 0.54, "rewards/accuracies": 0.0, "rewards/chosen": 2.6392593383789062, "rewards/margins": -0.3743927478790283, "rewards/rejected": 3.0136520862579346, "step": 6552 }, { "epoch": 1.06, "learning_rate": 7.270186697330616e-07, "logits/chosen": -0.7051374912261963, "logits/rejected": -0.6863526105880737, "logps/chosen": -106.84235382080078, "logps/rejected": -46.74250030517578, "loss": 1.1534, "rewards/accuracies": 0.0, "rewards/chosen": 0.7364234924316406, "rewards/margins": -1.1596726179122925, "rewards/rejected": 1.896096110343933, "step": 6553 }, { "epoch": 1.06, "learning_rate": 7.269015641439776e-07, "logits/chosen": -0.6592106223106384, "logits/rejected": -0.5745404958724976, "logps/chosen": -52.68070983886719, "logps/rejected": -84.57779693603516, "loss": 0.7434, "rewards/accuracies": 0.0, "rewards/chosen": 1.8475807905197144, "rewards/margins": -0.6647728681564331, "rewards/rejected": 2.5123536586761475, "step": 6554 }, { "epoch": 1.06, "learning_rate": 7.267844428781424e-07, "logits/chosen": -0.6083440184593201, "logits/rejected": -0.5627481937408447, "logps/chosen": -59.116703033447266, "logps/rejected": -75.98265838623047, "loss": 1.1528, "rewards/accuracies": 0.0, "rewards/chosen": 1.4465420246124268, "rewards/margins": -1.0143344402313232, "rewards/rejected": 2.46087646484375, "step": 6555 }, { "epoch": 1.06, "learning_rate": 7.266673059436481e-07, "logits/chosen": -0.602607250213623, "logits/rejected": -0.5996363759040833, "logps/chosen": -113.90013122558594, "logps/rejected": -99.32598114013672, "loss": 0.536, "rewards/accuracies": 0.0, "rewards/chosen": 1.4628677368164062, "rewards/margins": -0.29493260383605957, "rewards/rejected": 1.7578003406524658, "step": 6556 }, { "epoch": 1.06, "learning_rate": 7.265501533485878e-07, "logits/chosen": -0.2700771391391754, "logits/rejected": -0.2847932279109955, "logps/chosen": -3.565786361694336, "logps/rejected": -1.8532246351242065, "loss": 0.3827, "rewards/accuracies": 0.0, "rewards/chosen": 0.5003237128257751, "rewards/margins": -0.07109493017196655, "rewards/rejected": 0.5714186429977417, "step": 6557 }, { "epoch": 1.06, "learning_rate": 7.264329851010553e-07, "logits/chosen": -0.5206971168518066, "logits/rejected": -0.38853439688682556, "logps/chosen": -51.72719192504883, "logps/rejected": -89.3533935546875, "loss": 0.2862, "rewards/accuracies": 1.0, "rewards/chosen": 1.6854504346847534, "rewards/margins": 0.6088657379150391, "rewards/rejected": 1.0765846967697144, "step": 6558 }, { "epoch": 1.06, "learning_rate": 7.263158012091463e-07, "logits/chosen": -0.17120957374572754, "logits/rejected": -0.17120957374572754, "logps/chosen": -42.32445526123047, "logps/rejected": -42.32445526123047, "loss": 0.8146, "rewards/accuracies": 0.0, "rewards/chosen": 0.9337791800498962, "rewards/margins": 0.0, "rewards/rejected": 0.9337791800498962, "step": 6559 }, { "epoch": 1.06, "learning_rate": 7.261986016809567e-07, "logits/chosen": -0.617443859577179, "logits/rejected": -0.5996897220611572, "logps/chosen": -70.494384765625, "logps/rejected": -88.64373779296875, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 1.5514954328536987, "rewards/margins": -0.06201314926147461, "rewards/rejected": 1.6135085821151733, "step": 6560 }, { "epoch": 1.06, "learning_rate": 7.260813865245841e-07, "logits/chosen": -0.7234722375869751, "logits/rejected": -0.7234722375869751, "logps/chosen": -88.46786499023438, "logps/rejected": -88.46786499023438, "loss": 1.8864, "rewards/accuracies": 0.0, "rewards/chosen": 1.0169563293457031, "rewards/margins": 0.0, "rewards/rejected": 1.0169563293457031, "step": 6561 }, { "epoch": 1.07, "learning_rate": 7.259641557481268e-07, "logits/chosen": -0.5412355065345764, "logits/rejected": -0.13535119593143463, "logps/chosen": -125.41890716552734, "logps/rejected": -52.75370788574219, "loss": 0.2587, "rewards/accuracies": 1.0, "rewards/chosen": 3.099881887435913, "rewards/margins": 1.2258362770080566, "rewards/rejected": 1.8740456104278564, "step": 6562 }, { "epoch": 1.07, "learning_rate": 7.258469093596845e-07, "logits/chosen": -0.6403058171272278, "logits/rejected": -0.6476603746414185, "logps/chosen": -24.492801666259766, "logps/rejected": -24.164714813232422, "loss": 0.7169, "rewards/accuracies": 0.0, "rewards/chosen": 0.19103088974952698, "rewards/margins": -0.09283334016799927, "rewards/rejected": 0.28386422991752625, "step": 6563 }, { "epoch": 1.07, "learning_rate": 7.257296473673577e-07, "logits/chosen": -0.9983727931976318, "logits/rejected": -0.9150373339653015, "logps/chosen": -58.266231536865234, "logps/rejected": -23.882198333740234, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 1.1007442474365234, "rewards/margins": 0.7923791408538818, "rewards/rejected": 0.3083650767803192, "step": 6564 }, { "epoch": 1.07, "learning_rate": 7.256123697792483e-07, "logits/chosen": -0.4879536032676697, "logits/rejected": -0.4879536032676697, "logps/chosen": -22.75554084777832, "logps/rejected": -22.75554084777832, "loss": 0.5488, "rewards/accuracies": 0.0, "rewards/chosen": 0.9958265423774719, "rewards/margins": 0.0, "rewards/rejected": 0.9958265423774719, "step": 6565 }, { "epoch": 1.07, "learning_rate": 7.254950766034588e-07, "logits/chosen": -0.9612210988998413, "logits/rejected": -0.8783861398696899, "logps/chosen": -97.11640167236328, "logps/rejected": -146.68006896972656, "loss": 1.0141, "rewards/accuracies": 1.0, "rewards/chosen": 6.22548770904541, "rewards/margins": 0.20284795761108398, "rewards/rejected": 6.022639751434326, "step": 6566 }, { "epoch": 1.07, "learning_rate": 7.253777678480931e-07, "logits/chosen": -1.024419903755188, "logits/rejected": -0.9303612112998962, "logps/chosen": -112.3528060913086, "logps/rejected": -39.83873748779297, "loss": 1.5693, "rewards/accuracies": 1.0, "rewards/chosen": 1.3456703424453735, "rewards/margins": 1.3727009296417236, "rewards/rejected": -0.027030562981963158, "step": 6567 }, { "epoch": 1.07, "learning_rate": 7.252604435212563e-07, "logits/chosen": -0.6077345609664917, "logits/rejected": -0.6683399081230164, "logps/chosen": -100.46858215332031, "logps/rejected": -58.11373519897461, "loss": 2.548, "rewards/accuracies": 0.0, "rewards/chosen": 1.1398636102676392, "rewards/margins": -1.3706547021865845, "rewards/rejected": 2.5105183124542236, "step": 6568 }, { "epoch": 1.07, "learning_rate": 7.251431036310543e-07, "logits/chosen": -1.1006169319152832, "logits/rejected": -1.0653437376022339, "logps/chosen": -59.04652786254883, "logps/rejected": -84.57483673095703, "loss": 0.5045, "rewards/accuracies": 1.0, "rewards/chosen": 2.220435857772827, "rewards/margins": 0.8798291683197021, "rewards/rejected": 1.340606689453125, "step": 6569 }, { "epoch": 1.07, "learning_rate": 7.25025748185594e-07, "logits/chosen": -0.8815143704414368, "logits/rejected": -0.8058556914329529, "logps/chosen": -93.3243408203125, "logps/rejected": -73.75484466552734, "loss": 0.5362, "rewards/accuracies": 0.0, "rewards/chosen": 2.328852891921997, "rewards/margins": -0.11331701278686523, "rewards/rejected": 2.4421699047088623, "step": 6570 }, { "epoch": 1.07, "learning_rate": 7.249083771929838e-07, "logits/chosen": -0.08638698607683182, "logits/rejected": -0.15305352210998535, "logps/chosen": -107.49806213378906, "logps/rejected": -61.72932434082031, "loss": 0.9879, "rewards/accuracies": 0.0, "rewards/chosen": 0.7542701959609985, "rewards/margins": -0.8916648626327515, "rewards/rejected": 1.64593505859375, "step": 6571 }, { "epoch": 1.07, "learning_rate": 7.247909906613329e-07, "logits/chosen": -0.7160711884498596, "logits/rejected": -0.7290880084037781, "logps/chosen": -6.682714462280273, "logps/rejected": -46.31208419799805, "loss": 0.6882, "rewards/accuracies": 0.0, "rewards/chosen": 0.3066423535346985, "rewards/margins": -0.060514628887176514, "rewards/rejected": 0.367156982421875, "step": 6572 }, { "epoch": 1.07, "learning_rate": 7.246735885987514e-07, "logits/chosen": -1.032271385192871, "logits/rejected": -0.9335059523582458, "logps/chosen": -89.54319763183594, "logps/rejected": -87.61064147949219, "loss": 0.9653, "rewards/accuracies": 1.0, "rewards/chosen": 2.3410592079162598, "rewards/margins": 0.17892241477966309, "rewards/rejected": 2.1621367931365967, "step": 6573 }, { "epoch": 1.07, "learning_rate": 7.24556171013351e-07, "logits/chosen": -0.5101924538612366, "logits/rejected": -0.5247029662132263, "logps/chosen": -27.655675888061523, "logps/rejected": -5.615368843078613, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 1.0910543203353882, "rewards/margins": 0.718299925327301, "rewards/rejected": 0.37275439500808716, "step": 6574 }, { "epoch": 1.07, "learning_rate": 7.244387379132437e-07, "logits/chosen": -0.539910078048706, "logits/rejected": -0.4660506248474121, "logps/chosen": -78.80267333984375, "logps/rejected": -32.572296142578125, "loss": 0.7125, "rewards/accuracies": 1.0, "rewards/chosen": 0.9569748044013977, "rewards/margins": 0.1785537600517273, "rewards/rejected": 0.7784210443496704, "step": 6575 }, { "epoch": 1.07, "learning_rate": 7.243212893065435e-07, "logits/chosen": -0.29289838671684265, "logits/rejected": -0.29289838671684265, "logps/chosen": -57.530391693115234, "logps/rejected": -57.530391693115234, "loss": 0.8823, "rewards/accuracies": 0.0, "rewards/chosen": 0.613934338092804, "rewards/margins": 0.0, "rewards/rejected": 0.613934338092804, "step": 6576 }, { "epoch": 1.07, "learning_rate": 7.242038252013647e-07, "logits/chosen": -0.8282744288444519, "logits/rejected": -0.8691571950912476, "logps/chosen": -169.59591674804688, "logps/rejected": -148.18817138671875, "loss": 1.5279, "rewards/accuracies": 0.0, "rewards/chosen": 3.725546360015869, "rewards/margins": -2.9553070068359375, "rewards/rejected": 6.680853366851807, "step": 6577 }, { "epoch": 1.07, "learning_rate": 7.24086345605823e-07, "logits/chosen": -0.841731071472168, "logits/rejected": -0.7958574295043945, "logps/chosen": -47.83775329589844, "logps/rejected": -46.29060363769531, "loss": 0.7241, "rewards/accuracies": 1.0, "rewards/chosen": 2.0048828125, "rewards/margins": 0.45316803455352783, "rewards/rejected": 1.5517147779464722, "step": 6578 }, { "epoch": 1.07, "learning_rate": 7.23968850528035e-07, "logits/chosen": -0.5521093010902405, "logits/rejected": -0.467348575592041, "logps/chosen": -61.67908477783203, "logps/rejected": -57.977874755859375, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 2.4690330028533936, "rewards/margins": 0.6647391319274902, "rewards/rejected": 1.8042938709259033, "step": 6579 }, { "epoch": 1.07, "learning_rate": 7.238513399761188e-07, "logits/chosen": -0.8083311915397644, "logits/rejected": -0.775817334651947, "logps/chosen": -95.19076538085938, "logps/rejected": -9.917411804199219, "loss": 0.4558, "rewards/accuracies": 1.0, "rewards/chosen": 1.6035553216934204, "rewards/margins": 0.5788007974624634, "rewards/rejected": 1.024754524230957, "step": 6580 }, { "epoch": 1.07, "learning_rate": 7.237338139581931e-07, "logits/chosen": -0.5359248518943787, "logits/rejected": -0.5008548498153687, "logps/chosen": -54.61247253417969, "logps/rejected": -56.030189514160156, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 2.3909318447113037, "rewards/margins": 0.2972269058227539, "rewards/rejected": 2.09370493888855, "step": 6581 }, { "epoch": 1.07, "learning_rate": 7.236162724823778e-07, "logits/chosen": -0.5461958646774292, "logits/rejected": -0.5461958646774292, "logps/chosen": -75.70603942871094, "logps/rejected": -75.70603942871094, "loss": 1.9011, "rewards/accuracies": 0.0, "rewards/chosen": 2.2523255348205566, "rewards/margins": 0.0, "rewards/rejected": 2.2523255348205566, "step": 6582 }, { "epoch": 1.07, "learning_rate": 7.23498715556794e-07, "logits/chosen": -0.4741174280643463, "logits/rejected": -0.40657857060432434, "logps/chosen": -50.228126525878906, "logps/rejected": -55.358306884765625, "loss": 0.8268, "rewards/accuracies": 1.0, "rewards/chosen": 1.4510475397109985, "rewards/margins": 0.12089157104492188, "rewards/rejected": 1.3301559686660767, "step": 6583 }, { "epoch": 1.07, "learning_rate": 7.233811431895638e-07, "logits/chosen": -0.5042394995689392, "logits/rejected": -0.5567334294319153, "logps/chosen": -72.80665588378906, "logps/rejected": -124.74469757080078, "loss": 0.3917, "rewards/accuracies": 0.0, "rewards/chosen": 1.151892066001892, "rewards/margins": -0.16682672500610352, "rewards/rejected": 1.3187187910079956, "step": 6584 }, { "epoch": 1.07, "learning_rate": 7.2326355538881e-07, "logits/chosen": -0.7354928851127625, "logits/rejected": -0.7326534390449524, "logps/chosen": -1.7883810997009277, "logps/rejected": -8.71549129486084, "loss": 0.3585, "rewards/accuracies": 1.0, "rewards/chosen": 0.30312591791152954, "rewards/margins": 0.31504112482070923, "rewards/rejected": -0.011915206909179688, "step": 6585 }, { "epoch": 1.07, "learning_rate": 7.231459521626573e-07, "logits/chosen": -0.6814563274383545, "logits/rejected": -0.525306761264801, "logps/chosen": -130.64585876464844, "logps/rejected": -114.91888427734375, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 5.425602912902832, "rewards/margins": 2.5478594303131104, "rewards/rejected": 2.8777434825897217, "step": 6586 }, { "epoch": 1.07, "learning_rate": 7.230283335192306e-07, "logits/chosen": -1.042401671409607, "logits/rejected": -1.0088719129562378, "logps/chosen": -103.27900695800781, "logps/rejected": -100.75653076171875, "loss": 1.2016, "rewards/accuracies": 0.0, "rewards/chosen": 1.467034935951233, "rewards/margins": -0.8982192277908325, "rewards/rejected": 2.3652541637420654, "step": 6587 }, { "epoch": 1.07, "learning_rate": 7.229106994666562e-07, "logits/chosen": -0.617438554763794, "logits/rejected": -0.6371795535087585, "logps/chosen": -84.92230224609375, "logps/rejected": -97.58441162109375, "loss": 1.1594, "rewards/accuracies": 0.0, "rewards/chosen": 0.9208465814590454, "rewards/margins": -2.210784912109375, "rewards/rejected": 3.13163161277771, "step": 6588 }, { "epoch": 1.07, "learning_rate": 7.22793050013062e-07, "logits/chosen": -0.6837015748023987, "logits/rejected": -0.7867306470870972, "logps/chosen": -46.33812713623047, "logps/rejected": -51.04335021972656, "loss": 0.1955, "rewards/accuracies": 1.0, "rewards/chosen": 2.4205596446990967, "rewards/margins": 1.2107523679733276, "rewards/rejected": 1.209807276725769, "step": 6589 }, { "epoch": 1.07, "learning_rate": 7.22675385166576e-07, "logits/chosen": -0.13787341117858887, "logits/rejected": -0.15120340883731842, "logps/chosen": -13.422728538513184, "logps/rejected": -69.87147521972656, "loss": 0.9318, "rewards/accuracies": 1.0, "rewards/chosen": 0.16550417244434357, "rewards/margins": 0.12028703838586807, "rewards/rejected": 0.045217134058475494, "step": 6590 }, { "epoch": 1.07, "learning_rate": 7.225577049353278e-07, "logits/chosen": -0.7567947506904602, "logits/rejected": -0.6473474502563477, "logps/chosen": -89.94993591308594, "logps/rejected": -59.905601501464844, "loss": 0.9657, "rewards/accuracies": 1.0, "rewards/chosen": 5.246121406555176, "rewards/margins": 3.8174493312835693, "rewards/rejected": 1.4286720752716064, "step": 6591 }, { "epoch": 1.07, "learning_rate": 7.224400093274481e-07, "logits/chosen": -1.0150648355484009, "logits/rejected": -0.9239007234573364, "logps/chosen": -52.380104064941406, "logps/rejected": -9.101502418518066, "loss": 0.1906, "rewards/accuracies": 1.0, "rewards/chosen": 1.5514014959335327, "rewards/margins": 0.8662310242652893, "rewards/rejected": 0.6851704716682434, "step": 6592 }, { "epoch": 1.07, "learning_rate": 7.223222983510686e-07, "logits/chosen": -0.8564735054969788, "logits/rejected": -0.8638449907302856, "logps/chosen": -110.92330932617188, "logps/rejected": -87.20567321777344, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": 1.726582407951355, "rewards/margins": 0.25342559814453125, "rewards/rejected": 1.4731568098068237, "step": 6593 }, { "epoch": 1.07, "learning_rate": 7.22204572014322e-07, "logits/chosen": -0.590786337852478, "logits/rejected": -0.4738697409629822, "logps/chosen": -164.46595764160156, "logps/rejected": -107.74642181396484, "loss": 0.0326, "rewards/accuracies": 1.0, "rewards/chosen": 4.241673469543457, "rewards/margins": 2.866487979888916, "rewards/rejected": 1.3751853704452515, "step": 6594 }, { "epoch": 1.07, "learning_rate": 7.220868303253417e-07, "logits/chosen": -0.6952007412910461, "logits/rejected": -0.6419681310653687, "logps/chosen": -130.72169494628906, "logps/rejected": -92.22280883789062, "loss": 0.4219, "rewards/accuracies": 1.0, "rewards/chosen": 7.170904636383057, "rewards/margins": 3.154740810394287, "rewards/rejected": 4.0161638259887695, "step": 6595 }, { "epoch": 1.07, "learning_rate": 7.21969073292263e-07, "logits/chosen": -0.41982021927833557, "logits/rejected": -0.47454074025154114, "logps/chosen": -68.57843780517578, "logps/rejected": -59.30046081542969, "loss": 0.9808, "rewards/accuracies": 1.0, "rewards/chosen": 1.5297645330429077, "rewards/margins": 0.632134199142456, "rewards/rejected": 0.8976303339004517, "step": 6596 }, { "epoch": 1.07, "learning_rate": 7.218513009232215e-07, "logits/chosen": -0.44329938292503357, "logits/rejected": -0.45320555567741394, "logps/chosen": -65.50533294677734, "logps/rejected": -42.80335235595703, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": 0.8970786929130554, "rewards/margins": -1.1656107902526855, "rewards/rejected": 2.0626895427703857, "step": 6597 }, { "epoch": 1.07, "learning_rate": 7.217335132263544e-07, "logits/chosen": -0.651328980922699, "logits/rejected": -0.6098482608795166, "logps/chosen": -102.61524963378906, "logps/rejected": -79.61367797851562, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 1.058284044265747, "rewards/margins": 0.07333528995513916, "rewards/rejected": 0.9849487543106079, "step": 6598 }, { "epoch": 1.07, "learning_rate": 7.216157102097996e-07, "logits/chosen": -0.8763152956962585, "logits/rejected": -0.7445243000984192, "logps/chosen": -101.46487426757812, "logps/rejected": -75.04737854003906, "loss": 0.3917, "rewards/accuracies": 1.0, "rewards/chosen": 5.319346904754639, "rewards/margins": 2.727513313293457, "rewards/rejected": 2.5918335914611816, "step": 6599 }, { "epoch": 1.07, "learning_rate": 7.21497891881696e-07, "logits/chosen": -0.53006911277771, "logits/rejected": -0.5223798155784607, "logps/chosen": -64.75779724121094, "logps/rejected": -83.39865112304688, "loss": 1.0263, "rewards/accuracies": 0.0, "rewards/chosen": 2.200127363204956, "rewards/margins": -0.6384103298187256, "rewards/rejected": 2.8385376930236816, "step": 6600 }, { "epoch": 1.07, "learning_rate": 7.213800582501842e-07, "logits/chosen": -0.39717888832092285, "logits/rejected": -0.39717888832092285, "logps/chosen": -106.31275177001953, "logps/rejected": -106.31275177001953, "loss": 0.4176, "rewards/accuracies": 0.0, "rewards/chosen": 1.1176979541778564, "rewards/margins": 0.0, "rewards/rejected": 1.1176979541778564, "step": 6601 }, { "epoch": 1.07, "learning_rate": 7.212622093234049e-07, "logits/chosen": -0.6862137317657471, "logits/rejected": -0.6677574515342712, "logps/chosen": -201.33477783203125, "logps/rejected": -253.5270538330078, "loss": 1.9057, "rewards/accuracies": 0.0, "rewards/chosen": 3.1221039295196533, "rewards/margins": -3.209437608718872, "rewards/rejected": 6.331541538238525, "step": 6602 }, { "epoch": 1.07, "learning_rate": 7.211443451095006e-07, "logits/chosen": -0.8791714906692505, "logits/rejected": -0.9352879524230957, "logps/chosen": -305.34033203125, "logps/rejected": -61.847537994384766, "loss": 0.2575, "rewards/accuracies": 1.0, "rewards/chosen": 4.786706447601318, "rewards/margins": 2.495117425918579, "rewards/rejected": 2.2915890216827393, "step": 6603 }, { "epoch": 1.07, "learning_rate": 7.210264656166145e-07, "logits/chosen": -0.5221447944641113, "logits/rejected": -0.4870408773422241, "logps/chosen": -101.99105834960938, "logps/rejected": -54.260337829589844, "loss": 1.046, "rewards/accuracies": 0.0, "rewards/chosen": 0.4688209593296051, "rewards/margins": -0.3996017277240753, "rewards/rejected": 0.8684226870536804, "step": 6604 }, { "epoch": 1.07, "learning_rate": 7.20908570852891e-07, "logits/chosen": -0.6432712078094482, "logits/rejected": -0.7620067000389099, "logps/chosen": -227.10824584960938, "logps/rejected": -66.34772491455078, "loss": 0.4237, "rewards/accuracies": 0.0, "rewards/chosen": 3.3302247524261475, "rewards/margins": -0.2686431407928467, "rewards/rejected": 3.598867893218994, "step": 6605 }, { "epoch": 1.07, "learning_rate": 7.207906608264755e-07, "logits/chosen": -0.6337791085243225, "logits/rejected": -0.6899476051330566, "logps/chosen": -184.9752960205078, "logps/rejected": -91.085205078125, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 3.9610671997070312, "rewards/margins": 2.531285047531128, "rewards/rejected": 1.4297821521759033, "step": 6606 }, { "epoch": 1.07, "learning_rate": 7.206727355455146e-07, "logits/chosen": -0.9013943076133728, "logits/rejected": -0.8834732174873352, "logps/chosen": -72.58036804199219, "logps/rejected": -66.04776763916016, "loss": 0.7199, "rewards/accuracies": 0.0, "rewards/chosen": 1.0598663091659546, "rewards/margins": -0.5428093671798706, "rewards/rejected": 1.6026756763458252, "step": 6607 }, { "epoch": 1.07, "learning_rate": 7.205547950181556e-07, "logits/chosen": -1.0746077299118042, "logits/rejected": -1.0395407676696777, "logps/chosen": -93.32591247558594, "logps/rejected": -135.98876953125, "loss": 0.1298, "rewards/accuracies": 1.0, "rewards/chosen": 1.8396514654159546, "rewards/margins": 1.3186278343200684, "rewards/rejected": 0.5210235714912415, "step": 6608 }, { "epoch": 1.07, "learning_rate": 7.20436839252547e-07, "logits/chosen": -0.25020432472229004, "logits/rejected": -0.2604086995124817, "logps/chosen": -40.34390640258789, "logps/rejected": -99.3837890625, "loss": 0.6863, "rewards/accuracies": 0.0, "rewards/chosen": 1.0437053442001343, "rewards/margins": -1.0042186975479126, "rewards/rejected": 2.047924041748047, "step": 6609 }, { "epoch": 1.07, "learning_rate": 7.203188682568389e-07, "logits/chosen": -0.8550060391426086, "logits/rejected": -0.864559531211853, "logps/chosen": -112.16492462158203, "logps/rejected": -155.78570556640625, "loss": 3.3408, "rewards/accuracies": 0.0, "rewards/chosen": 2.745262861251831, "rewards/margins": -4.574489593505859, "rewards/rejected": 7.3197526931762695, "step": 6610 }, { "epoch": 1.07, "learning_rate": 7.202008820391817e-07, "logits/chosen": -0.4530135989189148, "logits/rejected": -0.4530135989189148, "logps/chosen": -24.4672794342041, "logps/rejected": -24.4672794342041, "loss": 0.4548, "rewards/accuracies": 0.0, "rewards/chosen": 1.6213759183883667, "rewards/margins": 0.0, "rewards/rejected": 1.6213759183883667, "step": 6611 }, { "epoch": 1.07, "learning_rate": 7.200828806077269e-07, "logits/chosen": -0.6262193322181702, "logits/rejected": -0.5372672080993652, "logps/chosen": -44.36717987060547, "logps/rejected": -50.297210693359375, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 1.9735832214355469, "rewards/margins": 0.008988142013549805, "rewards/rejected": 1.964595079421997, "step": 6612 }, { "epoch": 1.07, "learning_rate": 7.199648639706274e-07, "logits/chosen": -0.5422765612602234, "logits/rejected": -0.47363394498825073, "logps/chosen": -30.205322265625, "logps/rejected": -6.487580299377441, "loss": 0.5176, "rewards/accuracies": 1.0, "rewards/chosen": 0.7764236330986023, "rewards/margins": 0.37449273467063904, "rewards/rejected": 0.40193089842796326, "step": 6613 }, { "epoch": 1.07, "learning_rate": 7.198468321360374e-07, "logits/chosen": -0.6755656003952026, "logits/rejected": -0.5492138266563416, "logps/chosen": -86.11492919921875, "logps/rejected": -48.002342224121094, "loss": 0.376, "rewards/accuracies": 1.0, "rewards/chosen": 1.899287462234497, "rewards/margins": 0.059462785720825195, "rewards/rejected": 1.8398246765136719, "step": 6614 }, { "epoch": 1.07, "learning_rate": 7.197287851121115e-07, "logits/chosen": -0.5698235034942627, "logits/rejected": -0.5820146799087524, "logps/chosen": -60.978187561035156, "logps/rejected": -99.3065185546875, "loss": 2.6119, "rewards/accuracies": 0.0, "rewards/chosen": 1.8133842945098877, "rewards/margins": -1.4122428894042969, "rewards/rejected": 3.2256271839141846, "step": 6615 }, { "epoch": 1.07, "learning_rate": 7.196107229070054e-07, "logits/chosen": -0.6674918532371521, "logits/rejected": -0.7045173645019531, "logps/chosen": -73.14513397216797, "logps/rejected": -57.662254333496094, "loss": 0.3109, "rewards/accuracies": 1.0, "rewards/chosen": 2.298520803451538, "rewards/margins": 0.2298746109008789, "rewards/rejected": 2.068646192550659, "step": 6616 }, { "epoch": 1.07, "learning_rate": 7.194926455288764e-07, "logits/chosen": -0.7285623550415039, "logits/rejected": -0.7400394678115845, "logps/chosen": -73.02487182617188, "logps/rejected": -88.13400268554688, "loss": 0.782, "rewards/accuracies": 0.0, "rewards/chosen": 1.9223312139511108, "rewards/margins": -0.18545687198638916, "rewards/rejected": 2.1077880859375, "step": 6617 }, { "epoch": 1.07, "learning_rate": 7.193745529858826e-07, "logits/chosen": -0.7164049744606018, "logits/rejected": -0.645269513130188, "logps/chosen": -47.44673156738281, "logps/rejected": -32.97272872924805, "loss": 0.4453, "rewards/accuracies": 0.0, "rewards/chosen": 1.3971999883651733, "rewards/margins": -0.27014660835266113, "rewards/rejected": 1.6673465967178345, "step": 6618 }, { "epoch": 1.07, "learning_rate": 7.192564452861828e-07, "logits/chosen": -0.5892745852470398, "logits/rejected": -0.5892745852470398, "logps/chosen": -38.076141357421875, "logps/rejected": -38.076141357421875, "loss": 0.5161, "rewards/accuracies": 0.0, "rewards/chosen": 1.724310278892517, "rewards/margins": 0.0, "rewards/rejected": 1.724310278892517, "step": 6619 }, { "epoch": 1.07, "learning_rate": 7.191383224379372e-07, "logits/chosen": -0.587073028087616, "logits/rejected": -0.5809579491615295, "logps/chosen": -47.82491683959961, "logps/rejected": -63.48259735107422, "loss": 0.5201, "rewards/accuracies": 0.0, "rewards/chosen": 1.767067313194275, "rewards/margins": -0.532138466835022, "rewards/rejected": 2.299205780029297, "step": 6620 }, { "epoch": 1.07, "learning_rate": 7.190201844493072e-07, "logits/chosen": -0.4997484087944031, "logits/rejected": -0.47098687291145325, "logps/chosen": -107.48270416259766, "logps/rejected": -84.58798217773438, "loss": 1.0146, "rewards/accuracies": 0.0, "rewards/chosen": 0.7356902956962585, "rewards/margins": -1.530595302581787, "rewards/rejected": 2.2662856578826904, "step": 6621 }, { "epoch": 1.07, "learning_rate": 7.189020313284549e-07, "logits/chosen": -0.07826652377843857, "logits/rejected": -0.10648979991674423, "logps/chosen": -70.17855834960938, "logps/rejected": -105.07093048095703, "loss": 0.6774, "rewards/accuracies": 0.0, "rewards/chosen": 1.0629860162734985, "rewards/margins": -0.00470125675201416, "rewards/rejected": 1.0676872730255127, "step": 6622 }, { "epoch": 1.07, "learning_rate": 7.187838630835432e-07, "logits/chosen": -1.0135163068771362, "logits/rejected": -0.9005641937255859, "logps/chosen": -372.208740234375, "logps/rejected": -154.29025268554688, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 4.7239990234375, "rewards/margins": 2.4695708751678467, "rewards/rejected": 2.2544281482696533, "step": 6623 }, { "epoch": 1.08, "learning_rate": 7.186656797227369e-07, "logits/chosen": -0.41202834248542786, "logits/rejected": -0.4050826132297516, "logps/chosen": -3.6158504486083984, "logps/rejected": -5.703309535980225, "loss": 0.7729, "rewards/accuracies": 0.0, "rewards/chosen": 0.03218865394592285, "rewards/margins": -0.19785527884960175, "rewards/rejected": 0.2300439327955246, "step": 6624 }, { "epoch": 1.08, "learning_rate": 7.185474812542012e-07, "logits/chosen": -0.7903797030448914, "logits/rejected": -0.7903797030448914, "logps/chosen": -70.10078430175781, "logps/rejected": -70.10078430175781, "loss": 0.4515, "rewards/accuracies": 0.0, "rewards/chosen": 1.9226669073104858, "rewards/margins": 0.0, "rewards/rejected": 1.9226669073104858, "step": 6625 }, { "epoch": 1.08, "learning_rate": 7.184292676861023e-07, "logits/chosen": -0.5654151439666748, "logits/rejected": -0.5480116605758667, "logps/chosen": -67.16972351074219, "logps/rejected": -99.5863265991211, "loss": 0.4256, "rewards/accuracies": 1.0, "rewards/chosen": 1.8019180297851562, "rewards/margins": 0.5901466608047485, "rewards/rejected": 1.2117713689804077, "step": 6626 }, { "epoch": 1.08, "learning_rate": 7.183110390266079e-07, "logits/chosen": -0.6073750257492065, "logits/rejected": -0.6011239290237427, "logps/chosen": -72.77523803710938, "logps/rejected": -86.38811492919922, "loss": 1.7403, "rewards/accuracies": 0.0, "rewards/chosen": 1.6580628156661987, "rewards/margins": -2.522075653076172, "rewards/rejected": 4.18013858795166, "step": 6627 }, { "epoch": 1.08, "learning_rate": 7.181927952838865e-07, "logits/chosen": -0.8890739679336548, "logits/rejected": -0.8890739679336548, "logps/chosen": -65.32901000976562, "logps/rejected": -65.32901000976562, "loss": 1.1821, "rewards/accuracies": 0.0, "rewards/chosen": 3.2019119262695312, "rewards/margins": 0.0, "rewards/rejected": 3.2019119262695312, "step": 6628 }, { "epoch": 1.08, "learning_rate": 7.180745364661074e-07, "logits/chosen": -0.6451499462127686, "logits/rejected": -0.6018298864364624, "logps/chosen": -139.82762145996094, "logps/rejected": -70.16100311279297, "loss": 0.0621, "rewards/accuracies": 1.0, "rewards/chosen": 4.408966064453125, "rewards/margins": 2.4512031078338623, "rewards/rejected": 1.9577629566192627, "step": 6629 }, { "epoch": 1.08, "learning_rate": 7.179562625814412e-07, "logits/chosen": -0.9034460783004761, "logits/rejected": -0.8614436984062195, "logps/chosen": -129.238037109375, "logps/rejected": -38.5047721862793, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 0.8079635500907898, "rewards/margins": 0.6285499334335327, "rewards/rejected": 0.1794136017560959, "step": 6630 }, { "epoch": 1.08, "learning_rate": 7.178379736380597e-07, "logits/chosen": -0.8568550944328308, "logits/rejected": -0.825494647026062, "logps/chosen": -136.73336791992188, "logps/rejected": -129.11859130859375, "loss": 1.804, "rewards/accuracies": 0.0, "rewards/chosen": 0.6244415640830994, "rewards/margins": -2.3629133701324463, "rewards/rejected": 2.9873549938201904, "step": 6631 }, { "epoch": 1.08, "learning_rate": 7.177196696441353e-07, "logits/chosen": -0.4953617751598358, "logits/rejected": -0.4675646126270294, "logps/chosen": -49.60511779785156, "logps/rejected": -18.918701171875, "loss": 0.4733, "rewards/accuracies": 1.0, "rewards/chosen": 1.7392524480819702, "rewards/margins": 1.0914329290390015, "rewards/rejected": 0.6478195190429688, "step": 6632 }, { "epoch": 1.08, "learning_rate": 7.176013506078419e-07, "logits/chosen": -1.244132399559021, "logits/rejected": -1.2737164497375488, "logps/chosen": -70.45796966552734, "logps/rejected": -56.73125457763672, "loss": 1.7592, "rewards/accuracies": 0.0, "rewards/chosen": 0.6070518493652344, "rewards/margins": -1.9862725734710693, "rewards/rejected": 2.5933244228363037, "step": 6633 }, { "epoch": 1.08, "learning_rate": 7.174830165373541e-07, "logits/chosen": -0.8274995684623718, "logits/rejected": -0.22286942601203918, "logps/chosen": -138.48687744140625, "logps/rejected": -125.07904052734375, "loss": 0.2875, "rewards/accuracies": 1.0, "rewards/chosen": 4.598249912261963, "rewards/margins": 0.3569350242614746, "rewards/rejected": 4.241314888000488, "step": 6634 }, { "epoch": 1.08, "learning_rate": 7.173646674408478e-07, "logits/chosen": -0.5941254496574402, "logits/rejected": -0.6888975501060486, "logps/chosen": -108.68255615234375, "logps/rejected": -98.01561737060547, "loss": 0.7971, "rewards/accuracies": 0.0, "rewards/chosen": 2.3011415004730225, "rewards/margins": -1.0229651927947998, "rewards/rejected": 3.3241066932678223, "step": 6635 }, { "epoch": 1.08, "learning_rate": 7.172463033264996e-07, "logits/chosen": -0.9850092530250549, "logits/rejected": -0.890117883682251, "logps/chosen": -103.71894836425781, "logps/rejected": -62.38313293457031, "loss": 1.0262, "rewards/accuracies": 1.0, "rewards/chosen": 1.012629747390747, "rewards/margins": 0.6734695434570312, "rewards/rejected": 0.33916017413139343, "step": 6636 }, { "epoch": 1.08, "learning_rate": 7.171279242024875e-07, "logits/chosen": -0.41485166549682617, "logits/rejected": -0.4454341530799866, "logps/chosen": -71.58261108398438, "logps/rejected": -104.66065979003906, "loss": 0.3687, "rewards/accuracies": 1.0, "rewards/chosen": 2.1555099487304688, "rewards/margins": 0.7993736267089844, "rewards/rejected": 1.3561363220214844, "step": 6637 }, { "epoch": 1.08, "learning_rate": 7.170095300769902e-07, "logits/chosen": -0.5670047998428345, "logits/rejected": -0.5695881247520447, "logps/chosen": -2.0792124271392822, "logps/rejected": -1.3890817165374756, "loss": 0.7208, "rewards/accuracies": 1.0, "rewards/chosen": 0.2820332944393158, "rewards/margins": 0.03641335666179657, "rewards/rejected": 0.24561993777751923, "step": 6638 }, { "epoch": 1.08, "learning_rate": 7.168911209581879e-07, "logits/chosen": -0.9982877969741821, "logits/rejected": -1.016230583190918, "logps/chosen": -224.1914825439453, "logps/rejected": -56.43944549560547, "loss": 0.9839, "rewards/accuracies": 1.0, "rewards/chosen": 4.459593296051025, "rewards/margins": 1.9264533519744873, "rewards/rejected": 2.533139944076538, "step": 6639 }, { "epoch": 1.08, "learning_rate": 7.167726968542612e-07, "logits/chosen": -0.6673306226730347, "logits/rejected": -0.6992648839950562, "logps/chosen": -67.70316314697266, "logps/rejected": -96.19099426269531, "loss": 1.1753, "rewards/accuracies": 0.0, "rewards/chosen": 1.9504951238632202, "rewards/margins": -1.6653069257736206, "rewards/rejected": 3.615802049636841, "step": 6640 }, { "epoch": 1.08, "learning_rate": 7.166542577733924e-07, "logits/chosen": -0.36305078864097595, "logits/rejected": -0.3540519177913666, "logps/chosen": -109.83487701416016, "logps/rejected": -46.90792465209961, "loss": 0.703, "rewards/accuracies": 1.0, "rewards/chosen": 1.618273138999939, "rewards/margins": 0.04457128047943115, "rewards/rejected": 1.5737018585205078, "step": 6641 }, { "epoch": 1.08, "learning_rate": 7.165358037237644e-07, "logits/chosen": -0.7811086773872375, "logits/rejected": -0.8059946298599243, "logps/chosen": -115.80201721191406, "logps/rejected": -86.53317260742188, "loss": 0.5329, "rewards/accuracies": 1.0, "rewards/chosen": 2.016400098800659, "rewards/margins": 0.11536550521850586, "rewards/rejected": 1.9010345935821533, "step": 6642 }, { "epoch": 1.08, "learning_rate": 7.164173347135611e-07, "logits/chosen": -0.6222512125968933, "logits/rejected": -0.6442713737487793, "logps/chosen": -75.56049346923828, "logps/rejected": -108.47303771972656, "loss": 0.4078, "rewards/accuracies": 0.0, "rewards/chosen": 0.8866691589355469, "rewards/margins": -0.18576431274414062, "rewards/rejected": 1.0724334716796875, "step": 6643 }, { "epoch": 1.08, "learning_rate": 7.16298850750968e-07, "logits/chosen": -0.6408817768096924, "logits/rejected": -0.6333848834037781, "logps/chosen": -64.4971923828125, "logps/rejected": -39.96112823486328, "loss": 0.5528, "rewards/accuracies": 1.0, "rewards/chosen": 1.8591499328613281, "rewards/margins": 0.14614641666412354, "rewards/rejected": 1.7130035161972046, "step": 6644 }, { "epoch": 1.08, "learning_rate": 7.161803518441707e-07, "logits/chosen": -0.34512385725975037, "logits/rejected": -0.22561115026474, "logps/chosen": -111.60455322265625, "logps/rejected": -69.853515625, "loss": 0.551, "rewards/accuracies": 1.0, "rewards/chosen": 2.2880494594573975, "rewards/margins": 0.005803108215332031, "rewards/rejected": 2.2822463512420654, "step": 6645 }, { "epoch": 1.08, "learning_rate": 7.160618380013567e-07, "logits/chosen": -0.1878342628479004, "logits/rejected": -0.1878342628479004, "logps/chosen": -19.38642692565918, "logps/rejected": -19.38642692565918, "loss": 2.4389, "rewards/accuracies": 0.0, "rewards/chosen": 0.11247577518224716, "rewards/margins": 0.0, "rewards/rejected": 0.11247577518224716, "step": 6646 }, { "epoch": 1.08, "learning_rate": 7.159433092307141e-07, "logits/chosen": -0.9470311999320984, "logits/rejected": -0.8649973273277283, "logps/chosen": -133.75888061523438, "logps/rejected": -49.430824279785156, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 4.878109931945801, "rewards/margins": 2.8373849391937256, "rewards/rejected": 2.040724992752075, "step": 6647 }, { "epoch": 1.08, "learning_rate": 7.15824765540432e-07, "logits/chosen": -0.7600170969963074, "logits/rejected": -0.6205185055732727, "logps/chosen": -312.2603454589844, "logps/rejected": -23.158037185668945, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 5.840936183929443, "rewards/margins": 5.4150824546813965, "rewards/rejected": 0.42585355043411255, "step": 6648 }, { "epoch": 1.08, "learning_rate": 7.157062069387009e-07, "logits/chosen": -0.7907006740570068, "logits/rejected": -0.6527064442634583, "logps/chosen": -136.77996826171875, "logps/rejected": -15.811012268066406, "loss": 0.2406, "rewards/accuracies": 1.0, "rewards/chosen": 4.231619358062744, "rewards/margins": 3.030057430267334, "rewards/rejected": 1.2015619277954102, "step": 6649 }, { "epoch": 1.08, "learning_rate": 7.155876334337119e-07, "logits/chosen": -0.6444718241691589, "logits/rejected": -0.4091920256614685, "logps/chosen": -44.203529357910156, "logps/rejected": -33.338680267333984, "loss": 0.7572, "rewards/accuracies": 1.0, "rewards/chosen": 2.365476369857788, "rewards/margins": 1.8912408351898193, "rewards/rejected": 0.47423553466796875, "step": 6650 }, { "epoch": 1.08, "learning_rate": 7.154690450336572e-07, "logits/chosen": -0.6942800879478455, "logits/rejected": -0.6698281764984131, "logps/chosen": -339.5302734375, "logps/rejected": -116.84959411621094, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 3.8505005836486816, "rewards/margins": -1.2733688354492188, "rewards/rejected": 5.1238694190979, "step": 6651 }, { "epoch": 1.08, "learning_rate": 7.153504417467305e-07, "logits/chosen": -0.5898998379707336, "logits/rejected": -0.5634817481040955, "logps/chosen": -77.73360443115234, "logps/rejected": -47.0760498046875, "loss": 1.1611, "rewards/accuracies": 0.0, "rewards/chosen": 1.649562120437622, "rewards/margins": -0.47678375244140625, "rewards/rejected": 2.1263458728790283, "step": 6652 }, { "epoch": 1.08, "learning_rate": 7.152318235811257e-07, "logits/chosen": -0.45723196864128113, "logits/rejected": -0.4522939622402191, "logps/chosen": -37.49052429199219, "logps/rejected": -25.926708221435547, "loss": 0.8781, "rewards/accuracies": 0.0, "rewards/chosen": 0.1678016632795334, "rewards/margins": -0.3810386657714844, "rewards/rejected": 0.548840343952179, "step": 6653 }, { "epoch": 1.08, "learning_rate": 7.151131905450385e-07, "logits/chosen": -0.5297833681106567, "logits/rejected": -0.4877350926399231, "logps/chosen": -109.06956481933594, "logps/rejected": -69.80557250976562, "loss": 0.8411, "rewards/accuracies": 0.0, "rewards/chosen": 1.5726028680801392, "rewards/margins": -1.0452171564102173, "rewards/rejected": 2.6178200244903564, "step": 6654 }, { "epoch": 1.08, "learning_rate": 7.149945426466653e-07, "logits/chosen": -0.3045519292354584, "logits/rejected": -0.3045519292354584, "logps/chosen": -37.19353485107422, "logps/rejected": -37.19353485107422, "loss": 0.7734, "rewards/accuracies": 0.0, "rewards/chosen": 2.017749071121216, "rewards/margins": 0.0, "rewards/rejected": 2.017749071121216, "step": 6655 }, { "epoch": 1.08, "learning_rate": 7.148758798942036e-07, "logits/chosen": -0.8602051138877869, "logits/rejected": -0.7680692076683044, "logps/chosen": -158.697265625, "logps/rejected": -118.998046875, "loss": 0.8498, "rewards/accuracies": 1.0, "rewards/chosen": 4.332322597503662, "rewards/margins": 1.881547451019287, "rewards/rejected": 2.450775146484375, "step": 6656 }, { "epoch": 1.08, "learning_rate": 7.147572022958517e-07, "logits/chosen": -0.8678199648857117, "logits/rejected": -0.8813139200210571, "logps/chosen": -108.7767333984375, "logps/rejected": -127.08403015136719, "loss": 2.7769, "rewards/accuracies": 0.0, "rewards/chosen": 1.4477676153182983, "rewards/margins": -5.366464138031006, "rewards/rejected": 6.814231872558594, "step": 6657 }, { "epoch": 1.08, "learning_rate": 7.146385098598091e-07, "logits/chosen": -0.6169163584709167, "logits/rejected": -0.6115991473197937, "logps/chosen": -63.28547668457031, "logps/rejected": -102.52651977539062, "loss": 1.5184, "rewards/accuracies": 0.0, "rewards/chosen": 0.5133048892021179, "rewards/margins": -1.7926032543182373, "rewards/rejected": 2.305908203125, "step": 6658 }, { "epoch": 1.08, "learning_rate": 7.145198025942765e-07, "logits/chosen": -0.7698298096656799, "logits/rejected": -0.6649380922317505, "logps/chosen": -110.83088684082031, "logps/rejected": -143.66891479492188, "loss": 0.2542, "rewards/accuracies": 1.0, "rewards/chosen": 2.3416154384613037, "rewards/margins": 0.4568803310394287, "rewards/rejected": 1.884735107421875, "step": 6659 }, { "epoch": 1.08, "learning_rate": 7.144010805074553e-07, "logits/chosen": -1.3527714014053345, "logits/rejected": -1.3232721090316772, "logps/chosen": -77.99897003173828, "logps/rejected": -69.471435546875, "loss": 0.5965, "rewards/accuracies": 1.0, "rewards/chosen": 1.7415138483047485, "rewards/margins": 0.07502520084381104, "rewards/rejected": 1.6664886474609375, "step": 6660 }, { "epoch": 1.08, "learning_rate": 7.142823436075481e-07, "logits/chosen": -0.6274048686027527, "logits/rejected": -0.6122438311576843, "logps/chosen": -49.185211181640625, "logps/rejected": -55.926414489746094, "loss": 0.6038, "rewards/accuracies": 1.0, "rewards/chosen": 1.9916366338729858, "rewards/margins": 0.07937848567962646, "rewards/rejected": 1.9122581481933594, "step": 6661 }, { "epoch": 1.08, "learning_rate": 7.141635919027585e-07, "logits/chosen": -0.8336988091468811, "logits/rejected": -0.7339341640472412, "logps/chosen": -166.3343963623047, "logps/rejected": -48.494415283203125, "loss": 0.2547, "rewards/accuracies": 1.0, "rewards/chosen": 5.996116638183594, "rewards/margins": 4.525698661804199, "rewards/rejected": 1.470417857170105, "step": 6662 }, { "epoch": 1.08, "learning_rate": 7.140448254012912e-07, "logits/chosen": -0.3341672718524933, "logits/rejected": -0.3345862925052643, "logps/chosen": -7.541654109954834, "logps/rejected": -12.650957107543945, "loss": 0.9124, "rewards/accuracies": 0.0, "rewards/chosen": 0.6223213076591492, "rewards/margins": -0.19783669710159302, "rewards/rejected": 0.8201580047607422, "step": 6663 }, { "epoch": 1.08, "learning_rate": 7.139260441113518e-07, "logits/chosen": -0.11771811544895172, "logits/rejected": -0.11771811544895172, "logps/chosen": -19.679824829101562, "logps/rejected": -19.679824829101562, "loss": 0.4126, "rewards/accuracies": 0.0, "rewards/chosen": 0.2851230800151825, "rewards/margins": 0.0, "rewards/rejected": 0.2851230800151825, "step": 6664 }, { "epoch": 1.08, "learning_rate": 7.138072480411469e-07, "logits/chosen": -0.3846990466117859, "logits/rejected": -0.36155614256858826, "logps/chosen": -60.954345703125, "logps/rejected": -59.985809326171875, "loss": 0.2398, "rewards/accuracies": 1.0, "rewards/chosen": 2.4736313819885254, "rewards/margins": 0.6392776966094971, "rewards/rejected": 1.8343536853790283, "step": 6665 }, { "epoch": 1.08, "learning_rate": 7.136884371988843e-07, "logits/chosen": -0.5925712585449219, "logits/rejected": -0.551588237285614, "logps/chosen": -77.0398941040039, "logps/rejected": -101.47285461425781, "loss": 0.3605, "rewards/accuracies": 0.0, "rewards/chosen": 1.1070502996444702, "rewards/margins": -0.044561028480529785, "rewards/rejected": 1.151611328125, "step": 6666 }, { "epoch": 1.08, "learning_rate": 7.135696115927725e-07, "logits/chosen": -0.56111741065979, "logits/rejected": -0.5501258373260498, "logps/chosen": -18.48119354248047, "logps/rejected": -10.968353271484375, "loss": 1.1018, "rewards/accuracies": 0.0, "rewards/chosen": 0.013340378180146217, "rewards/margins": -0.35688647627830505, "rewards/rejected": 0.3702268600463867, "step": 6667 }, { "epoch": 1.08, "learning_rate": 7.134507712310214e-07, "logits/chosen": -0.7329749464988708, "logits/rejected": -0.6973139643669128, "logps/chosen": -186.27383422851562, "logps/rejected": -48.23113250732422, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": 3.6360626220703125, "rewards/margins": 1.8302733898162842, "rewards/rejected": 1.8057892322540283, "step": 6668 }, { "epoch": 1.08, "learning_rate": 7.133319161218417e-07, "logits/chosen": -0.722258448600769, "logits/rejected": -0.47783219814300537, "logps/chosen": -150.2929229736328, "logps/rejected": -65.91343688964844, "loss": 0.0491, "rewards/accuracies": 1.0, "rewards/chosen": 5.454967021942139, "rewards/margins": 2.6842105388641357, "rewards/rejected": 2.770756483078003, "step": 6669 }, { "epoch": 1.08, "learning_rate": 7.132130462734451e-07, "logits/chosen": -0.4924101233482361, "logits/rejected": -0.4381675720214844, "logps/chosen": -65.01583862304688, "logps/rejected": -84.52792358398438, "loss": 0.9448, "rewards/accuracies": 0.0, "rewards/chosen": 2.115081787109375, "rewards/margins": -0.1687028408050537, "rewards/rejected": 2.2837846279144287, "step": 6670 }, { "epoch": 1.08, "learning_rate": 7.130941616940446e-07, "logits/chosen": -0.8709384202957153, "logits/rejected": -0.7869387269020081, "logps/chosen": -96.4724349975586, "logps/rejected": -31.318950653076172, "loss": 0.2258, "rewards/accuracies": 1.0, "rewards/chosen": 3.499540090560913, "rewards/margins": 3.41208815574646, "rewards/rejected": 0.08745193481445312, "step": 6671 }, { "epoch": 1.08, "learning_rate": 7.129752623918537e-07, "logits/chosen": -0.6787328124046326, "logits/rejected": -0.6114585995674133, "logps/chosen": -48.368431091308594, "logps/rejected": -29.940677642822266, "loss": 1.5026, "rewards/accuracies": 0.0, "rewards/chosen": 1.2900627851486206, "rewards/margins": -0.3855632543563843, "rewards/rejected": 1.6756260395050049, "step": 6672 }, { "epoch": 1.08, "learning_rate": 7.128563483750874e-07, "logits/chosen": -0.9386125206947327, "logits/rejected": -0.5881759524345398, "logps/chosen": -155.0286865234375, "logps/rejected": -34.98638153076172, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 5.269757270812988, "rewards/margins": 5.051975250244141, "rewards/rejected": 0.21778221428394318, "step": 6673 }, { "epoch": 1.08, "learning_rate": 7.127374196519615e-07, "logits/chosen": -0.7254100441932678, "logits/rejected": -0.688849151134491, "logps/chosen": -83.0062255859375, "logps/rejected": -68.88633728027344, "loss": 0.1014, "rewards/accuracies": 1.0, "rewards/chosen": 3.733172655105591, "rewards/margins": 3.0396454334259033, "rewards/rejected": 0.6935272216796875, "step": 6674 }, { "epoch": 1.08, "learning_rate": 7.126184762306928e-07, "logits/chosen": -0.7352065443992615, "logits/rejected": -0.68564373254776, "logps/chosen": -131.11866760253906, "logps/rejected": -34.37406539916992, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 1.9239975214004517, "rewards/margins": 1.7780522108078003, "rewards/rejected": 0.14594535529613495, "step": 6675 }, { "epoch": 1.08, "learning_rate": 7.124995181194993e-07, "logits/chosen": -0.5703887343406677, "logits/rejected": -0.559592604637146, "logps/chosen": -84.31120300292969, "logps/rejected": -92.32682037353516, "loss": 0.7947, "rewards/accuracies": 0.0, "rewards/chosen": 0.7971916198730469, "rewards/margins": -0.8016952276229858, "rewards/rejected": 1.5988868474960327, "step": 6676 }, { "epoch": 1.08, "learning_rate": 7.123805453265996e-07, "logits/chosen": -0.2669074535369873, "logits/rejected": -0.24423018097877502, "logps/chosen": -29.987957000732422, "logps/rejected": -1.5249695777893066, "loss": 0.5906, "rewards/accuracies": 0.0, "rewards/chosen": 0.2577476501464844, "rewards/margins": -0.29040753841400146, "rewards/rejected": 0.5481551885604858, "step": 6677 }, { "epoch": 1.08, "learning_rate": 7.12261557860214e-07, "logits/chosen": -0.43932899832725525, "logits/rejected": -0.44615471363067627, "logps/chosen": -5.990830898284912, "logps/rejected": -1.9586260318756104, "loss": 1.3731, "rewards/accuracies": 0.0, "rewards/chosen": 0.38055500388145447, "rewards/margins": -0.2682935297489166, "rewards/rejected": 0.6488485336303711, "step": 6678 }, { "epoch": 1.08, "learning_rate": 7.12142555728563e-07, "logits/chosen": -0.6471061706542969, "logits/rejected": -0.5846315622329712, "logps/chosen": -43.634674072265625, "logps/rejected": -56.94671630859375, "loss": 0.4133, "rewards/accuracies": 1.0, "rewards/chosen": 2.1008071899414062, "rewards/margins": 0.22430419921875, "rewards/rejected": 1.8765029907226562, "step": 6679 }, { "epoch": 1.08, "learning_rate": 7.120235389398688e-07, "logits/chosen": -0.312105655670166, "logits/rejected": -0.25279557704925537, "logps/chosen": -57.60142135620117, "logps/rejected": -35.56355285644531, "loss": 1.0993, "rewards/accuracies": 1.0, "rewards/chosen": 1.0869884490966797, "rewards/margins": 0.020396828651428223, "rewards/rejected": 1.0665916204452515, "step": 6680 }, { "epoch": 1.08, "learning_rate": 7.119045075023542e-07, "logits/chosen": -0.5667538642883301, "logits/rejected": -0.5590496063232422, "logps/chosen": -86.62931823730469, "logps/rejected": -103.7607421875, "loss": 0.4595, "rewards/accuracies": 1.0, "rewards/chosen": 1.7807327508926392, "rewards/margins": 1.8307304382324219, "rewards/rejected": -0.049997713416814804, "step": 6681 }, { "epoch": 1.08, "learning_rate": 7.117854614242433e-07, "logits/chosen": -0.6596084833145142, "logits/rejected": -0.5608294010162354, "logps/chosen": -74.56480407714844, "logps/rejected": -47.95248031616211, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 3.7235474586486816, "rewards/margins": 1.7812031507492065, "rewards/rejected": 1.942344307899475, "step": 6682 }, { "epoch": 1.08, "learning_rate": 7.11666400713761e-07, "logits/chosen": -0.9157896041870117, "logits/rejected": -0.9468256235122681, "logps/chosen": -134.32110595703125, "logps/rejected": -61.05574035644531, "loss": 0.7492, "rewards/accuracies": 0.0, "rewards/chosen": 1.2509613037109375, "rewards/margins": -0.8704872131347656, "rewards/rejected": 2.121448516845703, "step": 6683 }, { "epoch": 1.08, "learning_rate": 7.115473253791329e-07, "logits/chosen": -0.47952941060066223, "logits/rejected": -0.41995498538017273, "logps/chosen": -65.86099243164062, "logps/rejected": -13.837287902832031, "loss": 0.6523, "rewards/accuracies": 1.0, "rewards/chosen": 0.7581756711006165, "rewards/margins": 0.2750389277935028, "rewards/rejected": 0.48313674330711365, "step": 6684 }, { "epoch": 1.09, "learning_rate": 7.114282354285865e-07, "logits/chosen": -0.8557203412055969, "logits/rejected": -0.8691866993904114, "logps/chosen": -88.40467834472656, "logps/rejected": -95.42665100097656, "loss": 1.4323, "rewards/accuracies": 0.0, "rewards/chosen": 1.8742973804473877, "rewards/margins": -2.767446279525757, "rewards/rejected": 4.6417436599731445, "step": 6685 }, { "epoch": 1.09, "learning_rate": 7.113091308703497e-07, "logits/chosen": -0.5691826939582825, "logits/rejected": -0.5884762406349182, "logps/chosen": -83.23455047607422, "logps/rejected": -74.34059143066406, "loss": 0.9685, "rewards/accuracies": 0.0, "rewards/chosen": 0.8177467584609985, "rewards/margins": -0.18055039644241333, "rewards/rejected": 0.9982971549034119, "step": 6686 }, { "epoch": 1.09, "learning_rate": 7.111900117126512e-07, "logits/chosen": -0.6984518766403198, "logits/rejected": -0.15016254782676697, "logps/chosen": -97.74111938476562, "logps/rejected": -84.11299896240234, "loss": 0.9135, "rewards/accuracies": 0.0, "rewards/chosen": 1.2945022583007812, "rewards/margins": -0.728600263595581, "rewards/rejected": 2.0231025218963623, "step": 6687 }, { "epoch": 1.09, "learning_rate": 7.110708779637213e-07, "logits/chosen": -0.6305614709854126, "logits/rejected": -0.5792315006256104, "logps/chosen": -47.049407958984375, "logps/rejected": -15.067144393920898, "loss": 0.202, "rewards/accuracies": 1.0, "rewards/chosen": 1.672798991203308, "rewards/margins": 0.757205069065094, "rewards/rejected": 0.9155939221382141, "step": 6688 }, { "epoch": 1.09, "learning_rate": 7.109517296317908e-07, "logits/chosen": -0.19437934458255768, "logits/rejected": -0.19040444493293762, "logps/chosen": -0.8463765382766724, "logps/rejected": -10.66827392578125, "loss": 0.3536, "rewards/accuracies": 1.0, "rewards/chosen": 0.2708202302455902, "rewards/margins": 0.18825663626194, "rewards/rejected": 0.08256359398365021, "step": 6689 }, { "epoch": 1.09, "learning_rate": 7.10832566725092e-07, "logits/chosen": -0.535966157913208, "logits/rejected": -0.3415244221687317, "logps/chosen": -62.2917594909668, "logps/rejected": -14.622581481933594, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 2.9121358394622803, "rewards/margins": 2.3108413219451904, "rewards/rejected": 0.6012945175170898, "step": 6690 }, { "epoch": 1.09, "learning_rate": 7.107133892518576e-07, "logits/chosen": -0.5769896507263184, "logits/rejected": -0.5885458588600159, "logps/chosen": -108.7990951538086, "logps/rejected": -129.56365966796875, "loss": 0.8315, "rewards/accuracies": 0.0, "rewards/chosen": 0.6427032351493835, "rewards/margins": -0.4931030869483948, "rewards/rejected": 1.1358063220977783, "step": 6691 }, { "epoch": 1.09, "learning_rate": 7.105941972203219e-07, "logits/chosen": -0.6557695269584656, "logits/rejected": -0.5443201065063477, "logps/chosen": -79.15927124023438, "logps/rejected": -103.83897399902344, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 1.8670272827148438, "rewards/margins": -0.2818787097930908, "rewards/rejected": 2.1489059925079346, "step": 6692 }, { "epoch": 1.09, "learning_rate": 7.104749906387197e-07, "logits/chosen": -0.6215458512306213, "logits/rejected": -0.5395346283912659, "logps/chosen": -65.80814361572266, "logps/rejected": -71.1187515258789, "loss": 0.5497, "rewards/accuracies": 1.0, "rewards/chosen": 1.5454429388046265, "rewards/margins": 0.08800649642944336, "rewards/rejected": 1.457436442375183, "step": 6693 }, { "epoch": 1.09, "learning_rate": 7.103557695152873e-07, "logits/chosen": -0.4500219523906708, "logits/rejected": -0.5003314018249512, "logps/chosen": -122.5274658203125, "logps/rejected": -94.20602416992188, "loss": 1.6109, "rewards/accuracies": 0.0, "rewards/chosen": 3.30712890625, "rewards/margins": -3.092360019683838, "rewards/rejected": 6.399488925933838, "step": 6694 }, { "epoch": 1.09, "learning_rate": 7.102365338582616e-07, "logits/chosen": -0.6865018606185913, "logits/rejected": -0.762057363986969, "logps/chosen": -47.25912857055664, "logps/rejected": -114.69068908691406, "loss": 1.177, "rewards/accuracies": 0.0, "rewards/chosen": 2.439216375350952, "rewards/margins": -2.0773637294769287, "rewards/rejected": 4.516580104827881, "step": 6695 }, { "epoch": 1.09, "learning_rate": 7.101172836758806e-07, "logits/chosen": -0.3838174641132355, "logits/rejected": -0.4240119457244873, "logps/chosen": -68.62139892578125, "logps/rejected": -98.52847290039062, "loss": 1.0911, "rewards/accuracies": 0.0, "rewards/chosen": 1.6991875171661377, "rewards/margins": -1.2332985401153564, "rewards/rejected": 2.932486057281494, "step": 6696 }, { "epoch": 1.09, "learning_rate": 7.099980189763835e-07, "logits/chosen": -0.513133704662323, "logits/rejected": -0.5124709606170654, "logps/chosen": -3.760842800140381, "logps/rejected": -14.171382904052734, "loss": 0.3289, "rewards/accuracies": 1.0, "rewards/chosen": 0.11067700386047363, "rewards/margins": 0.1742202341556549, "rewards/rejected": -0.06354322284460068, "step": 6697 }, { "epoch": 1.09, "learning_rate": 7.098787397680104e-07, "logits/chosen": -0.6162392497062683, "logits/rejected": -0.6388434171676636, "logps/chosen": -114.0067367553711, "logps/rejected": -45.39594268798828, "loss": 0.6762, "rewards/accuracies": 0.0, "rewards/chosen": 0.4304184019565582, "rewards/margins": -1.0331512689590454, "rewards/rejected": 1.4635696411132812, "step": 6698 }, { "epoch": 1.09, "learning_rate": 7.097594460590022e-07, "logits/chosen": -0.7551752924919128, "logits/rejected": -0.7169799208641052, "logps/chosen": -173.4962158203125, "logps/rejected": -65.54597473144531, "loss": 2.2548, "rewards/accuracies": 1.0, "rewards/chosen": 3.6484405994415283, "rewards/margins": 2.090197801589966, "rewards/rejected": 1.5582427978515625, "step": 6699 }, { "epoch": 1.09, "learning_rate": 7.09640137857601e-07, "logits/chosen": -0.4687509834766388, "logits/rejected": -0.5368709564208984, "logps/chosen": -65.90830993652344, "logps/rejected": -72.5828857421875, "loss": 0.2317, "rewards/accuracies": 1.0, "rewards/chosen": 2.364905595779419, "rewards/margins": 0.7419105768203735, "rewards/rejected": 1.6229950189590454, "step": 6700 }, { "epoch": 1.09, "learning_rate": 7.0952081517205e-07, "logits/chosen": -0.7439290881156921, "logits/rejected": -0.6274296045303345, "logps/chosen": -105.25773620605469, "logps/rejected": -101.67681884765625, "loss": 0.8623, "rewards/accuracies": 0.0, "rewards/chosen": 2.657231092453003, "rewards/margins": -0.7660737037658691, "rewards/rejected": 3.423304796218872, "step": 6701 }, { "epoch": 1.09, "learning_rate": 7.09401478010593e-07, "logits/chosen": -0.6634738445281982, "logits/rejected": -0.6634738445281982, "logps/chosen": -65.88746643066406, "logps/rejected": -65.88746643066406, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 1.6741653680801392, "rewards/margins": 0.0, "rewards/rejected": 1.6741653680801392, "step": 6702 }, { "epoch": 1.09, "learning_rate": 7.092821263814755e-07, "logits/chosen": -0.2518463432788849, "logits/rejected": -0.2874915599822998, "logps/chosen": -55.551734924316406, "logps/rejected": -69.57817077636719, "loss": 2.6369, "rewards/accuracies": 0.0, "rewards/chosen": 1.5152580738067627, "rewards/margins": -0.6507225036621094, "rewards/rejected": 2.165980577468872, "step": 6703 }, { "epoch": 1.09, "learning_rate": 7.091627602929431e-07, "logits/chosen": -0.8074176907539368, "logits/rejected": -0.8843621611595154, "logps/chosen": -79.38046264648438, "logps/rejected": -138.974609375, "loss": 2.4444, "rewards/accuracies": 0.0, "rewards/chosen": 1.0738617181777954, "rewards/margins": -4.18043851852417, "rewards/rejected": 5.254300117492676, "step": 6704 }, { "epoch": 1.09, "learning_rate": 7.090433797532431e-07, "logits/chosen": -0.7073619961738586, "logits/rejected": -0.2503124475479126, "logps/chosen": -80.54829406738281, "logps/rejected": -104.1087417602539, "loss": 0.2396, "rewards/accuracies": 1.0, "rewards/chosen": 1.9105675220489502, "rewards/margins": 0.6384216547012329, "rewards/rejected": 1.2721458673477173, "step": 6705 }, { "epoch": 1.09, "learning_rate": 7.089239847706237e-07, "logits/chosen": -0.8109712600708008, "logits/rejected": -0.7579646110534668, "logps/chosen": -52.32708740234375, "logps/rejected": -47.771820068359375, "loss": 0.1324, "rewards/accuracies": 1.0, "rewards/chosen": 2.0202178955078125, "rewards/margins": 1.2247695922851562, "rewards/rejected": 0.7954483032226562, "step": 6706 }, { "epoch": 1.09, "learning_rate": 7.088045753533336e-07, "logits/chosen": -0.6169546246528625, "logits/rejected": -0.6062929034233093, "logps/chosen": -28.697635650634766, "logps/rejected": -23.25884437561035, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 0.6356129050254822, "rewards/margins": 0.2653789818286896, "rewards/rejected": 0.3702339231967926, "step": 6707 }, { "epoch": 1.09, "learning_rate": 7.086851515096233e-07, "logits/chosen": -0.4034964144229889, "logits/rejected": -0.44496023654937744, "logps/chosen": -7.787723541259766, "logps/rejected": -47.8287239074707, "loss": 1.4227, "rewards/accuracies": 0.0, "rewards/chosen": 0.3517664074897766, "rewards/margins": -0.4285837411880493, "rewards/rejected": 0.7803501486778259, "step": 6708 }, { "epoch": 1.09, "learning_rate": 7.085657132477434e-07, "logits/chosen": -0.7252238392829895, "logits/rejected": -0.7265580892562866, "logps/chosen": -41.2255859375, "logps/rejected": -29.78182029724121, "loss": 0.6407, "rewards/accuracies": 0.0, "rewards/chosen": 0.6348842978477478, "rewards/margins": -0.9531667828559875, "rewards/rejected": 1.5880510807037354, "step": 6709 }, { "epoch": 1.09, "learning_rate": 7.084462605759464e-07, "logits/chosen": -1.026923418045044, "logits/rejected": -1.0206406116485596, "logps/chosen": -71.22029876708984, "logps/rejected": -162.0014190673828, "loss": 1.6099, "rewards/accuracies": 0.0, "rewards/chosen": 2.3683221340179443, "rewards/margins": -3.152209997177124, "rewards/rejected": 5.520532131195068, "step": 6710 }, { "epoch": 1.09, "learning_rate": 7.08326793502485e-07, "logits/chosen": -0.7691153287887573, "logits/rejected": -0.738002359867096, "logps/chosen": -57.166175842285156, "logps/rejected": -35.95356750488281, "loss": 0.5086, "rewards/accuracies": 0.0, "rewards/chosen": 0.6761093139648438, "rewards/margins": -0.476894736289978, "rewards/rejected": 1.1530040502548218, "step": 6711 }, { "epoch": 1.09, "learning_rate": 7.082073120356134e-07, "logits/chosen": -0.6327793598175049, "logits/rejected": -0.4059959650039673, "logps/chosen": -254.823486328125, "logps/rejected": -25.298112869262695, "loss": 1.284, "rewards/accuracies": 1.0, "rewards/chosen": 3.5896759033203125, "rewards/margins": 3.385476589202881, "rewards/rejected": 0.20419941842556, "step": 6712 }, { "epoch": 1.09, "learning_rate": 7.080878161835866e-07, "logits/chosen": -0.5365161895751953, "logits/rejected": -0.5043103098869324, "logps/chosen": -103.51651000976562, "logps/rejected": -99.59736633300781, "loss": 0.6827, "rewards/accuracies": 0.0, "rewards/chosen": 0.5604271292686462, "rewards/margins": -0.1416938304901123, "rewards/rejected": 0.7021209597587585, "step": 6713 }, { "epoch": 1.09, "learning_rate": 7.079683059546606e-07, "logits/chosen": -0.1544552445411682, "logits/rejected": -0.38262006640434265, "logps/chosen": -89.59909057617188, "logps/rejected": -62.247127532958984, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 2.1942505836486816, "rewards/margins": 0.45063066482543945, "rewards/rejected": 1.7436199188232422, "step": 6714 }, { "epoch": 1.09, "learning_rate": 7.078487813570925e-07, "logits/chosen": -0.9662954211235046, "logits/rejected": -0.9061386585235596, "logps/chosen": -61.275306701660156, "logps/rejected": -21.065126419067383, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": 2.2532920837402344, "rewards/margins": 1.887474775314331, "rewards/rejected": 0.36581727862358093, "step": 6715 }, { "epoch": 1.09, "learning_rate": 7.077292423991403e-07, "logits/chosen": -0.6115900874137878, "logits/rejected": -0.5995717644691467, "logps/chosen": -47.191558837890625, "logps/rejected": -53.179443359375, "loss": 0.2377, "rewards/accuracies": 1.0, "rewards/chosen": 3.1098923683166504, "rewards/margins": 0.6359589099884033, "rewards/rejected": 2.473933458328247, "step": 6716 }, { "epoch": 1.09, "learning_rate": 7.076096890890631e-07, "logits/chosen": -0.27074915170669556, "logits/rejected": -0.25618302822113037, "logps/chosen": -72.11474609375, "logps/rejected": -59.785743713378906, "loss": 0.269, "rewards/accuracies": 1.0, "rewards/chosen": 4.067971229553223, "rewards/margins": 0.9361863136291504, "rewards/rejected": 3.1317849159240723, "step": 6717 }, { "epoch": 1.09, "learning_rate": 7.074901214351207e-07, "logits/chosen": -1.1876921653747559, "logits/rejected": -1.155324101448059, "logps/chosen": -129.51405334472656, "logps/rejected": -40.571861267089844, "loss": 0.9016, "rewards/accuracies": 1.0, "rewards/chosen": 1.5788345336914062, "rewards/margins": 1.450208306312561, "rewards/rejected": 0.1286262571811676, "step": 6718 }, { "epoch": 1.09, "learning_rate": 7.073705394455742e-07, "logits/chosen": -0.19530631601810455, "logits/rejected": -0.17629222571849823, "logps/chosen": -109.70387268066406, "logps/rejected": -128.27609252929688, "loss": 1.2192, "rewards/accuracies": 1.0, "rewards/chosen": 0.4741111695766449, "rewards/margins": 0.1934410035610199, "rewards/rejected": 0.280670166015625, "step": 6719 }, { "epoch": 1.09, "learning_rate": 7.072509431286857e-07, "logits/chosen": -0.4670206904411316, "logits/rejected": -0.42625921964645386, "logps/chosen": -202.83009338378906, "logps/rejected": -64.53388977050781, "loss": 0.1172, "rewards/accuracies": 1.0, "rewards/chosen": 3.998884677886963, "rewards/margins": 1.8531296253204346, "rewards/rejected": 2.1457550525665283, "step": 6720 }, { "epoch": 1.09, "learning_rate": 7.071313324927179e-07, "logits/chosen": -0.8404430150985718, "logits/rejected": -0.7555673122406006, "logps/chosen": -92.52555847167969, "logps/rejected": -77.82514953613281, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": 5.146977424621582, "rewards/margins": 2.8859009742736816, "rewards/rejected": 2.2610764503479004, "step": 6721 }, { "epoch": 1.09, "learning_rate": 7.07011707545935e-07, "logits/chosen": -0.6964544057846069, "logits/rejected": -0.7063484191894531, "logps/chosen": -265.9295959472656, "logps/rejected": -99.1115493774414, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": 5.513339519500732, "rewards/margins": 3.5891475677490234, "rewards/rejected": 1.9241920709609985, "step": 6722 }, { "epoch": 1.09, "learning_rate": 7.06892068296602e-07, "logits/chosen": -0.37475210428237915, "logits/rejected": -0.3320869207382202, "logps/chosen": -80.89891815185547, "logps/rejected": -109.13147735595703, "loss": 0.5665, "rewards/accuracies": 1.0, "rewards/chosen": 2.4616012573242188, "rewards/margins": 0.17500829696655273, "rewards/rejected": 2.286592960357666, "step": 6723 }, { "epoch": 1.09, "learning_rate": 7.067724147529846e-07, "logits/chosen": -1.1051344871520996, "logits/rejected": -1.108380675315857, "logps/chosen": -33.447608947753906, "logps/rejected": -151.0307159423828, "loss": 1.8429, "rewards/accuracies": 0.0, "rewards/chosen": 1.6979507207870483, "rewards/margins": -2.941162109375, "rewards/rejected": 4.639112949371338, "step": 6724 }, { "epoch": 1.09, "learning_rate": 7.066527469233496e-07, "logits/chosen": -0.9284958839416504, "logits/rejected": -0.7461722493171692, "logps/chosen": -91.36111450195312, "logps/rejected": -18.464502334594727, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 5.053821086883545, "rewards/margins": 4.7226057052612305, "rewards/rejected": 0.3312152922153473, "step": 6725 }, { "epoch": 1.09, "learning_rate": 7.065330648159655e-07, "logits/chosen": -0.9693500399589539, "logits/rejected": -0.9929289221763611, "logps/chosen": -41.34328842163086, "logps/rejected": -45.11952209472656, "loss": 0.471, "rewards/accuracies": 0.0, "rewards/chosen": 2.3861300945281982, "rewards/margins": -0.2886319160461426, "rewards/rejected": 2.674762010574341, "step": 6726 }, { "epoch": 1.09, "learning_rate": 7.064133684391007e-07, "logits/chosen": -0.15118205547332764, "logits/rejected": -0.14756517112255096, "logps/chosen": -2.839019298553467, "logps/rejected": -17.775657653808594, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 0.2611119747161865, "rewards/margins": 0.11054205894470215, "rewards/rejected": 0.15056991577148438, "step": 6727 }, { "epoch": 1.09, "learning_rate": 7.062936578010253e-07, "logits/chosen": -0.6051188707351685, "logits/rejected": -0.7528581023216248, "logps/chosen": -62.70704650878906, "logps/rejected": -99.4604263305664, "loss": 3.1287, "rewards/accuracies": 0.0, "rewards/chosen": 1.136163353919983, "rewards/margins": -4.207189559936523, "rewards/rejected": 5.343352794647217, "step": 6728 }, { "epoch": 1.09, "learning_rate": 7.061739329100101e-07, "logits/chosen": -0.7194264531135559, "logits/rejected": -0.7509520649909973, "logps/chosen": -87.4622802734375, "logps/rejected": -49.43168640136719, "loss": 0.5223, "rewards/accuracies": 0.0, "rewards/chosen": 1.3835029602050781, "rewards/margins": -0.5937889814376831, "rewards/rejected": 1.9772919416427612, "step": 6729 }, { "epoch": 1.09, "learning_rate": 7.060541937743269e-07, "logits/chosen": -0.639500617980957, "logits/rejected": -0.6337363719940186, "logps/chosen": -7.1205525398254395, "logps/rejected": -20.144071578979492, "loss": 0.5779, "rewards/accuracies": 1.0, "rewards/chosen": 0.12871623039245605, "rewards/margins": 0.3862832486629486, "rewards/rejected": -0.25756701827049255, "step": 6730 }, { "epoch": 1.09, "learning_rate": 7.059344404022487e-07, "logits/chosen": -0.8718906044960022, "logits/rejected": -0.8249348998069763, "logps/chosen": -138.30218505859375, "logps/rejected": -75.28288269042969, "loss": 0.1726, "rewards/accuracies": 1.0, "rewards/chosen": 6.7108001708984375, "rewards/margins": 2.910064697265625, "rewards/rejected": 3.8007354736328125, "step": 6731 }, { "epoch": 1.09, "learning_rate": 7.058146728020491e-07, "logits/chosen": -0.6136599183082581, "logits/rejected": -0.6544497013092041, "logps/chosen": -93.93994140625, "logps/rejected": -91.56509399414062, "loss": 0.9108, "rewards/accuracies": 0.0, "rewards/chosen": 1.7468734979629517, "rewards/margins": -1.6239150762557983, "rewards/rejected": 3.37078857421875, "step": 6732 }, { "epoch": 1.09, "learning_rate": 7.056948909820032e-07, "logits/chosen": -0.8224272131919861, "logits/rejected": -0.8293344378471375, "logps/chosen": -38.20140838623047, "logps/rejected": -38.729557037353516, "loss": 0.6646, "rewards/accuracies": 0.0, "rewards/chosen": 2.1412055492401123, "rewards/margins": -0.15127921104431152, "rewards/rejected": 2.292484760284424, "step": 6733 }, { "epoch": 1.09, "learning_rate": 7.055750949503866e-07, "logits/chosen": -0.6230584383010864, "logits/rejected": -0.5895074605941772, "logps/chosen": -22.51803970336914, "logps/rejected": -45.83610534667969, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 0.605890691280365, "rewards/margins": 0.4984894096851349, "rewards/rejected": 0.1074012741446495, "step": 6734 }, { "epoch": 1.09, "learning_rate": 7.05455284715476e-07, "logits/chosen": -0.4862643778324127, "logits/rejected": -0.47175079584121704, "logps/chosen": -57.591156005859375, "logps/rejected": -88.57244873046875, "loss": 0.7996, "rewards/accuracies": 0.0, "rewards/chosen": 1.4756195545196533, "rewards/margins": -0.8298263549804688, "rewards/rejected": 2.305445909500122, "step": 6735 }, { "epoch": 1.09, "learning_rate": 7.053354602855495e-07, "logits/chosen": -0.5253012180328369, "logits/rejected": -0.39459457993507385, "logps/chosen": -57.513946533203125, "logps/rejected": -43.4686393737793, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 1.4614380598068237, "rewards/margins": 0.38731813430786133, "rewards/rejected": 1.0741199254989624, "step": 6736 }, { "epoch": 1.09, "learning_rate": 7.052156216688854e-07, "logits/chosen": -0.4277096092700958, "logits/rejected": -0.6067864894866943, "logps/chosen": -216.46107482910156, "logps/rejected": -208.8625030517578, "loss": 2.0066, "rewards/accuracies": 0.0, "rewards/chosen": 4.72854471206665, "rewards/margins": -3.8970494270324707, "rewards/rejected": 8.625594139099121, "step": 6737 }, { "epoch": 1.09, "learning_rate": 7.050957688737636e-07, "logits/chosen": -0.7639027237892151, "logits/rejected": -0.6501526236534119, "logps/chosen": -134.58883666992188, "logps/rejected": -110.74730682373047, "loss": 0.0425, "rewards/accuracies": 1.0, "rewards/chosen": 6.9542131423950195, "rewards/margins": 2.8373208045959473, "rewards/rejected": 4.116892337799072, "step": 6738 }, { "epoch": 1.09, "learning_rate": 7.049759019084649e-07, "logits/chosen": -0.6043055057525635, "logits/rejected": -0.5612887740135193, "logps/chosen": -69.74059295654297, "logps/rejected": -63.477447509765625, "loss": 0.6725, "rewards/accuracies": 0.0, "rewards/chosen": 1.0461342334747314, "rewards/margins": -0.5508109331130981, "rewards/rejected": 1.5969451665878296, "step": 6739 }, { "epoch": 1.09, "learning_rate": 7.048560207812708e-07, "logits/chosen": -0.7753798365592957, "logits/rejected": -0.6092845797538757, "logps/chosen": -120.58906555175781, "logps/rejected": -64.25372314453125, "loss": 0.9133, "rewards/accuracies": 0.0, "rewards/chosen": 1.4101380109786987, "rewards/margins": -0.7062240839004517, "rewards/rejected": 2.1163620948791504, "step": 6740 }, { "epoch": 1.09, "learning_rate": 7.047361255004641e-07, "logits/chosen": -0.883660078048706, "logits/rejected": -0.9464284777641296, "logps/chosen": -77.97413635253906, "logps/rejected": -173.33883666992188, "loss": 0.95, "rewards/accuracies": 0.0, "rewards/chosen": 1.0627793073654175, "rewards/margins": -1.4892441034317017, "rewards/rejected": 2.552023410797119, "step": 6741 }, { "epoch": 1.09, "learning_rate": 7.046162160743283e-07, "logits/chosen": -0.29305678606033325, "logits/rejected": -0.2911192774772644, "logps/chosen": -6.177633285522461, "logps/rejected": -3.5879569053649902, "loss": 1.8757, "rewards/accuracies": 0.0, "rewards/chosen": 0.23311233520507812, "rewards/margins": -0.21590599417686462, "rewards/rejected": 0.44901832938194275, "step": 6742 }, { "epoch": 1.09, "learning_rate": 7.04496292511148e-07, "logits/chosen": -0.2703661620616913, "logits/rejected": -0.30379337072372437, "logps/chosen": -70.23775482177734, "logps/rejected": -43.864723205566406, "loss": 0.9099, "rewards/accuracies": 0.0, "rewards/chosen": 0.1505378782749176, "rewards/margins": -1.5614010095596313, "rewards/rejected": 1.7119388580322266, "step": 6743 }, { "epoch": 1.09, "learning_rate": 7.043763548192089e-07, "logits/chosen": -0.8050088882446289, "logits/rejected": -0.8292750716209412, "logps/chosen": -126.98770904541016, "logps/rejected": -95.70489501953125, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 3.513991594314575, "rewards/margins": 0.10133600234985352, "rewards/rejected": 3.4126555919647217, "step": 6744 }, { "epoch": 1.09, "learning_rate": 7.042564030067977e-07, "logits/chosen": -0.30163154006004333, "logits/rejected": -0.30163154006004333, "logps/chosen": -81.7698974609375, "logps/rejected": -81.7698974609375, "loss": 0.5211, "rewards/accuracies": 0.0, "rewards/chosen": 0.7694763541221619, "rewards/margins": 0.0, "rewards/rejected": 0.7694763541221619, "step": 6745 }, { "epoch": 1.09, "learning_rate": 7.041364370822016e-07, "logits/chosen": -1.0365700721740723, "logits/rejected": -0.9083259105682373, "logps/chosen": -171.72007751464844, "logps/rejected": -62.434326171875, "loss": 0.3883, "rewards/accuracies": 0.0, "rewards/chosen": 1.1380035877227783, "rewards/margins": -0.040515899658203125, "rewards/rejected": 1.1785194873809814, "step": 6746 }, { "epoch": 1.1, "learning_rate": 7.040164570537092e-07, "logits/chosen": -0.8088636994361877, "logits/rejected": -0.9229952096939087, "logps/chosen": -287.4111328125, "logps/rejected": -84.7810287475586, "loss": 0.6203, "rewards/accuracies": 0.0, "rewards/chosen": 3.6609253883361816, "rewards/margins": -0.8168845176696777, "rewards/rejected": 4.477809906005859, "step": 6747 }, { "epoch": 1.1, "learning_rate": 7.038964629296101e-07, "logits/chosen": -1.022634744644165, "logits/rejected": -0.651725172996521, "logps/chosen": -164.3140869140625, "logps/rejected": -97.56428527832031, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 5.856864929199219, "rewards/margins": 2.531175136566162, "rewards/rejected": 3.3256897926330566, "step": 6748 }, { "epoch": 1.1, "learning_rate": 7.037764547181947e-07, "logits/chosen": -0.3277459740638733, "logits/rejected": -0.3277459740638733, "logps/chosen": -0.5773231983184814, "logps/rejected": -0.5773231983184814, "loss": 0.5307, "rewards/accuracies": 0.0, "rewards/chosen": 0.21626077592372894, "rewards/margins": 0.0, "rewards/rejected": 0.21626077592372894, "step": 6749 }, { "epoch": 1.1, "learning_rate": 7.036564324277545e-07, "logits/chosen": -0.675040602684021, "logits/rejected": -0.5931689143180847, "logps/chosen": -127.58567810058594, "logps/rejected": -73.31027221679688, "loss": 0.0836, "rewards/accuracies": 1.0, "rewards/chosen": 4.982804775238037, "rewards/margins": 2.5398924350738525, "rewards/rejected": 2.4429123401641846, "step": 6750 }, { "epoch": 1.1, "learning_rate": 7.035363960665817e-07, "logits/chosen": -0.6693103909492493, "logits/rejected": -0.6338420510292053, "logps/chosen": -35.89754104614258, "logps/rejected": -71.43923950195312, "loss": 0.4496, "rewards/accuracies": 0.0, "rewards/chosen": 1.6599963903427124, "rewards/margins": -0.12053191661834717, "rewards/rejected": 1.7805283069610596, "step": 6751 }, { "epoch": 1.1, "learning_rate": 7.034163456429698e-07, "logits/chosen": -0.5172806978225708, "logits/rejected": -0.5339922904968262, "logps/chosen": -102.54574584960938, "logps/rejected": -55.46657943725586, "loss": 0.7461, "rewards/accuracies": 0.0, "rewards/chosen": 0.6035972833633423, "rewards/margins": -0.08742368221282959, "rewards/rejected": 0.6910209655761719, "step": 6752 }, { "epoch": 1.1, "learning_rate": 7.032962811652132e-07, "logits/chosen": -0.7289832234382629, "logits/rejected": -0.7261655330657959, "logps/chosen": -105.91902923583984, "logps/rejected": -129.93328857421875, "loss": 0.0902, "rewards/accuracies": 1.0, "rewards/chosen": 6.991544246673584, "rewards/margins": 2.1432013511657715, "rewards/rejected": 4.8483428955078125, "step": 6753 }, { "epoch": 1.1, "learning_rate": 7.031762026416073e-07, "logits/chosen": -0.28471386432647705, "logits/rejected": 0.10452935099601746, "logps/chosen": -106.19055938720703, "logps/rejected": -139.34478759765625, "loss": 0.9819, "rewards/accuracies": 0.0, "rewards/chosen": 1.0406509637832642, "rewards/margins": -1.4043060541152954, "rewards/rejected": 2.4449570178985596, "step": 6754 }, { "epoch": 1.1, "learning_rate": 7.030561100804481e-07, "logits/chosen": -0.3798310458660126, "logits/rejected": -0.3798310458660126, "logps/chosen": -31.148069381713867, "logps/rejected": -31.148069381713867, "loss": 1.1895, "rewards/accuracies": 0.0, "rewards/chosen": 1.6370092630386353, "rewards/margins": 0.0, "rewards/rejected": 1.6370092630386353, "step": 6755 }, { "epoch": 1.1, "learning_rate": 7.029360034900332e-07, "logits/chosen": -1.0781543254852295, "logits/rejected": -0.9519546627998352, "logps/chosen": -61.00722122192383, "logps/rejected": -95.7719955444336, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": 2.7164089679718018, "rewards/margins": 0.41023826599121094, "rewards/rejected": 2.306170701980591, "step": 6756 }, { "epoch": 1.1, "learning_rate": 7.028158828786606e-07, "logits/chosen": -0.36998337507247925, "logits/rejected": -0.28977301716804504, "logps/chosen": -83.53899383544922, "logps/rejected": -56.66413879394531, "loss": 0.6106, "rewards/accuracies": 0.0, "rewards/chosen": 0.5416984558105469, "rewards/margins": -0.8678878545761108, "rewards/rejected": 1.4095863103866577, "step": 6757 }, { "epoch": 1.1, "learning_rate": 7.026957482546294e-07, "logits/chosen": -0.6553485989570618, "logits/rejected": -0.6494235992431641, "logps/chosen": -80.48062133789062, "logps/rejected": -66.4681396484375, "loss": 2.46, "rewards/accuracies": 0.0, "rewards/chosen": 0.4349517822265625, "rewards/margins": -1.7220368385314941, "rewards/rejected": 2.1569886207580566, "step": 6758 }, { "epoch": 1.1, "learning_rate": 7.0257559962624e-07, "logits/chosen": -0.8557142615318298, "logits/rejected": -0.7445778250694275, "logps/chosen": -95.44803619384766, "logps/rejected": -87.67228698730469, "loss": 1.3534, "rewards/accuracies": 0.0, "rewards/chosen": -0.3778114318847656, "rewards/margins": -2.2330474853515625, "rewards/rejected": 1.8552360534667969, "step": 6759 }, { "epoch": 1.1, "learning_rate": 7.024554370017936e-07, "logits/chosen": -0.5270605087280273, "logits/rejected": -0.5870228409767151, "logps/chosen": -79.81655883789062, "logps/rejected": -93.34996795654297, "loss": 0.9291, "rewards/accuracies": 0.0, "rewards/chosen": 0.4523452818393707, "rewards/margins": -1.1121361255645752, "rewards/rejected": 1.5644813776016235, "step": 6760 }, { "epoch": 1.1, "learning_rate": 7.02335260389592e-07, "logits/chosen": -0.9538764357566833, "logits/rejected": -0.8956131935119629, "logps/chosen": -106.1121826171875, "logps/rejected": -37.34056854248047, "loss": 0.1608, "rewards/accuracies": 1.0, "rewards/chosen": 1.7170906066894531, "rewards/margins": 1.3637478351593018, "rewards/rejected": 0.35334283113479614, "step": 6761 }, { "epoch": 1.1, "learning_rate": 7.022150697979384e-07, "logits/chosen": -0.6251435875892639, "logits/rejected": -0.5714313387870789, "logps/chosen": -40.1423454284668, "logps/rejected": -17.509763717651367, "loss": 0.4034, "rewards/accuracies": 1.0, "rewards/chosen": 0.7245987057685852, "rewards/margins": 0.40942898392677307, "rewards/rejected": 0.31516972184181213, "step": 6762 }, { "epoch": 1.1, "learning_rate": 7.020948652351369e-07, "logits/chosen": -0.8627686500549316, "logits/rejected": -0.8792958855628967, "logps/chosen": -105.37006378173828, "logps/rejected": -198.29681396484375, "loss": 0.5781, "rewards/accuracies": 0.0, "rewards/chosen": 1.2161720991134644, "rewards/margins": -0.6444053649902344, "rewards/rejected": 1.8605774641036987, "step": 6763 }, { "epoch": 1.1, "learning_rate": 7.019746467094925e-07, "logits/chosen": -0.5569127202033997, "logits/rejected": -0.41457265615463257, "logps/chosen": -96.75291442871094, "logps/rejected": -28.924758911132812, "loss": 0.3165, "rewards/accuracies": 1.0, "rewards/chosen": 1.058619737625122, "rewards/margins": 0.26228946447372437, "rewards/rejected": 0.7963302731513977, "step": 6764 }, { "epoch": 1.1, "learning_rate": 7.018544142293111e-07, "logits/chosen": -0.37889397144317627, "logits/rejected": -0.34373316168785095, "logps/chosen": -142.50616455078125, "logps/rejected": -71.82827758789062, "loss": 0.6623, "rewards/accuracies": 0.0, "rewards/chosen": -0.6563888788223267, "rewards/margins": -0.9536957144737244, "rewards/rejected": 0.2973068356513977, "step": 6765 }, { "epoch": 1.1, "learning_rate": 7.017341678028996e-07, "logits/chosen": -0.769493043422699, "logits/rejected": -0.7581494450569153, "logps/chosen": -89.64932250976562, "logps/rejected": -58.119728088378906, "loss": 0.7622, "rewards/accuracies": 0.0, "rewards/chosen": 0.419363409280777, "rewards/margins": -1.256208062171936, "rewards/rejected": 1.6755714416503906, "step": 6766 }, { "epoch": 1.1, "learning_rate": 7.016139074385661e-07, "logits/chosen": -0.6967176795005798, "logits/rejected": -0.6392706036567688, "logps/chosen": -77.25540161132812, "logps/rejected": -63.8112678527832, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.6344528198242188, "rewards/margins": -1.077042818069458, "rewards/rejected": 1.7114956378936768, "step": 6767 }, { "epoch": 1.1, "learning_rate": 7.014936331446191e-07, "logits/chosen": -0.4520256519317627, "logits/rejected": -0.4521159529685974, "logps/chosen": -62.8526611328125, "logps/rejected": -147.35870361328125, "loss": 1.0887, "rewards/accuracies": 1.0, "rewards/chosen": 1.5486793518066406, "rewards/margins": 0.3099418878555298, "rewards/rejected": 1.2387374639511108, "step": 6768 }, { "epoch": 1.1, "learning_rate": 7.013733449293686e-07, "logits/chosen": -0.7854175567626953, "logits/rejected": -0.6200208067893982, "logps/chosen": -156.03994750976562, "logps/rejected": -56.92535400390625, "loss": 0.6075, "rewards/accuracies": 0.0, "rewards/chosen": 1.3294785022735596, "rewards/margins": -0.09121012687683105, "rewards/rejected": 1.4206886291503906, "step": 6769 }, { "epoch": 1.1, "learning_rate": 7.012530428011254e-07, "logits/chosen": -1.1624423265457153, "logits/rejected": -1.0586788654327393, "logps/chosen": -75.49508666992188, "logps/rejected": -21.031314849853516, "loss": 0.1259, "rewards/accuracies": 1.0, "rewards/chosen": 5.415066719055176, "rewards/margins": 4.969552516937256, "rewards/rejected": 0.4455142915248871, "step": 6770 }, { "epoch": 1.1, "learning_rate": 7.011327267682013e-07, "logits/chosen": -0.7768743634223938, "logits/rejected": -0.6336992383003235, "logps/chosen": -101.20907592773438, "logps/rejected": -69.72327423095703, "loss": 0.0716, "rewards/accuracies": 1.0, "rewards/chosen": 2.8628602027893066, "rewards/margins": 1.9719047546386719, "rewards/rejected": 0.89095538854599, "step": 6771 }, { "epoch": 1.1, "learning_rate": 7.010123968389087e-07, "logits/chosen": -0.6829451322555542, "logits/rejected": -0.6352525353431702, "logps/chosen": -114.10231018066406, "logps/rejected": -111.90426635742188, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 1.5504578351974487, "rewards/margins": -1.0549041032791138, "rewards/rejected": 2.6053619384765625, "step": 6772 }, { "epoch": 1.1, "learning_rate": 7.008920530215618e-07, "logits/chosen": -0.7154743075370789, "logits/rejected": -0.683481752872467, "logps/chosen": -79.58184051513672, "logps/rejected": -49.85597610473633, "loss": 0.8302, "rewards/accuracies": 1.0, "rewards/chosen": 3.705134630203247, "rewards/margins": 2.367884874343872, "rewards/rejected": 1.337249755859375, "step": 6773 }, { "epoch": 1.1, "learning_rate": 7.007716953244747e-07, "logits/chosen": -0.5067769289016724, "logits/rejected": -0.5067769289016724, "logps/chosen": -17.79154396057129, "logps/rejected": -17.79154396057129, "loss": 0.4017, "rewards/accuracies": 0.0, "rewards/chosen": 0.2013261765241623, "rewards/margins": 0.0, "rewards/rejected": 0.2013261765241623, "step": 6774 }, { "epoch": 1.1, "learning_rate": 7.006513237559632e-07, "logits/chosen": -0.813870370388031, "logits/rejected": -0.6071824431419373, "logps/chosen": -67.25692749023438, "logps/rejected": -167.9246063232422, "loss": 1.0319, "rewards/accuracies": 1.0, "rewards/chosen": 2.2713944911956787, "rewards/margins": 0.26690077781677246, "rewards/rejected": 2.0044937133789062, "step": 6775 }, { "epoch": 1.1, "learning_rate": 7.005309383243437e-07, "logits/chosen": -0.8085855841636658, "logits/rejected": -0.688729465007782, "logps/chosen": -140.38739013671875, "logps/rejected": -41.377777099609375, "loss": 0.2005, "rewards/accuracies": 1.0, "rewards/chosen": 0.9622055292129517, "rewards/margins": 0.7178020477294922, "rewards/rejected": 0.24440346658229828, "step": 6776 }, { "epoch": 1.1, "learning_rate": 7.004105390379341e-07, "logits/chosen": -0.18582558631896973, "logits/rejected": -0.17846444249153137, "logps/chosen": -4.106420993804932, "logps/rejected": -2.738454580307007, "loss": 0.3901, "rewards/accuracies": 1.0, "rewards/chosen": 0.17093577980995178, "rewards/margins": 0.00032036006450653076, "rewards/rejected": 0.17061541974544525, "step": 6777 }, { "epoch": 1.1, "learning_rate": 7.002901259050522e-07, "logits/chosen": -0.9889733791351318, "logits/rejected": -0.6116132736206055, "logps/chosen": -78.94186401367188, "logps/rejected": -123.63113403320312, "loss": 0.4785, "rewards/accuracies": 0.0, "rewards/chosen": 1.7674087285995483, "rewards/margins": -0.39607250690460205, "rewards/rejected": 2.1634812355041504, "step": 6778 }, { "epoch": 1.1, "learning_rate": 7.00169698934018e-07, "logits/chosen": -0.5049935579299927, "logits/rejected": -0.44980284571647644, "logps/chosen": -77.7838363647461, "logps/rejected": -53.46853256225586, "loss": 0.4402, "rewards/accuracies": 0.0, "rewards/chosen": 1.6081199645996094, "rewards/margins": -0.3255733251571655, "rewards/rejected": 1.933693289756775, "step": 6779 }, { "epoch": 1.1, "learning_rate": 7.000492581331515e-07, "logits/chosen": -0.8767843842506409, "logits/rejected": -0.8767843842506409, "logps/chosen": -98.72550964355469, "logps/rejected": -98.72550964355469, "loss": 0.4224, "rewards/accuracies": 0.0, "rewards/chosen": 1.4882103204727173, "rewards/margins": 0.0, "rewards/rejected": 1.4882103204727173, "step": 6780 }, { "epoch": 1.1, "learning_rate": 6.999288035107743e-07, "logits/chosen": -0.5959012508392334, "logits/rejected": -0.6140809655189514, "logps/chosen": -78.17583465576172, "logps/rejected": -156.61390686035156, "loss": 0.9419, "rewards/accuracies": 0.0, "rewards/chosen": 3.0806503295898438, "rewards/margins": -1.6783266067504883, "rewards/rejected": 4.758976936340332, "step": 6781 }, { "epoch": 1.1, "learning_rate": 6.998083350752083e-07, "logits/chosen": -0.6469534039497375, "logits/rejected": -0.6068810820579529, "logps/chosen": -97.68228149414062, "logps/rejected": -78.03691864013672, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 2.724482774734497, "rewards/margins": 0.00579071044921875, "rewards/rejected": 2.7186920642852783, "step": 6782 }, { "epoch": 1.1, "learning_rate": 6.99687852834777e-07, "logits/chosen": -1.2722039222717285, "logits/rejected": -1.363317608833313, "logps/chosen": -144.4119873046875, "logps/rejected": -38.84194564819336, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": 3.9937241077423096, "rewards/margins": 3.5755016803741455, "rewards/rejected": 0.41822242736816406, "step": 6783 }, { "epoch": 1.1, "learning_rate": 6.995673567978047e-07, "logits/chosen": -1.1117357015609741, "logits/rejected": -1.05543053150177, "logps/chosen": -44.94002151489258, "logps/rejected": -79.77085876464844, "loss": 0.416, "rewards/accuracies": 0.0, "rewards/chosen": 1.7790569067001343, "rewards/margins": -0.22182118892669678, "rewards/rejected": 2.000878095626831, "step": 6784 }, { "epoch": 1.1, "learning_rate": 6.994468469726162e-07, "logits/chosen": -0.7916778922080994, "logits/rejected": -0.7220019698143005, "logps/chosen": -162.55120849609375, "logps/rejected": -147.4777374267578, "loss": 0.4445, "rewards/accuracies": 0.0, "rewards/chosen": 4.775332927703857, "rewards/margins": -0.14667510986328125, "rewards/rejected": 4.922008037567139, "step": 6785 }, { "epoch": 1.1, "learning_rate": 6.99326323367538e-07, "logits/chosen": -0.862446129322052, "logits/rejected": -0.6427843570709229, "logps/chosen": -105.53108978271484, "logps/rejected": -35.548248291015625, "loss": 0.0257, "rewards/accuracies": 1.0, "rewards/chosen": 3.720609426498413, "rewards/margins": 3.3838930130004883, "rewards/rejected": 0.3367164731025696, "step": 6786 }, { "epoch": 1.1, "learning_rate": 6.992057859908967e-07, "logits/chosen": -0.9233442544937134, "logits/rejected": -0.8276201486587524, "logps/chosen": -118.65989685058594, "logps/rejected": -46.58748245239258, "loss": 0.0411, "rewards/accuracies": 1.0, "rewards/chosen": 3.3858642578125, "rewards/margins": 3.2746500968933105, "rewards/rejected": 0.1112140640616417, "step": 6787 }, { "epoch": 1.1, "learning_rate": 6.990852348510205e-07, "logits/chosen": -0.4030952751636505, "logits/rejected": -0.4054245054721832, "logps/chosen": -4.9673285484313965, "logps/rejected": -6.350653171539307, "loss": 1.3012, "rewards/accuracies": 1.0, "rewards/chosen": 0.22370624542236328, "rewards/margins": 0.17903518676757812, "rewards/rejected": 0.044671058654785156, "step": 6788 }, { "epoch": 1.1, "learning_rate": 6.989646699562383e-07, "logits/chosen": -0.6285033226013184, "logits/rejected": -0.6357231140136719, "logps/chosen": -108.2285385131836, "logps/rejected": -43.915218353271484, "loss": 0.6258, "rewards/accuracies": 1.0, "rewards/chosen": 3.005112409591675, "rewards/margins": 1.187516689300537, "rewards/rejected": 1.8175957202911377, "step": 6789 }, { "epoch": 1.1, "learning_rate": 6.988440913148801e-07, "logits/chosen": -0.6519898176193237, "logits/rejected": -0.7021197080612183, "logps/chosen": -74.18643951416016, "logps/rejected": -59.19292449951172, "loss": 0.611, "rewards/accuracies": 0.0, "rewards/chosen": 1.0130821466445923, "rewards/margins": -0.11490178108215332, "rewards/rejected": 1.1279839277267456, "step": 6790 }, { "epoch": 1.1, "learning_rate": 6.987234989352766e-07, "logits/chosen": -0.48631739616394043, "logits/rejected": -0.5233331918716431, "logps/chosen": -80.17082214355469, "logps/rejected": -128.81857299804688, "loss": 1.1029, "rewards/accuracies": 0.0, "rewards/chosen": 1.087031602859497, "rewards/margins": -1.3386917114257812, "rewards/rejected": 2.4257233142852783, "step": 6791 }, { "epoch": 1.1, "learning_rate": 6.986028928257597e-07, "logits/chosen": -0.8235382437705994, "logits/rejected": -0.5539065003395081, "logps/chosen": -92.83770751953125, "logps/rejected": -261.937255859375, "loss": 1.3461, "rewards/accuracies": 0.0, "rewards/chosen": 1.3064377307891846, "rewards/margins": -2.5974838733673096, "rewards/rejected": 3.903921604156494, "step": 6792 }, { "epoch": 1.1, "learning_rate": 6.984822729946621e-07, "logits/chosen": -0.19298715889453888, "logits/rejected": -0.1852824091911316, "logps/chosen": -30.27496337890625, "logps/rejected": -9.83192253112793, "loss": 0.5602, "rewards/accuracies": 0.0, "rewards/chosen": -0.09798850864171982, "rewards/margins": -0.11782608181238174, "rewards/rejected": 0.019837571308016777, "step": 6793 }, { "epoch": 1.1, "learning_rate": 6.983616394503176e-07, "logits/chosen": -0.5622621178627014, "logits/rejected": -0.5825416445732117, "logps/chosen": -63.49055480957031, "logps/rejected": -108.76754760742188, "loss": 0.9223, "rewards/accuracies": 1.0, "rewards/chosen": 0.692126452922821, "rewards/margins": 0.6862014532089233, "rewards/rejected": 0.005924988072365522, "step": 6794 }, { "epoch": 1.1, "learning_rate": 6.982409922010606e-07, "logits/chosen": -0.6020691394805908, "logits/rejected": -0.6293827891349792, "logps/chosen": -65.51026916503906, "logps/rejected": -41.36418151855469, "loss": 1.2882, "rewards/accuracies": 0.0, "rewards/chosen": 1.5284096002578735, "rewards/margins": -0.4400627613067627, "rewards/rejected": 1.9684723615646362, "step": 6795 }, { "epoch": 1.1, "learning_rate": 6.981203312552269e-07, "logits/chosen": -0.7560843229293823, "logits/rejected": -0.7170442938804626, "logps/chosen": -210.5851287841797, "logps/rejected": -65.26996612548828, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 3.909649610519409, "rewards/margins": 1.9568747282028198, "rewards/rejected": 1.9527748823165894, "step": 6796 }, { "epoch": 1.1, "learning_rate": 6.979996566211528e-07, "logits/chosen": -0.5452927947044373, "logits/rejected": -0.5144027471542358, "logps/chosen": -135.62855529785156, "logps/rejected": -55.003265380859375, "loss": 0.3848, "rewards/accuracies": 0.0, "rewards/chosen": 0.9647979736328125, "rewards/margins": -0.07277989387512207, "rewards/rejected": 1.0375778675079346, "step": 6797 }, { "epoch": 1.1, "learning_rate": 6.978789683071759e-07, "logits/chosen": -0.8802456259727478, "logits/rejected": -0.8294705748558044, "logps/chosen": -144.38320922851562, "logps/rejected": -109.31861877441406, "loss": 0.8502, "rewards/accuracies": 1.0, "rewards/chosen": 0.6342193484306335, "rewards/margins": 0.4938308596611023, "rewards/rejected": 0.14038848876953125, "step": 6798 }, { "epoch": 1.1, "learning_rate": 6.977582663216349e-07, "logits/chosen": -0.7194036841392517, "logits/rejected": -0.6584633588790894, "logps/chosen": -126.21398162841797, "logps/rejected": -73.37600708007812, "loss": 0.7777, "rewards/accuracies": 1.0, "rewards/chosen": 1.8663307428359985, "rewards/margins": 0.42388153076171875, "rewards/rejected": 1.4424492120742798, "step": 6799 }, { "epoch": 1.1, "learning_rate": 6.976375506728686e-07, "logits/chosen": -0.643265426158905, "logits/rejected": -0.6599166989326477, "logps/chosen": -103.05875396728516, "logps/rejected": -91.17430877685547, "loss": 0.5934, "rewards/accuracies": 0.0, "rewards/chosen": 1.0352448225021362, "rewards/margins": -0.598497748374939, "rewards/rejected": 1.6337425708770752, "step": 6800 }, { "epoch": 1.1, "learning_rate": 6.975168213692179e-07, "logits/chosen": -0.6647454500198364, "logits/rejected": -0.6101059317588806, "logps/chosen": -130.61419677734375, "logps/rejected": -48.19126892089844, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 5.931449890136719, "rewards/margins": 3.982872724533081, "rewards/rejected": 1.9485771656036377, "step": 6801 }, { "epoch": 1.1, "learning_rate": 6.973960784190235e-07, "logits/chosen": -0.5290684700012207, "logits/rejected": -0.5151088237762451, "logps/chosen": -31.703706741333008, "logps/rejected": -6.622121810913086, "loss": 0.9545, "rewards/accuracies": 0.0, "rewards/chosen": 0.36800557374954224, "rewards/margins": -0.17292273044586182, "rewards/rejected": 0.540928304195404, "step": 6802 }, { "epoch": 1.1, "learning_rate": 6.972753218306281e-07, "logits/chosen": -0.5537010431289673, "logits/rejected": -0.5833324790000916, "logps/chosen": -71.6240234375, "logps/rejected": -49.11234664916992, "loss": 0.7594, "rewards/accuracies": 0.0, "rewards/chosen": 1.2327454090118408, "rewards/margins": -0.283400297164917, "rewards/rejected": 1.5161457061767578, "step": 6803 }, { "epoch": 1.1, "learning_rate": 6.971545516123745e-07, "logits/chosen": -0.764247477054596, "logits/rejected": -0.6565410494804382, "logps/chosen": -53.37037658691406, "logps/rejected": -21.36629295349121, "loss": 0.3969, "rewards/accuracies": 1.0, "rewards/chosen": 1.5830062627792358, "rewards/margins": 1.1159276962280273, "rewards/rejected": 0.4670785963535309, "step": 6804 }, { "epoch": 1.1, "learning_rate": 6.970337677726069e-07, "logits/chosen": -0.4631028473377228, "logits/rejected": -0.44859665632247925, "logps/chosen": -33.594879150390625, "logps/rejected": -25.245807647705078, "loss": 0.3391, "rewards/accuracies": 1.0, "rewards/chosen": -0.1481681913137436, "rewards/margins": 0.2551673650741577, "rewards/rejected": -0.4033355712890625, "step": 6805 }, { "epoch": 1.1, "learning_rate": 6.969129703196702e-07, "logits/chosen": -0.5330854654312134, "logits/rejected": -0.5312463045120239, "logps/chosen": -76.3204116821289, "logps/rejected": -62.9593391418457, "loss": 0.419, "rewards/accuracies": 1.0, "rewards/chosen": 1.565242052078247, "rewards/margins": 0.011198878288269043, "rewards/rejected": 1.554043173789978, "step": 6806 }, { "epoch": 1.1, "learning_rate": 6.967921592619104e-07, "logits/chosen": -0.6123017072677612, "logits/rejected": -0.4627639055252075, "logps/chosen": -81.44818878173828, "logps/rejected": -152.87701416015625, "loss": 2.0413, "rewards/accuracies": 0.0, "rewards/chosen": 1.1327400207519531, "rewards/margins": -3.8760828971862793, "rewards/rejected": 5.008822917938232, "step": 6807 }, { "epoch": 1.11, "learning_rate": 6.966713346076747e-07, "logits/chosen": -0.5570408701896667, "logits/rejected": -0.497909814119339, "logps/chosen": -123.34822082519531, "logps/rejected": -60.99818420410156, "loss": 0.4353, "rewards/accuracies": 0.0, "rewards/chosen": 0.9576675295829773, "rewards/margins": -0.22153478860855103, "rewards/rejected": 1.1792023181915283, "step": 6808 }, { "epoch": 1.11, "learning_rate": 6.965504963653105e-07, "logits/chosen": -0.8829249739646912, "logits/rejected": -0.8623161315917969, "logps/chosen": -23.491466522216797, "logps/rejected": -55.92808151245117, "loss": 2.3943, "rewards/accuracies": 0.0, "rewards/chosen": 1.489582896232605, "rewards/margins": -0.04655647277832031, "rewards/rejected": 1.5361393690109253, "step": 6809 }, { "epoch": 1.11, "learning_rate": 6.964296445431668e-07, "logits/chosen": -0.6030209064483643, "logits/rejected": -0.6115537285804749, "logps/chosen": -87.92401885986328, "logps/rejected": -96.88587951660156, "loss": 1.2664, "rewards/accuracies": 0.0, "rewards/chosen": 0.5749183893203735, "rewards/margins": -2.3879919052124023, "rewards/rejected": 2.9629104137420654, "step": 6810 }, { "epoch": 1.11, "learning_rate": 6.963087791495934e-07, "logits/chosen": -0.83643639087677, "logits/rejected": -0.7813219428062439, "logps/chosen": -79.45375061035156, "logps/rejected": -26.524303436279297, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 1.23674476146698, "rewards/margins": 0.14502036571502686, "rewards/rejected": 1.0917243957519531, "step": 6811 }, { "epoch": 1.11, "learning_rate": 6.961879001929408e-07, "logits/chosen": -0.9183332920074463, "logits/rejected": -0.9306972026824951, "logps/chosen": -71.68678283691406, "logps/rejected": -36.3311767578125, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 1.5175141096115112, "rewards/margins": 1.242842197418213, "rewards/rejected": 0.2746719419956207, "step": 6812 }, { "epoch": 1.11, "learning_rate": 6.960670076815607e-07, "logits/chosen": -0.30536743998527527, "logits/rejected": -0.14134201407432556, "logps/chosen": -57.84791564941406, "logps/rejected": -12.042305946350098, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 2.2542877197265625, "rewards/margins": 1.4322514533996582, "rewards/rejected": 0.8220362663269043, "step": 6813 }, { "epoch": 1.11, "learning_rate": 6.959461016238055e-07, "logits/chosen": -0.7697667479515076, "logits/rejected": -0.6738225817680359, "logps/chosen": -82.90702056884766, "logps/rejected": -123.156494140625, "loss": 1.5984, "rewards/accuracies": 0.0, "rewards/chosen": 0.6581459045410156, "rewards/margins": -2.634619951248169, "rewards/rejected": 3.2927658557891846, "step": 6814 }, { "epoch": 1.11, "learning_rate": 6.958251820280288e-07, "logits/chosen": -1.1034165620803833, "logits/rejected": -1.127693772315979, "logps/chosen": -69.63764953613281, "logps/rejected": -63.332542419433594, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.9113502502441406, "rewards/margins": 0.35554802417755127, "rewards/rejected": 1.5558022260665894, "step": 6815 }, { "epoch": 1.11, "learning_rate": 6.957042489025849e-07, "logits/chosen": -0.9903183579444885, "logits/rejected": -0.9479082822799683, "logps/chosen": -66.65174865722656, "logps/rejected": -105.32464599609375, "loss": 0.4264, "rewards/accuracies": 0.0, "rewards/chosen": 2.1065750122070312, "rewards/margins": -0.11513447761535645, "rewards/rejected": 2.2217094898223877, "step": 6816 }, { "epoch": 1.11, "learning_rate": 6.955833022558292e-07, "logits/chosen": -0.5581071972846985, "logits/rejected": -0.5581071972846985, "logps/chosen": -71.53900146484375, "logps/rejected": -71.53900146484375, "loss": 0.3548, "rewards/accuracies": 0.0, "rewards/chosen": 1.6484107971191406, "rewards/margins": 0.0, "rewards/rejected": 1.6484107971191406, "step": 6817 }, { "epoch": 1.11, "learning_rate": 6.954623420961178e-07, "logits/chosen": -0.8509629964828491, "logits/rejected": -0.8691816329956055, "logps/chosen": -172.74716186523438, "logps/rejected": -103.94743347167969, "loss": 0.7253, "rewards/accuracies": 0.0, "rewards/chosen": 4.895532131195068, "rewards/margins": -1.0458879470825195, "rewards/rejected": 5.941420078277588, "step": 6818 }, { "epoch": 1.11, "learning_rate": 6.953413684318083e-07, "logits/chosen": -0.6602147221565247, "logits/rejected": -0.6477691531181335, "logps/chosen": -142.93887329101562, "logps/rejected": -118.87039947509766, "loss": 0.7721, "rewards/accuracies": 0.0, "rewards/chosen": 0.7054107785224915, "rewards/margins": -1.2828240394592285, "rewards/rejected": 1.9882347583770752, "step": 6819 }, { "epoch": 1.11, "learning_rate": 6.952203812712583e-07, "logits/chosen": -0.464396595954895, "logits/rejected": -0.44874390959739685, "logps/chosen": -14.196393966674805, "logps/rejected": -29.307762145996094, "loss": 0.5789, "rewards/accuracies": 1.0, "rewards/chosen": 1.4192808866500854, "rewards/margins": 0.09101128578186035, "rewards/rejected": 1.328269600868225, "step": 6820 }, { "epoch": 1.11, "learning_rate": 6.950993806228273e-07, "logits/chosen": -0.49631965160369873, "logits/rejected": -0.5739476084709167, "logps/chosen": -68.68107604980469, "logps/rejected": -59.310096740722656, "loss": 1.4072, "rewards/accuracies": 1.0, "rewards/chosen": 1.9275375604629517, "rewards/margins": 0.4299507141113281, "rewards/rejected": 1.4975868463516235, "step": 6821 }, { "epoch": 1.11, "learning_rate": 6.949783664948752e-07, "logits/chosen": -0.5489373803138733, "logits/rejected": -0.41227221488952637, "logps/chosen": -116.2569580078125, "logps/rejected": -62.21935272216797, "loss": 0.8183, "rewards/accuracies": 1.0, "rewards/chosen": 4.145544528961182, "rewards/margins": 1.6235320568084717, "rewards/rejected": 2.52201247215271, "step": 6822 }, { "epoch": 1.11, "learning_rate": 6.948573388957627e-07, "logits/chosen": -0.7780530452728271, "logits/rejected": -0.7417877316474915, "logps/chosen": -92.70006561279297, "logps/rejected": -49.996337890625, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": 1.9991730451583862, "rewards/margins": 1.194518804550171, "rewards/rejected": 0.8046543002128601, "step": 6823 }, { "epoch": 1.11, "learning_rate": 6.94736297833852e-07, "logits/chosen": -0.6630226373672485, "logits/rejected": -0.7091478109359741, "logps/chosen": -40.98871612548828, "logps/rejected": -137.8992919921875, "loss": 3.2302, "rewards/accuracies": 0.0, "rewards/chosen": 1.587685465812683, "rewards/margins": -3.7317419052124023, "rewards/rejected": 5.319427490234375, "step": 6824 }, { "epoch": 1.11, "learning_rate": 6.946152433175057e-07, "logits/chosen": -0.8354429006576538, "logits/rejected": -0.6451027989387512, "logps/chosen": -97.40985107421875, "logps/rejected": -28.633535385131836, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 3.1633224487304688, "rewards/margins": 2.92803955078125, "rewards/rejected": 0.23528289794921875, "step": 6825 }, { "epoch": 1.11, "learning_rate": 6.944941753550876e-07, "logits/chosen": -0.9637733101844788, "logits/rejected": -0.986607551574707, "logps/chosen": -188.63462829589844, "logps/rejected": -73.74630737304688, "loss": 1.0731, "rewards/accuracies": 0.0, "rewards/chosen": 0.08017730712890625, "rewards/margins": -1.7452011108398438, "rewards/rejected": 1.82537841796875, "step": 6826 }, { "epoch": 1.11, "learning_rate": 6.943730939549622e-07, "logits/chosen": -0.9894523620605469, "logits/rejected": -0.835587203502655, "logps/chosen": -128.6983642578125, "logps/rejected": -42.807220458984375, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 6.393110752105713, "rewards/margins": 4.488424777984619, "rewards/rejected": 1.9046859741210938, "step": 6827 }, { "epoch": 1.11, "learning_rate": 6.942519991254952e-07, "logits/chosen": -0.32162877917289734, "logits/rejected": -0.31818172335624695, "logps/chosen": -5.3605499267578125, "logps/rejected": -9.108580589294434, "loss": 1.2795, "rewards/accuracies": 1.0, "rewards/chosen": 0.2484244406223297, "rewards/margins": 0.1982606053352356, "rewards/rejected": 0.05016384273767471, "step": 6828 }, { "epoch": 1.11, "learning_rate": 6.941308908750532e-07, "logits/chosen": -0.7620101571083069, "logits/rejected": -0.7461162805557251, "logps/chosen": -56.182273864746094, "logps/rejected": -97.44020080566406, "loss": 0.2571, "rewards/accuracies": 1.0, "rewards/chosen": 1.5400886535644531, "rewards/margins": 0.7765907049179077, "rewards/rejected": 0.7634979486465454, "step": 6829 }, { "epoch": 1.11, "learning_rate": 6.940097692120034e-07, "logits/chosen": -0.5847261548042297, "logits/rejected": -0.6305176615715027, "logps/chosen": -64.12997436523438, "logps/rejected": -87.30154418945312, "loss": 0.3791, "rewards/accuracies": 1.0, "rewards/chosen": 3.018056631088257, "rewards/margins": 0.41611647605895996, "rewards/rejected": 2.601940155029297, "step": 6830 }, { "epoch": 1.11, "learning_rate": 6.938886341447143e-07, "logits/chosen": -0.7021276354789734, "logits/rejected": -0.6356691122055054, "logps/chosen": -60.71039581298828, "logps/rejected": -68.87371826171875, "loss": 1.0743, "rewards/accuracies": 1.0, "rewards/chosen": 2.261077880859375, "rewards/margins": 0.2104644775390625, "rewards/rejected": 2.0506134033203125, "step": 6831 }, { "epoch": 1.11, "learning_rate": 6.937674856815551e-07, "logits/chosen": -0.877185583114624, "logits/rejected": -1.186381220817566, "logps/chosen": -74.6236343383789, "logps/rejected": -36.97140884399414, "loss": 1.4656, "rewards/accuracies": 1.0, "rewards/chosen": 2.068342685699463, "rewards/margins": 1.7840454578399658, "rewards/rejected": 0.2842971980571747, "step": 6832 }, { "epoch": 1.11, "learning_rate": 6.936463238308963e-07, "logits/chosen": -0.6092742681503296, "logits/rejected": -0.6500198245048523, "logps/chosen": -44.16500473022461, "logps/rejected": -49.546173095703125, "loss": 2.6385, "rewards/accuracies": 0.0, "rewards/chosen": 0.42498779296875, "rewards/margins": -1.5351409912109375, "rewards/rejected": 1.9601287841796875, "step": 6833 }, { "epoch": 1.11, "learning_rate": 6.935251486011086e-07, "logits/chosen": -0.5453922748565674, "logits/rejected": -0.4806676208972931, "logps/chosen": -144.18600463867188, "logps/rejected": -92.51618957519531, "loss": 0.4962, "rewards/accuracies": 1.0, "rewards/chosen": 1.578364610671997, "rewards/margins": 0.4030029773712158, "rewards/rejected": 1.1753616333007812, "step": 6834 }, { "epoch": 1.11, "learning_rate": 6.934039600005643e-07, "logits/chosen": -0.6739711761474609, "logits/rejected": -0.4713505804538727, "logps/chosen": -78.57101440429688, "logps/rejected": -78.62152099609375, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": 3.687251329421997, "rewards/margins": 1.9776268005371094, "rewards/rejected": 1.7096245288848877, "step": 6835 }, { "epoch": 1.11, "learning_rate": 6.932827580376365e-07, "logits/chosen": -0.6944010257720947, "logits/rejected": -0.6773064732551575, "logps/chosen": -59.951412200927734, "logps/rejected": -83.34011840820312, "loss": 0.5933, "rewards/accuracies": 0.0, "rewards/chosen": 1.1874821186065674, "rewards/margins": -0.28343307971954346, "rewards/rejected": 1.4709151983261108, "step": 6836 }, { "epoch": 1.11, "learning_rate": 6.93161542720699e-07, "logits/chosen": -0.6008711457252502, "logits/rejected": -0.7291989922523499, "logps/chosen": -101.73028564453125, "logps/rejected": -142.2521514892578, "loss": 2.1319, "rewards/accuracies": 0.0, "rewards/chosen": 1.9604294300079346, "rewards/margins": -3.5421812534332275, "rewards/rejected": 5.502610683441162, "step": 6837 }, { "epoch": 1.11, "learning_rate": 6.930403140581265e-07, "logits/chosen": -0.9748202562332153, "logits/rejected": -0.9622282981872559, "logps/chosen": -39.12813186645508, "logps/rejected": -78.68473052978516, "loss": 0.7127, "rewards/accuracies": 0.0, "rewards/chosen": 1.572731375694275, "rewards/margins": -0.6490880250930786, "rewards/rejected": 2.2218194007873535, "step": 6838 }, { "epoch": 1.11, "learning_rate": 6.929190720582948e-07, "logits/chosen": -0.7610663771629333, "logits/rejected": -0.7282423973083496, "logps/chosen": -77.84394836425781, "logps/rejected": -110.73776245117188, "loss": 0.3845, "rewards/accuracies": 1.0, "rewards/chosen": 1.3219650983810425, "rewards/margins": 0.3307945728302002, "rewards/rejected": 0.9911705255508423, "step": 6839 }, { "epoch": 1.11, "learning_rate": 6.927978167295807e-07, "logits/chosen": -0.6045732498168945, "logits/rejected": -0.589458167552948, "logps/chosen": -203.89068603515625, "logps/rejected": -101.1505126953125, "loss": 0.0668, "rewards/accuracies": 1.0, "rewards/chosen": 4.386013984680176, "rewards/margins": 2.5391693115234375, "rewards/rejected": 1.8468445539474487, "step": 6840 }, { "epoch": 1.11, "learning_rate": 6.926765480803618e-07, "logits/chosen": -0.38275498151779175, "logits/rejected": -0.3357410132884979, "logps/chosen": -69.7054214477539, "logps/rejected": -50.275699615478516, "loss": 1.0877, "rewards/accuracies": 0.0, "rewards/chosen": 1.6239601373672485, "rewards/margins": -0.32336926460266113, "rewards/rejected": 1.9473294019699097, "step": 6841 }, { "epoch": 1.11, "learning_rate": 6.925552661190165e-07, "logits/chosen": -0.7067457437515259, "logits/rejected": -0.8537722229957581, "logps/chosen": -83.76170349121094, "logps/rejected": -70.3790283203125, "loss": 1.0651, "rewards/accuracies": 1.0, "rewards/chosen": 1.228973388671875, "rewards/margins": 0.50079345703125, "rewards/rejected": 0.728179931640625, "step": 6842 }, { "epoch": 1.11, "learning_rate": 6.924339708539243e-07, "logits/chosen": -0.6461071968078613, "logits/rejected": -0.5956014394760132, "logps/chosen": -53.550193786621094, "logps/rejected": -19.787628173828125, "loss": 0.5386, "rewards/accuracies": 1.0, "rewards/chosen": 0.9205536246299744, "rewards/margins": 0.5938720703125, "rewards/rejected": 0.326681524515152, "step": 6843 }, { "epoch": 1.11, "learning_rate": 6.923126622934655e-07, "logits/chosen": -0.6863288879394531, "logits/rejected": -0.6471000909805298, "logps/chosen": -86.32821655273438, "logps/rejected": -16.656814575195312, "loss": 0.8486, "rewards/accuracies": 1.0, "rewards/chosen": 0.38044053316116333, "rewards/margins": 0.05839940905570984, "rewards/rejected": 0.3220411241054535, "step": 6844 }, { "epoch": 1.11, "learning_rate": 6.921913404460216e-07, "logits/chosen": -1.0482616424560547, "logits/rejected": -1.0153658390045166, "logps/chosen": -80.15521240234375, "logps/rejected": -104.42442321777344, "loss": 0.5246, "rewards/accuracies": 0.0, "rewards/chosen": 4.053350925445557, "rewards/margins": -0.5745573043823242, "rewards/rejected": 4.627908229827881, "step": 6845 }, { "epoch": 1.11, "learning_rate": 6.920700053199744e-07, "logits/chosen": -0.5640029907226562, "logits/rejected": -0.5843813419342041, "logps/chosen": -90.45287322998047, "logps/rejected": -91.02152252197266, "loss": 0.8192, "rewards/accuracies": 0.0, "rewards/chosen": 1.603166937828064, "rewards/margins": -0.609214186668396, "rewards/rejected": 2.21238112449646, "step": 6846 }, { "epoch": 1.11, "learning_rate": 6.919486569237073e-07, "logits/chosen": -0.66922527551651, "logits/rejected": -0.7145326137542725, "logps/chosen": -240.7191619873047, "logps/rejected": -94.13629913330078, "loss": 0.1026, "rewards/accuracies": 1.0, "rewards/chosen": 3.564570665359497, "rewards/margins": 2.0054008960723877, "rewards/rejected": 1.5591697692871094, "step": 6847 }, { "epoch": 1.11, "learning_rate": 6.918272952656041e-07, "logits/chosen": -0.877019464969635, "logits/rejected": -0.6797295808792114, "logps/chosen": -106.19352722167969, "logps/rejected": -69.05677795410156, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": 4.853776454925537, "rewards/margins": 2.5424749851226807, "rewards/rejected": 2.3113014698028564, "step": 6848 }, { "epoch": 1.11, "learning_rate": 6.917059203540501e-07, "logits/chosen": -0.624972939491272, "logits/rejected": -0.624972939491272, "logps/chosen": -72.99508666992188, "logps/rejected": -72.99508666992188, "loss": 0.4282, "rewards/accuracies": 0.0, "rewards/chosen": 1.2491416931152344, "rewards/margins": 0.0, "rewards/rejected": 1.2491416931152344, "step": 6849 }, { "epoch": 1.11, "learning_rate": 6.915845321974309e-07, "logits/chosen": -1.4156708717346191, "logits/rejected": -1.3116053342819214, "logps/chosen": -100.78361511230469, "logps/rejected": -92.75691986083984, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 5.219168186187744, "rewards/margins": 2.8138625621795654, "rewards/rejected": 2.4053056240081787, "step": 6850 }, { "epoch": 1.11, "learning_rate": 6.914631308041332e-07, "logits/chosen": -0.5788414478302002, "logits/rejected": -0.5641111731529236, "logps/chosen": -42.25896072387695, "logps/rejected": -58.04009246826172, "loss": 1.5434, "rewards/accuracies": 1.0, "rewards/chosen": 1.9050381183624268, "rewards/margins": 0.24910545349121094, "rewards/rejected": 1.6559326648712158, "step": 6851 }, { "epoch": 1.11, "learning_rate": 6.913417161825449e-07, "logits/chosen": -0.9557366967201233, "logits/rejected": -0.8150792121887207, "logps/chosen": -204.49497985839844, "logps/rejected": -97.41426086425781, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 5.679649353027344, "rewards/margins": 2.3875839710235596, "rewards/rejected": 3.292065382003784, "step": 6852 }, { "epoch": 1.11, "learning_rate": 6.912202883410545e-07, "logits/chosen": -0.5699650645256042, "logits/rejected": -0.4951695203781128, "logps/chosen": -71.4244384765625, "logps/rejected": -53.87828826904297, "loss": 0.6118, "rewards/accuracies": 1.0, "rewards/chosen": 1.722399115562439, "rewards/margins": 0.5351706743240356, "rewards/rejected": 1.1872284412384033, "step": 6853 }, { "epoch": 1.11, "learning_rate": 6.910988472880514e-07, "logits/chosen": -0.3735479414463043, "logits/rejected": -0.371621310710907, "logps/chosen": -4.270885467529297, "logps/rejected": -2.6786601543426514, "loss": 0.4136, "rewards/accuracies": 0.0, "rewards/chosen": 0.2567468285560608, "rewards/margins": -0.002387017011642456, "rewards/rejected": 0.25913384556770325, "step": 6854 }, { "epoch": 1.11, "learning_rate": 6.909773930319262e-07, "logits/chosen": -0.8853753209114075, "logits/rejected": -1.161117672920227, "logps/chosen": -93.7603759765625, "logps/rejected": -50.75405502319336, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8682479858398438, "rewards/margins": 0.21590763330459595, "rewards/rejected": 0.6523403525352478, "step": 6855 }, { "epoch": 1.11, "learning_rate": 6.908559255810703e-07, "logits/chosen": -0.6509702801704407, "logits/rejected": -0.6485928297042847, "logps/chosen": -3.0800840854644775, "logps/rejected": -1.9957314729690552, "loss": 0.4125, "rewards/accuracies": 0.0, "rewards/chosen": 0.45649710297584534, "rewards/margins": -0.14669093489646912, "rewards/rejected": 0.6031880378723145, "step": 6856 }, { "epoch": 1.11, "learning_rate": 6.907344449438758e-07, "logits/chosen": -0.4336625039577484, "logits/rejected": -0.4549403190612793, "logps/chosen": -15.18974781036377, "logps/rejected": -21.85199546813965, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 0.44347068667411804, "rewards/margins": 0.2243065983057022, "rewards/rejected": 0.21916408836841583, "step": 6857 }, { "epoch": 1.11, "learning_rate": 6.906129511287357e-07, "logits/chosen": -0.7037732005119324, "logits/rejected": -0.506315290927887, "logps/chosen": -185.2086181640625, "logps/rejected": -67.75326538085938, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": 4.5981597900390625, "rewards/margins": 1.7158012390136719, "rewards/rejected": 2.8823585510253906, "step": 6858 }, { "epoch": 1.11, "learning_rate": 6.904914441440446e-07, "logits/chosen": -0.2483168989419937, "logits/rejected": -0.2483168989419937, "logps/chosen": -39.82139205932617, "logps/rejected": -39.82139205932617, "loss": 0.5761, "rewards/accuracies": 0.0, "rewards/chosen": -0.09049377590417862, "rewards/margins": 0.0, "rewards/rejected": -0.09049377590417862, "step": 6859 }, { "epoch": 1.11, "learning_rate": 6.903699239981969e-07, "logits/chosen": -0.49457696080207825, "logits/rejected": -0.48423752188682556, "logps/chosen": -45.96039962768555, "logps/rejected": -40.14740753173828, "loss": 0.2997, "rewards/accuracies": 1.0, "rewards/chosen": 2.3178906440734863, "rewards/margins": 0.22153735160827637, "rewards/rejected": 2.09635329246521, "step": 6860 }, { "epoch": 1.11, "learning_rate": 6.902483906995888e-07, "logits/chosen": -0.6673318147659302, "logits/rejected": -0.6654970049858093, "logps/chosen": -67.03091430664062, "logps/rejected": -68.47352600097656, "loss": 0.5079, "rewards/accuracies": 0.0, "rewards/chosen": 1.5763778686523438, "rewards/margins": -0.4295334815979004, "rewards/rejected": 2.005911350250244, "step": 6861 }, { "epoch": 1.11, "learning_rate": 6.901268442566171e-07, "logits/chosen": -0.8541253209114075, "logits/rejected": -0.9174044132232666, "logps/chosen": -96.2813491821289, "logps/rejected": -121.80204010009766, "loss": 0.4528, "rewards/accuracies": 1.0, "rewards/chosen": 1.6961357593536377, "rewards/margins": 1.4410858154296875, "rewards/rejected": 0.2550499141216278, "step": 6862 }, { "epoch": 1.11, "learning_rate": 6.900052846776795e-07, "logits/chosen": -0.5558140873908997, "logits/rejected": -0.605896532535553, "logps/chosen": -162.9013214111328, "logps/rejected": -76.73588562011719, "loss": 1.1755, "rewards/accuracies": 1.0, "rewards/chosen": 3.424525499343872, "rewards/margins": 1.2513320446014404, "rewards/rejected": 2.1731934547424316, "step": 6863 }, { "epoch": 1.11, "learning_rate": 6.898837119711746e-07, "logits/chosen": -0.6105445027351379, "logits/rejected": -0.5961822867393494, "logps/chosen": -94.44578552246094, "logps/rejected": -110.54148864746094, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 1.5445778369903564, "rewards/margins": 0.608284056186676, "rewards/rejected": 0.9362937808036804, "step": 6864 }, { "epoch": 1.11, "learning_rate": 6.897621261455018e-07, "logits/chosen": -0.7009800672531128, "logits/rejected": -0.7717646956443787, "logps/chosen": -97.1237564086914, "logps/rejected": -97.05923461914062, "loss": 1.5955, "rewards/accuracies": 0.0, "rewards/chosen": 0.5889320373535156, "rewards/margins": -1.3220947980880737, "rewards/rejected": 1.9110268354415894, "step": 6865 }, { "epoch": 1.11, "learning_rate": 6.896405272090616e-07, "logits/chosen": -0.826284646987915, "logits/rejected": -0.8040104508399963, "logps/chosen": -118.89629364013672, "logps/rejected": -124.07190704345703, "loss": 1.2374, "rewards/accuracies": 0.0, "rewards/chosen": 3.772275686264038, "rewards/margins": -0.31539463996887207, "rewards/rejected": 4.08767032623291, "step": 6866 }, { "epoch": 1.11, "learning_rate": 6.895189151702553e-07, "logits/chosen": -0.3054697811603546, "logits/rejected": -0.2987096607685089, "logps/chosen": -18.921199798583984, "logps/rejected": -38.658565521240234, "loss": 0.5004, "rewards/accuracies": 1.0, "rewards/chosen": 0.5068870782852173, "rewards/margins": 0.23791125416755676, "rewards/rejected": 0.2689758241176605, "step": 6867 }, { "epoch": 1.11, "learning_rate": 6.893972900374855e-07, "logits/chosen": -0.6818580031394958, "logits/rejected": -0.6590455174446106, "logps/chosen": -89.17676544189453, "logps/rejected": -124.6653060913086, "loss": 1.3331, "rewards/accuracies": 0.0, "rewards/chosen": 2.2483603954315186, "rewards/margins": -1.347792148590088, "rewards/rejected": 3.5961525440216064, "step": 6868 }, { "epoch": 1.11, "learning_rate": 6.892756518191549e-07, "logits/chosen": -0.31320104002952576, "logits/rejected": -0.19822357594966888, "logps/chosen": -67.67974853515625, "logps/rejected": -71.21623229980469, "loss": 0.6356, "rewards/accuracies": 1.0, "rewards/chosen": 2.6527206897735596, "rewards/margins": 2.0380210876464844, "rewards/rejected": 0.6146995425224304, "step": 6869 }, { "epoch": 1.12, "learning_rate": 6.891540005236674e-07, "logits/chosen": -0.8351740837097168, "logits/rejected": -0.7191517353057861, "logps/chosen": -80.99397277832031, "logps/rejected": -73.16407012939453, "loss": 0.2641, "rewards/accuracies": 1.0, "rewards/chosen": 1.4038848876953125, "rewards/margins": 0.9152237176895142, "rewards/rejected": 0.4886611998081207, "step": 6870 }, { "epoch": 1.12, "learning_rate": 6.890323361594286e-07, "logits/chosen": -0.678962230682373, "logits/rejected": -0.6139972805976868, "logps/chosen": -94.48033905029297, "logps/rejected": -38.4710807800293, "loss": 0.2349, "rewards/accuracies": 1.0, "rewards/chosen": 1.2230767011642456, "rewards/margins": 0.9814900159835815, "rewards/rejected": 0.24158668518066406, "step": 6871 }, { "epoch": 1.12, "learning_rate": 6.88910658734844e-07, "logits/chosen": -0.661474347114563, "logits/rejected": -0.5923370122909546, "logps/chosen": -111.31678009033203, "logps/rejected": -69.08193969726562, "loss": 0.4197, "rewards/accuracies": 1.0, "rewards/chosen": 1.9824882745742798, "rewards/margins": 0.7348663806915283, "rewards/rejected": 1.2476218938827515, "step": 6872 }, { "epoch": 1.12, "learning_rate": 6.887889682583204e-07, "logits/chosen": -0.5462098121643066, "logits/rejected": -0.5278428196907043, "logps/chosen": -27.078754425048828, "logps/rejected": -64.3216552734375, "loss": 0.2438, "rewards/accuracies": 1.0, "rewards/chosen": 1.9580048322677612, "rewards/margins": 0.5042046308517456, "rewards/rejected": 1.4538002014160156, "step": 6873 }, { "epoch": 1.12, "learning_rate": 6.886672647382652e-07, "logits/chosen": -0.5583081245422363, "logits/rejected": -0.5992954969406128, "logps/chosen": -16.9447021484375, "logps/rejected": -56.34172821044922, "loss": 1.5912, "rewards/accuracies": 0.0, "rewards/chosen": 0.8254246115684509, "rewards/margins": -1.7724659442901611, "rewards/rejected": 2.597890615463257, "step": 6874 }, { "epoch": 1.12, "learning_rate": 6.885455481830873e-07, "logits/chosen": -0.3545123338699341, "logits/rejected": -0.25517985224723816, "logps/chosen": -62.55656433105469, "logps/rejected": -50.0074348449707, "loss": 0.5107, "rewards/accuracies": 1.0, "rewards/chosen": 2.218296766281128, "rewards/margins": 1.0738799571990967, "rewards/rejected": 1.1444168090820312, "step": 6875 }, { "epoch": 1.12, "learning_rate": 6.884238186011961e-07, "logits/chosen": -0.4232833683490753, "logits/rejected": -0.438323974609375, "logps/chosen": -12.684029579162598, "logps/rejected": -1.219557523727417, "loss": 0.8682, "rewards/accuracies": 0.0, "rewards/chosen": -0.14766645431518555, "rewards/margins": -0.44964030385017395, "rewards/rejected": 0.3019738495349884, "step": 6876 }, { "epoch": 1.12, "learning_rate": 6.88302076001002e-07, "logits/chosen": -0.5187267661094666, "logits/rejected": -0.5775147080421448, "logps/chosen": -33.36043930053711, "logps/rejected": -122.24673461914062, "loss": 0.932, "rewards/accuracies": 0.0, "rewards/chosen": 1.763956904411316, "rewards/margins": -0.6556795835494995, "rewards/rejected": 2.4196364879608154, "step": 6877 }, { "epoch": 1.12, "learning_rate": 6.88180320390916e-07, "logits/chosen": -0.8987578749656677, "logits/rejected": -0.6654422879219055, "logps/chosen": -85.27729797363281, "logps/rejected": -68.18186950683594, "loss": 0.0731, "rewards/accuracies": 1.0, "rewards/chosen": 4.2463860511779785, "rewards/margins": 2.877943515777588, "rewards/rejected": 1.3684425354003906, "step": 6878 }, { "epoch": 1.12, "learning_rate": 6.880585517793507e-07, "logits/chosen": -1.2277482748031616, "logits/rejected": -1.1706221103668213, "logps/chosen": -317.06866455078125, "logps/rejected": -121.65180969238281, "loss": 1.0111, "rewards/accuracies": 0.0, "rewards/chosen": 0.935070812702179, "rewards/margins": -1.8619294166564941, "rewards/rejected": 2.7970001697540283, "step": 6879 }, { "epoch": 1.12, "learning_rate": 6.879367701747187e-07, "logits/chosen": -0.6304751038551331, "logits/rejected": -0.8787584900856018, "logps/chosen": -32.96160888671875, "logps/rejected": -69.06916809082031, "loss": 0.43, "rewards/accuracies": 0.0, "rewards/chosen": 1.478554606437683, "rewards/margins": -0.2524474859237671, "rewards/rejected": 1.7310020923614502, "step": 6880 }, { "epoch": 1.12, "learning_rate": 6.878149755854342e-07, "logits/chosen": -0.4602733254432678, "logits/rejected": -0.4369443356990814, "logps/chosen": -27.328083038330078, "logps/rejected": -5.337630271911621, "loss": 0.6491, "rewards/accuracies": 0.0, "rewards/chosen": 0.1311555951833725, "rewards/margins": -0.598334789276123, "rewards/rejected": 0.7294903993606567, "step": 6881 }, { "epoch": 1.12, "learning_rate": 6.87693168019912e-07, "logits/chosen": -0.693239688873291, "logits/rejected": -0.7912740707397461, "logps/chosen": -234.28701782226562, "logps/rejected": -142.21395874023438, "loss": 0.4587, "rewards/accuracies": 0.0, "rewards/chosen": 4.436361789703369, "rewards/margins": -0.23973560333251953, "rewards/rejected": 4.676097393035889, "step": 6882 }, { "epoch": 1.12, "learning_rate": 6.875713474865678e-07, "logits/chosen": -0.6275337934494019, "logits/rejected": -0.5940045714378357, "logps/chosen": -41.97098922729492, "logps/rejected": -34.07498550415039, "loss": 0.2985, "rewards/accuracies": 1.0, "rewards/chosen": 0.9668213129043579, "rewards/margins": 0.21624833345413208, "rewards/rejected": 0.7505729794502258, "step": 6883 }, { "epoch": 1.12, "learning_rate": 6.874495139938185e-07, "logits/chosen": -0.7297706604003906, "logits/rejected": -0.4191039204597473, "logps/chosen": -146.664794921875, "logps/rejected": -50.420166015625, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 4.5161333084106445, "rewards/margins": 4.197739124298096, "rewards/rejected": 0.31839409470558167, "step": 6884 }, { "epoch": 1.12, "learning_rate": 6.873276675500812e-07, "logits/chosen": -0.37013527750968933, "logits/rejected": -0.37013527750968933, "logps/chosen": -3.4381494522094727, "logps/rejected": -3.4381494522094727, "loss": 0.6455, "rewards/accuracies": 0.0, "rewards/chosen": 0.3635617792606354, "rewards/margins": 0.0, "rewards/rejected": 0.3635617792606354, "step": 6885 }, { "epoch": 1.12, "learning_rate": 6.872058081637747e-07, "logits/chosen": -0.3161177933216095, "logits/rejected": -0.37305983901023865, "logps/chosen": -106.88658142089844, "logps/rejected": -63.87104034423828, "loss": 0.9384, "rewards/accuracies": 0.0, "rewards/chosen": 1.6982589960098267, "rewards/margins": -0.8182669878005981, "rewards/rejected": 2.516525983810425, "step": 6886 }, { "epoch": 1.12, "learning_rate": 6.870839358433183e-07, "logits/chosen": -0.8993850946426392, "logits/rejected": -0.8699589371681213, "logps/chosen": -76.0367660522461, "logps/rejected": -46.99241638183594, "loss": 0.5534, "rewards/accuracies": 0.0, "rewards/chosen": 1.4267014265060425, "rewards/margins": -0.7036551237106323, "rewards/rejected": 2.130356550216675, "step": 6887 }, { "epoch": 1.12, "learning_rate": 6.86962050597132e-07, "logits/chosen": -0.7882791757583618, "logits/rejected": -0.7588454484939575, "logps/chosen": -68.91084289550781, "logps/rejected": -57.12404251098633, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 3.2642579078674316, "rewards/margins": 1.0669238567352295, "rewards/rejected": 2.197334051132202, "step": 6888 }, { "epoch": 1.12, "learning_rate": 6.86840152433637e-07, "logits/chosen": -0.3980560898780823, "logits/rejected": -0.3980560898780823, "logps/chosen": -62.94456481933594, "logps/rejected": -62.94456481933594, "loss": 0.3554, "rewards/accuracies": 0.0, "rewards/chosen": 0.4982971251010895, "rewards/margins": 0.0, "rewards/rejected": 0.4982971251010895, "step": 6889 }, { "epoch": 1.12, "learning_rate": 6.867182413612556e-07, "logits/chosen": -0.5115787982940674, "logits/rejected": -0.5115787982940674, "logps/chosen": -69.48335266113281, "logps/rejected": -69.48335266113281, "loss": 1.2152, "rewards/accuracies": 0.0, "rewards/chosen": 1.3451080322265625, "rewards/margins": 0.0, "rewards/rejected": 1.3451080322265625, "step": 6890 }, { "epoch": 1.12, "learning_rate": 6.865963173884101e-07, "logits/chosen": -0.34721288084983826, "logits/rejected": -0.24530284106731415, "logps/chosen": -88.45697021484375, "logps/rejected": -69.37614440917969, "loss": 1.0394, "rewards/accuracies": 0.0, "rewards/chosen": 0.277609258890152, "rewards/margins": -1.835518717765808, "rewards/rejected": 2.1131279468536377, "step": 6891 }, { "epoch": 1.12, "learning_rate": 6.864743805235251e-07, "logits/chosen": -0.7145737409591675, "logits/rejected": -0.3226574659347534, "logps/chosen": -50.588382720947266, "logps/rejected": -65.51126861572266, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": 3.033445358276367, "rewards/margins": 0.0050280094146728516, "rewards/rejected": 3.0284173488616943, "step": 6892 }, { "epoch": 1.12, "learning_rate": 6.863524307750246e-07, "logits/chosen": -0.7602099180221558, "logits/rejected": -0.7602099180221558, "logps/chosen": -35.35314178466797, "logps/rejected": -35.35314178466797, "loss": 0.3913, "rewards/accuracies": 0.0, "rewards/chosen": 1.4269897937774658, "rewards/margins": 0.0, "rewards/rejected": 1.4269897937774658, "step": 6893 }, { "epoch": 1.12, "learning_rate": 6.862304681513344e-07, "logits/chosen": -0.935984194278717, "logits/rejected": -0.8490325212478638, "logps/chosen": -80.12913513183594, "logps/rejected": -102.65151977539062, "loss": 1.0153, "rewards/accuracies": 0.0, "rewards/chosen": 1.3795868158340454, "rewards/margins": -0.8361502885818481, "rewards/rejected": 2.2157371044158936, "step": 6894 }, { "epoch": 1.12, "learning_rate": 6.86108492660881e-07, "logits/chosen": -0.7026351094245911, "logits/rejected": -0.5025327205657959, "logps/chosen": -78.59749603271484, "logps/rejected": -26.694026947021484, "loss": 0.1508, "rewards/accuracies": 1.0, "rewards/chosen": 2.919322967529297, "rewards/margins": 2.4760096073150635, "rewards/rejected": 0.4433134198188782, "step": 6895 }, { "epoch": 1.12, "learning_rate": 6.859865043120918e-07, "logits/chosen": -0.7312182188034058, "logits/rejected": -0.6407327055931091, "logps/chosen": -87.76756286621094, "logps/rejected": -61.533451080322266, "loss": 0.2153, "rewards/accuracies": 1.0, "rewards/chosen": 3.4790847301483154, "rewards/margins": 0.9734926223754883, "rewards/rejected": 2.505592107772827, "step": 6896 }, { "epoch": 1.12, "learning_rate": 6.858645031133949e-07, "logits/chosen": -0.7894877195358276, "logits/rejected": -0.8048161268234253, "logps/chosen": -45.035091400146484, "logps/rejected": -80.75535583496094, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 2.086427688598633, "rewards/margins": 0.7535525560379028, "rewards/rejected": 1.33287513256073, "step": 6897 }, { "epoch": 1.12, "learning_rate": 6.857424890732195e-07, "logits/chosen": -0.23634690046310425, "logits/rejected": -0.29607298970222473, "logps/chosen": -103.74774169921875, "logps/rejected": -79.75877380371094, "loss": 0.5104, "rewards/accuracies": 0.0, "rewards/chosen": 0.7094688415527344, "rewards/margins": -0.04062575101852417, "rewards/rejected": 0.7500945925712585, "step": 6898 }, { "epoch": 1.12, "learning_rate": 6.856204621999955e-07, "logits/chosen": -0.6239296793937683, "logits/rejected": -0.6702550053596497, "logps/chosen": -72.58718872070312, "logps/rejected": -166.4326171875, "loss": 1.1928, "rewards/accuracies": 0.0, "rewards/chosen": 0.5312851071357727, "rewards/margins": -1.4889466762542725, "rewards/rejected": 2.0202317237854004, "step": 6899 }, { "epoch": 1.12, "learning_rate": 6.854984225021541e-07, "logits/chosen": -0.5376120805740356, "logits/rejected": -0.4928978681564331, "logps/chosen": -64.41229248046875, "logps/rejected": -29.056177139282227, "loss": 1.3254, "rewards/accuracies": 1.0, "rewards/chosen": 1.0496978759765625, "rewards/margins": 0.27900177240371704, "rewards/rejected": 0.7706961035728455, "step": 6900 }, { "epoch": 1.12, "learning_rate": 6.853763699881269e-07, "logits/chosen": -0.3341658115386963, "logits/rejected": -0.3341658115386963, "logps/chosen": -6.276491165161133, "logps/rejected": -6.276491165161133, "loss": 1.444, "rewards/accuracies": 0.0, "rewards/chosen": 0.37632542848587036, "rewards/margins": 0.0, "rewards/rejected": 0.37632542848587036, "step": 6901 }, { "epoch": 1.12, "learning_rate": 6.852543046663466e-07, "logits/chosen": -0.6333615183830261, "logits/rejected": -0.6378001570701599, "logps/chosen": -4.29154109954834, "logps/rejected": -1.9001469612121582, "loss": 0.7625, "rewards/accuracies": 0.0, "rewards/chosen": 0.06195850297808647, "rewards/margins": -0.1895877867937088, "rewards/rejected": 0.25154629349708557, "step": 6902 }, { "epoch": 1.12, "learning_rate": 6.851322265452466e-07, "logits/chosen": -0.4621916711330414, "logits/rejected": -0.335683673620224, "logps/chosen": -54.31986999511719, "logps/rejected": -13.761968612670898, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": 1.8739738464355469, "rewards/margins": 1.4543927907943726, "rewards/rejected": 0.41958102583885193, "step": 6903 }, { "epoch": 1.12, "learning_rate": 6.850101356332616e-07, "logits/chosen": -0.5586817860603333, "logits/rejected": -0.5586817860603333, "logps/chosen": -111.32276916503906, "logps/rejected": -111.32276916503906, "loss": 0.4859, "rewards/accuracies": 0.0, "rewards/chosen": 0.5494278073310852, "rewards/margins": 0.0, "rewards/rejected": 0.5494278073310852, "step": 6904 }, { "epoch": 1.12, "learning_rate": 6.848880319388269e-07, "logits/chosen": -0.7964051961898804, "logits/rejected": -0.7765082120895386, "logps/chosen": -101.06051635742188, "logps/rejected": -88.70685577392578, "loss": 1.9478, "rewards/accuracies": 0.0, "rewards/chosen": 1.799835205078125, "rewards/margins": -2.525127410888672, "rewards/rejected": 4.324962615966797, "step": 6905 }, { "epoch": 1.12, "learning_rate": 6.847659154703785e-07, "logits/chosen": -0.8487369418144226, "logits/rejected": -0.6389737129211426, "logps/chosen": -225.42849731445312, "logps/rejected": -59.84449005126953, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 4.277505397796631, "rewards/margins": 1.4955389499664307, "rewards/rejected": 2.7819664478302, "step": 6906 }, { "epoch": 1.12, "learning_rate": 6.846437862363535e-07, "logits/chosen": -0.42678454518318176, "logits/rejected": -0.39620083570480347, "logps/chosen": -62.61534118652344, "logps/rejected": -63.61261749267578, "loss": 1.5666, "rewards/accuracies": 0.0, "rewards/chosen": 2.415876865386963, "rewards/margins": -0.20298528671264648, "rewards/rejected": 2.6188621520996094, "step": 6907 }, { "epoch": 1.12, "learning_rate": 6.845216442451901e-07, "logits/chosen": -0.965718150138855, "logits/rejected": -0.8972375392913818, "logps/chosen": -115.7227783203125, "logps/rejected": -85.23014068603516, "loss": 1.3515, "rewards/accuracies": 0.0, "rewards/chosen": 0.5844520926475525, "rewards/margins": -1.598379373550415, "rewards/rejected": 2.1828315258026123, "step": 6908 }, { "epoch": 1.12, "learning_rate": 6.843994895053271e-07, "logits/chosen": -0.6475200057029724, "logits/rejected": -0.6489160060882568, "logps/chosen": -64.31907653808594, "logps/rejected": -43.44538116455078, "loss": 0.4422, "rewards/accuracies": 0.0, "rewards/chosen": 1.311639428138733, "rewards/margins": -0.3149726390838623, "rewards/rejected": 1.6266120672225952, "step": 6909 }, { "epoch": 1.12, "learning_rate": 6.842773220252041e-07, "logits/chosen": -0.9440919756889343, "logits/rejected": -0.9216688871383667, "logps/chosen": -60.904727935791016, "logps/rejected": -139.62850952148438, "loss": 0.9735, "rewards/accuracies": 0.0, "rewards/chosen": 2.5827298164367676, "rewards/margins": -1.780327320098877, "rewards/rejected": 4.3630571365356445, "step": 6910 }, { "epoch": 1.12, "learning_rate": 6.841551418132618e-07, "logits/chosen": -0.7680540680885315, "logits/rejected": -0.6779471635818481, "logps/chosen": -58.923831939697266, "logps/rejected": -95.79135131835938, "loss": 1.1176, "rewards/accuracies": 0.0, "rewards/chosen": 1.3485714197158813, "rewards/margins": -1.0848132371902466, "rewards/rejected": 2.433384656906128, "step": 6911 }, { "epoch": 1.12, "learning_rate": 6.840329488779417e-07, "logits/chosen": -0.234090194106102, "logits/rejected": -0.1716003566980362, "logps/chosen": -57.85993194580078, "logps/rejected": -39.86112594604492, "loss": 1.0695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8030311465263367, "rewards/margins": -1.4592602252960205, "rewards/rejected": 2.262291431427002, "step": 6912 }, { "epoch": 1.12, "learning_rate": 6.839107432276863e-07, "logits/chosen": -0.6725759506225586, "logits/rejected": -0.5583603382110596, "logps/chosen": -73.7643814086914, "logps/rejected": -63.481712341308594, "loss": 0.2937, "rewards/accuracies": 1.0, "rewards/chosen": 2.4812562465667725, "rewards/margins": 0.31359267234802246, "rewards/rejected": 2.16766357421875, "step": 6913 }, { "epoch": 1.12, "learning_rate": 6.837885248709385e-07, "logits/chosen": -0.6802317500114441, "logits/rejected": -0.7198284268379211, "logps/chosen": -58.88604736328125, "logps/rejected": -73.38276672363281, "loss": 0.6142, "rewards/accuracies": 0.0, "rewards/chosen": 1.3323029279708862, "rewards/margins": -0.5946570634841919, "rewards/rejected": 1.9269599914550781, "step": 6914 }, { "epoch": 1.12, "learning_rate": 6.836662938161429e-07, "logits/chosen": -0.673269510269165, "logits/rejected": -0.6925050616264343, "logps/chosen": -109.47267150878906, "logps/rejected": -81.58197021484375, "loss": 1.3156, "rewards/accuracies": 1.0, "rewards/chosen": 1.371649146080017, "rewards/margins": 0.18590235710144043, "rewards/rejected": 1.1857467889785767, "step": 6915 }, { "epoch": 1.12, "learning_rate": 6.83544050071744e-07, "logits/chosen": -0.530714750289917, "logits/rejected": -0.3980949819087982, "logps/chosen": -72.36087799072266, "logps/rejected": -43.78202819824219, "loss": 0.2135, "rewards/accuracies": 1.0, "rewards/chosen": 2.978548526763916, "rewards/margins": 0.6689820289611816, "rewards/rejected": 2.3095664978027344, "step": 6916 }, { "epoch": 1.12, "learning_rate": 6.834217936461882e-07, "logits/chosen": -0.6557541489601135, "logits/rejected": -0.6222604513168335, "logps/chosen": -27.470788955688477, "logps/rejected": -23.475589752197266, "loss": 1.0373, "rewards/accuracies": 1.0, "rewards/chosen": 0.9978048205375671, "rewards/margins": 0.49896904826164246, "rewards/rejected": 0.4988357722759247, "step": 6917 }, { "epoch": 1.12, "learning_rate": 6.832995245479218e-07, "logits/chosen": -0.5503082275390625, "logits/rejected": -0.5476090312004089, "logps/chosen": -3.454904794692993, "logps/rejected": -10.561767578125, "loss": 0.7557, "rewards/accuracies": 0.0, "rewards/chosen": 0.04619128629565239, "rewards/margins": -0.0048821717500686646, "rewards/rejected": 0.051073458045721054, "step": 6918 }, { "epoch": 1.12, "learning_rate": 6.831772427853928e-07, "logits/chosen": -0.5958976745605469, "logits/rejected": -0.5958976745605469, "logps/chosen": -0.6476654410362244, "logps/rejected": -0.6476654410362244, "loss": 0.4833, "rewards/accuracies": 0.0, "rewards/chosen": 0.19137278199195862, "rewards/margins": 0.0, "rewards/rejected": 0.19137278199195862, "step": 6919 }, { "epoch": 1.12, "learning_rate": 6.830549483670495e-07, "logits/chosen": -0.8980092406272888, "logits/rejected": -0.9207533001899719, "logps/chosen": -103.34709167480469, "logps/rejected": -92.35507202148438, "loss": 0.6548, "rewards/accuracies": 0.0, "rewards/chosen": 0.7340332269668579, "rewards/margins": -0.976287841796875, "rewards/rejected": 1.710321068763733, "step": 6920 }, { "epoch": 1.12, "learning_rate": 6.829326413013413e-07, "logits/chosen": -0.3112240731716156, "logits/rejected": -0.3539721965789795, "logps/chosen": -85.92760467529297, "logps/rejected": -45.6038818359375, "loss": 1.3512, "rewards/accuracies": 0.0, "rewards/chosen": 1.7993271350860596, "rewards/margins": -0.4292938709259033, "rewards/rejected": 2.228621006011963, "step": 6921 }, { "epoch": 1.12, "learning_rate": 6.828103215967186e-07, "logits/chosen": -0.6567087769508362, "logits/rejected": -0.6055413484573364, "logps/chosen": -74.7532958984375, "logps/rejected": -38.77119827270508, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 0.912701427936554, "rewards/margins": 0.8129673004150391, "rewards/rejected": 0.0997341200709343, "step": 6922 }, { "epoch": 1.12, "learning_rate": 6.826879892616324e-07, "logits/chosen": -0.22532114386558533, "logits/rejected": -0.21721908450126648, "logps/chosen": -1.1527445316314697, "logps/rejected": -3.1369500160217285, "loss": 0.4865, "rewards/accuracies": 0.0, "rewards/chosen": 0.1906963586807251, "rewards/margins": -0.12337622046470642, "rewards/rejected": 0.3140725791454315, "step": 6923 }, { "epoch": 1.12, "learning_rate": 6.825656443045346e-07, "logits/chosen": -0.7228490710258484, "logits/rejected": -0.7190729975700378, "logps/chosen": -30.58315658569336, "logps/rejected": -53.04656219482422, "loss": 0.4007, "rewards/accuracies": 1.0, "rewards/chosen": 2.9498822689056396, "rewards/margins": 0.2308797836303711, "rewards/rejected": 2.7190024852752686, "step": 6924 }, { "epoch": 1.12, "learning_rate": 6.824432867338785e-07, "logits/chosen": -0.6876521110534668, "logits/rejected": -0.6831225156784058, "logps/chosen": -52.26859664916992, "logps/rejected": -56.45582962036133, "loss": 1.0389, "rewards/accuracies": 1.0, "rewards/chosen": 2.0243725776672363, "rewards/margins": 0.08036196231842041, "rewards/rejected": 1.944010615348816, "step": 6925 }, { "epoch": 1.12, "learning_rate": 6.823209165581175e-07, "logits/chosen": -0.6967913508415222, "logits/rejected": -0.6967913508415222, "logps/chosen": -63.15290832519531, "logps/rejected": -63.15290832519531, "loss": 0.3486, "rewards/accuracies": 0.0, "rewards/chosen": 1.1582611799240112, "rewards/margins": 0.0, "rewards/rejected": 1.1582611799240112, "step": 6926 }, { "epoch": 1.12, "learning_rate": 6.821985337857063e-07, "logits/chosen": -0.7865721583366394, "logits/rejected": -0.7826489806175232, "logps/chosen": -81.79483032226562, "logps/rejected": -89.72494506835938, "loss": 0.4846, "rewards/accuracies": 0.0, "rewards/chosen": 0.7895218133926392, "rewards/margins": -0.47513580322265625, "rewards/rejected": 1.2646576166152954, "step": 6927 }, { "epoch": 1.12, "learning_rate": 6.820761384251004e-07, "logits/chosen": -0.3825742304325104, "logits/rejected": -0.3713601529598236, "logps/chosen": -1.9342143535614014, "logps/rejected": -12.207483291625977, "loss": 1.2972, "rewards/accuracies": 1.0, "rewards/chosen": 0.36455902457237244, "rewards/margins": 0.20823921263217926, "rewards/rejected": 0.15631981194019318, "step": 6928 }, { "epoch": 1.12, "learning_rate": 6.81953730484756e-07, "logits/chosen": -0.5685858130455017, "logits/rejected": -0.5993508696556091, "logps/chosen": -114.61146545410156, "logps/rejected": -106.87030029296875, "loss": 1.4018, "rewards/accuracies": 1.0, "rewards/chosen": 0.8851715326309204, "rewards/margins": 0.200408935546875, "rewards/rejected": 0.6847625970840454, "step": 6929 }, { "epoch": 1.12, "learning_rate": 6.818313099731307e-07, "logits/chosen": -0.6930933594703674, "logits/rejected": -0.6181077361106873, "logps/chosen": -97.1857681274414, "logps/rejected": -86.61599731445312, "loss": 2.952, "rewards/accuracies": 1.0, "rewards/chosen": 5.14791202545166, "rewards/margins": 1.2432794570922852, "rewards/rejected": 3.904632568359375, "step": 6930 }, { "epoch": 1.12, "learning_rate": 6.817088768986822e-07, "logits/chosen": -0.5724619626998901, "logits/rejected": -0.45987504720687866, "logps/chosen": -54.345306396484375, "logps/rejected": -61.182743072509766, "loss": 2.4674, "rewards/accuracies": 0.0, "rewards/chosen": 1.5923187732696533, "rewards/margins": -0.6579098701477051, "rewards/rejected": 2.2502286434173584, "step": 6931 }, { "epoch": 1.13, "learning_rate": 6.815864312698699e-07, "logits/chosen": -0.738196849822998, "logits/rejected": -0.8800017237663269, "logps/chosen": -65.54194641113281, "logps/rejected": -52.14638900756836, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 2.762983798980713, "rewards/margins": 0.7773327827453613, "rewards/rejected": 1.9856510162353516, "step": 6932 }, { "epoch": 1.13, "learning_rate": 6.814639730951532e-07, "logits/chosen": -0.5915740728378296, "logits/rejected": -0.5915740728378296, "logps/chosen": -56.842010498046875, "logps/rejected": -56.842010498046875, "loss": 1.5034, "rewards/accuracies": 0.0, "rewards/chosen": 0.9076194763183594, "rewards/margins": 0.0, "rewards/rejected": 0.9076194763183594, "step": 6933 }, { "epoch": 1.13, "learning_rate": 6.81341502382993e-07, "logits/chosen": -0.7274751663208008, "logits/rejected": -0.6788404583930969, "logps/chosen": -87.90354919433594, "logps/rejected": -86.82429504394531, "loss": 0.1084, "rewards/accuracies": 1.0, "rewards/chosen": 5.596251010894775, "rewards/margins": 3.209280490875244, "rewards/rejected": 2.3869705200195312, "step": 6934 }, { "epoch": 1.13, "learning_rate": 6.812190191418508e-07, "logits/chosen": -0.8026223182678223, "logits/rejected": -0.6993151903152466, "logps/chosen": -52.08018112182617, "logps/rejected": -27.999969482421875, "loss": 0.3861, "rewards/accuracies": 1.0, "rewards/chosen": 2.2901484966278076, "rewards/margins": 2.0735111236572266, "rewards/rejected": 0.21663741767406464, "step": 6935 }, { "epoch": 1.13, "learning_rate": 6.810965233801892e-07, "logits/chosen": -0.5396225452423096, "logits/rejected": -0.4656139612197876, "logps/chosen": -36.0191535949707, "logps/rejected": -88.4574966430664, "loss": 1.1399, "rewards/accuracies": 1.0, "rewards/chosen": 1.2845875024795532, "rewards/margins": 0.6820644736289978, "rewards/rejected": 0.6025230288505554, "step": 6936 }, { "epoch": 1.13, "learning_rate": 6.809740151064713e-07, "logits/chosen": -0.5284250974655151, "logits/rejected": -0.6010076403617859, "logps/chosen": -63.06019973754883, "logps/rejected": -68.12225341796875, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 2.4757778644561768, "rewards/margins": 0.12052798271179199, "rewards/rejected": 2.3552498817443848, "step": 6937 }, { "epoch": 1.13, "learning_rate": 6.808514943291615e-07, "logits/chosen": -1.059562087059021, "logits/rejected": -0.9191034436225891, "logps/chosen": -168.56349182128906, "logps/rejected": -115.00563049316406, "loss": 1.6121, "rewards/accuracies": 0.0, "rewards/chosen": 5.076714992523193, "rewards/margins": -2.237570285797119, "rewards/rejected": 7.3142852783203125, "step": 6938 }, { "epoch": 1.13, "learning_rate": 6.807289610567246e-07, "logits/chosen": -1.1572580337524414, "logits/rejected": -1.0208686590194702, "logps/chosen": -147.2796630859375, "logps/rejected": -20.00198745727539, "loss": 0.1206, "rewards/accuracies": 1.0, "rewards/chosen": 6.122048854827881, "rewards/margins": 5.703732013702393, "rewards/rejected": 0.418317049741745, "step": 6939 }, { "epoch": 1.13, "learning_rate": 6.806064152976264e-07, "logits/chosen": -0.17908944189548492, "logits/rejected": -0.1743553727865219, "logps/chosen": -55.303279876708984, "logps/rejected": -74.26473999023438, "loss": 1.0366, "rewards/accuracies": 0.0, "rewards/chosen": 1.6366008520126343, "rewards/margins": -1.9103916883468628, "rewards/rejected": 3.546992540359497, "step": 6940 }, { "epoch": 1.13, "learning_rate": 6.804838570603339e-07, "logits/chosen": -0.5120408535003662, "logits/rejected": -0.562688410282135, "logps/chosen": -132.3345947265625, "logps/rejected": -68.37750244140625, "loss": 1.3121, "rewards/accuracies": 0.0, "rewards/chosen": 0.15047608315944672, "rewards/margins": -2.503740072250366, "rewards/rejected": 2.6542160511016846, "step": 6941 }, { "epoch": 1.13, "learning_rate": 6.803612863533149e-07, "logits/chosen": -0.5260218977928162, "logits/rejected": -0.5055811405181885, "logps/chosen": -33.09242630004883, "logps/rejected": -16.520732879638672, "loss": 0.7527, "rewards/accuracies": 0.0, "rewards/chosen": -0.27496108412742615, "rewards/margins": -0.3820047378540039, "rewards/rejected": 0.10704364627599716, "step": 6942 }, { "epoch": 1.13, "learning_rate": 6.802387031850372e-07, "logits/chosen": -0.692244827747345, "logits/rejected": -0.5464292168617249, "logps/chosen": -118.52633666992188, "logps/rejected": -51.15420150756836, "loss": 0.2608, "rewards/accuracies": 1.0, "rewards/chosen": 4.1887969970703125, "rewards/margins": 2.2965097427368164, "rewards/rejected": 1.8922871351242065, "step": 6943 }, { "epoch": 1.13, "learning_rate": 6.801161075639708e-07, "logits/chosen": -0.7019821405410767, "logits/rejected": -0.6618523001670837, "logps/chosen": -60.41739273071289, "logps/rejected": -39.35102081298828, "loss": 0.2621, "rewards/accuracies": 1.0, "rewards/chosen": 1.8713024854660034, "rewards/margins": 1.0923184156417847, "rewards/rejected": 0.7789840698242188, "step": 6944 }, { "epoch": 1.13, "learning_rate": 6.799934994985855e-07, "logits/chosen": -0.7835681438446045, "logits/rejected": -0.7705678939819336, "logps/chosen": -139.51170349121094, "logps/rejected": -58.96012878417969, "loss": 0.6744, "rewards/accuracies": 0.0, "rewards/chosen": 1.1772232055664062, "rewards/margins": -0.47539985179901123, "rewards/rejected": 1.6526230573654175, "step": 6945 }, { "epoch": 1.13, "learning_rate": 6.798708789973527e-07, "logits/chosen": -0.537299633026123, "logits/rejected": -0.537299633026123, "logps/chosen": -1.749539852142334, "logps/rejected": -1.749539852142334, "loss": 0.4391, "rewards/accuracies": 0.0, "rewards/chosen": 0.24104662239551544, "rewards/margins": 0.0, "rewards/rejected": 0.24104662239551544, "step": 6946 }, { "epoch": 1.13, "learning_rate": 6.79748246068744e-07, "logits/chosen": -0.4819139540195465, "logits/rejected": -0.39453163743019104, "logps/chosen": -56.934165954589844, "logps/rejected": -59.30065155029297, "loss": 0.2394, "rewards/accuracies": 1.0, "rewards/chosen": 2.0101494789123535, "rewards/margins": 0.5177979469299316, "rewards/rejected": 1.4923515319824219, "step": 6947 }, { "epoch": 1.13, "learning_rate": 6.796256007212322e-07, "logits/chosen": -0.7625476121902466, "logits/rejected": -0.6338925957679749, "logps/chosen": -52.36798858642578, "logps/rejected": -66.53766632080078, "loss": 0.13, "rewards/accuracies": 1.0, "rewards/chosen": 3.0288047790527344, "rewards/margins": 1.2856780290603638, "rewards/rejected": 1.7431267499923706, "step": 6948 }, { "epoch": 1.13, "learning_rate": 6.795029429632912e-07, "logits/chosen": -0.7924266457557678, "logits/rejected": -0.8191348910331726, "logps/chosen": -154.316650390625, "logps/rejected": -48.12139129638672, "loss": 0.1818, "rewards/accuracies": 1.0, "rewards/chosen": 3.690084934234619, "rewards/margins": 0.847541093826294, "rewards/rejected": 2.842543840408325, "step": 6949 }, { "epoch": 1.13, "learning_rate": 6.793802728033951e-07, "logits/chosen": -0.2121305614709854, "logits/rejected": -0.2121305614709854, "logps/chosen": -82.9171142578125, "logps/rejected": -82.9171142578125, "loss": 2.3117, "rewards/accuracies": 0.0, "rewards/chosen": 0.3129837214946747, "rewards/margins": 0.0, "rewards/rejected": 0.3129837214946747, "step": 6950 }, { "epoch": 1.13, "learning_rate": 6.792575902500196e-07, "logits/chosen": -0.5449433326721191, "logits/rejected": -0.4959236681461334, "logps/chosen": -52.64521789550781, "logps/rejected": -52.13304138183594, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.752637505531311, "rewards/margins": 0.22027552127838135, "rewards/rejected": 0.5323619842529297, "step": 6951 }, { "epoch": 1.13, "learning_rate": 6.791348953116407e-07, "logits/chosen": -0.9697802066802979, "logits/rejected": -0.9334235191345215, "logps/chosen": -34.22966003417969, "logps/rejected": -13.552132606506348, "loss": 0.5385, "rewards/accuracies": 0.0, "rewards/chosen": 0.5157211422920227, "rewards/margins": -0.31834250688552856, "rewards/rejected": 0.8340636491775513, "step": 6952 }, { "epoch": 1.13, "learning_rate": 6.790121879967357e-07, "logits/chosen": -0.6132892966270447, "logits/rejected": -0.6204110383987427, "logps/chosen": -97.77667999267578, "logps/rejected": -164.72412109375, "loss": 0.3166, "rewards/accuracies": 1.0, "rewards/chosen": 1.528285264968872, "rewards/margins": 0.28414154052734375, "rewards/rejected": 1.2441437244415283, "step": 6953 }, { "epoch": 1.13, "learning_rate": 6.788894683137822e-07, "logits/chosen": -0.978448748588562, "logits/rejected": -0.9164511561393738, "logps/chosen": -152.49696350097656, "logps/rejected": -101.36966705322266, "loss": 0.85, "rewards/accuracies": 0.0, "rewards/chosen": 3.4668776988983154, "rewards/margins": -0.5511453151702881, "rewards/rejected": 4.0180230140686035, "step": 6954 }, { "epoch": 1.13, "learning_rate": 6.78766736271259e-07, "logits/chosen": -0.6892455816268921, "logits/rejected": -0.637077808380127, "logps/chosen": -76.16007232666016, "logps/rejected": -58.986549377441406, "loss": 0.4548, "rewards/accuracies": 0.0, "rewards/chosen": 1.003930687904358, "rewards/margins": -0.16219258308410645, "rewards/rejected": 1.1661232709884644, "step": 6955 }, { "epoch": 1.13, "learning_rate": 6.78643991877646e-07, "logits/chosen": -0.9424141049385071, "logits/rejected": -0.8886040449142456, "logps/chosen": -238.90451049804688, "logps/rejected": -57.726463317871094, "loss": 0.3175, "rewards/accuracies": 1.0, "rewards/chosen": 3.0763702392578125, "rewards/margins": 0.9015707969665527, "rewards/rejected": 2.1747994422912598, "step": 6956 }, { "epoch": 1.13, "learning_rate": 6.785212351414234e-07, "logits/chosen": -0.5855615139007568, "logits/rejected": -0.5659732818603516, "logps/chosen": -106.94285583496094, "logps/rejected": -83.60497283935547, "loss": 0.9808, "rewards/accuracies": 1.0, "rewards/chosen": 0.9403266906738281, "rewards/margins": 0.7971786260604858, "rewards/rejected": 0.1431480497121811, "step": 6957 }, { "epoch": 1.13, "learning_rate": 6.783984660710726e-07, "logits/chosen": -0.6029516458511353, "logits/rejected": -0.6045106053352356, "logps/chosen": -192.5111083984375, "logps/rejected": -76.00756072998047, "loss": 0.3155, "rewards/accuracies": 1.0, "rewards/chosen": 1.6168396472930908, "rewards/margins": 0.3903557062149048, "rewards/rejected": 1.226483941078186, "step": 6958 }, { "epoch": 1.13, "learning_rate": 6.78275684675076e-07, "logits/chosen": -0.38488999009132385, "logits/rejected": -0.36848127841949463, "logps/chosen": -48.86819076538086, "logps/rejected": -19.720481872558594, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 0.36187249422073364, "rewards/margins": 0.153788760304451, "rewards/rejected": 0.20808373391628265, "step": 6959 }, { "epoch": 1.13, "learning_rate": 6.781528909619163e-07, "logits/chosen": -1.0553901195526123, "logits/rejected": -1.123328685760498, "logps/chosen": -95.52861785888672, "logps/rejected": -140.47885131835938, "loss": 1.9924, "rewards/accuracies": 0.0, "rewards/chosen": 1.4815216064453125, "rewards/margins": -2.7699599266052246, "rewards/rejected": 4.251481533050537, "step": 6960 }, { "epoch": 1.13, "learning_rate": 6.780300849400776e-07, "logits/chosen": -0.5811465382575989, "logits/rejected": -0.5810205340385437, "logps/chosen": -110.56748962402344, "logps/rejected": -52.29303741455078, "loss": 1.2107, "rewards/accuracies": 1.0, "rewards/chosen": 1.6402084827423096, "rewards/margins": 0.6014915704727173, "rewards/rejected": 1.0387169122695923, "step": 6961 }, { "epoch": 1.13, "learning_rate": 6.779072666180446e-07, "logits/chosen": -0.4516538679599762, "logits/rejected": -0.4819064438343048, "logps/chosen": -148.33251953125, "logps/rejected": -46.480926513671875, "loss": 0.1567, "rewards/accuracies": 1.0, "rewards/chosen": 3.2691071033477783, "rewards/margins": 1.12451171875, "rewards/rejected": 2.1445953845977783, "step": 6962 }, { "epoch": 1.13, "learning_rate": 6.777844360043027e-07, "logits/chosen": -0.46862420439720154, "logits/rejected": -0.47554898262023926, "logps/chosen": -51.163177490234375, "logps/rejected": -55.244598388671875, "loss": 0.5713, "rewards/accuracies": 1.0, "rewards/chosen": 1.1308761835098267, "rewards/margins": 0.22435075044631958, "rewards/rejected": 0.9065254330635071, "step": 6963 }, { "epoch": 1.13, "learning_rate": 6.776615931073387e-07, "logits/chosen": -0.792486310005188, "logits/rejected": -0.8543378114700317, "logps/chosen": -155.43612670898438, "logps/rejected": -146.87855529785156, "loss": 0.8054, "rewards/accuracies": 0.0, "rewards/chosen": 4.086883544921875, "rewards/margins": -1.373295783996582, "rewards/rejected": 5.460179328918457, "step": 6964 }, { "epoch": 1.13, "learning_rate": 6.775387379356395e-07, "logits/chosen": -0.7664676308631897, "logits/rejected": -0.7626376152038574, "logps/chosen": -123.62623596191406, "logps/rejected": -120.63655090332031, "loss": 1.5383, "rewards/accuracies": 0.0, "rewards/chosen": 0.5134353637695312, "rewards/margins": -0.4847305417060852, "rewards/rejected": 0.9981659054756165, "step": 6965 }, { "epoch": 1.13, "learning_rate": 6.774158704976933e-07, "logits/chosen": -0.6821593046188354, "logits/rejected": -0.6867033243179321, "logps/chosen": -72.50419616699219, "logps/rejected": -87.5169677734375, "loss": 1.3177, "rewards/accuracies": 0.0, "rewards/chosen": 1.4900726079940796, "rewards/margins": -1.6160401105880737, "rewards/rejected": 3.1061127185821533, "step": 6966 }, { "epoch": 1.13, "learning_rate": 6.772929908019894e-07, "logits/chosen": -0.5615083575248718, "logits/rejected": -0.5684024691581726, "logps/chosen": -71.59497833251953, "logps/rejected": -90.51959228515625, "loss": 0.6494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9431343078613281, "rewards/margins": 0.53997802734375, "rewards/rejected": 0.4031562805175781, "step": 6967 }, { "epoch": 1.13, "learning_rate": 6.771700988570172e-07, "logits/chosen": -0.533821702003479, "logits/rejected": -0.4438968300819397, "logps/chosen": -109.255615234375, "logps/rejected": -162.72866821289062, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 5.725555419921875, "rewards/margins": -0.04910612106323242, "rewards/rejected": 5.774661540985107, "step": 6968 }, { "epoch": 1.13, "learning_rate": 6.770471946712678e-07, "logits/chosen": -0.24495075643062592, "logits/rejected": -0.2957508862018585, "logps/chosen": -48.032989501953125, "logps/rejected": -67.00403594970703, "loss": 0.5391, "rewards/accuracies": 0.0, "rewards/chosen": 1.3508293628692627, "rewards/margins": -0.6358054876327515, "rewards/rejected": 1.9866348505020142, "step": 6969 }, { "epoch": 1.13, "learning_rate": 6.769242782532323e-07, "logits/chosen": -0.48900479078292847, "logits/rejected": -0.48900479078292847, "logps/chosen": -91.35643005371094, "logps/rejected": -91.35643005371094, "loss": 0.4112, "rewards/accuracies": 0.0, "rewards/chosen": 0.9688888788223267, "rewards/margins": 0.0, "rewards/rejected": 0.9688888788223267, "step": 6970 }, { "epoch": 1.13, "learning_rate": 6.768013496114034e-07, "logits/chosen": -0.5506722927093506, "logits/rejected": -0.6382287740707397, "logps/chosen": -121.42816162109375, "logps/rejected": -220.02206420898438, "loss": 0.9515, "rewards/accuracies": 0.0, "rewards/chosen": 3.7381744384765625, "rewards/margins": -1.1115083694458008, "rewards/rejected": 4.849682807922363, "step": 6971 }, { "epoch": 1.13, "learning_rate": 6.766784087542741e-07, "logits/chosen": -0.31772080063819885, "logits/rejected": -0.3519432544708252, "logps/chosen": -57.06340026855469, "logps/rejected": -139.8013458251953, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": 0.756274402141571, "rewards/margins": 0.9654403924942017, "rewards/rejected": -0.20916596055030823, "step": 6972 }, { "epoch": 1.13, "learning_rate": 6.765554556903383e-07, "logits/chosen": -0.5001456141471863, "logits/rejected": -0.5001456141471863, "logps/chosen": -0.9252239465713501, "logps/rejected": -0.9252239465713501, "loss": 0.5579, "rewards/accuracies": 0.0, "rewards/chosen": 0.44881850481033325, "rewards/margins": 0.0, "rewards/rejected": 0.44881850481033325, "step": 6973 }, { "epoch": 1.13, "learning_rate": 6.764324904280914e-07, "logits/chosen": -0.6598785519599915, "logits/rejected": -0.5495944023132324, "logps/chosen": -118.10384368896484, "logps/rejected": -109.89708709716797, "loss": 1.206, "rewards/accuracies": 0.0, "rewards/chosen": 5.5201945304870605, "rewards/margins": -0.840667724609375, "rewards/rejected": 6.3608622550964355, "step": 6974 }, { "epoch": 1.13, "learning_rate": 6.763095129760286e-07, "logits/chosen": -0.4541189670562744, "logits/rejected": -0.26761823892593384, "logps/chosen": -38.87370681762695, "logps/rejected": -23.65108871459961, "loss": 0.0657, "rewards/accuracies": 1.0, "rewards/chosen": 2.2217063903808594, "rewards/margins": 2.2746424674987793, "rewards/rejected": -0.05293617397546768, "step": 6975 }, { "epoch": 1.13, "learning_rate": 6.76186523342647e-07, "logits/chosen": -1.3110271692276, "logits/rejected": -1.2386523485183716, "logps/chosen": -90.00855255126953, "logps/rejected": -20.480411529541016, "loss": 0.1873, "rewards/accuracies": 1.0, "rewards/chosen": 6.227608680725098, "rewards/margins": 5.79963493347168, "rewards/rejected": 0.4279739558696747, "step": 6976 }, { "epoch": 1.13, "learning_rate": 6.760635215364434e-07, "logits/chosen": -0.7756971716880798, "logits/rejected": -0.7642946839332581, "logps/chosen": -63.363983154296875, "logps/rejected": -69.70645141601562, "loss": 1.1778, "rewards/accuracies": 0.0, "rewards/chosen": 1.8858940601348877, "rewards/margins": -0.3865668773651123, "rewards/rejected": 2.2724609375, "step": 6977 }, { "epoch": 1.13, "learning_rate": 6.759405075659165e-07, "logits/chosen": -0.7563965320587158, "logits/rejected": -0.7340575456619263, "logps/chosen": -70.06845092773438, "logps/rejected": -59.98480224609375, "loss": 0.5279, "rewards/accuracies": 1.0, "rewards/chosen": 1.9576278924942017, "rewards/margins": 0.4364159107208252, "rewards/rejected": 1.5212119817733765, "step": 6978 }, { "epoch": 1.13, "learning_rate": 6.758174814395653e-07, "logits/chosen": -0.511900782585144, "logits/rejected": -0.5130701661109924, "logps/chosen": -103.40696716308594, "logps/rejected": -69.22064208984375, "loss": 0.2177, "rewards/accuracies": 1.0, "rewards/chosen": 4.036567687988281, "rewards/margins": 1.0095946788787842, "rewards/rejected": 3.026973009109497, "step": 6979 }, { "epoch": 1.13, "learning_rate": 6.756944431658897e-07, "logits/chosen": -0.8410183191299438, "logits/rejected": -0.8532549142837524, "logps/chosen": -92.84771728515625, "logps/rejected": -133.52664184570312, "loss": 0.4884, "rewards/accuracies": 0.0, "rewards/chosen": 1.4968750476837158, "rewards/margins": -0.38469386100769043, "rewards/rejected": 1.8815689086914062, "step": 6980 }, { "epoch": 1.13, "learning_rate": 6.755713927533906e-07, "logits/chosen": -0.6830055713653564, "logits/rejected": -0.6247265934944153, "logps/chosen": -101.03431701660156, "logps/rejected": -76.88096618652344, "loss": 1.5125, "rewards/accuracies": 0.0, "rewards/chosen": 2.5432708263397217, "rewards/margins": -0.7517304420471191, "rewards/rejected": 3.295001268386841, "step": 6981 }, { "epoch": 1.13, "learning_rate": 6.754483302105695e-07, "logits/chosen": -0.5632721185684204, "logits/rejected": -0.546311616897583, "logps/chosen": -59.83393096923828, "logps/rejected": -58.79545593261719, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 2.5965301990509033, "rewards/margins": 1.4391907453536987, "rewards/rejected": 1.1573394536972046, "step": 6982 }, { "epoch": 1.13, "learning_rate": 6.753252555459289e-07, "logits/chosen": -0.42606696486473083, "logits/rejected": -0.43997088074684143, "logps/chosen": -6.575772285461426, "logps/rejected": -2.7781741619110107, "loss": 2.6672, "rewards/accuracies": 0.0, "rewards/chosen": 0.2501208484172821, "rewards/margins": -0.38426730036735535, "rewards/rejected": 0.6343881487846375, "step": 6983 }, { "epoch": 1.13, "learning_rate": 6.752021687679721e-07, "logits/chosen": -0.3037899136543274, "logits/rejected": -0.3037899136543274, "logps/chosen": -18.032920837402344, "logps/rejected": -18.032920837402344, "loss": 0.3876, "rewards/accuracies": 0.0, "rewards/chosen": 0.17304649949073792, "rewards/margins": 0.0, "rewards/rejected": 0.17304649949073792, "step": 6984 }, { "epoch": 1.13, "learning_rate": 6.750790698852031e-07, "logits/chosen": -0.7149339914321899, "logits/rejected": -0.7344185709953308, "logps/chosen": -118.81536865234375, "logps/rejected": -29.697532653808594, "loss": 0.9259, "rewards/accuracies": 0.0, "rewards/chosen": 0.889984130859375, "rewards/margins": -0.3228870630264282, "rewards/rejected": 1.2128711938858032, "step": 6985 }, { "epoch": 1.13, "learning_rate": 6.749559589061273e-07, "logits/chosen": -0.5888216495513916, "logits/rejected": -0.47421446442604065, "logps/chosen": -121.19989013671875, "logps/rejected": -16.482807159423828, "loss": 0.3875, "rewards/accuracies": 0.0, "rewards/chosen": 0.7100616693496704, "rewards/margins": -0.058249831199645996, "rewards/rejected": 0.7683115005493164, "step": 6986 }, { "epoch": 1.13, "learning_rate": 6.748328358392499e-07, "logits/chosen": -0.2789745628833771, "logits/rejected": -0.5078559517860413, "logps/chosen": -97.92578887939453, "logps/rejected": -82.85982513427734, "loss": 0.9778, "rewards/accuracies": 0.0, "rewards/chosen": 0.8460044860839844, "rewards/margins": -1.799781084060669, "rewards/rejected": 2.6457855701446533, "step": 6987 }, { "epoch": 1.13, "learning_rate": 6.747097006930778e-07, "logits/chosen": -0.5677583813667297, "logits/rejected": -0.6273746490478516, "logps/chosen": -72.17503356933594, "logps/rejected": -107.85403442382812, "loss": 2.2285, "rewards/accuracies": 0.0, "rewards/chosen": 1.513752818107605, "rewards/margins": -2.9653472900390625, "rewards/rejected": 4.479100227355957, "step": 6988 }, { "epoch": 1.13, "learning_rate": 6.745865534761187e-07, "logits/chosen": -0.1705782115459442, "logits/rejected": -0.17386554181575775, "logps/chosen": -11.749693870544434, "logps/rejected": -11.074373245239258, "loss": 1.1182, "rewards/accuracies": 0.0, "rewards/chosen": -0.20782414078712463, "rewards/margins": -0.14371366798877716, "rewards/rejected": -0.06411047279834747, "step": 6989 }, { "epoch": 1.13, "learning_rate": 6.744633941968805e-07, "logits/chosen": -0.7941108345985413, "logits/rejected": -0.5980517864227295, "logps/chosen": -156.31088256835938, "logps/rejected": -55.952491760253906, "loss": 0.0648, "rewards/accuracies": 1.0, "rewards/chosen": 4.502252101898193, "rewards/margins": 2.6716299057006836, "rewards/rejected": 1.8306220769882202, "step": 6990 }, { "epoch": 1.13, "learning_rate": 6.743402228638727e-07, "logits/chosen": -0.3747030198574066, "logits/rejected": -0.3747030198574066, "logps/chosen": -25.528120040893555, "logps/rejected": -25.528120040893555, "loss": 0.6742, "rewards/accuracies": 0.0, "rewards/chosen": 0.47478047013282776, "rewards/margins": 0.0, "rewards/rejected": 0.47478047013282776, "step": 6991 }, { "epoch": 1.13, "learning_rate": 6.742170394856051e-07, "logits/chosen": -0.39852002263069153, "logits/rejected": -0.35279005765914917, "logps/chosen": -56.06188201904297, "logps/rejected": -21.49138832092285, "loss": 0.4547, "rewards/accuracies": 1.0, "rewards/chosen": 0.4792221188545227, "rewards/margins": 0.18617287278175354, "rewards/rejected": 0.29304924607276917, "step": 6992 }, { "epoch": 1.14, "learning_rate": 6.740938440705884e-07, "logits/chosen": -0.7521851062774658, "logits/rejected": -0.7141246795654297, "logps/chosen": -77.89535522460938, "logps/rejected": -96.31464385986328, "loss": 1.4139, "rewards/accuracies": 1.0, "rewards/chosen": 3.8251564502716064, "rewards/margins": 1.233156681060791, "rewards/rejected": 2.5919997692108154, "step": 6993 }, { "epoch": 1.14, "learning_rate": 6.739706366273344e-07, "logits/chosen": -0.5115477442741394, "logits/rejected": -0.5323749780654907, "logps/chosen": -7.212643146514893, "logps/rejected": -23.742666244506836, "loss": 0.7635, "rewards/accuracies": 0.0, "rewards/chosen": 0.39602428674697876, "rewards/margins": -0.49384409189224243, "rewards/rejected": 0.8898683786392212, "step": 6994 }, { "epoch": 1.14, "learning_rate": 6.738474171643557e-07, "logits/chosen": -0.835360050201416, "logits/rejected": -0.7129079699516296, "logps/chosen": -91.06336975097656, "logps/rejected": -63.50091552734375, "loss": 0.2242, "rewards/accuracies": 1.0, "rewards/chosen": 3.3658447265625, "rewards/margins": 1.0273468494415283, "rewards/rejected": 2.3384978771209717, "step": 6995 }, { "epoch": 1.14, "learning_rate": 6.737241856901652e-07, "logits/chosen": -0.8253377079963684, "logits/rejected": -0.7263950109481812, "logps/chosen": -92.12360382080078, "logps/rejected": -78.75469970703125, "loss": 0.922, "rewards/accuracies": 1.0, "rewards/chosen": 1.489824652671814, "rewards/margins": 0.12144851684570312, "rewards/rejected": 1.3683761358261108, "step": 6996 }, { "epoch": 1.14, "learning_rate": 6.736009422132774e-07, "logits/chosen": -0.7505432367324829, "logits/rejected": -0.46643760800361633, "logps/chosen": -136.4573516845703, "logps/rejected": -42.285308837890625, "loss": 1.2742, "rewards/accuracies": 1.0, "rewards/chosen": 3.962010145187378, "rewards/margins": 3.6114165782928467, "rewards/rejected": 0.35059356689453125, "step": 6997 }, { "epoch": 1.14, "learning_rate": 6.734776867422072e-07, "logits/chosen": -0.6715923547744751, "logits/rejected": -0.6118834614753723, "logps/chosen": -75.75787353515625, "logps/rejected": -68.2388916015625, "loss": 1.0801, "rewards/accuracies": 0.0, "rewards/chosen": 1.0275650024414062, "rewards/margins": -0.5669387578964233, "rewards/rejected": 1.5945037603378296, "step": 6998 }, { "epoch": 1.14, "learning_rate": 6.733544192854702e-07, "logits/chosen": -0.4286017119884491, "logits/rejected": -0.46362927556037903, "logps/chosen": -4.739281177520752, "logps/rejected": -97.39474487304688, "loss": 0.2832, "rewards/accuracies": 1.0, "rewards/chosen": 0.42849498987197876, "rewards/margins": 0.7675933837890625, "rewards/rejected": -0.33909836411476135, "step": 6999 }, { "epoch": 1.14, "learning_rate": 6.73231139851583e-07, "logits/chosen": -0.6208581924438477, "logits/rejected": -0.5637329816818237, "logps/chosen": -42.117496490478516, "logps/rejected": -47.53193283081055, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.7844902276992798, "rewards/margins": 1.2644554376602173, "rewards/rejected": 0.5200347900390625, "step": 7000 }, { "epoch": 1.14, "learning_rate": 6.731078484490635e-07, "logits/chosen": -0.8520305156707764, "logits/rejected": -0.8895648717880249, "logps/chosen": -123.69556427001953, "logps/rejected": -126.0223159790039, "loss": 1.071, "rewards/accuracies": 0.0, "rewards/chosen": 1.8900368213653564, "rewards/margins": -1.9449234008789062, "rewards/rejected": 3.8349602222442627, "step": 7001 }, { "epoch": 1.14, "learning_rate": 6.729845450864293e-07, "logits/chosen": -0.316829651594162, "logits/rejected": -0.3195553421974182, "logps/chosen": -4.736662864685059, "logps/rejected": -17.388397216796875, "loss": 0.8628, "rewards/accuracies": 1.0, "rewards/chosen": 0.6289598345756531, "rewards/margins": 0.2547399401664734, "rewards/rejected": 0.3742198944091797, "step": 7002 }, { "epoch": 1.14, "learning_rate": 6.728612297722e-07, "logits/chosen": -0.8322758674621582, "logits/rejected": -0.7389039397239685, "logps/chosen": -68.7539291381836, "logps/rejected": -54.638214111328125, "loss": 0.5478, "rewards/accuracies": 0.0, "rewards/chosen": 2.261206865310669, "rewards/margins": -0.6143431663513184, "rewards/rejected": 2.8755500316619873, "step": 7003 }, { "epoch": 1.14, "learning_rate": 6.727379025148953e-07, "logits/chosen": -0.713816225528717, "logits/rejected": -0.6776599287986755, "logps/chosen": -44.016685485839844, "logps/rejected": -24.168930053710938, "loss": 0.9442, "rewards/accuracies": 0.0, "rewards/chosen": 0.3328239619731903, "rewards/margins": -0.4989330470561981, "rewards/rejected": 0.8317570090293884, "step": 7004 }, { "epoch": 1.14, "learning_rate": 6.72614563323036e-07, "logits/chosen": -0.4459107220172882, "logits/rejected": -0.4333084523677826, "logps/chosen": -91.87779235839844, "logps/rejected": -104.03431701660156, "loss": 1.0449, "rewards/accuracies": 0.0, "rewards/chosen": 0.7322677969932556, "rewards/margins": -0.029490649700164795, "rewards/rejected": 0.7617584466934204, "step": 7005 }, { "epoch": 1.14, "learning_rate": 6.724912122051439e-07, "logits/chosen": -0.6945973038673401, "logits/rejected": -0.6575621962547302, "logps/chosen": -114.64163208007812, "logps/rejected": -103.85586547851562, "loss": 0.3877, "rewards/accuracies": 0.0, "rewards/chosen": 4.581418037414551, "rewards/margins": -0.0035309791564941406, "rewards/rejected": 4.584949016571045, "step": 7006 }, { "epoch": 1.14, "learning_rate": 6.723678491697409e-07, "logits/chosen": -0.6479141712188721, "logits/rejected": -0.6790425777435303, "logps/chosen": -85.041015625, "logps/rejected": -169.197998046875, "loss": 0.2662, "rewards/accuracies": 1.0, "rewards/chosen": 1.265924096107483, "rewards/margins": 0.44790804386138916, "rewards/rejected": 0.8180160522460938, "step": 7007 }, { "epoch": 1.14, "learning_rate": 6.722444742253504e-07, "logits/chosen": -0.5260244607925415, "logits/rejected": -0.5027210712432861, "logps/chosen": -60.68132019042969, "logps/rejected": -82.75882720947266, "loss": 0.4161, "rewards/accuracies": 1.0, "rewards/chosen": 0.616345226764679, "rewards/margins": 0.2822403013706207, "rewards/rejected": 0.3341049253940582, "step": 7008 }, { "epoch": 1.14, "learning_rate": 6.721210873804967e-07, "logits/chosen": -0.1909525841474533, "logits/rejected": -0.1946062296628952, "logps/chosen": -2.0006825923919678, "logps/rejected": -31.515230178833008, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.24159100651741028, "rewards/margins": 0.12172158062458038, "rewards/rejected": 0.1198694258928299, "step": 7009 }, { "epoch": 1.14, "learning_rate": 6.719976886437044e-07, "logits/chosen": -0.8642134070396423, "logits/rejected": -0.7880978584289551, "logps/chosen": -55.28812789916992, "logps/rejected": -66.43133544921875, "loss": 0.5657, "rewards/accuracies": 1.0, "rewards/chosen": 4.175338268280029, "rewards/margins": 1.7125208377838135, "rewards/rejected": 2.462817430496216, "step": 7010 }, { "epoch": 1.14, "learning_rate": 6.718742780234994e-07, "logits/chosen": -0.2325492650270462, "logits/rejected": -0.23899328708648682, "logps/chosen": -8.612846374511719, "logps/rejected": -7.1461358070373535, "loss": 0.8067, "rewards/accuracies": 0.0, "rewards/chosen": -0.08220348507165909, "rewards/margins": -0.09881515800952911, "rewards/rejected": 0.016611671075224876, "step": 7011 }, { "epoch": 1.14, "learning_rate": 6.717508555284079e-07, "logits/chosen": -0.9457877278327942, "logits/rejected": -0.8432413339614868, "logps/chosen": -76.01414489746094, "logps/rejected": -63.274452209472656, "loss": 0.9763, "rewards/accuracies": 0.0, "rewards/chosen": 1.2022781372070312, "rewards/margins": -1.734370470046997, "rewards/rejected": 2.9366486072540283, "step": 7012 }, { "epoch": 1.14, "learning_rate": 6.716274211669575e-07, "logits/chosen": -0.5592774748802185, "logits/rejected": -0.5104910135269165, "logps/chosen": -76.47312927246094, "logps/rejected": -47.17778778076172, "loss": 0.6706, "rewards/accuracies": 0.0, "rewards/chosen": 1.490606665611267, "rewards/margins": -0.035607218742370605, "rewards/rejected": 1.5262138843536377, "step": 7013 }, { "epoch": 1.14, "learning_rate": 6.715039749476763e-07, "logits/chosen": -0.6523230075836182, "logits/rejected": -0.6487724184989929, "logps/chosen": -43.182376861572266, "logps/rejected": -60.548789978027344, "loss": 1.4138, "rewards/accuracies": 0.0, "rewards/chosen": 0.17165298759937286, "rewards/margins": -0.14647941291332245, "rewards/rejected": 0.3181324005126953, "step": 7014 }, { "epoch": 1.14, "learning_rate": 6.713805168790931e-07, "logits/chosen": -0.6652107834815979, "logits/rejected": -0.6090154051780701, "logps/chosen": -69.96026611328125, "logps/rejected": -44.334556579589844, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 3.7064850330352783, "rewards/margins": 1.8444877862930298, "rewards/rejected": 1.8619972467422485, "step": 7015 }, { "epoch": 1.14, "learning_rate": 6.712570469697379e-07, "logits/chosen": -0.9772132039070129, "logits/rejected": -0.9392528533935547, "logps/chosen": -91.99512481689453, "logps/rejected": -68.16735076904297, "loss": 0.6406, "rewards/accuracies": 1.0, "rewards/chosen": 0.6556114554405212, "rewards/margins": 0.2226249873638153, "rewards/rejected": 0.43298646807670593, "step": 7016 }, { "epoch": 1.14, "learning_rate": 6.711335652281411e-07, "logits/chosen": -0.5916756391525269, "logits/rejected": -0.5859650373458862, "logps/chosen": -135.3385009765625, "logps/rejected": -76.25480651855469, "loss": 0.1922, "rewards/accuracies": 1.0, "rewards/chosen": 3.673541307449341, "rewards/margins": 1.050445556640625, "rewards/rejected": 2.623095750808716, "step": 7017 }, { "epoch": 1.14, "learning_rate": 6.710100716628344e-07, "logits/chosen": -0.6258440017700195, "logits/rejected": -0.6948743462562561, "logps/chosen": -89.53916931152344, "logps/rejected": -67.52764892578125, "loss": 1.3734, "rewards/accuracies": 0.0, "rewards/chosen": 0.02916870079934597, "rewards/margins": -2.4826500415802, "rewards/rejected": 2.5118186473846436, "step": 7018 }, { "epoch": 1.14, "learning_rate": 6.708865662823497e-07, "logits/chosen": -2.090575695037842, "logits/rejected": -2.0187320709228516, "logps/chosen": -82.2086410522461, "logps/rejected": -75.1466293334961, "loss": 0.6567, "rewards/accuracies": 0.0, "rewards/chosen": 0.9161476492881775, "rewards/margins": -0.6877945065498352, "rewards/rejected": 1.6039421558380127, "step": 7019 }, { "epoch": 1.14, "learning_rate": 6.707630490952203e-07, "logits/chosen": -0.5415120720863342, "logits/rejected": -0.347012996673584, "logps/chosen": -97.61825561523438, "logps/rejected": -14.153300285339355, "loss": 0.2567, "rewards/accuracies": 1.0, "rewards/chosen": 2.2139663696289062, "rewards/margins": 1.414412260055542, "rewards/rejected": 0.799554169178009, "step": 7020 }, { "epoch": 1.14, "learning_rate": 6.706395201099799e-07, "logits/chosen": -0.6408846378326416, "logits/rejected": -0.7746456861495972, "logps/chosen": -131.44161987304688, "logps/rejected": -87.23260498046875, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": 3.5519776344299316, "rewards/margins": 1.6416542530059814, "rewards/rejected": 1.9103233814239502, "step": 7021 }, { "epoch": 1.14, "learning_rate": 6.705159793351633e-07, "logits/chosen": -0.5222921371459961, "logits/rejected": -0.47411683201789856, "logps/chosen": -75.68067932128906, "logps/rejected": -81.959716796875, "loss": 0.6523, "rewards/accuracies": 0.0, "rewards/chosen": 0.9136566519737244, "rewards/margins": -0.27603834867477417, "rewards/rejected": 1.1896950006484985, "step": 7022 }, { "epoch": 1.14, "learning_rate": 6.703924267793061e-07, "logits/chosen": -0.6979766488075256, "logits/rejected": -0.6683833003044128, "logps/chosen": -33.846282958984375, "logps/rejected": -25.413114547729492, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 0.6913498044013977, "rewards/margins": 0.08713626861572266, "rewards/rejected": 0.604213535785675, "step": 7023 }, { "epoch": 1.14, "learning_rate": 6.702688624509442e-07, "logits/chosen": -1.1556575298309326, "logits/rejected": -1.1827671527862549, "logps/chosen": -226.62918090820312, "logps/rejected": -157.48681640625, "loss": 0.2008, "rewards/accuracies": 1.0, "rewards/chosen": 5.3572998046875, "rewards/margins": 0.8395566940307617, "rewards/rejected": 4.517743110656738, "step": 7024 }, { "epoch": 1.14, "learning_rate": 6.701452863586152e-07, "logits/chosen": -0.717808187007904, "logits/rejected": -0.6790157556533813, "logps/chosen": -122.76541137695312, "logps/rejected": -38.056053161621094, "loss": 0.3851, "rewards/accuracies": 1.0, "rewards/chosen": 1.880757212638855, "rewards/margins": 0.7640137672424316, "rewards/rejected": 1.1167434453964233, "step": 7025 }, { "epoch": 1.14, "learning_rate": 6.700216985108568e-07, "logits/chosen": -0.5227144360542297, "logits/rejected": -0.5157222151756287, "logps/chosen": -89.84677124023438, "logps/rejected": -113.29051208496094, "loss": 1.2152, "rewards/accuracies": 0.0, "rewards/chosen": 0.5190933346748352, "rewards/margins": -1.5437378883361816, "rewards/rejected": 2.062831163406372, "step": 7026 }, { "epoch": 1.14, "learning_rate": 6.698980989162077e-07, "logits/chosen": -0.6326500177383423, "logits/rejected": -0.637061595916748, "logps/chosen": -6.892552375793457, "logps/rejected": -3.6979730129241943, "loss": 0.5466, "rewards/accuracies": 0.0, "rewards/chosen": 0.03892488405108452, "rewards/margins": -0.2432934194803238, "rewards/rejected": 0.2822183072566986, "step": 7027 }, { "epoch": 1.14, "learning_rate": 6.697744875832077e-07, "logits/chosen": -0.49543052911758423, "logits/rejected": -0.22179915010929108, "logps/chosen": -46.80675506591797, "logps/rejected": -47.832191467285156, "loss": 0.2727, "rewards/accuracies": 1.0, "rewards/chosen": 2.3060860633850098, "rewards/margins": 1.132351040840149, "rewards/rejected": 1.1737350225448608, "step": 7028 }, { "epoch": 1.14, "learning_rate": 6.69650864520397e-07, "logits/chosen": -0.40085819363594055, "logits/rejected": -0.40085819363594055, "logps/chosen": -79.39498138427734, "logps/rejected": -79.39498138427734, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.2646141052246094, "rewards/margins": 0.0, "rewards/rejected": 2.2646141052246094, "step": 7029 }, { "epoch": 1.14, "learning_rate": 6.695272297363168e-07, "logits/chosen": -0.7758673429489136, "logits/rejected": -0.7117175459861755, "logps/chosen": -39.89518737792969, "logps/rejected": -95.5788803100586, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 1.830116629600525, "rewards/margins": 0.45796775817871094, "rewards/rejected": 1.372148871421814, "step": 7030 }, { "epoch": 1.14, "learning_rate": 6.69403583239509e-07, "logits/chosen": -0.8806565403938293, "logits/rejected": -0.7804229855537415, "logps/chosen": -81.87367248535156, "logps/rejected": -134.02423095703125, "loss": 0.3203, "rewards/accuracies": 1.0, "rewards/chosen": 1.3473869562149048, "rewards/margins": 0.34416890144348145, "rewards/rejected": 1.0032180547714233, "step": 7031 }, { "epoch": 1.14, "learning_rate": 6.692799250385167e-07, "logits/chosen": -0.6469843983650208, "logits/rejected": -0.4871005713939667, "logps/chosen": -63.00714874267578, "logps/rejected": -108.72239685058594, "loss": 1.2129, "rewards/accuracies": 0.0, "rewards/chosen": 0.9490089416503906, "rewards/margins": -2.1778724193573, "rewards/rejected": 3.1268813610076904, "step": 7032 }, { "epoch": 1.14, "learning_rate": 6.691562551418832e-07, "logits/chosen": -0.6245549917221069, "logits/rejected": -1.216554045677185, "logps/chosen": -98.3309097290039, "logps/rejected": -62.914947509765625, "loss": 0.7802, "rewards/accuracies": 1.0, "rewards/chosen": 1.4366577863693237, "rewards/margins": 0.9561753273010254, "rewards/rejected": 0.4804824888706207, "step": 7033 }, { "epoch": 1.14, "learning_rate": 6.690325735581532e-07, "logits/chosen": -0.44582998752593994, "logits/rejected": -0.41307559609413147, "logps/chosen": -79.52845764160156, "logps/rejected": -55.27250671386719, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 1.4458938837051392, "rewards/margins": 0.3549225330352783, "rewards/rejected": 1.0909713506698608, "step": 7034 }, { "epoch": 1.14, "learning_rate": 6.689088802958716e-07, "logits/chosen": -0.7219038605690002, "logits/rejected": -0.5653160214424133, "logps/chosen": -170.90164184570312, "logps/rejected": -190.22869873046875, "loss": 1.2558, "rewards/accuracies": 0.0, "rewards/chosen": 4.262977600097656, "rewards/margins": -1.7523512840270996, "rewards/rejected": 6.015328884124756, "step": 7035 }, { "epoch": 1.14, "learning_rate": 6.687851753635846e-07, "logits/chosen": -0.6335497498512268, "logits/rejected": -0.6665869951248169, "logps/chosen": -51.961387634277344, "logps/rejected": -113.9123764038086, "loss": 1.9658, "rewards/accuracies": 0.0, "rewards/chosen": 1.4888092279434204, "rewards/margins": -3.482776641845703, "rewards/rejected": 4.971585750579834, "step": 7036 }, { "epoch": 1.14, "learning_rate": 6.686614587698391e-07, "logits/chosen": -0.7442460656166077, "logits/rejected": -0.7623191475868225, "logps/chosen": -79.43962860107422, "logps/rejected": -101.11039733886719, "loss": 0.5013, "rewards/accuracies": 0.0, "rewards/chosen": 0.807708740234375, "rewards/margins": -0.3408859968185425, "rewards/rejected": 1.1485947370529175, "step": 7037 }, { "epoch": 1.14, "learning_rate": 6.685377305231826e-07, "logits/chosen": -0.5516578555107117, "logits/rejected": -0.5109786987304688, "logps/chosen": -97.83515930175781, "logps/rejected": -121.76786041259766, "loss": 0.6155, "rewards/accuracies": 1.0, "rewards/chosen": 1.2894172668457031, "rewards/margins": 0.18161773681640625, "rewards/rejected": 1.1077995300292969, "step": 7038 }, { "epoch": 1.14, "learning_rate": 6.684139906321638e-07, "logits/chosen": -0.6032640337944031, "logits/rejected": -0.41071799397468567, "logps/chosen": -126.32440948486328, "logps/rejected": -92.59249877929688, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 4.049588680267334, "rewards/margins": 1.207183599472046, "rewards/rejected": 2.842405080795288, "step": 7039 }, { "epoch": 1.14, "learning_rate": 6.682902391053318e-07, "logits/chosen": -0.07811097055673599, "logits/rejected": -0.08189726620912552, "logps/chosen": -1.6409869194030762, "logps/rejected": -24.588756561279297, "loss": 0.7253, "rewards/accuracies": 1.0, "rewards/chosen": 0.2624414563179016, "rewards/margins": 0.29810887575149536, "rewards/rejected": -0.03566741943359375, "step": 7040 }, { "epoch": 1.14, "learning_rate": 6.681664759512366e-07, "logits/chosen": -0.7047341465950012, "logits/rejected": -0.6331236362457275, "logps/chosen": -110.69005584716797, "logps/rejected": -50.279388427734375, "loss": 0.0403, "rewards/accuracies": 1.0, "rewards/chosen": 4.289277076721191, "rewards/margins": 3.414792776107788, "rewards/rejected": 0.8744842410087585, "step": 7041 }, { "epoch": 1.14, "learning_rate": 6.680427011784291e-07, "logits/chosen": -0.8405051231384277, "logits/rejected": -0.7435153126716614, "logps/chosen": -103.94300079345703, "logps/rejected": -55.5925178527832, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 5.201778411865234, "rewards/margins": 2.5126826763153076, "rewards/rejected": 2.6890957355499268, "step": 7042 }, { "epoch": 1.14, "learning_rate": 6.679189147954609e-07, "logits/chosen": -0.5980018377304077, "logits/rejected": -0.6012099981307983, "logps/chosen": -118.31565856933594, "logps/rejected": -60.00642013549805, "loss": 1.0002, "rewards/accuracies": 1.0, "rewards/chosen": 2.6515395641326904, "rewards/margins": 0.858185887336731, "rewards/rejected": 1.7933536767959595, "step": 7043 }, { "epoch": 1.14, "learning_rate": 6.677951168108846e-07, "logits/chosen": -0.6457034945487976, "logits/rejected": -0.7195901274681091, "logps/chosen": -76.49017333984375, "logps/rejected": -69.4229965209961, "loss": 0.7526, "rewards/accuracies": 0.0, "rewards/chosen": 1.3444321155548096, "rewards/margins": -0.8679986000061035, "rewards/rejected": 2.212430715560913, "step": 7044 }, { "epoch": 1.14, "learning_rate": 6.676713072332535e-07, "logits/chosen": -0.6727781295776367, "logits/rejected": -0.6842570304870605, "logps/chosen": -178.81532287597656, "logps/rejected": -100.37919616699219, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 4.975613594055176, "rewards/margins": -0.19019317626953125, "rewards/rejected": 5.165806770324707, "step": 7045 }, { "epoch": 1.14, "learning_rate": 6.675474860711215e-07, "logits/chosen": -0.38965925574302673, "logits/rejected": -0.3876323699951172, "logps/chosen": -4.491621494293213, "logps/rejected": -9.590608596801758, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 0.19531750679016113, "rewards/margins": 0.40730369091033936, "rewards/rejected": -0.21198616921901703, "step": 7046 }, { "epoch": 1.14, "learning_rate": 6.674236533330436e-07, "logits/chosen": -0.12651152908802032, "logits/rejected": -0.14485083520412445, "logps/chosen": -12.336865425109863, "logps/rejected": -30.24634552001953, "loss": 0.8552, "rewards/accuracies": 0.0, "rewards/chosen": -0.12332706898450851, "rewards/margins": -0.5684625506401062, "rewards/rejected": 0.4451355040073395, "step": 7047 }, { "epoch": 1.14, "learning_rate": 6.672998090275755e-07, "logits/chosen": -0.6959870457649231, "logits/rejected": -0.6834468841552734, "logps/chosen": -82.60504913330078, "logps/rejected": -94.65066528320312, "loss": 1.0333, "rewards/accuracies": 1.0, "rewards/chosen": 2.844341278076172, "rewards/margins": 0.39797353744506836, "rewards/rejected": 2.4463677406311035, "step": 7048 }, { "epoch": 1.14, "learning_rate": 6.671759531632735e-07, "logits/chosen": -0.9657424092292786, "logits/rejected": -0.9168907403945923, "logps/chosen": -32.628807067871094, "logps/rejected": -91.03997802734375, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 1.638007402420044, "rewards/margins": -0.2926185131072998, "rewards/rejected": 1.9306259155273438, "step": 7049 }, { "epoch": 1.14, "learning_rate": 6.670520857486949e-07, "logits/chosen": -0.7635489106178284, "logits/rejected": -0.7113059759140015, "logps/chosen": -107.24580383300781, "logps/rejected": -69.80582427978516, "loss": 0.1708, "rewards/accuracies": 1.0, "rewards/chosen": 4.27041482925415, "rewards/margins": 2.941901445388794, "rewards/rejected": 1.3285133838653564, "step": 7050 }, { "epoch": 1.14, "learning_rate": 6.66928206792398e-07, "logits/chosen": -0.6282379031181335, "logits/rejected": -0.677376925945282, "logps/chosen": -112.43730163574219, "logps/rejected": -127.41822814941406, "loss": 0.8923, "rewards/accuracies": 0.0, "rewards/chosen": 0.7392852902412415, "rewards/margins": -1.352116584777832, "rewards/rejected": 2.0914018154144287, "step": 7051 }, { "epoch": 1.14, "learning_rate": 6.668043163029414e-07, "logits/chosen": -0.7053991556167603, "logits/rejected": -0.5872250199317932, "logps/chosen": -70.02427673339844, "logps/rejected": -41.03284454345703, "loss": 0.4585, "rewards/accuracies": 0.0, "rewards/chosen": 2.298384189605713, "rewards/margins": -0.30832505226135254, "rewards/rejected": 2.6067092418670654, "step": 7052 }, { "epoch": 1.14, "learning_rate": 6.666804142888848e-07, "logits/chosen": -1.3013648986816406, "logits/rejected": -1.2147150039672852, "logps/chosen": -105.50836181640625, "logps/rejected": -66.33770751953125, "loss": 0.9207, "rewards/accuracies": 0.0, "rewards/chosen": 1.1201927661895752, "rewards/margins": -1.5851669311523438, "rewards/rejected": 2.705359697341919, "step": 7053 }, { "epoch": 1.14, "learning_rate": 6.665565007587888e-07, "logits/chosen": -0.9780521392822266, "logits/rejected": -0.8447094559669495, "logps/chosen": -128.07469177246094, "logps/rejected": -20.282922744750977, "loss": 0.0415, "rewards/accuracies": 1.0, "rewards/chosen": 4.232815742492676, "rewards/margins": 3.614201545715332, "rewards/rejected": 0.6186141967773438, "step": 7054 }, { "epoch": 1.15, "learning_rate": 6.664325757212146e-07, "logits/chosen": -0.6816171407699585, "logits/rejected": -0.6786388754844666, "logps/chosen": -61.824867248535156, "logps/rejected": -82.41900634765625, "loss": 0.1303, "rewards/accuracies": 1.0, "rewards/chosen": 1.9100326299667358, "rewards/margins": 1.219294786453247, "rewards/rejected": 0.6907379031181335, "step": 7055 }, { "epoch": 1.15, "learning_rate": 6.663086391847241e-07, "logits/chosen": -0.5947462320327759, "logits/rejected": -0.48850056529045105, "logps/chosen": -196.2720947265625, "logps/rejected": -225.47811889648438, "loss": 2.8027, "rewards/accuracies": 0.0, "rewards/chosen": 3.308392286300659, "rewards/margins": -3.102673292160034, "rewards/rejected": 6.411065578460693, "step": 7056 }, { "epoch": 1.15, "learning_rate": 6.661846911578804e-07, "logits/chosen": -0.5359119176864624, "logits/rejected": -0.5345030426979065, "logps/chosen": -20.83983039855957, "logps/rejected": -26.559873580932617, "loss": 0.8268, "rewards/accuracies": 1.0, "rewards/chosen": 0.4491487443447113, "rewards/margins": 0.22111758589744568, "rewards/rejected": 0.22803115844726562, "step": 7057 }, { "epoch": 1.15, "learning_rate": 6.660607316492471e-07, "logits/chosen": -0.07584058493375778, "logits/rejected": -0.07584058493375778, "logps/chosen": -52.561607360839844, "logps/rejected": -52.561607360839844, "loss": 0.6047, "rewards/accuracies": 0.0, "rewards/chosen": 0.5384544730186462, "rewards/margins": 0.0, "rewards/rejected": 0.5384544730186462, "step": 7058 }, { "epoch": 1.15, "learning_rate": 6.659367606673883e-07, "logits/chosen": -0.8067072033882141, "logits/rejected": -0.7507685422897339, "logps/chosen": -95.20243835449219, "logps/rejected": -60.30701446533203, "loss": 1.1596, "rewards/accuracies": 0.0, "rewards/chosen": 1.7449264526367188, "rewards/margins": -0.5858864784240723, "rewards/rejected": 2.330812931060791, "step": 7059 }, { "epoch": 1.15, "learning_rate": 6.658127782208695e-07, "logits/chosen": -0.8623247742652893, "logits/rejected": -0.9812796711921692, "logps/chosen": -133.78355407714844, "logps/rejected": -42.70261764526367, "loss": 0.1498, "rewards/accuracies": 1.0, "rewards/chosen": 6.591229438781738, "rewards/margins": 4.92606258392334, "rewards/rejected": 1.6651668548583984, "step": 7060 }, { "epoch": 1.15, "learning_rate": 6.656887843182565e-07, "logits/chosen": -0.8585206866264343, "logits/rejected": -0.8267785310745239, "logps/chosen": -52.48646545410156, "logps/rejected": -109.77664184570312, "loss": 1.0749, "rewards/accuracies": 0.0, "rewards/chosen": 1.3689254522323608, "rewards/margins": -0.45273590087890625, "rewards/rejected": 1.821661353111267, "step": 7061 }, { "epoch": 1.15, "learning_rate": 6.655647789681166e-07, "logits/chosen": -0.5088537931442261, "logits/rejected": -0.5214044451713562, "logps/chosen": -71.21505737304688, "logps/rejected": -81.77930450439453, "loss": 1.3207, "rewards/accuracies": 0.0, "rewards/chosen": 2.2006988525390625, "rewards/margins": -2.2275962829589844, "rewards/rejected": 4.428295135498047, "step": 7062 }, { "epoch": 1.15, "learning_rate": 6.654407621790169e-07, "logits/chosen": -0.8661528825759888, "logits/rejected": -0.821389377117157, "logps/chosen": -51.787635803222656, "logps/rejected": -53.95970153808594, "loss": 0.45, "rewards/accuracies": 0.0, "rewards/chosen": 1.9452804327011108, "rewards/margins": -0.34599769115448, "rewards/rejected": 2.291278123855591, "step": 7063 }, { "epoch": 1.15, "learning_rate": 6.653167339595261e-07, "logits/chosen": -0.6928084492683411, "logits/rejected": -0.660462498664856, "logps/chosen": -54.101409912109375, "logps/rejected": -58.117576599121094, "loss": 0.6449, "rewards/accuracies": 0.0, "rewards/chosen": 1.7663086652755737, "rewards/margins": -0.9472290277481079, "rewards/rejected": 2.7135376930236816, "step": 7064 }, { "epoch": 1.15, "learning_rate": 6.651926943182129e-07, "logits/chosen": -0.6416589021682739, "logits/rejected": -0.5869854688644409, "logps/chosen": -53.837188720703125, "logps/rejected": -127.4455337524414, "loss": 0.2088, "rewards/accuracies": 1.0, "rewards/chosen": 2.4991226196289062, "rewards/margins": 1.4057579040527344, "rewards/rejected": 1.0933647155761719, "step": 7065 }, { "epoch": 1.15, "learning_rate": 6.650686432636479e-07, "logits/chosen": -0.46944960951805115, "logits/rejected": -0.5304833054542542, "logps/chosen": -48.20256423950195, "logps/rejected": -81.74333190917969, "loss": 0.8275, "rewards/accuracies": 0.0, "rewards/chosen": 0.9607875943183899, "rewards/margins": -0.6745334267616272, "rewards/rejected": 1.635321021080017, "step": 7066 }, { "epoch": 1.15, "learning_rate": 6.649445808044013e-07, "logits/chosen": -0.65260910987854, "logits/rejected": -0.6775253415107727, "logps/chosen": -94.8458023071289, "logps/rejected": -102.33198547363281, "loss": 0.6573, "rewards/accuracies": 0.0, "rewards/chosen": 0.32903748750686646, "rewards/margins": -0.6483474969863892, "rewards/rejected": 0.9773849844932556, "step": 7067 }, { "epoch": 1.15, "learning_rate": 6.64820506949045e-07, "logits/chosen": -0.6515728831291199, "logits/rejected": -0.6681309938430786, "logps/chosen": -52.92451477050781, "logps/rejected": -73.3872299194336, "loss": 0.4218, "rewards/accuracies": 0.0, "rewards/chosen": 2.133345127105713, "rewards/margins": -0.10387349128723145, "rewards/rejected": 2.2372186183929443, "step": 7068 }, { "epoch": 1.15, "learning_rate": 6.646964217061513e-07, "logits/chosen": -1.3069573640823364, "logits/rejected": -1.2663320302963257, "logps/chosen": -63.036163330078125, "logps/rejected": -70.830322265625, "loss": 0.7638, "rewards/accuracies": 0.0, "rewards/chosen": 2.2538704872131348, "rewards/margins": -1.1033439636230469, "rewards/rejected": 3.3572144508361816, "step": 7069 }, { "epoch": 1.15, "learning_rate": 6.645723250842932e-07, "logits/chosen": -0.9192554354667664, "logits/rejected": -0.7761746048927307, "logps/chosen": -191.01654052734375, "logps/rejected": -192.107177734375, "loss": 0.2038, "rewards/accuracies": 1.0, "rewards/chosen": 7.173590183258057, "rewards/margins": 0.7011260986328125, "rewards/rejected": 6.472464084625244, "step": 7070 }, { "epoch": 1.15, "learning_rate": 6.644482170920445e-07, "logits/chosen": -0.7696558833122253, "logits/rejected": -0.6744630932807922, "logps/chosen": -90.85795593261719, "logps/rejected": -27.948184967041016, "loss": 0.0775, "rewards/accuracies": 1.0, "rewards/chosen": 2.566171407699585, "rewards/margins": 1.925970196723938, "rewards/rejected": 0.640201210975647, "step": 7071 }, { "epoch": 1.15, "learning_rate": 6.6432409773798e-07, "logits/chosen": -0.6631471514701843, "logits/rejected": -0.6980021595954895, "logps/chosen": -52.605987548828125, "logps/rejected": -140.0084228515625, "loss": 1.791, "rewards/accuracies": 0.0, "rewards/chosen": 2.8536651134490967, "rewards/margins": -2.527447462081909, "rewards/rejected": 5.381112575531006, "step": 7072 }, { "epoch": 1.15, "learning_rate": 6.641999670306754e-07, "logits/chosen": -1.0549052953720093, "logits/rejected": -0.9042353630065918, "logps/chosen": -82.8812255859375, "logps/rejected": -120.97077178955078, "loss": 0.0495, "rewards/accuracies": 1.0, "rewards/chosen": 4.973980903625488, "rewards/margins": 2.964933156967163, "rewards/rejected": 2.009047746658325, "step": 7073 }, { "epoch": 1.15, "learning_rate": 6.640758249787066e-07, "logits/chosen": -0.8535983562469482, "logits/rejected": -0.8425643444061279, "logps/chosen": -173.27175903320312, "logps/rejected": -162.13888549804688, "loss": 0.7662, "rewards/accuracies": 0.0, "rewards/chosen": 4.262628078460693, "rewards/margins": -0.2109527587890625, "rewards/rejected": 4.473580837249756, "step": 7074 }, { "epoch": 1.15, "learning_rate": 6.639516715906509e-07, "logits/chosen": -0.6381810307502747, "logits/rejected": -0.637241780757904, "logps/chosen": -131.14939880371094, "logps/rejected": -145.82345581054688, "loss": 0.5675, "rewards/accuracies": 1.0, "rewards/chosen": 1.7156661748886108, "rewards/margins": 0.6790480613708496, "rewards/rejected": 1.0366181135177612, "step": 7075 }, { "epoch": 1.15, "learning_rate": 6.638275068750861e-07, "logits/chosen": -0.8311973214149475, "logits/rejected": -0.8067228198051453, "logps/chosen": -61.817100524902344, "logps/rejected": -86.06758117675781, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 3.4131019115448, "rewards/margins": 0.4766676425933838, "rewards/rejected": 2.936434268951416, "step": 7076 }, { "epoch": 1.15, "learning_rate": 6.637033308405905e-07, "logits/chosen": -0.9208499789237976, "logits/rejected": -0.7298943996429443, "logps/chosen": -209.79336547851562, "logps/rejected": -59.40166091918945, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 1.3260116577148438, "rewards/margins": 1.1698505878448486, "rewards/rejected": 0.1561611145734787, "step": 7077 }, { "epoch": 1.15, "learning_rate": 6.635791434957439e-07, "logits/chosen": -0.5961245894432068, "logits/rejected": -0.7302224040031433, "logps/chosen": -241.58795166015625, "logps/rejected": -88.08894348144531, "loss": 0.1121, "rewards/accuracies": 1.0, "rewards/chosen": 4.877612590789795, "rewards/margins": 3.184020519256592, "rewards/rejected": 1.6935920715332031, "step": 7078 }, { "epoch": 1.15, "learning_rate": 6.63454944849126e-07, "logits/chosen": -0.4470568001270294, "logits/rejected": -0.4229622781276703, "logps/chosen": -50.55632019042969, "logps/rejected": -42.87404251098633, "loss": 0.5029, "rewards/accuracies": 0.0, "rewards/chosen": 2.554821729660034, "rewards/margins": -0.5282268524169922, "rewards/rejected": 3.0830485820770264, "step": 7079 }, { "epoch": 1.15, "learning_rate": 6.633307349093182e-07, "logits/chosen": -0.3304893374443054, "logits/rejected": -0.3304893374443054, "logps/chosen": -86.91584777832031, "logps/rejected": -86.91584777832031, "loss": 0.3491, "rewards/accuracies": 0.0, "rewards/chosen": 1.7751129865646362, "rewards/margins": 0.0, "rewards/rejected": 1.7751129865646362, "step": 7080 }, { "epoch": 1.15, "learning_rate": 6.632065136849022e-07, "logits/chosen": -0.7899444699287415, "logits/rejected": -0.7687106132507324, "logps/chosen": -111.6729507446289, "logps/rejected": -53.24568557739258, "loss": 1.1598, "rewards/accuracies": 0.0, "rewards/chosen": 1.1137580871582031, "rewards/margins": -2.1540944576263428, "rewards/rejected": 3.267852544784546, "step": 7081 }, { "epoch": 1.15, "learning_rate": 6.630822811844603e-07, "logits/chosen": -0.8795604109764099, "logits/rejected": -0.89069664478302, "logps/chosen": -48.31138229370117, "logps/rejected": -64.12178039550781, "loss": 2.638, "rewards/accuracies": 0.0, "rewards/chosen": 1.9274799823760986, "rewards/margins": -0.6264133453369141, "rewards/rejected": 2.5538933277130127, "step": 7082 }, { "epoch": 1.15, "learning_rate": 6.629580374165759e-07, "logits/chosen": -0.3165362477302551, "logits/rejected": -0.2979457676410675, "logps/chosen": -98.61781311035156, "logps/rejected": -87.39347839355469, "loss": 1.1101, "rewards/accuracies": 0.0, "rewards/chosen": 0.5743263363838196, "rewards/margins": -0.38297194242477417, "rewards/rejected": 0.9572982788085938, "step": 7083 }, { "epoch": 1.15, "learning_rate": 6.628337823898329e-07, "logits/chosen": -0.5851921439170837, "logits/rejected": -0.43512392044067383, "logps/chosen": -84.34788513183594, "logps/rejected": -136.38323974609375, "loss": 0.0294, "rewards/accuracies": 1.0, "rewards/chosen": 5.089646816253662, "rewards/margins": 4.58828592300415, "rewards/rejected": 0.501361072063446, "step": 7084 }, { "epoch": 1.15, "learning_rate": 6.627095161128163e-07, "logits/chosen": -0.6801183223724365, "logits/rejected": -0.6428657174110413, "logps/chosen": -56.52688980102539, "logps/rejected": -86.64383697509766, "loss": 1.0433, "rewards/accuracies": 0.0, "rewards/chosen": 1.6169627904891968, "rewards/margins": -0.005215883255004883, "rewards/rejected": 1.6221786737442017, "step": 7085 }, { "epoch": 1.15, "learning_rate": 6.625852385941118e-07, "logits/chosen": -0.5864220261573792, "logits/rejected": -0.5639362931251526, "logps/chosen": -54.52360534667969, "logps/rejected": -119.1881103515625, "loss": 0.7944, "rewards/accuracies": 1.0, "rewards/chosen": 1.7767753601074219, "rewards/margins": 0.5819236040115356, "rewards/rejected": 1.1948517560958862, "step": 7086 }, { "epoch": 1.15, "learning_rate": 6.624609498423057e-07, "logits/chosen": -0.8181968927383423, "logits/rejected": -0.8181968927383423, "logps/chosen": -41.43801498413086, "logps/rejected": -41.43801498413086, "loss": 0.3546, "rewards/accuracies": 0.0, "rewards/chosen": 0.3254531919956207, "rewards/margins": 0.0, "rewards/rejected": 0.3254531919956207, "step": 7087 }, { "epoch": 1.15, "learning_rate": 6.623366498659853e-07, "logits/chosen": -0.5345097184181213, "logits/rejected": -0.5353294014930725, "logps/chosen": -2.098966360092163, "logps/rejected": -16.163005828857422, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 0.2116582691669464, "rewards/margins": 0.0851457417011261, "rewards/rejected": 0.1265125274658203, "step": 7088 }, { "epoch": 1.15, "learning_rate": 6.622123386737381e-07, "logits/chosen": -0.8432528972625732, "logits/rejected": -0.8817334175109863, "logps/chosen": -107.9410400390625, "logps/rejected": -114.14471435546875, "loss": 1.3544, "rewards/accuracies": 0.0, "rewards/chosen": 0.484506219625473, "rewards/margins": -0.7665153741836548, "rewards/rejected": 1.2510216236114502, "step": 7089 }, { "epoch": 1.15, "learning_rate": 6.620880162741534e-07, "logits/chosen": -0.7627508044242859, "logits/rejected": -0.6627397537231445, "logps/chosen": -48.72028350830078, "logps/rejected": -64.45036315917969, "loss": 0.4891, "rewards/accuracies": 0.0, "rewards/chosen": 1.7868636846542358, "rewards/margins": -0.4098557233810425, "rewards/rejected": 2.1967194080352783, "step": 7090 }, { "epoch": 1.15, "learning_rate": 6.619636826758203e-07, "logits/chosen": -0.3577635586261749, "logits/rejected": -0.2665215730667114, "logps/chosen": -62.65732192993164, "logps/rejected": -19.710346221923828, "loss": 0.5606, "rewards/accuracies": 1.0, "rewards/chosen": 0.46134454011917114, "rewards/margins": 0.11864718794822693, "rewards/rejected": 0.3426973521709442, "step": 7091 }, { "epoch": 1.15, "learning_rate": 6.618393378873295e-07, "logits/chosen": -0.8861193656921387, "logits/rejected": -0.8862183094024658, "logps/chosen": -98.11756896972656, "logps/rejected": -40.42517852783203, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 3.150202989578247, "rewards/margins": 1.1031043529510498, "rewards/rejected": 2.0470986366271973, "step": 7092 }, { "epoch": 1.15, "learning_rate": 6.617149819172719e-07, "logits/chosen": -0.49403101205825806, "logits/rejected": -0.5045379996299744, "logps/chosen": -37.93718719482422, "logps/rejected": -45.03196716308594, "loss": 0.5158, "rewards/accuracies": 0.0, "rewards/chosen": 1.06401526927948, "rewards/margins": -0.36802399158477783, "rewards/rejected": 1.4320392608642578, "step": 7093 }, { "epoch": 1.15, "learning_rate": 6.615906147742388e-07, "logits/chosen": -0.5421148538589478, "logits/rejected": -0.5821502208709717, "logps/chosen": -77.05137634277344, "logps/rejected": -80.14691162109375, "loss": 0.4274, "rewards/accuracies": 1.0, "rewards/chosen": 1.4144455194473267, "rewards/margins": 0.21210408210754395, "rewards/rejected": 1.2023414373397827, "step": 7094 }, { "epoch": 1.15, "learning_rate": 6.614662364668234e-07, "logits/chosen": -1.0131850242614746, "logits/rejected": -1.0494340658187866, "logps/chosen": -90.16468811035156, "logps/rejected": -129.8064422607422, "loss": 2.8223, "rewards/accuracies": 0.0, "rewards/chosen": 1.0272339582443237, "rewards/margins": -4.557486057281494, "rewards/rejected": 5.584720134735107, "step": 7095 }, { "epoch": 1.15, "learning_rate": 6.613418470036188e-07, "logits/chosen": -0.897972583770752, "logits/rejected": -0.8916864395141602, "logps/chosen": -53.40182876586914, "logps/rejected": -64.3372573852539, "loss": 0.577, "rewards/accuracies": 1.0, "rewards/chosen": 2.2940762042999268, "rewards/margins": 1.0984607934951782, "rewards/rejected": 1.1956154108047485, "step": 7096 }, { "epoch": 1.15, "learning_rate": 6.612174463932193e-07, "logits/chosen": -0.8578560948371887, "logits/rejected": -0.7228778600692749, "logps/chosen": -215.15859985351562, "logps/rejected": -183.33706665039062, "loss": 1.5094, "rewards/accuracies": 0.0, "rewards/chosen": 4.260776042938232, "rewards/margins": -2.9423446655273438, "rewards/rejected": 7.203120708465576, "step": 7097 }, { "epoch": 1.15, "learning_rate": 6.610930346442197e-07, "logits/chosen": -0.48501262068748474, "logits/rejected": -0.44118547439575195, "logps/chosen": -101.11097717285156, "logps/rejected": -132.877685546875, "loss": 2.3993, "rewards/accuracies": 1.0, "rewards/chosen": 0.9527488946914673, "rewards/margins": 0.12256395816802979, "rewards/rejected": 0.8301849365234375, "step": 7098 }, { "epoch": 1.15, "learning_rate": 6.609686117652157e-07, "logits/chosen": -0.5020679235458374, "logits/rejected": -0.5020679235458374, "logps/chosen": -37.02854919433594, "logps/rejected": -37.02854919433594, "loss": 0.3805, "rewards/accuracies": 0.0, "rewards/chosen": 1.4615364074707031, "rewards/margins": 0.0, "rewards/rejected": 1.4615364074707031, "step": 7099 }, { "epoch": 1.15, "learning_rate": 6.608441777648036e-07, "logits/chosen": -0.7251610159873962, "logits/rejected": -0.6289207935333252, "logps/chosen": -113.55210876464844, "logps/rejected": -71.8168716430664, "loss": 0.7623, "rewards/accuracies": 1.0, "rewards/chosen": 1.457067847251892, "rewards/margins": 1.085660457611084, "rewards/rejected": 0.37140733003616333, "step": 7100 }, { "epoch": 1.15, "learning_rate": 6.607197326515807e-07, "logits/chosen": -0.7966191172599792, "logits/rejected": -0.7109410762786865, "logps/chosen": -85.67599487304688, "logps/rejected": -36.670005798339844, "loss": 0.7532, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737861752510071, "rewards/margins": 0.9923614859580994, "rewards/rejected": -0.018575286492705345, "step": 7101 }, { "epoch": 1.15, "learning_rate": 6.605952764341452e-07, "logits/chosen": -0.7386854290962219, "logits/rejected": -0.6635096669197083, "logps/chosen": -89.01304626464844, "logps/rejected": -31.021060943603516, "loss": 0.4452, "rewards/accuracies": 1.0, "rewards/chosen": 1.0700005292892456, "rewards/margins": 0.022108912467956543, "rewards/rejected": 1.047891616821289, "step": 7102 }, { "epoch": 1.15, "learning_rate": 6.604708091210957e-07, "logits/chosen": -0.5574223399162292, "logits/rejected": -0.5220564603805542, "logps/chosen": -58.79467010498047, "logps/rejected": -17.498741149902344, "loss": 0.4614, "rewards/accuracies": 0.0, "rewards/chosen": 0.6428207755088806, "rewards/margins": -0.13043361902236938, "rewards/rejected": 0.77325439453125, "step": 7103 }, { "epoch": 1.15, "learning_rate": 6.603463307210315e-07, "logits/chosen": -0.6811395287513733, "logits/rejected": -0.6948225498199463, "logps/chosen": -46.43852233886719, "logps/rejected": -79.28251647949219, "loss": 0.7767, "rewards/accuracies": 0.0, "rewards/chosen": 1.0833717584609985, "rewards/margins": -1.2552238702774048, "rewards/rejected": 2.3385956287384033, "step": 7104 }, { "epoch": 1.15, "learning_rate": 6.60221841242553e-07, "logits/chosen": -0.771345853805542, "logits/rejected": -0.771345853805542, "logps/chosen": -59.03138732910156, "logps/rejected": -59.03138732910156, "loss": 0.3503, "rewards/accuracies": 0.0, "rewards/chosen": 1.053443193435669, "rewards/margins": 0.0, "rewards/rejected": 1.053443193435669, "step": 7105 }, { "epoch": 1.15, "learning_rate": 6.600973406942616e-07, "logits/chosen": -0.5575992465019226, "logits/rejected": -0.46257835626602173, "logps/chosen": -78.08048248291016, "logps/rejected": -17.516521453857422, "loss": 0.0478, "rewards/accuracies": 1.0, "rewards/chosen": 2.939915418624878, "rewards/margins": 2.421689033508301, "rewards/rejected": 0.5182262659072876, "step": 7106 }, { "epoch": 1.15, "learning_rate": 6.599728290847586e-07, "logits/chosen": -0.8342189192771912, "logits/rejected": -0.8255704045295715, "logps/chosen": -67.84785461425781, "logps/rejected": -76.75367736816406, "loss": 0.4229, "rewards/accuracies": 1.0, "rewards/chosen": 0.9224869012832642, "rewards/margins": 0.4978622496128082, "rewards/rejected": 0.42462465167045593, "step": 7107 }, { "epoch": 1.15, "learning_rate": 6.598483064226469e-07, "logits/chosen": -1.1743602752685547, "logits/rejected": -1.142615795135498, "logps/chosen": -68.56234741210938, "logps/rejected": -76.50753784179688, "loss": 0.8867, "rewards/accuracies": 1.0, "rewards/chosen": 3.5730226039886475, "rewards/margins": 1.4013497829437256, "rewards/rejected": 2.171672821044922, "step": 7108 }, { "epoch": 1.15, "learning_rate": 6.597237727165297e-07, "logits/chosen": -1.2585816383361816, "logits/rejected": -0.6872630715370178, "logps/chosen": -44.10966491699219, "logps/rejected": -103.16270446777344, "loss": 1.2609, "rewards/accuracies": 0.0, "rewards/chosen": 1.7972618341445923, "rewards/margins": -1.2252525091171265, "rewards/rejected": 3.0225143432617188, "step": 7109 }, { "epoch": 1.15, "learning_rate": 6.59599227975011e-07, "logits/chosen": -0.9016551375389099, "logits/rejected": -0.9078608751296997, "logps/chosen": -45.49363708496094, "logps/rejected": -72.40009307861328, "loss": 0.4409, "rewards/accuracies": 1.0, "rewards/chosen": 2.618497610092163, "rewards/margins": 0.2714653015136719, "rewards/rejected": 2.347032308578491, "step": 7110 }, { "epoch": 1.15, "learning_rate": 6.594746722066959e-07, "logits/chosen": -0.7871145009994507, "logits/rejected": -0.7077243328094482, "logps/chosen": -90.50270080566406, "logps/rejected": -36.46983337402344, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 2.5309853553771973, "rewards/margins": 2.221708297729492, "rewards/rejected": 0.3092769682407379, "step": 7111 }, { "epoch": 1.15, "learning_rate": 6.593501054201899e-07, "logits/chosen": -0.3973543643951416, "logits/rejected": -0.29480451345443726, "logps/chosen": -34.45263671875, "logps/rejected": -50.989219665527344, "loss": 0.5768, "rewards/accuracies": 0.0, "rewards/chosen": 1.2428638935089111, "rewards/margins": -0.38617968559265137, "rewards/rejected": 1.6290435791015625, "step": 7112 }, { "epoch": 1.15, "learning_rate": 6.592255276240993e-07, "logits/chosen": -0.6137505769729614, "logits/rejected": -0.6410210728645325, "logps/chosen": -58.61334991455078, "logps/rejected": -79.26734924316406, "loss": 0.7766, "rewards/accuracies": 0.0, "rewards/chosen": 0.08294563740491867, "rewards/margins": -1.2013226747512817, "rewards/rejected": 1.2842682600021362, "step": 7113 }, { "epoch": 1.15, "learning_rate": 6.591009388270314e-07, "logits/chosen": -0.8593626618385315, "logits/rejected": -0.8593626618385315, "logps/chosen": -98.73941040039062, "logps/rejected": -98.73941040039062, "loss": 0.8957, "rewards/accuracies": 0.0, "rewards/chosen": 2.9131698608398438, "rewards/margins": 0.0, "rewards/rejected": 2.9131698608398438, "step": 7114 }, { "epoch": 1.15, "learning_rate": 6.589763390375942e-07, "logits/chosen": -0.3325241208076477, "logits/rejected": -0.32993432879447937, "logps/chosen": -75.33108520507812, "logps/rejected": -135.16378784179688, "loss": 1.0318, "rewards/accuracies": 1.0, "rewards/chosen": 0.7899215817451477, "rewards/margins": 0.24249422550201416, "rewards/rejected": 0.5474273562431335, "step": 7115 }, { "epoch": 1.16, "learning_rate": 6.58851728264396e-07, "logits/chosen": -0.6582070589065552, "logits/rejected": -0.6006688475608826, "logps/chosen": -60.556007385253906, "logps/rejected": -70.31398010253906, "loss": 1.0547, "rewards/accuracies": 0.0, "rewards/chosen": 1.8736594915390015, "rewards/margins": -0.4281371831893921, "rewards/rejected": 2.3017966747283936, "step": 7116 }, { "epoch": 1.16, "learning_rate": 6.587271065160465e-07, "logits/chosen": -0.21470306813716888, "logits/rejected": -0.21470306813716888, "logps/chosen": -108.14654541015625, "logps/rejected": -108.14654541015625, "loss": 0.7051, "rewards/accuracies": 0.0, "rewards/chosen": 0.7633438110351562, "rewards/margins": 0.0, "rewards/rejected": 0.7633438110351562, "step": 7117 }, { "epoch": 1.16, "learning_rate": 6.58602473801156e-07, "logits/chosen": -0.7527439594268799, "logits/rejected": -0.6993808150291443, "logps/chosen": -51.30535125732422, "logps/rejected": -65.44552612304688, "loss": 0.8384, "rewards/accuracies": 0.0, "rewards/chosen": 0.9897556304931641, "rewards/margins": -0.6770931482315063, "rewards/rejected": 1.6668487787246704, "step": 7118 }, { "epoch": 1.16, "learning_rate": 6.584778301283351e-07, "logits/chosen": -0.9029147028923035, "logits/rejected": -0.8421151041984558, "logps/chosen": -188.3179473876953, "logps/rejected": -296.7032775878906, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 6.535417079925537, "rewards/margins": 0.6243057250976562, "rewards/rejected": 5.911111354827881, "step": 7119 }, { "epoch": 1.16, "learning_rate": 6.583531755061958e-07, "logits/chosen": -0.8089188933372498, "logits/rejected": -0.6627336740493774, "logps/chosen": -161.980712890625, "logps/rejected": -42.57933044433594, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 5.790672302246094, "rewards/margins": 4.265139579772949, "rewards/rejected": 1.525532603263855, "step": 7120 }, { "epoch": 1.16, "learning_rate": 6.582285099433502e-07, "logits/chosen": -0.7348108291625977, "logits/rejected": -0.683802604675293, "logps/chosen": -87.85458374023438, "logps/rejected": -74.8925552368164, "loss": 0.7355, "rewards/accuracies": 0.0, "rewards/chosen": 0.9477257132530212, "rewards/margins": -0.31843870878219604, "rewards/rejected": 1.2661644220352173, "step": 7121 }, { "epoch": 1.16, "learning_rate": 6.58103833448412e-07, "logits/chosen": -0.6405948400497437, "logits/rejected": -0.5604303479194641, "logps/chosen": -108.05501556396484, "logps/rejected": -79.8072738647461, "loss": 0.7857, "rewards/accuracies": 1.0, "rewards/chosen": 5.811164379119873, "rewards/margins": 1.564702033996582, "rewards/rejected": 4.246462345123291, "step": 7122 }, { "epoch": 1.16, "learning_rate": 6.579791460299948e-07, "logits/chosen": -0.3696020841598511, "logits/rejected": -0.3010941743850708, "logps/chosen": -55.57684326171875, "logps/rejected": -37.77252960205078, "loss": 1.5256, "rewards/accuracies": 0.0, "rewards/chosen": 1.0182746648788452, "rewards/margins": -0.7781791687011719, "rewards/rejected": 1.796453833580017, "step": 7123 }, { "epoch": 1.16, "learning_rate": 6.578544476967133e-07, "logits/chosen": -0.41327548027038574, "logits/rejected": -0.2867739200592041, "logps/chosen": -36.15117263793945, "logps/rejected": -9.18234920501709, "loss": 2.225, "rewards/accuracies": 1.0, "rewards/chosen": 2.0454328060150146, "rewards/margins": 1.3033729791641235, "rewards/rejected": 0.7420598268508911, "step": 7124 }, { "epoch": 1.16, "learning_rate": 6.577297384571831e-07, "logits/chosen": -0.6356247663497925, "logits/rejected": -0.6244388222694397, "logps/chosen": -49.340484619140625, "logps/rejected": -24.410892486572266, "loss": 1.7445, "rewards/accuracies": 0.0, "rewards/chosen": 0.34343644976615906, "rewards/margins": -0.19392260909080505, "rewards/rejected": 0.5373590588569641, "step": 7125 }, { "epoch": 1.16, "learning_rate": 6.576050183200206e-07, "logits/chosen": -0.8162438273429871, "logits/rejected": -0.6208778023719788, "logps/chosen": -149.0501251220703, "logps/rejected": -35.308597564697266, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 4.6993913650512695, "rewards/margins": 2.4442265033721924, "rewards/rejected": 2.255164861679077, "step": 7126 }, { "epoch": 1.16, "learning_rate": 6.574802872938425e-07, "logits/chosen": -0.23343046009540558, "logits/rejected": -0.19749648869037628, "logps/chosen": -59.91071319580078, "logps/rejected": -57.4188117980957, "loss": 0.4036, "rewards/accuracies": 0.0, "rewards/chosen": 2.0434372425079346, "rewards/margins": -0.0473780632019043, "rewards/rejected": 2.090815305709839, "step": 7127 }, { "epoch": 1.16, "learning_rate": 6.573555453872668e-07, "logits/chosen": -0.5348095297813416, "logits/rejected": -0.5561842322349548, "logps/chosen": -87.1159896850586, "logps/rejected": -50.18525695800781, "loss": 0.6577, "rewards/accuracies": 1.0, "rewards/chosen": 2.46661376953125, "rewards/margins": 0.4578382968902588, "rewards/rejected": 2.008775472640991, "step": 7128 }, { "epoch": 1.16, "learning_rate": 6.572307926089117e-07, "logits/chosen": -0.5571644306182861, "logits/rejected": -0.6624861359596252, "logps/chosen": -121.37071990966797, "logps/rejected": -115.42449951171875, "loss": 2.6369, "rewards/accuracies": 0.0, "rewards/chosen": 1.0826454162597656, "rewards/margins": -4.584940433502197, "rewards/rejected": 5.667585849761963, "step": 7129 }, { "epoch": 1.16, "learning_rate": 6.571060289673965e-07, "logits/chosen": -1.0037117004394531, "logits/rejected": -0.9859775900840759, "logps/chosen": -86.51365661621094, "logps/rejected": -102.46444702148438, "loss": 0.8168, "rewards/accuracies": 0.0, "rewards/chosen": 4.4673051834106445, "rewards/margins": -1.376969814300537, "rewards/rejected": 5.844274997711182, "step": 7130 }, { "epoch": 1.16, "learning_rate": 6.569812544713413e-07, "logits/chosen": -0.6468334197998047, "logits/rejected": -0.6322274804115295, "logps/chosen": -27.28437042236328, "logps/rejected": -12.685050010681152, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 1.580100655555725, "rewards/margins": 1.0572302341461182, "rewards/rejected": 0.5228703618049622, "step": 7131 }, { "epoch": 1.16, "learning_rate": 6.568564691293669e-07, "logits/chosen": -0.6641737222671509, "logits/rejected": -0.6482471227645874, "logps/chosen": -104.21464538574219, "logps/rejected": -61.98723602294922, "loss": 0.5776, "rewards/accuracies": 0.0, "rewards/chosen": 0.7447647452354431, "rewards/margins": -0.7181465029716492, "rewards/rejected": 1.4629112482070923, "step": 7132 }, { "epoch": 1.16, "learning_rate": 6.567316729500944e-07, "logits/chosen": -1.0329840183258057, "logits/rejected": -0.995990514755249, "logps/chosen": -144.67630004882812, "logps/rejected": -22.774240493774414, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": 8.575861930847168, "rewards/margins": 8.125181198120117, "rewards/rejected": 0.45068055391311646, "step": 7133 }, { "epoch": 1.16, "learning_rate": 6.566068659421466e-07, "logits/chosen": -0.8892329335212708, "logits/rejected": -1.0465418100357056, "logps/chosen": -75.28777313232422, "logps/rejected": -107.78837585449219, "loss": 3.6706, "rewards/accuracies": 0.0, "rewards/chosen": 1.226887583732605, "rewards/margins": -5.455990314483643, "rewards/rejected": 6.682878017425537, "step": 7134 }, { "epoch": 1.16, "learning_rate": 6.564820481141461e-07, "logits/chosen": -0.7602318525314331, "logits/rejected": -0.7729523181915283, "logps/chosen": -122.45132446289062, "logps/rejected": -51.010765075683594, "loss": 0.7515, "rewards/accuracies": 0.0, "rewards/chosen": 1.655877709388733, "rewards/margins": -0.23462748527526855, "rewards/rejected": 1.8905051946640015, "step": 7135 }, { "epoch": 1.16, "learning_rate": 6.563572194747167e-07, "logits/chosen": -0.6482594013214111, "logits/rejected": -0.6482594013214111, "logps/chosen": -5.334421634674072, "logps/rejected": -5.334421634674072, "loss": 0.8845, "rewards/accuracies": 0.0, "rewards/chosen": 0.7700795531272888, "rewards/margins": 0.0, "rewards/rejected": 0.7700795531272888, "step": 7136 }, { "epoch": 1.16, "learning_rate": 6.562323800324829e-07, "logits/chosen": -0.28623270988464355, "logits/rejected": -0.2977555990219116, "logps/chosen": -87.50709533691406, "logps/rejected": -48.01129913330078, "loss": 1.3195, "rewards/accuracies": 0.0, "rewards/chosen": 0.6272323727607727, "rewards/margins": -1.0876045227050781, "rewards/rejected": 1.7148369550704956, "step": 7137 }, { "epoch": 1.16, "learning_rate": 6.561075297960699e-07, "logits/chosen": -1.3743257522583008, "logits/rejected": -1.3625752925872803, "logps/chosen": -69.89910125732422, "logps/rejected": -155.54876708984375, "loss": 1.8668, "rewards/accuracies": 0.0, "rewards/chosen": 1.5684616565704346, "rewards/margins": -3.3963258266448975, "rewards/rejected": 4.964787483215332, "step": 7138 }, { "epoch": 1.16, "learning_rate": 6.559826687741037e-07, "logits/chosen": -0.7357898950576782, "logits/rejected": -0.6783398389816284, "logps/chosen": -136.80938720703125, "logps/rejected": -144.3031005859375, "loss": 0.6138, "rewards/accuracies": 0.0, "rewards/chosen": 4.320929050445557, "rewards/margins": -0.8119840621948242, "rewards/rejected": 5.132913112640381, "step": 7139 }, { "epoch": 1.16, "learning_rate": 6.55857796975211e-07, "logits/chosen": -0.6209718585014343, "logits/rejected": -0.545428991317749, "logps/chosen": -89.09571838378906, "logps/rejected": -62.810455322265625, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 1.5508133172988892, "rewards/margins": 0.12487566471099854, "rewards/rejected": 1.4259376525878906, "step": 7140 }, { "epoch": 1.16, "learning_rate": 6.557329144080193e-07, "logits/chosen": -1.0087279081344604, "logits/rejected": -0.9113335013389587, "logps/chosen": -125.3813247680664, "logps/rejected": -64.59174346923828, "loss": 0.0853, "rewards/accuracies": 1.0, "rewards/chosen": 4.173197269439697, "rewards/margins": 2.3748183250427246, "rewards/rejected": 1.798378825187683, "step": 7141 }, { "epoch": 1.16, "learning_rate": 6.556080210811568e-07, "logits/chosen": -0.7812633514404297, "logits/rejected": -0.6545394659042358, "logps/chosen": -89.18189239501953, "logps/rejected": -39.957908630371094, "loss": 0.328, "rewards/accuracies": 1.0, "rewards/chosen": 1.3262962102890015, "rewards/margins": 1.2635928392410278, "rewards/rejected": 0.06270332634449005, "step": 7142 }, { "epoch": 1.16, "learning_rate": 6.554831170032524e-07, "logits/chosen": -0.6089571118354797, "logits/rejected": -0.6129086017608643, "logps/chosen": -13.575400352478027, "logps/rejected": -14.799880027770996, "loss": 0.3609, "rewards/accuracies": 0.0, "rewards/chosen": 1.1322197914123535, "rewards/margins": -0.011313676834106445, "rewards/rejected": 1.14353346824646, "step": 7143 }, { "epoch": 1.16, "learning_rate": 6.553582021829358e-07, "logits/chosen": -0.9028369784355164, "logits/rejected": -0.8052952885627747, "logps/chosen": -145.0773162841797, "logps/rejected": -103.9722900390625, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 4.302229404449463, "rewards/margins": 0.23752880096435547, "rewards/rejected": 4.064700603485107, "step": 7144 }, { "epoch": 1.16, "learning_rate": 6.552332766288374e-07, "logits/chosen": -0.5595867037773132, "logits/rejected": -0.560745894908905, "logps/chosen": -188.42715454101562, "logps/rejected": -101.91818237304688, "loss": 0.3265, "rewards/accuracies": 1.0, "rewards/chosen": 3.953930616378784, "rewards/margins": 0.23061513900756836, "rewards/rejected": 3.723315477371216, "step": 7145 }, { "epoch": 1.16, "learning_rate": 6.551083403495883e-07, "logits/chosen": -0.8098409175872803, "logits/rejected": -0.7324097752571106, "logps/chosen": -86.05339813232422, "logps/rejected": -85.31453704833984, "loss": 0.4473, "rewards/accuracies": 0.0, "rewards/chosen": 1.7065238952636719, "rewards/margins": -0.12016606330871582, "rewards/rejected": 1.8266899585723877, "step": 7146 }, { "epoch": 1.16, "learning_rate": 6.549833933538208e-07, "logits/chosen": -0.7469528317451477, "logits/rejected": -0.6846994161605835, "logps/chosen": -147.9383544921875, "logps/rejected": -34.74186706542969, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 4.6486968994140625, "rewards/margins": 3.5297722816467285, "rewards/rejected": 1.1189247369766235, "step": 7147 }, { "epoch": 1.16, "learning_rate": 6.548584356501672e-07, "logits/chosen": -0.6048662066459656, "logits/rejected": -0.6602715253829956, "logps/chosen": -29.45659065246582, "logps/rejected": -132.35107421875, "loss": 2.9589, "rewards/accuracies": 0.0, "rewards/chosen": 2.38708758354187, "rewards/margins": -4.011845588684082, "rewards/rejected": 6.398933410644531, "step": 7148 }, { "epoch": 1.16, "learning_rate": 6.547334672472609e-07, "logits/chosen": -0.38862359523773193, "logits/rejected": -0.41273418068885803, "logps/chosen": -53.962425231933594, "logps/rejected": -48.59314727783203, "loss": 1.1909, "rewards/accuracies": 0.0, "rewards/chosen": -0.1666305512189865, "rewards/margins": -1.8206565380096436, "rewards/rejected": 1.6540260314941406, "step": 7149 }, { "epoch": 1.16, "learning_rate": 6.546084881537362e-07, "logits/chosen": -0.5038471221923828, "logits/rejected": -0.5038471221923828, "logps/chosen": -45.77876663208008, "logps/rejected": -45.77876663208008, "loss": 0.4066, "rewards/accuracies": 0.0, "rewards/chosen": 0.8303031921386719, "rewards/margins": 0.0, "rewards/rejected": 0.8303031921386719, "step": 7150 }, { "epoch": 1.16, "learning_rate": 6.544834983782279e-07, "logits/chosen": -0.6582517027854919, "logits/rejected": -0.6780081391334534, "logps/chosen": -210.75189208984375, "logps/rejected": -88.49961853027344, "loss": 0.1802, "rewards/accuracies": 1.0, "rewards/chosen": 2.8068466186523438, "rewards/margins": 1.1664772033691406, "rewards/rejected": 1.6403694152832031, "step": 7151 }, { "epoch": 1.16, "learning_rate": 6.543584979293715e-07, "logits/chosen": -0.7290984392166138, "logits/rejected": -0.5782179832458496, "logps/chosen": -107.27638244628906, "logps/rejected": -72.09395599365234, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": 3.361828565597534, "rewards/margins": 0.7776479721069336, "rewards/rejected": 2.5841805934906006, "step": 7152 }, { "epoch": 1.16, "learning_rate": 6.542334868158035e-07, "logits/chosen": -0.7063637375831604, "logits/rejected": -0.6147045493125916, "logps/chosen": -63.42155838012695, "logps/rejected": -25.883373260498047, "loss": 0.5084, "rewards/accuracies": 1.0, "rewards/chosen": 2.15173077583313, "rewards/margins": 1.9886337518692017, "rewards/rejected": 0.16309700906276703, "step": 7153 }, { "epoch": 1.16, "learning_rate": 6.541084650461609e-07, "logits/chosen": -0.4770711660385132, "logits/rejected": -0.4590087831020355, "logps/chosen": -34.321109771728516, "logps/rejected": -20.768404006958008, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.1840965300798416, "rewards/margins": -0.5597713589668274, "rewards/rejected": 0.7438678741455078, "step": 7154 }, { "epoch": 1.16, "learning_rate": 6.539834326290817e-07, "logits/chosen": -0.49595415592193604, "logits/rejected": -0.4866446256637573, "logps/chosen": -212.31072998046875, "logps/rejected": -115.89048767089844, "loss": 1.3143, "rewards/accuracies": 0.0, "rewards/chosen": 2.33591628074646, "rewards/margins": -2.0154950618743896, "rewards/rejected": 4.35141134262085, "step": 7155 }, { "epoch": 1.16, "learning_rate": 6.538583895732042e-07, "logits/chosen": -0.8945117592811584, "logits/rejected": -0.8940922617912292, "logps/chosen": -46.19792556762695, "logps/rejected": -124.57185363769531, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 2.0462682247161865, "rewards/margins": 1.1477575302124023, "rewards/rejected": 0.898510754108429, "step": 7156 }, { "epoch": 1.16, "learning_rate": 6.537333358871677e-07, "logits/chosen": -0.41322922706604004, "logits/rejected": -0.37483882904052734, "logps/chosen": -30.914875030517578, "logps/rejected": -82.40001678466797, "loss": 0.9973, "rewards/accuracies": 0.0, "rewards/chosen": 1.8599903583526611, "rewards/margins": -0.6540012359619141, "rewards/rejected": 2.513991594314575, "step": 7157 }, { "epoch": 1.16, "learning_rate": 6.536082715796124e-07, "logits/chosen": -0.48727142810821533, "logits/rejected": -0.5071747303009033, "logps/chosen": -93.12759399414062, "logps/rejected": -46.29087829589844, "loss": 0.5101, "rewards/accuracies": 0.0, "rewards/chosen": 2.0895638465881348, "rewards/margins": -0.5669798851013184, "rewards/rejected": 2.656543731689453, "step": 7158 }, { "epoch": 1.16, "learning_rate": 6.53483196659179e-07, "logits/chosen": -0.5404512882232666, "logits/rejected": -0.5879459977149963, "logps/chosen": -101.99272155761719, "logps/rejected": -72.96237182617188, "loss": 1.4162, "rewards/accuracies": 0.0, "rewards/chosen": 0.9986129999160767, "rewards/margins": -1.3897591829299927, "rewards/rejected": 2.3883721828460693, "step": 7159 }, { "epoch": 1.16, "learning_rate": 6.53358111134509e-07, "logits/chosen": -1.1111795902252197, "logits/rejected": -1.0257344245910645, "logps/chosen": -206.26388549804688, "logps/rejected": -124.89736938476562, "loss": 0.242, "rewards/accuracies": 1.0, "rewards/chosen": 4.559731960296631, "rewards/margins": 0.4840526580810547, "rewards/rejected": 4.075679302215576, "step": 7160 }, { "epoch": 1.16, "learning_rate": 6.532330150142446e-07, "logits/chosen": -0.8271083831787109, "logits/rejected": -0.778739333152771, "logps/chosen": -59.10097122192383, "logps/rejected": -54.48639678955078, "loss": 0.7993, "rewards/accuracies": 0.0, "rewards/chosen": 1.366679072380066, "rewards/margins": -0.7442432641983032, "rewards/rejected": 2.110922336578369, "step": 7161 }, { "epoch": 1.16, "learning_rate": 6.531079083070287e-07, "logits/chosen": -0.6596767902374268, "logits/rejected": -0.5468370914459229, "logps/chosen": -54.492279052734375, "logps/rejected": -106.80052185058594, "loss": 1.4742, "rewards/accuracies": 0.0, "rewards/chosen": 1.4688392877578735, "rewards/margins": -2.2188148498535156, "rewards/rejected": 3.6876542568206787, "step": 7162 }, { "epoch": 1.16, "learning_rate": 6.529827910215052e-07, "logits/chosen": -0.5273072123527527, "logits/rejected": -0.5540449023246765, "logps/chosen": -43.73462677001953, "logps/rejected": -36.25905990600586, "loss": 2.4444, "rewards/accuracies": 0.0, "rewards/chosen": 1.8563660383224487, "rewards/margins": -0.5515280961990356, "rewards/rejected": 2.4078941345214844, "step": 7163 }, { "epoch": 1.16, "learning_rate": 6.528576631663183e-07, "logits/chosen": -1.5023927688598633, "logits/rejected": -1.4794811010360718, "logps/chosen": -135.64208984375, "logps/rejected": -102.52066040039062, "loss": 0.0532, "rewards/accuracies": 1.0, "rewards/chosen": 5.759991645812988, "rewards/margins": 2.628563165664673, "rewards/rejected": 3.1314284801483154, "step": 7164 }, { "epoch": 1.16, "learning_rate": 6.527325247501132e-07, "logits/chosen": -1.0064462423324585, "logits/rejected": -1.2156895399093628, "logps/chosen": -129.86669921875, "logps/rejected": -36.42332458496094, "loss": 2.4449, "rewards/accuracies": 1.0, "rewards/chosen": 0.45261383056640625, "rewards/margins": 0.16723555326461792, "rewards/rejected": 0.28537827730178833, "step": 7165 }, { "epoch": 1.16, "learning_rate": 6.526073757815359e-07, "logits/chosen": -0.6827452778816223, "logits/rejected": -0.6998108625411987, "logps/chosen": -9.748323440551758, "logps/rejected": -2.9061381816864014, "loss": 0.5477, "rewards/accuracies": 0.0, "rewards/chosen": 0.06634273380041122, "rewards/margins": -0.286953330039978, "rewards/rejected": 0.35329607129096985, "step": 7166 }, { "epoch": 1.16, "learning_rate": 6.524822162692329e-07, "logits/chosen": -0.7904117703437805, "logits/rejected": -0.775108814239502, "logps/chosen": -135.93408203125, "logps/rejected": -216.92247009277344, "loss": 0.6539, "rewards/accuracies": 0.0, "rewards/chosen": 4.2418060302734375, "rewards/margins": -0.6310257911682129, "rewards/rejected": 4.87283182144165, "step": 7167 }, { "epoch": 1.16, "learning_rate": 6.523570462218515e-07, "logits/chosen": -0.3763364851474762, "logits/rejected": -0.32933980226516724, "logps/chosen": -16.590333938598633, "logps/rejected": -6.8680219650268555, "loss": 0.2083, "rewards/accuracies": 1.0, "rewards/chosen": 1.5971235036849976, "rewards/margins": 0.6686354279518127, "rewards/rejected": 0.9284880757331848, "step": 7168 }, { "epoch": 1.16, "learning_rate": 6.522318656480398e-07, "logits/chosen": -0.9487753510475159, "logits/rejected": -0.9448492527008057, "logps/chosen": -106.25390625, "logps/rejected": -142.6038818359375, "loss": 0.6433, "rewards/accuracies": 0.0, "rewards/chosen": 2.815913438796997, "rewards/margins": -0.7050552368164062, "rewards/rejected": 3.5209686756134033, "step": 7169 }, { "epoch": 1.16, "learning_rate": 6.521066745564467e-07, "logits/chosen": -0.11501698195934296, "logits/rejected": -0.13848751783370972, "logps/chosen": -4.472433567047119, "logps/rejected": -74.94780731201172, "loss": 3.1625, "rewards/accuracies": 1.0, "rewards/chosen": 0.5060257315635681, "rewards/margins": 0.17898866534233093, "rewards/rejected": 0.3270370662212372, "step": 7170 }, { "epoch": 1.16, "learning_rate": 6.519814729557216e-07, "logits/chosen": -0.6693499088287354, "logits/rejected": -0.7452743053436279, "logps/chosen": -67.74948120117188, "logps/rejected": -94.0560302734375, "loss": 2.0066, "rewards/accuracies": 0.0, "rewards/chosen": 2.0526275634765625, "rewards/margins": -2.8995699882507324, "rewards/rejected": 4.952197551727295, "step": 7171 }, { "epoch": 1.16, "learning_rate": 6.518562608545147e-07, "logits/chosen": -0.8008536696434021, "logits/rejected": -0.8277650475502014, "logps/chosen": -82.1138916015625, "logps/rejected": -65.82797241210938, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 2.0376458168029785, "rewards/margins": 0.18968212604522705, "rewards/rejected": 1.8479636907577515, "step": 7172 }, { "epoch": 1.16, "learning_rate": 6.517310382614771e-07, "logits/chosen": -0.6100500226020813, "logits/rejected": -0.4182542562484741, "logps/chosen": -130.907470703125, "logps/rejected": -47.16773986816406, "loss": 0.1022, "rewards/accuracies": 1.0, "rewards/chosen": 4.684915065765381, "rewards/margins": 2.2402944564819336, "rewards/rejected": 2.4446206092834473, "step": 7173 }, { "epoch": 1.16, "learning_rate": 6.516058051852604e-07, "logits/chosen": -0.07659399509429932, "logits/rejected": -0.024159599095582962, "logps/chosen": -95.01115417480469, "logps/rejected": -23.052791595458984, "loss": 0.3567, "rewards/accuracies": 1.0, "rewards/chosen": 0.5546455383300781, "rewards/margins": 0.2134130299091339, "rewards/rejected": 0.3412325084209442, "step": 7174 }, { "epoch": 1.16, "learning_rate": 6.514805616345173e-07, "logits/chosen": -0.9239206314086914, "logits/rejected": -0.927035391330719, "logps/chosen": -243.06346130371094, "logps/rejected": -119.31846618652344, "loss": 0.3518, "rewards/accuracies": 0.0, "rewards/chosen": 4.873451232910156, "rewards/margins": -0.0023317337036132812, "rewards/rejected": 4.8757829666137695, "step": 7175 }, { "epoch": 1.16, "learning_rate": 6.513553076179005e-07, "logits/chosen": -0.5230340957641602, "logits/rejected": -0.4013596177101135, "logps/chosen": -60.31134033203125, "logps/rejected": -51.19530487060547, "loss": 0.2452, "rewards/accuracies": 1.0, "rewards/chosen": 2.8640716075897217, "rewards/margins": 0.6245734691619873, "rewards/rejected": 2.2394981384277344, "step": 7176 }, { "epoch": 1.16, "learning_rate": 6.51230043144064e-07, "logits/chosen": -0.933995246887207, "logits/rejected": -0.9339446425437927, "logps/chosen": -129.60952758789062, "logps/rejected": -207.76235961914062, "loss": 2.7523, "rewards/accuracies": 0.0, "rewards/chosen": 2.885009765625, "rewards/margins": -4.66558837890625, "rewards/rejected": 7.55059814453125, "step": 7177 }, { "epoch": 1.17, "learning_rate": 6.511047682216627e-07, "logits/chosen": -0.9068295955657959, "logits/rejected": -0.7764863967895508, "logps/chosen": -70.55109405517578, "logps/rejected": -25.334569931030273, "loss": 0.3983, "rewards/accuracies": 0.0, "rewards/chosen": 0.3785545527935028, "rewards/margins": -0.12054690718650818, "rewards/rejected": 0.499101459980011, "step": 7178 }, { "epoch": 1.17, "learning_rate": 6.509794828593516e-07, "logits/chosen": -0.28260692954063416, "logits/rejected": -0.2896680235862732, "logps/chosen": -6.45263147354126, "logps/rejected": -3.112212896347046, "loss": 1.807, "rewards/accuracies": 1.0, "rewards/chosen": 0.2607634961605072, "rewards/margins": 0.008848100900650024, "rewards/rejected": 0.2519153952598572, "step": 7179 }, { "epoch": 1.17, "learning_rate": 6.508541870657866e-07, "logits/chosen": -0.6524676084518433, "logits/rejected": -0.6860164999961853, "logps/chosen": -55.11791229248047, "logps/rejected": -57.42800521850586, "loss": 0.7284, "rewards/accuracies": 0.0, "rewards/chosen": 0.9227432608604431, "rewards/margins": -1.1855990886688232, "rewards/rejected": 2.108342409133911, "step": 7180 }, { "epoch": 1.17, "learning_rate": 6.50728880849625e-07, "logits/chosen": -0.16757768392562866, "logits/rejected": -0.1446792334318161, "logps/chosen": -45.14933776855469, "logps/rejected": -16.83344268798828, "loss": 1.188, "rewards/accuracies": 1.0, "rewards/chosen": 1.2511459589004517, "rewards/margins": 0.1813873052597046, "rewards/rejected": 1.069758653640747, "step": 7181 }, { "epoch": 1.17, "learning_rate": 6.506035642195238e-07, "logits/chosen": -0.34796974062919617, "logits/rejected": -0.36291906237602234, "logps/chosen": -18.098604202270508, "logps/rejected": -56.18006896972656, "loss": 1.2316, "rewards/accuracies": 0.0, "rewards/chosen": 0.21357250213623047, "rewards/margins": -0.03209935128688812, "rewards/rejected": 0.2456718534231186, "step": 7182 }, { "epoch": 1.17, "learning_rate": 6.504782371841413e-07, "logits/chosen": -0.7293726205825806, "logits/rejected": -0.6643602848052979, "logps/chosen": -93.116943359375, "logps/rejected": -11.432477951049805, "loss": 0.4544, "rewards/accuracies": 0.0, "rewards/chosen": 0.5513061881065369, "rewards/margins": -0.2533363103866577, "rewards/rejected": 0.8046424984931946, "step": 7183 }, { "epoch": 1.17, "learning_rate": 6.503528997521364e-07, "logits/chosen": -0.7771141529083252, "logits/rejected": -0.8199415802955627, "logps/chosen": -107.5014877319336, "logps/rejected": -91.44754791259766, "loss": 1.4472, "rewards/accuracies": 0.0, "rewards/chosen": 0.9938926696777344, "rewards/margins": -1.96634840965271, "rewards/rejected": 2.9602410793304443, "step": 7184 }, { "epoch": 1.17, "learning_rate": 6.502275519321689e-07, "logits/chosen": -0.5409501194953918, "logits/rejected": -0.5239463448524475, "logps/chosen": -58.222198486328125, "logps/rejected": -94.03250885009766, "loss": 0.6197, "rewards/accuracies": 0.0, "rewards/chosen": 2.276397705078125, "rewards/margins": -0.19824457168579102, "rewards/rejected": 2.474642276763916, "step": 7185 }, { "epoch": 1.17, "learning_rate": 6.501021937328991e-07, "logits/chosen": -0.6167660355567932, "logits/rejected": -0.5804640054702759, "logps/chosen": -42.954734802246094, "logps/rejected": -70.20652770996094, "loss": 0.6447, "rewards/accuracies": 1.0, "rewards/chosen": 2.301950216293335, "rewards/margins": 0.15379953384399414, "rewards/rejected": 2.148150682449341, "step": 7186 }, { "epoch": 1.17, "learning_rate": 6.499768251629879e-07, "logits/chosen": -0.5800683498382568, "logits/rejected": -0.5176330804824829, "logps/chosen": -53.22127914428711, "logps/rejected": -40.6646728515625, "loss": 0.4083, "rewards/accuracies": 1.0, "rewards/chosen": 1.6306583881378174, "rewards/margins": 0.2113323211669922, "rewards/rejected": 1.4193260669708252, "step": 7187 }, { "epoch": 1.17, "learning_rate": 6.498514462310971e-07, "logits/chosen": -0.42313894629478455, "logits/rejected": -0.43045130372047424, "logps/chosen": -72.94015502929688, "logps/rejected": -32.529640197753906, "loss": 1.0845, "rewards/accuracies": 1.0, "rewards/chosen": 1.2764427661895752, "rewards/margins": 0.10010910034179688, "rewards/rejected": 1.1763336658477783, "step": 7188 }, { "epoch": 1.17, "learning_rate": 6.497260569458893e-07, "logits/chosen": -0.7225520610809326, "logits/rejected": -0.5663905143737793, "logps/chosen": -107.67778778076172, "logps/rejected": -92.18568420410156, "loss": 0.455, "rewards/accuracies": 1.0, "rewards/chosen": 4.568091869354248, "rewards/margins": 1.3995814323425293, "rewards/rejected": 3.1685104370117188, "step": 7189 }, { "epoch": 1.17, "learning_rate": 6.496006573160277e-07, "logits/chosen": -0.8448155522346497, "logits/rejected": -0.8435184359550476, "logps/chosen": -85.96455383300781, "logps/rejected": -76.4749755859375, "loss": 0.595, "rewards/accuracies": 0.0, "rewards/chosen": 1.5481888055801392, "rewards/margins": -0.6650398969650269, "rewards/rejected": 2.213228702545166, "step": 7190 }, { "epoch": 1.17, "learning_rate": 6.494752473501763e-07, "logits/chosen": -0.8168603181838989, "logits/rejected": -0.811857283115387, "logps/chosen": -74.00199890136719, "logps/rejected": -110.20065307617188, "loss": 0.1083, "rewards/accuracies": 1.0, "rewards/chosen": 1.5318084955215454, "rewards/margins": 2.397754669189453, "rewards/rejected": -0.8659462332725525, "step": 7191 }, { "epoch": 1.17, "learning_rate": 6.493498270569997e-07, "logits/chosen": -0.8085612654685974, "logits/rejected": -0.8200055956840515, "logps/chosen": -72.7708740234375, "logps/rejected": -105.7296371459961, "loss": 0.5288, "rewards/accuracies": 0.0, "rewards/chosen": 1.2500847578048706, "rewards/margins": -0.5709601640701294, "rewards/rejected": 1.821044921875, "step": 7192 }, { "epoch": 1.17, "learning_rate": 6.492243964451631e-07, "logits/chosen": -1.0404410362243652, "logits/rejected": -1.0191134214401245, "logps/chosen": -56.08009338378906, "logps/rejected": -25.93152618408203, "loss": 1.079, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049294233322144, "rewards/margins": 0.8426520228385925, "rewards/rejected": 0.16227741539478302, "step": 7193 }, { "epoch": 1.17, "learning_rate": 6.490989555233327e-07, "logits/chosen": -0.5886868238449097, "logits/rejected": -0.5433019995689392, "logps/chosen": -28.807424545288086, "logps/rejected": -75.63932037353516, "loss": 0.8878, "rewards/accuracies": 0.0, "rewards/chosen": 1.091805100440979, "rewards/margins": -0.6687180995941162, "rewards/rejected": 1.7605232000350952, "step": 7194 }, { "epoch": 1.17, "learning_rate": 6.489735043001752e-07, "logits/chosen": -0.7061984539031982, "logits/rejected": -0.7281826734542847, "logps/chosen": -130.97470092773438, "logps/rejected": -113.86747741699219, "loss": 2.6669, "rewards/accuracies": 0.0, "rewards/chosen": 1.052215576171875, "rewards/margins": -0.675994873046875, "rewards/rejected": 1.72821044921875, "step": 7195 }, { "epoch": 1.17, "learning_rate": 6.488480427843583e-07, "logits/chosen": -0.5736481547355652, "logits/rejected": -0.44634366035461426, "logps/chosen": -87.46961975097656, "logps/rejected": -38.46510696411133, "loss": 0.9558, "rewards/accuracies": 1.0, "rewards/chosen": 1.6663398742675781, "rewards/margins": 1.5551189184188843, "rewards/rejected": 0.11122093349695206, "step": 7196 }, { "epoch": 1.17, "learning_rate": 6.487225709845499e-07, "logits/chosen": -0.7426111698150635, "logits/rejected": -0.7120504379272461, "logps/chosen": -118.83663940429688, "logps/rejected": -195.05030822753906, "loss": 0.1358, "rewards/accuracies": 1.0, "rewards/chosen": 5.800961494445801, "rewards/margins": 1.4380602836608887, "rewards/rejected": 4.362901210784912, "step": 7197 }, { "epoch": 1.17, "learning_rate": 6.485970889094191e-07, "logits/chosen": -0.8106738924980164, "logits/rejected": -0.7750462293624878, "logps/chosen": -115.50160217285156, "logps/rejected": -57.20017623901367, "loss": 0.2106, "rewards/accuracies": 1.0, "rewards/chosen": 2.3574235439300537, "rewards/margins": 0.7611058950424194, "rewards/rejected": 1.5963176488876343, "step": 7198 }, { "epoch": 1.17, "learning_rate": 6.484715965676357e-07, "logits/chosen": -0.13851766288280487, "logits/rejected": -0.09848206490278244, "logps/chosen": -42.657257080078125, "logps/rejected": -12.331048965454102, "loss": 0.2525, "rewards/accuracies": 1.0, "rewards/chosen": 1.4448035955429077, "rewards/margins": 0.7313711047172546, "rewards/rejected": 0.7134324908256531, "step": 7199 }, { "epoch": 1.17, "learning_rate": 6.483460939678696e-07, "logits/chosen": -0.8450865149497986, "logits/rejected": -0.5332125425338745, "logps/chosen": -127.3288345336914, "logps/rejected": -15.441545486450195, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 3.872709035873413, "rewards/margins": 3.5127947330474854, "rewards/rejected": 0.35991421341896057, "step": 7200 }, { "epoch": 1.17, "learning_rate": 6.482205811187921e-07, "logits/chosen": -0.5239185690879822, "logits/rejected": -0.46869802474975586, "logps/chosen": -53.229705810546875, "logps/rejected": -39.4506950378418, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 1.726812720298767, "rewards/margins": 0.765917181968689, "rewards/rejected": 0.9608955383300781, "step": 7201 }, { "epoch": 1.17, "learning_rate": 6.480950580290751e-07, "logits/chosen": -0.5357283353805542, "logits/rejected": -0.5357283353805542, "logps/chosen": -57.894500732421875, "logps/rejected": -57.894500732421875, "loss": 0.3486, "rewards/accuracies": 0.0, "rewards/chosen": 1.0243446826934814, "rewards/margins": 0.0, "rewards/rejected": 1.0243446826934814, "step": 7202 }, { "epoch": 1.17, "learning_rate": 6.479695247073906e-07, "logits/chosen": -0.35308903455734253, "logits/rejected": -0.29931074380874634, "logps/chosen": -113.86143493652344, "logps/rejected": -52.47837829589844, "loss": 0.4569, "rewards/accuracies": 1.0, "rewards/chosen": 1.7092331647872925, "rewards/margins": 0.07772219181060791, "rewards/rejected": 1.6315109729766846, "step": 7203 }, { "epoch": 1.17, "learning_rate": 6.478439811624123e-07, "logits/chosen": -0.6714341044425964, "logits/rejected": -0.5869840383529663, "logps/chosen": -110.7533187866211, "logps/rejected": -67.85221862792969, "loss": 0.0925, "rewards/accuracies": 1.0, "rewards/chosen": 4.373687267303467, "rewards/margins": 2.2276971340179443, "rewards/rejected": 2.1459901332855225, "step": 7204 }, { "epoch": 1.17, "learning_rate": 6.477184274028136e-07, "logits/chosen": -0.39974984526634216, "logits/rejected": -0.39169785380363464, "logps/chosen": -114.28977966308594, "logps/rejected": -48.2444953918457, "loss": 0.4787, "rewards/accuracies": 0.0, "rewards/chosen": 1.6249831914901733, "rewards/margins": -0.4686809778213501, "rewards/rejected": 2.0936641693115234, "step": 7205 }, { "epoch": 1.17, "learning_rate": 6.475928634372693e-07, "logits/chosen": -0.611261785030365, "logits/rejected": -0.6480973958969116, "logps/chosen": -140.80685424804688, "logps/rejected": -133.9906005859375, "loss": 0.1975, "rewards/accuracies": 1.0, "rewards/chosen": 2.7801804542541504, "rewards/margins": 1.3754379749298096, "rewards/rejected": 1.4047424793243408, "step": 7206 }, { "epoch": 1.17, "learning_rate": 6.474672892744548e-07, "logits/chosen": -0.6842206716537476, "logits/rejected": -0.6684904098510742, "logps/chosen": -80.94351959228516, "logps/rejected": -92.89421844482422, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 2.5461487770080566, "rewards/margins": 0.3836648464202881, "rewards/rejected": 2.1624839305877686, "step": 7207 }, { "epoch": 1.17, "learning_rate": 6.473417049230458e-07, "logits/chosen": -0.8275103569030762, "logits/rejected": -0.8376211524009705, "logps/chosen": -267.59149169921875, "logps/rejected": -79.72429656982422, "loss": 0.1955, "rewards/accuracies": 1.0, "rewards/chosen": 5.360400676727295, "rewards/margins": 3.0941126346588135, "rewards/rejected": 2.2662880420684814, "step": 7208 }, { "epoch": 1.17, "learning_rate": 6.472161103917193e-07, "logits/chosen": -0.49711355566978455, "logits/rejected": -0.5503156185150146, "logps/chosen": -49.96770477294922, "logps/rejected": -55.263954162597656, "loss": 0.8729, "rewards/accuracies": 0.0, "rewards/chosen": 1.7203712463378906, "rewards/margins": -0.7860519886016846, "rewards/rejected": 2.506423234939575, "step": 7209 }, { "epoch": 1.17, "learning_rate": 6.470905056891524e-07, "logits/chosen": -0.9510305523872375, "logits/rejected": -0.7432680130004883, "logps/chosen": -105.4784927368164, "logps/rejected": -19.93375015258789, "loss": 0.1268, "rewards/accuracies": 1.0, "rewards/chosen": 4.068091869354248, "rewards/margins": 3.7779619693756104, "rewards/rejected": 0.2901298701763153, "step": 7210 }, { "epoch": 1.17, "learning_rate": 6.469648908240235e-07, "logits/chosen": -0.6348710060119629, "logits/rejected": -0.6348710060119629, "logps/chosen": -24.991207122802734, "logps/rejected": -24.991207122802734, "loss": 0.3784, "rewards/accuracies": 0.0, "rewards/chosen": 0.880313515663147, "rewards/margins": 0.0, "rewards/rejected": 0.880313515663147, "step": 7211 }, { "epoch": 1.17, "learning_rate": 6.468392658050112e-07, "logits/chosen": -0.6053948998451233, "logits/rejected": -0.6053948998451233, "logps/chosen": -76.38126373291016, "logps/rejected": -76.38126373291016, "loss": 0.5076, "rewards/accuracies": 0.0, "rewards/chosen": 3.7005813121795654, "rewards/margins": 0.0, "rewards/rejected": 3.7005813121795654, "step": 7212 }, { "epoch": 1.17, "learning_rate": 6.46713630640795e-07, "logits/chosen": -0.8785231113433838, "logits/rejected": -0.8442509174346924, "logps/chosen": -49.291587829589844, "logps/rejected": -61.129791259765625, "loss": 0.2907, "rewards/accuracies": 1.0, "rewards/chosen": 1.6759697198867798, "rewards/margins": 0.3048408031463623, "rewards/rejected": 1.3711289167404175, "step": 7213 }, { "epoch": 1.17, "learning_rate": 6.465879853400552e-07, "logits/chosen": -0.9212061166763306, "logits/rejected": -0.9386858940124512, "logps/chosen": -59.19645690917969, "logps/rejected": -116.68190002441406, "loss": 0.6003, "rewards/accuracies": 1.0, "rewards/chosen": 2.3949387073516846, "rewards/margins": 0.6947739124298096, "rewards/rejected": 1.700164794921875, "step": 7214 }, { "epoch": 1.17, "learning_rate": 6.464623299114727e-07, "logits/chosen": -0.7642978429794312, "logits/rejected": -0.8070737719535828, "logps/chosen": -63.608123779296875, "logps/rejected": -30.6192684173584, "loss": 0.6382, "rewards/accuracies": 0.0, "rewards/chosen": 1.1534897089004517, "rewards/margins": -0.7463724613189697, "rewards/rejected": 1.8998621702194214, "step": 7215 }, { "epoch": 1.17, "learning_rate": 6.463366643637289e-07, "logits/chosen": -1.0843024253845215, "logits/rejected": -0.9647015333175659, "logps/chosen": -119.43685150146484, "logps/rejected": -146.27374267578125, "loss": 2.6341, "rewards/accuracies": 0.0, "rewards/chosen": 4.602266788482666, "rewards/margins": -3.2438912391662598, "rewards/rejected": 7.846158027648926, "step": 7216 }, { "epoch": 1.17, "learning_rate": 6.462109887055062e-07, "logits/chosen": -0.8560471534729004, "logits/rejected": -0.9699988961219788, "logps/chosen": -114.88768768310547, "logps/rejected": -111.85820007324219, "loss": 2.7452, "rewards/accuracies": 0.0, "rewards/chosen": 2.217331647872925, "rewards/margins": -5.442009925842285, "rewards/rejected": 7.659341335296631, "step": 7217 }, { "epoch": 1.17, "learning_rate": 6.460853029454878e-07, "logits/chosen": -0.46949195861816406, "logits/rejected": -0.46949195861816406, "logps/chosen": -80.77641296386719, "logps/rejected": -80.77641296386719, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 3.0866806507110596, "rewards/margins": 0.0, "rewards/rejected": 3.0866806507110596, "step": 7218 }, { "epoch": 1.17, "learning_rate": 6.459596070923572e-07, "logits/chosen": -0.6725666522979736, "logits/rejected": -0.657960057258606, "logps/chosen": -26.968854904174805, "logps/rejected": -34.08680725097656, "loss": 0.3788, "rewards/accuracies": 0.0, "rewards/chosen": 1.2101503610610962, "rewards/margins": -0.011329412460327148, "rewards/rejected": 1.2214797735214233, "step": 7219 }, { "epoch": 1.17, "learning_rate": 6.458339011547987e-07, "logits/chosen": -1.036266565322876, "logits/rejected": -0.9547681212425232, "logps/chosen": -79.8172607421875, "logps/rejected": -76.11942291259766, "loss": 1.3271, "rewards/accuracies": 1.0, "rewards/chosen": 4.490833282470703, "rewards/margins": 1.8087058067321777, "rewards/rejected": 2.6821274757385254, "step": 7220 }, { "epoch": 1.17, "learning_rate": 6.457081851414977e-07, "logits/chosen": -0.36932313442230225, "logits/rejected": -0.47549936175346375, "logps/chosen": -222.86587524414062, "logps/rejected": -135.55686950683594, "loss": 0.5314, "rewards/accuracies": 0.0, "rewards/chosen": 2.8013672828674316, "rewards/margins": -0.6153793334960938, "rewards/rejected": 3.4167466163635254, "step": 7221 }, { "epoch": 1.17, "learning_rate": 6.455824590611397e-07, "logits/chosen": -0.6550471186637878, "logits/rejected": -0.6281454563140869, "logps/chosen": -24.52960968017578, "logps/rejected": -8.199431419372559, "loss": 0.5659, "rewards/accuracies": 1.0, "rewards/chosen": 0.8208217620849609, "rewards/margins": 0.22507244348526, "rewards/rejected": 0.5957493185997009, "step": 7222 }, { "epoch": 1.17, "learning_rate": 6.454567229224113e-07, "logits/chosen": -0.4497547447681427, "logits/rejected": -0.41143599152565, "logps/chosen": -122.4803695678711, "logps/rejected": -114.04061126708984, "loss": 0.2645, "rewards/accuracies": 1.0, "rewards/chosen": 4.286234378814697, "rewards/margins": 0.955819845199585, "rewards/rejected": 3.3304145336151123, "step": 7223 }, { "epoch": 1.17, "learning_rate": 6.453309767339997e-07, "logits/chosen": -0.3484063446521759, "logits/rejected": -0.330621600151062, "logps/chosen": -61.551414489746094, "logps/rejected": -37.319053649902344, "loss": 1.4758, "rewards/accuracies": 0.0, "rewards/chosen": 0.8089515566825867, "rewards/margins": -0.29879993200302124, "rewards/rejected": 1.107751488685608, "step": 7224 }, { "epoch": 1.17, "learning_rate": 6.452052205045928e-07, "logits/chosen": -0.6656802892684937, "logits/rejected": -0.5865824222564697, "logps/chosen": -35.1048583984375, "logps/rejected": -71.25509643554688, "loss": 0.4539, "rewards/accuracies": 1.0, "rewards/chosen": 2.5171570777893066, "rewards/margins": 1.4290467500686646, "rewards/rejected": 1.088110327720642, "step": 7225 }, { "epoch": 1.17, "learning_rate": 6.450794542428791e-07, "logits/chosen": -0.8966795802116394, "logits/rejected": -0.7298879027366638, "logps/chosen": -137.47512817382812, "logps/rejected": -39.41789627075195, "loss": 0.1064, "rewards/accuracies": 1.0, "rewards/chosen": 4.334168910980225, "rewards/margins": 4.058041095733643, "rewards/rejected": 0.27612802386283875, "step": 7226 }, { "epoch": 1.17, "learning_rate": 6.449536779575477e-07, "logits/chosen": -0.7786207795143127, "logits/rejected": -0.6667641401290894, "logps/chosen": -51.01754379272461, "logps/rejected": -29.240943908691406, "loss": 0.7964, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879936575889587, "rewards/margins": 0.2493465542793274, "rewards/rejected": 0.7386471033096313, "step": 7227 }, { "epoch": 1.17, "learning_rate": 6.448278916572889e-07, "logits/chosen": -0.622847318649292, "logits/rejected": -0.5082105994224548, "logps/chosen": -84.60834503173828, "logps/rejected": -32.129730224609375, "loss": 0.8913, "rewards/accuracies": 1.0, "rewards/chosen": 1.996436357498169, "rewards/margins": 1.6999931335449219, "rewards/rejected": 0.2964431941509247, "step": 7228 }, { "epoch": 1.17, "learning_rate": 6.447020953507931e-07, "logits/chosen": -0.5917068719863892, "logits/rejected": -0.5930123925209045, "logps/chosen": -1.6137683391571045, "logps/rejected": -2.0963568687438965, "loss": 0.7717, "rewards/accuracies": 0.0, "rewards/chosen": 0.2886662185192108, "rewards/margins": -0.0657561719417572, "rewards/rejected": 0.354422390460968, "step": 7229 }, { "epoch": 1.17, "learning_rate": 6.445762890467517e-07, "logits/chosen": -0.8065885901451111, "logits/rejected": -0.8235552906990051, "logps/chosen": -58.79184341430664, "logps/rejected": -61.25933074951172, "loss": 0.5674, "rewards/accuracies": 0.0, "rewards/chosen": 1.4810177087783813, "rewards/margins": -0.7244366407394409, "rewards/rejected": 2.2054543495178223, "step": 7230 }, { "epoch": 1.17, "learning_rate": 6.444504727538566e-07, "logits/chosen": -0.7185217142105103, "logits/rejected": -0.7551965117454529, "logps/chosen": -55.85585021972656, "logps/rejected": -48.6171760559082, "loss": 0.8116, "rewards/accuracies": 0.0, "rewards/chosen": 1.3803573846817017, "rewards/margins": -0.3618190288543701, "rewards/rejected": 1.7421764135360718, "step": 7231 }, { "epoch": 1.17, "learning_rate": 6.443246464808007e-07, "logits/chosen": -0.6752306222915649, "logits/rejected": -0.6752306222915649, "logps/chosen": -38.16194152832031, "logps/rejected": -38.16194152832031, "loss": 0.9693, "rewards/accuracies": 0.0, "rewards/chosen": 1.6762245893478394, "rewards/margins": 0.0, "rewards/rejected": 1.6762245893478394, "step": 7232 }, { "epoch": 1.17, "learning_rate": 6.441988102362774e-07, "logits/chosen": -0.6340345144271851, "logits/rejected": -0.658339262008667, "logps/chosen": -114.08149719238281, "logps/rejected": -70.94574737548828, "loss": 0.6915, "rewards/accuracies": 0.0, "rewards/chosen": 2.021052598953247, "rewards/margins": -0.8739204406738281, "rewards/rejected": 2.894973039627075, "step": 7233 }, { "epoch": 1.17, "learning_rate": 6.440729640289808e-07, "logits/chosen": -0.706921398639679, "logits/rejected": -0.6047802567481995, "logps/chosen": -276.8858642578125, "logps/rejected": -66.60910034179688, "loss": 0.3855, "rewards/accuracies": 1.0, "rewards/chosen": 3.9159302711486816, "rewards/margins": 3.1336045265197754, "rewards/rejected": 0.7823257446289062, "step": 7234 }, { "epoch": 1.17, "learning_rate": 6.439471078676056e-07, "logits/chosen": -0.5586422681808472, "logits/rejected": -0.4615846574306488, "logps/chosen": -46.34564971923828, "logps/rejected": -81.32090759277344, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 2.4583137035369873, "rewards/margins": 1.293918490409851, "rewards/rejected": 1.1643952131271362, "step": 7235 }, { "epoch": 1.17, "learning_rate": 6.438212417608472e-07, "logits/chosen": -0.6705511212348938, "logits/rejected": -0.66398686170578, "logps/chosen": -101.78316497802734, "logps/rejected": -67.76318359375, "loss": 0.3868, "rewards/accuracies": 0.0, "rewards/chosen": 1.7714706659317017, "rewards/margins": -0.1500610113143921, "rewards/rejected": 1.9215316772460938, "step": 7236 }, { "epoch": 1.17, "learning_rate": 6.436953657174017e-07, "logits/chosen": -0.4202907979488373, "logits/rejected": -0.415254145860672, "logps/chosen": -24.432117462158203, "logps/rejected": -59.10242462158203, "loss": 0.7958, "rewards/accuracies": 1.0, "rewards/chosen": 0.5050655603408813, "rewards/margins": 0.5237137079238892, "rewards/rejected": -0.018648147583007812, "step": 7237 }, { "epoch": 1.17, "learning_rate": 6.435694797459664e-07, "logits/chosen": -0.5818698406219482, "logits/rejected": -0.6058077216148376, "logps/chosen": -58.97210693359375, "logps/rejected": -98.69804382324219, "loss": 0.7328, "rewards/accuracies": 1.0, "rewards/chosen": 1.2649872303009033, "rewards/margins": 0.1953277587890625, "rewards/rejected": 1.0696594715118408, "step": 7238 }, { "epoch": 1.17, "learning_rate": 6.434435838552383e-07, "logits/chosen": -0.5413532257080078, "logits/rejected": -0.44070860743522644, "logps/chosen": -277.0653381347656, "logps/rejected": -79.87596893310547, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 4.250357151031494, "rewards/margins": 2.543158769607544, "rewards/rejected": 1.7071983814239502, "step": 7239 }, { "epoch": 1.18, "learning_rate": 6.43317678053916e-07, "logits/chosen": -0.6345229148864746, "logits/rejected": -0.6345229148864746, "logps/chosen": -73.792724609375, "logps/rejected": -73.792724609375, "loss": 0.407, "rewards/accuracies": 0.0, "rewards/chosen": 2.335569143295288, "rewards/margins": 0.0, "rewards/rejected": 2.335569143295288, "step": 7240 }, { "epoch": 1.18, "learning_rate": 6.43191762350698e-07, "logits/chosen": -0.4923029839992523, "logits/rejected": -0.6460250020027161, "logps/chosen": -58.508731842041016, "logps/rejected": -62.491939544677734, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 0.8903781771659851, "rewards/margins": 0.043711066246032715, "rewards/rejected": 0.8466671109199524, "step": 7241 }, { "epoch": 1.18, "learning_rate": 6.430658367542843e-07, "logits/chosen": -0.39639905095100403, "logits/rejected": -0.45743250846862793, "logps/chosen": -64.96028900146484, "logps/rejected": -107.50288391113281, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 1.2155815362930298, "rewards/margins": 0.14020311832427979, "rewards/rejected": 1.07537841796875, "step": 7242 }, { "epoch": 1.18, "learning_rate": 6.429399012733749e-07, "logits/chosen": -0.8504582047462463, "logits/rejected": -0.788061261177063, "logps/chosen": -79.13361358642578, "logps/rejected": -95.27867889404297, "loss": 0.8897, "rewards/accuracies": 0.0, "rewards/chosen": 1.6484276056289673, "rewards/margins": -1.372322916984558, "rewards/rejected": 3.0207505226135254, "step": 7243 }, { "epoch": 1.18, "learning_rate": 6.428139559166707e-07, "logits/chosen": -0.5342184901237488, "logits/rejected": -0.5554108023643494, "logps/chosen": -102.43650817871094, "logps/rejected": -97.48056030273438, "loss": 0.5473, "rewards/accuracies": 0.0, "rewards/chosen": 1.2011559009552002, "rewards/margins": -0.28704607486724854, "rewards/rejected": 1.4882019758224487, "step": 7244 }, { "epoch": 1.18, "learning_rate": 6.426880006928738e-07, "logits/chosen": -0.38907384872436523, "logits/rejected": -0.3870145380496979, "logps/chosen": -1.2322052717208862, "logps/rejected": -4.272550582885742, "loss": 2.0026, "rewards/accuracies": 0.0, "rewards/chosen": 0.3114255368709564, "rewards/margins": -0.07448193430900574, "rewards/rejected": 0.38590747117996216, "step": 7245 }, { "epoch": 1.18, "learning_rate": 6.42562035610686e-07, "logits/chosen": -0.9311854839324951, "logits/rejected": -0.8472182154655457, "logps/chosen": -189.17294311523438, "logps/rejected": -17.905548095703125, "loss": 0.0786, "rewards/accuracies": 1.0, "rewards/chosen": 5.468867778778076, "rewards/margins": 5.145125389099121, "rewards/rejected": 0.3237423002719879, "step": 7246 }, { "epoch": 1.18, "learning_rate": 6.424360606788104e-07, "logits/chosen": -0.45622265338897705, "logits/rejected": -0.5109761357307434, "logps/chosen": -116.4655532836914, "logps/rejected": -89.85353088378906, "loss": 1.2494, "rewards/accuracies": 0.0, "rewards/chosen": 2.9566216468811035, "rewards/margins": -1.6260595321655273, "rewards/rejected": 4.582681179046631, "step": 7247 }, { "epoch": 1.18, "learning_rate": 6.423100759059509e-07, "logits/chosen": -0.9901790022850037, "logits/rejected": -0.9963074922561646, "logps/chosen": -51.72414779663086, "logps/rejected": -123.77350616455078, "loss": 0.5577, "rewards/accuracies": 0.0, "rewards/chosen": 0.987491250038147, "rewards/margins": -0.7055774927139282, "rewards/rejected": 1.6930687427520752, "step": 7248 }, { "epoch": 1.18, "learning_rate": 6.421840813008116e-07, "logits/chosen": -0.815237820148468, "logits/rejected": -0.9893472194671631, "logps/chosen": -139.60159301757812, "logps/rejected": -72.219970703125, "loss": 0.4697, "rewards/accuracies": 1.0, "rewards/chosen": 5.265742778778076, "rewards/margins": 1.1420297622680664, "rewards/rejected": 4.12371301651001, "step": 7249 }, { "epoch": 1.18, "learning_rate": 6.420580768720976e-07, "logits/chosen": -0.6760917901992798, "logits/rejected": -0.6972925662994385, "logps/chosen": -78.88394927978516, "logps/rejected": -70.83210754394531, "loss": 1.6479, "rewards/accuracies": 0.0, "rewards/chosen": 1.0666221380233765, "rewards/margins": -0.541201114654541, "rewards/rejected": 1.6078232526779175, "step": 7250 }, { "epoch": 1.18, "learning_rate": 6.419320626285147e-07, "logits/chosen": -0.9169003367424011, "logits/rejected": -0.6898808479309082, "logps/chosen": -161.1755828857422, "logps/rejected": -34.96847152709961, "loss": 0.92, "rewards/accuracies": 1.0, "rewards/chosen": 4.479637145996094, "rewards/margins": 3.30218768119812, "rewards/rejected": 1.1774494647979736, "step": 7251 }, { "epoch": 1.18, "learning_rate": 6.418060385787694e-07, "logits/chosen": -0.19270402193069458, "logits/rejected": -0.23573154211044312, "logps/chosen": -7.368488788604736, "logps/rejected": -59.12344741821289, "loss": 0.7865, "rewards/accuracies": 0.0, "rewards/chosen": 0.3372473418712616, "rewards/margins": -0.02769559621810913, "rewards/rejected": 0.3649429380893707, "step": 7252 }, { "epoch": 1.18, "learning_rate": 6.416800047315686e-07, "logits/chosen": -0.9003270864486694, "logits/rejected": -0.8447041511535645, "logps/chosen": -40.33905029296875, "logps/rejected": -50.043758392333984, "loss": 0.4398, "rewards/accuracies": 1.0, "rewards/chosen": 1.8946212530136108, "rewards/margins": 0.5480403900146484, "rewards/rejected": 1.3465808629989624, "step": 7253 }, { "epoch": 1.18, "learning_rate": 6.415539610956198e-07, "logits/chosen": -0.41196906566619873, "logits/rejected": -0.37314140796661377, "logps/chosen": -64.75730895996094, "logps/rejected": -54.985137939453125, "loss": 1.4823, "rewards/accuracies": 0.0, "rewards/chosen": 1.7670074701309204, "rewards/margins": -0.1300903558731079, "rewards/rejected": 1.8970978260040283, "step": 7254 }, { "epoch": 1.18, "learning_rate": 6.414279076796319e-07, "logits/chosen": -0.8907154202461243, "logits/rejected": -0.9337854385375977, "logps/chosen": -145.94772338867188, "logps/rejected": -114.44760131835938, "loss": 0.4705, "rewards/accuracies": 0.0, "rewards/chosen": 6.219668865203857, "rewards/margins": -0.4294281005859375, "rewards/rejected": 6.649096965789795, "step": 7255 }, { "epoch": 1.18, "learning_rate": 6.413018444923137e-07, "logits/chosen": -0.7076898813247681, "logits/rejected": -0.6111306548118591, "logps/chosen": -127.54512023925781, "logps/rejected": -95.60435485839844, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 6.231143474578857, "rewards/margins": 3.923614025115967, "rewards/rejected": 2.3075294494628906, "step": 7256 }, { "epoch": 1.18, "learning_rate": 6.411757715423751e-07, "logits/chosen": -0.7257171273231506, "logits/rejected": -0.6818400025367737, "logps/chosen": -77.18519592285156, "logps/rejected": -41.754669189453125, "loss": 0.4827, "rewards/accuracies": 1.0, "rewards/chosen": 1.9007385969161987, "rewards/margins": 0.49508941173553467, "rewards/rejected": 1.405649185180664, "step": 7257 }, { "epoch": 1.18, "learning_rate": 6.410496888385265e-07, "logits/chosen": -0.8375307321548462, "logits/rejected": -0.8069821000099182, "logps/chosen": -133.74346923828125, "logps/rejected": -180.79190063476562, "loss": 0.394, "rewards/accuracies": 0.0, "rewards/chosen": 5.778921604156494, "rewards/margins": -0.09718036651611328, "rewards/rejected": 5.876101970672607, "step": 7258 }, { "epoch": 1.18, "learning_rate": 6.40923596389479e-07, "logits/chosen": -0.9281277060508728, "logits/rejected": -0.8285238742828369, "logps/chosen": -65.84664916992188, "logps/rejected": -122.478271484375, "loss": 2.3536, "rewards/accuracies": 0.0, "rewards/chosen": 3.3500564098358154, "rewards/margins": -2.892051935195923, "rewards/rejected": 6.242108345031738, "step": 7259 }, { "epoch": 1.18, "learning_rate": 6.407974942039444e-07, "logits/chosen": -0.2998313307762146, "logits/rejected": -0.29448869824409485, "logps/chosen": -0.6232094764709473, "logps/rejected": -31.21405029296875, "loss": 0.8211, "rewards/accuracies": 1.0, "rewards/chosen": 0.3300130367279053, "rewards/margins": 0.33225855231285095, "rewards/rejected": -0.0022455216385424137, "step": 7260 }, { "epoch": 1.18, "learning_rate": 6.406713822906352e-07, "logits/chosen": -0.7269232273101807, "logits/rejected": -0.6754172444343567, "logps/chosen": -78.74838256835938, "logps/rejected": -69.8504638671875, "loss": 0.507, "rewards/accuracies": 0.0, "rewards/chosen": 2.758010149002075, "rewards/margins": -0.3478691577911377, "rewards/rejected": 3.105879306793213, "step": 7261 }, { "epoch": 1.18, "learning_rate": 6.405452606582647e-07, "logits/chosen": -0.5858076810836792, "logits/rejected": -0.6854051947593689, "logps/chosen": -93.07049560546875, "logps/rejected": -175.98361206054688, "loss": 0.8334, "rewards/accuracies": 0.0, "rewards/chosen": 1.1819305419921875, "rewards/margins": -1.222930908203125, "rewards/rejected": 2.4048614501953125, "step": 7262 }, { "epoch": 1.18, "learning_rate": 6.404191293155463e-07, "logits/chosen": -0.6610153913497925, "logits/rejected": -0.6939724087715149, "logps/chosen": -58.67021942138672, "logps/rejected": -48.27394104003906, "loss": 0.5679, "rewards/accuracies": 0.0, "rewards/chosen": 1.1171661615371704, "rewards/margins": -0.29442715644836426, "rewards/rejected": 1.4115933179855347, "step": 7263 }, { "epoch": 1.18, "learning_rate": 6.402929882711948e-07, "logits/chosen": -0.9035254716873169, "logits/rejected": -0.8131615519523621, "logps/chosen": -88.24339294433594, "logps/rejected": -79.26303100585938, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 5.930426120758057, "rewards/margins": 3.5082550048828125, "rewards/rejected": 2.422171115875244, "step": 7264 }, { "epoch": 1.18, "learning_rate": 6.401668375339253e-07, "logits/chosen": -0.6027806997299194, "logits/rejected": -0.6201696991920471, "logps/chosen": -56.89301681518555, "logps/rejected": -35.722572326660156, "loss": 2.0253, "rewards/accuracies": 1.0, "rewards/chosen": 2.4793269634246826, "rewards/margins": 0.6357913017272949, "rewards/rejected": 1.8435356616973877, "step": 7265 }, { "epoch": 1.18, "learning_rate": 6.400406771124535e-07, "logits/chosen": -0.6474614143371582, "logits/rejected": -0.5906745195388794, "logps/chosen": -72.1553726196289, "logps/rejected": -46.011085510253906, "loss": 0.84, "rewards/accuracies": 1.0, "rewards/chosen": 1.8139839172363281, "rewards/margins": 0.18614161014556885, "rewards/rejected": 1.6278423070907593, "step": 7266 }, { "epoch": 1.18, "learning_rate": 6.39914507015496e-07, "logits/chosen": -0.9351075291633606, "logits/rejected": -0.8195149898529053, "logps/chosen": -126.59232330322266, "logps/rejected": -216.88568115234375, "loss": 2.0125, "rewards/accuracies": 0.0, "rewards/chosen": 2.1344292163848877, "rewards/margins": -3.960974931716919, "rewards/rejected": 6.095404148101807, "step": 7267 }, { "epoch": 1.18, "learning_rate": 6.397883272517702e-07, "logits/chosen": -0.8539945483207703, "logits/rejected": -0.8148240447044373, "logps/chosen": -66.02615356445312, "logps/rejected": -35.777488708496094, "loss": 0.2941, "rewards/accuracies": 1.0, "rewards/chosen": 2.864879608154297, "rewards/margins": 0.2483515739440918, "rewards/rejected": 2.616528034210205, "step": 7268 }, { "epoch": 1.18, "learning_rate": 6.396621378299934e-07, "logits/chosen": -0.6207334995269775, "logits/rejected": -0.7298477292060852, "logps/chosen": -257.92791748046875, "logps/rejected": -248.1902313232422, "loss": 2.9893, "rewards/accuracies": 0.0, "rewards/chosen": 3.405752658843994, "rewards/margins": -5.298243999481201, "rewards/rejected": 8.703996658325195, "step": 7269 }, { "epoch": 1.18, "learning_rate": 6.395359387588845e-07, "logits/chosen": -0.18464615941047668, "logits/rejected": -0.18390816450119019, "logps/chosen": -11.134407997131348, "logps/rejected": -1.8097761869430542, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.2660866677761078, "rewards/margins": -0.008672058582305908, "rewards/rejected": 0.2747587263584137, "step": 7270 }, { "epoch": 1.18, "learning_rate": 6.394097300471625e-07, "logits/chosen": -0.8609353303909302, "logits/rejected": -0.9411550760269165, "logps/chosen": -97.07337188720703, "logps/rejected": -123.88565063476562, "loss": 3.6504, "rewards/accuracies": 0.0, "rewards/chosen": 1.4517936706542969, "rewards/margins": -5.1783061027526855, "rewards/rejected": 6.630099773406982, "step": 7271 }, { "epoch": 1.18, "learning_rate": 6.392835117035471e-07, "logits/chosen": -1.0300865173339844, "logits/rejected": -0.9766458868980408, "logps/chosen": -45.101341247558594, "logps/rejected": -52.320274353027344, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 1.0945205688476562, "rewards/margins": -1.0301156044006348, "rewards/rejected": 2.124636173248291, "step": 7272 }, { "epoch": 1.18, "learning_rate": 6.391572837367591e-07, "logits/chosen": -0.7959709167480469, "logits/rejected": -0.6629595756530762, "logps/chosen": -175.04669189453125, "logps/rejected": -200.03533935546875, "loss": 1.1339, "rewards/accuracies": 0.0, "rewards/chosen": 1.3128341436386108, "rewards/margins": -0.09970855712890625, "rewards/rejected": 1.412542700767517, "step": 7273 }, { "epoch": 1.18, "learning_rate": 6.390310461555195e-07, "logits/chosen": -0.6253038644790649, "logits/rejected": -0.5535422563552856, "logps/chosen": -79.63751220703125, "logps/rejected": -47.60148620605469, "loss": 0.3629, "rewards/accuracies": 1.0, "rewards/chosen": 1.7602417469024658, "rewards/margins": 0.698184609413147, "rewards/rejected": 1.0620571374893188, "step": 7274 }, { "epoch": 1.18, "learning_rate": 6.389047989685502e-07, "logits/chosen": -0.6849215030670166, "logits/rejected": -0.6567279696464539, "logps/chosen": -76.34259033203125, "logps/rejected": -60.891265869140625, "loss": 0.6189, "rewards/accuracies": 0.0, "rewards/chosen": 2.2506768703460693, "rewards/margins": -0.3372335433959961, "rewards/rejected": 2.5879104137420654, "step": 7275 }, { "epoch": 1.18, "learning_rate": 6.387785421845735e-07, "logits/chosen": -1.1154531240463257, "logits/rejected": -1.0931169986724854, "logps/chosen": -65.7010498046875, "logps/rejected": -86.90677642822266, "loss": 0.5543, "rewards/accuracies": 0.0, "rewards/chosen": 1.461584448814392, "rewards/margins": -0.24232566356658936, "rewards/rejected": 1.7039101123809814, "step": 7276 }, { "epoch": 1.18, "learning_rate": 6.386522758123128e-07, "logits/chosen": -0.15518155694007874, "logits/rejected": -0.21523357927799225, "logps/chosen": -69.94578552246094, "logps/rejected": -115.79808044433594, "loss": 0.7673, "rewards/accuracies": 1.0, "rewards/chosen": 0.13030853867530823, "rewards/margins": 0.09659348428249359, "rewards/rejected": 0.033715058118104935, "step": 7277 }, { "epoch": 1.18, "learning_rate": 6.385259998604918e-07, "logits/chosen": -0.4651091396808624, "logits/rejected": -0.4651091396808624, "logps/chosen": -44.8058967590332, "logps/rejected": -44.8058967590332, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.4199669063091278, "rewards/margins": 0.0, "rewards/rejected": 0.4199669063091278, "step": 7278 }, { "epoch": 1.18, "learning_rate": 6.383997143378348e-07, "logits/chosen": -0.6545481085777283, "logits/rejected": -0.654276967048645, "logps/chosen": -83.19581604003906, "logps/rejected": -132.22433471679688, "loss": 0.1966, "rewards/accuracies": 1.0, "rewards/chosen": 0.7085952758789062, "rewards/margins": 1.4178740978240967, "rewards/rejected": -0.7092788815498352, "step": 7279 }, { "epoch": 1.18, "learning_rate": 6.382734192530672e-07, "logits/chosen": -0.9203678965568542, "logits/rejected": -0.8741005659103394, "logps/chosen": -107.3743896484375, "logps/rejected": -63.59792709350586, "loss": 0.2811, "rewards/accuracies": 1.0, "rewards/chosen": 2.6609954833984375, "rewards/margins": 0.41594958305358887, "rewards/rejected": 2.2450459003448486, "step": 7280 }, { "epoch": 1.18, "learning_rate": 6.381471146149147e-07, "logits/chosen": -1.7477823495864868, "logits/rejected": -1.8287168741226196, "logps/chosen": -166.3163604736328, "logps/rejected": -125.87928009033203, "loss": 0.2056, "rewards/accuracies": 1.0, "rewards/chosen": 5.006248474121094, "rewards/margins": 0.989809513092041, "rewards/rejected": 4.016438961029053, "step": 7281 }, { "epoch": 1.18, "learning_rate": 6.380208004321036e-07, "logits/chosen": -0.6677022576332092, "logits/rejected": -0.683866560459137, "logps/chosen": -44.715576171875, "logps/rejected": -7.257296085357666, "loss": 0.4361, "rewards/accuracies": 0.0, "rewards/chosen": 0.17616768181324005, "rewards/margins": -0.04801984131336212, "rewards/rejected": 0.22418752312660217, "step": 7282 }, { "epoch": 1.18, "learning_rate": 6.378944767133612e-07, "logits/chosen": -0.8801170587539673, "logits/rejected": -0.8475872278213501, "logps/chosen": -120.23289489746094, "logps/rejected": -84.62381744384766, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 4.773886203765869, "rewards/margins": 2.871941566467285, "rewards/rejected": 1.9019447565078735, "step": 7283 }, { "epoch": 1.18, "learning_rate": 6.377681434674153e-07, "logits/chosen": -0.6540279388427734, "logits/rejected": -0.6540279388427734, "logps/chosen": -62.63684844970703, "logps/rejected": -62.63684844970703, "loss": 0.3606, "rewards/accuracies": 0.0, "rewards/chosen": 1.6240501403808594, "rewards/margins": 0.0, "rewards/rejected": 1.6240501403808594, "step": 7284 }, { "epoch": 1.18, "learning_rate": 6.376418007029943e-07, "logits/chosen": -0.9546438455581665, "logits/rejected": -0.9732710123062134, "logps/chosen": -74.20719146728516, "logps/rejected": -114.20936584472656, "loss": 1.5176, "rewards/accuracies": 0.0, "rewards/chosen": 0.9605469107627869, "rewards/margins": -1.51715087890625, "rewards/rejected": 2.4776978492736816, "step": 7285 }, { "epoch": 1.18, "learning_rate": 6.375154484288272e-07, "logits/chosen": -0.7891829013824463, "logits/rejected": -0.6256623268127441, "logps/chosen": -132.620361328125, "logps/rejected": -48.29188919067383, "loss": 1.7565, "rewards/accuracies": 0.0, "rewards/chosen": 0.832745373249054, "rewards/margins": -0.9127712845802307, "rewards/rejected": 1.7455166578292847, "step": 7286 }, { "epoch": 1.18, "learning_rate": 6.373890866536438e-07, "logits/chosen": -0.5365033745765686, "logits/rejected": -0.3204193413257599, "logps/chosen": -90.57030487060547, "logps/rejected": -24.211261749267578, "loss": 0.0215, "rewards/accuracies": 1.0, "rewards/chosen": 4.6812052726745605, "rewards/margins": 4.3792524337768555, "rewards/rejected": 0.3019527494907379, "step": 7287 }, { "epoch": 1.18, "learning_rate": 6.372627153861745e-07, "logits/chosen": -0.5760272145271301, "logits/rejected": -0.5444846153259277, "logps/chosen": -52.97640609741211, "logps/rejected": -56.54521942138672, "loss": 0.9059, "rewards/accuracies": 1.0, "rewards/chosen": 1.947964072227478, "rewards/margins": 0.3518252372741699, "rewards/rejected": 1.596138834953308, "step": 7288 }, { "epoch": 1.18, "learning_rate": 6.371363346351504e-07, "logits/chosen": -0.6085616946220398, "logits/rejected": -0.5515183806419373, "logps/chosen": -86.85490417480469, "logps/rejected": -121.30691528320312, "loss": 2.2979, "rewards/accuracies": 0.0, "rewards/chosen": 2.3350601196289062, "rewards/margins": -1.751652717590332, "rewards/rejected": 4.086712837219238, "step": 7289 }, { "epoch": 1.18, "learning_rate": 6.370099444093031e-07, "logits/chosen": -0.23453374207019806, "logits/rejected": -0.2261454164981842, "logps/chosen": -46.32557678222656, "logps/rejected": -46.229522705078125, "loss": 0.8454, "rewards/accuracies": 0.0, "rewards/chosen": 1.9395942687988281, "rewards/margins": -0.2767312526702881, "rewards/rejected": 2.216325521469116, "step": 7290 }, { "epoch": 1.18, "learning_rate": 6.36883544717365e-07, "logits/chosen": -0.6502240896224976, "logits/rejected": -0.6814544796943665, "logps/chosen": -70.14291381835938, "logps/rejected": -124.86860656738281, "loss": 1.2353, "rewards/accuracies": 0.0, "rewards/chosen": 2.0654945373535156, "rewards/margins": -1.535886526107788, "rewards/rejected": 3.6013810634613037, "step": 7291 }, { "epoch": 1.18, "learning_rate": 6.367571355680692e-07, "logits/chosen": -0.3206002712249756, "logits/rejected": -0.3206002712249756, "logps/chosen": -2.954601764678955, "logps/rejected": -2.954601764678955, "loss": 0.3634, "rewards/accuracies": 0.0, "rewards/chosen": 0.13041634857654572, "rewards/margins": 0.0, "rewards/rejected": 0.13041634857654572, "step": 7292 }, { "epoch": 1.18, "learning_rate": 6.366307169701495e-07, "logits/chosen": -0.37207648158073425, "logits/rejected": -0.28160175681114197, "logps/chosen": -93.1854248046875, "logps/rejected": -49.67292785644531, "loss": 0.0713, "rewards/accuracies": 1.0, "rewards/chosen": 4.118068218231201, "rewards/margins": 2.553049087524414, "rewards/rejected": 1.5650192499160767, "step": 7293 }, { "epoch": 1.18, "learning_rate": 6.365042889323399e-07, "logits/chosen": -1.150617003440857, "logits/rejected": -0.9977452158927917, "logps/chosen": -121.94227600097656, "logps/rejected": -110.11312866210938, "loss": 0.1956, "rewards/accuracies": 1.0, "rewards/chosen": 4.471646308898926, "rewards/margins": 1.3730897903442383, "rewards/rejected": 3.0985565185546875, "step": 7294 }, { "epoch": 1.18, "learning_rate": 6.363778514633755e-07, "logits/chosen": -0.5584102272987366, "logits/rejected": -0.5584102272987366, "logps/chosen": -90.59831237792969, "logps/rejected": -90.59831237792969, "loss": 0.4448, "rewards/accuracies": 0.0, "rewards/chosen": 1.3501938581466675, "rewards/margins": 0.0, "rewards/rejected": 1.3501938581466675, "step": 7295 }, { "epoch": 1.18, "learning_rate": 6.362514045719921e-07, "logits/chosen": -0.5304352641105652, "logits/rejected": -0.5328363180160522, "logps/chosen": -81.12008666992188, "logps/rejected": -49.47808837890625, "loss": 0.4621, "rewards/accuracies": 1.0, "rewards/chosen": 1.217267632484436, "rewards/margins": 0.5715622305870056, "rewards/rejected": 0.6457054018974304, "step": 7296 }, { "epoch": 1.18, "learning_rate": 6.361249482669258e-07, "logits/chosen": -0.8257681131362915, "logits/rejected": -0.5864273905754089, "logps/chosen": -110.09246826171875, "logps/rejected": -57.99415588378906, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 5.052694797515869, "rewards/margins": 2.0949974060058594, "rewards/rejected": 2.9576973915100098, "step": 7297 }, { "epoch": 1.18, "learning_rate": 6.359984825569137e-07, "logits/chosen": -0.38844597339630127, "logits/rejected": -0.38844597339630127, "logps/chosen": -30.82833480834961, "logps/rejected": -30.82833480834961, "loss": 1.563, "rewards/accuracies": 0.0, "rewards/chosen": -0.04249267652630806, "rewards/margins": 0.0, "rewards/rejected": -0.04249267652630806, "step": 7298 }, { "epoch": 1.18, "learning_rate": 6.358720074506931e-07, "logits/chosen": -0.8692054748535156, "logits/rejected": -0.865567684173584, "logps/chosen": -66.58876037597656, "logps/rejected": -47.25794982910156, "loss": 0.5378, "rewards/accuracies": 1.0, "rewards/chosen": 2.894240617752075, "rewards/margins": 0.05993199348449707, "rewards/rejected": 2.834308624267578, "step": 7299 }, { "epoch": 1.18, "learning_rate": 6.357455229570026e-07, "logits/chosen": -0.7652462720870972, "logits/rejected": -0.7807176113128662, "logps/chosen": -80.4924545288086, "logps/rejected": -114.32185363769531, "loss": 1.3681, "rewards/accuracies": 0.0, "rewards/chosen": 1.5105987787246704, "rewards/margins": -0.23404085636138916, "rewards/rejected": 1.7446396350860596, "step": 7300 }, { "epoch": 1.19, "learning_rate": 6.356190290845809e-07, "logits/chosen": -0.577440083026886, "logits/rejected": -0.470050573348999, "logps/chosen": -86.19564819335938, "logps/rejected": -44.372589111328125, "loss": 0.8945, "rewards/accuracies": 0.0, "rewards/chosen": 0.17878571152687073, "rewards/margins": -0.6888847351074219, "rewards/rejected": 0.867670476436615, "step": 7301 }, { "epoch": 1.19, "learning_rate": 6.354925258421675e-07, "logits/chosen": -0.4985731840133667, "logits/rejected": -0.4985731840133667, "logps/chosen": -47.89239501953125, "logps/rejected": -47.89239501953125, "loss": 0.8213, "rewards/accuracies": 0.0, "rewards/chosen": 0.846247136592865, "rewards/margins": 0.0, "rewards/rejected": 0.846247136592865, "step": 7302 }, { "epoch": 1.19, "learning_rate": 6.353660132385026e-07, "logits/chosen": -0.6697016954421997, "logits/rejected": -0.5909063220024109, "logps/chosen": -86.8871078491211, "logps/rejected": -15.517647743225098, "loss": 1.0057, "rewards/accuracies": 0.0, "rewards/chosen": -0.022621918469667435, "rewards/margins": -0.35076096653938293, "rewards/rejected": 0.3281390368938446, "step": 7303 }, { "epoch": 1.19, "learning_rate": 6.352394912823269e-07, "logits/chosen": -0.9524791836738586, "logits/rejected": -0.7484015226364136, "logps/chosen": -96.65428161621094, "logps/rejected": -38.55050277709961, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": 5.170945644378662, "rewards/margins": 3.8463845252990723, "rewards/rejected": 1.3245609998703003, "step": 7304 }, { "epoch": 1.19, "learning_rate": 6.351129599823821e-07, "logits/chosen": -0.5615952610969543, "logits/rejected": -0.5719007253646851, "logps/chosen": -93.79337310791016, "logps/rejected": -41.66358947753906, "loss": 0.6578, "rewards/accuracies": 1.0, "rewards/chosen": 1.2023719549179077, "rewards/margins": 0.14666211605072021, "rewards/rejected": 1.0557098388671875, "step": 7305 }, { "epoch": 1.19, "learning_rate": 6.349864193474103e-07, "logits/chosen": -0.4791944622993469, "logits/rejected": -0.33491820096969604, "logps/chosen": -69.61805725097656, "logps/rejected": -27.636539459228516, "loss": 0.1586, "rewards/accuracies": 1.0, "rewards/chosen": 1.5438026189804077, "rewards/margins": 1.404620885848999, "rewards/rejected": 0.1391817182302475, "step": 7306 }, { "epoch": 1.19, "learning_rate": 6.34859869386154e-07, "logits/chosen": -0.7614559531211853, "logits/rejected": -0.8417707085609436, "logps/chosen": -37.64522171020508, "logps/rejected": -199.56912231445312, "loss": 2.5503, "rewards/accuracies": 0.0, "rewards/chosen": 1.8599979877471924, "rewards/margins": -4.751330375671387, "rewards/rejected": 6.611328125, "step": 7307 }, { "epoch": 1.19, "learning_rate": 6.347333101073568e-07, "logits/chosen": -0.9110033512115479, "logits/rejected": -0.8411797285079956, "logps/chosen": -92.17649841308594, "logps/rejected": -40.7978515625, "loss": 0.5924, "rewards/accuracies": 1.0, "rewards/chosen": 1.8691673278808594, "rewards/margins": 1.7530514001846313, "rewards/rejected": 0.11611595004796982, "step": 7308 }, { "epoch": 1.19, "learning_rate": 6.34606741519763e-07, "logits/chosen": -0.6606946587562561, "logits/rejected": -0.5822980403900146, "logps/chosen": -74.05968475341797, "logps/rejected": -15.076922416687012, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": 3.167473554611206, "rewards/margins": 2.0864670276641846, "rewards/rejected": 1.0810065269470215, "step": 7309 }, { "epoch": 1.19, "learning_rate": 6.344801636321167e-07, "logits/chosen": -0.7827479839324951, "logits/rejected": -0.8396091461181641, "logps/chosen": -87.42412567138672, "logps/rejected": -85.4058837890625, "loss": 1.0222, "rewards/accuracies": 0.0, "rewards/chosen": 0.8324432373046875, "rewards/margins": -1.1247665882110596, "rewards/rejected": 1.957209825515747, "step": 7310 }, { "epoch": 1.19, "learning_rate": 6.343535764531638e-07, "logits/chosen": -0.8649649620056152, "logits/rejected": -1.032538652420044, "logps/chosen": -88.99497985839844, "logps/rejected": -200.5462188720703, "loss": 1.9222, "rewards/accuracies": 0.0, "rewards/chosen": 0.6761986017227173, "rewards/margins": -3.4082956314086914, "rewards/rejected": 4.084494113922119, "step": 7311 }, { "epoch": 1.19, "learning_rate": 6.342269799916499e-07, "logits/chosen": -0.6004531383514404, "logits/rejected": -0.6529483199119568, "logps/chosen": -82.40861511230469, "logps/rejected": -84.56744384765625, "loss": 0.5262, "rewards/accuracies": 0.0, "rewards/chosen": 1.0885604619979858, "rewards/margins": -0.5776505470275879, "rewards/rejected": 1.6662110090255737, "step": 7312 }, { "epoch": 1.19, "learning_rate": 6.341003742563218e-07, "logits/chosen": -0.8769139647483826, "logits/rejected": -0.8387244343757629, "logps/chosen": -62.31153106689453, "logps/rejected": -67.94231414794922, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": 1.7789268493652344, "rewards/margins": 0.3077148199081421, "rewards/rejected": 1.4712120294570923, "step": 7313 }, { "epoch": 1.19, "learning_rate": 6.339737592559266e-07, "logits/chosen": -0.9221694469451904, "logits/rejected": -0.9402856826782227, "logps/chosen": -60.01207733154297, "logps/rejected": -104.21310424804688, "loss": 0.7864, "rewards/accuracies": 0.0, "rewards/chosen": 0.609179675579071, "rewards/margins": -0.2601516842842102, "rewards/rejected": 0.8693313598632812, "step": 7314 }, { "epoch": 1.19, "learning_rate": 6.338471349992124e-07, "logits/chosen": -0.70750892162323, "logits/rejected": -0.6636096835136414, "logps/chosen": -40.31165313720703, "logps/rejected": -51.95769500732422, "loss": 0.5336, "rewards/accuracies": 1.0, "rewards/chosen": 3.0958542823791504, "rewards/margins": 0.3650169372558594, "rewards/rejected": 2.730837345123291, "step": 7315 }, { "epoch": 1.19, "learning_rate": 6.337205014949276e-07, "logits/chosen": -0.7442106008529663, "logits/rejected": -0.6469349265098572, "logps/chosen": -118.77088165283203, "logps/rejected": -70.24775695800781, "loss": 0.8141, "rewards/accuracies": 1.0, "rewards/chosen": 5.331197261810303, "rewards/margins": 3.280097007751465, "rewards/rejected": 2.051100254058838, "step": 7316 }, { "epoch": 1.19, "learning_rate": 6.335938587518215e-07, "logits/chosen": -0.5985927581787109, "logits/rejected": -0.5443174839019775, "logps/chosen": -91.5908203125, "logps/rejected": -31.75896453857422, "loss": 1.6088, "rewards/accuracies": 1.0, "rewards/chosen": 1.2791534662246704, "rewards/margins": 1.0489952564239502, "rewards/rejected": 0.2301582396030426, "step": 7317 }, { "epoch": 1.19, "learning_rate": 6.334672067786437e-07, "logits/chosen": -0.9646620750427246, "logits/rejected": -0.9605196118354797, "logps/chosen": -90.07536315917969, "logps/rejected": -76.30624389648438, "loss": 0.3565, "rewards/accuracies": 1.0, "rewards/chosen": 1.6970916986465454, "rewards/margins": 0.10860288143157959, "rewards/rejected": 1.5884888172149658, "step": 7318 }, { "epoch": 1.19, "learning_rate": 6.33340545584145e-07, "logits/chosen": -0.7701765894889832, "logits/rejected": -0.6777469515800476, "logps/chosen": -123.30916595458984, "logps/rejected": -144.8621826171875, "loss": 0.1956, "rewards/accuracies": 1.0, "rewards/chosen": 4.639856815338135, "rewards/margins": 0.7732734680175781, "rewards/rejected": 3.8665833473205566, "step": 7319 }, { "epoch": 1.19, "learning_rate": 6.332138751770761e-07, "logits/chosen": -0.5759009718894958, "logits/rejected": -0.5947877764701843, "logps/chosen": -53.48548889160156, "logps/rejected": -122.595458984375, "loss": 2.3606, "rewards/accuracies": 1.0, "rewards/chosen": 1.2634429931640625, "rewards/margins": 0.555926501750946, "rewards/rejected": 0.7075164914131165, "step": 7320 }, { "epoch": 1.19, "learning_rate": 6.33087195566189e-07, "logits/chosen": -0.14671675860881805, "logits/rejected": -0.19984933733940125, "logps/chosen": -3.9406228065490723, "logps/rejected": -48.24120330810547, "loss": 0.8725, "rewards/accuracies": 1.0, "rewards/chosen": 0.5893815755844116, "rewards/margins": 0.019704222679138184, "rewards/rejected": 0.5696773529052734, "step": 7321 }, { "epoch": 1.19, "learning_rate": 6.329605067602359e-07, "logits/chosen": -0.09241857379674911, "logits/rejected": -0.08849425613880157, "logps/chosen": -3.1492197513580322, "logps/rejected": -21.34244155883789, "loss": 0.4259, "rewards/accuracies": 1.0, "rewards/chosen": 0.2547747492790222, "rewards/margins": 0.15152500569820404, "rewards/rejected": 0.10324974358081818, "step": 7322 }, { "epoch": 1.19, "learning_rate": 6.328338087679699e-07, "logits/chosen": -0.7153222560882568, "logits/rejected": -0.7020019292831421, "logps/chosen": -257.6817626953125, "logps/rejected": -100.18681335449219, "loss": 0.2362, "rewards/accuracies": 1.0, "rewards/chosen": 5.387793064117432, "rewards/margins": 0.8594603538513184, "rewards/rejected": 4.528332710266113, "step": 7323 }, { "epoch": 1.19, "learning_rate": 6.327071015981446e-07, "logits/chosen": -0.7862913608551025, "logits/rejected": -0.6826744079589844, "logps/chosen": -188.1270751953125, "logps/rejected": -97.94229888916016, "loss": 0.1383, "rewards/accuracies": 1.0, "rewards/chosen": 2.0850205421447754, "rewards/margins": 1.1513574123382568, "rewards/rejected": 0.9336631894111633, "step": 7324 }, { "epoch": 1.19, "learning_rate": 6.325803852595143e-07, "logits/chosen": -1.111308217048645, "logits/rejected": -1.192737102508545, "logps/chosen": -212.55308532714844, "logps/rejected": -142.9716796875, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": 4.198265075683594, "rewards/margins": 2.4113616943359375, "rewards/rejected": 1.7869033813476562, "step": 7325 }, { "epoch": 1.19, "learning_rate": 6.324536597608339e-07, "logits/chosen": -0.527449369430542, "logits/rejected": -0.5347278714179993, "logps/chosen": -55.61528396606445, "logps/rejected": -86.5737075805664, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.9354732632637024, "rewards/margins": -0.824918806552887, "rewards/rejected": 1.7603920698165894, "step": 7326 }, { "epoch": 1.19, "learning_rate": 6.323269251108587e-07, "logits/chosen": -0.5962072014808655, "logits/rejected": -0.6174205541610718, "logps/chosen": -35.51982116699219, "logps/rejected": -105.21897888183594, "loss": 1.0483, "rewards/accuracies": 0.0, "rewards/chosen": 1.423864722251892, "rewards/margins": -1.426855444908142, "rewards/rejected": 2.850720167160034, "step": 7327 }, { "epoch": 1.19, "learning_rate": 6.322001813183452e-07, "logits/chosen": -0.9005743861198425, "logits/rejected": -0.7478814721107483, "logps/chosen": -112.74143981933594, "logps/rejected": -49.03927230834961, "loss": 0.5281, "rewards/accuracies": 0.0, "rewards/chosen": 0.9122230410575867, "rewards/margins": -0.17520564794540405, "rewards/rejected": 1.0874286890029907, "step": 7328 }, { "epoch": 1.19, "learning_rate": 6.320734283920502e-07, "logits/chosen": -0.12284857779741287, "logits/rejected": -0.12284857779741287, "logps/chosen": -45.722415924072266, "logps/rejected": -45.722415924072266, "loss": 0.5861, "rewards/accuracies": 0.0, "rewards/chosen": 0.05582160875201225, "rewards/margins": 0.0, "rewards/rejected": 0.05582160875201225, "step": 7329 }, { "epoch": 1.19, "learning_rate": 6.319466663407308e-07, "logits/chosen": -0.8183654546737671, "logits/rejected": -0.7852643728256226, "logps/chosen": -105.57603454589844, "logps/rejected": -65.23016357421875, "loss": 1.5887, "rewards/accuracies": 0.0, "rewards/chosen": 0.4034721553325653, "rewards/margins": -1.6574089527130127, "rewards/rejected": 2.0608811378479004, "step": 7330 }, { "epoch": 1.19, "learning_rate": 6.318198951731453e-07, "logits/chosen": -0.6246111392974854, "logits/rejected": -0.546431303024292, "logps/chosen": -60.86616134643555, "logps/rejected": -83.52589416503906, "loss": 1.1164, "rewards/accuracies": 0.0, "rewards/chosen": 1.1876331567764282, "rewards/margins": -0.5275790691375732, "rewards/rejected": 1.7152122259140015, "step": 7331 }, { "epoch": 1.19, "learning_rate": 6.316931148980522e-07, "logits/chosen": -0.7023497819900513, "logits/rejected": -0.6860402822494507, "logps/chosen": -93.02713012695312, "logps/rejected": -50.91392517089844, "loss": 2.1711, "rewards/accuracies": 0.0, "rewards/chosen": 0.971570611000061, "rewards/margins": -0.17013394832611084, "rewards/rejected": 1.1417045593261719, "step": 7332 }, { "epoch": 1.19, "learning_rate": 6.315663255242112e-07, "logits/chosen": -0.7720851302146912, "logits/rejected": -0.3148873746395111, "logps/chosen": -109.69059753417969, "logps/rejected": -79.62393188476562, "loss": 0.642, "rewards/accuracies": 1.0, "rewards/chosen": 5.291374206542969, "rewards/margins": 1.957493543624878, "rewards/rejected": 3.333880662918091, "step": 7333 }, { "epoch": 1.19, "learning_rate": 6.314395270603818e-07, "logits/chosen": -0.6708536148071289, "logits/rejected": -0.6708536148071289, "logps/chosen": -62.34419250488281, "logps/rejected": -62.34419250488281, "loss": 1.3454, "rewards/accuracies": 0.0, "rewards/chosen": 2.8182404041290283, "rewards/margins": 0.0, "rewards/rejected": 2.8182404041290283, "step": 7334 }, { "epoch": 1.19, "learning_rate": 6.313127195153248e-07, "logits/chosen": -0.623056173324585, "logits/rejected": -0.623056173324585, "logps/chosen": -88.56263732910156, "logps/rejected": -88.56263732910156, "loss": 0.3474, "rewards/accuracies": 0.0, "rewards/chosen": 2.6553285121917725, "rewards/margins": 0.0, "rewards/rejected": 2.6553285121917725, "step": 7335 }, { "epoch": 1.19, "learning_rate": 6.311859028978013e-07, "logits/chosen": -0.6760308742523193, "logits/rejected": -0.6978949904441833, "logps/chosen": -145.13900756835938, "logps/rejected": -137.59478759765625, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 6.159903049468994, "rewards/margins": -0.5400495529174805, "rewards/rejected": 6.699952602386475, "step": 7336 }, { "epoch": 1.19, "learning_rate": 6.310590772165734e-07, "logits/chosen": -0.6311963796615601, "logits/rejected": -0.5353041887283325, "logps/chosen": -65.01904296875, "logps/rejected": -49.59855270385742, "loss": 0.7502, "rewards/accuracies": 1.0, "rewards/chosen": 2.504721164703369, "rewards/margins": 0.7648892402648926, "rewards/rejected": 1.7398319244384766, "step": 7337 }, { "epoch": 1.19, "learning_rate": 6.309322424804033e-07, "logits/chosen": -0.5474543571472168, "logits/rejected": -0.48505887389183044, "logps/chosen": -73.07899475097656, "logps/rejected": -48.246131896972656, "loss": 0.885, "rewards/accuracies": 1.0, "rewards/chosen": 1.5821068286895752, "rewards/margins": 0.4026123285293579, "rewards/rejected": 1.1794945001602173, "step": 7338 }, { "epoch": 1.19, "learning_rate": 6.308053986980542e-07, "logits/chosen": -0.7337092757225037, "logits/rejected": -0.7337092757225037, "logps/chosen": -63.24860382080078, "logps/rejected": -63.24860382080078, "loss": 0.7576, "rewards/accuracies": 0.0, "rewards/chosen": 2.2946197986602783, "rewards/margins": 0.0, "rewards/rejected": 2.2946197986602783, "step": 7339 }, { "epoch": 1.19, "learning_rate": 6.306785458782896e-07, "logits/chosen": -1.0133053064346313, "logits/rejected": -0.9030203223228455, "logps/chosen": -111.60920715332031, "logps/rejected": -101.28226470947266, "loss": 0.425, "rewards/accuracies": 1.0, "rewards/chosen": 3.7061479091644287, "rewards/margins": 0.7731118202209473, "rewards/rejected": 2.9330360889434814, "step": 7340 }, { "epoch": 1.19, "learning_rate": 6.30551684029874e-07, "logits/chosen": -0.3517010807991028, "logits/rejected": -0.3517010807991028, "logps/chosen": -30.765117645263672, "logps/rejected": -30.765117645263672, "loss": 0.5564, "rewards/accuracies": 0.0, "rewards/chosen": 1.3383411169052124, "rewards/margins": 0.0, "rewards/rejected": 1.3383411169052124, "step": 7341 }, { "epoch": 1.19, "learning_rate": 6.304248131615723e-07, "logits/chosen": -0.6421530246734619, "logits/rejected": -0.5153960585594177, "logps/chosen": -68.78650665283203, "logps/rejected": -20.10874366760254, "loss": 0.5098, "rewards/accuracies": 1.0, "rewards/chosen": 1.1505714654922485, "rewards/margins": 0.23185008764266968, "rewards/rejected": 0.9187213778495789, "step": 7342 }, { "epoch": 1.19, "learning_rate": 6.302979332821503e-07, "logits/chosen": -0.7988471984863281, "logits/rejected": -0.6132275462150574, "logps/chosen": -117.44999694824219, "logps/rejected": -63.790611267089844, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 5.783336162567139, "rewards/margins": 4.93928861618042, "rewards/rejected": 0.8440475463867188, "step": 7343 }, { "epoch": 1.19, "learning_rate": 6.301710444003739e-07, "logits/chosen": -0.39122140407562256, "logits/rejected": -0.4103570878505707, "logps/chosen": -85.23851013183594, "logps/rejected": -36.26349639892578, "loss": 0.5355, "rewards/accuracies": 0.0, "rewards/chosen": 1.4436218738555908, "rewards/margins": -0.636138916015625, "rewards/rejected": 2.079760789871216, "step": 7344 }, { "epoch": 1.19, "learning_rate": 6.300441465250099e-07, "logits/chosen": -0.5888890027999878, "logits/rejected": -0.580446720123291, "logps/chosen": -152.17242431640625, "logps/rejected": -127.21260070800781, "loss": 0.6873, "rewards/accuracies": 0.0, "rewards/chosen": 6.032148838043213, "rewards/margins": -0.9152588844299316, "rewards/rejected": 6.9474077224731445, "step": 7345 }, { "epoch": 1.19, "learning_rate": 6.29917239664826e-07, "logits/chosen": -0.6190210580825806, "logits/rejected": -0.6325175166130066, "logps/chosen": -107.17250061035156, "logps/rejected": -126.68667602539062, "loss": 2.704, "rewards/accuracies": 0.0, "rewards/chosen": 2.549713134765625, "rewards/margins": -3.225381374359131, "rewards/rejected": 5.775094509124756, "step": 7346 }, { "epoch": 1.19, "learning_rate": 6.2979032382859e-07, "logits/chosen": -0.4258895814418793, "logits/rejected": -0.3611484169960022, "logps/chosen": -79.20479583740234, "logps/rejected": -67.67266845703125, "loss": 0.3975, "rewards/accuracies": 0.0, "rewards/chosen": 1.0457245111465454, "rewards/margins": -0.05191493034362793, "rewards/rejected": 1.0976394414901733, "step": 7347 }, { "epoch": 1.19, "learning_rate": 6.296633990250708e-07, "logits/chosen": -0.7720677852630615, "logits/rejected": -0.8717775940895081, "logps/chosen": -199.08287048339844, "logps/rejected": -119.49935150146484, "loss": 0.3447, "rewards/accuracies": 1.0, "rewards/chosen": 5.50197172164917, "rewards/margins": 0.736238956451416, "rewards/rejected": 4.765732765197754, "step": 7348 }, { "epoch": 1.19, "learning_rate": 6.295364652630376e-07, "logits/chosen": -0.6825241446495056, "logits/rejected": -0.5861693620681763, "logps/chosen": -110.96478271484375, "logps/rejected": -96.85584259033203, "loss": 1.032, "rewards/accuracies": 1.0, "rewards/chosen": 4.722073554992676, "rewards/margins": 1.271268606185913, "rewards/rejected": 3.4508049488067627, "step": 7349 }, { "epoch": 1.19, "learning_rate": 6.294095225512604e-07, "logits/chosen": -0.5646293759346008, "logits/rejected": -0.567400336265564, "logps/chosen": -40.1035041809082, "logps/rejected": -79.71611785888672, "loss": 0.3051, "rewards/accuracies": 1.0, "rewards/chosen": 1.6679165363311768, "rewards/margins": 0.8316166400909424, "rewards/rejected": 0.8362998962402344, "step": 7350 }, { "epoch": 1.19, "learning_rate": 6.292825708985095e-07, "logits/chosen": -0.7133116722106934, "logits/rejected": -0.6802229881286621, "logps/chosen": -128.85960388183594, "logps/rejected": -64.24325561523438, "loss": 1.9216, "rewards/accuracies": 1.0, "rewards/chosen": 1.401240587234497, "rewards/margins": 0.13818895816802979, "rewards/rejected": 1.2630516290664673, "step": 7351 }, { "epoch": 1.19, "learning_rate": 6.291556103135564e-07, "logits/chosen": -0.4267173111438751, "logits/rejected": -0.3527850806713104, "logps/chosen": -79.83094024658203, "logps/rejected": -78.33003234863281, "loss": 1.1055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0643256902694702, "rewards/margins": 0.5299773812294006, "rewards/rejected": 0.5343483090400696, "step": 7352 }, { "epoch": 1.19, "learning_rate": 6.290286408051726e-07, "logits/chosen": -0.39151623845100403, "logits/rejected": -0.3863525390625, "logps/chosen": -29.877941131591797, "logps/rejected": -30.624954223632812, "loss": 1.584, "rewards/accuracies": 1.0, "rewards/chosen": 0.18048515915870667, "rewards/margins": 0.27685490250587463, "rewards/rejected": -0.09636974334716797, "step": 7353 }, { "epoch": 1.19, "learning_rate": 6.289016623821307e-07, "logits/chosen": -0.431969553232193, "logits/rejected": -0.431969553232193, "logps/chosen": -24.80642318725586, "logps/rejected": -24.80642318725586, "loss": 0.639, "rewards/accuracies": 0.0, "rewards/chosen": 1.5544548034667969, "rewards/margins": 0.0, "rewards/rejected": 1.5544548034667969, "step": 7354 }, { "epoch": 1.19, "learning_rate": 6.287746750532036e-07, "logits/chosen": -0.722438395023346, "logits/rejected": -0.718743085861206, "logps/chosen": -61.66073226928711, "logps/rejected": -5.867189407348633, "loss": 0.5384, "rewards/accuracies": 0.0, "rewards/chosen": 0.34150201082229614, "rewards/margins": -0.49741315841674805, "rewards/rejected": 0.8389151692390442, "step": 7355 }, { "epoch": 1.19, "learning_rate": 6.286476788271649e-07, "logits/chosen": -0.41477200388908386, "logits/rejected": -0.32764747738838196, "logps/chosen": -39.605255126953125, "logps/rejected": -13.720321655273438, "loss": 0.5503, "rewards/accuracies": 1.0, "rewards/chosen": 1.3673477172851562, "rewards/margins": 0.7340499758720398, "rewards/rejected": 0.6332977414131165, "step": 7356 }, { "epoch": 1.19, "learning_rate": 6.285206737127888e-07, "logits/chosen": -0.7800403237342834, "logits/rejected": -0.6610355377197266, "logps/chosen": -77.98802185058594, "logps/rejected": -103.84402465820312, "loss": 0.0771, "rewards/accuracies": 1.0, "rewards/chosen": 3.5765130519866943, "rewards/margins": 2.7233285903930664, "rewards/rejected": 0.8531845211982727, "step": 7357 }, { "epoch": 1.19, "learning_rate": 6.283936597188502e-07, "logits/chosen": -0.3406670391559601, "logits/rejected": -0.3464660346508026, "logps/chosen": -5.098201751708984, "logps/rejected": -3.6185824871063232, "loss": 0.8782, "rewards/accuracies": 0.0, "rewards/chosen": 0.6985117197036743, "rewards/margins": -0.1006118655204773, "rewards/rejected": 0.7991235852241516, "step": 7358 }, { "epoch": 1.19, "learning_rate": 6.282666368541249e-07, "logits/chosen": -0.8678858876228333, "logits/rejected": -0.8137144446372986, "logps/chosen": -91.95213317871094, "logps/rejected": -32.17156219482422, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 1.1008590459823608, "rewards/margins": 0.9066841006278992, "rewards/rejected": 0.19417496025562286, "step": 7359 }, { "epoch": 1.19, "learning_rate": 6.281396051273884e-07, "logits/chosen": -0.7315771579742432, "logits/rejected": -0.814562201499939, "logps/chosen": -157.50985717773438, "logps/rejected": -207.93777465820312, "loss": 1.0729, "rewards/accuracies": 0.0, "rewards/chosen": 4.763223171234131, "rewards/margins": -1.9977264404296875, "rewards/rejected": 6.760949611663818, "step": 7360 }, { "epoch": 1.19, "learning_rate": 6.280125645474177e-07, "logits/chosen": -0.6469706892967224, "logits/rejected": -0.718999981880188, "logps/chosen": -65.37274169921875, "logps/rejected": -89.17143249511719, "loss": 1.6244, "rewards/accuracies": 0.0, "rewards/chosen": 0.9791000485420227, "rewards/margins": -0.5813888907432556, "rewards/rejected": 1.5604889392852783, "step": 7361 }, { "epoch": 1.19, "learning_rate": 6.278855151229901e-07, "logits/chosen": -0.6074363589286804, "logits/rejected": -0.5013327598571777, "logps/chosen": -58.23158264160156, "logps/rejected": -27.956378936767578, "loss": 0.4282, "rewards/accuracies": 1.0, "rewards/chosen": 2.3683090209960938, "rewards/margins": 1.4059569835662842, "rewards/rejected": 0.9623519778251648, "step": 7362 }, { "epoch": 1.2, "learning_rate": 6.277584568628833e-07, "logits/chosen": -0.46824920177459717, "logits/rejected": -0.4676792621612549, "logps/chosen": -1.9811426401138306, "logps/rejected": -3.4218664169311523, "loss": 0.6309, "rewards/accuracies": 0.0, "rewards/chosen": 0.36632853746414185, "rewards/margins": -0.17112332582473755, "rewards/rejected": 0.5374518632888794, "step": 7363 }, { "epoch": 1.2, "learning_rate": 6.27631389775876e-07, "logits/chosen": -0.5763615965843201, "logits/rejected": -0.5476845502853394, "logps/chosen": -67.70870971679688, "logps/rejected": -39.318572998046875, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 1.476080298423767, "rewards/margins": 0.1067042350769043, "rewards/rejected": 1.3693760633468628, "step": 7364 }, { "epoch": 1.2, "learning_rate": 6.275043138707474e-07, "logits/chosen": -0.6977499723434448, "logits/rejected": -0.6371263265609741, "logps/chosen": -60.674930572509766, "logps/rejected": -56.48134994506836, "loss": 0.6255, "rewards/accuracies": 0.0, "rewards/chosen": 1.200033187866211, "rewards/margins": -0.8318474292755127, "rewards/rejected": 2.0318806171417236, "step": 7365 }, { "epoch": 1.2, "learning_rate": 6.273772291562774e-07, "logits/chosen": -1.1282596588134766, "logits/rejected": -0.984542191028595, "logps/chosen": -99.6581802368164, "logps/rejected": -71.73960876464844, "loss": 0.1179, "rewards/accuracies": 1.0, "rewards/chosen": 4.052095890045166, "rewards/margins": 1.4206254482269287, "rewards/rejected": 2.6314704418182373, "step": 7366 }, { "epoch": 1.2, "learning_rate": 6.272501356412458e-07, "logits/chosen": -1.012768030166626, "logits/rejected": -0.9407297372817993, "logps/chosen": -114.64242553710938, "logps/rejected": -89.86639404296875, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": 5.079925537109375, "rewards/margins": 3.0010809898376465, "rewards/rejected": 2.0788445472717285, "step": 7367 }, { "epoch": 1.2, "learning_rate": 6.271230333344339e-07, "logits/chosen": -0.7087762355804443, "logits/rejected": -0.6420539617538452, "logps/chosen": -53.05304718017578, "logps/rejected": -47.227779388427734, "loss": 1.1613, "rewards/accuracies": 1.0, "rewards/chosen": 2.229602813720703, "rewards/margins": 0.4037982225418091, "rewards/rejected": 1.825804591178894, "step": 7368 }, { "epoch": 1.2, "learning_rate": 6.269959222446235e-07, "logits/chosen": -0.5667663812637329, "logits/rejected": -0.5381067395210266, "logps/chosen": -104.48189544677734, "logps/rejected": -60.09878158569336, "loss": 0.6455, "rewards/accuracies": 0.0, "rewards/chosen": 0.5257903933525085, "rewards/margins": -0.9283985495567322, "rewards/rejected": 1.4541889429092407, "step": 7369 }, { "epoch": 1.2, "learning_rate": 6.268688023805964e-07, "logits/chosen": -0.7424202561378479, "logits/rejected": -0.6903296113014221, "logps/chosen": -40.412750244140625, "logps/rejected": -17.630229949951172, "loss": 0.8495, "rewards/accuracies": 1.0, "rewards/chosen": 0.6587303280830383, "rewards/margins": 0.2752365171909332, "rewards/rejected": 0.3834938108921051, "step": 7370 }, { "epoch": 1.2, "learning_rate": 6.267416737511354e-07, "logits/chosen": -0.3293170630931854, "logits/rejected": -0.3293170630931854, "logps/chosen": -22.698396682739258, "logps/rejected": -22.698396682739258, "loss": 0.6358, "rewards/accuracies": 0.0, "rewards/chosen": 0.4381643235683441, "rewards/margins": 0.0, "rewards/rejected": 0.4381643235683441, "step": 7371 }, { "epoch": 1.2, "learning_rate": 6.26614536365024e-07, "logits/chosen": -0.1827983558177948, "logits/rejected": -0.18677017092704773, "logps/chosen": -2.4403038024902344, "logps/rejected": -8.361318588256836, "loss": 0.989, "rewards/accuracies": 1.0, "rewards/chosen": 0.2070145159959793, "rewards/margins": 0.15966638922691345, "rewards/rejected": 0.04734811931848526, "step": 7372 }, { "epoch": 1.2, "learning_rate": 6.264873902310463e-07, "logits/chosen": -0.639471709728241, "logits/rejected": -0.5874136686325073, "logps/chosen": -85.44863891601562, "logps/rejected": -153.16104125976562, "loss": 1.7735, "rewards/accuracies": 0.0, "rewards/chosen": 4.485571384429932, "rewards/margins": -2.4628233909606934, "rewards/rejected": 6.948394775390625, "step": 7373 }, { "epoch": 1.2, "learning_rate": 6.263602353579866e-07, "logits/chosen": -0.7917977571487427, "logits/rejected": -0.4284035265445709, "logps/chosen": -57.64739227294922, "logps/rejected": -100.17146301269531, "loss": 1.5528, "rewards/accuracies": 1.0, "rewards/chosen": 5.005650520324707, "rewards/margins": 0.1781768798828125, "rewards/rejected": 4.8274736404418945, "step": 7374 }, { "epoch": 1.2, "learning_rate": 6.262330717546304e-07, "logits/chosen": -0.8698614239692688, "logits/rejected": -0.9022942185401917, "logps/chosen": -54.31889343261719, "logps/rejected": -70.58617401123047, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 2.7509453296661377, "rewards/margins": 0.19131159782409668, "rewards/rejected": 2.559633731842041, "step": 7375 }, { "epoch": 1.2, "learning_rate": 6.261058994297634e-07, "logits/chosen": -0.6011131405830383, "logits/rejected": -0.6706734299659729, "logps/chosen": -107.47004699707031, "logps/rejected": -108.89508056640625, "loss": 1.2427, "rewards/accuracies": 0.0, "rewards/chosen": 1.1051384210586548, "rewards/margins": -0.6828505992889404, "rewards/rejected": 1.7879890203475952, "step": 7376 }, { "epoch": 1.2, "learning_rate": 6.259787183921719e-07, "logits/chosen": -0.7160352468490601, "logits/rejected": -0.6477316617965698, "logps/chosen": -120.15536499023438, "logps/rejected": -62.91815948486328, "loss": 0.6543, "rewards/accuracies": 0.0, "rewards/chosen": 1.5852539539337158, "rewards/margins": -0.3874824047088623, "rewards/rejected": 1.9727363586425781, "step": 7377 }, { "epoch": 1.2, "learning_rate": 6.258515286506429e-07, "logits/chosen": -0.8769285082817078, "logits/rejected": -0.810423731803894, "logps/chosen": -71.60623168945312, "logps/rejected": -35.95126724243164, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 3.284162998199463, "rewards/margins": 1.1186084747314453, "rewards/rejected": 2.1655545234680176, "step": 7378 }, { "epoch": 1.2, "learning_rate": 6.257243302139642e-07, "logits/chosen": -0.3032342493534088, "logits/rejected": -0.32042667269706726, "logps/chosen": -20.31231689453125, "logps/rejected": -34.120487213134766, "loss": 0.6387, "rewards/accuracies": 0.0, "rewards/chosen": 0.9339082837104797, "rewards/margins": -0.37850862741470337, "rewards/rejected": 1.312416911125183, "step": 7379 }, { "epoch": 1.2, "learning_rate": 6.255971230909238e-07, "logits/chosen": -0.7972804307937622, "logits/rejected": -0.8060433864593506, "logps/chosen": -61.11383819580078, "logps/rejected": -141.40133666992188, "loss": 0.1655, "rewards/accuracies": 1.0, "rewards/chosen": 1.4457054138183594, "rewards/margins": 1.2134361267089844, "rewards/rejected": 0.232269287109375, "step": 7380 }, { "epoch": 1.2, "learning_rate": 6.254699072903107e-07, "logits/chosen": -0.9487542510032654, "logits/rejected": -0.8606671094894409, "logps/chosen": -74.98119354248047, "logps/rejected": -12.988614082336426, "loss": 1.9844, "rewards/accuracies": 1.0, "rewards/chosen": 1.7130546569824219, "rewards/margins": 1.3810025453567505, "rewards/rejected": 0.3320521414279938, "step": 7381 }, { "epoch": 1.2, "learning_rate": 6.253426828209143e-07, "logits/chosen": -0.6241946816444397, "logits/rejected": -0.5376724004745483, "logps/chosen": -122.34232330322266, "logps/rejected": -139.30740356445312, "loss": 0.72, "rewards/accuracies": 1.0, "rewards/chosen": 0.6772834658622742, "rewards/margins": 0.6828437447547913, "rewards/rejected": -0.0055603026412427425, "step": 7382 }, { "epoch": 1.2, "learning_rate": 6.252154496915243e-07, "logits/chosen": -0.6672136783599854, "logits/rejected": -0.6613348126411438, "logps/chosen": -141.4489288330078, "logps/rejected": -146.37261962890625, "loss": 0.2808, "rewards/accuracies": 1.0, "rewards/chosen": 1.7371826171875, "rewards/margins": 0.2896972894668579, "rewards/rejected": 1.447485327720642, "step": 7383 }, { "epoch": 1.2, "learning_rate": 6.250882079109317e-07, "logits/chosen": -0.558856189250946, "logits/rejected": -0.542386531829834, "logps/chosen": -85.99758911132812, "logps/rejected": -64.02556610107422, "loss": 0.707, "rewards/accuracies": 0.0, "rewards/chosen": 2.1135361194610596, "rewards/margins": -0.2668890953063965, "rewards/rejected": 2.380425214767456, "step": 7384 }, { "epoch": 1.2, "learning_rate": 6.249609574879274e-07, "logits/chosen": -0.4063507318496704, "logits/rejected": -0.35069000720977783, "logps/chosen": -64.10322570800781, "logps/rejected": -70.41321563720703, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": 2.392333984375, "rewards/margins": 0.833147406578064, "rewards/rejected": 1.559186577796936, "step": 7385 }, { "epoch": 1.2, "learning_rate": 6.248336984313034e-07, "logits/chosen": -1.0388281345367432, "logits/rejected": -1.0033280849456787, "logps/chosen": -66.00434112548828, "logps/rejected": -51.315521240234375, "loss": 1.9352, "rewards/accuracies": 0.0, "rewards/chosen": 1.3306686878204346, "rewards/margins": -0.01682734489440918, "rewards/rejected": 1.3474960327148438, "step": 7386 }, { "epoch": 1.2, "learning_rate": 6.247064307498521e-07, "logits/chosen": -0.287047415971756, "logits/rejected": -0.3744621276855469, "logps/chosen": -51.97016906738281, "logps/rejected": -105.26492309570312, "loss": 0.4832, "rewards/accuracies": 0.0, "rewards/chosen": 0.9385761618614197, "rewards/margins": -0.4705929160118103, "rewards/rejected": 1.40916907787323, "step": 7387 }, { "epoch": 1.2, "learning_rate": 6.245791544523663e-07, "logits/chosen": -0.6547088027000427, "logits/rejected": -0.6941459774971008, "logps/chosen": -70.97893524169922, "logps/rejected": -51.53091049194336, "loss": 1.0977, "rewards/accuracies": 0.0, "rewards/chosen": 1.1891297101974487, "rewards/margins": -0.9770632982254028, "rewards/rejected": 2.1661930084228516, "step": 7388 }, { "epoch": 1.2, "learning_rate": 6.244518695476397e-07, "logits/chosen": -0.6456906795501709, "logits/rejected": -0.522467315196991, "logps/chosen": -109.82066345214844, "logps/rejected": -132.4468231201172, "loss": 0.4944, "rewards/accuracies": 0.0, "rewards/chosen": 3.9942550659179688, "rewards/margins": -0.5224990844726562, "rewards/rejected": 4.516754150390625, "step": 7389 }, { "epoch": 1.2, "learning_rate": 6.243245760444666e-07, "logits/chosen": -0.15221117436885834, "logits/rejected": -0.09613214433193207, "logps/chosen": -43.58483123779297, "logps/rejected": -47.375545501708984, "loss": 0.4852, "rewards/accuracies": 1.0, "rewards/chosen": 1.8432705402374268, "rewards/margins": 1.140913724899292, "rewards/rejected": 0.70235675573349, "step": 7390 }, { "epoch": 1.2, "learning_rate": 6.241972739516416e-07, "logits/chosen": -0.5111705660820007, "logits/rejected": -0.49522072076797485, "logps/chosen": -29.539318084716797, "logps/rejected": -91.08472442626953, "loss": 0.9221, "rewards/accuracies": 0.0, "rewards/chosen": 0.9724079370498657, "rewards/margins": -0.6770870685577393, "rewards/rejected": 1.649495005607605, "step": 7391 }, { "epoch": 1.2, "learning_rate": 6.240699632779601e-07, "logits/chosen": -0.37877294421195984, "logits/rejected": -0.3912981152534485, "logps/chosen": -54.48747634887695, "logps/rejected": -53.492164611816406, "loss": 0.2784, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547161221504211, "rewards/margins": 0.31648141145706177, "rewards/rejected": 0.5382347106933594, "step": 7392 }, { "epoch": 1.2, "learning_rate": 6.239426440322181e-07, "logits/chosen": -0.39561980962753296, "logits/rejected": -0.32706037163734436, "logps/chosen": -42.558292388916016, "logps/rejected": -37.87366485595703, "loss": 0.3301, "rewards/accuracies": 1.0, "rewards/chosen": 1.895219087600708, "rewards/margins": 0.5641822814941406, "rewards/rejected": 1.3310368061065674, "step": 7393 }, { "epoch": 1.2, "learning_rate": 6.238153162232123e-07, "logits/chosen": -0.5847687721252441, "logits/rejected": -0.4922323524951935, "logps/chosen": -41.77720642089844, "logps/rejected": -58.042823791503906, "loss": 0.3381, "rewards/accuracies": 1.0, "rewards/chosen": 2.140230894088745, "rewards/margins": 0.8522403240203857, "rewards/rejected": 1.2879905700683594, "step": 7394 }, { "epoch": 1.2, "learning_rate": 6.236879798597396e-07, "logits/chosen": -0.4942311644554138, "logits/rejected": -0.5029210448265076, "logps/chosen": -85.12873840332031, "logps/rejected": -105.07425689697266, "loss": 1.5065, "rewards/accuracies": 0.0, "rewards/chosen": 1.8043335676193237, "rewards/margins": -2.9603323936462402, "rewards/rejected": 4.7646660804748535, "step": 7395 }, { "epoch": 1.2, "learning_rate": 6.235606349505977e-07, "logits/chosen": -0.9257135391235352, "logits/rejected": -0.8363440036773682, "logps/chosen": -163.55108642578125, "logps/rejected": -174.3364715576172, "loss": 0.6661, "rewards/accuracies": 0.0, "rewards/chosen": 5.19490385055542, "rewards/margins": -0.9974989891052246, "rewards/rejected": 6.1924028396606445, "step": 7396 }, { "epoch": 1.2, "learning_rate": 6.234332815045852e-07, "logits/chosen": -0.6078172922134399, "logits/rejected": -0.4354700446128845, "logps/chosen": -47.03107452392578, "logps/rejected": -64.8905258178711, "loss": 0.2429, "rewards/accuracies": 1.0, "rewards/chosen": 2.031391143798828, "rewards/margins": 0.7476333379745483, "rewards/rejected": 1.2837578058242798, "step": 7397 }, { "epoch": 1.2, "learning_rate": 6.23305919530501e-07, "logits/chosen": -0.697422444820404, "logits/rejected": -0.6747528910636902, "logps/chosen": -47.13948440551758, "logps/rejected": -82.7435073852539, "loss": 0.6484, "rewards/accuracies": 0.0, "rewards/chosen": 2.1166882514953613, "rewards/margins": -0.41231799125671387, "rewards/rejected": 2.529006242752075, "step": 7398 }, { "epoch": 1.2, "learning_rate": 6.231785490371442e-07, "logits/chosen": -0.770625114440918, "logits/rejected": -0.911412239074707, "logps/chosen": -125.94317626953125, "logps/rejected": -99.59602355957031, "loss": 0.6499, "rewards/accuracies": 0.0, "rewards/chosen": 3.8529205322265625, "rewards/margins": -0.20005369186401367, "rewards/rejected": 4.052974224090576, "step": 7399 }, { "epoch": 1.2, "learning_rate": 6.230511700333153e-07, "logits/chosen": -0.7351155281066895, "logits/rejected": -0.6752849221229553, "logps/chosen": -146.7816925048828, "logps/rejected": -183.08157348632812, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": 5.036242961883545, "rewards/margins": 2.2980363368988037, "rewards/rejected": 2.738206624984741, "step": 7400 }, { "epoch": 1.2, "learning_rate": 6.22923782527815e-07, "logits/chosen": -0.31066688895225525, "logits/rejected": -0.298280268907547, "logps/chosen": -23.66680908203125, "logps/rejected": -1.5146044492721558, "loss": 0.617, "rewards/accuracies": 0.0, "rewards/chosen": 0.24143925309181213, "rewards/margins": -0.14779409766197205, "rewards/rejected": 0.3892333507537842, "step": 7401 }, { "epoch": 1.2, "learning_rate": 6.227963865294443e-07, "logits/chosen": -0.28508061170578003, "logits/rejected": -0.28309300541877747, "logps/chosen": -2.3222856521606445, "logps/rejected": -1.5647671222686768, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.29235973954200745, "rewards/margins": 0.02462214231491089, "rewards/rejected": 0.26773759722709656, "step": 7402 }, { "epoch": 1.2, "learning_rate": 6.226689820470053e-07, "logits/chosen": -0.6649352312088013, "logits/rejected": -0.5961549878120422, "logps/chosen": -56.18514633178711, "logps/rejected": -48.65069580078125, "loss": 0.4577, "rewards/accuracies": 1.0, "rewards/chosen": 1.7346820831298828, "rewards/margins": 0.9948436617851257, "rewards/rejected": 0.7398384213447571, "step": 7403 }, { "epoch": 1.2, "learning_rate": 6.225415690893003e-07, "logits/chosen": -0.7114107012748718, "logits/rejected": -0.6830527186393738, "logps/chosen": -42.90912628173828, "logps/rejected": -31.27692985534668, "loss": 0.9378, "rewards/accuracies": 1.0, "rewards/chosen": 0.9625873565673828, "rewards/margins": 0.5878236293792725, "rewards/rejected": 0.37476369738578796, "step": 7404 }, { "epoch": 1.2, "learning_rate": 6.224141476651324e-07, "logits/chosen": -0.8336501717567444, "logits/rejected": -0.8401277661323547, "logps/chosen": -54.933712005615234, "logps/rejected": -53.01245880126953, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 1.4211437702178955, "rewards/margins": 0.6442707180976868, "rewards/rejected": 0.7768730521202087, "step": 7405 }, { "epoch": 1.2, "learning_rate": 6.222867177833052e-07, "logits/chosen": -0.34274882078170776, "logits/rejected": -0.4051362872123718, "logps/chosen": -64.05084991455078, "logps/rejected": -108.20822143554688, "loss": 1.1032, "rewards/accuracies": 0.0, "rewards/chosen": 1.0208030939102173, "rewards/margins": -0.3819305896759033, "rewards/rejected": 1.4027336835861206, "step": 7406 }, { "epoch": 1.2, "learning_rate": 6.221592794526228e-07, "logits/chosen": -0.279329776763916, "logits/rejected": -0.22816988825798035, "logps/chosen": -47.971736907958984, "logps/rejected": -26.31047821044922, "loss": 1.2278, "rewards/accuracies": 1.0, "rewards/chosen": 2.1028308868408203, "rewards/margins": 1.1571147441864014, "rewards/rejected": 0.9457160830497742, "step": 7407 }, { "epoch": 1.2, "learning_rate": 6.220318326818902e-07, "logits/chosen": -0.59145188331604, "logits/rejected": -0.5912852883338928, "logps/chosen": -103.00676727294922, "logps/rejected": -114.67204284667969, "loss": 0.6339, "rewards/accuracies": 0.0, "rewards/chosen": 1.0178276300430298, "rewards/margins": -0.663916826248169, "rewards/rejected": 1.6817444562911987, "step": 7408 }, { "epoch": 1.2, "learning_rate": 6.219043774799125e-07, "logits/chosen": -0.23852907121181488, "logits/rejected": -0.23852907121181488, "logps/chosen": -3.528862714767456, "logps/rejected": -3.528862714767456, "loss": 0.5034, "rewards/accuracies": 0.0, "rewards/chosen": 0.055884670466184616, "rewards/margins": 0.0, "rewards/rejected": 0.055884670466184616, "step": 7409 }, { "epoch": 1.2, "learning_rate": 6.217769138554959e-07, "logits/chosen": -0.7733625769615173, "logits/rejected": -0.8113172054290771, "logps/chosen": -79.65312957763672, "logps/rejected": -133.49227905273438, "loss": 0.949, "rewards/accuracies": 1.0, "rewards/chosen": 0.9397239685058594, "rewards/margins": 0.39234691858291626, "rewards/rejected": 0.5473770499229431, "step": 7410 }, { "epoch": 1.2, "learning_rate": 6.216494418174468e-07, "logits/chosen": -0.8101308345794678, "logits/rejected": -0.793913722038269, "logps/chosen": -31.890560150146484, "logps/rejected": -57.90486145019531, "loss": 1.2206, "rewards/accuracies": 1.0, "rewards/chosen": 2.686098098754883, "rewards/margins": 0.3596179485321045, "rewards/rejected": 2.3264801502227783, "step": 7411 }, { "epoch": 1.2, "learning_rate": 6.215219613745723e-07, "logits/chosen": -1.0556350946426392, "logits/rejected": -1.049942970275879, "logps/chosen": -103.832763671875, "logps/rejected": -26.57423973083496, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.5718857049942017, "rewards/margins": -0.2868329882621765, "rewards/rejected": 0.8587186932563782, "step": 7412 }, { "epoch": 1.2, "learning_rate": 6.213944725356803e-07, "logits/chosen": -0.646441638469696, "logits/rejected": -0.659415602684021, "logps/chosen": -33.078453063964844, "logps/rejected": -36.29074478149414, "loss": 0.7802, "rewards/accuracies": 1.0, "rewards/chosen": 1.7704139947891235, "rewards/margins": 0.15758097171783447, "rewards/rejected": 1.612833023071289, "step": 7413 }, { "epoch": 1.2, "learning_rate": 6.212669753095787e-07, "logits/chosen": -0.6925734877586365, "logits/rejected": -0.6925734877586365, "logps/chosen": -89.36925506591797, "logps/rejected": -89.36925506591797, "loss": 2.6408, "rewards/accuracies": 0.0, "rewards/chosen": 1.4375618696212769, "rewards/margins": 0.0, "rewards/rejected": 1.4375618696212769, "step": 7414 }, { "epoch": 1.2, "learning_rate": 6.211394697050766e-07, "logits/chosen": -0.5055712461471558, "logits/rejected": -0.5800617337226868, "logps/chosen": -60.87187957763672, "logps/rejected": -47.40852737426758, "loss": 0.6411, "rewards/accuracies": 0.0, "rewards/chosen": 1.7029694318771362, "rewards/margins": -0.9399005174636841, "rewards/rejected": 2.6428699493408203, "step": 7415 }, { "epoch": 1.2, "learning_rate": 6.210119557309833e-07, "logits/chosen": -0.691012442111969, "logits/rejected": -0.8038288354873657, "logps/chosen": -148.40501403808594, "logps/rejected": -224.24609375, "loss": 2.3387, "rewards/accuracies": 0.0, "rewards/chosen": 1.9906295537948608, "rewards/margins": -2.9441237449645996, "rewards/rejected": 4.93475341796875, "step": 7416 }, { "epoch": 1.2, "learning_rate": 6.20884433396109e-07, "logits/chosen": -0.2679955065250397, "logits/rejected": -0.34268975257873535, "logps/chosen": -80.90192413330078, "logps/rejected": -88.78978729248047, "loss": 0.771, "rewards/accuracies": 0.0, "rewards/chosen": 0.7495315670967102, "rewards/margins": -1.186948299407959, "rewards/rejected": 1.936479926109314, "step": 7417 }, { "epoch": 1.2, "learning_rate": 6.207569027092641e-07, "logits/chosen": -0.7420239448547363, "logits/rejected": -0.6399750113487244, "logps/chosen": -97.72569274902344, "logps/rejected": -100.00463104248047, "loss": 0.1108, "rewards/accuracies": 1.0, "rewards/chosen": 3.907444715499878, "rewards/margins": 1.4609718322753906, "rewards/rejected": 2.4464728832244873, "step": 7418 }, { "epoch": 1.2, "learning_rate": 6.206293636792598e-07, "logits/chosen": -0.9083827137947083, "logits/rejected": -0.8468356728553772, "logps/chosen": -79.51325225830078, "logps/rejected": -32.83183288574219, "loss": 0.1915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9083450436592102, "rewards/margins": 0.8172321319580078, "rewards/rejected": 0.0911129042506218, "step": 7419 }, { "epoch": 1.2, "learning_rate": 6.205018163149078e-07, "logits/chosen": -0.4848504364490509, "logits/rejected": -0.4754561483860016, "logps/chosen": -88.03364562988281, "logps/rejected": -66.54730987548828, "loss": 0.4432, "rewards/accuracies": 0.0, "rewards/chosen": 0.5883186459541321, "rewards/margins": -0.2248901128768921, "rewards/rejected": 0.8132087588310242, "step": 7420 }, { "epoch": 1.2, "learning_rate": 6.203742606250208e-07, "logits/chosen": -0.7208397388458252, "logits/rejected": -0.87209552526474, "logps/chosen": -60.296241760253906, "logps/rejected": -151.82785034179688, "loss": 1.6718, "rewards/accuracies": 0.0, "rewards/chosen": 1.43822181224823, "rewards/margins": -3.303600788116455, "rewards/rejected": 4.741822719573975, "step": 7421 }, { "epoch": 1.2, "learning_rate": 6.202466966184111e-07, "logits/chosen": -0.7987238168716431, "logits/rejected": -0.7224951982498169, "logps/chosen": -42.52204132080078, "logps/rejected": -19.116283416748047, "loss": 0.2198, "rewards/accuracies": 1.0, "rewards/chosen": 0.9983646273612976, "rewards/margins": 0.7033573389053345, "rewards/rejected": 0.2950073182582855, "step": 7422 }, { "epoch": 1.2, "learning_rate": 6.201191243038926e-07, "logits/chosen": -0.9749504327774048, "logits/rejected": -0.901573896408081, "logps/chosen": -88.97200012207031, "logps/rejected": -81.78158569335938, "loss": 0.39, "rewards/accuracies": 1.0, "rewards/chosen": 2.2948739528656006, "rewards/margins": 0.3475228548049927, "rewards/rejected": 1.947351098060608, "step": 7423 }, { "epoch": 1.2, "learning_rate": 6.19991543690279e-07, "logits/chosen": -0.8736931681632996, "logits/rejected": -0.8821280002593994, "logps/chosen": -117.0093994140625, "logps/rejected": -94.70133209228516, "loss": 1.4228, "rewards/accuracies": 0.0, "rewards/chosen": 2.5022339820861816, "rewards/margins": -2.779923439025879, "rewards/rejected": 5.2821574211120605, "step": 7424 }, { "epoch": 1.21, "learning_rate": 6.198639547863853e-07, "logits/chosen": -0.11884626001119614, "logits/rejected": -0.10773753374814987, "logps/chosen": -36.522193908691406, "logps/rejected": -50.432640075683594, "loss": 0.6343, "rewards/accuracies": 1.0, "rewards/chosen": 1.0780426263809204, "rewards/margins": 0.9683456420898438, "rewards/rejected": 0.10969696193933487, "step": 7425 }, { "epoch": 1.21, "learning_rate": 6.197363576010263e-07, "logits/chosen": -0.4268229007720947, "logits/rejected": -0.45164409279823303, "logps/chosen": -71.70046997070312, "logps/rejected": -56.18806076049805, "loss": 1.1724, "rewards/accuracies": 0.0, "rewards/chosen": 0.8928260803222656, "rewards/margins": -1.9033520221710205, "rewards/rejected": 2.796178102493286, "step": 7426 }, { "epoch": 1.21, "learning_rate": 6.19608752143018e-07, "logits/chosen": -0.5612069964408875, "logits/rejected": -0.5821905136108398, "logps/chosen": -58.34141540527344, "logps/rejected": -103.1878890991211, "loss": 0.5193, "rewards/accuracies": 0.0, "rewards/chosen": 2.225773572921753, "rewards/margins": -0.5645380020141602, "rewards/rejected": 2.790311574935913, "step": 7427 }, { "epoch": 1.21, "learning_rate": 6.194811384211768e-07, "logits/chosen": -0.8696614503860474, "logits/rejected": -0.7934346199035645, "logps/chosen": -53.12862014770508, "logps/rejected": -44.21356964111328, "loss": 1.5715, "rewards/accuracies": 1.0, "rewards/chosen": 2.873286008834839, "rewards/margins": 1.3465689420700073, "rewards/rejected": 1.5267170667648315, "step": 7428 }, { "epoch": 1.21, "learning_rate": 6.193535164443193e-07, "logits/chosen": -0.6174846887588501, "logits/rejected": -0.6270330548286438, "logps/chosen": -74.73155212402344, "logps/rejected": -46.46429443359375, "loss": 0.507, "rewards/accuracies": 1.0, "rewards/chosen": 1.2600898742675781, "rewards/margins": 0.3107803463935852, "rewards/rejected": 0.9493095278739929, "step": 7429 }, { "epoch": 1.21, "learning_rate": 6.192258862212633e-07, "logits/chosen": -0.30579158663749695, "logits/rejected": -0.2780494689941406, "logps/chosen": -69.30287170410156, "logps/rejected": -76.34903717041016, "loss": 0.7333, "rewards/accuracies": 1.0, "rewards/chosen": 2.42840576171875, "rewards/margins": 0.1489264965057373, "rewards/rejected": 2.2794792652130127, "step": 7430 }, { "epoch": 1.21, "learning_rate": 6.190982477608266e-07, "logits/chosen": -0.4428744912147522, "logits/rejected": -0.4428744912147522, "logps/chosen": -50.68133544921875, "logps/rejected": -50.68133544921875, "loss": 0.7941, "rewards/accuracies": 0.0, "rewards/chosen": 1.1976631879806519, "rewards/margins": 0.0, "rewards/rejected": 1.1976631879806519, "step": 7431 }, { "epoch": 1.21, "learning_rate": 6.18970601071828e-07, "logits/chosen": -0.6111647486686707, "logits/rejected": -0.6203797459602356, "logps/chosen": -63.15144729614258, "logps/rejected": -65.131591796875, "loss": 0.834, "rewards/accuracies": 1.0, "rewards/chosen": 1.2851123809814453, "rewards/margins": 0.05733907222747803, "rewards/rejected": 1.2277733087539673, "step": 7432 }, { "epoch": 1.21, "learning_rate": 6.188429461630865e-07, "logits/chosen": -0.5023120045661926, "logits/rejected": -0.5208885669708252, "logps/chosen": -49.79280090332031, "logps/rejected": -72.55116271972656, "loss": 0.8805, "rewards/accuracies": 0.0, "rewards/chosen": 1.3578628301620483, "rewards/margins": -1.249475121498108, "rewards/rejected": 2.6073379516601562, "step": 7433 }, { "epoch": 1.21, "learning_rate": 6.187152830434219e-07, "logits/chosen": -0.7705277800559998, "logits/rejected": -0.7461660504341125, "logps/chosen": -229.79833984375, "logps/rejected": -103.80732727050781, "loss": 1.2033, "rewards/accuracies": 0.0, "rewards/chosen": 4.069039821624756, "rewards/margins": -1.395486831665039, "rewards/rejected": 5.464526653289795, "step": 7434 }, { "epoch": 1.21, "learning_rate": 6.185876117216547e-07, "logits/chosen": -1.2171657085418701, "logits/rejected": -1.2147988080978394, "logps/chosen": -98.60865783691406, "logps/rejected": -105.90615844726562, "loss": 1.2711, "rewards/accuracies": 0.0, "rewards/chosen": 1.4918197393417358, "rewards/margins": -2.0925002098083496, "rewards/rejected": 3.584320068359375, "step": 7435 }, { "epoch": 1.21, "learning_rate": 6.184599322066054e-07, "logits/chosen": -0.8585788011550903, "logits/rejected": -0.8655344247817993, "logps/chosen": -113.74711608886719, "logps/rejected": -120.03887939453125, "loss": 1.0724, "rewards/accuracies": 0.0, "rewards/chosen": 5.089469909667969, "rewards/margins": -1.6841158866882324, "rewards/rejected": 6.773585796356201, "step": 7436 }, { "epoch": 1.21, "learning_rate": 6.183322445070958e-07, "logits/chosen": -0.7023127675056458, "logits/rejected": -0.6739169955253601, "logps/chosen": -135.6085968017578, "logps/rejected": -181.61456298828125, "loss": 2.1888, "rewards/accuracies": 0.0, "rewards/chosen": 6.419368267059326, "rewards/margins": -0.36815929412841797, "rewards/rejected": 6.787527561187744, "step": 7437 }, { "epoch": 1.21, "learning_rate": 6.182045486319477e-07, "logits/chosen": -0.5932076573371887, "logits/rejected": -0.5932076573371887, "logps/chosen": -68.33431243896484, "logps/rejected": -68.33431243896484, "loss": 0.3507, "rewards/accuracies": 0.0, "rewards/chosen": 0.13008271157741547, "rewards/margins": 0.0, "rewards/rejected": 0.13008271157741547, "step": 7438 }, { "epoch": 1.21, "learning_rate": 6.180768445899838e-07, "logits/chosen": -1.0858227014541626, "logits/rejected": -1.0459803342819214, "logps/chosen": -83.00251770019531, "logps/rejected": -175.53329467773438, "loss": 0.6713, "rewards/accuracies": 0.0, "rewards/chosen": 5.962767124176025, "rewards/margins": -0.04175567626953125, "rewards/rejected": 6.004522800445557, "step": 7439 }, { "epoch": 1.21, "learning_rate": 6.179491323900272e-07, "logits/chosen": -0.5473907589912415, "logits/rejected": -0.46667084097862244, "logps/chosen": -102.68612670898438, "logps/rejected": -62.71137237548828, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": 3.1443450450897217, "rewards/margins": 1.0644958019256592, "rewards/rejected": 2.0798492431640625, "step": 7440 }, { "epoch": 1.21, "learning_rate": 6.178214120409015e-07, "logits/chosen": -0.3217891752719879, "logits/rejected": -0.14609479904174805, "logps/chosen": -53.455482482910156, "logps/rejected": -123.35306549072266, "loss": 1.1978, "rewards/accuracies": 0.0, "rewards/chosen": 1.4478905200958252, "rewards/margins": -1.5775635242462158, "rewards/rejected": 3.025454044342041, "step": 7441 }, { "epoch": 1.21, "learning_rate": 6.176936835514311e-07, "logits/chosen": -1.0107394456863403, "logits/rejected": -1.0330661535263062, "logps/chosen": -105.07527160644531, "logps/rejected": -37.89189910888672, "loss": 0.9972, "rewards/accuracies": 0.0, "rewards/chosen": 1.5082687139511108, "rewards/margins": -0.7337826490402222, "rewards/rejected": 2.242051362991333, "step": 7442 }, { "epoch": 1.21, "learning_rate": 6.17565946930441e-07, "logits/chosen": -0.9117835164070129, "logits/rejected": -0.9245642423629761, "logps/chosen": -132.79818725585938, "logps/rejected": -225.76683044433594, "loss": 1.3106, "rewards/accuracies": 1.0, "rewards/chosen": 6.385647773742676, "rewards/margins": 0.9375901222229004, "rewards/rejected": 5.448057651519775, "step": 7443 }, { "epoch": 1.21, "learning_rate": 6.174382021867561e-07, "logits/chosen": -0.6177032589912415, "logits/rejected": -0.5684918761253357, "logps/chosen": -75.21920776367188, "logps/rejected": -98.36227416992188, "loss": 1.3626, "rewards/accuracies": 0.0, "rewards/chosen": 1.5754340887069702, "rewards/margins": -1.3795541524887085, "rewards/rejected": 2.9549882411956787, "step": 7444 }, { "epoch": 1.21, "learning_rate": 6.173104493292027e-07, "logits/chosen": -0.28507229685783386, "logits/rejected": -0.2907753884792328, "logps/chosen": -2.764559507369995, "logps/rejected": -1.4379353523254395, "loss": 1.092, "rewards/accuracies": 0.0, "rewards/chosen": 0.312948077917099, "rewards/margins": -0.11608591675758362, "rewards/rejected": 0.4290339946746826, "step": 7445 }, { "epoch": 1.21, "learning_rate": 6.171826883666074e-07, "logits/chosen": -0.5060245990753174, "logits/rejected": -0.5440147519111633, "logps/chosen": -4.796826362609863, "logps/rejected": -34.53913116455078, "loss": 0.6418, "rewards/accuracies": 0.0, "rewards/chosen": 0.24707122147083282, "rewards/margins": -0.19947077333927155, "rewards/rejected": 0.44654199481010437, "step": 7446 }, { "epoch": 1.21, "learning_rate": 6.170549193077971e-07, "logits/chosen": -0.6862188577651978, "logits/rejected": -0.7046584486961365, "logps/chosen": -69.32188415527344, "logps/rejected": -42.408241271972656, "loss": 1.1262, "rewards/accuracies": 0.0, "rewards/chosen": 0.8745689392089844, "rewards/margins": -1.162825107574463, "rewards/rejected": 2.0373940467834473, "step": 7447 }, { "epoch": 1.21, "learning_rate": 6.169271421615993e-07, "logits/chosen": -0.5582419633865356, "logits/rejected": -0.5582419633865356, "logps/chosen": -49.68914031982422, "logps/rejected": -49.68914031982422, "loss": 0.3603, "rewards/accuracies": 0.0, "rewards/chosen": 1.5042060613632202, "rewards/margins": 0.0, "rewards/rejected": 1.5042060613632202, "step": 7448 }, { "epoch": 1.21, "learning_rate": 6.167993569368425e-07, "logits/chosen": -0.5329305529594421, "logits/rejected": -0.49703681468963623, "logps/chosen": -25.260295867919922, "logps/rejected": -5.568492889404297, "loss": 1.1082, "rewards/accuracies": 0.0, "rewards/chosen": 0.4125123918056488, "rewards/margins": -0.04247981309890747, "rewards/rejected": 0.4549922049045563, "step": 7449 }, { "epoch": 1.21, "learning_rate": 6.166715636423552e-07, "logits/chosen": -0.650278627872467, "logits/rejected": -0.6125128865242004, "logps/chosen": -84.4522705078125, "logps/rejected": -48.45383834838867, "loss": 1.4872, "rewards/accuracies": 0.0, "rewards/chosen": 1.0011261701583862, "rewards/margins": -0.06581151485443115, "rewards/rejected": 1.0669376850128174, "step": 7450 }, { "epoch": 1.21, "learning_rate": 6.165437622869669e-07, "logits/chosen": -0.6829745769500732, "logits/rejected": -0.6182686686515808, "logps/chosen": -99.25049591064453, "logps/rejected": -19.9388370513916, "loss": 0.4903, "rewards/accuracies": 0.0, "rewards/chosen": 0.2324211150407791, "rewards/margins": -0.17951621115207672, "rewards/rejected": 0.41193732619285583, "step": 7451 }, { "epoch": 1.21, "learning_rate": 6.164159528795073e-07, "logits/chosen": -0.5847472548484802, "logits/rejected": -0.6138675212860107, "logps/chosen": -55.99818801879883, "logps/rejected": -37.315677642822266, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 1.068966269493103, "rewards/margins": 0.21896815299987793, "rewards/rejected": 0.8499981164932251, "step": 7452 }, { "epoch": 1.21, "learning_rate": 6.16288135428807e-07, "logits/chosen": -0.490554541349411, "logits/rejected": -0.6103875041007996, "logps/chosen": -98.23460388183594, "logps/rejected": -122.99691009521484, "loss": 1.6843, "rewards/accuracies": 0.0, "rewards/chosen": 2.0082786083221436, "rewards/margins": -2.4803545475006104, "rewards/rejected": 4.488633155822754, "step": 7453 }, { "epoch": 1.21, "learning_rate": 6.161603099436967e-07, "logits/chosen": -0.7180781364440918, "logits/rejected": -0.6485846042633057, "logps/chosen": -55.810699462890625, "logps/rejected": -44.32238006591797, "loss": 0.3156, "rewards/accuracies": 1.0, "rewards/chosen": 1.8189239501953125, "rewards/margins": 0.8028262853622437, "rewards/rejected": 1.0160976648330688, "step": 7454 }, { "epoch": 1.21, "learning_rate": 6.160324764330083e-07, "logits/chosen": -0.405068963766098, "logits/rejected": -0.40892499685287476, "logps/chosen": -4.284450054168701, "logps/rejected": -10.86073112487793, "loss": 1.1414, "rewards/accuracies": 0.0, "rewards/chosen": 0.32290664315223694, "rewards/margins": -0.01893267035484314, "rewards/rejected": 0.3418393135070801, "step": 7455 }, { "epoch": 1.21, "learning_rate": 6.159046349055735e-07, "logits/chosen": -0.32601553201675415, "logits/rejected": -0.32511457800865173, "logps/chosen": -13.046417236328125, "logps/rejected": -60.507423400878906, "loss": 0.9451, "rewards/accuracies": 0.0, "rewards/chosen": 0.020727157592773438, "rewards/margins": -0.790200412273407, "rewards/rejected": 0.8109275698661804, "step": 7456 }, { "epoch": 1.21, "learning_rate": 6.157767853702252e-07, "logits/chosen": -1.2812891006469727, "logits/rejected": -1.2382886409759521, "logps/chosen": -96.75326538085938, "logps/rejected": -82.85505676269531, "loss": 0.1948, "rewards/accuracies": 1.0, "rewards/chosen": 4.4199538230896, "rewards/margins": 0.765277624130249, "rewards/rejected": 3.6546761989593506, "step": 7457 }, { "epoch": 1.21, "learning_rate": 6.156489278357965e-07, "logits/chosen": -0.77547687292099, "logits/rejected": -0.6644958853721619, "logps/chosen": -70.1866226196289, "logps/rejected": -40.4170036315918, "loss": 0.9638, "rewards/accuracies": 0.0, "rewards/chosen": 1.340539574623108, "rewards/margins": -0.9998146295547485, "rewards/rejected": 2.3403542041778564, "step": 7458 }, { "epoch": 1.21, "learning_rate": 6.155210623111212e-07, "logits/chosen": -0.6541215181350708, "logits/rejected": -0.6580871939659119, "logps/chosen": -14.265398025512695, "logps/rejected": -6.283255100250244, "loss": 0.8867, "rewards/accuracies": 0.0, "rewards/chosen": 0.31239795684814453, "rewards/margins": -0.06531596183776855, "rewards/rejected": 0.3777139186859131, "step": 7459 }, { "epoch": 1.21, "learning_rate": 6.153931888050337e-07, "logits/chosen": -0.5475884079933167, "logits/rejected": -0.5047992467880249, "logps/chosen": -55.18033981323242, "logps/rejected": -52.40209197998047, "loss": 0.3042, "rewards/accuracies": 1.0, "rewards/chosen": 1.3643115758895874, "rewards/margins": 0.20700526237487793, "rewards/rejected": 1.1573063135147095, "step": 7460 }, { "epoch": 1.21, "learning_rate": 6.152653073263685e-07, "logits/chosen": -0.4842834770679474, "logits/rejected": -0.4672239422798157, "logps/chosen": -118.68075561523438, "logps/rejected": -91.50253295898438, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 3.8963546752929688, "rewards/margins": 1.9010093212127686, "rewards/rejected": 1.9953453540802002, "step": 7461 }, { "epoch": 1.21, "learning_rate": 6.151374178839612e-07, "logits/chosen": -0.7323659658432007, "logits/rejected": -0.7199625968933105, "logps/chosen": -54.54928970336914, "logps/rejected": -38.84516906738281, "loss": 0.7576, "rewards/accuracies": 0.0, "rewards/chosen": 1.5341297388076782, "rewards/margins": -0.4383026361465454, "rewards/rejected": 1.9724323749542236, "step": 7462 }, { "epoch": 1.21, "learning_rate": 6.150095204866479e-07, "logits/chosen": -0.989685595035553, "logits/rejected": -0.9124552011489868, "logps/chosen": -111.500244140625, "logps/rejected": -146.0323486328125, "loss": 0.9493, "rewards/accuracies": 0.0, "rewards/chosen": 2.334576368331909, "rewards/margins": -1.6974990367889404, "rewards/rejected": 4.03207540512085, "step": 7463 }, { "epoch": 1.21, "learning_rate": 6.148816151432649e-07, "logits/chosen": -0.8932262063026428, "logits/rejected": -0.5516838431358337, "logps/chosen": -130.29379272460938, "logps/rejected": -94.55455017089844, "loss": 0.9197, "rewards/accuracies": 0.0, "rewards/chosen": 1.7935150861740112, "rewards/margins": -0.46284186840057373, "rewards/rejected": 2.256356954574585, "step": 7464 }, { "epoch": 1.21, "learning_rate": 6.147537018626493e-07, "logits/chosen": -0.6211875677108765, "logits/rejected": -0.5587098598480225, "logps/chosen": -65.58885192871094, "logps/rejected": -79.43070983886719, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 1.2282860279083252, "rewards/margins": -0.10378646850585938, "rewards/rejected": 1.3320724964141846, "step": 7465 }, { "epoch": 1.21, "learning_rate": 6.14625780653639e-07, "logits/chosen": -0.7881118059158325, "logits/rejected": -0.5955215692520142, "logps/chosen": -113.77373504638672, "logps/rejected": -97.05439758300781, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 5.3161139488220215, "rewards/margins": 1.1908726692199707, "rewards/rejected": 4.125241279602051, "step": 7466 }, { "epoch": 1.21, "learning_rate": 6.144978515250715e-07, "logits/chosen": -0.44735997915267944, "logits/rejected": -0.44412896037101746, "logps/chosen": -3.581549644470215, "logps/rejected": -1.5994263887405396, "loss": 1.4778, "rewards/accuracies": 0.0, "rewards/chosen": 0.0857698917388916, "rewards/margins": -0.07072506844997406, "rewards/rejected": 0.15649496018886566, "step": 7467 }, { "epoch": 1.21, "learning_rate": 6.143699144857859e-07, "logits/chosen": -0.1673380732536316, "logits/rejected": -0.17438575625419617, "logps/chosen": -8.561909675598145, "logps/rejected": -4.138160228729248, "loss": 0.7769, "rewards/accuracies": 0.0, "rewards/chosen": 0.022553348913788795, "rewards/margins": -0.3108977675437927, "rewards/rejected": 0.33345112204551697, "step": 7468 }, { "epoch": 1.21, "learning_rate": 6.142419695446215e-07, "logits/chosen": -0.7593105435371399, "logits/rejected": -0.7044522762298584, "logps/chosen": -76.63298797607422, "logps/rejected": -87.62603759765625, "loss": 0.2904, "rewards/accuracies": 1.0, "rewards/chosen": 1.7347939014434814, "rewards/margins": 0.5195274353027344, "rewards/rejected": 1.215266466140747, "step": 7469 }, { "epoch": 1.21, "learning_rate": 6.141140167104179e-07, "logits/chosen": -0.6373028755187988, "logits/rejected": -0.6571348905563354, "logps/chosen": -120.90917205810547, "logps/rejected": -82.95186614990234, "loss": 1.2274, "rewards/accuracies": 0.0, "rewards/chosen": 0.22748565673828125, "rewards/margins": -2.297191619873047, "rewards/rejected": 2.524677276611328, "step": 7470 }, { "epoch": 1.21, "learning_rate": 6.139860559920154e-07, "logits/chosen": -0.646018922328949, "logits/rejected": -0.6496412754058838, "logps/chosen": -55.448638916015625, "logps/rejected": -71.41763305664062, "loss": 1.4472, "rewards/accuracies": 0.0, "rewards/chosen": 1.3015412092208862, "rewards/margins": -0.15082240104675293, "rewards/rejected": 1.4523636102676392, "step": 7471 }, { "epoch": 1.21, "learning_rate": 6.138580873982552e-07, "logits/chosen": -0.7792460322380066, "logits/rejected": -0.7408434152603149, "logps/chosen": -53.86204528808594, "logps/rejected": -63.60558319091797, "loss": 0.7633, "rewards/accuracies": 1.0, "rewards/chosen": 1.7783997058868408, "rewards/margins": 0.19647681713104248, "rewards/rejected": 1.5819228887557983, "step": 7472 }, { "epoch": 1.21, "learning_rate": 6.137301109379784e-07, "logits/chosen": -0.3429294228553772, "logits/rejected": -0.3429294228553772, "logps/chosen": -60.70738983154297, "logps/rejected": -60.70738983154297, "loss": 0.8276, "rewards/accuracies": 0.0, "rewards/chosen": 1.824170708656311, "rewards/margins": 0.0, "rewards/rejected": 1.824170708656311, "step": 7473 }, { "epoch": 1.21, "learning_rate": 6.13602126620027e-07, "logits/chosen": -0.8474478125572205, "logits/rejected": -0.8223177790641785, "logps/chosen": -76.75852966308594, "logps/rejected": -84.93887329101562, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9234321713447571, "rewards/margins": 0.7586662173271179, "rewards/rejected": 0.16476593911647797, "step": 7474 }, { "epoch": 1.21, "learning_rate": 6.134741344532436e-07, "logits/chosen": -0.7298197746276855, "logits/rejected": -0.6370371580123901, "logps/chosen": -100.099853515625, "logps/rejected": -56.70152282714844, "loss": 1.598, "rewards/accuracies": 1.0, "rewards/chosen": 1.9193023443222046, "rewards/margins": 0.6778655052185059, "rewards/rejected": 1.2414368391036987, "step": 7475 }, { "epoch": 1.21, "learning_rate": 6.133461344464713e-07, "logits/chosen": -0.5179349780082703, "logits/rejected": -0.5179349780082703, "logps/chosen": -3.8862555027008057, "logps/rejected": -3.8862555027008057, "loss": 0.5999, "rewards/accuracies": 0.0, "rewards/chosen": 0.1721857786178589, "rewards/margins": 0.0, "rewards/rejected": 0.1721857786178589, "step": 7476 }, { "epoch": 1.21, "learning_rate": 6.132181266085535e-07, "logits/chosen": -0.5133890509605408, "logits/rejected": -0.425712913274765, "logps/chosen": -165.0631103515625, "logps/rejected": -22.913860321044922, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 3.962994337081909, "rewards/margins": 3.793674945831299, "rewards/rejected": 0.16931934654712677, "step": 7477 }, { "epoch": 1.21, "learning_rate": 6.130901109483344e-07, "logits/chosen": -0.6012619733810425, "logits/rejected": -0.6126497983932495, "logps/chosen": -54.25929260253906, "logps/rejected": -89.40995788574219, "loss": 0.1827, "rewards/accuracies": 1.0, "rewards/chosen": 2.1682510375976562, "rewards/margins": 1.2342529296875, "rewards/rejected": 0.9339981079101562, "step": 7478 }, { "epoch": 1.21, "learning_rate": 6.129620874746587e-07, "logits/chosen": -0.9608895778656006, "logits/rejected": -0.767285168170929, "logps/chosen": -184.11328125, "logps/rejected": -55.610748291015625, "loss": 0.064, "rewards/accuracies": 1.0, "rewards/chosen": 4.98876953125, "rewards/margins": 3.6654114723205566, "rewards/rejected": 1.323358178138733, "step": 7479 }, { "epoch": 1.21, "learning_rate": 6.128340561963717e-07, "logits/chosen": -0.18742215633392334, "logits/rejected": -0.18207691609859467, "logps/chosen": -81.95535278320312, "logps/rejected": -48.23585510253906, "loss": 0.5934, "rewards/accuracies": 1.0, "rewards/chosen": 1.985198974609375, "rewards/margins": 0.6211799383163452, "rewards/rejected": 1.3640190362930298, "step": 7480 }, { "epoch": 1.21, "learning_rate": 6.127060171223191e-07, "logits/chosen": -0.9315997362136841, "logits/rejected": -0.9336718320846558, "logps/chosen": -70.04304504394531, "logps/rejected": -41.302059173583984, "loss": 0.5087, "rewards/accuracies": 1.0, "rewards/chosen": 1.480096459388733, "rewards/margins": 0.08560597896575928, "rewards/rejected": 1.3944904804229736, "step": 7481 }, { "epoch": 1.21, "learning_rate": 6.12577970261347e-07, "logits/chosen": 0.01384427584707737, "logits/rejected": 0.02425856702029705, "logps/chosen": -16.238460540771484, "logps/rejected": -51.944114685058594, "loss": 0.4874, "rewards/accuracies": 0.0, "rewards/chosen": 0.24916954338550568, "rewards/margins": -0.10572929680347443, "rewards/rejected": 0.3548988401889801, "step": 7482 }, { "epoch": 1.21, "learning_rate": 6.124499156223026e-07, "logits/chosen": -0.3203190863132477, "logits/rejected": -0.2935793399810791, "logps/chosen": -42.5176887512207, "logps/rejected": -107.87985229492188, "loss": 1.6986, "rewards/accuracies": 0.0, "rewards/chosen": 2.0244832038879395, "rewards/margins": -2.9575467109680176, "rewards/rejected": 4.982029914855957, "step": 7483 }, { "epoch": 1.21, "learning_rate": 6.123218532140329e-07, "logits/chosen": -0.647576630115509, "logits/rejected": -0.3905554711818695, "logps/chosen": -139.07630920410156, "logps/rejected": -30.75290298461914, "loss": 0.125, "rewards/accuracies": 1.0, "rewards/chosen": 5.221832275390625, "rewards/margins": 4.075429439544678, "rewards/rejected": 1.1464027166366577, "step": 7484 }, { "epoch": 1.21, "learning_rate": 6.121937830453859e-07, "logits/chosen": -0.7269150018692017, "logits/rejected": -0.7028196454048157, "logps/chosen": -40.18528747558594, "logps/rejected": -35.35264587402344, "loss": 0.8109, "rewards/accuracies": 1.0, "rewards/chosen": 1.8630741834640503, "rewards/margins": 0.547318696975708, "rewards/rejected": 1.3157554864883423, "step": 7485 }, { "epoch": 1.22, "learning_rate": 6.1206570512521e-07, "logits/chosen": -0.5778607130050659, "logits/rejected": -0.5758354663848877, "logps/chosen": -21.234933853149414, "logps/rejected": -33.78423309326172, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 0.12836723029613495, "rewards/margins": 0.0703967958688736, "rewards/rejected": 0.057970430701971054, "step": 7486 }, { "epoch": 1.22, "learning_rate": 6.119376194623544e-07, "logits/chosen": -0.7264066338539124, "logits/rejected": -0.6628545522689819, "logps/chosen": -90.95249938964844, "logps/rejected": -36.14633560180664, "loss": 0.0928, "rewards/accuracies": 1.0, "rewards/chosen": 2.364794969558716, "rewards/margins": 2.338491439819336, "rewards/rejected": 0.026303483173251152, "step": 7487 }, { "epoch": 1.22, "learning_rate": 6.118095260656685e-07, "logits/chosen": -0.7078244686126709, "logits/rejected": -0.7029893398284912, "logps/chosen": -85.60223388671875, "logps/rejected": -68.8276596069336, "loss": 0.6008, "rewards/accuracies": 0.0, "rewards/chosen": 1.7725273370742798, "rewards/margins": -0.12958216667175293, "rewards/rejected": 1.9021095037460327, "step": 7488 }, { "epoch": 1.22, "learning_rate": 6.116814249440022e-07, "logits/chosen": -0.7067751288414001, "logits/rejected": -0.6819497346878052, "logps/chosen": -103.64199829101562, "logps/rejected": -67.70308685302734, "loss": 1.9271, "rewards/accuracies": 1.0, "rewards/chosen": 1.7011101245880127, "rewards/margins": 0.42904436588287354, "rewards/rejected": 1.2720657587051392, "step": 7489 }, { "epoch": 1.22, "learning_rate": 6.115533161062061e-07, "logits/chosen": -0.4029501974582672, "logits/rejected": -0.3673330843448639, "logps/chosen": -48.91908645629883, "logps/rejected": -19.5998477935791, "loss": 0.3948, "rewards/accuracies": 0.0, "rewards/chosen": 0.3004184663295746, "rewards/margins": -0.17510682344436646, "rewards/rejected": 0.47552528977394104, "step": 7490 }, { "epoch": 1.22, "learning_rate": 6.114251995611315e-07, "logits/chosen": -0.730855405330658, "logits/rejected": -0.5459392070770264, "logps/chosen": -83.65202331542969, "logps/rejected": -84.6932373046875, "loss": 1.531, "rewards/accuracies": 0.0, "rewards/chosen": 1.5272034406661987, "rewards/margins": -1.9101120233535767, "rewards/rejected": 3.4373154640197754, "step": 7491 }, { "epoch": 1.22, "learning_rate": 6.112970753176299e-07, "logits/chosen": -0.5075708031654358, "logits/rejected": -0.4588974416255951, "logps/chosen": -56.849998474121094, "logps/rejected": -47.15681838989258, "loss": 0.359, "rewards/accuracies": 1.0, "rewards/chosen": 1.2564445734024048, "rewards/margins": 0.26295584440231323, "rewards/rejected": 0.9934887290000916, "step": 7492 }, { "epoch": 1.22, "learning_rate": 6.111689433845535e-07, "logits/chosen": -0.4908287227153778, "logits/rejected": -0.4972899854183197, "logps/chosen": -57.15349578857422, "logps/rejected": -106.73670196533203, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 1.610543131828308, "rewards/margins": 0.7420868277549744, "rewards/rejected": 0.8684563040733337, "step": 7493 }, { "epoch": 1.22, "learning_rate": 6.11040803770755e-07, "logits/chosen": -0.5285146832466125, "logits/rejected": -0.5155698657035828, "logps/chosen": -48.310001373291016, "logps/rejected": -72.43373107910156, "loss": 1.3233, "rewards/accuracies": 0.0, "rewards/chosen": 1.1503952741622925, "rewards/margins": -0.19154655933380127, "rewards/rejected": 1.3419418334960938, "step": 7494 }, { "epoch": 1.22, "learning_rate": 6.109126564850877e-07, "logits/chosen": -0.06581160426139832, "logits/rejected": -0.06581160426139832, "logps/chosen": -53.94505310058594, "logps/rejected": -53.94505310058594, "loss": 0.6111, "rewards/accuracies": 0.0, "rewards/chosen": 0.2598831355571747, "rewards/margins": 0.0, "rewards/rejected": 0.2598831355571747, "step": 7495 }, { "epoch": 1.22, "learning_rate": 6.107845015364053e-07, "logits/chosen": -0.8133583664894104, "logits/rejected": -0.964987576007843, "logps/chosen": -151.626953125, "logps/rejected": -92.57867431640625, "loss": 3.4165, "rewards/accuracies": 0.0, "rewards/chosen": 0.28999024629592896, "rewards/margins": -6.702783107757568, "rewards/rejected": 6.992773532867432, "step": 7496 }, { "epoch": 1.22, "learning_rate": 6.106563389335621e-07, "logits/chosen": -0.4581170380115509, "logits/rejected": -0.4581170380115509, "logps/chosen": -13.227421760559082, "logps/rejected": -13.227421760559082, "loss": 0.61, "rewards/accuracies": 0.0, "rewards/chosen": 0.4799559712409973, "rewards/margins": 0.0, "rewards/rejected": 0.4799559712409973, "step": 7497 }, { "epoch": 1.22, "learning_rate": 6.105281686854128e-07, "logits/chosen": -1.2302274703979492, "logits/rejected": -1.1919959783554077, "logps/chosen": -113.49967956542969, "logps/rejected": -105.65796661376953, "loss": 1.5189, "rewards/accuracies": 0.0, "rewards/chosen": 4.3353166580200195, "rewards/margins": -1.726677417755127, "rewards/rejected": 6.0619940757751465, "step": 7498 }, { "epoch": 1.22, "learning_rate": 6.10399990800813e-07, "logits/chosen": -0.6503502130508423, "logits/rejected": -0.7538923025131226, "logps/chosen": -50.426048278808594, "logps/rejected": -84.2416000366211, "loss": 1.3632, "rewards/accuracies": 0.0, "rewards/chosen": 2.129662275314331, "rewards/margins": -2.533276319503784, "rewards/rejected": 4.662938594818115, "step": 7499 }, { "epoch": 1.22, "learning_rate": 6.102718052886183e-07, "logits/chosen": -0.7018617987632751, "logits/rejected": -0.66705721616745, "logps/chosen": -75.82342529296875, "logps/rejected": -46.55997085571289, "loss": 0.4136, "rewards/accuracies": 1.0, "rewards/chosen": 1.027398705482483, "rewards/margins": 0.13159221410751343, "rewards/rejected": 0.8958064913749695, "step": 7500 }, { "epoch": 1.22, "learning_rate": 6.101436121576854e-07, "logits/chosen": -0.5893861055374146, "logits/rejected": -0.45669928193092346, "logps/chosen": -92.25365447998047, "logps/rejected": -44.76313018798828, "loss": 0.11, "rewards/accuracies": 1.0, "rewards/chosen": 5.420639991760254, "rewards/margins": 4.265323638916016, "rewards/rejected": 1.1553162336349487, "step": 7501 }, { "epoch": 1.22, "learning_rate": 6.10015411416871e-07, "logits/chosen": -0.03102823719382286, "logits/rejected": -0.030347388237714767, "logps/chosen": -4.17862606048584, "logps/rejected": -11.194685935974121, "loss": 0.626, "rewards/accuracies": 1.0, "rewards/chosen": 0.22836999595165253, "rewards/margins": 0.2896369993686676, "rewards/rejected": -0.06126699596643448, "step": 7502 }, { "epoch": 1.22, "learning_rate": 6.098872030750328e-07, "logits/chosen": -0.5175407528877258, "logits/rejected": -0.45370417833328247, "logps/chosen": -31.464834213256836, "logps/rejected": -7.333762168884277, "loss": 0.2617, "rewards/accuracies": 1.0, "rewards/chosen": 0.6160807013511658, "rewards/margins": 0.400684118270874, "rewards/rejected": 0.21539659798145294, "step": 7503 }, { "epoch": 1.22, "learning_rate": 6.097589871410285e-07, "logits/chosen": -0.738989531993866, "logits/rejected": -0.6213770508766174, "logps/chosen": -103.16110229492188, "logps/rejected": -35.258113861083984, "loss": 0.1954, "rewards/accuracies": 1.0, "rewards/chosen": 1.914319634437561, "rewards/margins": 1.8714855909347534, "rewards/rejected": 0.0428340919315815, "step": 7504 }, { "epoch": 1.22, "learning_rate": 6.096307636237167e-07, "logits/chosen": -0.7921934127807617, "logits/rejected": -0.8875738978385925, "logps/chosen": -97.85460662841797, "logps/rejected": -84.40640258789062, "loss": 0.5172, "rewards/accuracies": 0.0, "rewards/chosen": 3.9720466136932373, "rewards/margins": -0.4743340015411377, "rewards/rejected": 4.446380615234375, "step": 7505 }, { "epoch": 1.22, "learning_rate": 6.095025325319565e-07, "logits/chosen": -0.6306502819061279, "logits/rejected": -0.5860051512718201, "logps/chosen": -123.24211120605469, "logps/rejected": -44.95228576660156, "loss": 0.6288, "rewards/accuracies": 0.0, "rewards/chosen": 0.5505142211914062, "rewards/margins": -0.6810940504074097, "rewards/rejected": 1.231608271598816, "step": 7506 }, { "epoch": 1.22, "learning_rate": 6.093742938746074e-07, "logits/chosen": -0.8444074392318726, "logits/rejected": -0.554470956325531, "logps/chosen": -137.36988830566406, "logps/rejected": -30.97049331665039, "loss": 0.8599, "rewards/accuracies": 1.0, "rewards/chosen": 4.164341926574707, "rewards/margins": 3.7171640396118164, "rewards/rejected": 0.4471778869628906, "step": 7507 }, { "epoch": 1.22, "learning_rate": 6.092460476605295e-07, "logits/chosen": -0.9088843464851379, "logits/rejected": -0.8648272156715393, "logps/chosen": -58.1025505065918, "logps/rejected": -80.20558166503906, "loss": 0.3012, "rewards/accuracies": 1.0, "rewards/chosen": 2.54056978225708, "rewards/margins": 0.30037879943847656, "rewards/rejected": 2.2401909828186035, "step": 7508 }, { "epoch": 1.22, "learning_rate": 6.091177938985836e-07, "logits/chosen": -0.6839550733566284, "logits/rejected": -0.6279312968254089, "logps/chosen": -84.4430160522461, "logps/rejected": -101.04308319091797, "loss": 0.1551, "rewards/accuracies": 1.0, "rewards/chosen": 2.209686279296875, "rewards/margins": 1.5559325218200684, "rewards/rejected": 0.6537536978721619, "step": 7509 }, { "epoch": 1.22, "learning_rate": 6.089895325976304e-07, "logits/chosen": -0.8558642864227295, "logits/rejected": -0.6778804659843445, "logps/chosen": -118.10823822021484, "logps/rejected": -55.36664581298828, "loss": 0.9608, "rewards/accuracies": 0.0, "rewards/chosen": 2.651059865951538, "rewards/margins": -0.02435445785522461, "rewards/rejected": 2.6754143238067627, "step": 7510 }, { "epoch": 1.22, "learning_rate": 6.088612637665319e-07, "logits/chosen": -1.1606011390686035, "logits/rejected": -1.1455273628234863, "logps/chosen": -114.10394287109375, "logps/rejected": -40.83698272705078, "loss": 1.0448, "rewards/accuracies": 1.0, "rewards/chosen": 1.548309326171875, "rewards/margins": 1.3315712213516235, "rewards/rejected": 0.21673813462257385, "step": 7511 }, { "epoch": 1.22, "learning_rate": 6.0873298741415e-07, "logits/chosen": -1.1403100490570068, "logits/rejected": -1.1292964220046997, "logps/chosen": -79.15265655517578, "logps/rejected": -83.78924560546875, "loss": 0.4903, "rewards/accuracies": 0.0, "rewards/chosen": 1.984893798828125, "rewards/margins": -0.4507484436035156, "rewards/rejected": 2.4356422424316406, "step": 7512 }, { "epoch": 1.22, "learning_rate": 6.086047035493476e-07, "logits/chosen": -0.7543431520462036, "logits/rejected": -0.6919186115264893, "logps/chosen": -56.06016540527344, "logps/rejected": -12.934212684631348, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": 1.399176001548767, "rewards/margins": 0.7403180003166199, "rewards/rejected": 0.6588580012321472, "step": 7513 }, { "epoch": 1.22, "learning_rate": 6.084764121809878e-07, "logits/chosen": -0.3141167461872101, "logits/rejected": -0.25855565071105957, "logps/chosen": -232.43508911132812, "logps/rejected": -42.87605285644531, "loss": 0.1016, "rewards/accuracies": 1.0, "rewards/chosen": 4.0016937255859375, "rewards/margins": 1.820399522781372, "rewards/rejected": 2.1812942028045654, "step": 7514 }, { "epoch": 1.22, "learning_rate": 6.083481133179343e-07, "logits/chosen": -0.5260840058326721, "logits/rejected": -0.45281869173049927, "logps/chosen": -98.47142028808594, "logps/rejected": -81.18269348144531, "loss": 0.2482, "rewards/accuracies": 1.0, "rewards/chosen": 2.0914361476898193, "rewards/margins": 1.5043420791625977, "rewards/rejected": 0.5870941281318665, "step": 7515 }, { "epoch": 1.22, "learning_rate": 6.082198069690514e-07, "logits/chosen": -0.9302684664726257, "logits/rejected": -0.6842821836471558, "logps/chosen": -123.1212158203125, "logps/rejected": -70.56443786621094, "loss": 0.0763, "rewards/accuracies": 1.0, "rewards/chosen": 4.953044414520264, "rewards/margins": 2.448025703430176, "rewards/rejected": 2.505018711090088, "step": 7516 }, { "epoch": 1.22, "learning_rate": 6.080914931432038e-07, "logits/chosen": -0.6433915495872498, "logits/rejected": -0.7173696160316467, "logps/chosen": -97.67835235595703, "logps/rejected": -139.11148071289062, "loss": 2.0675, "rewards/accuracies": 0.0, "rewards/chosen": 2.132319688796997, "rewards/margins": -2.7892563343048096, "rewards/rejected": 4.921576023101807, "step": 7517 }, { "epoch": 1.22, "learning_rate": 6.079631718492568e-07, "logits/chosen": -1.1751954555511475, "logits/rejected": -1.1234636306762695, "logps/chosen": -113.4638442993164, "logps/rejected": -96.66189575195312, "loss": 0.9929, "rewards/accuracies": 0.0, "rewards/chosen": 0.5383872985839844, "rewards/margins": -1.7039992809295654, "rewards/rejected": 2.24238657951355, "step": 7518 }, { "epoch": 1.22, "learning_rate": 6.078348430960762e-07, "logits/chosen": -0.7151083946228027, "logits/rejected": -0.5393413305282593, "logps/chosen": -80.681640625, "logps/rejected": -73.73345947265625, "loss": 0.7592, "rewards/accuracies": 0.0, "rewards/chosen": 2.9313888549804688, "rewards/margins": -1.2484207153320312, "rewards/rejected": 4.1798095703125, "step": 7519 }, { "epoch": 1.22, "learning_rate": 6.077065068925284e-07, "logits/chosen": -0.6613430380821228, "logits/rejected": -0.6396116614341736, "logps/chosen": -61.641517639160156, "logps/rejected": -77.712646484375, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570518970489502, "rewards/margins": 0.468439519405365, "rewards/rejected": 0.5886123776435852, "step": 7520 }, { "epoch": 1.22, "learning_rate": 6.075781632474799e-07, "logits/chosen": -0.16394652426242828, "logits/rejected": -0.16394652426242828, "logps/chosen": -0.9973649382591248, "logps/rejected": -0.9973649382591248, "loss": 0.7946, "rewards/accuracies": 0.0, "rewards/chosen": 0.4408535659313202, "rewards/margins": 0.0, "rewards/rejected": 0.4408535659313202, "step": 7521 }, { "epoch": 1.22, "learning_rate": 6.074498121697983e-07, "logits/chosen": -0.7046693563461304, "logits/rejected": -0.7133995890617371, "logps/chosen": -73.68548583984375, "logps/rejected": -67.9243392944336, "loss": 0.7751, "rewards/accuracies": 1.0, "rewards/chosen": 2.2760980129241943, "rewards/margins": 0.32352912425994873, "rewards/rejected": 1.9525688886642456, "step": 7522 }, { "epoch": 1.22, "learning_rate": 6.073214536683514e-07, "logits/chosen": -0.41945457458496094, "logits/rejected": -0.4555087387561798, "logps/chosen": -58.21939468383789, "logps/rejected": -103.7112808227539, "loss": 1.0899, "rewards/accuracies": 0.0, "rewards/chosen": 2.48203706741333, "rewards/margins": -1.851273536682129, "rewards/rejected": 4.333310604095459, "step": 7523 }, { "epoch": 1.22, "learning_rate": 6.071930877520075e-07, "logits/chosen": -0.44219231605529785, "logits/rejected": -0.4636910855770111, "logps/chosen": -124.92633819580078, "logps/rejected": -57.723445892333984, "loss": 0.6265, "rewards/accuracies": 0.0, "rewards/chosen": 1.2709617614746094, "rewards/margins": -0.7724781036376953, "rewards/rejected": 2.0434398651123047, "step": 7524 }, { "epoch": 1.22, "learning_rate": 6.070647144296355e-07, "logits/chosen": -0.6755349636077881, "logits/rejected": -0.711144208908081, "logps/chosen": -98.60417175292969, "logps/rejected": -90.5303726196289, "loss": 2.0785, "rewards/accuracies": 0.0, "rewards/chosen": 0.6306930780410767, "rewards/margins": -4.119669437408447, "rewards/rejected": 4.750362396240234, "step": 7525 }, { "epoch": 1.22, "learning_rate": 6.069363337101049e-07, "logits/chosen": -0.7171348929405212, "logits/rejected": -0.6625332832336426, "logps/chosen": -67.17291259765625, "logps/rejected": -68.77243041992188, "loss": 0.368, "rewards/accuracies": 1.0, "rewards/chosen": 1.7824653387069702, "rewards/margins": 0.2203742265701294, "rewards/rejected": 1.5620911121368408, "step": 7526 }, { "epoch": 1.22, "learning_rate": 6.068079456022854e-07, "logits/chosen": -0.5024195909500122, "logits/rejected": -0.49778512120246887, "logps/chosen": -3.147214889526367, "logps/rejected": -32.3597297668457, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 0.5229166150093079, "rewards/margins": 0.555964469909668, "rewards/rejected": -0.033047866076231, "step": 7527 }, { "epoch": 1.22, "learning_rate": 6.066795501150476e-07, "logits/chosen": -0.2133961170911789, "logits/rejected": -0.21716280281543732, "logps/chosen": -27.035491943359375, "logps/rejected": -20.82610321044922, "loss": 0.8819, "rewards/accuracies": 0.0, "rewards/chosen": -0.08449440449476242, "rewards/margins": -0.41045382618904114, "rewards/rejected": 0.3259594142436981, "step": 7528 }, { "epoch": 1.22, "learning_rate": 6.065511472572621e-07, "logits/chosen": -0.5847747921943665, "logits/rejected": -0.48911356925964355, "logps/chosen": -158.05307006835938, "logps/rejected": -119.38676452636719, "loss": 1.0473, "rewards/accuracies": 1.0, "rewards/chosen": 1.0551300048828125, "rewards/margins": 0.5787986516952515, "rewards/rejected": 0.47633132338523865, "step": 7529 }, { "epoch": 1.22, "learning_rate": 6.064227370378007e-07, "logits/chosen": -0.9007891416549683, "logits/rejected": -0.8362921476364136, "logps/chosen": -132.7018280029297, "logps/rejected": -156.73275756835938, "loss": 2.2554, "rewards/accuracies": 0.0, "rewards/chosen": 1.842341661453247, "rewards/margins": -4.4744462966918945, "rewards/rejected": 6.3167877197265625, "step": 7530 }, { "epoch": 1.22, "learning_rate": 6.06294319465535e-07, "logits/chosen": -1.003833532333374, "logits/rejected": -0.9339314699172974, "logps/chosen": -85.58425903320312, "logps/rejected": -34.58974838256836, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 1.4697456359863281, "rewards/margins": 0.49720650911331177, "rewards/rejected": 0.9725391268730164, "step": 7531 }, { "epoch": 1.22, "learning_rate": 6.061658945493377e-07, "logits/chosen": -0.9419706463813782, "logits/rejected": -0.8353786468505859, "logps/chosen": -78.04269409179688, "logps/rejected": -46.131919860839844, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": 2.471724033355713, "rewards/margins": 1.6092991828918457, "rewards/rejected": 0.8624248504638672, "step": 7532 }, { "epoch": 1.22, "learning_rate": 6.060374622980815e-07, "logits/chosen": -0.6300513744354248, "logits/rejected": -0.5877277851104736, "logps/chosen": -96.41106414794922, "logps/rejected": -62.67829895019531, "loss": 1.1599, "rewards/accuracies": 0.0, "rewards/chosen": 1.2303704023361206, "rewards/margins": -0.040814876556396484, "rewards/rejected": 1.271185278892517, "step": 7533 }, { "epoch": 1.22, "learning_rate": 6.059090227206402e-07, "logits/chosen": -0.47105497121810913, "logits/rejected": -0.5134523510932922, "logps/chosen": -63.02788543701172, "logps/rejected": -90.78491973876953, "loss": 0.7023, "rewards/accuracies": 1.0, "rewards/chosen": 0.9887108206748962, "rewards/margins": 0.19962769746780396, "rewards/rejected": 0.7890831232070923, "step": 7534 }, { "epoch": 1.22, "learning_rate": 6.057805758258873e-07, "logits/chosen": -0.8974418044090271, "logits/rejected": -0.9056470394134521, "logps/chosen": -208.2754669189453, "logps/rejected": -91.558837890625, "loss": 0.3239, "rewards/accuracies": 1.0, "rewards/chosen": 4.328733921051025, "rewards/margins": 0.24781036376953125, "rewards/rejected": 4.080923557281494, "step": 7535 }, { "epoch": 1.22, "learning_rate": 6.056521216226977e-07, "logits/chosen": -0.7701082825660706, "logits/rejected": -0.7950635552406311, "logps/chosen": -63.068424224853516, "logps/rejected": -79.009033203125, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 2.8725643157958984, "rewards/margins": 1.1488124132156372, "rewards/rejected": 1.7237519025802612, "step": 7536 }, { "epoch": 1.22, "learning_rate": 6.055236601199461e-07, "logits/chosen": -0.9363085031509399, "logits/rejected": -0.9408221244812012, "logps/chosen": -102.51959228515625, "logps/rejected": -92.46831512451172, "loss": 1.1765, "rewards/accuracies": 1.0, "rewards/chosen": 3.9052155017852783, "rewards/margins": 0.14053106307983398, "rewards/rejected": 3.7646844387054443, "step": 7537 }, { "epoch": 1.22, "learning_rate": 6.053951913265082e-07, "logits/chosen": -0.5547675490379333, "logits/rejected": -0.5796497464179993, "logps/chosen": -63.494503021240234, "logps/rejected": -118.34757995605469, "loss": 0.5885, "rewards/accuracies": 0.0, "rewards/chosen": 1.3060535192489624, "rewards/margins": -0.17129182815551758, "rewards/rejected": 1.47734534740448, "step": 7538 }, { "epoch": 1.22, "learning_rate": 6.052667152512597e-07, "logits/chosen": -0.7102269530296326, "logits/rejected": -0.6682708263397217, "logps/chosen": -57.699729919433594, "logps/rejected": -74.29092407226562, "loss": 0.4529, "rewards/accuracies": 1.0, "rewards/chosen": 2.3876380920410156, "rewards/margins": 0.5720908641815186, "rewards/rejected": 1.815547227859497, "step": 7539 }, { "epoch": 1.22, "learning_rate": 6.051382319030772e-07, "logits/chosen": -0.9115849733352661, "logits/rejected": -1.0244412422180176, "logps/chosen": -145.5864715576172, "logps/rejected": -169.9048309326172, "loss": 1.2375, "rewards/accuracies": 0.0, "rewards/chosen": 4.5173845291137695, "rewards/margins": -1.2578887939453125, "rewards/rejected": 5.775273323059082, "step": 7540 }, { "epoch": 1.22, "learning_rate": 6.050097412908378e-07, "logits/chosen": -0.6958727240562439, "logits/rejected": -0.6544294953346252, "logps/chosen": -67.76510620117188, "logps/rejected": -34.413604736328125, "loss": 0.1168, "rewards/accuracies": 1.0, "rewards/chosen": 3.1172637939453125, "rewards/margins": 1.55738365650177, "rewards/rejected": 1.5598801374435425, "step": 7541 }, { "epoch": 1.22, "learning_rate": 6.048812434234188e-07, "logits/chosen": -0.5858427882194519, "logits/rejected": -0.5509633421897888, "logps/chosen": -69.59852600097656, "logps/rejected": -95.88978576660156, "loss": 0.8055, "rewards/accuracies": 0.0, "rewards/chosen": 1.8825409412384033, "rewards/margins": -1.3716628551483154, "rewards/rejected": 3.2542037963867188, "step": 7542 }, { "epoch": 1.22, "learning_rate": 6.047527383096983e-07, "logits/chosen": -0.6246113181114197, "logits/rejected": -0.5353289842605591, "logps/chosen": -48.200660705566406, "logps/rejected": -39.1225471496582, "loss": 0.1241, "rewards/accuracies": 1.0, "rewards/chosen": 2.513822317123413, "rewards/margins": 1.349035382270813, "rewards/rejected": 1.1647869348526, "step": 7543 }, { "epoch": 1.22, "learning_rate": 6.046242259585548e-07, "logits/chosen": -0.6731961369514465, "logits/rejected": -0.5321416258811951, "logps/chosen": -106.07342529296875, "logps/rejected": -23.25482749938965, "loss": 0.0451, "rewards/accuracies": 1.0, "rewards/chosen": 3.02567458152771, "rewards/margins": 2.49338436126709, "rewards/rejected": 0.5322901010513306, "step": 7544 }, { "epoch": 1.22, "learning_rate": 6.044957063788672e-07, "logits/chosen": -0.5531153678894043, "logits/rejected": -0.5091052651405334, "logps/chosen": -27.856460571289062, "logps/rejected": -43.30483627319336, "loss": 1.136, "rewards/accuracies": 0.0, "rewards/chosen": 0.6930786371231079, "rewards/margins": -0.7157547473907471, "rewards/rejected": 1.408833384513855, "step": 7545 }, { "epoch": 1.22, "learning_rate": 6.043671795795151e-07, "logits/chosen": -1.0115875005722046, "logits/rejected": -1.003610610961914, "logps/chosen": -127.44953918457031, "logps/rejected": -85.62690734863281, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 4.407392978668213, "rewards/margins": 2.3859810829162598, "rewards/rejected": 2.021411895751953, "step": 7546 }, { "epoch": 1.22, "learning_rate": 6.042386455693784e-07, "logits/chosen": -0.5757662057876587, "logits/rejected": -0.5810710787773132, "logps/chosen": -96.53959655761719, "logps/rejected": -179.73104858398438, "loss": 0.9898, "rewards/accuracies": 0.0, "rewards/chosen": -0.02282562293112278, "rewards/margins": -1.049835205078125, "rewards/rejected": 1.0270096063613892, "step": 7547 }, { "epoch": 1.23, "learning_rate": 6.041101043573376e-07, "logits/chosen": -0.47630104422569275, "logits/rejected": -0.5127701759338379, "logps/chosen": -62.94434356689453, "logps/rejected": -54.070953369140625, "loss": 2.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.387014865875244, "rewards/margins": 1.0001527070999146, "rewards/rejected": 1.3868621587753296, "step": 7548 }, { "epoch": 1.23, "learning_rate": 6.039815559522738e-07, "logits/chosen": -0.7519941926002502, "logits/rejected": -0.6642632484436035, "logps/chosen": -78.06010437011719, "logps/rejected": -137.2873992919922, "loss": 0.9951, "rewards/accuracies": 1.0, "rewards/chosen": 1.9710266590118408, "rewards/margins": 1.3234878778457642, "rewards/rejected": 0.6475387811660767, "step": 7549 }, { "epoch": 1.23, "learning_rate": 6.038530003630682e-07, "logits/chosen": -0.8666396141052246, "logits/rejected": -0.8666396141052246, "logps/chosen": -123.85340118408203, "logps/rejected": -123.85340118408203, "loss": 0.6163, "rewards/accuracies": 0.0, "rewards/chosen": 1.6881736516952515, "rewards/margins": 0.0, "rewards/rejected": 1.6881736516952515, "step": 7550 }, { "epoch": 1.23, "learning_rate": 6.037244375986029e-07, "logits/chosen": -0.632628858089447, "logits/rejected": -0.6472000479698181, "logps/chosen": -61.27677917480469, "logps/rejected": -84.6693344116211, "loss": 1.2901, "rewards/accuracies": 0.0, "rewards/chosen": 2.322889804840088, "rewards/margins": -0.031178951263427734, "rewards/rejected": 2.3540687561035156, "step": 7551 }, { "epoch": 1.23, "learning_rate": 6.035958676677606e-07, "logits/chosen": -0.5013307929039001, "logits/rejected": -0.4986068606376648, "logps/chosen": -36.0695915222168, "logps/rejected": -50.608436584472656, "loss": 0.8225, "rewards/accuracies": 1.0, "rewards/chosen": 2.221231460571289, "rewards/margins": 0.782973051071167, "rewards/rejected": 1.438258409500122, "step": 7552 }, { "epoch": 1.23, "learning_rate": 6.03467290579424e-07, "logits/chosen": -0.3186706006526947, "logits/rejected": -0.27000102400779724, "logps/chosen": -55.73170471191406, "logps/rejected": -54.189910888671875, "loss": 0.4202, "rewards/accuracies": 1.0, "rewards/chosen": 1.6962631940841675, "rewards/margins": 1.0957410335540771, "rewards/rejected": 0.6005222201347351, "step": 7553 }, { "epoch": 1.23, "learning_rate": 6.033387063424764e-07, "logits/chosen": -0.7314664125442505, "logits/rejected": -0.7535794973373413, "logps/chosen": -187.73745727539062, "logps/rejected": -72.5884017944336, "loss": 0.7668, "rewards/accuracies": 0.0, "rewards/chosen": 0.13327331840991974, "rewards/margins": -1.1982231140136719, "rewards/rejected": 1.3314964771270752, "step": 7554 }, { "epoch": 1.23, "learning_rate": 6.032101149658022e-07, "logits/chosen": -0.19496430456638336, "logits/rejected": -0.19496430456638336, "logps/chosen": -6.514399528503418, "logps/rejected": -6.514399528503418, "loss": 0.7806, "rewards/accuracies": 0.0, "rewards/chosen": 0.22490988671779633, "rewards/margins": 0.0, "rewards/rejected": 0.22490988671779633, "step": 7555 }, { "epoch": 1.23, "learning_rate": 6.030815164582857e-07, "logits/chosen": -0.3956778049468994, "logits/rejected": -0.3872555196285248, "logps/chosen": -76.89530944824219, "logps/rejected": -110.24037170410156, "loss": 0.3081, "rewards/accuracies": 1.0, "rewards/chosen": 1.5883225202560425, "rewards/margins": 0.440828800201416, "rewards/rejected": 1.1474937200546265, "step": 7556 }, { "epoch": 1.23, "learning_rate": 6.029529108288117e-07, "logits/chosen": -0.5388091206550598, "logits/rejected": -0.5339640378952026, "logps/chosen": -1.838883638381958, "logps/rejected": -3.1060807704925537, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 0.3563166558742523, "rewards/margins": 0.05569338798522949, "rewards/rejected": 0.3006232678890228, "step": 7557 }, { "epoch": 1.23, "learning_rate": 6.028242980862658e-07, "logits/chosen": -0.1028057113289833, "logits/rejected": -0.1028057113289833, "logps/chosen": -20.946510314941406, "logps/rejected": -20.946510314941406, "loss": 0.4396, "rewards/accuracies": 0.0, "rewards/chosen": 0.23594951629638672, "rewards/margins": 0.0, "rewards/rejected": 0.23594951629638672, "step": 7558 }, { "epoch": 1.23, "learning_rate": 6.026956782395337e-07, "logits/chosen": -0.9173133969306946, "logits/rejected": -0.947727382183075, "logps/chosen": -50.421295166015625, "logps/rejected": -74.19830322265625, "loss": 0.8596, "rewards/accuracies": 0.0, "rewards/chosen": 2.8365638256073, "rewards/margins": -0.41310977935791016, "rewards/rejected": 3.24967360496521, "step": 7559 }, { "epoch": 1.23, "learning_rate": 6.025670512975021e-07, "logits/chosen": -0.6894835829734802, "logits/rejected": -0.6789222955703735, "logps/chosen": -89.15681457519531, "logps/rejected": -88.31331634521484, "loss": 0.7425, "rewards/accuracies": 0.0, "rewards/chosen": 1.171015977859497, "rewards/margins": -0.4139503240585327, "rewards/rejected": 1.5849663019180298, "step": 7560 }, { "epoch": 1.23, "learning_rate": 6.024384172690576e-07, "logits/chosen": -0.8038697838783264, "logits/rejected": -0.7498301267623901, "logps/chosen": -84.12782287597656, "logps/rejected": -72.31864929199219, "loss": 0.305, "rewards/accuracies": 1.0, "rewards/chosen": 1.8652619123458862, "rewards/margins": 1.659925103187561, "rewards/rejected": 0.2053367644548416, "step": 7561 }, { "epoch": 1.23, "learning_rate": 6.023097761630879e-07, "logits/chosen": -0.3638553023338318, "logits/rejected": -0.3814888596534729, "logps/chosen": -51.3780403137207, "logps/rejected": -32.58926773071289, "loss": 0.6078, "rewards/accuracies": 0.0, "rewards/chosen": 0.43439409136772156, "rewards/margins": -0.5513759851455688, "rewards/rejected": 0.985770046710968, "step": 7562 }, { "epoch": 1.23, "learning_rate": 6.021811279884806e-07, "logits/chosen": -0.6595919132232666, "logits/rejected": -0.5088199973106384, "logps/chosen": -87.36837768554688, "logps/rejected": -35.20884704589844, "loss": 0.1109, "rewards/accuracies": 1.0, "rewards/chosen": 3.9010589122772217, "rewards/margins": 2.507793426513672, "rewards/rejected": 1.3932656049728394, "step": 7563 }, { "epoch": 1.23, "learning_rate": 6.020524727541243e-07, "logits/chosen": -0.4056621193885803, "logits/rejected": -0.3887590765953064, "logps/chosen": -24.96137237548828, "logps/rejected": -10.72143840789795, "loss": 0.8947, "rewards/accuracies": 1.0, "rewards/chosen": 0.42218017578125, "rewards/margins": 0.031758397817611694, "rewards/rejected": 0.3904217779636383, "step": 7564 }, { "epoch": 1.23, "learning_rate": 6.019238104689077e-07, "logits/chosen": -1.0892548561096191, "logits/rejected": -0.9706575274467468, "logps/chosen": -67.61016845703125, "logps/rejected": -70.44116973876953, "loss": 0.4992, "rewards/accuracies": 1.0, "rewards/chosen": 1.963568091392517, "rewards/margins": 0.0499420166015625, "rewards/rejected": 1.9136260747909546, "step": 7565 }, { "epoch": 1.23, "learning_rate": 6.017951411417202e-07, "logits/chosen": -0.6405752897262573, "logits/rejected": -0.6397968530654907, "logps/chosen": -51.262367248535156, "logps/rejected": -41.30189514160156, "loss": 0.4705, "rewards/accuracies": 0.0, "rewards/chosen": 0.7629494071006775, "rewards/margins": -0.07832527160644531, "rewards/rejected": 0.8412746787071228, "step": 7566 }, { "epoch": 1.23, "learning_rate": 6.016664647814516e-07, "logits/chosen": -0.38513991236686707, "logits/rejected": -0.38513991236686707, "logps/chosen": -91.65292358398438, "logps/rejected": -91.65292358398438, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 2.1865906715393066, "rewards/margins": 0.0, "rewards/rejected": 2.1865906715393066, "step": 7567 }, { "epoch": 1.23, "learning_rate": 6.015377813969924e-07, "logits/chosen": -0.6668921709060669, "logits/rejected": -0.6895819902420044, "logps/chosen": -83.21502685546875, "logps/rejected": -76.62557220458984, "loss": 0.6336, "rewards/accuracies": 1.0, "rewards/chosen": 1.8292617797851562, "rewards/margins": 0.07511365413665771, "rewards/rejected": 1.7541481256484985, "step": 7568 }, { "epoch": 1.23, "learning_rate": 6.014090909972333e-07, "logits/chosen": -0.7979509234428406, "logits/rejected": -0.7266055941581726, "logps/chosen": -45.75649642944336, "logps/rejected": -99.7413101196289, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": 2.340005874633789, "rewards/margins": 0.9542865753173828, "rewards/rejected": 1.3857192993164062, "step": 7569 }, { "epoch": 1.23, "learning_rate": 6.012803935910654e-07, "logits/chosen": -0.4531032145023346, "logits/rejected": -0.4531032145023346, "logps/chosen": -91.083740234375, "logps/rejected": -91.083740234375, "loss": 0.4851, "rewards/accuracies": 0.0, "rewards/chosen": 1.9775062799453735, "rewards/margins": 0.0, "rewards/rejected": 1.9775062799453735, "step": 7570 }, { "epoch": 1.23, "learning_rate": 6.011516891873807e-07, "logits/chosen": -0.7041233777999878, "logits/rejected": -0.7242213487625122, "logps/chosen": -65.07583618164062, "logps/rejected": -109.42398834228516, "loss": 0.5606, "rewards/accuracies": 0.0, "rewards/chosen": 0.872729480266571, "rewards/margins": -0.11354905366897583, "rewards/rejected": 0.9862785339355469, "step": 7571 }, { "epoch": 1.23, "learning_rate": 6.010229777950713e-07, "logits/chosen": -0.2876839339733124, "logits/rejected": -0.09766540676355362, "logps/chosen": -48.986183166503906, "logps/rejected": -14.774238586425781, "loss": 0.1681, "rewards/accuracies": 1.0, "rewards/chosen": 2.0475432872772217, "rewards/margins": 0.9643868207931519, "rewards/rejected": 1.0831564664840698, "step": 7572 }, { "epoch": 1.23, "learning_rate": 6.008942594230301e-07, "logits/chosen": -0.6528134942054749, "logits/rejected": -0.6479770541191101, "logps/chosen": -77.26823425292969, "logps/rejected": -62.44773864746094, "loss": 0.8805, "rewards/accuracies": 0.0, "rewards/chosen": 0.9041687250137329, "rewards/margins": -0.9048134088516235, "rewards/rejected": 1.8089821338653564, "step": 7573 }, { "epoch": 1.23, "learning_rate": 6.007655340801503e-07, "logits/chosen": -0.37463054060935974, "logits/rejected": -0.36203813552856445, "logps/chosen": -72.94306945800781, "logps/rejected": -69.63710021972656, "loss": 0.5341, "rewards/accuracies": 1.0, "rewards/chosen": 2.1499106884002686, "rewards/margins": 1.0755103826522827, "rewards/rejected": 1.0744003057479858, "step": 7574 }, { "epoch": 1.23, "learning_rate": 6.006368017753255e-07, "logits/chosen": -0.5461795330047607, "logits/rejected": -0.59432053565979, "logps/chosen": -92.5054931640625, "logps/rejected": -98.73822021484375, "loss": 1.0099, "rewards/accuracies": 0.0, "rewards/chosen": 0.30610352754592896, "rewards/margins": -1.5929458141326904, "rewards/rejected": 1.8990494012832642, "step": 7575 }, { "epoch": 1.23, "learning_rate": 6.005080625174499e-07, "logits/chosen": -0.7332572340965271, "logits/rejected": -1.1266510486602783, "logps/chosen": -57.26826858520508, "logps/rejected": -56.02783966064453, "loss": 1.3402, "rewards/accuracies": 1.0, "rewards/chosen": 0.6766323447227478, "rewards/margins": 0.4453423023223877, "rewards/rejected": 0.2312900573015213, "step": 7576 }, { "epoch": 1.23, "learning_rate": 6.003793163154183e-07, "logits/chosen": -0.26289016008377075, "logits/rejected": -0.2579643130302429, "logps/chosen": -55.0123291015625, "logps/rejected": -42.92219924926758, "loss": 0.84, "rewards/accuracies": 0.0, "rewards/chosen": 0.9762298464775085, "rewards/margins": -0.35644763708114624, "rewards/rejected": 1.3326774835586548, "step": 7577 }, { "epoch": 1.23, "learning_rate": 6.002505631781257e-07, "logits/chosen": -0.9515335559844971, "logits/rejected": -0.8514811992645264, "logps/chosen": -66.34675598144531, "logps/rejected": -44.64625549316406, "loss": 1.0007, "rewards/accuracies": 1.0, "rewards/chosen": 2.9200599193573, "rewards/margins": 1.7154872417449951, "rewards/rejected": 1.2045726776123047, "step": 7578 }, { "epoch": 1.23, "learning_rate": 6.001218031144676e-07, "logits/chosen": -1.0299086570739746, "logits/rejected": -0.8675430417060852, "logps/chosen": -95.62767791748047, "logps/rejected": -82.93849182128906, "loss": 2.2297, "rewards/accuracies": 0.0, "rewards/chosen": 1.7209084033966064, "rewards/margins": -3.210486650466919, "rewards/rejected": 4.931395053863525, "step": 7579 }, { "epoch": 1.23, "learning_rate": 5.999930361333404e-07, "logits/chosen": -0.6935794949531555, "logits/rejected": -0.6074458360671997, "logps/chosen": -83.92916107177734, "logps/rejected": -75.30580139160156, "loss": 0.7112, "rewards/accuracies": 0.0, "rewards/chosen": 1.404077172279358, "rewards/margins": -0.5895698070526123, "rewards/rejected": 1.9936469793319702, "step": 7580 }, { "epoch": 1.23, "learning_rate": 5.998642622436405e-07, "logits/chosen": -0.36593371629714966, "logits/rejected": -0.36593371629714966, "logps/chosen": -65.51754760742188, "logps/rejected": -65.51754760742188, "loss": 0.3701, "rewards/accuracies": 0.0, "rewards/chosen": 1.7830231189727783, "rewards/margins": 0.0, "rewards/rejected": 1.7830231189727783, "step": 7581 }, { "epoch": 1.23, "learning_rate": 5.997354814542648e-07, "logits/chosen": -0.5592461228370667, "logits/rejected": -0.6645612716674805, "logps/chosen": -127.10531616210938, "logps/rejected": -90.91764831542969, "loss": 1.6476, "rewards/accuracies": 0.0, "rewards/chosen": 0.32597658038139343, "rewards/margins": -2.300694227218628, "rewards/rejected": 2.6266708374023438, "step": 7582 }, { "epoch": 1.23, "learning_rate": 5.996066937741113e-07, "logits/chosen": -0.6705262064933777, "logits/rejected": -0.6694859862327576, "logps/chosen": -54.626014709472656, "logps/rejected": -68.66246795654297, "loss": 0.8668, "rewards/accuracies": 0.0, "rewards/chosen": 0.9676399230957031, "rewards/margins": -0.322723388671875, "rewards/rejected": 1.2903633117675781, "step": 7583 }, { "epoch": 1.23, "learning_rate": 5.994778992120778e-07, "logits/chosen": -0.539257287979126, "logits/rejected": -0.5485881567001343, "logps/chosen": -34.89579772949219, "logps/rejected": -38.81648635864258, "loss": 0.8225, "rewards/accuracies": 0.0, "rewards/chosen": 0.2223663330078125, "rewards/margins": -0.4348106384277344, "rewards/rejected": 0.6571769714355469, "step": 7584 }, { "epoch": 1.23, "learning_rate": 5.993490977770626e-07, "logits/chosen": -0.034654825925827026, "logits/rejected": -0.017351776361465454, "logps/chosen": -1.4872788190841675, "logps/rejected": -24.005041122436523, "loss": 0.2796, "rewards/accuracies": 1.0, "rewards/chosen": 0.33727413415908813, "rewards/margins": 0.5972903966903687, "rewards/rejected": -0.2600162625312805, "step": 7585 }, { "epoch": 1.23, "learning_rate": 5.992202894779648e-07, "logits/chosen": -0.491558313369751, "logits/rejected": -0.4765031933784485, "logps/chosen": -51.2435302734375, "logps/rejected": -111.80842590332031, "loss": 0.8689, "rewards/accuracies": 1.0, "rewards/chosen": 1.6419365406036377, "rewards/margins": 0.196746826171875, "rewards/rejected": 1.4451897144317627, "step": 7586 }, { "epoch": 1.23, "learning_rate": 5.990914743236839e-07, "logits/chosen": -0.7502806186676025, "logits/rejected": -0.7920027375221252, "logps/chosen": -63.04125213623047, "logps/rejected": -72.20399475097656, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 1.4397064447402954, "rewards/margins": 0.3291572332382202, "rewards/rejected": 1.1105492115020752, "step": 7587 }, { "epoch": 1.23, "learning_rate": 5.989626523231197e-07, "logits/chosen": -1.1556110382080078, "logits/rejected": -1.1815038919448853, "logps/chosen": -150.07516479492188, "logps/rejected": -86.8311996459961, "loss": 0.2301, "rewards/accuracies": 1.0, "rewards/chosen": 4.829573154449463, "rewards/margins": 1.868656873703003, "rewards/rejected": 2.96091628074646, "step": 7588 }, { "epoch": 1.23, "learning_rate": 5.988338234851726e-07, "logits/chosen": -0.5283203125, "logits/rejected": -0.5395713448524475, "logps/chosen": -85.82928466796875, "logps/rejected": -42.46790313720703, "loss": 1.8972, "rewards/accuracies": 0.0, "rewards/chosen": -0.010781860910356045, "rewards/margins": -1.6791332960128784, "rewards/rejected": 1.668351411819458, "step": 7589 }, { "epoch": 1.23, "learning_rate": 5.987049878187436e-07, "logits/chosen": -0.605207622051239, "logits/rejected": -0.5355259776115417, "logps/chosen": -197.93780517578125, "logps/rejected": -173.143798828125, "loss": 1.1597, "rewards/accuracies": 0.0, "rewards/chosen": 4.276379585266113, "rewards/margins": -1.7407135963439941, "rewards/rejected": 6.017093181610107, "step": 7590 }, { "epoch": 1.23, "learning_rate": 5.985761453327338e-07, "logits/chosen": -0.6806617379188538, "logits/rejected": -0.6391767263412476, "logps/chosen": -54.579742431640625, "logps/rejected": -115.66282653808594, "loss": 1.6815, "rewards/accuracies": 0.0, "rewards/chosen": 2.147045135498047, "rewards/margins": -1.811037540435791, "rewards/rejected": 3.958082675933838, "step": 7591 }, { "epoch": 1.23, "learning_rate": 5.98447296036045e-07, "logits/chosen": -0.5840044021606445, "logits/rejected": -0.476999431848526, "logps/chosen": -70.3991470336914, "logps/rejected": -20.701335906982422, "loss": 0.2499, "rewards/accuracies": 1.0, "rewards/chosen": 1.588067650794983, "rewards/margins": 1.306096076965332, "rewards/rejected": 0.2819715440273285, "step": 7592 }, { "epoch": 1.23, "learning_rate": 5.983184399375795e-07, "logits/chosen": -0.7481334805488586, "logits/rejected": -0.770595133304596, "logps/chosen": -54.77075958251953, "logps/rejected": -48.55509567260742, "loss": 0.8893, "rewards/accuracies": 1.0, "rewards/chosen": 2.261167287826538, "rewards/margins": 0.594653844833374, "rewards/rejected": 1.666513442993164, "step": 7593 }, { "epoch": 1.23, "learning_rate": 5.981895770462404e-07, "logits/chosen": -0.5549883842468262, "logits/rejected": -0.6300166249275208, "logps/chosen": -4.599138259887695, "logps/rejected": -57.90721130371094, "loss": 1.3689, "rewards/accuracies": 0.0, "rewards/chosen": 0.6064402461051941, "rewards/margins": -2.182814598083496, "rewards/rejected": 2.789254903793335, "step": 7594 }, { "epoch": 1.23, "learning_rate": 5.980607073709304e-07, "logits/chosen": -0.7625024318695068, "logits/rejected": -0.7085363268852234, "logps/chosen": -104.85631561279297, "logps/rejected": -93.00067138671875, "loss": 1.0144, "rewards/accuracies": 0.0, "rewards/chosen": 0.9294395446777344, "rewards/margins": -1.8845481872558594, "rewards/rejected": 2.8139877319335938, "step": 7595 }, { "epoch": 1.23, "learning_rate": 5.979318309205535e-07, "logits/chosen": -0.8487887978553772, "logits/rejected": -0.8424397110939026, "logps/chosen": -61.14912796020508, "logps/rejected": -62.60552215576172, "loss": 0.96, "rewards/accuracies": 0.0, "rewards/chosen": 1.5250271558761597, "rewards/margins": -1.5705684423446655, "rewards/rejected": 3.095595598220825, "step": 7596 }, { "epoch": 1.23, "learning_rate": 5.978029477040136e-07, "logits/chosen": -0.4776555895805359, "logits/rejected": -0.511565089225769, "logps/chosen": -102.42311096191406, "logps/rejected": -140.16583251953125, "loss": 1.3278, "rewards/accuracies": 0.0, "rewards/chosen": 4.254957675933838, "rewards/margins": -2.4427947998046875, "rewards/rejected": 6.697752475738525, "step": 7597 }, { "epoch": 1.23, "learning_rate": 5.976740577302154e-07, "logits/chosen": -0.4772663712501526, "logits/rejected": -0.4490808844566345, "logps/chosen": -53.91688537597656, "logps/rejected": -1.9140609502792358, "loss": 0.8511, "rewards/accuracies": 0.0, "rewards/chosen": 0.5128440856933594, "rewards/margins": -0.06660610437393188, "rewards/rejected": 0.5794501900672913, "step": 7598 }, { "epoch": 1.23, "learning_rate": 5.975451610080642e-07, "logits/chosen": -0.7152318358421326, "logits/rejected": -0.7152318358421326, "logps/chosen": -55.48792266845703, "logps/rejected": -55.48792266845703, "loss": 0.6339, "rewards/accuracies": 0.0, "rewards/chosen": 1.6331993341445923, "rewards/margins": 0.0, "rewards/rejected": 1.6331993341445923, "step": 7599 }, { "epoch": 1.23, "learning_rate": 5.974162575464652e-07, "logits/chosen": -0.9217402338981628, "logits/rejected": -0.8991196751594543, "logps/chosen": -135.41827392578125, "logps/rejected": -140.01327514648438, "loss": 0.222, "rewards/accuracies": 1.0, "rewards/chosen": 7.314724922180176, "rewards/margins": 2.0763978958129883, "rewards/rejected": 5.2383270263671875, "step": 7600 }, { "epoch": 1.23, "learning_rate": 5.972873473543246e-07, "logits/chosen": -0.7013759613037109, "logits/rejected": -0.679469883441925, "logps/chosen": -59.6862678527832, "logps/rejected": -86.84305572509766, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 2.839695453643799, "rewards/margins": 0.8796169757843018, "rewards/rejected": 1.960078477859497, "step": 7601 }, { "epoch": 1.23, "learning_rate": 5.971584304405488e-07, "logits/chosen": -0.7604393362998962, "logits/rejected": -0.728204607963562, "logps/chosen": -50.44597244262695, "logps/rejected": -84.80752563476562, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 2.2132153511047363, "rewards/margins": 0.04730796813964844, "rewards/rejected": 2.165907382965088, "step": 7602 }, { "epoch": 1.23, "learning_rate": 5.970295068140449e-07, "logits/chosen": -0.11115635931491852, "logits/rejected": -0.11115635931491852, "logps/chosen": -41.57863998413086, "logps/rejected": -41.57863998413086, "loss": 0.3841, "rewards/accuracies": 0.0, "rewards/chosen": 0.5631851553916931, "rewards/margins": 0.0, "rewards/rejected": 0.5631851553916931, "step": 7603 }, { "epoch": 1.23, "learning_rate": 5.9690057648372e-07, "logits/chosen": -0.632641613483429, "logits/rejected": -0.771204948425293, "logps/chosen": -27.579341888427734, "logps/rejected": -56.762306213378906, "loss": 1.0998, "rewards/accuracies": 0.0, "rewards/chosen": 1.0660851001739502, "rewards/margins": -1.7073135375976562, "rewards/rejected": 2.7733986377716064, "step": 7604 }, { "epoch": 1.23, "learning_rate": 5.967716394584823e-07, "logits/chosen": -0.758369505405426, "logits/rejected": -0.6227120161056519, "logps/chosen": -98.50464630126953, "logps/rejected": -53.93981170654297, "loss": 1.435, "rewards/accuracies": 1.0, "rewards/chosen": 2.5672194957733154, "rewards/margins": 0.6754401922225952, "rewards/rejected": 1.8917793035507202, "step": 7605 }, { "epoch": 1.23, "learning_rate": 5.966426957472399e-07, "logits/chosen": -0.8545447587966919, "logits/rejected": -0.8406322002410889, "logps/chosen": -62.33384704589844, "logps/rejected": -24.87290382385254, "loss": 1.0417, "rewards/accuracies": 1.0, "rewards/chosen": 1.3524688482284546, "rewards/margins": 1.2408174276351929, "rewards/rejected": 0.11165142059326172, "step": 7606 }, { "epoch": 1.23, "learning_rate": 5.965137453589017e-07, "logits/chosen": -0.726199746131897, "logits/rejected": -0.7029838562011719, "logps/chosen": -128.99949645996094, "logps/rejected": -118.53390502929688, "loss": 2.3026, "rewards/accuracies": 0.0, "rewards/chosen": 1.6236404180526733, "rewards/margins": -0.5773667097091675, "rewards/rejected": 2.201007127761841, "step": 7607 }, { "epoch": 1.23, "learning_rate": 5.963847883023769e-07, "logits/chosen": -0.978925883769989, "logits/rejected": -0.9897683262825012, "logps/chosen": -55.792945861816406, "logps/rejected": -64.03244018554688, "loss": 0.5082, "rewards/accuracies": 1.0, "rewards/chosen": 1.7727813720703125, "rewards/margins": 0.17677080631256104, "rewards/rejected": 1.5960105657577515, "step": 7608 }, { "epoch": 1.24, "learning_rate": 5.962558245865754e-07, "logits/chosen": -0.5460822582244873, "logits/rejected": -0.5711613893508911, "logps/chosen": -75.45069122314453, "logps/rejected": -65.20555114746094, "loss": 0.9951, "rewards/accuracies": 0.0, "rewards/chosen": 0.93819659948349, "rewards/margins": -0.22625654935836792, "rewards/rejected": 1.164453148841858, "step": 7609 }, { "epoch": 1.24, "learning_rate": 5.96126854220407e-07, "logits/chosen": -0.9420987963676453, "logits/rejected": -1.010797381401062, "logps/chosen": -152.06004333496094, "logps/rejected": -134.98004150390625, "loss": 1.5353, "rewards/accuracies": 0.0, "rewards/chosen": 2.8291687965393066, "rewards/margins": -2.6514434814453125, "rewards/rejected": 5.480612277984619, "step": 7610 }, { "epoch": 1.24, "learning_rate": 5.959978772127825e-07, "logits/chosen": -1.167655348777771, "logits/rejected": -1.234030842781067, "logps/chosen": -181.2808837890625, "logps/rejected": -75.64301300048828, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 4.571549892425537, "rewards/margins": 2.8488545417785645, "rewards/rejected": 1.722695231437683, "step": 7611 }, { "epoch": 1.24, "learning_rate": 5.958688935726131e-07, "logits/chosen": -0.7955116033554077, "logits/rejected": -0.5955367684364319, "logps/chosen": -144.42984008789062, "logps/rejected": -76.24153900146484, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": 4.166553020477295, "rewards/margins": 1.3815202713012695, "rewards/rejected": 2.7850327491760254, "step": 7612 }, { "epoch": 1.24, "learning_rate": 5.957399033088103e-07, "logits/chosen": -0.6338120102882385, "logits/rejected": -0.6560118794441223, "logps/chosen": -66.88127136230469, "logps/rejected": -59.81673812866211, "loss": 0.6501, "rewards/accuracies": 0.0, "rewards/chosen": 2.0641791820526123, "rewards/margins": -0.04004645347595215, "rewards/rejected": 2.1042256355285645, "step": 7613 }, { "epoch": 1.24, "learning_rate": 5.956109064302861e-07, "logits/chosen": -0.40594443678855896, "logits/rejected": -0.5201565623283386, "logps/chosen": -105.91873931884766, "logps/rejected": -107.28799438476562, "loss": 1.4721, "rewards/accuracies": 0.0, "rewards/chosen": 1.7236076593399048, "rewards/margins": -1.9422005414962769, "rewards/rejected": 3.6658082008361816, "step": 7614 }, { "epoch": 1.24, "learning_rate": 5.954819029459529e-07, "logits/chosen": -0.8436791300773621, "logits/rejected": -0.7923448085784912, "logps/chosen": -56.137699127197266, "logps/rejected": -49.288818359375, "loss": 0.6707, "rewards/accuracies": 1.0, "rewards/chosen": 1.8939541578292847, "rewards/margins": 0.06075334548950195, "rewards/rejected": 1.8332008123397827, "step": 7615 }, { "epoch": 1.24, "learning_rate": 5.953528928647237e-07, "logits/chosen": -0.6823756098747253, "logits/rejected": -0.6102805733680725, "logps/chosen": -55.42363739013672, "logps/rejected": -20.653121948242188, "loss": 1.5859, "rewards/accuracies": 0.0, "rewards/chosen": 0.5726009607315063, "rewards/margins": -0.09587746858596802, "rewards/rejected": 0.6684784293174744, "step": 7616 }, { "epoch": 1.24, "learning_rate": 5.952238761955117e-07, "logits/chosen": -0.33020931482315063, "logits/rejected": -0.33020931482315063, "logps/chosen": -94.78138732910156, "logps/rejected": -94.78138732910156, "loss": 0.8516, "rewards/accuracies": 0.0, "rewards/chosen": 0.697283923625946, "rewards/margins": 0.0, "rewards/rejected": 0.697283923625946, "step": 7617 }, { "epoch": 1.24, "learning_rate": 5.950948529472309e-07, "logits/chosen": -0.8210084438323975, "logits/rejected": -0.8210084438323975, "logps/chosen": -103.53358459472656, "logps/rejected": -103.53358459472656, "loss": 0.3651, "rewards/accuracies": 0.0, "rewards/chosen": 1.8424514532089233, "rewards/margins": 0.0, "rewards/rejected": 1.8424514532089233, "step": 7618 }, { "epoch": 1.24, "learning_rate": 5.949658231287957e-07, "logits/chosen": -0.3048543334007263, "logits/rejected": -0.2620237469673157, "logps/chosen": -89.86741638183594, "logps/rejected": -73.98697662353516, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 1.4315643310546875, "rewards/margins": 0.8880615234375, "rewards/rejected": 0.5435028076171875, "step": 7619 }, { "epoch": 1.24, "learning_rate": 5.948367867491206e-07, "logits/chosen": -1.0240490436553955, "logits/rejected": -0.8045191168785095, "logps/chosen": -185.5538330078125, "logps/rejected": -44.42586135864258, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 4.036099433898926, "rewards/margins": 2.5190610885620117, "rewards/rejected": 1.517038345336914, "step": 7620 }, { "epoch": 1.24, "learning_rate": 5.94707743817121e-07, "logits/chosen": -1.1298565864562988, "logits/rejected": -1.0430138111114502, "logps/chosen": -92.58122253417969, "logps/rejected": -91.40705871582031, "loss": 0.6311, "rewards/accuracies": 0.0, "rewards/chosen": 5.292370796203613, "rewards/margins": -0.47799062728881836, "rewards/rejected": 5.770361423492432, "step": 7621 }, { "epoch": 1.24, "learning_rate": 5.945786943417123e-07, "logits/chosen": -0.5846198201179504, "logits/rejected": -0.5619192123413086, "logps/chosen": -171.2567138671875, "logps/rejected": -186.25173950195312, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 6.744451999664307, "rewards/margins": 2.111154079437256, "rewards/rejected": 4.633297920227051, "step": 7622 }, { "epoch": 1.24, "learning_rate": 5.944496383318108e-07, "logits/chosen": -0.8555746674537659, "logits/rejected": -0.9769259095191956, "logps/chosen": -69.18762969970703, "logps/rejected": -103.06094360351562, "loss": 2.1323, "rewards/accuracies": 0.0, "rewards/chosen": 1.9601868391036987, "rewards/margins": -3.5430173873901367, "rewards/rejected": 5.503204345703125, "step": 7623 }, { "epoch": 1.24, "learning_rate": 5.94320575796333e-07, "logits/chosen": -1.1356704235076904, "logits/rejected": -1.1277979612350464, "logps/chosen": -133.04898071289062, "logps/rejected": -131.24545288085938, "loss": 1.7633, "rewards/accuracies": 0.0, "rewards/chosen": 4.899366855621338, "rewards/margins": -2.484440803527832, "rewards/rejected": 7.38380765914917, "step": 7624 }, { "epoch": 1.24, "learning_rate": 5.941915067441958e-07, "logits/chosen": -0.7459421157836914, "logits/rejected": -0.6741539835929871, "logps/chosen": -89.30484771728516, "logps/rejected": -60.286651611328125, "loss": 0.3825, "rewards/accuracies": 1.0, "rewards/chosen": 2.1144280433654785, "rewards/margins": 1.2027504444122314, "rewards/rejected": 0.9116775393486023, "step": 7625 }, { "epoch": 1.24, "learning_rate": 5.940624311843168e-07, "logits/chosen": -1.0291024446487427, "logits/rejected": -1.0040801763534546, "logps/chosen": -144.56723022460938, "logps/rejected": -142.475341796875, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 5.71462869644165, "rewards/margins": 1.163116455078125, "rewards/rejected": 4.551512241363525, "step": 7626 }, { "epoch": 1.24, "learning_rate": 5.93933349125614e-07, "logits/chosen": -0.9479265809059143, "logits/rejected": -0.9340254664421082, "logps/chosen": -67.04761505126953, "logps/rejected": -84.02840423583984, "loss": 0.7555, "rewards/accuracies": 0.0, "rewards/chosen": 2.874396562576294, "rewards/margins": -0.4081611633300781, "rewards/rejected": 3.282557725906372, "step": 7627 }, { "epoch": 1.24, "learning_rate": 5.938042605770052e-07, "logits/chosen": -0.7275065183639526, "logits/rejected": -0.6717727780342102, "logps/chosen": -63.34951400756836, "logps/rejected": -29.153366088867188, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 0.8274601101875305, "rewards/margins": 0.30986249446868896, "rewards/rejected": 0.5175976157188416, "step": 7628 }, { "epoch": 1.24, "learning_rate": 5.936751655474099e-07, "logits/chosen": -0.6298758387565613, "logits/rejected": -0.5810234546661377, "logps/chosen": -108.32783508300781, "logps/rejected": -40.22971725463867, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 2.4026412963867188, "rewards/margins": 0.6797405481338501, "rewards/rejected": 1.7229007482528687, "step": 7629 }, { "epoch": 1.24, "learning_rate": 5.935460640457469e-07, "logits/chosen": -0.45880892872810364, "logits/rejected": -0.44377750158309937, "logps/chosen": -142.90802001953125, "logps/rejected": -145.24807739257812, "loss": 2.2238, "rewards/accuracies": 0.0, "rewards/chosen": 1.06256103515625, "rewards/margins": -2.8853530883789062, "rewards/rejected": 3.9479141235351562, "step": 7630 }, { "epoch": 1.24, "learning_rate": 5.93416956080936e-07, "logits/chosen": -0.8566870093345642, "logits/rejected": -0.8471984267234802, "logps/chosen": -55.19348907470703, "logps/rejected": -59.16185760498047, "loss": 0.5761, "rewards/accuracies": 0.0, "rewards/chosen": 2.4923553466796875, "rewards/margins": -0.4144432544708252, "rewards/rejected": 2.9067986011505127, "step": 7631 }, { "epoch": 1.24, "learning_rate": 5.932878416618974e-07, "logits/chosen": -0.6435758471488953, "logits/rejected": -0.5594884753227234, "logps/chosen": -60.079566955566406, "logps/rejected": -18.112459182739258, "loss": 0.1891, "rewards/accuracies": 1.0, "rewards/chosen": 1.2183083295822144, "rewards/margins": 1.0510001182556152, "rewards/rejected": 0.1673082411289215, "step": 7632 }, { "epoch": 1.24, "learning_rate": 5.931587207975516e-07, "logits/chosen": -0.6927649974822998, "logits/rejected": -0.6795395612716675, "logps/chosen": -62.9190559387207, "logps/rejected": -102.12300109863281, "loss": 0.9082, "rewards/accuracies": 1.0, "rewards/chosen": 1.9334102869033813, "rewards/margins": 0.7431232929229736, "rewards/rejected": 1.1902869939804077, "step": 7633 }, { "epoch": 1.24, "learning_rate": 5.930295934968197e-07, "logits/chosen": -1.2242263555526733, "logits/rejected": -1.1955348253250122, "logps/chosen": -209.65750122070312, "logps/rejected": -90.23922729492188, "loss": 0.0584, "rewards/accuracies": 1.0, "rewards/chosen": 5.603152751922607, "rewards/margins": 3.340986967086792, "rewards/rejected": 2.2621657848358154, "step": 7634 }, { "epoch": 1.24, "learning_rate": 5.929004597686232e-07, "logits/chosen": -0.7678748369216919, "logits/rejected": -0.8048648238182068, "logps/chosen": -89.31004333496094, "logps/rejected": -96.89802551269531, "loss": 0.4733, "rewards/accuracies": 0.0, "rewards/chosen": 1.477412462234497, "rewards/margins": -0.3451507091522217, "rewards/rejected": 1.8225631713867188, "step": 7635 }, { "epoch": 1.24, "learning_rate": 5.927713196218838e-07, "logits/chosen": -0.4946701228618622, "logits/rejected": -0.48390552401542664, "logps/chosen": -22.19825553894043, "logps/rejected": -3.5197925567626953, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 0.40751227736473083, "rewards/margins": 0.013820111751556396, "rewards/rejected": 0.39369216561317444, "step": 7636 }, { "epoch": 1.24, "learning_rate": 5.926421730655241e-07, "logits/chosen": -0.39773833751678467, "logits/rejected": -0.4329596757888794, "logps/chosen": -45.76313781738281, "logps/rejected": -54.970706939697266, "loss": 1.2501, "rewards/accuracies": 1.0, "rewards/chosen": 0.7890052795410156, "rewards/margins": 0.20075875520706177, "rewards/rejected": 0.5882465243339539, "step": 7637 }, { "epoch": 1.24, "learning_rate": 5.92513020108467e-07, "logits/chosen": -0.5616706609725952, "logits/rejected": -0.5616706609725952, "logps/chosen": -76.62012481689453, "logps/rejected": -76.62012481689453, "loss": 0.9099, "rewards/accuracies": 0.0, "rewards/chosen": 1.9731765985488892, "rewards/margins": 0.0, "rewards/rejected": 1.9731765985488892, "step": 7638 }, { "epoch": 1.24, "learning_rate": 5.923838607596353e-07, "logits/chosen": -0.4725441634654999, "logits/rejected": -0.4061528742313385, "logps/chosen": -86.58557891845703, "logps/rejected": -36.26481628417969, "loss": 0.1472, "rewards/accuracies": 1.0, "rewards/chosen": 2.8483169078826904, "rewards/margins": 1.340988039970398, "rewards/rejected": 1.5073288679122925, "step": 7639 }, { "epoch": 1.24, "learning_rate": 5.922546950279532e-07, "logits/chosen": -0.7514411211013794, "logits/rejected": -0.7514411211013794, "logps/chosen": -0.7815660834312439, "logps/rejected": -0.7815660834312439, "loss": 1.0745, "rewards/accuracies": 0.0, "rewards/chosen": 0.3072335422039032, "rewards/margins": 0.0, "rewards/rejected": 0.3072335422039032, "step": 7640 }, { "epoch": 1.24, "learning_rate": 5.921255229223443e-07, "logits/chosen": -0.7636670470237732, "logits/rejected": -0.7236074805259705, "logps/chosen": -172.51275634765625, "logps/rejected": -167.97293090820312, "loss": 1.3068, "rewards/accuracies": 0.0, "rewards/chosen": 5.525084018707275, "rewards/margins": -1.9864592552185059, "rewards/rejected": 7.511543273925781, "step": 7641 }, { "epoch": 1.24, "learning_rate": 5.919963444517338e-07, "logits/chosen": -0.6277382373809814, "logits/rejected": -0.543997585773468, "logps/chosen": -78.72431945800781, "logps/rejected": -18.49024200439453, "loss": 0.2261, "rewards/accuracies": 1.0, "rewards/chosen": 3.1984245777130127, "rewards/margins": 2.8425960540771484, "rewards/rejected": 0.35582849383354187, "step": 7642 }, { "epoch": 1.24, "learning_rate": 5.91867159625046e-07, "logits/chosen": -0.5335271954536438, "logits/rejected": -0.4584500193595886, "logps/chosen": -78.47191619873047, "logps/rejected": -66.62679290771484, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": 1.595528483390808, "rewards/margins": 0.8805252909660339, "rewards/rejected": 0.7150031924247742, "step": 7643 }, { "epoch": 1.24, "learning_rate": 5.91737968451207e-07, "logits/chosen": -0.46288949251174927, "logits/rejected": -0.4640989601612091, "logps/chosen": -57.28343200683594, "logps/rejected": -54.478675842285156, "loss": 1.0357, "rewards/accuracies": 0.0, "rewards/chosen": 0.8956283926963806, "rewards/margins": -0.2942337393760681, "rewards/rejected": 1.1898621320724487, "step": 7644 }, { "epoch": 1.24, "learning_rate": 5.916087709391423e-07, "logits/chosen": -0.3346249461174011, "logits/rejected": -0.26942408084869385, "logps/chosen": -65.08979797363281, "logps/rejected": -75.91017150878906, "loss": 0.59, "rewards/accuracies": 0.0, "rewards/chosen": 1.1564239263534546, "rewards/margins": -0.47903144359588623, "rewards/rejected": 1.6354553699493408, "step": 7645 }, { "epoch": 1.24, "learning_rate": 5.914795670977784e-07, "logits/chosen": -0.5499446392059326, "logits/rejected": -0.35318174958229065, "logps/chosen": -86.54534149169922, "logps/rejected": -65.59266662597656, "loss": 0.8314, "rewards/accuracies": 1.0, "rewards/chosen": 6.500546932220459, "rewards/margins": 3.6686391830444336, "rewards/rejected": 2.8319077491760254, "step": 7646 }, { "epoch": 1.24, "learning_rate": 5.913503569360421e-07, "logits/chosen": -0.6173743605613708, "logits/rejected": -0.5933346748352051, "logps/chosen": -19.18848419189453, "logps/rejected": -8.72841739654541, "loss": 0.4016, "rewards/accuracies": 1.0, "rewards/chosen": 0.7802526354789734, "rewards/margins": 0.27644383907318115, "rewards/rejected": 0.5038087964057922, "step": 7647 }, { "epoch": 1.24, "learning_rate": 5.912211404628603e-07, "logits/chosen": -1.0317444801330566, "logits/rejected": -0.8196530938148499, "logps/chosen": -71.67010498046875, "logps/rejected": -91.73470306396484, "loss": 0.2185, "rewards/accuracies": 1.0, "rewards/chosen": 4.677391052246094, "rewards/margins": 2.487093210220337, "rewards/rejected": 2.190297842025757, "step": 7648 }, { "epoch": 1.24, "learning_rate": 5.910919176871609e-07, "logits/chosen": -0.6037872433662415, "logits/rejected": -0.5672330856323242, "logps/chosen": -104.08588409423828, "logps/rejected": -53.730464935302734, "loss": 0.1529, "rewards/accuracies": 1.0, "rewards/chosen": 2.4670968055725098, "rewards/margins": 1.0508885383605957, "rewards/rejected": 1.416208267211914, "step": 7649 }, { "epoch": 1.24, "learning_rate": 5.90962688617872e-07, "logits/chosen": -0.8865141868591309, "logits/rejected": -0.8355679512023926, "logps/chosen": -141.2467041015625, "logps/rejected": -138.42831420898438, "loss": 1.5681, "rewards/accuracies": 1.0, "rewards/chosen": 4.143492221832275, "rewards/margins": 0.5331192016601562, "rewards/rejected": 3.610373020172119, "step": 7650 }, { "epoch": 1.24, "learning_rate": 5.90833453263922e-07, "logits/chosen": -0.728644609451294, "logits/rejected": -0.6892861723899841, "logps/chosen": -171.77297973632812, "logps/rejected": -68.91075897216797, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 8.218774795532227, "rewards/margins": 6.144868850708008, "rewards/rejected": 2.0739059448242188, "step": 7651 }, { "epoch": 1.24, "learning_rate": 5.907042116342398e-07, "logits/chosen": -0.6107735633850098, "logits/rejected": -0.6384187340736389, "logps/chosen": -36.68886947631836, "logps/rejected": -75.96184539794922, "loss": 0.7116, "rewards/accuracies": 0.0, "rewards/chosen": 1.8983200788497925, "rewards/margins": -0.5145179033279419, "rewards/rejected": 2.4128379821777344, "step": 7652 }, { "epoch": 1.24, "learning_rate": 5.905749637377548e-07, "logits/chosen": -1.417103886604309, "logits/rejected": -1.3430848121643066, "logps/chosen": -67.6982650756836, "logps/rejected": -93.0289077758789, "loss": 1.1395, "rewards/accuracies": 0.0, "rewards/chosen": 3.450087785720825, "rewards/margins": -2.1133148670196533, "rewards/rejected": 5.5634026527404785, "step": 7653 }, { "epoch": 1.24, "learning_rate": 5.904457095833969e-07, "logits/chosen": -0.7852472066879272, "logits/rejected": -0.6184786558151245, "logps/chosen": -129.1364288330078, "logps/rejected": -40.960811614990234, "loss": 0.0534, "rewards/accuracies": 1.0, "rewards/chosen": 5.252223491668701, "rewards/margins": 3.030240058898926, "rewards/rejected": 2.2219834327697754, "step": 7654 }, { "epoch": 1.24, "learning_rate": 5.903164491800963e-07, "logits/chosen": -0.5260902047157288, "logits/rejected": -0.5090190172195435, "logps/chosen": -29.583677291870117, "logps/rejected": -22.527008056640625, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 0.6792944073677063, "rewards/margins": 0.00219118595123291, "rewards/rejected": 0.6771032214164734, "step": 7655 }, { "epoch": 1.24, "learning_rate": 5.901871825367835e-07, "logits/chosen": -1.0492284297943115, "logits/rejected": -1.033691644668579, "logps/chosen": -119.80474090576172, "logps/rejected": -114.22650146484375, "loss": 2.0753, "rewards/accuracies": 0.0, "rewards/chosen": 5.55144739151001, "rewards/margins": -1.5300254821777344, "rewards/rejected": 7.081472873687744, "step": 7656 }, { "epoch": 1.24, "learning_rate": 5.900579096623899e-07, "logits/chosen": -0.882636308670044, "logits/rejected": -0.8670461773872375, "logps/chosen": -102.31999206542969, "logps/rejected": -81.88046264648438, "loss": 0.5481, "rewards/accuracies": 1.0, "rewards/chosen": 5.453015327453613, "rewards/margins": 0.111114501953125, "rewards/rejected": 5.341900825500488, "step": 7657 }, { "epoch": 1.24, "learning_rate": 5.899286305658468e-07, "logits/chosen": -0.9128690361976624, "logits/rejected": -1.0436736345291138, "logps/chosen": -123.1924819946289, "logps/rejected": -124.96173095703125, "loss": 1.6405, "rewards/accuracies": 0.0, "rewards/chosen": 2.2085137367248535, "rewards/margins": -2.201030731201172, "rewards/rejected": 4.409544467926025, "step": 7658 }, { "epoch": 1.24, "learning_rate": 5.897993452560862e-07, "logits/chosen": -1.1292414665222168, "logits/rejected": -1.0956039428710938, "logps/chosen": -79.75265502929688, "logps/rejected": -273.34686279296875, "loss": 1.4758, "rewards/accuracies": 0.0, "rewards/chosen": 0.6279358267784119, "rewards/margins": -2.1341888904571533, "rewards/rejected": 2.76212477684021, "step": 7659 }, { "epoch": 1.24, "learning_rate": 5.896700537420406e-07, "logits/chosen": -0.646534264087677, "logits/rejected": -0.5524419546127319, "logps/chosen": -79.16997528076172, "logps/rejected": -9.69714069366455, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 1.218453288078308, "rewards/margins": 0.3674052953720093, "rewards/rejected": 0.8510479927062988, "step": 7660 }, { "epoch": 1.24, "learning_rate": 5.895407560326427e-07, "logits/chosen": -0.4353208839893341, "logits/rejected": -0.20499637722969055, "logps/chosen": -90.16360473632812, "logps/rejected": -44.18641662597656, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": 6.0121612548828125, "rewards/margins": 4.8827290534973145, "rewards/rejected": 1.1294323205947876, "step": 7661 }, { "epoch": 1.24, "learning_rate": 5.894114521368258e-07, "logits/chosen": -0.636947751045227, "logits/rejected": -0.6642543077468872, "logps/chosen": -37.35478210449219, "logps/rejected": -58.328182220458984, "loss": 3.3319, "rewards/accuracies": 0.0, "rewards/chosen": 0.9158027768135071, "rewards/margins": -5.10058069229126, "rewards/rejected": 6.016383647918701, "step": 7662 }, { "epoch": 1.24, "learning_rate": 5.892821420635237e-07, "logits/chosen": -0.33882513642311096, "logits/rejected": -0.38831090927124023, "logps/chosen": -68.94003295898438, "logps/rejected": -51.33709716796875, "loss": 2.4071, "rewards/accuracies": 0.0, "rewards/chosen": 1.4722763299942017, "rewards/margins": -0.6988204717636108, "rewards/rejected": 2.1710968017578125, "step": 7663 }, { "epoch": 1.24, "learning_rate": 5.891528258216702e-07, "logits/chosen": -0.2317897081375122, "logits/rejected": -0.26416322588920593, "logps/chosen": -85.35195922851562, "logps/rejected": -65.54826354980469, "loss": 1.9003, "rewards/accuracies": 0.0, "rewards/chosen": 0.5758407711982727, "rewards/margins": -2.543956756591797, "rewards/rejected": 3.119797468185425, "step": 7664 }, { "epoch": 1.24, "learning_rate": 5.890235034202001e-07, "logits/chosen": -0.5412725210189819, "logits/rejected": -0.5619663000106812, "logps/chosen": -76.7973403930664, "logps/rejected": -84.29507446289062, "loss": 0.4281, "rewards/accuracies": 0.0, "rewards/chosen": 0.2587638795375824, "rewards/margins": -0.17356720566749573, "rewards/rejected": 0.4323310852050781, "step": 7665 }, { "epoch": 1.24, "learning_rate": 5.888941748680484e-07, "logits/chosen": -0.6364015340805054, "logits/rejected": -0.619567334651947, "logps/chosen": -62.308597564697266, "logps/rejected": -117.87690734863281, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 0.9079174399375916, "rewards/margins": 0.8031025528907776, "rewards/rejected": 0.10481490939855576, "step": 7666 }, { "epoch": 1.24, "learning_rate": 5.887648401741502e-07, "logits/chosen": -0.8078649640083313, "logits/rejected": -0.6606227159500122, "logps/chosen": -168.45553588867188, "logps/rejected": -95.78338623046875, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 5.534242153167725, "rewards/margins": 0.7478909492492676, "rewards/rejected": 4.786351203918457, "step": 7667 }, { "epoch": 1.24, "learning_rate": 5.886354993474414e-07, "logits/chosen": -0.39677077531814575, "logits/rejected": -0.39677077531814575, "logps/chosen": -0.7766616940498352, "logps/rejected": -0.7766616940498352, "loss": 1.7545, "rewards/accuracies": 0.0, "rewards/chosen": 0.07059495151042938, "rewards/margins": 0.0, "rewards/rejected": 0.07059495151042938, "step": 7668 }, { "epoch": 1.24, "learning_rate": 5.885061523968583e-07, "logits/chosen": -0.9713528156280518, "logits/rejected": -0.8520964980125427, "logps/chosen": -128.10946655273438, "logps/rejected": -53.63690185546875, "loss": 1.0525, "rewards/accuracies": 0.0, "rewards/chosen": 0.8580169677734375, "rewards/margins": -1.1940734386444092, "rewards/rejected": 2.0520904064178467, "step": 7669 }, { "epoch": 1.24, "learning_rate": 5.883767993313375e-07, "logits/chosen": -0.9253756403923035, "logits/rejected": -0.9292196035385132, "logps/chosen": -109.3459243774414, "logps/rejected": -90.99564361572266, "loss": 0.9852, "rewards/accuracies": 1.0, "rewards/chosen": 2.461838483810425, "rewards/margins": 0.43929123878479004, "rewards/rejected": 2.0225472450256348, "step": 7670 }, { "epoch": 1.25, "learning_rate": 5.882474401598162e-07, "logits/chosen": -0.8583391904830933, "logits/rejected": -0.8433137536048889, "logps/chosen": -77.17811584472656, "logps/rejected": -63.28730010986328, "loss": 0.6003, "rewards/accuracies": 1.0, "rewards/chosen": 1.4395416975021362, "rewards/margins": 0.14743733406066895, "rewards/rejected": 1.2921043634414673, "step": 7671 }, { "epoch": 1.25, "learning_rate": 5.881180748912318e-07, "logits/chosen": -0.8423454165458679, "logits/rejected": -0.7621663212776184, "logps/chosen": -114.11566925048828, "logps/rejected": -34.43844985961914, "loss": 0.1601, "rewards/accuracies": 1.0, "rewards/chosen": 4.256350040435791, "rewards/margins": 3.653135299682617, "rewards/rejected": 0.603214681148529, "step": 7672 }, { "epoch": 1.25, "learning_rate": 5.87988703534522e-07, "logits/chosen": -0.39766350388526917, "logits/rejected": -0.29312095046043396, "logps/chosen": -74.0773696899414, "logps/rejected": -78.83665466308594, "loss": 0.5731, "rewards/accuracies": 0.0, "rewards/chosen": 1.3732330799102783, "rewards/margins": -0.7026932239532471, "rewards/rejected": 2.0759263038635254, "step": 7673 }, { "epoch": 1.25, "learning_rate": 5.878593260986255e-07, "logits/chosen": -0.7223491072654724, "logits/rejected": -0.7614459991455078, "logps/chosen": -222.65145874023438, "logps/rejected": -106.87542724609375, "loss": 0.025, "rewards/accuracies": 1.0, "rewards/chosen": 4.1247406005859375, "rewards/margins": 3.199610948562622, "rewards/rejected": 0.9251297116279602, "step": 7674 }, { "epoch": 1.25, "learning_rate": 5.877299425924809e-07, "logits/chosen": -0.6459119319915771, "logits/rejected": -0.44080501794815063, "logps/chosen": -101.1007080078125, "logps/rejected": -52.3531494140625, "loss": 0.1037, "rewards/accuracies": 1.0, "rewards/chosen": 3.917083740234375, "rewards/margins": 2.245877742767334, "rewards/rejected": 1.6712058782577515, "step": 7675 }, { "epoch": 1.25, "learning_rate": 5.876005530250273e-07, "logits/chosen": -0.43640223145484924, "logits/rejected": -0.3298034965991974, "logps/chosen": -82.87860870361328, "logps/rejected": -9.353540420532227, "loss": 0.6114, "rewards/accuracies": 1.0, "rewards/chosen": 2.6099839210510254, "rewards/margins": 2.047478437423706, "rewards/rejected": 0.5625055432319641, "step": 7676 }, { "epoch": 1.25, "learning_rate": 5.874711574052044e-07, "logits/chosen": -0.39227157831192017, "logits/rejected": -0.298552930355072, "logps/chosen": -82.24288940429688, "logps/rejected": -22.766345977783203, "loss": 0.142, "rewards/accuracies": 1.0, "rewards/chosen": 5.276028633117676, "rewards/margins": 4.083789348602295, "rewards/rejected": 1.1922394037246704, "step": 7677 }, { "epoch": 1.25, "learning_rate": 5.873417557419522e-07, "logits/chosen": -0.46049964427948, "logits/rejected": -0.4631519913673401, "logps/chosen": -93.06676483154297, "logps/rejected": -78.41893768310547, "loss": 1.5042, "rewards/accuracies": 0.0, "rewards/chosen": 0.9476234316825867, "rewards/margins": -1.6186699867248535, "rewards/rejected": 2.566293478012085, "step": 7678 }, { "epoch": 1.25, "learning_rate": 5.872123480442112e-07, "logits/chosen": -0.4992254078388214, "logits/rejected": -0.5555937886238098, "logps/chosen": -137.06536865234375, "logps/rejected": -142.88302612304688, "loss": 1.7047, "rewards/accuracies": 0.0, "rewards/chosen": 1.018524169921875, "rewards/margins": -3.337918281555176, "rewards/rejected": 4.356442451477051, "step": 7679 }, { "epoch": 1.25, "learning_rate": 5.87082934320922e-07, "logits/chosen": -1.0984790325164795, "logits/rejected": -1.0538400411605835, "logps/chosen": -105.38346862792969, "logps/rejected": -59.446014404296875, "loss": 1.1776, "rewards/accuracies": 0.0, "rewards/chosen": 0.7713714838027954, "rewards/margins": -2.027609348297119, "rewards/rejected": 2.798980712890625, "step": 7680 }, { "epoch": 1.25, "learning_rate": 5.869535145810262e-07, "logits/chosen": -0.4805637300014496, "logits/rejected": -0.4564763009548187, "logps/chosen": -106.81010437011719, "logps/rejected": -192.31204223632812, "loss": 1.9284, "rewards/accuracies": 1.0, "rewards/chosen": 1.400593638420105, "rewards/margins": 1.1499557495117188, "rewards/rejected": 0.25063782930374146, "step": 7681 }, { "epoch": 1.25, "learning_rate": 5.868240888334652e-07, "logits/chosen": -0.8955696821212769, "logits/rejected": -0.893667459487915, "logps/chosen": -90.88369750976562, "logps/rejected": -58.965187072753906, "loss": 0.2913, "rewards/accuracies": 1.0, "rewards/chosen": 2.8681740760803223, "rewards/margins": 1.0177292823791504, "rewards/rejected": 1.8504447937011719, "step": 7682 }, { "epoch": 1.25, "learning_rate": 5.866946570871811e-07, "logits/chosen": -0.5991792678833008, "logits/rejected": -0.5285913944244385, "logps/chosen": -59.62264633178711, "logps/rejected": -77.60015106201172, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 1.864419937133789, "rewards/margins": -1.1025593280792236, "rewards/rejected": 2.9669792652130127, "step": 7683 }, { "epoch": 1.25, "learning_rate": 5.865652193511167e-07, "logits/chosen": -0.8398142457008362, "logits/rejected": -0.8594187498092651, "logps/chosen": -96.15996551513672, "logps/rejected": -147.820556640625, "loss": 1.0423, "rewards/accuracies": 0.0, "rewards/chosen": 4.93908166885376, "rewards/margins": -1.5496940612792969, "rewards/rejected": 6.488775730133057, "step": 7684 }, { "epoch": 1.25, "learning_rate": 5.864357756342146e-07, "logits/chosen": -0.21303272247314453, "logits/rejected": -0.238957017660141, "logps/chosen": -79.19725799560547, "logps/rejected": -83.17239379882812, "loss": 2.41, "rewards/accuracies": 1.0, "rewards/chosen": 0.4528648555278778, "rewards/margins": 0.5706993341445923, "rewards/rejected": -0.11783447116613388, "step": 7685 }, { "epoch": 1.25, "learning_rate": 5.863063259454184e-07, "logits/chosen": -0.6077402234077454, "logits/rejected": -0.229579359292984, "logps/chosen": -120.37200164794922, "logps/rejected": -41.94852828979492, "loss": 0.0782, "rewards/accuracies": 1.0, "rewards/chosen": 4.16533899307251, "rewards/margins": 3.693861484527588, "rewards/rejected": 0.4714775085449219, "step": 7686 }, { "epoch": 1.25, "learning_rate": 5.861768702936716e-07, "logits/chosen": -0.4969199299812317, "logits/rejected": -0.5105499625205994, "logps/chosen": -18.913158416748047, "logps/rejected": -6.534717559814453, "loss": 0.457, "rewards/accuracies": 0.0, "rewards/chosen": -0.018542099744081497, "rewards/margins": -0.2512829899787903, "rewards/rejected": 0.2327408790588379, "step": 7687 }, { "epoch": 1.25, "learning_rate": 5.860474086879184e-07, "logits/chosen": -0.8506083488464355, "logits/rejected": -0.9590709209442139, "logps/chosen": -251.87481689453125, "logps/rejected": -115.05509185791016, "loss": 0.8865, "rewards/accuracies": 0.0, "rewards/chosen": 4.039337158203125, "rewards/margins": -1.4967613220214844, "rewards/rejected": 5.536098480224609, "step": 7688 }, { "epoch": 1.25, "learning_rate": 5.859179411371037e-07, "logits/chosen": -0.5937913060188293, "logits/rejected": -0.5563660264015198, "logps/chosen": -60.68877029418945, "logps/rejected": -18.973209381103516, "loss": 0.2905, "rewards/accuracies": 1.0, "rewards/chosen": 1.9222644567489624, "rewards/margins": 0.6876994371414185, "rewards/rejected": 1.234565019607544, "step": 7689 }, { "epoch": 1.25, "learning_rate": 5.85788467650172e-07, "logits/chosen": -0.6021647453308105, "logits/rejected": -0.5656627416610718, "logps/chosen": -86.93323516845703, "logps/rejected": -40.02294158935547, "loss": 0.4558, "rewards/accuracies": 1.0, "rewards/chosen": 1.3724991083145142, "rewards/margins": 0.050328850746154785, "rewards/rejected": 1.3221702575683594, "step": 7690 }, { "epoch": 1.25, "learning_rate": 5.856589882360691e-07, "logits/chosen": -0.8215317726135254, "logits/rejected": -0.8069431781768799, "logps/chosen": -153.2012939453125, "logps/rejected": -19.445646286010742, "loss": 0.4218, "rewards/accuracies": 1.0, "rewards/chosen": 4.004216194152832, "rewards/margins": 3.3079991340637207, "rewards/rejected": 0.6962171792984009, "step": 7691 }, { "epoch": 1.25, "learning_rate": 5.855295029037404e-07, "logits/chosen": -0.7594594359397888, "logits/rejected": -0.7041401863098145, "logps/chosen": -82.54121398925781, "logps/rejected": -50.72509765625, "loss": 0.9683, "rewards/accuracies": 1.0, "rewards/chosen": 1.48486328125, "rewards/margins": 0.49355393648147583, "rewards/rejected": 0.9913093447685242, "step": 7692 }, { "epoch": 1.25, "learning_rate": 5.854000116621325e-07, "logits/chosen": -0.35623255372047424, "logits/rejected": -0.34944093227386475, "logps/chosen": -54.23625946044922, "logps/rejected": -67.9638442993164, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 1.2767380475997925, "rewards/margins": 0.9430115222930908, "rewards/rejected": 0.3337264955043793, "step": 7693 }, { "epoch": 1.25, "learning_rate": 5.852705145201918e-07, "logits/chosen": -0.6326189041137695, "logits/rejected": -0.6326189041137695, "logps/chosen": -67.75221252441406, "logps/rejected": -67.75221252441406, "loss": 0.4033, "rewards/accuracies": 0.0, "rewards/chosen": 3.148144483566284, "rewards/margins": 0.0, "rewards/rejected": 3.148144483566284, "step": 7694 }, { "epoch": 1.25, "learning_rate": 5.851410114868655e-07, "logits/chosen": -0.8802977204322815, "logits/rejected": -0.82088702917099, "logps/chosen": -112.44222259521484, "logps/rejected": -53.43511962890625, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 1.9337990283966064, "rewards/margins": 0.511949896812439, "rewards/rejected": 1.4218491315841675, "step": 7695 }, { "epoch": 1.25, "learning_rate": 5.850115025711008e-07, "logits/chosen": -0.7153626084327698, "logits/rejected": -0.7019427418708801, "logps/chosen": -82.64991760253906, "logps/rejected": -151.22398376464844, "loss": 0.6656, "rewards/accuracies": 1.0, "rewards/chosen": 2.2210328578948975, "rewards/margins": 0.02305150032043457, "rewards/rejected": 2.197981357574463, "step": 7696 }, { "epoch": 1.25, "learning_rate": 5.848819877818457e-07, "logits/chosen": -0.5491303205490112, "logits/rejected": -0.5651127099990845, "logps/chosen": -21.599979400634766, "logps/rejected": -43.26130294799805, "loss": 0.7487, "rewards/accuracies": 1.0, "rewards/chosen": 1.7195297479629517, "rewards/margins": 0.1425243616104126, "rewards/rejected": 1.577005386352539, "step": 7697 }, { "epoch": 1.25, "learning_rate": 5.847524671280483e-07, "logits/chosen": -0.5570030808448792, "logits/rejected": -0.534479022026062, "logps/chosen": -60.08883285522461, "logps/rejected": -55.30187225341797, "loss": 1.9851, "rewards/accuracies": 0.0, "rewards/chosen": 0.7720401883125305, "rewards/margins": -0.3973621726036072, "rewards/rejected": 1.1694023609161377, "step": 7698 }, { "epoch": 1.25, "learning_rate": 5.846229406186575e-07, "logits/chosen": -0.9201492667198181, "logits/rejected": -0.8733937740325928, "logps/chosen": -103.8530502319336, "logps/rejected": -61.86555099487305, "loss": 0.6492, "rewards/accuracies": 0.0, "rewards/chosen": 1.1816574335098267, "rewards/margins": -0.8035571575164795, "rewards/rejected": 1.9852145910263062, "step": 7699 }, { "epoch": 1.25, "learning_rate": 5.844934082626221e-07, "logits/chosen": -0.47849881649017334, "logits/rejected": -0.4915381968021393, "logps/chosen": -101.02001953125, "logps/rejected": -89.57868957519531, "loss": 1.0913, "rewards/accuracies": 0.0, "rewards/chosen": 1.1064919233322144, "rewards/margins": -0.18388664722442627, "rewards/rejected": 1.2903785705566406, "step": 7700 }, { "epoch": 1.25, "learning_rate": 5.843638700688917e-07, "logits/chosen": -0.6771617531776428, "logits/rejected": -0.6510220766067505, "logps/chosen": -91.67323303222656, "logps/rejected": -95.74301147460938, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 1.179517388343811, "rewards/margins": -0.2683464288711548, "rewards/rejected": 1.4478638172149658, "step": 7701 }, { "epoch": 1.25, "learning_rate": 5.842343260464163e-07, "logits/chosen": -0.3859768807888031, "logits/rejected": -0.3397095501422882, "logps/chosen": -61.004981994628906, "logps/rejected": -83.97085571289062, "loss": 0.8589, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280113220214844, "rewards/margins": 0.604632556438446, "rewards/rejected": 0.42337876558303833, "step": 7702 }, { "epoch": 1.25, "learning_rate": 5.841047762041459e-07, "logits/chosen": -0.8421485424041748, "logits/rejected": -0.8195810317993164, "logps/chosen": -77.19447326660156, "logps/rejected": -11.617432594299316, "loss": 0.7108, "rewards/accuracies": 0.0, "rewards/chosen": 0.8952560424804688, "rewards/margins": -0.19103610515594482, "rewards/rejected": 1.0862921476364136, "step": 7703 }, { "epoch": 1.25, "learning_rate": 5.839752205510312e-07, "logits/chosen": -0.5203909277915955, "logits/rejected": -0.6262462139129639, "logps/chosen": -107.43506622314453, "logps/rejected": -104.08348083496094, "loss": 0.8475, "rewards/accuracies": 0.0, "rewards/chosen": 0.9183281064033508, "rewards/margins": -1.4057579040527344, "rewards/rejected": 2.3240859508514404, "step": 7704 }, { "epoch": 1.25, "learning_rate": 5.838456590960233e-07, "logits/chosen": -0.5482780933380127, "logits/rejected": -0.5482780933380127, "logps/chosen": -36.85440444946289, "logps/rejected": -36.85440444946289, "loss": 0.3542, "rewards/accuracies": 0.0, "rewards/chosen": 0.1760120391845703, "rewards/margins": 0.0, "rewards/rejected": 0.1760120391845703, "step": 7705 }, { "epoch": 1.25, "learning_rate": 5.837160918480739e-07, "logits/chosen": -0.6706526875495911, "logits/rejected": -0.6994130611419678, "logps/chosen": -192.20684814453125, "logps/rejected": -100.12601470947266, "loss": 1.5941, "rewards/accuracies": 0.0, "rewards/chosen": 3.7659547328948975, "rewards/margins": -0.18662333488464355, "rewards/rejected": 3.952578067779541, "step": 7706 }, { "epoch": 1.25, "learning_rate": 5.835865188161345e-07, "logits/chosen": -0.8877851963043213, "logits/rejected": -1.2012765407562256, "logps/chosen": -83.51701354980469, "logps/rejected": -35.364749908447266, "loss": 0.99, "rewards/accuracies": 1.0, "rewards/chosen": 1.38524329662323, "rewards/margins": 0.999606728553772, "rewards/rejected": 0.3856365382671356, "step": 7707 }, { "epoch": 1.25, "learning_rate": 5.834569400091577e-07, "logits/chosen": -0.49229368567466736, "logits/rejected": -0.38054123520851135, "logps/chosen": -59.42933654785156, "logps/rejected": -68.67762756347656, "loss": 0.0897, "rewards/accuracies": 1.0, "rewards/chosen": 3.044818162918091, "rewards/margins": 1.8643379211425781, "rewards/rejected": 1.1804802417755127, "step": 7708 }, { "epoch": 1.25, "learning_rate": 5.833273554360958e-07, "logits/chosen": -0.9206047654151917, "logits/rejected": -0.898859441280365, "logps/chosen": -252.31845092773438, "logps/rejected": -59.612022399902344, "loss": 1.2369, "rewards/accuracies": 0.0, "rewards/chosen": 0.049957275390625, "rewards/margins": -2.377336263656616, "rewards/rejected": 2.427293539047241, "step": 7709 }, { "epoch": 1.25, "learning_rate": 5.831977651059022e-07, "logits/chosen": -0.6696385741233826, "logits/rejected": -0.6914557814598083, "logps/chosen": -63.546295166015625, "logps/rejected": -49.240966796875, "loss": 0.3882, "rewards/accuracies": 0.0, "rewards/chosen": 1.3974609375, "rewards/margins": -0.1075981855392456, "rewards/rejected": 1.5050591230392456, "step": 7710 }, { "epoch": 1.25, "learning_rate": 5.830681690275303e-07, "logits/chosen": -0.8311462998390198, "logits/rejected": -0.776350736618042, "logps/chosen": -47.7877311706543, "logps/rejected": -48.756004333496094, "loss": 0.4011, "rewards/accuracies": 0.0, "rewards/chosen": 2.125988483428955, "rewards/margins": -0.10478544235229492, "rewards/rejected": 2.23077392578125, "step": 7711 }, { "epoch": 1.25, "learning_rate": 5.829385672099339e-07, "logits/chosen": -0.6342289447784424, "logits/rejected": -0.6342289447784424, "logps/chosen": -45.35874938964844, "logps/rejected": -45.35874938964844, "loss": 0.913, "rewards/accuracies": 0.0, "rewards/chosen": 1.152722954750061, "rewards/margins": 0.0, "rewards/rejected": 1.152722954750061, "step": 7712 }, { "epoch": 1.25, "learning_rate": 5.828089596620674e-07, "logits/chosen": -0.42351293563842773, "logits/rejected": -0.39261394739151, "logps/chosen": -37.10799026489258, "logps/rejected": -39.96977615356445, "loss": 0.4632, "rewards/accuracies": 0.0, "rewards/chosen": 0.972080647945404, "rewards/margins": -0.1233261227607727, "rewards/rejected": 1.0954067707061768, "step": 7713 }, { "epoch": 1.25, "learning_rate": 5.826793463928852e-07, "logits/chosen": -1.0321485996246338, "logits/rejected": -1.0023530721664429, "logps/chosen": -34.773765563964844, "logps/rejected": -30.699899673461914, "loss": 0.5429, "rewards/accuracies": 1.0, "rewards/chosen": 0.8733421564102173, "rewards/margins": 0.03574579954147339, "rewards/rejected": 0.8375963568687439, "step": 7714 }, { "epoch": 1.25, "learning_rate": 5.825497274113425e-07, "logits/chosen": -0.4854394793510437, "logits/rejected": -0.3502868115901947, "logps/chosen": -43.785797119140625, "logps/rejected": -36.49452590942383, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 1.6282074451446533, "rewards/margins": 0.6421715021133423, "rewards/rejected": 0.986035943031311, "step": 7715 }, { "epoch": 1.25, "learning_rate": 5.824201027263947e-07, "logits/chosen": -1.7261770963668823, "logits/rejected": -1.7822908163070679, "logps/chosen": -283.75604248046875, "logps/rejected": -134.98904418945312, "loss": 0.9142, "rewards/accuracies": 0.0, "rewards/chosen": 3.8077149391174316, "rewards/margins": -1.5970244407653809, "rewards/rejected": 5.4047393798828125, "step": 7716 }, { "epoch": 1.25, "learning_rate": 5.822904723469978e-07, "logits/chosen": -1.0740668773651123, "logits/rejected": -0.9354809522628784, "logps/chosen": -114.12342834472656, "logps/rejected": -22.514293670654297, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": 1.3079971075057983, "rewards/margins": 0.8217906951904297, "rewards/rejected": 0.48620644211769104, "step": 7717 }, { "epoch": 1.25, "learning_rate": 5.821608362821077e-07, "logits/chosen": -0.7182820439338684, "logits/rejected": -0.6846757531166077, "logps/chosen": -134.38897705078125, "logps/rejected": -86.78133392333984, "loss": 0.4137, "rewards/accuracies": 0.0, "rewards/chosen": 1.5142288208007812, "rewards/margins": -0.21060264110565186, "rewards/rejected": 1.724831461906433, "step": 7718 }, { "epoch": 1.25, "learning_rate": 5.820311945406814e-07, "logits/chosen": -0.6816153526306152, "logits/rejected": -0.7323092818260193, "logps/chosen": -55.3829345703125, "logps/rejected": -90.10482788085938, "loss": 3.0478, "rewards/accuracies": 0.0, "rewards/chosen": 2.5860671997070312, "rewards/margins": -3.3471803665161133, "rewards/rejected": 5.9332475662231445, "step": 7719 }, { "epoch": 1.25, "learning_rate": 5.819015471316757e-07, "logits/chosen": -0.9104918241500854, "logits/rejected": -0.8525902628898621, "logps/chosen": -61.107666015625, "logps/rejected": -59.47813034057617, "loss": 0.6087, "rewards/accuracies": 1.0, "rewards/chosen": 2.3677475452423096, "rewards/margins": 0.37218594551086426, "rewards/rejected": 1.9955615997314453, "step": 7720 }, { "epoch": 1.25, "learning_rate": 5.817718940640481e-07, "logits/chosen": -0.7047951221466064, "logits/rejected": -0.6035788059234619, "logps/chosen": -53.691776275634766, "logps/rejected": -71.00274658203125, "loss": 1.2937, "rewards/accuracies": 0.0, "rewards/chosen": 1.9035274982452393, "rewards/margins": -0.6262485980987549, "rewards/rejected": 2.529776096343994, "step": 7721 }, { "epoch": 1.25, "learning_rate": 5.816422353467562e-07, "logits/chosen": -0.9195448756217957, "logits/rejected": -0.9122183918952942, "logps/chosen": -101.850830078125, "logps/rejected": -71.99207305908203, "loss": 0.7158, "rewards/accuracies": 0.0, "rewards/chosen": 1.0365616083145142, "rewards/margins": -1.0624946355819702, "rewards/rejected": 2.0990562438964844, "step": 7722 }, { "epoch": 1.25, "learning_rate": 5.815125709887584e-07, "logits/chosen": -0.9715000987052917, "logits/rejected": -0.8882077932357788, "logps/chosen": -48.816444396972656, "logps/rejected": -49.23590850830078, "loss": 0.4377, "rewards/accuracies": 0.0, "rewards/chosen": 1.795690894126892, "rewards/margins": -0.09357309341430664, "rewards/rejected": 1.8892639875411987, "step": 7723 }, { "epoch": 1.25, "learning_rate": 5.813829009990132e-07, "logits/chosen": -0.5328758358955383, "logits/rejected": -0.5328758358955383, "logps/chosen": -84.62132263183594, "logps/rejected": -84.62132263183594, "loss": 0.95, "rewards/accuracies": 0.0, "rewards/chosen": 1.8759063482284546, "rewards/margins": 0.0, "rewards/rejected": 1.8759063482284546, "step": 7724 }, { "epoch": 1.25, "learning_rate": 5.812532253864797e-07, "logits/chosen": -0.13476170599460602, "logits/rejected": -0.1372859925031662, "logps/chosen": -2.1908035278320312, "logps/rejected": -2.1514415740966797, "loss": 0.434, "rewards/accuracies": 1.0, "rewards/chosen": 0.21138668060302734, "rewards/margins": 0.0013686120510101318, "rewards/rejected": 0.2100180685520172, "step": 7725 }, { "epoch": 1.25, "learning_rate": 5.811235441601171e-07, "logits/chosen": -0.4872174561023712, "logits/rejected": -0.4872174561023712, "logps/chosen": -40.72310256958008, "logps/rejected": -40.72310256958008, "loss": 0.8772, "rewards/accuracies": 0.0, "rewards/chosen": 1.7011768817901611, "rewards/margins": 0.0, "rewards/rejected": 1.7011768817901611, "step": 7726 }, { "epoch": 1.25, "learning_rate": 5.809938573288852e-07, "logits/chosen": -0.5388674736022949, "logits/rejected": -0.600586473941803, "logps/chosen": -77.35368347167969, "logps/rejected": -100.40142059326172, "loss": 1.2723, "rewards/accuracies": 0.0, "rewards/chosen": 2.0440895557403564, "rewards/margins": -2.123840570449829, "rewards/rejected": 4.1679301261901855, "step": 7727 }, { "epoch": 1.25, "learning_rate": 5.80864164901744e-07, "logits/chosen": -0.5816952586174011, "logits/rejected": -0.6148470640182495, "logps/chosen": -103.85881042480469, "logps/rejected": -56.37690353393555, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 6.458400249481201, "rewards/margins": 4.238607406616211, "rewards/rejected": 2.219792604446411, "step": 7728 }, { "epoch": 1.25, "learning_rate": 5.807344668876544e-07, "logits/chosen": -0.8744063973426819, "logits/rejected": -0.8094122409820557, "logps/chosen": -84.22651672363281, "logps/rejected": -41.42755889892578, "loss": 0.0311, "rewards/accuracies": 1.0, "rewards/chosen": 4.3472723960876465, "rewards/margins": 2.776395797729492, "rewards/rejected": 1.5708767175674438, "step": 7729 }, { "epoch": 1.25, "learning_rate": 5.806047632955769e-07, "logits/chosen": -0.9562639594078064, "logits/rejected": -0.9014142751693726, "logps/chosen": -114.24917602539062, "logps/rejected": -102.33871459960938, "loss": 0.2861, "rewards/accuracies": 1.0, "rewards/chosen": 3.073800802230835, "rewards/margins": 0.5565552711486816, "rewards/rejected": 2.5172455310821533, "step": 7730 }, { "epoch": 1.25, "learning_rate": 5.80475054134473e-07, "logits/chosen": -0.7128648161888123, "logits/rejected": -0.8717788457870483, "logps/chosen": -64.4475326538086, "logps/rejected": -1947.4110107421875, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 3.37724232673645, "rewards/margins": 2.028597354888916, "rewards/rejected": 1.3486450910568237, "step": 7731 }, { "epoch": 1.25, "learning_rate": 5.803453394133042e-07, "logits/chosen": -0.7464585900306702, "logits/rejected": -0.7974638938903809, "logps/chosen": -131.67613220214844, "logps/rejected": -100.771240234375, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 5.178070068359375, "rewards/margins": 1.2781448364257812, "rewards/rejected": 3.8999252319335938, "step": 7732 }, { "epoch": 1.26, "learning_rate": 5.80215619141033e-07, "logits/chosen": -0.5649585127830505, "logits/rejected": -0.6281102299690247, "logps/chosen": -71.69279479980469, "logps/rejected": -127.65605926513672, "loss": 1.2651, "rewards/accuracies": 0.0, "rewards/chosen": 1.7161781787872314, "rewards/margins": -1.9903168678283691, "rewards/rejected": 3.7064950466156006, "step": 7733 }, { "epoch": 1.26, "learning_rate": 5.800858933266212e-07, "logits/chosen": -0.727191686630249, "logits/rejected": -0.6840276122093201, "logps/chosen": -46.16596984863281, "logps/rejected": -141.06846618652344, "loss": 0.661, "rewards/accuracies": 0.0, "rewards/chosen": 4.137908935546875, "rewards/margins": -0.9974703788757324, "rewards/rejected": 5.135379314422607, "step": 7734 }, { "epoch": 1.26, "learning_rate": 5.799561619790321e-07, "logits/chosen": -0.4156448245048523, "logits/rejected": -0.3339058458805084, "logps/chosen": -71.62417602539062, "logps/rejected": -72.29022216796875, "loss": 0.1554, "rewards/accuracies": 1.0, "rewards/chosen": 2.164532423019409, "rewards/margins": 1.486783504486084, "rewards/rejected": 0.6777488589286804, "step": 7735 }, { "epoch": 1.26, "learning_rate": 5.798264251072286e-07, "logits/chosen": -0.5239267349243164, "logits/rejected": -0.5239267349243164, "logps/chosen": -51.328887939453125, "logps/rejected": -51.328887939453125, "loss": 1.9147, "rewards/accuracies": 0.0, "rewards/chosen": 1.315894365310669, "rewards/margins": 0.0, "rewards/rejected": 1.315894365310669, "step": 7736 }, { "epoch": 1.26, "learning_rate": 5.796966827201746e-07, "logits/chosen": -0.5288100242614746, "logits/rejected": -0.46040794253349304, "logps/chosen": -52.75061798095703, "logps/rejected": -69.97260284423828, "loss": 0.3971, "rewards/accuracies": 0.0, "rewards/chosen": 1.3105354309082031, "rewards/margins": -0.16804051399230957, "rewards/rejected": 1.4785759449005127, "step": 7737 }, { "epoch": 1.26, "learning_rate": 5.795669348268338e-07, "logits/chosen": -0.5406152009963989, "logits/rejected": -0.6515023112297058, "logps/chosen": -49.96797561645508, "logps/rejected": -116.24797058105469, "loss": 0.7508, "rewards/accuracies": 0.0, "rewards/chosen": 1.6746082305908203, "rewards/margins": -0.21879231929779053, "rewards/rejected": 1.8934005498886108, "step": 7738 }, { "epoch": 1.26, "learning_rate": 5.794371814361708e-07, "logits/chosen": -0.21913909912109375, "logits/rejected": -0.21913909912109375, "logps/chosen": -44.35877990722656, "logps/rejected": -44.35877990722656, "loss": 0.3524, "rewards/accuracies": 0.0, "rewards/chosen": 2.217182159423828, "rewards/margins": 0.0, "rewards/rejected": 2.217182159423828, "step": 7739 }, { "epoch": 1.26, "learning_rate": 5.793074225571501e-07, "logits/chosen": -0.4985668361186981, "logits/rejected": -0.4985668361186981, "logps/chosen": -35.824466705322266, "logps/rejected": -35.824466705322266, "loss": 0.5148, "rewards/accuracies": 0.0, "rewards/chosen": 0.2291431427001953, "rewards/margins": 0.0, "rewards/rejected": 0.2291431427001953, "step": 7740 }, { "epoch": 1.26, "learning_rate": 5.79177658198737e-07, "logits/chosen": -0.6031630039215088, "logits/rejected": -0.5662983059883118, "logps/chosen": -60.05674743652344, "logps/rejected": -58.387062072753906, "loss": 0.8657, "rewards/accuracies": 0.0, "rewards/chosen": 1.167872667312622, "rewards/margins": -1.5198884010314941, "rewards/rejected": 2.687761068344116, "step": 7741 }, { "epoch": 1.26, "learning_rate": 5.790478883698969e-07, "logits/chosen": -1.1992789506912231, "logits/rejected": -1.1860063076019287, "logps/chosen": -128.2572021484375, "logps/rejected": -68.68123626708984, "loss": 0.5884, "rewards/accuracies": 0.0, "rewards/chosen": 1.0177948474884033, "rewards/margins": -0.8074439764022827, "rewards/rejected": 1.825238823890686, "step": 7742 }, { "epoch": 1.26, "learning_rate": 5.789181130795957e-07, "logits/chosen": -0.5762159824371338, "logits/rejected": -0.5435615181922913, "logps/chosen": -46.72467803955078, "logps/rejected": -67.91667175292969, "loss": 0.9656, "rewards/accuracies": 0.0, "rewards/chosen": 1.214411973953247, "rewards/margins": -0.6989539861679077, "rewards/rejected": 1.9133659601211548, "step": 7743 }, { "epoch": 1.26, "learning_rate": 5.787883323367995e-07, "logits/chosen": -0.720475435256958, "logits/rejected": -0.7487469911575317, "logps/chosen": -151.57594299316406, "logps/rejected": -177.99533081054688, "loss": 0.837, "rewards/accuracies": 1.0, "rewards/chosen": 4.7354631423950195, "rewards/margins": 0.15165424346923828, "rewards/rejected": 4.583808898925781, "step": 7744 }, { "epoch": 1.26, "learning_rate": 5.786585461504751e-07, "logits/chosen": -0.5282022356987, "logits/rejected": -0.6012980341911316, "logps/chosen": -233.69369506835938, "logps/rejected": -188.6147003173828, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 3.36094069480896, "rewards/margins": 2.0213546752929688, "rewards/rejected": 1.3395859003067017, "step": 7745 }, { "epoch": 1.26, "learning_rate": 5.785287545295895e-07, "logits/chosen": -0.808440089225769, "logits/rejected": -0.8030269742012024, "logps/chosen": -39.61839294433594, "logps/rejected": -25.16280746459961, "loss": 0.5606, "rewards/accuracies": 0.0, "rewards/chosen": 1.6163711547851562, "rewards/margins": -0.21604466438293457, "rewards/rejected": 1.8324158191680908, "step": 7746 }, { "epoch": 1.26, "learning_rate": 5.783989574831099e-07, "logits/chosen": -0.4679763615131378, "logits/rejected": -0.47244781255722046, "logps/chosen": -8.674182891845703, "logps/rejected": -7.658725738525391, "loss": 2.1575, "rewards/accuracies": 0.0, "rewards/chosen": -0.26460000872612, "rewards/margins": -0.2681683301925659, "rewards/rejected": 0.0035683156456798315, "step": 7747 }, { "epoch": 1.26, "learning_rate": 5.782691550200042e-07, "logits/chosen": -0.748333215713501, "logits/rejected": -0.7440991401672363, "logps/chosen": -91.1102294921875, "logps/rejected": -62.34156036376953, "loss": 0.6921, "rewards/accuracies": 0.0, "rewards/chosen": 1.423298716545105, "rewards/margins": -0.4884955883026123, "rewards/rejected": 1.9117943048477173, "step": 7748 }, { "epoch": 1.26, "learning_rate": 5.781393471492405e-07, "logits/chosen": -0.922931432723999, "logits/rejected": -0.922931432723999, "logps/chosen": -83.3026351928711, "logps/rejected": -83.3026351928711, "loss": 0.3611, "rewards/accuracies": 0.0, "rewards/chosen": 3.5994088649749756, "rewards/margins": 0.0, "rewards/rejected": 3.5994088649749756, "step": 7749 }, { "epoch": 1.26, "learning_rate": 5.780095338797873e-07, "logits/chosen": -1.3033123016357422, "logits/rejected": -1.234708309173584, "logps/chosen": -84.40116119384766, "logps/rejected": -7.375661373138428, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.4009681940078735, "rewards/margins": 0.7333776354789734, "rewards/rejected": 0.6675905585289001, "step": 7750 }, { "epoch": 1.26, "learning_rate": 5.778797152206133e-07, "logits/chosen": -0.32117828726768494, "logits/rejected": -0.3963527977466583, "logps/chosen": -102.8704833984375, "logps/rejected": -54.651329040527344, "loss": 1.3641, "rewards/accuracies": 0.0, "rewards/chosen": 0.8626465201377869, "rewards/margins": -1.8949224948883057, "rewards/rejected": 2.7575690746307373, "step": 7751 }, { "epoch": 1.26, "learning_rate": 5.777498911806879e-07, "logits/chosen": -0.7319853901863098, "logits/rejected": -0.5474449992179871, "logps/chosen": -119.93009185791016, "logps/rejected": -61.8193359375, "loss": 0.3249, "rewards/accuracies": 1.0, "rewards/chosen": 2.421102285385132, "rewards/margins": 0.1335442066192627, "rewards/rejected": 2.287558078765869, "step": 7752 }, { "epoch": 1.26, "learning_rate": 5.776200617689808e-07, "logits/chosen": -0.7160201668739319, "logits/rejected": -0.6827492713928223, "logps/chosen": -57.19945526123047, "logps/rejected": -11.629118919372559, "loss": 2.3291, "rewards/accuracies": 0.0, "rewards/chosen": 0.04923553392291069, "rewards/margins": -0.42866984009742737, "rewards/rejected": 0.47790536284446716, "step": 7753 }, { "epoch": 1.26, "learning_rate": 5.774902269944619e-07, "logits/chosen": -0.9984578490257263, "logits/rejected": -1.014145016670227, "logps/chosen": -52.53705596923828, "logps/rejected": -84.23748779296875, "loss": 0.3707, "rewards/accuracies": 0.0, "rewards/chosen": 1.8480781316757202, "rewards/margins": -0.08403706550598145, "rewards/rejected": 1.9321151971817017, "step": 7754 }, { "epoch": 1.26, "learning_rate": 5.773603868661014e-07, "logits/chosen": -0.5634953379631042, "logits/rejected": -0.49336862564086914, "logps/chosen": -56.85000991821289, "logps/rejected": -35.3455924987793, "loss": 0.2138, "rewards/accuracies": 1.0, "rewards/chosen": 2.2070324420928955, "rewards/margins": 0.7735550403594971, "rewards/rejected": 1.4334774017333984, "step": 7755 }, { "epoch": 1.26, "learning_rate": 5.772305413928703e-07, "logits/chosen": -0.6051425337791443, "logits/rejected": -0.3145190477371216, "logps/chosen": -74.5982894897461, "logps/rejected": -46.540409088134766, "loss": 1.1366, "rewards/accuracies": 1.0, "rewards/chosen": 5.031951427459717, "rewards/margins": 3.2347960472106934, "rewards/rejected": 1.7971553802490234, "step": 7756 }, { "epoch": 1.26, "learning_rate": 5.771006905837395e-07, "logits/chosen": -0.6190364956855774, "logits/rejected": -0.5568051338195801, "logps/chosen": -57.72905349731445, "logps/rejected": -73.81501770019531, "loss": 0.3991, "rewards/accuracies": 0.0, "rewards/chosen": 2.229336977005005, "rewards/margins": -0.1648106575012207, "rewards/rejected": 2.3941476345062256, "step": 7757 }, { "epoch": 1.26, "learning_rate": 5.769708344476805e-07, "logits/chosen": -0.8854254484176636, "logits/rejected": -0.7673252820968628, "logps/chosen": -39.76633071899414, "logps/rejected": -57.525238037109375, "loss": 0.5684, "rewards/accuracies": 0.0, "rewards/chosen": 2.053663969039917, "rewards/margins": -0.7413129806518555, "rewards/rejected": 2.7949769496917725, "step": 7758 }, { "epoch": 1.26, "learning_rate": 5.768409729936652e-07, "logits/chosen": -0.8487996459007263, "logits/rejected": -0.8903274536132812, "logps/chosen": -69.57095336914062, "logps/rejected": -117.38719177246094, "loss": 1.7942, "rewards/accuracies": 0.0, "rewards/chosen": 0.6553894281387329, "rewards/margins": -2.156519889831543, "rewards/rejected": 2.8119094371795654, "step": 7759 }, { "epoch": 1.26, "learning_rate": 5.767111062306657e-07, "logits/chosen": -0.6761399507522583, "logits/rejected": -0.6761399507522583, "logps/chosen": -116.37094116210938, "logps/rejected": -116.37094116210938, "loss": 1.0983, "rewards/accuracies": 0.0, "rewards/chosen": 1.9465820789337158, "rewards/margins": 0.0, "rewards/rejected": 1.9465820789337158, "step": 7760 }, { "epoch": 1.26, "learning_rate": 5.765812341676546e-07, "logits/chosen": -0.6241941452026367, "logits/rejected": -0.6267293095588684, "logps/chosen": -126.7549057006836, "logps/rejected": -138.2049560546875, "loss": 1.0724, "rewards/accuracies": 0.0, "rewards/chosen": 1.4869804382324219, "rewards/margins": -2.0103538036346436, "rewards/rejected": 3.4973342418670654, "step": 7761 }, { "epoch": 1.26, "learning_rate": 5.76451356813605e-07, "logits/chosen": -0.368274450302124, "logits/rejected": -0.3827294707298279, "logps/chosen": -47.37725830078125, "logps/rejected": -75.8124008178711, "loss": 2.4623, "rewards/accuracies": 0.0, "rewards/chosen": 0.025511933490633965, "rewards/margins": -0.21619567275047302, "rewards/rejected": 0.24170760810375214, "step": 7762 }, { "epoch": 1.26, "learning_rate": 5.763214741774898e-07, "logits/chosen": -1.067901372909546, "logits/rejected": -0.9823569059371948, "logps/chosen": -42.41449737548828, "logps/rejected": -20.6666259765625, "loss": 0.0899, "rewards/accuracies": 1.0, "rewards/chosen": 2.176137685775757, "rewards/margins": 1.8268687725067139, "rewards/rejected": 0.34926891326904297, "step": 7763 }, { "epoch": 1.26, "learning_rate": 5.761915862682829e-07, "logits/chosen": -0.5972376465797424, "logits/rejected": -0.6152852773666382, "logps/chosen": -39.44200897216797, "logps/rejected": -101.82499694824219, "loss": 0.2567, "rewards/accuracies": 1.0, "rewards/chosen": 1.2302772998809814, "rewards/margins": 1.7805747985839844, "rewards/rejected": -0.5502975583076477, "step": 7764 }, { "epoch": 1.26, "learning_rate": 5.760616930949584e-07, "logits/chosen": -0.5186898708343506, "logits/rejected": -0.3195141553878784, "logps/chosen": -84.46646118164062, "logps/rejected": -29.575424194335938, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 0.42012786865234375, "rewards/margins": 0.44398385286331177, "rewards/rejected": -0.023855973035097122, "step": 7765 }, { "epoch": 1.26, "learning_rate": 5.759317946664905e-07, "logits/chosen": -0.6639750003814697, "logits/rejected": -0.5918102860450745, "logps/chosen": -48.46721649169922, "logps/rejected": -53.093055725097656, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 1.3891441822052002, "rewards/margins": -0.18499290943145752, "rewards/rejected": 1.5741370916366577, "step": 7766 }, { "epoch": 1.26, "learning_rate": 5.758018909918542e-07, "logits/chosen": -0.6706414222717285, "logits/rejected": -0.6706414222717285, "logps/chosen": -41.77796936035156, "logps/rejected": -41.77796936035156, "loss": 0.3903, "rewards/accuracies": 0.0, "rewards/chosen": 0.7225742340087891, "rewards/margins": 0.0, "rewards/rejected": 0.7225742340087891, "step": 7767 }, { "epoch": 1.26, "learning_rate": 5.756719820800245e-07, "logits/chosen": -0.6759992241859436, "logits/rejected": -1.1027653217315674, "logps/chosen": -139.18832397460938, "logps/rejected": -36.45045471191406, "loss": 0.2985, "rewards/accuracies": 1.0, "rewards/chosen": 1.7477387189865112, "rewards/margins": 1.4883408546447754, "rewards/rejected": 0.2593978941440582, "step": 7768 }, { "epoch": 1.26, "learning_rate": 5.755420679399768e-07, "logits/chosen": -0.6256492137908936, "logits/rejected": -0.6268749237060547, "logps/chosen": -66.94898223876953, "logps/rejected": -84.52976989746094, "loss": 1.1222, "rewards/accuracies": 1.0, "rewards/chosen": 1.801042914390564, "rewards/margins": 1.0213630199432373, "rewards/rejected": 0.7796798944473267, "step": 7769 }, { "epoch": 1.26, "learning_rate": 5.75412148580687e-07, "logits/chosen": -0.701188862323761, "logits/rejected": -0.6866200566291809, "logps/chosen": -80.99087524414062, "logps/rejected": -57.4846076965332, "loss": 1.4956, "rewards/accuracies": 1.0, "rewards/chosen": 2.20024037361145, "rewards/margins": 0.8632549047470093, "rewards/rejected": 1.336985468864441, "step": 7770 }, { "epoch": 1.26, "learning_rate": 5.752822240111312e-07, "logits/chosen": -0.32672828435897827, "logits/rejected": -0.331010103225708, "logps/chosen": -8.964390754699707, "logps/rejected": -21.309406280517578, "loss": 0.7793, "rewards/accuracies": 0.0, "rewards/chosen": -0.059754181653261185, "rewards/margins": -0.1731395721435547, "rewards/rejected": 0.1133853942155838, "step": 7771 }, { "epoch": 1.26, "learning_rate": 5.75152294240286e-07, "logits/chosen": -0.41277366876602173, "logits/rejected": -0.3628672659397125, "logps/chosen": -48.71019744873047, "logps/rejected": -55.57478332519531, "loss": 0.9872, "rewards/accuracies": 0.0, "rewards/chosen": 0.5480255484580994, "rewards/margins": -0.9985740780830383, "rewards/rejected": 1.5465996265411377, "step": 7772 }, { "epoch": 1.26, "learning_rate": 5.750223592771285e-07, "logits/chosen": -1.0525438785552979, "logits/rejected": -0.9263230562210083, "logps/chosen": -105.5816421508789, "logps/rejected": -20.6671199798584, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 5.6967082023620605, "rewards/margins": 5.327296733856201, "rewards/rejected": 0.3694114685058594, "step": 7773 }, { "epoch": 1.26, "learning_rate": 5.748924191306358e-07, "logits/chosen": -0.7077468633651733, "logits/rejected": -0.5820004343986511, "logps/chosen": -163.15829467773438, "logps/rejected": -144.65875244140625, "loss": 0.8505, "rewards/accuracies": 1.0, "rewards/chosen": 5.943321228027344, "rewards/margins": 1.2366089820861816, "rewards/rejected": 4.706712245941162, "step": 7774 }, { "epoch": 1.26, "learning_rate": 5.747624738097856e-07, "logits/chosen": -0.8447723984718323, "logits/rejected": -0.5309977531433105, "logps/chosen": -57.895267486572266, "logps/rejected": -101.48758697509766, "loss": 1.3921, "rewards/accuracies": 0.0, "rewards/chosen": 2.251983404159546, "rewards/margins": -1.8569238185882568, "rewards/rejected": 4.108907222747803, "step": 7775 }, { "epoch": 1.26, "learning_rate": 5.746325233235559e-07, "logits/chosen": -0.5551899075508118, "logits/rejected": -0.5627840161323547, "logps/chosen": -1.98361337184906, "logps/rejected": -4.257194519042969, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.34406420588493347, "rewards/margins": 0.05785202980041504, "rewards/rejected": 0.28621217608451843, "step": 7776 }, { "epoch": 1.26, "learning_rate": 5.74502567680925e-07, "logits/chosen": -0.6103280186653137, "logits/rejected": -0.4900428354740143, "logps/chosen": -248.25794982910156, "logps/rejected": -166.95335388183594, "loss": 1.2828, "rewards/accuracies": 0.0, "rewards/chosen": 2.1596665382385254, "rewards/margins": -2.472154140472412, "rewards/rejected": 4.6318206787109375, "step": 7777 }, { "epoch": 1.26, "learning_rate": 5.743726068908717e-07, "logits/chosen": -0.81043541431427, "logits/rejected": -0.894629955291748, "logps/chosen": -88.4842300415039, "logps/rejected": -110.65135955810547, "loss": 1.6546, "rewards/accuracies": 0.0, "rewards/chosen": 2.3699090480804443, "rewards/margins": -1.171281337738037, "rewards/rejected": 3.5411903858184814, "step": 7778 }, { "epoch": 1.26, "learning_rate": 5.742426409623748e-07, "logits/chosen": -0.7203461527824402, "logits/rejected": -0.6643933653831482, "logps/chosen": -46.670719146728516, "logps/rejected": -136.3328399658203, "loss": 0.2554, "rewards/accuracies": 1.0, "rewards/chosen": 1.941383719444275, "rewards/margins": 0.5252689123153687, "rewards/rejected": 1.4161148071289062, "step": 7779 }, { "epoch": 1.26, "learning_rate": 5.74112669904414e-07, "logits/chosen": -0.495449423789978, "logits/rejected": -0.38587167859077454, "logps/chosen": -96.82540130615234, "logps/rejected": -17.501018524169922, "loss": 0.3474, "rewards/accuracies": 1.0, "rewards/chosen": 1.0376685857772827, "rewards/margins": 0.8475528359413147, "rewards/rejected": 0.19011573493480682, "step": 7780 }, { "epoch": 1.26, "learning_rate": 5.73982693725969e-07, "logits/chosen": -0.8313154578208923, "logits/rejected": -0.8912534713745117, "logps/chosen": -22.387939453125, "logps/rejected": -61.33937072753906, "loss": 0.5968, "rewards/accuracies": 0.0, "rewards/chosen": 1.9819408655166626, "rewards/margins": -0.09704101085662842, "rewards/rejected": 2.078981876373291, "step": 7781 }, { "epoch": 1.26, "learning_rate": 5.738527124360199e-07, "logits/chosen": -0.9657005667686462, "logits/rejected": -0.9657005667686462, "logps/chosen": -71.33015441894531, "logps/rejected": -71.33015441894531, "loss": 0.36, "rewards/accuracies": 0.0, "rewards/chosen": 1.7286690473556519, "rewards/margins": 0.0, "rewards/rejected": 1.7286690473556519, "step": 7782 }, { "epoch": 1.26, "learning_rate": 5.737227260435471e-07, "logits/chosen": -0.12196168303489685, "logits/rejected": -0.12298320978879929, "logps/chosen": -4.273199558258057, "logps/rejected": -8.522098541259766, "loss": 0.6576, "rewards/accuracies": 1.0, "rewards/chosen": 0.23468776047229767, "rewards/margins": 0.28393760323524475, "rewards/rejected": -0.049249839037656784, "step": 7783 }, { "epoch": 1.26, "learning_rate": 5.735927345575315e-07, "logits/chosen": -1.176474928855896, "logits/rejected": -1.1311795711517334, "logps/chosen": -119.2844009399414, "logps/rejected": -39.62744140625, "loss": 0.2328, "rewards/accuracies": 1.0, "rewards/chosen": 1.412981390953064, "rewards/margins": 1.2005199193954468, "rewards/rejected": 0.2124614715576172, "step": 7784 }, { "epoch": 1.26, "learning_rate": 5.734627379869543e-07, "logits/chosen": -0.9706023335456848, "logits/rejected": -0.829045295715332, "logps/chosen": -128.17678833007812, "logps/rejected": -105.24076843261719, "loss": 0.038, "rewards/accuracies": 1.0, "rewards/chosen": 6.719607830047607, "rewards/margins": 2.835624933242798, "rewards/rejected": 3.8839828968048096, "step": 7785 }, { "epoch": 1.26, "learning_rate": 5.733327363407972e-07, "logits/chosen": -0.3958686590194702, "logits/rejected": -0.3957265019416809, "logps/chosen": -78.05557250976562, "logps/rejected": -61.985145568847656, "loss": 0.5288, "rewards/accuracies": 0.0, "rewards/chosen": 1.5349518060684204, "rewards/margins": -0.6211251020431519, "rewards/rejected": 2.1560769081115723, "step": 7786 }, { "epoch": 1.26, "learning_rate": 5.732027296280417e-07, "logits/chosen": -0.16289252042770386, "logits/rejected": -0.15857064723968506, "logps/chosen": -3.6420488357543945, "logps/rejected": -2.9688336849212646, "loss": 0.7296, "rewards/accuracies": 0.0, "rewards/chosen": 0.46277618408203125, "rewards/margins": -0.14093893766403198, "rewards/rejected": 0.6037151217460632, "step": 7787 }, { "epoch": 1.26, "learning_rate": 5.730727178576703e-07, "logits/chosen": -0.6950513124465942, "logits/rejected": -0.6595087051391602, "logps/chosen": -82.926025390625, "logps/rejected": -63.97787857055664, "loss": 0.187, "rewards/accuracies": 1.0, "rewards/chosen": 1.162604570388794, "rewards/margins": 0.9901531934738159, "rewards/rejected": 0.17245140671730042, "step": 7788 }, { "epoch": 1.26, "learning_rate": 5.729427010386655e-07, "logits/chosen": -0.5460684895515442, "logits/rejected": -0.5460684895515442, "logps/chosen": -0.9391068816184998, "logps/rejected": -0.9391068816184998, "loss": 0.8216, "rewards/accuracies": 0.0, "rewards/chosen": 0.23631639778614044, "rewards/margins": 0.0, "rewards/rejected": 0.23631639778614044, "step": 7789 }, { "epoch": 1.26, "learning_rate": 5.728126791800102e-07, "logits/chosen": -0.2459912747144699, "logits/rejected": -0.2459912747144699, "logps/chosen": -0.39252036809921265, "logps/rejected": -0.39252036809921265, "loss": 0.3883, "rewards/accuracies": 0.0, "rewards/chosen": 0.11323447525501251, "rewards/margins": 0.0, "rewards/rejected": 0.11323447525501251, "step": 7790 }, { "epoch": 1.26, "learning_rate": 5.726826522906878e-07, "logits/chosen": -0.8343616127967834, "logits/rejected": -0.7843611836433411, "logps/chosen": -43.664791107177734, "logps/rejected": -122.95668029785156, "loss": 0.468, "rewards/accuracies": 1.0, "rewards/chosen": 1.9772571325302124, "rewards/margins": 0.04306364059448242, "rewards/rejected": 1.93419349193573, "step": 7791 }, { "epoch": 1.26, "learning_rate": 5.725526203796818e-07, "logits/chosen": -0.5924146175384521, "logits/rejected": -0.5598639249801636, "logps/chosen": -32.57434844970703, "logps/rejected": -81.18170928955078, "loss": 0.4805, "rewards/accuracies": 0.0, "rewards/chosen": 0.919342041015625, "rewards/margins": -0.21890640258789062, "rewards/rejected": 1.1382484436035156, "step": 7792 }, { "epoch": 1.26, "learning_rate": 5.724225834559761e-07, "logits/chosen": -0.9340782165527344, "logits/rejected": -0.7994042634963989, "logps/chosen": -157.20327758789062, "logps/rejected": -98.42692565917969, "loss": 0.1671, "rewards/accuracies": 1.0, "rewards/chosen": 4.447348117828369, "rewards/margins": 0.9565689563751221, "rewards/rejected": 3.490779161453247, "step": 7793 }, { "epoch": 1.27, "learning_rate": 5.722925415285554e-07, "logits/chosen": -0.670626699924469, "logits/rejected": -0.6907364726066589, "logps/chosen": -41.2393684387207, "logps/rejected": -77.24209594726562, "loss": 0.7633, "rewards/accuracies": 0.0, "rewards/chosen": 1.675183892250061, "rewards/margins": -0.45597612857818604, "rewards/rejected": 2.131160020828247, "step": 7794 }, { "epoch": 1.27, "learning_rate": 5.721624946064041e-07, "logits/chosen": -0.5551603436470032, "logits/rejected": -0.5273990631103516, "logps/chosen": -66.42135620117188, "logps/rejected": -60.79496765136719, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 2.1967239379882812, "rewards/margins": 0.39796602725982666, "rewards/rejected": 1.7987579107284546, "step": 7795 }, { "epoch": 1.27, "learning_rate": 5.72032442698507e-07, "logits/chosen": -0.5844001770019531, "logits/rejected": -0.4538692235946655, "logps/chosen": -36.35869216918945, "logps/rejected": -24.20198631286621, "loss": 0.9015, "rewards/accuracies": 1.0, "rewards/chosen": 1.4528034925460815, "rewards/margins": 1.1871886253356934, "rewards/rejected": 0.26561489701271057, "step": 7796 }, { "epoch": 1.27, "learning_rate": 5.719023858138499e-07, "logits/chosen": -0.7027592062950134, "logits/rejected": -0.38926658034324646, "logps/chosen": -91.23623657226562, "logps/rejected": -62.22087860107422, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 5.896356105804443, "rewards/margins": 3.779407262802124, "rewards/rejected": 2.1169488430023193, "step": 7797 }, { "epoch": 1.27, "learning_rate": 5.717723239614182e-07, "logits/chosen": -0.2679883539676666, "logits/rejected": -0.2679883539676666, "logps/chosen": -5.509575843811035, "logps/rejected": -5.509575843811035, "loss": 0.736, "rewards/accuracies": 0.0, "rewards/chosen": 0.2408524602651596, "rewards/margins": 0.0, "rewards/rejected": 0.2408524602651596, "step": 7798 }, { "epoch": 1.27, "learning_rate": 5.716422571501981e-07, "logits/chosen": -0.8569248914718628, "logits/rejected": -0.9331606030464172, "logps/chosen": -153.99266052246094, "logps/rejected": -184.57565307617188, "loss": 1.7818, "rewards/accuracies": 0.0, "rewards/chosen": 3.3711624145507812, "rewards/margins": -3.1607470512390137, "rewards/rejected": 6.531909465789795, "step": 7799 }, { "epoch": 1.27, "learning_rate": 5.715121853891757e-07, "logits/chosen": -1.025484323501587, "logits/rejected": -0.643248975276947, "logps/chosen": -115.9770278930664, "logps/rejected": -47.58546447753906, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 4.360572338104248, "rewards/margins": 3.2717535495758057, "rewards/rejected": 1.0888187885284424, "step": 7800 }, { "epoch": 1.27, "learning_rate": 5.713821086873382e-07, "logits/chosen": -0.9594162106513977, "logits/rejected": -0.866966724395752, "logps/chosen": -66.78547668457031, "logps/rejected": -39.88700485229492, "loss": 0.1862, "rewards/accuracies": 1.0, "rewards/chosen": 1.3011215925216675, "rewards/margins": 1.1062252521514893, "rewards/rejected": 0.19489632546901703, "step": 7801 }, { "epoch": 1.27, "learning_rate": 5.712520270536722e-07, "logits/chosen": -0.396434485912323, "logits/rejected": -0.41113755106925964, "logps/chosen": -6.634773254394531, "logps/rejected": -2.7567756175994873, "loss": 0.5837, "rewards/accuracies": 0.0, "rewards/chosen": -0.13723574578762054, "rewards/margins": -0.329028844833374, "rewards/rejected": 0.19179308414459229, "step": 7802 }, { "epoch": 1.27, "learning_rate": 5.711219404971656e-07, "logits/chosen": -0.6604675650596619, "logits/rejected": -0.5241809487342834, "logps/chosen": -75.85710144042969, "logps/rejected": -60.970577239990234, "loss": 0.9223, "rewards/accuracies": 1.0, "rewards/chosen": 2.9119439125061035, "rewards/margins": 1.2692158222198486, "rewards/rejected": 1.6427280902862549, "step": 7803 }, { "epoch": 1.27, "learning_rate": 5.709918490268056e-07, "logits/chosen": -1.070428490638733, "logits/rejected": -0.96133953332901, "logps/chosen": -166.92042541503906, "logps/rejected": -131.75253295898438, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 6.749492168426514, "rewards/margins": 5.007669448852539, "rewards/rejected": 1.7418228387832642, "step": 7804 }, { "epoch": 1.27, "learning_rate": 5.708617526515808e-07, "logits/chosen": -0.7003183960914612, "logits/rejected": -0.6989700198173523, "logps/chosen": -31.111202239990234, "logps/rejected": -4.702122211456299, "loss": 0.7878, "rewards/accuracies": 0.0, "rewards/chosen": 0.1776554137468338, "rewards/margins": -0.12378774583339691, "rewards/rejected": 0.3014431595802307, "step": 7805 }, { "epoch": 1.27, "learning_rate": 5.707316513804792e-07, "logits/chosen": -0.8597238063812256, "logits/rejected": -0.8370150923728943, "logps/chosen": -120.06450653076172, "logps/rejected": -58.055503845214844, "loss": 0.5879, "rewards/accuracies": 0.0, "rewards/chosen": 2.287595510482788, "rewards/margins": -0.7812559604644775, "rewards/rejected": 3.0688514709472656, "step": 7806 }, { "epoch": 1.27, "learning_rate": 5.706015452224899e-07, "logits/chosen": -0.7960176467895508, "logits/rejected": -0.6746273636817932, "logps/chosen": -69.72221374511719, "logps/rejected": -89.38461303710938, "loss": 0.9957, "rewards/accuracies": 1.0, "rewards/chosen": 1.6150330305099487, "rewards/margins": 0.02735447883605957, "rewards/rejected": 1.5876785516738892, "step": 7807 }, { "epoch": 1.27, "learning_rate": 5.704714341866018e-07, "logits/chosen": -0.6405354738235474, "logits/rejected": -0.6853970885276794, "logps/chosen": -84.02986907958984, "logps/rejected": -69.65487670898438, "loss": 1.076, "rewards/accuracies": 0.0, "rewards/chosen": 1.2450675964355469, "rewards/margins": -1.3250808715820312, "rewards/rejected": 2.570148468017578, "step": 7808 }, { "epoch": 1.27, "learning_rate": 5.703413182818044e-07, "logits/chosen": -0.6479040384292603, "logits/rejected": -0.640418291091919, "logps/chosen": -52.23414611816406, "logps/rejected": -108.3974838256836, "loss": 0.3192, "rewards/accuracies": 1.0, "rewards/chosen": 1.4046920537948608, "rewards/margins": 0.6120185256004333, "rewards/rejected": 0.7926735281944275, "step": 7809 }, { "epoch": 1.27, "learning_rate": 5.702111975170875e-07, "logits/chosen": -0.4904874563217163, "logits/rejected": -0.48496514558792114, "logps/chosen": -139.2710723876953, "logps/rejected": -77.8839111328125, "loss": 2.023, "rewards/accuracies": 0.0, "rewards/chosen": 0.5358414053916931, "rewards/margins": -1.001695156097412, "rewards/rejected": 1.53753662109375, "step": 7810 }, { "epoch": 1.27, "learning_rate": 5.700810719014412e-07, "logits/chosen": -1.0523247718811035, "logits/rejected": -1.0447217226028442, "logps/chosen": -79.34164428710938, "logps/rejected": -41.7170524597168, "loss": 0.6588, "rewards/accuracies": 1.0, "rewards/chosen": 1.0858505964279175, "rewards/margins": 0.4192841053009033, "rewards/rejected": 0.6665664911270142, "step": 7811 }, { "epoch": 1.27, "learning_rate": 5.699509414438559e-07, "logits/chosen": -0.6700646281242371, "logits/rejected": -0.6581907272338867, "logps/chosen": -107.73135375976562, "logps/rejected": -63.523460388183594, "loss": 0.3521, "rewards/accuracies": 1.0, "rewards/chosen": -0.1082763671875, "rewards/margins": 0.031582266092300415, "rewards/rejected": -0.13985863327980042, "step": 7812 }, { "epoch": 1.27, "learning_rate": 5.698208061533224e-07, "logits/chosen": -0.650334894657135, "logits/rejected": -0.6221778988838196, "logps/chosen": -120.86066436767578, "logps/rejected": -109.64970397949219, "loss": 2.3388, "rewards/accuracies": 0.0, "rewards/chosen": 1.7063499689102173, "rewards/margins": -0.17415082454681396, "rewards/rejected": 1.8805007934570312, "step": 7813 }, { "epoch": 1.27, "learning_rate": 5.69690666038832e-07, "logits/chosen": -0.6015194654464722, "logits/rejected": -0.576018214225769, "logps/chosen": -130.51681518554688, "logps/rejected": -78.17584228515625, "loss": 0.5191, "rewards/accuracies": 1.0, "rewards/chosen": 2.7602691650390625, "rewards/margins": 0.6188056468963623, "rewards/rejected": 2.1414635181427, "step": 7814 }, { "epoch": 1.27, "learning_rate": 5.695605211093758e-07, "logits/chosen": -0.9219198822975159, "logits/rejected": -1.1720224618911743, "logps/chosen": -185.34796142578125, "logps/rejected": -53.24808883666992, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 4.870288372039795, "rewards/margins": 3.159252643585205, "rewards/rejected": 1.7110356092453003, "step": 7815 }, { "epoch": 1.27, "learning_rate": 5.694303713739459e-07, "logits/chosen": -0.6373938918113708, "logits/rejected": -0.6347905397415161, "logps/chosen": -43.86266326904297, "logps/rejected": -105.76516723632812, "loss": 0.8067, "rewards/accuracies": 0.0, "rewards/chosen": 1.3782974481582642, "rewards/margins": -1.377203345298767, "rewards/rejected": 2.7555007934570312, "step": 7816 }, { "epoch": 1.27, "learning_rate": 5.693002168415343e-07, "logits/chosen": -0.8704624176025391, "logits/rejected": -0.8874503374099731, "logps/chosen": -79.24967956542969, "logps/rejected": -103.70886993408203, "loss": 0.4532, "rewards/accuracies": 0.0, "rewards/chosen": 1.79193115234375, "rewards/margins": -0.3412201404571533, "rewards/rejected": 2.1331512928009033, "step": 7817 }, { "epoch": 1.27, "learning_rate": 5.691700575211334e-07, "logits/chosen": -0.728526771068573, "logits/rejected": -0.7460231781005859, "logps/chosen": -88.50228881835938, "logps/rejected": -64.57504272460938, "loss": 2.4699, "rewards/accuracies": 0.0, "rewards/chosen": -0.06868286430835724, "rewards/margins": -1.895553708076477, "rewards/rejected": 1.8268707990646362, "step": 7818 }, { "epoch": 1.27, "learning_rate": 5.690398934217361e-07, "logits/chosen": -0.9272538423538208, "logits/rejected": -0.8014528751373291, "logps/chosen": -56.206207275390625, "logps/rejected": -28.883848190307617, "loss": 0.1815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9535690546035767, "rewards/margins": 1.0454500913619995, "rewards/rejected": -0.09188099205493927, "step": 7819 }, { "epoch": 1.27, "learning_rate": 5.689097245523353e-07, "logits/chosen": -0.6073958277702332, "logits/rejected": -0.6211591958999634, "logps/chosen": -86.64940643310547, "logps/rejected": -88.47691345214844, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": 1.6730369329452515, "rewards/margins": 0.08400487899780273, "rewards/rejected": 1.5890320539474487, "step": 7820 }, { "epoch": 1.27, "learning_rate": 5.687795509219246e-07, "logits/chosen": -0.8457876443862915, "logits/rejected": -0.8248346447944641, "logps/chosen": -84.03849792480469, "logps/rejected": -75.98158264160156, "loss": 1.1395, "rewards/accuracies": 1.0, "rewards/chosen": 2.670445203781128, "rewards/margins": 0.33797454833984375, "rewards/rejected": 2.332470655441284, "step": 7821 }, { "epoch": 1.27, "learning_rate": 5.686493725394977e-07, "logits/chosen": -0.6808344721794128, "logits/rejected": -0.6369518041610718, "logps/chosen": -101.80322265625, "logps/rejected": -38.59400939941406, "loss": 0.191, "rewards/accuracies": 1.0, "rewards/chosen": 1.4854332208633423, "rewards/margins": 1.0705238580703735, "rewards/rejected": 0.41490936279296875, "step": 7822 }, { "epoch": 1.27, "learning_rate": 5.685191894140488e-07, "logits/chosen": -0.4929048418998718, "logits/rejected": -0.48067623376846313, "logps/chosen": -31.95315933227539, "logps/rejected": -72.83010864257812, "loss": 1.4726, "rewards/accuracies": 0.0, "rewards/chosen": 0.9433738589286804, "rewards/margins": -1.0995056629180908, "rewards/rejected": 2.042879581451416, "step": 7823 }, { "epoch": 1.27, "learning_rate": 5.683890015545722e-07, "logits/chosen": -0.8803583383560181, "logits/rejected": -0.7367011904716492, "logps/chosen": -134.05435180664062, "logps/rejected": -32.44990158081055, "loss": 0.8492, "rewards/accuracies": 1.0, "rewards/chosen": 1.0272674560546875, "rewards/margins": 0.924399197101593, "rewards/rejected": 0.10286827385425568, "step": 7824 }, { "epoch": 1.27, "learning_rate": 5.682588089700629e-07, "logits/chosen": -0.9104428887367249, "logits/rejected": -0.9130907654762268, "logps/chosen": -52.93971252441406, "logps/rejected": -117.3319320678711, "loss": 0.5244, "rewards/accuracies": 0.0, "rewards/chosen": 2.6594924926757812, "rewards/margins": -0.40695881843566895, "rewards/rejected": 3.06645131111145, "step": 7825 }, { "epoch": 1.27, "learning_rate": 5.681286116695154e-07, "logits/chosen": -0.7156755328178406, "logits/rejected": -0.6414540410041809, "logps/chosen": -143.359130859375, "logps/rejected": -103.61015319824219, "loss": 0.0963, "rewards/accuracies": 1.0, "rewards/chosen": 5.115832805633545, "rewards/margins": 2.2317004203796387, "rewards/rejected": 2.8841323852539062, "step": 7826 }, { "epoch": 1.27, "learning_rate": 5.679984096619257e-07, "logits/chosen": -0.5821007490158081, "logits/rejected": -0.44251060485839844, "logps/chosen": -40.522403717041016, "logps/rejected": -9.858781814575195, "loss": 0.2484, "rewards/accuracies": 1.0, "rewards/chosen": 1.5240001678466797, "rewards/margins": 0.9372261762619019, "rewards/rejected": 0.5867739915847778, "step": 7827 }, { "epoch": 1.27, "learning_rate": 5.678682029562893e-07, "logits/chosen": -0.6758739352226257, "logits/rejected": -0.6664998531341553, "logps/chosen": -52.91382598876953, "logps/rejected": -68.88136291503906, "loss": 1.7601, "rewards/accuracies": 1.0, "rewards/chosen": 2.0523834228515625, "rewards/margins": 0.5880279541015625, "rewards/rejected": 1.46435546875, "step": 7828 }, { "epoch": 1.27, "learning_rate": 5.677379915616022e-07, "logits/chosen": -0.5224865078926086, "logits/rejected": -0.5226632952690125, "logps/chosen": -68.84669494628906, "logps/rejected": -51.92803192138672, "loss": 1.0445, "rewards/accuracies": 0.0, "rewards/chosen": 1.616448998451233, "rewards/margins": -0.06734466552734375, "rewards/rejected": 1.6837936639785767, "step": 7829 }, { "epoch": 1.27, "learning_rate": 5.676077754868609e-07, "logits/chosen": -0.654466450214386, "logits/rejected": -0.451352059841156, "logps/chosen": -116.09708404541016, "logps/rejected": -168.22238159179688, "loss": 0.7908, "rewards/accuracies": 1.0, "rewards/chosen": 5.529245853424072, "rewards/margins": 0.13064956665039062, "rewards/rejected": 5.398596286773682, "step": 7830 }, { "epoch": 1.27, "learning_rate": 5.674775547410619e-07, "logits/chosen": -1.2510544061660767, "logits/rejected": -1.375423789024353, "logps/chosen": -221.95680236816406, "logps/rejected": -26.286148071289062, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 2.842878818511963, "rewards/margins": 2.653735399246216, "rewards/rejected": 0.1891433745622635, "step": 7831 }, { "epoch": 1.27, "learning_rate": 5.673473293332024e-07, "logits/chosen": -0.8575529456138611, "logits/rejected": -0.8061031103134155, "logps/chosen": -139.62667846679688, "logps/rejected": -42.469173431396484, "loss": 0.8012, "rewards/accuracies": 0.0, "rewards/chosen": 1.0440887212753296, "rewards/margins": -0.37465476989746094, "rewards/rejected": 1.4187434911727905, "step": 7832 }, { "epoch": 1.27, "learning_rate": 5.672170992722797e-07, "logits/chosen": -0.69792640209198, "logits/rejected": -0.6177303194999695, "logps/chosen": -72.54898071289062, "logps/rejected": -76.31048583984375, "loss": 0.3738, "rewards/accuracies": 0.0, "rewards/chosen": 1.8045669794082642, "rewards/margins": -0.08626711368560791, "rewards/rejected": 1.890834093093872, "step": 7833 }, { "epoch": 1.27, "learning_rate": 5.670868645672915e-07, "logits/chosen": -0.9343448877334595, "logits/rejected": -0.9303393959999084, "logps/chosen": -176.143310546875, "logps/rejected": -149.09005737304688, "loss": 0.0503, "rewards/accuracies": 1.0, "rewards/chosen": 4.306178569793701, "rewards/margins": 3.1149370670318604, "rewards/rejected": 1.1912415027618408, "step": 7834 }, { "epoch": 1.27, "learning_rate": 5.669566252272357e-07, "logits/chosen": -0.6596592664718628, "logits/rejected": -0.7857702970504761, "logps/chosen": -112.54811096191406, "logps/rejected": -144.6612091064453, "loss": 1.7275, "rewards/accuracies": 0.0, "rewards/chosen": 4.139262676239014, "rewards/margins": -2.045931816101074, "rewards/rejected": 6.185194492340088, "step": 7835 }, { "epoch": 1.27, "learning_rate": 5.668263812611105e-07, "logits/chosen": -0.8328121304512024, "logits/rejected": -1.0573946237564087, "logps/chosen": -255.92417907714844, "logps/rejected": -167.9097442626953, "loss": 1.2489, "rewards/accuracies": 0.0, "rewards/chosen": 3.4116928577423096, "rewards/margins": -2.2938997745513916, "rewards/rejected": 5.705592632293701, "step": 7836 }, { "epoch": 1.27, "learning_rate": 5.666961326779147e-07, "logits/chosen": -0.770243763923645, "logits/rejected": -0.7763809561729431, "logps/chosen": -59.257080078125, "logps/rejected": -61.774566650390625, "loss": 0.5042, "rewards/accuracies": 0.0, "rewards/chosen": 1.1292823553085327, "rewards/margins": -8.094310760498047e-05, "rewards/rejected": 1.1293632984161377, "step": 7837 }, { "epoch": 1.27, "learning_rate": 5.665658794866473e-07, "logits/chosen": -0.740405797958374, "logits/rejected": -0.6396101713180542, "logps/chosen": -180.17445373535156, "logps/rejected": -79.34339904785156, "loss": 0.5021, "rewards/accuracies": 1.0, "rewards/chosen": 4.128419399261475, "rewards/margins": 0.29934215545654297, "rewards/rejected": 3.8290772438049316, "step": 7838 }, { "epoch": 1.27, "learning_rate": 5.664356216963075e-07, "logits/chosen": -0.7011770606040955, "logits/rejected": -0.619558572769165, "logps/chosen": -48.964927673339844, "logps/rejected": -15.393594741821289, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 1.68663489818573, "rewards/margins": 0.8002819418907166, "rewards/rejected": 0.8863529562950134, "step": 7839 }, { "epoch": 1.27, "learning_rate": 5.663053593158951e-07, "logits/chosen": -0.5093571543693542, "logits/rejected": -0.5986015796661377, "logps/chosen": -67.21286010742188, "logps/rejected": -108.88604736328125, "loss": 3.1674, "rewards/accuracies": 0.0, "rewards/chosen": 1.3778671026229858, "rewards/margins": -3.821577548980713, "rewards/rejected": 5.199444770812988, "step": 7840 }, { "epoch": 1.27, "learning_rate": 5.661750923544095e-07, "logits/chosen": -0.8424410223960876, "logits/rejected": -0.795968770980835, "logps/chosen": -87.3615493774414, "logps/rejected": -87.96095275878906, "loss": 0.3758, "rewards/accuracies": 0.0, "rewards/chosen": 2.0042741298675537, "rewards/margins": -0.04813385009765625, "rewards/rejected": 2.05240797996521, "step": 7841 }, { "epoch": 1.27, "learning_rate": 5.660448208208513e-07, "logits/chosen": -0.5934790372848511, "logits/rejected": -0.5323982834815979, "logps/chosen": -43.89967727661133, "logps/rejected": -57.64731216430664, "loss": 0.8252, "rewards/accuracies": 1.0, "rewards/chosen": 1.962424874305725, "rewards/margins": 0.07680666446685791, "rewards/rejected": 1.8856182098388672, "step": 7842 }, { "epoch": 1.27, "learning_rate": 5.659145447242208e-07, "logits/chosen": -0.7776317000389099, "logits/rejected": -0.7933184504508972, "logps/chosen": -123.72464752197266, "logps/rejected": -54.38669204711914, "loss": 0.1686, "rewards/accuracies": 1.0, "rewards/chosen": 3.7553582191467285, "rewards/margins": 1.1940853595733643, "rewards/rejected": 2.5612728595733643, "step": 7843 }, { "epoch": 1.27, "learning_rate": 5.657842640735189e-07, "logits/chosen": -0.8740291595458984, "logits/rejected": -0.9530763626098633, "logps/chosen": -161.88235473632812, "logps/rejected": -105.7657699584961, "loss": 0.6643, "rewards/accuracies": 0.0, "rewards/chosen": -0.30162811279296875, "rewards/margins": -0.8477821350097656, "rewards/rejected": 0.5461540222167969, "step": 7844 }, { "epoch": 1.27, "learning_rate": 5.656539788777467e-07, "logits/chosen": -0.9618566036224365, "logits/rejected": -0.8616905808448792, "logps/chosen": -78.37045288085938, "logps/rejected": -41.06346130371094, "loss": 1.1007, "rewards/accuracies": 0.0, "rewards/chosen": 0.8112716674804688, "rewards/margins": -0.6153602600097656, "rewards/rejected": 1.4266319274902344, "step": 7845 }, { "epoch": 1.27, "learning_rate": 5.655236891459061e-07, "logits/chosen": -0.7340418696403503, "logits/rejected": -0.6352009177207947, "logps/chosen": -94.96211242675781, "logps/rejected": -43.83466339111328, "loss": 0.7592, "rewards/accuracies": 1.0, "rewards/chosen": 2.2165284156799316, "rewards/margins": 1.7071144580841064, "rewards/rejected": 0.5094138979911804, "step": 7846 }, { "epoch": 1.27, "learning_rate": 5.653933948869984e-07, "logits/chosen": -0.827618420124054, "logits/rejected": -0.7691392302513123, "logps/chosen": -89.51239013671875, "logps/rejected": -73.13859558105469, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 4.931098937988281, "rewards/margins": 1.4550604820251465, "rewards/rejected": 3.4760384559631348, "step": 7847 }, { "epoch": 1.27, "learning_rate": 5.652630961100258e-07, "logits/chosen": -0.5625367164611816, "logits/rejected": -0.5169907808303833, "logps/chosen": -58.480712890625, "logps/rejected": -39.947601318359375, "loss": 1.6227, "rewards/accuracies": 1.0, "rewards/chosen": 0.7261596918106079, "rewards/margins": 0.13792002201080322, "rewards/rejected": 0.5882396697998047, "step": 7848 }, { "epoch": 1.27, "learning_rate": 5.651327928239908e-07, "logits/chosen": -0.47044435143470764, "logits/rejected": -0.47044435143470764, "logps/chosen": -0.35215336084365845, "logps/rejected": -0.35215336084365845, "loss": 0.6079, "rewards/accuracies": 0.0, "rewards/chosen": 0.07879776507616043, "rewards/margins": 0.0, "rewards/rejected": 0.07879776507616043, "step": 7849 }, { "epoch": 1.27, "learning_rate": 5.650024850378963e-07, "logits/chosen": -0.9830171465873718, "logits/rejected": -0.957463800907135, "logps/chosen": -124.64515686035156, "logps/rejected": -335.124755859375, "loss": 1.4017, "rewards/accuracies": 0.0, "rewards/chosen": 3.0300490856170654, "rewards/margins": -2.5844619274139404, "rewards/rejected": 5.614511013031006, "step": 7850 }, { "epoch": 1.27, "learning_rate": 5.648721727607449e-07, "logits/chosen": -0.9110721945762634, "logits/rejected": -0.7996763586997986, "logps/chosen": -82.20321655273438, "logps/rejected": -63.906768798828125, "loss": 0.2013, "rewards/accuracies": 1.0, "rewards/chosen": 5.0754852294921875, "rewards/margins": 2.838568925857544, "rewards/rejected": 2.2369163036346436, "step": 7851 }, { "epoch": 1.27, "learning_rate": 5.647418560015404e-07, "logits/chosen": -0.497244268655777, "logits/rejected": -0.4889833629131317, "logps/chosen": -20.10248565673828, "logps/rejected": -25.485946655273438, "loss": 0.4847, "rewards/accuracies": 0.0, "rewards/chosen": 0.22948017716407776, "rewards/margins": -0.4615476429462433, "rewards/rejected": 0.691027820110321, "step": 7852 }, { "epoch": 1.27, "learning_rate": 5.646115347692861e-07, "logits/chosen": -0.8001735210418701, "logits/rejected": -0.8202700018882751, "logps/chosen": -71.5003433227539, "logps/rejected": -77.83804321289062, "loss": 1.5581, "rewards/accuracies": 0.0, "rewards/chosen": 2.009610891342163, "rewards/margins": -0.769737958908081, "rewards/rejected": 2.779348850250244, "step": 7853 }, { "epoch": 1.27, "learning_rate": 5.644812090729863e-07, "logits/chosen": -0.7502080798149109, "logits/rejected": -0.793219804763794, "logps/chosen": -89.26763916015625, "logps/rejected": -96.80592346191406, "loss": 0.9093, "rewards/accuracies": 1.0, "rewards/chosen": 1.8944931030273438, "rewards/margins": 0.7215248346328735, "rewards/rejected": 1.1729682683944702, "step": 7854 }, { "epoch": 1.27, "learning_rate": 5.643508789216449e-07, "logits/chosen": -0.6176751852035522, "logits/rejected": -0.593355119228363, "logps/chosen": -73.23043060302734, "logps/rejected": -38.00408935546875, "loss": 0.5734, "rewards/accuracies": 0.0, "rewards/chosen": 0.8560264706611633, "rewards/margins": -0.695533812046051, "rewards/rejected": 1.5515602827072144, "step": 7855 }, { "epoch": 1.28, "learning_rate": 5.642205443242667e-07, "logits/chosen": -0.7076540589332581, "logits/rejected": -0.7076540589332581, "logps/chosen": -78.86717224121094, "logps/rejected": -78.86717224121094, "loss": 0.3768, "rewards/accuracies": 0.0, "rewards/chosen": 1.842260718345642, "rewards/margins": 0.0, "rewards/rejected": 1.842260718345642, "step": 7856 }, { "epoch": 1.28, "learning_rate": 5.640902052898565e-07, "logits/chosen": -0.7684738636016846, "logits/rejected": -0.7252216935157776, "logps/chosen": -92.99907684326172, "logps/rejected": -35.74101257324219, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 1.2557785511016846, "rewards/margins": 0.45567822456359863, "rewards/rejected": 0.8001003265380859, "step": 7857 }, { "epoch": 1.28, "learning_rate": 5.639598618274196e-07, "logits/chosen": -0.7689094543457031, "logits/rejected": -0.850676953792572, "logps/chosen": -93.36241912841797, "logps/rejected": -118.41033935546875, "loss": 1.5045, "rewards/accuracies": 0.0, "rewards/chosen": 1.7693718671798706, "rewards/margins": -1.2482613325119019, "rewards/rejected": 3.0176331996917725, "step": 7858 }, { "epoch": 1.28, "learning_rate": 5.638295139459614e-07, "logits/chosen": -0.615068256855011, "logits/rejected": -0.45036399364471436, "logps/chosen": -87.12199401855469, "logps/rejected": -37.03116226196289, "loss": 0.4448, "rewards/accuracies": 1.0, "rewards/chosen": 1.2775452136993408, "rewards/margins": 1.2024922370910645, "rewards/rejected": 0.07505302876234055, "step": 7859 }, { "epoch": 1.28, "learning_rate": 5.636991616544878e-07, "logits/chosen": -0.7373718619346619, "logits/rejected": -0.6194835901260376, "logps/chosen": -73.92131042480469, "logps/rejected": -24.111656188964844, "loss": 0.3995, "rewards/accuracies": 1.0, "rewards/chosen": 1.1784766912460327, "rewards/margins": 1.258776307106018, "rewards/rejected": -0.08029957115650177, "step": 7860 }, { "epoch": 1.28, "learning_rate": 5.635688049620048e-07, "logits/chosen": -1.055863618850708, "logits/rejected": -0.7631309032440186, "logps/chosen": -236.04283142089844, "logps/rejected": -45.0014533996582, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 4.433772563934326, "rewards/margins": 4.301290988922119, "rewards/rejected": 0.13248176872730255, "step": 7861 }, { "epoch": 1.28, "learning_rate": 5.634384438775189e-07, "logits/chosen": -1.1822178363800049, "logits/rejected": -1.162510633468628, "logps/chosen": -51.80327606201172, "logps/rejected": -83.41798400878906, "loss": 0.4505, "rewards/accuracies": 1.0, "rewards/chosen": 3.0874412059783936, "rewards/margins": 0.5509023666381836, "rewards/rejected": 2.53653883934021, "step": 7862 }, { "epoch": 1.28, "learning_rate": 5.633080784100368e-07, "logits/chosen": -0.7208379507064819, "logits/rejected": -0.7009830474853516, "logps/chosen": -84.29855346679688, "logps/rejected": -69.65914916992188, "loss": 0.2411, "rewards/accuracies": 1.0, "rewards/chosen": 3.449519395828247, "rewards/margins": 0.5935020446777344, "rewards/rejected": 2.8560173511505127, "step": 7863 }, { "epoch": 1.28, "learning_rate": 5.631777085685653e-07, "logits/chosen": -0.7136185169219971, "logits/rejected": -0.753993570804596, "logps/chosen": -65.46893310546875, "logps/rejected": -76.0859375, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 1.8222748041152954, "rewards/margins": 0.06956028938293457, "rewards/rejected": 1.7527145147323608, "step": 7864 }, { "epoch": 1.28, "learning_rate": 5.630473343621123e-07, "logits/chosen": -0.4299300014972687, "logits/rejected": -0.4299300014972687, "logps/chosen": -0.5363153219223022, "logps/rejected": -0.5363153219223022, "loss": 0.4017, "rewards/accuracies": 0.0, "rewards/chosen": 0.15742044150829315, "rewards/margins": 0.0, "rewards/rejected": 0.15742044150829315, "step": 7865 }, { "epoch": 1.28, "learning_rate": 5.629169557996848e-07, "logits/chosen": -0.597213864326477, "logits/rejected": -0.6557023525238037, "logps/chosen": -73.99147033691406, "logps/rejected": -67.96165466308594, "loss": 1.375, "rewards/accuracies": 0.0, "rewards/chosen": 1.7290176153182983, "rewards/margins": -2.4744415283203125, "rewards/rejected": 4.2034592628479, "step": 7866 }, { "epoch": 1.28, "learning_rate": 5.627865728902911e-07, "logits/chosen": -1.1695947647094727, "logits/rejected": -0.9602807760238647, "logps/chosen": -132.85104370117188, "logps/rejected": -77.6474609375, "loss": 1.0123, "rewards/accuracies": 1.0, "rewards/chosen": 4.3453569412231445, "rewards/margins": 1.5940675735473633, "rewards/rejected": 2.7512893676757812, "step": 7867 }, { "epoch": 1.28, "learning_rate": 5.626561856429393e-07, "logits/chosen": -0.8389434218406677, "logits/rejected": -0.8426235318183899, "logps/chosen": -60.57054901123047, "logps/rejected": -30.39559555053711, "loss": 0.6781, "rewards/accuracies": 0.0, "rewards/chosen": 1.0770905017852783, "rewards/margins": -1.0396876335144043, "rewards/rejected": 2.1167781352996826, "step": 7868 }, { "epoch": 1.28, "learning_rate": 5.625257940666378e-07, "logits/chosen": -0.9178715944290161, "logits/rejected": -0.8694654107093811, "logps/chosen": -74.14337158203125, "logps/rejected": -121.2327651977539, "loss": 1.5729, "rewards/accuracies": 0.0, "rewards/chosen": 2.245542287826538, "rewards/margins": -1.4279372692108154, "rewards/rejected": 3.6734795570373535, "step": 7869 }, { "epoch": 1.28, "learning_rate": 5.623953981703958e-07, "logits/chosen": -0.3676523268222809, "logits/rejected": -0.37915197014808655, "logps/chosen": -9.633308410644531, "logps/rejected": -11.102433204650879, "loss": 0.3704, "rewards/accuracies": 1.0, "rewards/chosen": -0.018388748168945312, "rewards/margins": 0.07899446785449982, "rewards/rejected": -0.09738321602344513, "step": 7870 }, { "epoch": 1.28, "learning_rate": 5.622649979632219e-07, "logits/chosen": -0.518510639667511, "logits/rejected": -0.418144166469574, "logps/chosen": -59.081787109375, "logps/rejected": -14.726945877075195, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 2.384018659591675, "rewards/margins": 1.4928181171417236, "rewards/rejected": 0.8912004828453064, "step": 7871 }, { "epoch": 1.28, "learning_rate": 5.621345934541259e-07, "logits/chosen": -1.0995659828186035, "logits/rejected": -1.0955097675323486, "logps/chosen": -114.5522689819336, "logps/rejected": -142.18218994140625, "loss": 1.1852, "rewards/accuracies": 0.0, "rewards/chosen": 1.0775184631347656, "rewards/margins": -1.1298408508300781, "rewards/rejected": 2.2073593139648438, "step": 7872 }, { "epoch": 1.28, "learning_rate": 5.620041846521176e-07, "logits/chosen": -0.9073927402496338, "logits/rejected": -0.8614003658294678, "logps/chosen": -147.16165161132812, "logps/rejected": -157.35777282714844, "loss": 0.2872, "rewards/accuracies": 1.0, "rewards/chosen": 5.122021675109863, "rewards/margins": 0.4503798484802246, "rewards/rejected": 4.671641826629639, "step": 7873 }, { "epoch": 1.28, "learning_rate": 5.618737715662066e-07, "logits/chosen": -1.0200169086456299, "logits/rejected": -0.9549626111984253, "logps/chosen": -67.26698303222656, "logps/rejected": -36.1592903137207, "loss": 0.2417, "rewards/accuracies": 1.0, "rewards/chosen": 1.8144409656524658, "rewards/margins": 1.5550155639648438, "rewards/rejected": 0.2594253718852997, "step": 7874 }, { "epoch": 1.28, "learning_rate": 5.617433542054036e-07, "logits/chosen": -0.6395784616470337, "logits/rejected": -0.634278416633606, "logps/chosen": -50.26097869873047, "logps/rejected": -93.93566131591797, "loss": 0.8601, "rewards/accuracies": 1.0, "rewards/chosen": 1.1617287397384644, "rewards/margins": 0.9270813465118408, "rewards/rejected": 0.23464737832546234, "step": 7875 }, { "epoch": 1.28, "learning_rate": 5.616129325787189e-07, "logits/chosen": -0.6148760318756104, "logits/rejected": -0.6607095003128052, "logps/chosen": -76.92984771728516, "logps/rejected": -163.91485595703125, "loss": 2.729, "rewards/accuracies": 0.0, "rewards/chosen": 1.7935272455215454, "rewards/margins": -4.935022354125977, "rewards/rejected": 6.728549480438232, "step": 7876 }, { "epoch": 1.28, "learning_rate": 5.614825066951637e-07, "logits/chosen": -0.9457105398178101, "logits/rejected": -0.9784740805625916, "logps/chosen": -229.5919952392578, "logps/rejected": -45.429901123046875, "loss": 0.1405, "rewards/accuracies": 1.0, "rewards/chosen": 4.9425249099731445, "rewards/margins": 3.1620888710021973, "rewards/rejected": 1.7804359197616577, "step": 7877 }, { "epoch": 1.28, "learning_rate": 5.613520765637489e-07, "logits/chosen": -0.35991019010543823, "logits/rejected": -0.35991019010543823, "logps/chosen": -0.3180414140224457, "logps/rejected": -0.3180414140224457, "loss": 1.0746, "rewards/accuracies": 0.0, "rewards/chosen": 0.07369361072778702, "rewards/margins": 0.0, "rewards/rejected": 0.07369361072778702, "step": 7878 }, { "epoch": 1.28, "learning_rate": 5.612216421934861e-07, "logits/chosen": -0.4965221583843231, "logits/rejected": -0.4965221583843231, "logps/chosen": -67.33574676513672, "logps/rejected": -67.33574676513672, "loss": 0.3784, "rewards/accuracies": 0.0, "rewards/chosen": 1.8114433288574219, "rewards/margins": 0.0, "rewards/rejected": 1.8114433288574219, "step": 7879 }, { "epoch": 1.28, "learning_rate": 5.610912035933871e-07, "logits/chosen": -0.5381888747215271, "logits/rejected": -0.4369683265686035, "logps/chosen": -46.04560852050781, "logps/rejected": -67.89340209960938, "loss": 0.3971, "rewards/accuracies": 0.0, "rewards/chosen": 2.5270798206329346, "rewards/margins": -0.1361680030822754, "rewards/rejected": 2.66324782371521, "step": 7880 }, { "epoch": 1.28, "learning_rate": 5.609607607724642e-07, "logits/chosen": -0.7990642189979553, "logits/rejected": -0.7191699147224426, "logps/chosen": -78.79959106445312, "logps/rejected": -63.09587860107422, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": 2.2286057472229004, "rewards/margins": 1.9744560718536377, "rewards/rejected": 0.2541496455669403, "step": 7881 }, { "epoch": 1.28, "learning_rate": 5.608303137397294e-07, "logits/chosen": -0.660444974899292, "logits/rejected": -0.6863201260566711, "logps/chosen": -51.31753158569336, "logps/rejected": -42.84174346923828, "loss": 2.5779, "rewards/accuracies": 0.0, "rewards/chosen": 1.576944351196289, "rewards/margins": -1.285027027130127, "rewards/rejected": 2.861971378326416, "step": 7882 }, { "epoch": 1.28, "learning_rate": 5.606998625041954e-07, "logits/chosen": -0.6638677716255188, "logits/rejected": -0.6390365362167358, "logps/chosen": -78.3225326538086, "logps/rejected": -95.6739273071289, "loss": 0.4877, "rewards/accuracies": 0.0, "rewards/chosen": 2.059964895248413, "rewards/margins": -0.41921234130859375, "rewards/rejected": 2.479177236557007, "step": 7883 }, { "epoch": 1.28, "learning_rate": 5.605694070748755e-07, "logits/chosen": -1.090707778930664, "logits/rejected": -1.0839595794677734, "logps/chosen": -64.93775939941406, "logps/rejected": -63.82184982299805, "loss": 0.7429, "rewards/accuracies": 0.0, "rewards/chosen": 1.2185043096542358, "rewards/margins": -0.32460832595825195, "rewards/rejected": 1.5431126356124878, "step": 7884 }, { "epoch": 1.28, "learning_rate": 5.604389474607824e-07, "logits/chosen": -0.814100980758667, "logits/rejected": -0.7784858345985413, "logps/chosen": -49.70393753051758, "logps/rejected": -18.65918731689453, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.8075153231620789, "rewards/margins": -0.13967174291610718, "rewards/rejected": 0.947187066078186, "step": 7885 }, { "epoch": 1.28, "learning_rate": 5.603084836709301e-07, "logits/chosen": -0.8916343450546265, "logits/rejected": -0.917940080165863, "logps/chosen": -144.46902465820312, "logps/rejected": -129.6875, "loss": 3.7529, "rewards/accuracies": 0.0, "rewards/chosen": 1.0258530378341675, "rewards/margins": -7.474085807800293, "rewards/rejected": 8.49993896484375, "step": 7886 }, { "epoch": 1.28, "learning_rate": 5.601780157143322e-07, "logits/chosen": -0.629000723361969, "logits/rejected": -0.46401289105415344, "logps/chosen": -253.44944763183594, "logps/rejected": -26.823930740356445, "loss": 0.0721, "rewards/accuracies": 1.0, "rewards/chosen": 5.4552202224731445, "rewards/margins": 5.268539905548096, "rewards/rejected": 0.18668042123317719, "step": 7887 }, { "epoch": 1.28, "learning_rate": 5.600475436000028e-07, "logits/chosen": -1.0437610149383545, "logits/rejected": -0.9999306201934814, "logps/chosen": -42.66295623779297, "logps/rejected": -38.691917419433594, "loss": 0.7314, "rewards/accuracies": 1.0, "rewards/chosen": 1.6532214879989624, "rewards/margins": 1.531071424484253, "rewards/rejected": 0.12215004116296768, "step": 7888 }, { "epoch": 1.28, "learning_rate": 5.599170673369563e-07, "logits/chosen": -0.34039971232414246, "logits/rejected": -0.326302170753479, "logps/chosen": -4.89240837097168, "logps/rejected": -3.046593427658081, "loss": 0.8362, "rewards/accuracies": 1.0, "rewards/chosen": 0.20377731323242188, "rewards/margins": 0.08725292235612869, "rewards/rejected": 0.11652439087629318, "step": 7889 }, { "epoch": 1.28, "learning_rate": 5.597865869342074e-07, "logits/chosen": -0.5267026424407959, "logits/rejected": -0.4524688720703125, "logps/chosen": -68.17085266113281, "logps/rejected": -92.03398132324219, "loss": 0.7097, "rewards/accuracies": 1.0, "rewards/chosen": 2.7812271118164062, "rewards/margins": 1.2139960527420044, "rewards/rejected": 1.5672310590744019, "step": 7890 }, { "epoch": 1.28, "learning_rate": 5.596561024007711e-07, "logits/chosen": -0.7961723208427429, "logits/rejected": -0.7727434039115906, "logps/chosen": -46.58849334716797, "logps/rejected": -30.679183959960938, "loss": 0.4532, "rewards/accuracies": 1.0, "rewards/chosen": 1.878804087638855, "rewards/margins": 0.6499305963516235, "rewards/rejected": 1.2288734912872314, "step": 7891 }, { "epoch": 1.28, "learning_rate": 5.595256137456625e-07, "logits/chosen": -0.6099348664283752, "logits/rejected": -0.6465238332748413, "logps/chosen": -78.917236328125, "logps/rejected": -64.49063110351562, "loss": 0.747, "rewards/accuracies": 0.0, "rewards/chosen": 1.2754868268966675, "rewards/margins": -0.6974914073944092, "rewards/rejected": 1.9729782342910767, "step": 7892 }, { "epoch": 1.28, "learning_rate": 5.593951209778973e-07, "logits/chosen": -0.6457826495170593, "logits/rejected": -0.545107364654541, "logps/chosen": -66.642333984375, "logps/rejected": -33.02400207519531, "loss": 0.2301, "rewards/accuracies": 1.0, "rewards/chosen": 2.2835540771484375, "rewards/margins": 0.9959278106689453, "rewards/rejected": 1.2876262664794922, "step": 7893 }, { "epoch": 1.28, "learning_rate": 5.592646241064912e-07, "logits/chosen": -0.9286184906959534, "logits/rejected": -0.7567731142044067, "logps/chosen": -142.66627502441406, "logps/rejected": -83.89875793457031, "loss": 0.0608, "rewards/accuracies": 1.0, "rewards/chosen": 5.918327331542969, "rewards/margins": 2.4697089195251465, "rewards/rejected": 3.4486184120178223, "step": 7894 }, { "epoch": 1.28, "learning_rate": 5.591341231404604e-07, "logits/chosen": -0.8728185892105103, "logits/rejected": -0.7728662490844727, "logps/chosen": -190.89877319335938, "logps/rejected": -80.18354797363281, "loss": 1.1266, "rewards/accuracies": 0.0, "rewards/chosen": 5.427542209625244, "rewards/margins": -2.0241074562072754, "rewards/rejected": 7.4516496658325195, "step": 7895 }, { "epoch": 1.28, "learning_rate": 5.590036180888211e-07, "logits/chosen": -0.7416123747825623, "logits/rejected": -0.3056260943412781, "logps/chosen": -48.93225860595703, "logps/rejected": -89.59896850585938, "loss": 0.6218, "rewards/accuracies": 0.0, "rewards/chosen": 1.605743408203125, "rewards/margins": -0.8964126110076904, "rewards/rejected": 2.5021560192108154, "step": 7896 }, { "epoch": 1.28, "learning_rate": 5.588731089605902e-07, "logits/chosen": -0.7673637866973877, "logits/rejected": -0.5907539129257202, "logps/chosen": -122.19166564941406, "logps/rejected": -40.80192947387695, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 5.050512790679932, "rewards/margins": 4.684643745422363, "rewards/rejected": 0.3658691346645355, "step": 7897 }, { "epoch": 1.28, "learning_rate": 5.587425957647845e-07, "logits/chosen": -0.9418659210205078, "logits/rejected": -0.8999403119087219, "logps/chosen": -169.35940551757812, "logps/rejected": -110.90629577636719, "loss": 0.4347, "rewards/accuracies": 0.0, "rewards/chosen": 4.885572910308838, "rewards/margins": -0.21798992156982422, "rewards/rejected": 5.103562831878662, "step": 7898 }, { "epoch": 1.28, "learning_rate": 5.586120785104212e-07, "logits/chosen": -0.8835644721984863, "logits/rejected": -0.8799923062324524, "logps/chosen": -121.89635467529297, "logps/rejected": -97.82954406738281, "loss": 0.2251, "rewards/accuracies": 1.0, "rewards/chosen": 1.7018135786056519, "rewards/margins": 0.7000832557678223, "rewards/rejected": 1.0017303228378296, "step": 7899 }, { "epoch": 1.28, "learning_rate": 5.58481557206518e-07, "logits/chosen": -0.879288911819458, "logits/rejected": -0.8480720520019531, "logps/chosen": -118.64347076416016, "logps/rejected": -157.49453735351562, "loss": 0.1763, "rewards/accuracies": 1.0, "rewards/chosen": 1.6455665826797485, "rewards/margins": 0.8966728448867798, "rewards/rejected": 0.7488937377929688, "step": 7900 }, { "epoch": 1.28, "learning_rate": 5.583510318620925e-07, "logits/chosen": -1.0623607635498047, "logits/rejected": -1.0074279308319092, "logps/chosen": -79.2572250366211, "logps/rejected": -78.2125244140625, "loss": 0.9214, "rewards/accuracies": 0.0, "rewards/chosen": 1.1927818059921265, "rewards/margins": -1.3488458395004272, "rewards/rejected": 2.5416276454925537, "step": 7901 }, { "epoch": 1.28, "learning_rate": 5.582205024861628e-07, "logits/chosen": -0.3833404779434204, "logits/rejected": -0.3435094952583313, "logps/chosen": -48.52189254760742, "logps/rejected": -7.640054225921631, "loss": 1.3281, "rewards/accuracies": 0.0, "rewards/chosen": 0.6895530819892883, "rewards/margins": -0.030003786087036133, "rewards/rejected": 0.7195568680763245, "step": 7902 }, { "epoch": 1.28, "learning_rate": 5.580899690877474e-07, "logits/chosen": -0.7527406811714172, "logits/rejected": -0.7361165881156921, "logps/chosen": -60.126468658447266, "logps/rejected": -36.68444061279297, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": 3.3271076679229736, "rewards/margins": 1.0337581634521484, "rewards/rejected": 2.293349504470825, "step": 7903 }, { "epoch": 1.28, "learning_rate": 5.579594316758645e-07, "logits/chosen": -0.6313937902450562, "logits/rejected": -0.671612560749054, "logps/chosen": -87.90798950195312, "logps/rejected": -92.96394348144531, "loss": 1.6917, "rewards/accuracies": 0.0, "rewards/chosen": -0.28933411836624146, "rewards/margins": -1.6960313320159912, "rewards/rejected": 1.406697154045105, "step": 7904 }, { "epoch": 1.28, "learning_rate": 5.578288902595335e-07, "logits/chosen": -0.6353775858879089, "logits/rejected": -0.6660887598991394, "logps/chosen": -119.07347869873047, "logps/rejected": -126.98014831542969, "loss": 0.8763, "rewards/accuracies": 0.0, "rewards/chosen": 4.237422943115234, "rewards/margins": -0.27354955673217773, "rewards/rejected": 4.510972499847412, "step": 7905 }, { "epoch": 1.28, "learning_rate": 5.576983448477734e-07, "logits/chosen": -0.5434898734092712, "logits/rejected": -0.5434898734092712, "logps/chosen": -41.05164337158203, "logps/rejected": -41.05164337158203, "loss": 0.5283, "rewards/accuracies": 0.0, "rewards/chosen": 0.6662567257881165, "rewards/margins": 0.0, "rewards/rejected": 0.6662567257881165, "step": 7906 }, { "epoch": 1.28, "learning_rate": 5.575677954496034e-07, "logits/chosen": -0.553983211517334, "logits/rejected": -0.772121250629425, "logps/chosen": -120.43638610839844, "logps/rejected": -43.12363052368164, "loss": 1.461, "rewards/accuracies": 0.0, "rewards/chosen": 0.12255020439624786, "rewards/margins": -1.8587188720703125, "rewards/rejected": 1.981269121170044, "step": 7907 }, { "epoch": 1.28, "learning_rate": 5.574372420740436e-07, "logits/chosen": -0.9352530837059021, "logits/rejected": -0.7546952366828918, "logps/chosen": -177.50274658203125, "logps/rejected": -52.72466278076172, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": 4.371127605438232, "rewards/margins": 2.679382562637329, "rewards/rejected": 1.6917450428009033, "step": 7908 }, { "epoch": 1.28, "learning_rate": 5.573066847301138e-07, "logits/chosen": -0.6227841377258301, "logits/rejected": -0.5797957181930542, "logps/chosen": -84.00425720214844, "logps/rejected": -6.94751501083374, "loss": 0.0854, "rewards/accuracies": 1.0, "rewards/chosen": 2.592489719390869, "rewards/margins": 1.7493853569030762, "rewards/rejected": 0.843104362487793, "step": 7909 }, { "epoch": 1.28, "learning_rate": 5.571761234268344e-07, "logits/chosen": -0.45215484499931335, "logits/rejected": -0.4548279941082001, "logps/chosen": -8.498668670654297, "logps/rejected": -2.8186795711517334, "loss": 0.5767, "rewards/accuracies": 0.0, "rewards/chosen": 0.42376241087913513, "rewards/margins": -0.24079087376594543, "rewards/rejected": 0.6645532846450806, "step": 7910 }, { "epoch": 1.28, "learning_rate": 5.570455581732259e-07, "logits/chosen": -0.6558948159217834, "logits/rejected": -0.4991602599620819, "logps/chosen": -117.97955322265625, "logps/rejected": -28.1214599609375, "loss": 0.5162, "rewards/accuracies": 1.0, "rewards/chosen": 4.840750217437744, "rewards/margins": 4.505667209625244, "rewards/rejected": 0.3350830078125, "step": 7911 }, { "epoch": 1.28, "learning_rate": 5.569149889783089e-07, "logits/chosen": -0.31434184312820435, "logits/rejected": -0.39271819591522217, "logps/chosen": -115.90274047851562, "logps/rejected": -86.37555694580078, "loss": 1.0071, "rewards/accuracies": 1.0, "rewards/chosen": 1.7757705450057983, "rewards/margins": 1.0378432273864746, "rewards/rejected": 0.737927258014679, "step": 7912 }, { "epoch": 1.28, "learning_rate": 5.567844158511048e-07, "logits/chosen": -0.8371542692184448, "logits/rejected": -0.6262949705123901, "logps/chosen": -217.62081909179688, "logps/rejected": -288.17401123046875, "loss": 0.7305, "rewards/accuracies": 0.0, "rewards/chosen": 4.115225315093994, "rewards/margins": -0.9744048118591309, "rewards/rejected": 5.089630126953125, "step": 7913 }, { "epoch": 1.28, "learning_rate": 5.56653838800635e-07, "logits/chosen": -0.358286589384079, "logits/rejected": -0.357669472694397, "logps/chosen": -0.47837167978286743, "logps/rejected": -30.864469528198242, "loss": 0.6219, "rewards/accuracies": 1.0, "rewards/chosen": 0.2069542407989502, "rewards/margins": 0.3086906969547272, "rewards/rejected": -0.10173644870519638, "step": 7914 }, { "epoch": 1.28, "learning_rate": 5.565232578359208e-07, "logits/chosen": -0.7855650186538696, "logits/rejected": -0.6184999942779541, "logps/chosen": -53.26996994018555, "logps/rejected": -98.42727661132812, "loss": 1.9584, "rewards/accuracies": 0.0, "rewards/chosen": 2.1404407024383545, "rewards/margins": -3.645615816116333, "rewards/rejected": 5.7860565185546875, "step": 7915 }, { "epoch": 1.28, "learning_rate": 5.563926729659845e-07, "logits/chosen": -0.7991193532943726, "logits/rejected": -0.7298356294631958, "logps/chosen": -72.78890991210938, "logps/rejected": -77.81039428710938, "loss": 0.4061, "rewards/accuracies": 0.0, "rewards/chosen": 1.7766937017440796, "rewards/margins": -0.18458104133605957, "rewards/rejected": 1.9612747430801392, "step": 7916 }, { "epoch": 1.29, "learning_rate": 5.56262084199848e-07, "logits/chosen": -0.48470568656921387, "logits/rejected": -0.4785543382167816, "logps/chosen": -82.46832275390625, "logps/rejected": -136.8623046875, "loss": 0.7374, "rewards/accuracies": 0.0, "rewards/chosen": 0.9027015566825867, "rewards/margins": -0.5937339663505554, "rewards/rejected": 1.496435523033142, "step": 7917 }, { "epoch": 1.29, "learning_rate": 5.561314915465338e-07, "logits/chosen": -0.7253221273422241, "logits/rejected": -0.5992492437362671, "logps/chosen": -66.09001922607422, "logps/rejected": -13.833718299865723, "loss": 0.3647, "rewards/accuracies": 1.0, "rewards/chosen": 2.3962395191192627, "rewards/margins": 1.4070522785186768, "rewards/rejected": 0.9891871809959412, "step": 7918 }, { "epoch": 1.29, "learning_rate": 5.560008950150646e-07, "logits/chosen": -0.4824819564819336, "logits/rejected": -0.34182339906692505, "logps/chosen": -119.94428253173828, "logps/rejected": -95.80432891845703, "loss": 0.219, "rewards/accuracies": 1.0, "rewards/chosen": 4.224706172943115, "rewards/margins": 2.0699522495269775, "rewards/rejected": 2.1547539234161377, "step": 7919 }, { "epoch": 1.29, "learning_rate": 5.558702946144635e-07, "logits/chosen": -0.7208440899848938, "logits/rejected": -0.7553386688232422, "logps/chosen": -35.448177337646484, "logps/rejected": -123.64808654785156, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 1.104455590248108, "rewards/margins": 0.08477628231048584, "rewards/rejected": 1.019679307937622, "step": 7920 }, { "epoch": 1.29, "learning_rate": 5.557396903537538e-07, "logits/chosen": -0.95881587266922, "logits/rejected": -0.8752120733261108, "logps/chosen": -172.21792602539062, "logps/rejected": -14.936040878295898, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": 4.110782146453857, "rewards/margins": 3.4505367279052734, "rewards/rejected": 0.6602455377578735, "step": 7921 }, { "epoch": 1.29, "learning_rate": 5.556090822419588e-07, "logits/chosen": -0.938605546951294, "logits/rejected": -0.8978077173233032, "logps/chosen": -69.67044067382812, "logps/rejected": -46.570960998535156, "loss": 1.0967, "rewards/accuracies": 0.0, "rewards/chosen": 1.01469886302948, "rewards/margins": -2.066929817199707, "rewards/rejected": 3.0816285610198975, "step": 7922 }, { "epoch": 1.29, "learning_rate": 5.554784702881024e-07, "logits/chosen": -0.5296188592910767, "logits/rejected": -0.46974995732307434, "logps/chosen": -94.86260986328125, "logps/rejected": -62.73838806152344, "loss": 0.5705, "rewards/accuracies": 0.0, "rewards/chosen": 0.8406020998954773, "rewards/margins": -0.49597710371017456, "rewards/rejected": 1.3365792036056519, "step": 7923 }, { "epoch": 1.29, "learning_rate": 5.553478545012087e-07, "logits/chosen": -0.6836034059524536, "logits/rejected": -0.6327754259109497, "logps/chosen": -95.97811126708984, "logps/rejected": -56.223453521728516, "loss": 0.9225, "rewards/accuracies": 0.0, "rewards/chosen": 0.6982460021972656, "rewards/margins": -0.9935940504074097, "rewards/rejected": 1.6918400526046753, "step": 7924 }, { "epoch": 1.29, "learning_rate": 5.55217234890302e-07, "logits/chosen": -0.7558138370513916, "logits/rejected": -1.0642019510269165, "logps/chosen": -73.1026611328125, "logps/rejected": -107.98147583007812, "loss": 1.3514, "rewards/accuracies": 0.0, "rewards/chosen": 1.3698447942733765, "rewards/margins": -0.43761909008026123, "rewards/rejected": 1.8074638843536377, "step": 7925 }, { "epoch": 1.29, "learning_rate": 5.550866114644067e-07, "logits/chosen": -0.41223543882369995, "logits/rejected": -0.41223543882369995, "logps/chosen": -15.503362655639648, "logps/rejected": -15.503362655639648, "loss": 1.356, "rewards/accuracies": 0.0, "rewards/chosen": 0.23062191903591156, "rewards/margins": 0.0, "rewards/rejected": 0.23062191903591156, "step": 7926 }, { "epoch": 1.29, "learning_rate": 5.549559842325478e-07, "logits/chosen": -0.8319324254989624, "logits/rejected": -0.8534381985664368, "logps/chosen": -63.884071350097656, "logps/rejected": -60.454105377197266, "loss": 0.8358, "rewards/accuracies": 0.0, "rewards/chosen": 1.2710427045822144, "rewards/margins": -0.5071620941162109, "rewards/rejected": 1.7782047986984253, "step": 7927 }, { "epoch": 1.29, "learning_rate": 5.548253532037503e-07, "logits/chosen": -1.0359798669815063, "logits/rejected": -1.0111603736877441, "logps/chosen": -83.54949951171875, "logps/rejected": -72.90716552734375, "loss": 1.0253, "rewards/accuracies": 0.0, "rewards/chosen": 1.3190559148788452, "rewards/margins": -1.3725427389144897, "rewards/rejected": 2.691598653793335, "step": 7928 }, { "epoch": 1.29, "learning_rate": 5.546947183870398e-07, "logits/chosen": -0.7280844449996948, "logits/rejected": -0.6947959661483765, "logps/chosen": -59.152862548828125, "logps/rejected": -57.93175506591797, "loss": 1.5494, "rewards/accuracies": 1.0, "rewards/chosen": 1.8056602478027344, "rewards/margins": 0.7436187267303467, "rewards/rejected": 1.0620415210723877, "step": 7929 }, { "epoch": 1.29, "learning_rate": 5.54564079791442e-07, "logits/chosen": -0.9416900277137756, "logits/rejected": -0.9270079135894775, "logps/chosen": -151.40650939941406, "logps/rejected": -49.04646301269531, "loss": 0.0312, "rewards/accuracies": 1.0, "rewards/chosen": 4.759568691253662, "rewards/margins": 2.930246591567993, "rewards/rejected": 1.829322099685669, "step": 7930 }, { "epoch": 1.29, "learning_rate": 5.544334374259823e-07, "logits/chosen": -0.7752839922904968, "logits/rejected": -0.7300958037376404, "logps/chosen": -79.6981201171875, "logps/rejected": -80.26646423339844, "loss": 1.2892, "rewards/accuracies": 0.0, "rewards/chosen": 1.7229965925216675, "rewards/margins": -2.163949489593506, "rewards/rejected": 3.886946201324463, "step": 7931 }, { "epoch": 1.29, "learning_rate": 5.543027912996871e-07, "logits/chosen": -0.32012227177619934, "logits/rejected": -0.28203776478767395, "logps/chosen": -44.99633026123047, "logps/rejected": -39.17626953125, "loss": 0.7743, "rewards/accuracies": 1.0, "rewards/chosen": 0.7434849143028259, "rewards/margins": 0.014962434768676758, "rewards/rejected": 0.7285224795341492, "step": 7932 }, { "epoch": 1.29, "learning_rate": 5.54172141421583e-07, "logits/chosen": -1.084672451019287, "logits/rejected": -0.8857386112213135, "logps/chosen": -133.7069854736328, "logps/rejected": -24.65298843383789, "loss": 0.0169, "rewards/accuracies": 1.0, "rewards/chosen": 5.094766139984131, "rewards/margins": 4.951794624328613, "rewards/rejected": 0.14297142624855042, "step": 7933 }, { "epoch": 1.29, "learning_rate": 5.540414878006964e-07, "logits/chosen": -0.7456010580062866, "logits/rejected": -0.7538966536521912, "logps/chosen": -95.62357330322266, "logps/rejected": -96.40058898925781, "loss": 0.3956, "rewards/accuracies": 1.0, "rewards/chosen": 1.1400169134140015, "rewards/margins": 0.07917094230651855, "rewards/rejected": 1.060845971107483, "step": 7934 }, { "epoch": 1.29, "learning_rate": 5.539108304460543e-07, "logits/chosen": -0.9476430416107178, "logits/rejected": -0.9661741852760315, "logps/chosen": -58.36388397216797, "logps/rejected": -68.72982788085938, "loss": 0.2919, "rewards/accuracies": 1.0, "rewards/chosen": 1.9423760175704956, "rewards/margins": 1.230865478515625, "rewards/rejected": 0.7115104794502258, "step": 7935 }, { "epoch": 1.29, "learning_rate": 5.53780169366684e-07, "logits/chosen": -0.7882108688354492, "logits/rejected": -0.7872941493988037, "logps/chosen": -84.06411743164062, "logps/rejected": -51.400909423828125, "loss": 0.6598, "rewards/accuracies": 0.0, "rewards/chosen": 1.0248008966445923, "rewards/margins": -0.9291969537734985, "rewards/rejected": 1.9539978504180908, "step": 7936 }, { "epoch": 1.29, "learning_rate": 5.536495045716129e-07, "logits/chosen": -0.9286117553710938, "logits/rejected": -0.8455358743667603, "logps/chosen": -106.81143188476562, "logps/rejected": -73.43016815185547, "loss": 0.0977, "rewards/accuracies": 1.0, "rewards/chosen": 4.951684474945068, "rewards/margins": 2.681805372238159, "rewards/rejected": 2.269879102706909, "step": 7937 }, { "epoch": 1.29, "learning_rate": 5.535188360698686e-07, "logits/chosen": -0.7758303880691528, "logits/rejected": -0.8296815156936646, "logps/chosen": -106.00922393798828, "logps/rejected": -136.34353637695312, "loss": 1.1559, "rewards/accuracies": 0.0, "rewards/chosen": 3.582143545150757, "rewards/margins": -1.2295844554901123, "rewards/rejected": 4.811728000640869, "step": 7938 }, { "epoch": 1.29, "learning_rate": 5.533881638704792e-07, "logits/chosen": -1.5739575624465942, "logits/rejected": -0.9560329914093018, "logps/chosen": -106.53083038330078, "logps/rejected": -56.15605926513672, "loss": 0.7802, "rewards/accuracies": 0.0, "rewards/chosen": 1.2673195600509644, "rewards/margins": -0.8388534784317017, "rewards/rejected": 2.106173038482666, "step": 7939 }, { "epoch": 1.29, "learning_rate": 5.532574879824729e-07, "logits/chosen": -0.6529837250709534, "logits/rejected": -0.6283446550369263, "logps/chosen": -42.175682067871094, "logps/rejected": -29.08342170715332, "loss": 0.2407, "rewards/accuracies": 1.0, "rewards/chosen": 0.9091289639472961, "rewards/margins": 0.6952556371688843, "rewards/rejected": 0.21387329697608948, "step": 7940 }, { "epoch": 1.29, "learning_rate": 5.531268084148779e-07, "logits/chosen": -1.1771761178970337, "logits/rejected": -1.1594116687774658, "logps/chosen": -105.07844543457031, "logps/rejected": -75.98332977294922, "loss": 0.525, "rewards/accuracies": 0.0, "rewards/chosen": 1.3924667835235596, "rewards/margins": -0.4814399480819702, "rewards/rejected": 1.8739067316055298, "step": 7941 }, { "epoch": 1.29, "learning_rate": 5.529961251767232e-07, "logits/chosen": -0.7902939319610596, "logits/rejected": -0.5423818230628967, "logps/chosen": -152.42825317382812, "logps/rejected": -87.39707946777344, "loss": 0.3598, "rewards/accuracies": 1.0, "rewards/chosen": 5.406897068023682, "rewards/margins": 3.618710517883301, "rewards/rejected": 1.7881866693496704, "step": 7942 }, { "epoch": 1.29, "learning_rate": 5.528654382770378e-07, "logits/chosen": -0.5975119471549988, "logits/rejected": -0.5401559472084045, "logps/chosen": -63.36647033691406, "logps/rejected": -80.72142028808594, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 2.341951847076416, "rewards/margins": 0.4509545564651489, "rewards/rejected": 1.890997290611267, "step": 7943 }, { "epoch": 1.29, "learning_rate": 5.527347477248507e-07, "logits/chosen": -0.6508080959320068, "logits/rejected": -0.6235646605491638, "logps/chosen": -87.68138122558594, "logps/rejected": -40.25032424926758, "loss": 0.2266, "rewards/accuracies": 1.0, "rewards/chosen": 5.614383220672607, "rewards/margins": 3.552619695663452, "rewards/rejected": 2.0617635250091553, "step": 7944 }, { "epoch": 1.29, "learning_rate": 5.526040535291917e-07, "logits/chosen": -0.5949762463569641, "logits/rejected": -0.3624723255634308, "logps/chosen": -118.60799407958984, "logps/rejected": -46.57315444946289, "loss": 0.0377, "rewards/accuracies": 1.0, "rewards/chosen": 4.735367774963379, "rewards/margins": 2.662409782409668, "rewards/rejected": 2.072957992553711, "step": 7945 }, { "epoch": 1.29, "learning_rate": 5.524733556990904e-07, "logits/chosen": -0.4231680929660797, "logits/rejected": -0.44011250138282776, "logps/chosen": -1.486324667930603, "logps/rejected": -37.146766662597656, "loss": 0.4485, "rewards/accuracies": 0.0, "rewards/chosen": 0.34917721152305603, "rewards/margins": -0.3288463056087494, "rewards/rejected": 0.6780235171318054, "step": 7946 }, { "epoch": 1.29, "learning_rate": 5.523426542435765e-07, "logits/chosen": -0.8893738389015198, "logits/rejected": -0.9265798926353455, "logps/chosen": -142.15670776367188, "logps/rejected": -136.8252716064453, "loss": 0.6488, "rewards/accuracies": 0.0, "rewards/chosen": 4.466133117675781, "rewards/margins": -0.8031830787658691, "rewards/rejected": 5.26931619644165, "step": 7947 }, { "epoch": 1.29, "learning_rate": 5.522119491716806e-07, "logits/chosen": -0.5711284875869751, "logits/rejected": -0.5711284875869751, "logps/chosen": -15.80842113494873, "logps/rejected": -15.80842113494873, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.33697137236595154, "rewards/margins": 0.0, "rewards/rejected": 0.33697137236595154, "step": 7948 }, { "epoch": 1.29, "learning_rate": 5.520812404924329e-07, "logits/chosen": -0.4204598367214203, "logits/rejected": -0.42802029848098755, "logps/chosen": -80.10480499267578, "logps/rejected": -47.21599578857422, "loss": 2.6926, "rewards/accuracies": 0.0, "rewards/chosen": 0.40484926104545593, "rewards/margins": -1.9657180309295654, "rewards/rejected": 2.3705673217773438, "step": 7949 }, { "epoch": 1.29, "learning_rate": 5.519505282148643e-07, "logits/chosen": -1.0389225482940674, "logits/rejected": -0.9648122191429138, "logps/chosen": -34.410858154296875, "logps/rejected": -38.238346099853516, "loss": 0.6579, "rewards/accuracies": 0.0, "rewards/chosen": 2.0370376110076904, "rewards/margins": -0.9466152191162109, "rewards/rejected": 2.9836528301239014, "step": 7950 }, { "epoch": 1.29, "learning_rate": 5.518198123480058e-07, "logits/chosen": -0.8296494483947754, "logits/rejected": -0.7801251411437988, "logps/chosen": -66.97889709472656, "logps/rejected": -46.98426055908203, "loss": 0.613, "rewards/accuracies": 1.0, "rewards/chosen": 3.031845808029175, "rewards/margins": 1.0279541015625, "rewards/rejected": 2.003891706466675, "step": 7951 }, { "epoch": 1.29, "learning_rate": 5.516890929008887e-07, "logits/chosen": -0.9754443764686584, "logits/rejected": -0.8654501438140869, "logps/chosen": -149.43588256835938, "logps/rejected": -56.282752990722656, "loss": 0.1407, "rewards/accuracies": 1.0, "rewards/chosen": 4.025989055633545, "rewards/margins": 1.4504663944244385, "rewards/rejected": 2.5755226612091064, "step": 7952 }, { "epoch": 1.29, "learning_rate": 5.515583698825442e-07, "logits/chosen": -0.730901837348938, "logits/rejected": -0.653584361076355, "logps/chosen": -59.53182601928711, "logps/rejected": -66.66961669921875, "loss": 0.8302, "rewards/accuracies": 1.0, "rewards/chosen": 1.898292899131775, "rewards/margins": 0.49527549743652344, "rewards/rejected": 1.4030174016952515, "step": 7953 }, { "epoch": 1.29, "learning_rate": 5.514276433020043e-07, "logits/chosen": -0.9671028852462769, "logits/rejected": -0.7058517336845398, "logps/chosen": -136.4488067626953, "logps/rejected": -17.012264251708984, "loss": 0.0641, "rewards/accuracies": 1.0, "rewards/chosen": 5.430802822113037, "rewards/margins": 4.227814197540283, "rewards/rejected": 1.2029885053634644, "step": 7954 }, { "epoch": 1.29, "learning_rate": 5.512969131683007e-07, "logits/chosen": -0.6405414938926697, "logits/rejected": -0.5498255491256714, "logps/chosen": -123.70433807373047, "logps/rejected": -89.2010269165039, "loss": 0.3852, "rewards/accuracies": 1.0, "rewards/chosen": 1.7276009321212769, "rewards/margins": 0.6513848304748535, "rewards/rejected": 1.0762161016464233, "step": 7955 }, { "epoch": 1.29, "learning_rate": 5.511661794904659e-07, "logits/chosen": -0.9362650513648987, "logits/rejected": -0.9592172503471375, "logps/chosen": -49.73356246948242, "logps/rejected": -50.722328186035156, "loss": 1.4405, "rewards/accuracies": 0.0, "rewards/chosen": 1.4145790338516235, "rewards/margins": -0.6579209566116333, "rewards/rejected": 2.072499990463257, "step": 7956 }, { "epoch": 1.29, "learning_rate": 5.510354422775323e-07, "logits/chosen": -0.9399635791778564, "logits/rejected": -0.8928309679031372, "logps/chosen": -71.16107177734375, "logps/rejected": -73.5255126953125, "loss": 3.8426, "rewards/accuracies": 0.0, "rewards/chosen": 0.6507766842842102, "rewards/margins": -0.5712494254112244, "rewards/rejected": 1.2220261096954346, "step": 7957 }, { "epoch": 1.29, "learning_rate": 5.509047015385324e-07, "logits/chosen": -0.698667049407959, "logits/rejected": -0.7165414094924927, "logps/chosen": -59.46343231201172, "logps/rejected": -65.95159912109375, "loss": 0.7405, "rewards/accuracies": 0.0, "rewards/chosen": 2.0251030921936035, "rewards/margins": -0.8573830127716064, "rewards/rejected": 2.88248610496521, "step": 7958 }, { "epoch": 1.29, "learning_rate": 5.507739572824994e-07, "logits/chosen": -0.35155579447746277, "logits/rejected": -0.3515577018260956, "logps/chosen": -4.403109073638916, "logps/rejected": -5.736296653747559, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 0.18088828027248383, "rewards/margins": -0.09881390631198883, "rewards/rejected": 0.27970218658447266, "step": 7959 }, { "epoch": 1.29, "learning_rate": 5.506432095184663e-07, "logits/chosen": -0.6599774956703186, "logits/rejected": -0.6746512055397034, "logps/chosen": -25.60560417175293, "logps/rejected": -7.426747798919678, "loss": 0.8574, "rewards/accuracies": 0.0, "rewards/chosen": 0.1580190658569336, "rewards/margins": -0.07141585648059845, "rewards/rejected": 0.22943492233753204, "step": 7960 }, { "epoch": 1.29, "learning_rate": 5.505124582554667e-07, "logits/chosen": -1.1589915752410889, "logits/rejected": -1.0970755815505981, "logps/chosen": -195.9916534423828, "logps/rejected": -68.57061004638672, "loss": 0.5637, "rewards/accuracies": 0.0, "rewards/chosen": 2.4076950550079346, "rewards/margins": -0.6394386291503906, "rewards/rejected": 3.047133684158325, "step": 7961 }, { "epoch": 1.29, "learning_rate": 5.503817035025341e-07, "logits/chosen": -0.688932478427887, "logits/rejected": -0.7018318176269531, "logps/chosen": -103.27249145507812, "logps/rejected": -93.24205017089844, "loss": 0.9131, "rewards/accuracies": 1.0, "rewards/chosen": 1.2828925848007202, "rewards/margins": 0.1440345048904419, "rewards/rejected": 1.1388580799102783, "step": 7962 }, { "epoch": 1.29, "learning_rate": 5.502509452687025e-07, "logits/chosen": -0.7091092467308044, "logits/rejected": -0.6876494884490967, "logps/chosen": -62.473026275634766, "logps/rejected": -104.14646911621094, "loss": 0.2271, "rewards/accuracies": 1.0, "rewards/chosen": 1.7303905487060547, "rewards/margins": 1.5601863861083984, "rewards/rejected": 0.17020416259765625, "step": 7963 }, { "epoch": 1.29, "learning_rate": 5.501201835630062e-07, "logits/chosen": -0.7479110360145569, "logits/rejected": -0.7674626111984253, "logps/chosen": -58.06644058227539, "logps/rejected": -36.36955261230469, "loss": 1.0357, "rewards/accuracies": 0.0, "rewards/chosen": 1.333279013633728, "rewards/margins": -0.3645855188369751, "rewards/rejected": 1.6978645324707031, "step": 7964 }, { "epoch": 1.29, "learning_rate": 5.499894183944793e-07, "logits/chosen": -0.5125992894172668, "logits/rejected": -0.4850308299064636, "logps/chosen": -48.16439437866211, "logps/rejected": -3.695981025695801, "loss": 0.7406, "rewards/accuracies": 0.0, "rewards/chosen": 0.029100799933075905, "rewards/margins": -0.417938232421875, "rewards/rejected": 0.44703903794288635, "step": 7965 }, { "epoch": 1.29, "learning_rate": 5.498586497721567e-07, "logits/chosen": -1.1697901487350464, "logits/rejected": -1.192026138305664, "logps/chosen": -87.32342529296875, "logps/rejected": -170.4461669921875, "loss": 0.943, "rewards/accuracies": 0.0, "rewards/chosen": 1.400294542312622, "rewards/margins": -1.6414368152618408, "rewards/rejected": 3.041731357574463, "step": 7966 }, { "epoch": 1.29, "learning_rate": 5.497278777050731e-07, "logits/chosen": -1.037979006767273, "logits/rejected": -1.0223617553710938, "logps/chosen": -81.33629608154297, "logps/rejected": -52.196144104003906, "loss": 0.3978, "rewards/accuracies": 1.0, "rewards/chosen": 1.595239281654358, "rewards/margins": 0.21501612663269043, "rewards/rejected": 1.3802231550216675, "step": 7967 }, { "epoch": 1.29, "learning_rate": 5.495971022022637e-07, "logits/chosen": -0.6897558569908142, "logits/rejected": -0.7594096660614014, "logps/chosen": -77.20211791992188, "logps/rejected": -149.28236389160156, "loss": 2.0109, "rewards/accuracies": 0.0, "rewards/chosen": 3.1467010974884033, "rewards/margins": -3.2727677822113037, "rewards/rejected": 6.419468879699707, "step": 7968 }, { "epoch": 1.29, "learning_rate": 5.494663232727639e-07, "logits/chosen": -0.7468768954277039, "logits/rejected": -0.6738740801811218, "logps/chosen": -82.48788452148438, "logps/rejected": -78.85057830810547, "loss": 2.0971, "rewards/accuracies": 1.0, "rewards/chosen": 2.059389591217041, "rewards/margins": 0.20764780044555664, "rewards/rejected": 1.8517417907714844, "step": 7969 }, { "epoch": 1.29, "learning_rate": 5.493355409256091e-07, "logits/chosen": -0.9090948104858398, "logits/rejected": -0.8078317046165466, "logps/chosen": -184.74871826171875, "logps/rejected": -80.19908905029297, "loss": 0.5159, "rewards/accuracies": 0.0, "rewards/chosen": 1.5687789916992188, "rewards/margins": -0.5361487865447998, "rewards/rejected": 2.1049277782440186, "step": 7970 }, { "epoch": 1.29, "learning_rate": 5.492047551698354e-07, "logits/chosen": -0.3346356749534607, "logits/rejected": -0.36891621351242065, "logps/chosen": -65.09773254394531, "logps/rejected": -95.96617126464844, "loss": 0.5988, "rewards/accuracies": 0.0, "rewards/chosen": 1.6489051580429077, "rewards/margins": -0.3831840753555298, "rewards/rejected": 2.0320892333984375, "step": 7971 }, { "epoch": 1.29, "learning_rate": 5.490739660144785e-07, "logits/chosen": -0.013880421407520771, "logits/rejected": -0.013880421407520771, "logps/chosen": -1.5945923328399658, "logps/rejected": -1.5945923328399658, "loss": 0.3929, "rewards/accuracies": 0.0, "rewards/chosen": 0.3190441429615021, "rewards/margins": 0.0, "rewards/rejected": 0.3190441429615021, "step": 7972 }, { "epoch": 1.29, "learning_rate": 5.48943173468575e-07, "logits/chosen": -0.5842437148094177, "logits/rejected": -0.5617852210998535, "logps/chosen": -69.61563110351562, "logps/rejected": -57.99169921875, "loss": 1.6626, "rewards/accuracies": 0.0, "rewards/chosen": 1.4215316772460938, "rewards/margins": -1.2417709827423096, "rewards/rejected": 2.6633026599884033, "step": 7973 }, { "epoch": 1.29, "learning_rate": 5.488123775411612e-07, "logits/chosen": -1.1014631986618042, "logits/rejected": -0.692820131778717, "logps/chosen": -102.54084777832031, "logps/rejected": -71.69773864746094, "loss": 0.1296, "rewards/accuracies": 1.0, "rewards/chosen": 4.8775315284729, "rewards/margins": 2.0904953479766846, "rewards/rejected": 2.787036180496216, "step": 7974 }, { "epoch": 1.29, "learning_rate": 5.486815782412742e-07, "logits/chosen": -0.9166004657745361, "logits/rejected": -0.9148635268211365, "logps/chosen": -39.77093505859375, "logps/rejected": -51.31438446044922, "loss": 0.438, "rewards/accuracies": 1.0, "rewards/chosen": 2.3796463012695312, "rewards/margins": 0.7687400579452515, "rewards/rejected": 1.6109062433242798, "step": 7975 }, { "epoch": 1.29, "learning_rate": 5.485507755779506e-07, "logits/chosen": -1.0287604331970215, "logits/rejected": -1.0692499876022339, "logps/chosen": -181.98562622070312, "logps/rejected": -135.0188751220703, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 3.9957520961761475, "rewards/margins": 2.4789581298828125, "rewards/rejected": 1.5167938470840454, "step": 7976 }, { "epoch": 1.29, "learning_rate": 5.484199695602278e-07, "logits/chosen": -0.6566019058227539, "logits/rejected": -0.5645705461502075, "logps/chosen": -77.27714538574219, "logps/rejected": -70.9002914428711, "loss": 0.9759, "rewards/accuracies": 0.0, "rewards/chosen": 3.0348267555236816, "rewards/margins": -0.6333198547363281, "rewards/rejected": 3.6681466102600098, "step": 7977 }, { "epoch": 1.29, "learning_rate": 5.482891601971433e-07, "logits/chosen": -0.3850700855255127, "logits/rejected": -0.4380824565887451, "logps/chosen": -54.85118103027344, "logps/rejected": -83.7729263305664, "loss": 1.4137, "rewards/accuracies": 0.0, "rewards/chosen": 0.671395480632782, "rewards/margins": -2.402085542678833, "rewards/rejected": 3.0734810829162598, "step": 7978 }, { "epoch": 1.3, "learning_rate": 5.481583474977349e-07, "logits/chosen": -0.5399841666221619, "logits/rejected": -0.5250964164733887, "logps/chosen": -27.862749099731445, "logps/rejected": -17.564979553222656, "loss": 0.3613, "rewards/accuracies": 1.0, "rewards/chosen": 1.3274729251861572, "rewards/margins": 0.9628810882568359, "rewards/rejected": 0.3645918071269989, "step": 7979 }, { "epoch": 1.3, "learning_rate": 5.480275314710401e-07, "logits/chosen": -0.7137326598167419, "logits/rejected": -0.7308375835418701, "logps/chosen": -155.46697998046875, "logps/rejected": -155.05105590820312, "loss": 1.7615, "rewards/accuracies": 0.0, "rewards/chosen": 4.363109111785889, "rewards/margins": -3.4489212036132812, "rewards/rejected": 7.81203031539917, "step": 7980 }, { "epoch": 1.3, "learning_rate": 5.478967121260975e-07, "logits/chosen": -0.5933480858802795, "logits/rejected": -0.5262187719345093, "logps/chosen": -159.54493713378906, "logps/rejected": -92.21758270263672, "loss": 0.2816, "rewards/accuracies": 1.0, "rewards/chosen": 1.9400314092636108, "rewards/margins": 0.7243751287460327, "rewards/rejected": 1.2156562805175781, "step": 7981 }, { "epoch": 1.3, "learning_rate": 5.477658894719453e-07, "logits/chosen": -0.8846904039382935, "logits/rejected": -0.7307471632957458, "logps/chosen": -116.29537200927734, "logps/rejected": -26.122303009033203, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 5.114963531494141, "rewards/margins": 4.655824661254883, "rewards/rejected": 0.4591386914253235, "step": 7982 }, { "epoch": 1.3, "learning_rate": 5.476350635176219e-07, "logits/chosen": -0.9326052665710449, "logits/rejected": -0.8077952265739441, "logps/chosen": -208.4324951171875, "logps/rejected": -68.28690338134766, "loss": 0.3419, "rewards/accuracies": 1.0, "rewards/chosen": 2.873303174972534, "rewards/margins": 0.9644004106521606, "rewards/rejected": 1.9089027643203735, "step": 7983 }, { "epoch": 1.3, "learning_rate": 5.475042342721666e-07, "logits/chosen": -0.33121806383132935, "logits/rejected": -0.2780114710330963, "logps/chosen": -35.70942687988281, "logps/rejected": -19.119693756103516, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 1.6148617267608643, "rewards/margins": 0.16095507144927979, "rewards/rejected": 1.4539066553115845, "step": 7984 }, { "epoch": 1.3, "learning_rate": 5.47373401744618e-07, "logits/chosen": -0.90056973695755, "logits/rejected": -0.8491697311401367, "logps/chosen": -113.11627197265625, "logps/rejected": -153.51953125, "loss": 1.5696, "rewards/accuracies": 0.0, "rewards/chosen": 4.999519348144531, "rewards/margins": -1.1215286254882812, "rewards/rejected": 6.1210479736328125, "step": 7985 }, { "epoch": 1.3, "learning_rate": 5.472425659440156e-07, "logits/chosen": -0.4377346634864807, "logits/rejected": -0.4377346634864807, "logps/chosen": -91.96900939941406, "logps/rejected": -91.96900939941406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.211639404296875, "rewards/margins": 0.0, "rewards/rejected": 1.211639404296875, "step": 7986 }, { "epoch": 1.3, "learning_rate": 5.47111726879399e-07, "logits/chosen": -0.6471313238143921, "logits/rejected": -0.615943193435669, "logps/chosen": -68.01496887207031, "logps/rejected": -98.99813842773438, "loss": 0.8121, "rewards/accuracies": 1.0, "rewards/chosen": 1.3845627307891846, "rewards/margins": 1.0416687726974487, "rewards/rejected": 0.3428939878940582, "step": 7987 }, { "epoch": 1.3, "learning_rate": 5.469808845598078e-07, "logits/chosen": -0.505932629108429, "logits/rejected": -0.39960700273513794, "logps/chosen": -52.40518569946289, "logps/rejected": -14.571260452270508, "loss": 0.3105, "rewards/accuracies": 1.0, "rewards/chosen": 1.789650321006775, "rewards/margins": 1.474166989326477, "rewards/rejected": 0.31548330187797546, "step": 7988 }, { "epoch": 1.3, "learning_rate": 5.46850038994282e-07, "logits/chosen": -0.6074005961418152, "logits/rejected": -0.6287347674369812, "logps/chosen": -89.90480041503906, "logps/rejected": -85.10725402832031, "loss": 0.9119, "rewards/accuracies": 0.0, "rewards/chosen": 1.7353485822677612, "rewards/margins": -0.7864981889724731, "rewards/rejected": 2.5218467712402344, "step": 7989 }, { "epoch": 1.3, "learning_rate": 5.46719190191862e-07, "logits/chosen": -0.6162006855010986, "logits/rejected": -0.6155045628547668, "logps/chosen": -58.6411247253418, "logps/rejected": -45.96074676513672, "loss": 0.7442, "rewards/accuracies": 0.0, "rewards/chosen": 0.9691662192344666, "rewards/margins": -0.6814135909080505, "rewards/rejected": 1.650579810142517, "step": 7990 }, { "epoch": 1.3, "learning_rate": 5.465883381615877e-07, "logits/chosen": -0.9154609441757202, "logits/rejected": -0.9154609441757202, "logps/chosen": -61.20156478881836, "logps/rejected": -61.20156478881836, "loss": 0.5397, "rewards/accuracies": 0.0, "rewards/chosen": 1.9308948516845703, "rewards/margins": 0.0, "rewards/rejected": 1.9308948516845703, "step": 7991 }, { "epoch": 1.3, "learning_rate": 5.464574829125001e-07, "logits/chosen": -0.943661630153656, "logits/rejected": -0.9630988240242004, "logps/chosen": -132.7508544921875, "logps/rejected": -164.77206420898438, "loss": 0.7612, "rewards/accuracies": 0.0, "rewards/chosen": 1.5630035400390625, "rewards/margins": -1.012648105621338, "rewards/rejected": 2.5756516456604004, "step": 7992 }, { "epoch": 1.3, "learning_rate": 5.463266244536402e-07, "logits/chosen": -0.6761996150016785, "logits/rejected": -0.6761996150016785, "logps/chosen": -77.66913604736328, "logps/rejected": -77.66913604736328, "loss": 0.3702, "rewards/accuracies": 0.0, "rewards/chosen": 2.0275979042053223, "rewards/margins": 0.0, "rewards/rejected": 2.0275979042053223, "step": 7993 }, { "epoch": 1.3, "learning_rate": 5.461957627940487e-07, "logits/chosen": -0.7077716588973999, "logits/rejected": -0.4434593617916107, "logps/chosen": -67.55345153808594, "logps/rejected": -17.747894287109375, "loss": 0.0394, "rewards/accuracies": 1.0, "rewards/chosen": 3.620710849761963, "rewards/margins": 3.3978772163391113, "rewards/rejected": 0.22283363342285156, "step": 7994 }, { "epoch": 1.3, "learning_rate": 5.460648979427673e-07, "logits/chosen": -0.9851599931716919, "logits/rejected": -1.2417364120483398, "logps/chosen": -111.70974731445312, "logps/rejected": -35.3421745300293, "loss": 0.0958, "rewards/accuracies": 1.0, "rewards/chosen": 2.1323089599609375, "rewards/margins": 1.8021084070205688, "rewards/rejected": 0.33020058274269104, "step": 7995 }, { "epoch": 1.3, "learning_rate": 5.459340299088373e-07, "logits/chosen": -0.5270337462425232, "logits/rejected": -0.5270337462425232, "logps/chosen": -5.555231094360352, "logps/rejected": -5.555231094360352, "loss": 0.7583, "rewards/accuracies": 0.0, "rewards/chosen": 0.3975756764411926, "rewards/margins": 0.0, "rewards/rejected": 0.3975756764411926, "step": 7996 }, { "epoch": 1.3, "learning_rate": 5.458031587013005e-07, "logits/chosen": -0.7268372774124146, "logits/rejected": -0.6767200231552124, "logps/chosen": -58.56462860107422, "logps/rejected": -72.24413299560547, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.4840507507324219, "rewards/margins": 0.25648033618927, "rewards/rejected": 1.2275704145431519, "step": 7997 }, { "epoch": 1.3, "learning_rate": 5.456722843291987e-07, "logits/chosen": -0.6241304278373718, "logits/rejected": -0.7736333012580872, "logps/chosen": -175.6480255126953, "logps/rejected": -52.492835998535156, "loss": 0.1795, "rewards/accuracies": 1.0, "rewards/chosen": 3.5702896118164062, "rewards/margins": 1.0436363220214844, "rewards/rejected": 2.526653289794922, "step": 7998 }, { "epoch": 1.3, "learning_rate": 5.455414068015742e-07, "logits/chosen": -0.7335290312767029, "logits/rejected": -0.6538301706314087, "logps/chosen": -123.69046783447266, "logps/rejected": -49.580177307128906, "loss": 0.1455, "rewards/accuracies": 1.0, "rewards/chosen": 3.4207236766815186, "rewards/margins": 1.223405361175537, "rewards/rejected": 2.1973183155059814, "step": 7999 }, { "epoch": 1.3, "learning_rate": 5.454105261274695e-07, "logits/chosen": -0.7956742644309998, "logits/rejected": -0.7408982515335083, "logps/chosen": -68.89607238769531, "logps/rejected": -79.62213134765625, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 2.103794813156128, "rewards/margins": 0.24954676628112793, "rewards/rejected": 1.854248046875, "step": 8000 }, { "epoch": 1.3, "learning_rate": 5.452796423159271e-07, "logits/chosen": -0.6934294104576111, "logits/rejected": -0.7075484991073608, "logps/chosen": -92.48868560791016, "logps/rejected": -130.63699340820312, "loss": 0.555, "rewards/accuracies": 0.0, "rewards/chosen": 1.628614068031311, "rewards/margins": -0.5774863958358765, "rewards/rejected": 2.2061004638671875, "step": 8001 }, { "epoch": 1.3, "learning_rate": 5.451487553759898e-07, "logits/chosen": -0.5120633840560913, "logits/rejected": -0.5353283286094666, "logps/chosen": -42.24100112915039, "logps/rejected": -61.06769561767578, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 0.7378444671630859, "rewards/margins": 0.24650916457176208, "rewards/rejected": 0.49133530259132385, "step": 8002 }, { "epoch": 1.3, "learning_rate": 5.450178653167008e-07, "logits/chosen": -0.9375156164169312, "logits/rejected": -0.9669961333274841, "logps/chosen": -100.63296508789062, "logps/rejected": -67.34405517578125, "loss": 0.7262, "rewards/accuracies": 0.0, "rewards/chosen": 3.64023756980896, "rewards/margins": -0.5867111682891846, "rewards/rejected": 4.2269487380981445, "step": 8003 }, { "epoch": 1.3, "learning_rate": 5.448869721471032e-07, "logits/chosen": -0.3128737509250641, "logits/rejected": -0.3128737509250641, "logps/chosen": -69.74363708496094, "logps/rejected": -69.74363708496094, "loss": 0.9707, "rewards/accuracies": 0.0, "rewards/chosen": 0.5627784729003906, "rewards/margins": 0.0, "rewards/rejected": 0.5627784729003906, "step": 8004 }, { "epoch": 1.3, "learning_rate": 5.447560758762405e-07, "logits/chosen": -0.38670897483825684, "logits/rejected": -0.38670897483825684, "logps/chosen": -78.83013916015625, "logps/rejected": -78.83013916015625, "loss": 0.352, "rewards/accuracies": 0.0, "rewards/chosen": 1.1727501153945923, "rewards/margins": 0.0, "rewards/rejected": 1.1727501153945923, "step": 8005 }, { "epoch": 1.3, "learning_rate": 5.446251765131565e-07, "logits/chosen": -0.7395609617233276, "logits/rejected": -0.6598955392837524, "logps/chosen": -37.16149139404297, "logps/rejected": -39.32295608520508, "loss": 0.4506, "rewards/accuracies": 1.0, "rewards/chosen": 2.170671224594116, "rewards/margins": 0.36079537868499756, "rewards/rejected": 1.8098758459091187, "step": 8006 }, { "epoch": 1.3, "learning_rate": 5.444942740668951e-07, "logits/chosen": -0.8713684678077698, "logits/rejected": -0.865432858467102, "logps/chosen": -94.58367919921875, "logps/rejected": -83.11309814453125, "loss": 0.4346, "rewards/accuracies": 0.0, "rewards/chosen": 1.5064315795898438, "rewards/margins": -0.18293607234954834, "rewards/rejected": 1.689367651939392, "step": 8007 }, { "epoch": 1.3, "learning_rate": 5.443633685465003e-07, "logits/chosen": -0.7014403939247131, "logits/rejected": -0.6190799474716187, "logps/chosen": -64.13229370117188, "logps/rejected": -98.92152404785156, "loss": 0.9679, "rewards/accuracies": 1.0, "rewards/chosen": 3.472180128097534, "rewards/margins": 0.8748977184295654, "rewards/rejected": 2.5972824096679688, "step": 8008 }, { "epoch": 1.3, "learning_rate": 5.442324599610165e-07, "logits/chosen": -0.5038958787918091, "logits/rejected": -0.318992018699646, "logps/chosen": -116.36444854736328, "logps/rejected": -62.54045486450195, "loss": 0.1773, "rewards/accuracies": 1.0, "rewards/chosen": 4.075332164764404, "rewards/margins": 1.5865905284881592, "rewards/rejected": 2.488741636276245, "step": 8009 }, { "epoch": 1.3, "learning_rate": 5.441015483194882e-07, "logits/chosen": -0.8706350326538086, "logits/rejected": -0.8913337588310242, "logps/chosen": -41.41120910644531, "logps/rejected": -123.12403106689453, "loss": 4.0629, "rewards/accuracies": 0.0, "rewards/chosen": 2.4285247325897217, "rewards/margins": -3.3568689823150635, "rewards/rejected": 5.785393714904785, "step": 8010 }, { "epoch": 1.3, "learning_rate": 5.439706336309604e-07, "logits/chosen": -0.8796971440315247, "logits/rejected": -0.9143445491790771, "logps/chosen": -100.641357421875, "logps/rejected": -174.9945526123047, "loss": 1.4069, "rewards/accuracies": 0.0, "rewards/chosen": 1.6776779890060425, "rewards/margins": -2.09539794921875, "rewards/rejected": 3.773075819015503, "step": 8011 }, { "epoch": 1.3, "learning_rate": 5.438397159044777e-07, "logits/chosen": -0.870232105255127, "logits/rejected": -0.8219817876815796, "logps/chosen": -84.7906723022461, "logps/rejected": -108.32411193847656, "loss": 0.2957, "rewards/accuracies": 1.0, "rewards/chosen": 3.9706809520721436, "rewards/margins": 0.46921753883361816, "rewards/rejected": 3.5014634132385254, "step": 8012 }, { "epoch": 1.3, "learning_rate": 5.437087951490855e-07, "logits/chosen": -0.6148884892463684, "logits/rejected": -0.7569135427474976, "logps/chosen": -79.42259979248047, "logps/rejected": -139.0236053466797, "loss": 2.7322, "rewards/accuracies": 0.0, "rewards/chosen": 2.558347463607788, "rewards/margins": -5.251997947692871, "rewards/rejected": 7.810345649719238, "step": 8013 }, { "epoch": 1.3, "learning_rate": 5.435778713738292e-07, "logits/chosen": -0.6922329664230347, "logits/rejected": -0.7256511449813843, "logps/chosen": -72.329833984375, "logps/rejected": -86.31423950195312, "loss": 1.7171, "rewards/accuracies": 0.0, "rewards/chosen": 2.260728597640991, "rewards/margins": -0.7889609336853027, "rewards/rejected": 3.049689531326294, "step": 8014 }, { "epoch": 1.3, "learning_rate": 5.434469445877542e-07, "logits/chosen": -0.3989418148994446, "logits/rejected": -0.3904811441898346, "logps/chosen": -76.30661010742188, "logps/rejected": -71.3108139038086, "loss": 0.5666, "rewards/accuracies": 0.0, "rewards/chosen": 0.9649444818496704, "rewards/margins": -0.5409431457519531, "rewards/rejected": 1.5058876276016235, "step": 8015 }, { "epoch": 1.3, "learning_rate": 5.433160147999065e-07, "logits/chosen": -0.1657753735780716, "logits/rejected": -0.16397641599178314, "logps/chosen": -3.692399263381958, "logps/rejected": -1.9379500150680542, "loss": 0.6023, "rewards/accuracies": 0.0, "rewards/chosen": -0.025405192747712135, "rewards/margins": -0.10804029554128647, "rewards/rejected": 0.08263510465621948, "step": 8016 }, { "epoch": 1.3, "learning_rate": 5.43185082019332e-07, "logits/chosen": -0.6350921988487244, "logits/rejected": -0.6269204020500183, "logps/chosen": -65.21615600585938, "logps/rejected": -70.0530014038086, "loss": 0.4385, "rewards/accuracies": 1.0, "rewards/chosen": 1.2221626043319702, "rewards/margins": 0.1192169189453125, "rewards/rejected": 1.1029456853866577, "step": 8017 }, { "epoch": 1.3, "learning_rate": 5.430541462550769e-07, "logits/chosen": -0.7443107962608337, "logits/rejected": -0.5743128061294556, "logps/chosen": -61.9642333984375, "logps/rejected": -29.33601951599121, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 2.054827928543091, "rewards/margins": 1.8951239585876465, "rewards/rejected": 0.15970401465892792, "step": 8018 }, { "epoch": 1.3, "learning_rate": 5.429232075161877e-07, "logits/chosen": -0.9782682657241821, "logits/rejected": -0.9410607814788818, "logps/chosen": -199.56710815429688, "logps/rejected": -47.79056930541992, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.7959870100021362, "rewards/margins": 0.19339072704315186, "rewards/rejected": 1.6025962829589844, "step": 8019 }, { "epoch": 1.3, "learning_rate": 5.427922658117111e-07, "logits/chosen": -0.5988005995750427, "logits/rejected": -0.5923112034797668, "logps/chosen": -18.636709213256836, "logps/rejected": -26.427574157714844, "loss": 0.3544, "rewards/accuracies": 1.0, "rewards/chosen": 0.3528408110141754, "rewards/margins": 0.05134829878807068, "rewards/rejected": 0.30149251222610474, "step": 8020 }, { "epoch": 1.3, "learning_rate": 5.426613211506937e-07, "logits/chosen": -1.1194063425064087, "logits/rejected": -1.1127952337265015, "logps/chosen": -48.12372970581055, "logps/rejected": -20.468490600585938, "loss": 0.3569, "rewards/accuracies": 1.0, "rewards/chosen": 2.618504762649536, "rewards/margins": 1.919589638710022, "rewards/rejected": 0.6989151239395142, "step": 8021 }, { "epoch": 1.3, "learning_rate": 5.425303735421828e-07, "logits/chosen": -0.4996494650840759, "logits/rejected": -0.5011017322540283, "logps/chosen": -53.448089599609375, "logps/rejected": -54.88854217529297, "loss": 0.911, "rewards/accuracies": 0.0, "rewards/chosen": 1.6360076665878296, "rewards/margins": -0.7703965902328491, "rewards/rejected": 2.4064042568206787, "step": 8022 }, { "epoch": 1.3, "learning_rate": 5.423994229952255e-07, "logits/chosen": -1.001591444015503, "logits/rejected": -0.6176714897155762, "logps/chosen": -59.653053283691406, "logps/rejected": -82.17443084716797, "loss": 0.4105, "rewards/accuracies": 1.0, "rewards/chosen": 1.9686402082443237, "rewards/margins": 0.6787537336349487, "rewards/rejected": 1.289886474609375, "step": 8023 }, { "epoch": 1.3, "learning_rate": 5.422684695188691e-07, "logits/chosen": -0.8967142105102539, "logits/rejected": -0.9046183228492737, "logps/chosen": -50.71044921875, "logps/rejected": -72.73451232910156, "loss": 0.4638, "rewards/accuracies": 0.0, "rewards/chosen": 2.0637147426605225, "rewards/margins": -0.30485987663269043, "rewards/rejected": 2.368574619293213, "step": 8024 }, { "epoch": 1.3, "learning_rate": 5.421375131221616e-07, "logits/chosen": -0.7283383011817932, "logits/rejected": -0.7682229280471802, "logps/chosen": -59.06962585449219, "logps/rejected": -44.65446472167969, "loss": 1.0864, "rewards/accuracies": 0.0, "rewards/chosen": 0.036731719970703125, "rewards/margins": -1.2211570739746094, "rewards/rejected": 1.2578887939453125, "step": 8025 }, { "epoch": 1.3, "learning_rate": 5.420065538141506e-07, "logits/chosen": -0.8376851677894592, "logits/rejected": -0.8595141768455505, "logps/chosen": -102.38180541992188, "logps/rejected": -64.53579711914062, "loss": 1.0167, "rewards/accuracies": 0.0, "rewards/chosen": 1.0112075805664062, "rewards/margins": -1.0797593593597412, "rewards/rejected": 2.0909669399261475, "step": 8026 }, { "epoch": 1.3, "learning_rate": 5.418755916038842e-07, "logits/chosen": -0.7715526223182678, "logits/rejected": -0.8003033399581909, "logps/chosen": -88.76829528808594, "logps/rejected": -87.88333892822266, "loss": 1.0799, "rewards/accuracies": 0.0, "rewards/chosen": 2.6041412353515625, "rewards/margins": -1.9755029678344727, "rewards/rejected": 4.579644203186035, "step": 8027 }, { "epoch": 1.3, "learning_rate": 5.417446265004107e-07, "logits/chosen": -0.8997063636779785, "logits/rejected": -0.5365208983421326, "logps/chosen": -127.04312896728516, "logps/rejected": -59.56532669067383, "loss": 0.7992, "rewards/accuracies": 0.0, "rewards/chosen": 1.0452232360839844, "rewards/margins": -0.7365070581436157, "rewards/rejected": 1.7817302942276, "step": 8028 }, { "epoch": 1.3, "learning_rate": 5.416136585127784e-07, "logits/chosen": -0.2877329885959625, "logits/rejected": -0.29029807448387146, "logps/chosen": -50.22586441040039, "logps/rejected": -42.64723205566406, "loss": 0.574, "rewards/accuracies": 0.0, "rewards/chosen": 0.7655754089355469, "rewards/margins": -0.09346622228622437, "rewards/rejected": 0.8590416312217712, "step": 8029 }, { "epoch": 1.3, "learning_rate": 5.414826876500361e-07, "logits/chosen": -0.36241358518600464, "logits/rejected": -0.41136428713798523, "logps/chosen": -40.50212478637695, "logps/rejected": -113.2558822631836, "loss": 0.6543, "rewards/accuracies": 0.0, "rewards/chosen": 0.26697197556495667, "rewards/margins": -0.3098979890346527, "rewards/rejected": 0.5768699645996094, "step": 8030 }, { "epoch": 1.3, "learning_rate": 5.413517139212326e-07, "logits/chosen": -0.4882100224494934, "logits/rejected": -0.6508693099021912, "logps/chosen": -76.05728149414062, "logps/rejected": -78.509033203125, "loss": 1.3448, "rewards/accuracies": 0.0, "rewards/chosen": 2.215528964996338, "rewards/margins": -2.6021318435668945, "rewards/rejected": 4.817660808563232, "step": 8031 }, { "epoch": 1.3, "learning_rate": 5.412207373354168e-07, "logits/chosen": -0.4457463324069977, "logits/rejected": -0.4844832718372345, "logps/chosen": -80.87289428710938, "logps/rejected": -45.82362747192383, "loss": 0.4926, "rewards/accuracies": 0.0, "rewards/chosen": 1.4046356678009033, "rewards/margins": -0.512363076210022, "rewards/rejected": 1.9169987440109253, "step": 8032 }, { "epoch": 1.3, "learning_rate": 5.410897579016382e-07, "logits/chosen": -0.28459733724594116, "logits/rejected": -0.30447572469711304, "logps/chosen": -52.81941223144531, "logps/rejected": -80.93988037109375, "loss": 0.4813, "rewards/accuracies": 1.0, "rewards/chosen": 1.9220741987228394, "rewards/margins": 1.1738107204437256, "rewards/rejected": 0.7482635378837585, "step": 8033 }, { "epoch": 1.3, "learning_rate": 5.409587756289462e-07, "logits/chosen": -0.7426234483718872, "logits/rejected": -0.6404922008514404, "logps/chosen": -59.033138275146484, "logps/rejected": -15.650007247924805, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 2.4489002227783203, "rewards/margins": 2.0770514011383057, "rewards/rejected": 0.3718488812446594, "step": 8034 }, { "epoch": 1.3, "learning_rate": 5.408277905263903e-07, "logits/chosen": -0.6283236145973206, "logits/rejected": -0.7915123105049133, "logps/chosen": -77.51093292236328, "logps/rejected": -127.13340759277344, "loss": 1.1064, "rewards/accuracies": 0.0, "rewards/chosen": 1.7287826538085938, "rewards/margins": -1.9554412364959717, "rewards/rejected": 3.6842238903045654, "step": 8035 }, { "epoch": 1.3, "learning_rate": 5.406968026030204e-07, "logits/chosen": -0.8168078660964966, "logits/rejected": -0.6065073013305664, "logps/chosen": -185.5908203125, "logps/rejected": -21.763282775878906, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": 4.932612895965576, "rewards/margins": 4.539500713348389, "rewards/rejected": 0.3931120038032532, "step": 8036 }, { "epoch": 1.3, "learning_rate": 5.405658118678865e-07, "logits/chosen": -0.5785938501358032, "logits/rejected": -0.579004168510437, "logps/chosen": -10.366456031799316, "logps/rejected": -1.8704309463500977, "loss": 0.6054, "rewards/accuracies": 0.0, "rewards/chosen": -0.1275324821472168, "rewards/margins": -0.41776999831199646, "rewards/rejected": 0.29023751616477966, "step": 8037 }, { "epoch": 1.3, "learning_rate": 5.404348183300388e-07, "logits/chosen": -0.9032224416732788, "logits/rejected": -0.8567947745323181, "logps/chosen": -53.15754699707031, "logps/rejected": -10.801149368286133, "loss": 1.5031, "rewards/accuracies": 1.0, "rewards/chosen": 1.6008094549179077, "rewards/margins": 0.9028350710868835, "rewards/rejected": 0.6979743838310242, "step": 8038 }, { "epoch": 1.3, "learning_rate": 5.403038219985278e-07, "logits/chosen": -0.750396192073822, "logits/rejected": -0.6240031123161316, "logps/chosen": -65.480224609375, "logps/rejected": -28.797840118408203, "loss": 0.5943, "rewards/accuracies": 1.0, "rewards/chosen": 2.0482804775238037, "rewards/margins": 1.3230572938919067, "rewards/rejected": 0.725223183631897, "step": 8039 }, { "epoch": 1.3, "learning_rate": 5.401728228824041e-07, "logits/chosen": -0.6492094397544861, "logits/rejected": -0.6646975874900818, "logps/chosen": -114.72925567626953, "logps/rejected": -124.12811279296875, "loss": 1.8465, "rewards/accuracies": 0.0, "rewards/chosen": 0.1287132352590561, "rewards/margins": -3.638112783432007, "rewards/rejected": 3.7668259143829346, "step": 8040 }, { "epoch": 1.31, "learning_rate": 5.400418209907184e-07, "logits/chosen": -0.6881734728813171, "logits/rejected": -0.69809490442276, "logps/chosen": -123.28881072998047, "logps/rejected": -117.9708023071289, "loss": 0.6529, "rewards/accuracies": 1.0, "rewards/chosen": 1.0600899457931519, "rewards/margins": 0.4773957133293152, "rewards/rejected": 0.5826942324638367, "step": 8041 }, { "epoch": 1.31, "learning_rate": 5.399108163325217e-07, "logits/chosen": -0.2564385235309601, "logits/rejected": -0.2536182701587677, "logps/chosen": -5.030296802520752, "logps/rejected": -5.887682914733887, "loss": 2.8593, "rewards/accuracies": 0.0, "rewards/chosen": 0.3273685872554779, "rewards/margins": -0.06330877542495728, "rewards/rejected": 0.3906773626804352, "step": 8042 }, { "epoch": 1.31, "learning_rate": 5.397798089168652e-07, "logits/chosen": -0.4143908619880676, "logits/rejected": -0.4332911968231201, "logps/chosen": -74.29894256591797, "logps/rejected": -88.8424301147461, "loss": 0.2913, "rewards/accuracies": 1.0, "rewards/chosen": 0.5969077944755554, "rewards/margins": 0.2546844184398651, "rewards/rejected": 0.3422233760356903, "step": 8043 }, { "epoch": 1.31, "learning_rate": 5.396487987528004e-07, "logits/chosen": -1.1191500425338745, "logits/rejected": -1.1030657291412354, "logps/chosen": -84.48395538330078, "logps/rejected": -163.11917114257812, "loss": 1.3036, "rewards/accuracies": 0.0, "rewards/chosen": 1.283161997795105, "rewards/margins": -2.2198591232299805, "rewards/rejected": 3.503021240234375, "step": 8044 }, { "epoch": 1.31, "learning_rate": 5.395177858493788e-07, "logits/chosen": -0.3788762390613556, "logits/rejected": -0.33644184470176697, "logps/chosen": -33.754249572753906, "logps/rejected": -69.81731414794922, "loss": 0.986, "rewards/accuracies": 0.0, "rewards/chosen": 1.289759874343872, "rewards/margins": -0.45097124576568604, "rewards/rejected": 1.740731120109558, "step": 8045 }, { "epoch": 1.31, "learning_rate": 5.393867702156522e-07, "logits/chosen": -0.41961953043937683, "logits/rejected": -0.43139591813087463, "logps/chosen": -42.80816650390625, "logps/rejected": -46.983673095703125, "loss": 0.3914, "rewards/accuracies": 0.0, "rewards/chosen": 1.515405297279358, "rewards/margins": -0.11867105960845947, "rewards/rejected": 1.6340763568878174, "step": 8046 }, { "epoch": 1.31, "learning_rate": 5.392557518606723e-07, "logits/chosen": -0.7484962940216064, "logits/rejected": -0.6734309196472168, "logps/chosen": -49.82521057128906, "logps/rejected": -39.143638610839844, "loss": 0.7088, "rewards/accuracies": 1.0, "rewards/chosen": 1.641487956047058, "rewards/margins": 0.04377484321594238, "rewards/rejected": 1.5977131128311157, "step": 8047 }, { "epoch": 1.31, "learning_rate": 5.391247307934914e-07, "logits/chosen": -0.7796659469604492, "logits/rejected": -0.7700328826904297, "logps/chosen": -68.9142837524414, "logps/rejected": -151.6407012939453, "loss": 0.7261, "rewards/accuracies": 0.0, "rewards/chosen": 2.3077950477600098, "rewards/margins": -0.8349189758300781, "rewards/rejected": 3.142714023590088, "step": 8048 }, { "epoch": 1.31, "learning_rate": 5.389937070231618e-07, "logits/chosen": -1.1025959253311157, "logits/rejected": -1.0060021877288818, "logps/chosen": -96.36038970947266, "logps/rejected": -88.05944061279297, "loss": 0.9111, "rewards/accuracies": 1.0, "rewards/chosen": 4.5582194328308105, "rewards/margins": 0.15949440002441406, "rewards/rejected": 4.3987250328063965, "step": 8049 }, { "epoch": 1.31, "learning_rate": 5.388626805587359e-07, "logits/chosen": -0.7958111763000488, "logits/rejected": -0.6951547861099243, "logps/chosen": -100.91764831542969, "logps/rejected": -70.98583984375, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": 1.4805335998535156, "rewards/margins": 0.21985626220703125, "rewards/rejected": 1.2606773376464844, "step": 8050 }, { "epoch": 1.31, "learning_rate": 5.387316514092668e-07, "logits/chosen": -0.7462087869644165, "logits/rejected": -0.6523683667182922, "logps/chosen": -94.74539184570312, "logps/rejected": -55.50806427001953, "loss": 1.1074, "rewards/accuracies": 1.0, "rewards/chosen": 1.6814689636230469, "rewards/margins": 1.2315829992294312, "rewards/rejected": 0.44988593459129333, "step": 8051 }, { "epoch": 1.31, "learning_rate": 5.386006195838069e-07, "logits/chosen": -0.38965409994125366, "logits/rejected": -0.38965409994125366, "logps/chosen": -47.2409553527832, "logps/rejected": -47.2409553527832, "loss": 0.5058, "rewards/accuracies": 0.0, "rewards/chosen": 0.14102783799171448, "rewards/margins": 0.0, "rewards/rejected": 0.14102783799171448, "step": 8052 }, { "epoch": 1.31, "learning_rate": 5.384695850914094e-07, "logits/chosen": -0.9838613867759705, "logits/rejected": -1.0535094738006592, "logps/chosen": -96.52731323242188, "logps/rejected": -119.12516784667969, "loss": 1.4057, "rewards/accuracies": 0.0, "rewards/chosen": 0.3606414794921875, "rewards/margins": -2.7063920497894287, "rewards/rejected": 3.067033529281616, "step": 8053 }, { "epoch": 1.31, "learning_rate": 5.383385479411276e-07, "logits/chosen": -0.46042051911354065, "logits/rejected": -0.4531271159648895, "logps/chosen": -9.156076431274414, "logps/rejected": -8.929405212402344, "loss": 1.0822, "rewards/accuracies": 1.0, "rewards/chosen": 0.6123308539390564, "rewards/margins": 0.035813748836517334, "rewards/rejected": 0.5765171051025391, "step": 8054 }, { "epoch": 1.31, "learning_rate": 5.382075081420148e-07, "logits/chosen": -0.5232102274894714, "logits/rejected": -0.48624125123023987, "logps/chosen": -56.75239181518555, "logps/rejected": -79.61312103271484, "loss": 0.1513, "rewards/accuracies": 1.0, "rewards/chosen": 1.2010303735733032, "rewards/margins": 1.0523754358291626, "rewards/rejected": 0.14865493774414062, "step": 8055 }, { "epoch": 1.31, "learning_rate": 5.38076465703125e-07, "logits/chosen": -0.5631656646728516, "logits/rejected": -0.5631656646728516, "logps/chosen": -70.3037109375, "logps/rejected": -70.3037109375, "loss": 0.3505, "rewards/accuracies": 0.0, "rewards/chosen": 2.301682233810425, "rewards/margins": 0.0, "rewards/rejected": 2.301682233810425, "step": 8056 }, { "epoch": 1.31, "learning_rate": 5.379454206335114e-07, "logits/chosen": -0.536135196685791, "logits/rejected": -0.38116151094436646, "logps/chosen": -51.38006591796875, "logps/rejected": -83.94529724121094, "loss": 2.0449, "rewards/accuracies": 0.0, "rewards/chosen": 2.6057891845703125, "rewards/margins": -2.722041606903076, "rewards/rejected": 5.327830791473389, "step": 8057 }, { "epoch": 1.31, "learning_rate": 5.378143729422284e-07, "logits/chosen": -0.8009170889854431, "logits/rejected": -0.782484769821167, "logps/chosen": -106.8871078491211, "logps/rejected": -106.70079040527344, "loss": 1.4584, "rewards/accuracies": 0.0, "rewards/chosen": 1.3840080499649048, "rewards/margins": -2.6552486419677734, "rewards/rejected": 4.039256572723389, "step": 8058 }, { "epoch": 1.31, "learning_rate": 5.3768332263833e-07, "logits/chosen": -0.6850258708000183, "logits/rejected": -0.6673139333724976, "logps/chosen": -39.66914367675781, "logps/rejected": -33.14557647705078, "loss": 0.6771, "rewards/accuracies": 0.0, "rewards/chosen": 1.5908154249191284, "rewards/margins": -0.17990148067474365, "rewards/rejected": 1.770716905593872, "step": 8059 }, { "epoch": 1.31, "learning_rate": 5.375522697308705e-07, "logits/chosen": -0.8296430110931396, "logits/rejected": -0.8557010293006897, "logps/chosen": -99.38285827636719, "logps/rejected": -55.370086669921875, "loss": 0.5588, "rewards/accuracies": 0.0, "rewards/chosen": 1.8274399042129517, "rewards/margins": -0.5103522539138794, "rewards/rejected": 2.337792158126831, "step": 8060 }, { "epoch": 1.31, "learning_rate": 5.374212142289046e-07, "logits/chosen": -0.7200143933296204, "logits/rejected": -0.6964900493621826, "logps/chosen": -68.46216583251953, "logps/rejected": -146.01918029785156, "loss": 1.324, "rewards/accuracies": 0.0, "rewards/chosen": 0.697460949420929, "rewards/margins": -1.0907790660858154, "rewards/rejected": 1.7882400751113892, "step": 8061 }, { "epoch": 1.31, "learning_rate": 5.372901561414869e-07, "logits/chosen": -0.7474930286407471, "logits/rejected": -1.1147345304489136, "logps/chosen": -135.41351318359375, "logps/rejected": -35.056156158447266, "loss": 0.1972, "rewards/accuracies": 1.0, "rewards/chosen": 2.877423048019409, "rewards/margins": 2.6994521617889404, "rewards/rejected": 0.17797088623046875, "step": 8062 }, { "epoch": 1.31, "learning_rate": 5.371590954776722e-07, "logits/chosen": -0.3129997253417969, "logits/rejected": -0.3127758204936981, "logps/chosen": -66.07075500488281, "logps/rejected": -51.60639190673828, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 1.3040618896484375, "rewards/margins": 0.00597834587097168, "rewards/rejected": 1.2980835437774658, "step": 8063 }, { "epoch": 1.31, "learning_rate": 5.370280322465156e-07, "logits/chosen": -0.7818416953086853, "logits/rejected": -0.78996741771698, "logps/chosen": -90.08040618896484, "logps/rejected": -91.96640014648438, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": 4.982340335845947, "rewards/margins": 2.074544668197632, "rewards/rejected": 2.9077956676483154, "step": 8064 }, { "epoch": 1.31, "learning_rate": 5.368969664570724e-07, "logits/chosen": -0.5471351742744446, "logits/rejected": -0.4945082366466522, "logps/chosen": -60.24596405029297, "logps/rejected": -69.51945495605469, "loss": 0.6639, "rewards/accuracies": 0.0, "rewards/chosen": 1.0198860168457031, "rewards/margins": -0.8174499273300171, "rewards/rejected": 1.8373359441757202, "step": 8065 }, { "epoch": 1.31, "learning_rate": 5.367658981183979e-07, "logits/chosen": -0.37278151512145996, "logits/rejected": -0.37278151512145996, "logps/chosen": -21.510698318481445, "logps/rejected": -21.510698318481445, "loss": 1.8118, "rewards/accuracies": 0.0, "rewards/chosen": 0.9184786081314087, "rewards/margins": 0.0, "rewards/rejected": 0.9184786081314087, "step": 8066 }, { "epoch": 1.31, "learning_rate": 5.366348272395477e-07, "logits/chosen": -0.6782969832420349, "logits/rejected": -0.4891272187232971, "logps/chosen": -153.08828735351562, "logps/rejected": -19.81009292602539, "loss": 1.0185, "rewards/accuracies": 1.0, "rewards/chosen": 0.8120483756065369, "rewards/margins": 0.5801929831504822, "rewards/rejected": 0.2318553924560547, "step": 8067 }, { "epoch": 1.31, "learning_rate": 5.365037538295775e-07, "logits/chosen": -0.47175467014312744, "logits/rejected": -0.3124486804008484, "logps/chosen": -61.65133285522461, "logps/rejected": -23.588314056396484, "loss": 0.0989, "rewards/accuracies": 1.0, "rewards/chosen": 1.747287392616272, "rewards/margins": 1.6720815896987915, "rewards/rejected": 0.07520580291748047, "step": 8068 }, { "epoch": 1.31, "learning_rate": 5.363726778975435e-07, "logits/chosen": -0.6404395699501038, "logits/rejected": -0.6396540999412537, "logps/chosen": -61.720550537109375, "logps/rejected": -89.19640350341797, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 2.4953866004943848, "rewards/margins": 0.5402023792266846, "rewards/rejected": 1.9551842212677002, "step": 8069 }, { "epoch": 1.31, "learning_rate": 5.362415994525015e-07, "logits/chosen": -0.634238064289093, "logits/rejected": -0.6372327208518982, "logps/chosen": -27.29111099243164, "logps/rejected": -17.221040725708008, "loss": 0.415, "rewards/accuracies": 0.0, "rewards/chosen": 0.6613918542861938, "rewards/margins": -0.008257865905761719, "rewards/rejected": 0.6696497201919556, "step": 8070 }, { "epoch": 1.31, "learning_rate": 5.361105185035079e-07, "logits/chosen": -0.9648873805999756, "logits/rejected": -0.9013832211494446, "logps/chosen": -169.79283142089844, "logps/rejected": -23.115116119384766, "loss": 0.9967, "rewards/accuracies": 0.0, "rewards/chosen": 0.5433349609375, "rewards/margins": -0.24212646484375, "rewards/rejected": 0.78546142578125, "step": 8071 }, { "epoch": 1.31, "learning_rate": 5.359794350596191e-07, "logits/chosen": -1.0093897581100464, "logits/rejected": -0.8998831510543823, "logps/chosen": -109.42511749267578, "logps/rejected": -15.606575012207031, "loss": 0.3096, "rewards/accuracies": 1.0, "rewards/chosen": 2.4089303016662598, "rewards/margins": 2.0872578620910645, "rewards/rejected": 0.3216724395751953, "step": 8072 }, { "epoch": 1.31, "learning_rate": 5.358483491298919e-07, "logits/chosen": -0.8527637720108032, "logits/rejected": -0.7780529260635376, "logps/chosen": -96.79534149169922, "logps/rejected": -52.60563659667969, "loss": 0.1261, "rewards/accuracies": 1.0, "rewards/chosen": 4.819051265716553, "rewards/margins": 2.57150936126709, "rewards/rejected": 2.247541904449463, "step": 8073 }, { "epoch": 1.31, "learning_rate": 5.35717260723383e-07, "logits/chosen": -0.4664202332496643, "logits/rejected": -0.5052245259284973, "logps/chosen": -112.25811767578125, "logps/rejected": -131.1952667236328, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 0.8612068295478821, "rewards/margins": 0.49504318833351135, "rewards/rejected": 0.3661636412143707, "step": 8074 }, { "epoch": 1.31, "learning_rate": 5.355861698491492e-07, "logits/chosen": -0.6695308089256287, "logits/rejected": -0.6448010206222534, "logps/chosen": -64.96347045898438, "logps/rejected": -41.92507553100586, "loss": 1.4247, "rewards/accuracies": 0.0, "rewards/chosen": 1.39354407787323, "rewards/margins": -0.18956828117370605, "rewards/rejected": 1.583112359046936, "step": 8075 }, { "epoch": 1.31, "learning_rate": 5.354550765162479e-07, "logits/chosen": -0.8066174983978271, "logits/rejected": -0.7223222255706787, "logps/chosen": -61.27896499633789, "logps/rejected": -32.95991516113281, "loss": 0.9169, "rewards/accuracies": 0.0, "rewards/chosen": 1.8875926733016968, "rewards/margins": -0.0965874195098877, "rewards/rejected": 1.9841800928115845, "step": 8076 }, { "epoch": 1.31, "learning_rate": 5.353239807337362e-07, "logits/chosen": -0.7692632675170898, "logits/rejected": -0.7152954936027527, "logps/chosen": -45.68025207519531, "logps/rejected": -35.02443313598633, "loss": 0.5333, "rewards/accuracies": 0.0, "rewards/chosen": 2.0446128845214844, "rewards/margins": -0.15540766716003418, "rewards/rejected": 2.2000205516815186, "step": 8077 }, { "epoch": 1.31, "learning_rate": 5.351928825106717e-07, "logits/chosen": -0.8390511870384216, "logits/rejected": -0.8156901001930237, "logps/chosen": -70.31367492675781, "logps/rejected": -50.2094841003418, "loss": 0.759, "rewards/accuracies": 0.0, "rewards/chosen": 1.3628143072128296, "rewards/margins": -0.24352610111236572, "rewards/rejected": 1.6063404083251953, "step": 8078 }, { "epoch": 1.31, "learning_rate": 5.350617818561121e-07, "logits/chosen": -0.9269125461578369, "logits/rejected": -0.7439837455749512, "logps/chosen": -92.49267578125, "logps/rejected": -22.25567626953125, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 2.995718479156494, "rewards/margins": 2.6182594299316406, "rewards/rejected": 0.37745895981788635, "step": 8079 }, { "epoch": 1.31, "learning_rate": 5.34930678779115e-07, "logits/chosen": -0.5667401552200317, "logits/rejected": -0.5091721415519714, "logps/chosen": -103.98948669433594, "logps/rejected": -77.8260498046875, "loss": 1.3656, "rewards/accuracies": 0.0, "rewards/chosen": 1.574041724205017, "rewards/margins": -0.6183570623397827, "rewards/rejected": 2.1923987865448, "step": 8080 }, { "epoch": 1.31, "learning_rate": 5.347995732887386e-07, "logits/chosen": -0.45096850395202637, "logits/rejected": -0.46364331245422363, "logps/chosen": -107.80416870117188, "logps/rejected": -88.75738525390625, "loss": 1.1849, "rewards/accuracies": 0.0, "rewards/chosen": 0.11229095607995987, "rewards/margins": -1.4695991277694702, "rewards/rejected": 1.5818901062011719, "step": 8081 }, { "epoch": 1.31, "learning_rate": 5.346684653940408e-07, "logits/chosen": -1.5288268327713013, "logits/rejected": -1.5348235368728638, "logps/chosen": -30.40660285949707, "logps/rejected": -19.78829002380371, "loss": 1.2204, "rewards/accuracies": 1.0, "rewards/chosen": 1.146797776222229, "rewards/margins": 0.0005064010620117188, "rewards/rejected": 1.1462913751602173, "step": 8082 }, { "epoch": 1.31, "learning_rate": 5.345373551040801e-07, "logits/chosen": -0.7400469779968262, "logits/rejected": -0.688092052936554, "logps/chosen": -65.86695861816406, "logps/rejected": -75.53170013427734, "loss": 0.1951, "rewards/accuracies": 1.0, "rewards/chosen": 2.9304826259613037, "rewards/margins": 0.7902200222015381, "rewards/rejected": 2.1402626037597656, "step": 8083 }, { "epoch": 1.31, "learning_rate": 5.344062424279152e-07, "logits/chosen": -0.747016966342926, "logits/rejected": -0.6646113395690918, "logps/chosen": -135.69000244140625, "logps/rejected": -109.40313720703125, "loss": 0.0643, "rewards/accuracies": 1.0, "rewards/chosen": 5.262414455413818, "rewards/margins": 3.14522385597229, "rewards/rejected": 2.1171905994415283, "step": 8084 }, { "epoch": 1.31, "learning_rate": 5.342751273746043e-07, "logits/chosen": -0.5256734490394592, "logits/rejected": -0.5256734490394592, "logps/chosen": -87.11856079101562, "logps/rejected": -87.11856079101562, "loss": 0.3641, "rewards/accuracies": 0.0, "rewards/chosen": 2.3414833545684814, "rewards/margins": 0.0, "rewards/rejected": 2.3414833545684814, "step": 8085 }, { "epoch": 1.31, "learning_rate": 5.341440099532065e-07, "logits/chosen": -0.6785593628883362, "logits/rejected": -0.6636394262313843, "logps/chosen": -95.16997528076172, "logps/rejected": -54.466636657714844, "loss": 0.5729, "rewards/accuracies": 0.0, "rewards/chosen": 1.115186333656311, "rewards/margins": -0.5799926519393921, "rewards/rejected": 1.6951789855957031, "step": 8086 }, { "epoch": 1.31, "learning_rate": 5.340128901727807e-07, "logits/chosen": -0.7341040372848511, "logits/rejected": -0.7094244956970215, "logps/chosen": -79.8694076538086, "logps/rejected": -88.85689544677734, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 2.076448917388916, "rewards/margins": 1.725583791732788, "rewards/rejected": 0.3508651852607727, "step": 8087 }, { "epoch": 1.31, "learning_rate": 5.338817680423861e-07, "logits/chosen": -0.5379139184951782, "logits/rejected": -0.5264283418655396, "logps/chosen": -105.367919921875, "logps/rejected": -101.62653350830078, "loss": 1.2479, "rewards/accuracies": 1.0, "rewards/chosen": 0.8029045462608337, "rewards/margins": 0.21610110998153687, "rewards/rejected": 0.5868034362792969, "step": 8088 }, { "epoch": 1.31, "learning_rate": 5.337506435710817e-07, "logits/chosen": -0.8078354001045227, "logits/rejected": -0.8220037817955017, "logps/chosen": -63.93809509277344, "logps/rejected": -126.55133056640625, "loss": 2.0599, "rewards/accuracies": 0.0, "rewards/chosen": 2.5162856578826904, "rewards/margins": -1.948582410812378, "rewards/rejected": 4.464868068695068, "step": 8089 }, { "epoch": 1.31, "learning_rate": 5.336195167679274e-07, "logits/chosen": -0.6595924496650696, "logits/rejected": -0.6595924496650696, "logps/chosen": -102.22896575927734, "logps/rejected": -102.22896575927734, "loss": 0.365, "rewards/accuracies": 0.0, "rewards/chosen": 1.1444313526153564, "rewards/margins": 0.0, "rewards/rejected": 1.1444313526153564, "step": 8090 }, { "epoch": 1.31, "learning_rate": 5.334883876419824e-07, "logits/chosen": -0.8579390645027161, "logits/rejected": -0.9360926151275635, "logps/chosen": -130.91598510742188, "logps/rejected": -163.89427185058594, "loss": 1.8782, "rewards/accuracies": 0.0, "rewards/chosen": 1.2821320295333862, "rewards/margins": -3.693903923034668, "rewards/rejected": 4.976036071777344, "step": 8091 }, { "epoch": 1.31, "learning_rate": 5.333572562023067e-07, "logits/chosen": -0.6311994194984436, "logits/rejected": -0.49722397327423096, "logps/chosen": -75.64952087402344, "logps/rejected": -5.998462677001953, "loss": 1.1133, "rewards/accuracies": 1.0, "rewards/chosen": 2.2897331714630127, "rewards/margins": 1.6921391487121582, "rewards/rejected": 0.5975939631462097, "step": 8092 }, { "epoch": 1.31, "learning_rate": 5.332261224579604e-07, "logits/chosen": -0.37309423089027405, "logits/rejected": -0.42674413323402405, "logps/chosen": -0.7552571892738342, "logps/rejected": -68.23573303222656, "loss": 1.3066, "rewards/accuracies": 1.0, "rewards/chosen": 0.12974219024181366, "rewards/margins": 0.06250433623790741, "rewards/rejected": 0.06723785400390625, "step": 8093 }, { "epoch": 1.31, "learning_rate": 5.330949864180033e-07, "logits/chosen": -0.6102721095085144, "logits/rejected": -0.6102721095085144, "logps/chosen": -39.095184326171875, "logps/rejected": -39.095184326171875, "loss": 0.3852, "rewards/accuracies": 0.0, "rewards/chosen": 1.5328396558761597, "rewards/margins": 0.0, "rewards/rejected": 1.5328396558761597, "step": 8094 }, { "epoch": 1.31, "learning_rate": 5.329638480914958e-07, "logits/chosen": -0.8983389735221863, "logits/rejected": -0.8169897198677063, "logps/chosen": -143.11093139648438, "logps/rejected": -39.07396697998047, "loss": 0.0894, "rewards/accuracies": 1.0, "rewards/chosen": 4.415051460266113, "rewards/margins": 2.3315439224243164, "rewards/rejected": 2.083507537841797, "step": 8095 }, { "epoch": 1.31, "learning_rate": 5.328327074874984e-07, "logits/chosen": -0.8210857510566711, "logits/rejected": -0.8238731622695923, "logps/chosen": -78.3927230834961, "logps/rejected": -78.76724243164062, "loss": 0.9799, "rewards/accuracies": 0.0, "rewards/chosen": 0.33261415362358093, "rewards/margins": -0.9211654663085938, "rewards/rejected": 1.253779649734497, "step": 8096 }, { "epoch": 1.31, "learning_rate": 5.327015646150716e-07, "logits/chosen": -0.5371999740600586, "logits/rejected": -0.5371999740600586, "logps/chosen": -1.8387895822525024, "logps/rejected": -1.8387895822525024, "loss": 0.935, "rewards/accuracies": 0.0, "rewards/chosen": 0.313485711812973, "rewards/margins": 0.0, "rewards/rejected": 0.313485711812973, "step": 8097 }, { "epoch": 1.31, "learning_rate": 5.325704194832759e-07, "logits/chosen": -0.7471985220909119, "logits/rejected": -0.6690062880516052, "logps/chosen": -87.4288558959961, "logps/rejected": -61.46424865722656, "loss": 0.7901, "rewards/accuracies": 0.0, "rewards/chosen": 2.5262444019317627, "rewards/margins": -1.3048300743103027, "rewards/rejected": 3.8310744762420654, "step": 8098 }, { "epoch": 1.31, "learning_rate": 5.324392721011726e-07, "logits/chosen": -0.7503067255020142, "logits/rejected": -0.5951447486877441, "logps/chosen": -104.47118377685547, "logps/rejected": -104.64956665039062, "loss": 3.5035, "rewards/accuracies": 1.0, "rewards/chosen": 4.582904815673828, "rewards/margins": 0.8005576133728027, "rewards/rejected": 3.7823472023010254, "step": 8099 }, { "epoch": 1.31, "learning_rate": 5.323081224778224e-07, "logits/chosen": -0.4836289584636688, "logits/rejected": -0.4805733859539032, "logps/chosen": -18.988967895507812, "logps/rejected": -2.105252981185913, "loss": 0.8143, "rewards/accuracies": 0.0, "rewards/chosen": 0.3132999539375305, "rewards/margins": -0.028123199939727783, "rewards/rejected": 0.3414231538772583, "step": 8100 }, { "epoch": 1.31, "learning_rate": 5.321769706222867e-07, "logits/chosen": -0.5730111002922058, "logits/rejected": -0.566688060760498, "logps/chosen": -64.5272216796875, "logps/rejected": -10.691411018371582, "loss": 0.3776, "rewards/accuracies": 1.0, "rewards/chosen": 2.266005039215088, "rewards/margins": 1.108159065246582, "rewards/rejected": 1.1578459739685059, "step": 8101 }, { "epoch": 1.32, "learning_rate": 5.320458165436268e-07, "logits/chosen": -0.008627291768789291, "logits/rejected": -0.008627291768789291, "logps/chosen": -17.349836349487305, "logps/rejected": -17.349836349487305, "loss": 0.6826, "rewards/accuracies": 0.0, "rewards/chosen": 0.3447921872138977, "rewards/margins": 0.0, "rewards/rejected": 0.3447921872138977, "step": 8102 }, { "epoch": 1.32, "learning_rate": 5.319146602509041e-07, "logits/chosen": -0.7073670029640198, "logits/rejected": -0.8426237106323242, "logps/chosen": -98.60721588134766, "logps/rejected": -68.30241394042969, "loss": 0.085, "rewards/accuracies": 1.0, "rewards/chosen": 4.592388153076172, "rewards/margins": 1.7155113220214844, "rewards/rejected": 2.8768768310546875, "step": 8103 }, { "epoch": 1.32, "learning_rate": 5.317835017531804e-07, "logits/chosen": -0.5089188814163208, "logits/rejected": -0.5363156199455261, "logps/chosen": -89.98777770996094, "logps/rejected": -82.82807159423828, "loss": 0.8124, "rewards/accuracies": 0.0, "rewards/chosen": 0.5081024169921875, "rewards/margins": -0.6015831232070923, "rewards/rejected": 1.1096855401992798, "step": 8104 }, { "epoch": 1.32, "learning_rate": 5.316523410595177e-07, "logits/chosen": -0.8629344701766968, "logits/rejected": -0.7681341767311096, "logps/chosen": -152.753662109375, "logps/rejected": -53.12322998046875, "loss": 0.7659, "rewards/accuracies": 1.0, "rewards/chosen": 1.0051193237304688, "rewards/margins": 0.19201123714447021, "rewards/rejected": 0.8131080865859985, "step": 8105 }, { "epoch": 1.32, "learning_rate": 5.315211781789775e-07, "logits/chosen": -0.5356162786483765, "logits/rejected": -0.553122341632843, "logps/chosen": -67.74765014648438, "logps/rejected": -46.54841613769531, "loss": 0.6267, "rewards/accuracies": 1.0, "rewards/chosen": 1.0552780628204346, "rewards/margins": 0.27406466007232666, "rewards/rejected": 0.7812134027481079, "step": 8106 }, { "epoch": 1.32, "learning_rate": 5.313900131206221e-07, "logits/chosen": -0.9598780870437622, "logits/rejected": -0.7856923937797546, "logps/chosen": -141.1378173828125, "logps/rejected": -65.74638366699219, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 3.4356415271759033, "rewards/margins": 1.10919189453125, "rewards/rejected": 2.3264496326446533, "step": 8107 }, { "epoch": 1.32, "learning_rate": 5.31258845893514e-07, "logits/chosen": -0.6967220902442932, "logits/rejected": -0.7503308057785034, "logps/chosen": -106.94310760498047, "logps/rejected": -85.9515380859375, "loss": 0.8042, "rewards/accuracies": 0.0, "rewards/chosen": 1.4768654108047485, "rewards/margins": -1.3099128007888794, "rewards/rejected": 2.786778211593628, "step": 8108 }, { "epoch": 1.32, "learning_rate": 5.311276765067153e-07, "logits/chosen": -0.02749749645590782, "logits/rejected": -0.02749749645590782, "logps/chosen": -39.815311431884766, "logps/rejected": -39.815311431884766, "loss": 0.723, "rewards/accuracies": 0.0, "rewards/chosen": 0.41902658343315125, "rewards/margins": 0.0, "rewards/rejected": 0.41902658343315125, "step": 8109 }, { "epoch": 1.32, "learning_rate": 5.309965049692886e-07, "logits/chosen": -0.8754311203956604, "logits/rejected": -0.7734091877937317, "logps/chosen": -85.76686096191406, "logps/rejected": -100.33407592773438, "loss": 0.1339, "rewards/accuracies": 1.0, "rewards/chosen": 4.828698635101318, "rewards/margins": 2.0831899642944336, "rewards/rejected": 2.7455086708068848, "step": 8110 }, { "epoch": 1.32, "learning_rate": 5.308653312902967e-07, "logits/chosen": -0.7444908022880554, "logits/rejected": -0.6098685264587402, "logps/chosen": -165.61073303222656, "logps/rejected": -74.023681640625, "loss": 0.4406, "rewards/accuracies": 1.0, "rewards/chosen": 4.741702556610107, "rewards/margins": 0.6121063232421875, "rewards/rejected": 4.12959623336792, "step": 8111 }, { "epoch": 1.32, "learning_rate": 5.307341554788026e-07, "logits/chosen": -1.2941100597381592, "logits/rejected": -1.2632287740707397, "logps/chosen": -29.699451446533203, "logps/rejected": -93.18579864501953, "loss": 0.8056, "rewards/accuracies": 0.0, "rewards/chosen": 2.0158908367156982, "rewards/margins": -0.026978731155395508, "rewards/rejected": 2.0428695678710938, "step": 8112 }, { "epoch": 1.32, "learning_rate": 5.306029775438692e-07, "logits/chosen": -1.0403008460998535, "logits/rejected": -0.9437646865844727, "logps/chosen": -104.82084655761719, "logps/rejected": -39.1706428527832, "loss": 0.0922, "rewards/accuracies": 1.0, "rewards/chosen": 2.3666000366210938, "rewards/margins": 2.07792592048645, "rewards/rejected": 0.28867417573928833, "step": 8113 }, { "epoch": 1.32, "learning_rate": 5.304717974945595e-07, "logits/chosen": -0.9114769101142883, "logits/rejected": -0.8674965500831604, "logps/chosen": -165.96250915527344, "logps/rejected": -86.53036499023438, "loss": 0.965, "rewards/accuracies": 0.0, "rewards/chosen": 2.9356980323791504, "rewards/margins": -1.6307754516601562, "rewards/rejected": 4.566473484039307, "step": 8114 }, { "epoch": 1.32, "learning_rate": 5.303406153399369e-07, "logits/chosen": -0.7812076210975647, "logits/rejected": -0.4615795612335205, "logps/chosen": -201.06167602539062, "logps/rejected": -12.457348823547363, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 6.195892333984375, "rewards/margins": 5.6563544273376465, "rewards/rejected": 0.5395380854606628, "step": 8115 }, { "epoch": 1.32, "learning_rate": 5.302094310890649e-07, "logits/chosen": -0.7415823340415955, "logits/rejected": -0.7415823340415955, "logps/chosen": -57.36963653564453, "logps/rejected": -57.36963653564453, "loss": 1.7851, "rewards/accuracies": 0.0, "rewards/chosen": 2.3549773693084717, "rewards/margins": 0.0, "rewards/rejected": 2.3549773693084717, "step": 8116 }, { "epoch": 1.32, "learning_rate": 5.300782447510071e-07, "logits/chosen": -1.0930792093276978, "logits/rejected": -1.1048572063446045, "logps/chosen": -111.34342956542969, "logps/rejected": -36.29283142089844, "loss": 1.4526, "rewards/accuracies": 1.0, "rewards/chosen": 0.2823471128940582, "rewards/margins": 0.04355621337890625, "rewards/rejected": 0.23879089951515198, "step": 8117 }, { "epoch": 1.32, "learning_rate": 5.299470563348272e-07, "logits/chosen": -0.8739573955535889, "logits/rejected": -0.8120684027671814, "logps/chosen": -98.19677734375, "logps/rejected": -109.60514831542969, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": 4.709722995758057, "rewards/margins": 2.5656814575195312, "rewards/rejected": 2.1440415382385254, "step": 8118 }, { "epoch": 1.32, "learning_rate": 5.29815865849589e-07, "logits/chosen": -0.9616304636001587, "logits/rejected": -0.9348070025444031, "logps/chosen": -162.1513671875, "logps/rejected": -180.12017822265625, "loss": 1.3928, "rewards/accuracies": 0.0, "rewards/chosen": 0.4780639708042145, "rewards/margins": -0.8795746564865112, "rewards/rejected": 1.3576385974884033, "step": 8119 }, { "epoch": 1.32, "learning_rate": 5.296846733043566e-07, "logits/chosen": -0.9819495677947998, "logits/rejected": -0.999994695186615, "logps/chosen": -113.78755187988281, "logps/rejected": -118.17780303955078, "loss": 0.4822, "rewards/accuracies": 0.0, "rewards/chosen": 0.5754913687705994, "rewards/margins": -0.43729013204574585, "rewards/rejected": 1.0127815008163452, "step": 8120 }, { "epoch": 1.32, "learning_rate": 5.295534787081942e-07, "logits/chosen": -1.1101479530334473, "logits/rejected": -1.028420090675354, "logps/chosen": -87.20631408691406, "logps/rejected": -73.96732330322266, "loss": 0.2625, "rewards/accuracies": 1.0, "rewards/chosen": 3.339399814605713, "rewards/margins": 1.3780999183654785, "rewards/rejected": 1.9612998962402344, "step": 8121 }, { "epoch": 1.32, "learning_rate": 5.29422282070166e-07, "logits/chosen": -0.7910910248756409, "logits/rejected": -0.7863474488258362, "logps/chosen": -178.37301635742188, "logps/rejected": -121.4681167602539, "loss": 1.6098, "rewards/accuracies": 0.0, "rewards/chosen": 0.04023437574505806, "rewards/margins": -3.1699349880218506, "rewards/rejected": 3.2101693153381348, "step": 8122 }, { "epoch": 1.32, "learning_rate": 5.292910833993365e-07, "logits/chosen": -0.9973989129066467, "logits/rejected": -0.9973989129066467, "logps/chosen": -44.25941467285156, "logps/rejected": -44.25941467285156, "loss": 0.8754, "rewards/accuracies": 0.0, "rewards/chosen": 0.40049171447753906, "rewards/margins": 0.0, "rewards/rejected": 0.40049171447753906, "step": 8123 }, { "epoch": 1.32, "learning_rate": 5.291598827047705e-07, "logits/chosen": -0.46758487820625305, "logits/rejected": -0.3883901834487915, "logps/chosen": -69.58558654785156, "logps/rejected": -8.989645004272461, "loss": 0.235, "rewards/accuracies": 1.0, "rewards/chosen": 2.296733856201172, "rewards/margins": 1.2215166091918945, "rewards/rejected": 1.0752172470092773, "step": 8124 }, { "epoch": 1.32, "learning_rate": 5.290286799955323e-07, "logits/chosen": -0.5375180244445801, "logits/rejected": -0.4142239987850189, "logps/chosen": -86.10282135009766, "logps/rejected": -64.42718505859375, "loss": 0.2618, "rewards/accuracies": 1.0, "rewards/chosen": 2.755819082260132, "rewards/margins": 0.47640466690063477, "rewards/rejected": 2.279414415359497, "step": 8125 }, { "epoch": 1.32, "learning_rate": 5.288974752806871e-07, "logits/chosen": -0.07355834543704987, "logits/rejected": -0.08146646618843079, "logps/chosen": -5.471269130706787, "logps/rejected": -1.0810835361480713, "loss": 0.7172, "rewards/accuracies": 0.0, "rewards/chosen": 0.21156688034534454, "rewards/margins": -0.04811002314090729, "rewards/rejected": 0.25967690348625183, "step": 8126 }, { "epoch": 1.32, "learning_rate": 5.287662685692997e-07, "logits/chosen": -0.798509955406189, "logits/rejected": -0.722383439540863, "logps/chosen": -81.30241394042969, "logps/rejected": -41.57895278930664, "loss": 0.8047, "rewards/accuracies": 1.0, "rewards/chosen": 1.9864425659179688, "rewards/margins": 1.7415745258331299, "rewards/rejected": 0.24486808478832245, "step": 8127 }, { "epoch": 1.32, "learning_rate": 5.286350598704353e-07, "logits/chosen": -0.7188993692398071, "logits/rejected": -0.6943610310554504, "logps/chosen": -120.84376525878906, "logps/rejected": -83.747314453125, "loss": 0.3983, "rewards/accuracies": 0.0, "rewards/chosen": 2.9224884510040283, "rewards/margins": -0.11782598495483398, "rewards/rejected": 3.0403144359588623, "step": 8128 }, { "epoch": 1.32, "learning_rate": 5.285038491931593e-07, "logits/chosen": -0.7132517695426941, "logits/rejected": -0.7132517695426941, "logps/chosen": -38.51982116699219, "logps/rejected": -38.51982116699219, "loss": 0.6417, "rewards/accuracies": 0.0, "rewards/chosen": 1.7517441511154175, "rewards/margins": 0.0, "rewards/rejected": 1.7517441511154175, "step": 8129 }, { "epoch": 1.32, "learning_rate": 5.28372636546537e-07, "logits/chosen": -0.6943552494049072, "logits/rejected": -0.585760772228241, "logps/chosen": -115.98966979980469, "logps/rejected": -64.17021942138672, "loss": 0.7237, "rewards/accuracies": 1.0, "rewards/chosen": 5.755117893218994, "rewards/margins": 3.901991367340088, "rewards/rejected": 1.8531265258789062, "step": 8130 }, { "epoch": 1.32, "learning_rate": 5.28241421939634e-07, "logits/chosen": -0.4161950945854187, "logits/rejected": -0.23052266240119934, "logps/chosen": -100.62813568115234, "logps/rejected": -45.775299072265625, "loss": 0.2723, "rewards/accuracies": 1.0, "rewards/chosen": 3.6636712551116943, "rewards/margins": 1.0135178565979004, "rewards/rejected": 2.650153398513794, "step": 8131 }, { "epoch": 1.32, "learning_rate": 5.28110205381516e-07, "logits/chosen": -0.6731173992156982, "logits/rejected": -0.7685021162033081, "logps/chosen": -116.80513000488281, "logps/rejected": -133.9024658203125, "loss": 1.7153, "rewards/accuracies": 0.0, "rewards/chosen": 1.3070220947265625, "rewards/margins": -1.4998290538787842, "rewards/rejected": 2.8068511486053467, "step": 8132 }, { "epoch": 1.32, "learning_rate": 5.279789868812486e-07, "logits/chosen": -0.880847156047821, "logits/rejected": -0.9149045944213867, "logps/chosen": -81.01791381835938, "logps/rejected": -80.63330078125, "loss": 1.754, "rewards/accuracies": 0.0, "rewards/chosen": 1.2174606323242188, "rewards/margins": -0.8893096446990967, "rewards/rejected": 2.1067702770233154, "step": 8133 }, { "epoch": 1.32, "learning_rate": 5.278477664478982e-07, "logits/chosen": -0.6047289371490479, "logits/rejected": -0.5581337809562683, "logps/chosen": -90.17642211914062, "logps/rejected": -59.01288604736328, "loss": 0.8162, "rewards/accuracies": 0.0, "rewards/chosen": 1.070177435874939, "rewards/margins": -0.6367546319961548, "rewards/rejected": 1.7069320678710938, "step": 8134 }, { "epoch": 1.32, "learning_rate": 5.277165440905305e-07, "logits/chosen": -0.5333831906318665, "logits/rejected": -0.5333831906318665, "logps/chosen": -19.299474716186523, "logps/rejected": -19.299474716186523, "loss": 0.7961, "rewards/accuracies": 0.0, "rewards/chosen": 1.5725549459457397, "rewards/margins": 0.0, "rewards/rejected": 1.5725549459457397, "step": 8135 }, { "epoch": 1.32, "learning_rate": 5.275853198182118e-07, "logits/chosen": -1.0356074571609497, "logits/rejected": -1.0608021020889282, "logps/chosen": -107.73895263671875, "logps/rejected": -108.26181030273438, "loss": 1.1836, "rewards/accuracies": 0.0, "rewards/chosen": 0.9989776611328125, "rewards/margins": -1.2969512939453125, "rewards/rejected": 2.295928955078125, "step": 8136 }, { "epoch": 1.32, "learning_rate": 5.274540936400086e-07, "logits/chosen": -0.6195809245109558, "logits/rejected": -0.5301886200904846, "logps/chosen": -61.022369384765625, "logps/rejected": -59.11874771118164, "loss": 1.6896, "rewards/accuracies": 0.0, "rewards/chosen": 1.03703773021698, "rewards/margins": -2.214564323425293, "rewards/rejected": 3.2516019344329834, "step": 8137 }, { "epoch": 1.32, "learning_rate": 5.273228655649873e-07, "logits/chosen": -0.7808557748794556, "logits/rejected": -0.7553796172142029, "logps/chosen": -125.90869140625, "logps/rejected": -91.13874816894531, "loss": 0.6649, "rewards/accuracies": 0.0, "rewards/chosen": 4.038861274719238, "rewards/margins": -0.8863587379455566, "rewards/rejected": 4.925220012664795, "step": 8138 }, { "epoch": 1.32, "learning_rate": 5.271916356022145e-07, "logits/chosen": -0.7779102325439453, "logits/rejected": -0.7313523888587952, "logps/chosen": -201.41244506835938, "logps/rejected": -132.994140625, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 4.5774383544921875, "rewards/margins": 0.8338806629180908, "rewards/rejected": 3.7435576915740967, "step": 8139 }, { "epoch": 1.32, "learning_rate": 5.27060403760757e-07, "logits/chosen": -0.8381991982460022, "logits/rejected": -0.798214316368103, "logps/chosen": -74.21675109863281, "logps/rejected": -60.84002685546875, "loss": 0.9624, "rewards/accuracies": 1.0, "rewards/chosen": 2.6659042835235596, "rewards/margins": 0.7524658441543579, "rewards/rejected": 1.9134384393692017, "step": 8140 }, { "epoch": 1.32, "learning_rate": 5.269291700496816e-07, "logits/chosen": -0.832852840423584, "logits/rejected": -0.8386720418930054, "logps/chosen": -120.2844467163086, "logps/rejected": -95.11320495605469, "loss": 1.4759, "rewards/accuracies": 0.0, "rewards/chosen": 1.2970832586288452, "rewards/margins": -2.8909707069396973, "rewards/rejected": 4.188054084777832, "step": 8141 }, { "epoch": 1.32, "learning_rate": 5.267979344780554e-07, "logits/chosen": -0.5623924136161804, "logits/rejected": -0.5858762264251709, "logps/chosen": -9.55698013305664, "logps/rejected": -32.10791015625, "loss": 0.9054, "rewards/accuracies": 1.0, "rewards/chosen": 0.11884298175573349, "rewards/margins": 0.17369452118873596, "rewards/rejected": -0.054851531982421875, "step": 8142 }, { "epoch": 1.32, "learning_rate": 5.266666970549455e-07, "logits/chosen": -0.9134613871574402, "logits/rejected": -0.9715863466262817, "logps/chosen": -87.96102905273438, "logps/rejected": -100.15596008300781, "loss": 1.7053, "rewards/accuracies": 0.0, "rewards/chosen": 1.1220871210098267, "rewards/margins": -2.6491241455078125, "rewards/rejected": 3.7712113857269287, "step": 8143 }, { "epoch": 1.32, "learning_rate": 5.265354577894191e-07, "logits/chosen": -0.6988217830657959, "logits/rejected": -0.4713289141654968, "logps/chosen": -124.02877044677734, "logps/rejected": -55.64260482788086, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 4.301044464111328, "rewards/margins": 3.1099796295166016, "rewards/rejected": 1.1910648345947266, "step": 8144 }, { "epoch": 1.32, "learning_rate": 5.264042166905437e-07, "logits/chosen": -1.0968232154846191, "logits/rejected": -1.0944747924804688, "logps/chosen": -164.8011016845703, "logps/rejected": -77.13807678222656, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 4.311931133270264, "rewards/margins": 1.96980619430542, "rewards/rejected": 2.3421249389648438, "step": 8145 }, { "epoch": 1.32, "learning_rate": 5.262729737673867e-07, "logits/chosen": -0.6905105710029602, "logits/rejected": -0.7421587705612183, "logps/chosen": -122.38735961914062, "logps/rejected": -150.36767578125, "loss": 1.9586, "rewards/accuracies": 0.0, "rewards/chosen": 0.9468475580215454, "rewards/margins": -3.8640122413635254, "rewards/rejected": 4.810859680175781, "step": 8146 }, { "epoch": 1.32, "learning_rate": 5.261417290290157e-07, "logits/chosen": -0.25856542587280273, "logits/rejected": -0.25856542587280273, "logps/chosen": -54.24626159667969, "logps/rejected": -54.24626159667969, "loss": 0.5241, "rewards/accuracies": 0.0, "rewards/chosen": 1.8789535760879517, "rewards/margins": 0.0, "rewards/rejected": 1.8789535760879517, "step": 8147 }, { "epoch": 1.32, "learning_rate": 5.260104824844989e-07, "logits/chosen": -1.107836365699768, "logits/rejected": -1.0883164405822754, "logps/chosen": -195.4527130126953, "logps/rejected": -58.71875, "loss": 0.9878, "rewards/accuracies": 0.0, "rewards/chosen": 1.7965682744979858, "rewards/margins": -1.6368592977523804, "rewards/rejected": 3.433427572250366, "step": 8148 }, { "epoch": 1.32, "learning_rate": 5.258792341429037e-07, "logits/chosen": -0.7628011107444763, "logits/rejected": -0.741600751876831, "logps/chosen": -53.154876708984375, "logps/rejected": -90.53861999511719, "loss": 0.8615, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134934186935425, "rewards/margins": 0.3673447370529175, "rewards/rejected": 0.646148681640625, "step": 8149 }, { "epoch": 1.32, "learning_rate": 5.257479840132982e-07, "logits/chosen": -0.3965476453304291, "logits/rejected": -0.3976103663444519, "logps/chosen": -8.118661880493164, "logps/rejected": -4.861672401428223, "loss": 0.4315, "rewards/accuracies": 0.0, "rewards/chosen": 0.20764903724193573, "rewards/margins": -0.13859720528125763, "rewards/rejected": 0.34624624252319336, "step": 8150 }, { "epoch": 1.32, "learning_rate": 5.256167321047508e-07, "logits/chosen": -0.9484443068504333, "logits/rejected": -0.9426245093345642, "logps/chosen": -62.295536041259766, "logps/rejected": -67.84606170654297, "loss": 0.7179, "rewards/accuracies": 0.0, "rewards/chosen": 0.8775295615196228, "rewards/margins": -0.09657400846481323, "rewards/rejected": 0.974103569984436, "step": 8151 }, { "epoch": 1.32, "learning_rate": 5.254854784263295e-07, "logits/chosen": -0.14563021063804626, "logits/rejected": -0.14563021063804626, "logps/chosen": -20.839004516601562, "logps/rejected": -20.839004516601562, "loss": 0.534, "rewards/accuracies": 0.0, "rewards/chosen": 0.06805401295423508, "rewards/margins": 0.0, "rewards/rejected": 0.06805401295423508, "step": 8152 }, { "epoch": 1.32, "learning_rate": 5.253542229871028e-07, "logits/chosen": -0.9407328963279724, "logits/rejected": -0.9837672710418701, "logps/chosen": -68.1845932006836, "logps/rejected": -165.83616638183594, "loss": 2.727, "rewards/accuracies": 0.0, "rewards/chosen": 1.2227524518966675, "rewards/margins": -4.158392429351807, "rewards/rejected": 5.381145000457764, "step": 8153 }, { "epoch": 1.32, "learning_rate": 5.252229657961393e-07, "logits/chosen": -0.7452903985977173, "logits/rejected": -0.7390123605728149, "logps/chosen": -105.87744140625, "logps/rejected": -46.205013275146484, "loss": 0.7237, "rewards/accuracies": 0.0, "rewards/chosen": 0.5150405764579773, "rewards/margins": -1.1357877254486084, "rewards/rejected": 1.650828242301941, "step": 8154 }, { "epoch": 1.32, "learning_rate": 5.250917068625074e-07, "logits/chosen": -0.8254496455192566, "logits/rejected": -0.8305613994598389, "logps/chosen": -112.03943634033203, "logps/rejected": -39.63371276855469, "loss": 1.7919, "rewards/accuracies": 1.0, "rewards/chosen": 3.310994863510132, "rewards/margins": 0.1565713882446289, "rewards/rejected": 3.154423475265503, "step": 8155 }, { "epoch": 1.32, "learning_rate": 5.24960446195276e-07, "logits/chosen": -0.703025758266449, "logits/rejected": -0.571989119052887, "logps/chosen": -130.12539672851562, "logps/rejected": -118.26335144042969, "loss": 0.0828, "rewards/accuracies": 1.0, "rewards/chosen": 7.888302803039551, "rewards/margins": 2.6286301612854004, "rewards/rejected": 5.25967264175415, "step": 8156 }, { "epoch": 1.32, "learning_rate": 5.24829183803514e-07, "logits/chosen": -0.6923469305038452, "logits/rejected": -0.7939802408218384, "logps/chosen": -74.32774353027344, "logps/rejected": -124.2863540649414, "loss": 1.3206, "rewards/accuracies": 0.0, "rewards/chosen": 2.4985673427581787, "rewards/margins": -2.543935537338257, "rewards/rejected": 5.0425028800964355, "step": 8157 }, { "epoch": 1.32, "learning_rate": 5.246979196962903e-07, "logits/chosen": -0.8240055441856384, "logits/rejected": -0.7282643914222717, "logps/chosen": -106.98860931396484, "logps/rejected": -35.31249237060547, "loss": 0.1165, "rewards/accuracies": 1.0, "rewards/chosen": 1.7369216680526733, "rewards/margins": 1.458012342453003, "rewards/rejected": 0.278909295797348, "step": 8158 }, { "epoch": 1.32, "learning_rate": 5.245666538826741e-07, "logits/chosen": -1.0117441415786743, "logits/rejected": -1.0093598365783691, "logps/chosen": -73.8947525024414, "logps/rejected": -77.96034240722656, "loss": 0.2683, "rewards/accuracies": 1.0, "rewards/chosen": 1.2239677906036377, "rewards/margins": 0.3736511468887329, "rewards/rejected": 0.8503166437149048, "step": 8159 }, { "epoch": 1.32, "learning_rate": 5.244353863717345e-07, "logits/chosen": -0.44564422965049744, "logits/rejected": -0.6080199480056763, "logps/chosen": -155.944091796875, "logps/rejected": -101.44566345214844, "loss": 0.4899, "rewards/accuracies": 0.0, "rewards/chosen": 3.5436432361602783, "rewards/margins": -0.1411299705505371, "rewards/rejected": 3.6847732067108154, "step": 8160 }, { "epoch": 1.32, "learning_rate": 5.243041171725409e-07, "logits/chosen": -0.4922322630882263, "logits/rejected": -0.5444031953811646, "logps/chosen": -77.27412414550781, "logps/rejected": -44.986412048339844, "loss": 1.0201, "rewards/accuracies": 0.0, "rewards/chosen": 1.5495026111602783, "rewards/margins": -0.2582789659500122, "rewards/rejected": 1.8077815771102905, "step": 8161 }, { "epoch": 1.32, "learning_rate": 5.24172846294163e-07, "logits/chosen": -0.45612406730651855, "logits/rejected": -0.45612406730651855, "logps/chosen": -73.25595092773438, "logps/rejected": -73.25595092773438, "loss": 0.3777, "rewards/accuracies": 0.0, "rewards/chosen": 2.419232130050659, "rewards/margins": 0.0, "rewards/rejected": 2.419232130050659, "step": 8162 }, { "epoch": 1.32, "learning_rate": 5.240415737456698e-07, "logits/chosen": -0.5812889933586121, "logits/rejected": -0.4581993818283081, "logps/chosen": -53.96247863769531, "logps/rejected": -16.09893226623535, "loss": 0.2893, "rewards/accuracies": 1.0, "rewards/chosen": 1.5184334516525269, "rewards/margins": 1.0796303749084473, "rewards/rejected": 0.438803106546402, "step": 8163 }, { "epoch": 1.33, "learning_rate": 5.239102995361316e-07, "logits/chosen": -0.7898409366607666, "logits/rejected": -0.7939640283584595, "logps/chosen": -105.57330322265625, "logps/rejected": -83.28248596191406, "loss": 1.7825, "rewards/accuracies": 0.0, "rewards/chosen": 0.6608161926269531, "rewards/margins": -2.649094343185425, "rewards/rejected": 3.309910535812378, "step": 8164 }, { "epoch": 1.33, "learning_rate": 5.237790236746178e-07, "logits/chosen": -0.43251940608024597, "logits/rejected": -0.4300888180732727, "logps/chosen": -84.61044311523438, "logps/rejected": -66.15856170654297, "loss": 0.7332, "rewards/accuracies": 0.0, "rewards/chosen": 1.0035308599472046, "rewards/margins": -0.33294451236724854, "rewards/rejected": 1.3364753723144531, "step": 8165 }, { "epoch": 1.33, "learning_rate": 5.236477461701985e-07, "logits/chosen": -0.7413782477378845, "logits/rejected": -0.7334666848182678, "logps/chosen": -99.40454864501953, "logps/rejected": -109.74320983886719, "loss": 1.9946, "rewards/accuracies": 0.0, "rewards/chosen": 1.6361640691757202, "rewards/margins": -3.0504159927368164, "rewards/rejected": 4.686580181121826, "step": 8166 }, { "epoch": 1.33, "learning_rate": 5.235164670319436e-07, "logits/chosen": -0.9214625954627991, "logits/rejected": -0.8948012590408325, "logps/chosen": -48.43971252441406, "logps/rejected": -105.81324768066406, "loss": 0.7662, "rewards/accuracies": 0.0, "rewards/chosen": 1.8577369451522827, "rewards/margins": -0.7601662874221802, "rewards/rejected": 2.617903232574463, "step": 8167 }, { "epoch": 1.33, "learning_rate": 5.233851862689235e-07, "logits/chosen": -0.8931344151496887, "logits/rejected": -0.8974744081497192, "logps/chosen": -68.06216430664062, "logps/rejected": -150.4855499267578, "loss": 0.1642, "rewards/accuracies": 1.0, "rewards/chosen": 1.5061172246932983, "rewards/margins": 1.3027770519256592, "rewards/rejected": 0.20334015786647797, "step": 8168 }, { "epoch": 1.33, "learning_rate": 5.232539038902082e-07, "logits/chosen": -0.8713972568511963, "logits/rejected": -1.0050824880599976, "logps/chosen": -253.53997802734375, "logps/rejected": -140.0357208251953, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 6.132092475891113, "rewards/margins": 0.22257566452026367, "rewards/rejected": 5.90951681137085, "step": 8169 }, { "epoch": 1.33, "learning_rate": 5.231226199048682e-07, "logits/chosen": -0.7430261969566345, "logits/rejected": -0.7154476046562195, "logps/chosen": -34.258689880371094, "logps/rejected": -48.346763610839844, "loss": 0.823, "rewards/accuracies": 0.0, "rewards/chosen": 0.5294575095176697, "rewards/margins": -1.1736118793487549, "rewards/rejected": 1.7030693292617798, "step": 8170 }, { "epoch": 1.33, "learning_rate": 5.22991334321974e-07, "logits/chosen": -0.5468527674674988, "logits/rejected": -0.5808361172676086, "logps/chosen": -25.43882179260254, "logps/rejected": -65.51177978515625, "loss": 0.4852, "rewards/accuracies": 1.0, "rewards/chosen": 0.6079134345054626, "rewards/margins": 0.425584614276886, "rewards/rejected": 0.18232880532741547, "step": 8171 }, { "epoch": 1.33, "learning_rate": 5.22860047150596e-07, "logits/chosen": -0.9398656487464905, "logits/rejected": -0.8871515393257141, "logps/chosen": -59.224952697753906, "logps/rejected": -69.54308319091797, "loss": 0.567, "rewards/accuracies": 0.0, "rewards/chosen": 1.2577370405197144, "rewards/margins": -0.7086226940155029, "rewards/rejected": 1.9663597345352173, "step": 8172 }, { "epoch": 1.33, "learning_rate": 5.227287583998051e-07, "logits/chosen": -0.4516828656196594, "logits/rejected": -0.4516828656196594, "logps/chosen": -63.061012268066406, "logps/rejected": -63.061012268066406, "loss": 1.2786, "rewards/accuracies": 0.0, "rewards/chosen": 0.7871200442314148, "rewards/margins": 0.0, "rewards/rejected": 0.7871200442314148, "step": 8173 }, { "epoch": 1.33, "learning_rate": 5.22597468078672e-07, "logits/chosen": -0.5019063353538513, "logits/rejected": -0.5068643093109131, "logps/chosen": -60.66675567626953, "logps/rejected": -101.61673736572266, "loss": 2.2077, "rewards/accuracies": 0.0, "rewards/chosen": 1.4224189519882202, "rewards/margins": -3.4630632400512695, "rewards/rejected": 4.885482311248779, "step": 8174 }, { "epoch": 1.33, "learning_rate": 5.224661761962679e-07, "logits/chosen": -0.7480717301368713, "logits/rejected": -0.7610934376716614, "logps/chosen": -46.04740905761719, "logps/rejected": -134.06997680664062, "loss": 1.8861, "rewards/accuracies": 0.0, "rewards/chosen": 1.1855376958847046, "rewards/margins": -2.8227462768554688, "rewards/rejected": 4.008284091949463, "step": 8175 }, { "epoch": 1.33, "learning_rate": 5.223348827616634e-07, "logits/chosen": -0.639934241771698, "logits/rejected": -0.6167973279953003, "logps/chosen": -72.19740295410156, "logps/rejected": -51.06955337524414, "loss": 0.4718, "rewards/accuracies": 1.0, "rewards/chosen": 2.1080148220062256, "rewards/margins": 0.0180356502532959, "rewards/rejected": 2.0899791717529297, "step": 8176 }, { "epoch": 1.33, "learning_rate": 5.222035877839299e-07, "logits/chosen": -0.4809471070766449, "logits/rejected": -0.5005602240562439, "logps/chosen": -89.92453002929688, "logps/rejected": -93.79641723632812, "loss": 1.712, "rewards/accuracies": 0.0, "rewards/chosen": 1.3946609497070312, "rewards/margins": -2.9198641777038574, "rewards/rejected": 4.314525127410889, "step": 8177 }, { "epoch": 1.33, "learning_rate": 5.220722912721386e-07, "logits/chosen": -0.21105198562145233, "logits/rejected": -0.21685075759887695, "logps/chosen": -1.4878323078155518, "logps/rejected": -2.1946253776550293, "loss": 0.3613, "rewards/accuracies": 0.0, "rewards/chosen": 0.202663853764534, "rewards/margins": -0.051267191767692566, "rewards/rejected": 0.25393104553222656, "step": 8178 }, { "epoch": 1.33, "learning_rate": 5.219409932353608e-07, "logits/chosen": -0.8049653172492981, "logits/rejected": -0.763230562210083, "logps/chosen": -115.53137969970703, "logps/rejected": -41.896202087402344, "loss": 1.1728, "rewards/accuracies": 0.0, "rewards/chosen": -0.047556307166814804, "rewards/margins": -2.0482065677642822, "rewards/rejected": 2.00065016746521, "step": 8179 }, { "epoch": 1.33, "learning_rate": 5.21809693682668e-07, "logits/chosen": -0.6767221093177795, "logits/rejected": -0.6767221093177795, "logps/chosen": -60.82981491088867, "logps/rejected": -60.82981491088867, "loss": 0.4837, "rewards/accuracies": 0.0, "rewards/chosen": 2.587904691696167, "rewards/margins": 0.0, "rewards/rejected": 2.587904691696167, "step": 8180 }, { "epoch": 1.33, "learning_rate": 5.216783926231317e-07, "logits/chosen": -0.6608660221099854, "logits/rejected": -0.6636486053466797, "logps/chosen": -127.53425598144531, "logps/rejected": -152.45297241210938, "loss": 2.5276, "rewards/accuracies": 0.0, "rewards/chosen": 0.8452163934707642, "rewards/margins": -4.273410320281982, "rewards/rejected": 5.118626594543457, "step": 8181 }, { "epoch": 1.33, "learning_rate": 5.215470900658236e-07, "logits/chosen": -0.9322360754013062, "logits/rejected": -0.8555155396461487, "logps/chosen": -56.286720275878906, "logps/rejected": -129.733642578125, "loss": 0.2002, "rewards/accuracies": 1.0, "rewards/chosen": 1.846594214439392, "rewards/margins": 0.8656554818153381, "rewards/rejected": 0.980938732624054, "step": 8182 }, { "epoch": 1.33, "learning_rate": 5.214157860198155e-07, "logits/chosen": -0.7605834603309631, "logits/rejected": -0.7714683413505554, "logps/chosen": -79.11128234863281, "logps/rejected": -39.21364212036133, "loss": 1.0953, "rewards/accuracies": 0.0, "rewards/chosen": 0.8473892211914062, "rewards/margins": -1.0531338453292847, "rewards/rejected": 1.900523066520691, "step": 8183 }, { "epoch": 1.33, "learning_rate": 5.212844804941792e-07, "logits/chosen": -0.7468059659004211, "logits/rejected": -0.46578529477119446, "logps/chosen": -141.2847900390625, "logps/rejected": -47.989585876464844, "loss": 0.2049, "rewards/accuracies": 1.0, "rewards/chosen": 1.38262939453125, "rewards/margins": 0.8947509527206421, "rewards/rejected": 0.4878784120082855, "step": 8184 }, { "epoch": 1.33, "learning_rate": 5.211531734979866e-07, "logits/chosen": -0.9137223362922668, "logits/rejected": -0.6696138381958008, "logps/chosen": -104.22929382324219, "logps/rejected": -167.2760009765625, "loss": 1.4056, "rewards/accuracies": 0.0, "rewards/chosen": 1.78656005859375, "rewards/margins": -2.5361571311950684, "rewards/rejected": 4.322717189788818, "step": 8185 }, { "epoch": 1.33, "learning_rate": 5.2102186504031e-07, "logits/chosen": -0.7313160300254822, "logits/rejected": -0.6693240404129028, "logps/chosen": -104.61522674560547, "logps/rejected": -61.0590934753418, "loss": 1.4119, "rewards/accuracies": 0.0, "rewards/chosen": 0.5565162897109985, "rewards/margins": -0.44033241271972656, "rewards/rejected": 0.9968487024307251, "step": 8186 }, { "epoch": 1.33, "learning_rate": 5.208905551302214e-07, "logits/chosen": -0.6067367196083069, "logits/rejected": -0.5448036789894104, "logps/chosen": -104.80352020263672, "logps/rejected": -41.743255615234375, "loss": 0.4364, "rewards/accuracies": 0.0, "rewards/chosen": 1.0479965209960938, "rewards/margins": -0.28827977180480957, "rewards/rejected": 1.3362762928009033, "step": 8187 }, { "epoch": 1.33, "learning_rate": 5.207592437767931e-07, "logits/chosen": -1.209489345550537, "logits/rejected": -1.2211154699325562, "logps/chosen": -127.69823455810547, "logps/rejected": -19.520530700683594, "loss": 0.0199, "rewards/accuracies": 1.0, "rewards/chosen": 6.195359230041504, "rewards/margins": 5.729077339172363, "rewards/rejected": 0.4662818908691406, "step": 8188 }, { "epoch": 1.33, "learning_rate": 5.206279309890975e-07, "logits/chosen": -0.8948560357093811, "logits/rejected": -0.9033029675483704, "logps/chosen": -239.6815948486328, "logps/rejected": -124.03289794921875, "loss": 0.1787, "rewards/accuracies": 1.0, "rewards/chosen": 5.289830207824707, "rewards/margins": 1.034287929534912, "rewards/rejected": 4.255542278289795, "step": 8189 }, { "epoch": 1.33, "learning_rate": 5.20496616776207e-07, "logits/chosen": -0.8192173838615417, "logits/rejected": -0.7855483293533325, "logps/chosen": -72.37946319580078, "logps/rejected": -39.46994400024414, "loss": 0.9774, "rewards/accuracies": 1.0, "rewards/chosen": 1.628008246421814, "rewards/margins": 1.3328498601913452, "rewards/rejected": 0.29515838623046875, "step": 8190 }, { "epoch": 1.33, "learning_rate": 5.203653011471943e-07, "logits/chosen": -0.70771324634552, "logits/rejected": -0.7212669253349304, "logps/chosen": -95.942138671875, "logps/rejected": -113.49082946777344, "loss": 0.9122, "rewards/accuracies": 0.0, "rewards/chosen": 3.9082794189453125, "rewards/margins": -1.500767707824707, "rewards/rejected": 5.4090471267700195, "step": 8191 }, { "epoch": 1.33, "learning_rate": 5.202339841111318e-07, "logits/chosen": -0.9679073691368103, "logits/rejected": -1.0070635080337524, "logps/chosen": -185.52854919433594, "logps/rejected": -49.403778076171875, "loss": 0.0797, "rewards/accuracies": 1.0, "rewards/chosen": 3.805973768234253, "rewards/margins": 2.354915142059326, "rewards/rejected": 1.4510586261749268, "step": 8192 }, { "epoch": 1.33, "learning_rate": 5.201026656770926e-07, "logits/chosen": -0.981826663017273, "logits/rejected": -1.0005098581314087, "logps/chosen": -68.16500091552734, "logps/rejected": -86.32636260986328, "loss": 0.9321, "rewards/accuracies": 0.0, "rewards/chosen": 1.6503394842147827, "rewards/margins": -0.3302643299102783, "rewards/rejected": 1.980603814125061, "step": 8193 }, { "epoch": 1.33, "learning_rate": 5.199713458541495e-07, "logits/chosen": -0.5113774538040161, "logits/rejected": -0.5287536382675171, "logps/chosen": -17.090574264526367, "logps/rejected": -57.009361267089844, "loss": 0.4816, "rewards/accuracies": 0.0, "rewards/chosen": 0.12919273972511292, "rewards/margins": -0.19452476501464844, "rewards/rejected": 0.32371750473976135, "step": 8194 }, { "epoch": 1.33, "learning_rate": 5.198400246513752e-07, "logits/chosen": -0.27028197050094604, "logits/rejected": -0.27028197050094604, "logps/chosen": -43.4310302734375, "logps/rejected": -43.4310302734375, "loss": 0.9938, "rewards/accuracies": 0.0, "rewards/chosen": -0.03010406531393528, "rewards/margins": 0.0, "rewards/rejected": -0.03010406531393528, "step": 8195 }, { "epoch": 1.33, "learning_rate": 5.197087020778431e-07, "logits/chosen": -0.6927579045295715, "logits/rejected": -0.6765622496604919, "logps/chosen": -99.72956848144531, "logps/rejected": -77.68464660644531, "loss": 0.9393, "rewards/accuracies": 0.0, "rewards/chosen": 0.6098160147666931, "rewards/margins": -1.1214141845703125, "rewards/rejected": 1.7312301397323608, "step": 8196 }, { "epoch": 1.33, "learning_rate": 5.19577378142626e-07, "logits/chosen": 0.0009692244348116219, "logits/rejected": -0.00043451791862025857, "logps/chosen": -5.069617748260498, "logps/rejected": -3.8867127895355225, "loss": 0.8769, "rewards/accuracies": 1.0, "rewards/chosen": 0.10116267204284668, "rewards/margins": 0.014422155916690826, "rewards/rejected": 0.08674051612615585, "step": 8197 }, { "epoch": 1.33, "learning_rate": 5.194460528547974e-07, "logits/chosen": -0.8272279500961304, "logits/rejected": -0.8011476993560791, "logps/chosen": -80.31967163085938, "logps/rejected": -55.768890380859375, "loss": 0.6758, "rewards/accuracies": 0.0, "rewards/chosen": 2.009478807449341, "rewards/margins": -1.0458037853240967, "rewards/rejected": 3.0552825927734375, "step": 8198 }, { "epoch": 1.33, "learning_rate": 5.193147262234306e-07, "logits/chosen": -0.4301270842552185, "logits/rejected": -0.36752811074256897, "logps/chosen": -70.37147521972656, "logps/rejected": -40.65018844604492, "loss": 0.2237, "rewards/accuracies": 1.0, "rewards/chosen": 1.0815582275390625, "rewards/margins": 1.000888466835022, "rewards/rejected": 0.08066978305578232, "step": 8199 }, { "epoch": 1.33, "learning_rate": 5.191833982575989e-07, "logits/chosen": -0.4464561343193054, "logits/rejected": -0.278766006231308, "logps/chosen": -62.71122741699219, "logps/rejected": -31.27922821044922, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": 1.3465805053710938, "rewards/margins": 1.0533519983291626, "rewards/rejected": 0.29322853684425354, "step": 8200 }, { "epoch": 1.33, "learning_rate": 5.190520689663759e-07, "logits/chosen": -0.7064325213432312, "logits/rejected": -0.7603736519813538, "logps/chosen": -61.30665969848633, "logps/rejected": -126.95848083496094, "loss": 1.7548, "rewards/accuracies": 0.0, "rewards/chosen": 1.1828136444091797, "rewards/margins": -2.796344518661499, "rewards/rejected": 3.9791581630706787, "step": 8201 }, { "epoch": 1.33, "learning_rate": 5.189207383588352e-07, "logits/chosen": -0.21601544320583344, "logits/rejected": -0.14513078331947327, "logps/chosen": -37.95095443725586, "logps/rejected": -46.611549377441406, "loss": 0.4607, "rewards/accuracies": 1.0, "rewards/chosen": 1.785464882850647, "rewards/margins": 0.029345273971557617, "rewards/rejected": 1.7561196088790894, "step": 8202 }, { "epoch": 1.33, "learning_rate": 5.187894064440505e-07, "logits/chosen": -0.3341723382472992, "logits/rejected": -0.30561164021492004, "logps/chosen": -141.93368530273438, "logps/rejected": -112.46647644042969, "loss": 0.474, "rewards/accuracies": 0.0, "rewards/chosen": 4.9969940185546875, "rewards/margins": -0.018756389617919922, "rewards/rejected": 5.015750408172607, "step": 8203 }, { "epoch": 1.33, "learning_rate": 5.186580732310955e-07, "logits/chosen": -0.6000611186027527, "logits/rejected": -0.5061767101287842, "logps/chosen": -109.93511199951172, "logps/rejected": -17.045595169067383, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 1.488929033279419, "rewards/margins": 1.2429792881011963, "rewards/rejected": 0.24594974517822266, "step": 8204 }, { "epoch": 1.33, "learning_rate": 5.185267387290444e-07, "logits/chosen": -0.6983455419540405, "logits/rejected": -0.708901584148407, "logps/chosen": -128.8076934814453, "logps/rejected": -138.95086669921875, "loss": 0.6094, "rewards/accuracies": 0.0, "rewards/chosen": 0.33592531085014343, "rewards/margins": -0.8130645751953125, "rewards/rejected": 1.1489899158477783, "step": 8205 }, { "epoch": 1.33, "learning_rate": 5.183954029469709e-07, "logits/chosen": -0.5438796281814575, "logits/rejected": -0.4631763696670532, "logps/chosen": -114.65593719482422, "logps/rejected": -76.37948608398438, "loss": 0.3736, "rewards/accuracies": 1.0, "rewards/chosen": 4.509127140045166, "rewards/margins": 2.1977570056915283, "rewards/rejected": 2.3113701343536377, "step": 8206 }, { "epoch": 1.33, "learning_rate": 5.18264065893949e-07, "logits/chosen": -0.8046992421150208, "logits/rejected": -0.8163456320762634, "logps/chosen": -145.3358154296875, "logps/rejected": -126.31414794921875, "loss": 0.3541, "rewards/accuracies": 0.0, "rewards/chosen": 4.168501377105713, "rewards/margins": -0.02425670623779297, "rewards/rejected": 4.192758083343506, "step": 8207 }, { "epoch": 1.33, "learning_rate": 5.181327275790531e-07, "logits/chosen": -0.8303241729736328, "logits/rejected": -0.837067186832428, "logps/chosen": -117.88093566894531, "logps/rejected": -145.15269470214844, "loss": 0.5642, "rewards/accuracies": 1.0, "rewards/chosen": 4.799064636230469, "rewards/margins": 0.3338165283203125, "rewards/rejected": 4.465248107910156, "step": 8208 }, { "epoch": 1.33, "learning_rate": 5.180013880113571e-07, "logits/chosen": -0.8075168132781982, "logits/rejected": -0.7586972713470459, "logps/chosen": -103.37406921386719, "logps/rejected": -132.1357421875, "loss": 1.0057, "rewards/accuracies": 0.0, "rewards/chosen": 1.146209716796875, "rewards/margins": -0.6888504028320312, "rewards/rejected": 1.8350601196289062, "step": 8209 }, { "epoch": 1.33, "learning_rate": 5.178700471999357e-07, "logits/chosen": -1.0239348411560059, "logits/rejected": -0.9459797739982605, "logps/chosen": -77.67448425292969, "logps/rejected": -12.874743461608887, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": 2.7373757362365723, "rewards/margins": 2.0219035148620605, "rewards/rejected": 0.7154723405838013, "step": 8210 }, { "epoch": 1.33, "learning_rate": 5.17738705153863e-07, "logits/chosen": -0.7883105874061584, "logits/rejected": -0.7679244875907898, "logps/chosen": -42.271148681640625, "logps/rejected": -60.161930084228516, "loss": 0.6451, "rewards/accuracies": 0.0, "rewards/chosen": 1.467734932899475, "rewards/margins": -0.26069867610931396, "rewards/rejected": 1.728433609008789, "step": 8211 }, { "epoch": 1.33, "learning_rate": 5.176073618822138e-07, "logits/chosen": -0.6873326897621155, "logits/rejected": -0.6873326897621155, "logps/chosen": -31.92156982421875, "logps/rejected": -31.92156982421875, "loss": 1.1971, "rewards/accuracies": 0.0, "rewards/chosen": 0.3216361999511719, "rewards/margins": 0.0, "rewards/rejected": 0.3216361999511719, "step": 8212 }, { "epoch": 1.33, "learning_rate": 5.174760173940624e-07, "logits/chosen": -0.7594340443611145, "logits/rejected": -0.7826143503189087, "logps/chosen": -246.9594268798828, "logps/rejected": -65.5445785522461, "loss": 0.4987, "rewards/accuracies": 1.0, "rewards/chosen": 3.262590169906616, "rewards/margins": 0.8960747718811035, "rewards/rejected": 2.3665153980255127, "step": 8213 }, { "epoch": 1.33, "learning_rate": 5.173446716984836e-07, "logits/chosen": -0.787024736404419, "logits/rejected": -0.7618715763092041, "logps/chosen": -54.74700164794922, "logps/rejected": -69.03369903564453, "loss": 0.6296, "rewards/accuracies": 0.0, "rewards/chosen": 1.0217125415802002, "rewards/margins": -0.9044090509414673, "rewards/rejected": 1.9261215925216675, "step": 8214 }, { "epoch": 1.33, "learning_rate": 5.172133248045521e-07, "logits/chosen": -0.731486439704895, "logits/rejected": -0.7207993268966675, "logps/chosen": -87.57614135742188, "logps/rejected": -118.41470336914062, "loss": 0.1887, "rewards/accuracies": 1.0, "rewards/chosen": 3.8607406616210938, "rewards/margins": 1.2034118175506592, "rewards/rejected": 2.6573288440704346, "step": 8215 }, { "epoch": 1.33, "learning_rate": 5.170819767213427e-07, "logits/chosen": -0.5188378691673279, "logits/rejected": -0.4988279342651367, "logps/chosen": -29.21758460998535, "logps/rejected": -26.723527908325195, "loss": 0.641, "rewards/accuracies": 1.0, "rewards/chosen": 0.3079465925693512, "rewards/margins": 0.14984703063964844, "rewards/rejected": 0.15809956192970276, "step": 8216 }, { "epoch": 1.33, "learning_rate": 5.169506274579303e-07, "logits/chosen": -0.7062773704528809, "logits/rejected": -0.7688794136047363, "logps/chosen": -70.07698059082031, "logps/rejected": -133.0674285888672, "loss": 1.5431, "rewards/accuracies": 0.0, "rewards/chosen": 1.833757758140564, "rewards/margins": -2.8980889320373535, "rewards/rejected": 4.731846809387207, "step": 8217 }, { "epoch": 1.33, "learning_rate": 5.1681927702339e-07, "logits/chosen": -0.8164694309234619, "logits/rejected": -0.49840205907821655, "logps/chosen": -97.64070129394531, "logps/rejected": -172.28903198242188, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/chosen": 5.775821208953857, "rewards/margins": 0.7243013381958008, "rewards/rejected": 5.051519870758057, "step": 8218 }, { "epoch": 1.33, "learning_rate": 5.166879254267967e-07, "logits/chosen": -0.8774124383926392, "logits/rejected": -0.8903597593307495, "logps/chosen": -277.9461669921875, "logps/rejected": -109.73882293701172, "loss": 1.0614, "rewards/accuracies": 0.0, "rewards/chosen": 4.521826267242432, "rewards/margins": -0.016564369201660156, "rewards/rejected": 4.538390636444092, "step": 8219 }, { "epoch": 1.33, "learning_rate": 5.165565726772258e-07, "logits/chosen": -0.7898944616317749, "logits/rejected": -0.6403130888938904, "logps/chosen": -66.86210632324219, "logps/rejected": -18.966796875, "loss": 0.1597, "rewards/accuracies": 1.0, "rewards/chosen": 1.8734444379806519, "rewards/margins": 1.6148710250854492, "rewards/rejected": 0.25857335329055786, "step": 8220 }, { "epoch": 1.33, "learning_rate": 5.164252187837522e-07, "logits/chosen": -0.5677834153175354, "logits/rejected": -0.6545484662055969, "logps/chosen": -68.94574737548828, "logps/rejected": -103.67189025878906, "loss": 1.4412, "rewards/accuracies": 0.0, "rewards/chosen": 2.4618377685546875, "rewards/margins": -2.5656752586364746, "rewards/rejected": 5.027513027191162, "step": 8221 }, { "epoch": 1.33, "learning_rate": 5.162938637554515e-07, "logits/chosen": -0.5696568489074707, "logits/rejected": -0.41872572898864746, "logps/chosen": -79.61992645263672, "logps/rejected": -14.043970108032227, "loss": 0.5457, "rewards/accuracies": 1.0, "rewards/chosen": 1.4356011152267456, "rewards/margins": 1.0544079542160034, "rewards/rejected": 0.3811931610107422, "step": 8222 }, { "epoch": 1.33, "learning_rate": 5.161625076013991e-07, "logits/chosen": -0.7125281691551208, "logits/rejected": -0.7583847641944885, "logps/chosen": -88.31976318359375, "logps/rejected": -35.848480224609375, "loss": 0.0735, "rewards/accuracies": 1.0, "rewards/chosen": 3.6191070079803467, "rewards/margins": 2.1237263679504395, "rewards/rejected": 1.4953807592391968, "step": 8223 }, { "epoch": 1.33, "learning_rate": 5.160311503306703e-07, "logits/chosen": -0.7982616424560547, "logits/rejected": -0.7819429636001587, "logps/chosen": -44.824851989746094, "logps/rejected": -69.13177490234375, "loss": 1.5531, "rewards/accuracies": 0.0, "rewards/chosen": 0.9632625579833984, "rewards/margins": -1.6664044857025146, "rewards/rejected": 2.629667043685913, "step": 8224 }, { "epoch": 1.34, "learning_rate": 5.158997919523405e-07, "logits/chosen": -0.6691290736198425, "logits/rejected": -0.6584924459457397, "logps/chosen": -115.12239074707031, "logps/rejected": -77.91653442382812, "loss": 1.7155, "rewards/accuracies": 0.0, "rewards/chosen": 0.6464653015136719, "rewards/margins": -1.9240891933441162, "rewards/rejected": 2.570554494857788, "step": 8225 }, { "epoch": 1.34, "learning_rate": 5.157684324754857e-07, "logits/chosen": -0.6240676045417786, "logits/rejected": -0.6291615962982178, "logps/chosen": -7.1892595291137695, "logps/rejected": -9.252666473388672, "loss": 1.2034, "rewards/accuracies": 1.0, "rewards/chosen": 0.13131876289844513, "rewards/margins": 0.10467157512903214, "rewards/rejected": 0.026647185906767845, "step": 8226 }, { "epoch": 1.34, "learning_rate": 5.156370719091814e-07, "logits/chosen": -1.0097540616989136, "logits/rejected": -0.90292888879776, "logps/chosen": -120.34192657470703, "logps/rejected": -29.918203353881836, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 1.0036872625350952, "rewards/margins": 0.827829897403717, "rewards/rejected": 0.17585735023021698, "step": 8227 }, { "epoch": 1.34, "learning_rate": 5.155057102625035e-07, "logits/chosen": -0.5162056684494019, "logits/rejected": -0.41055095195770264, "logps/chosen": -54.98981857299805, "logps/rejected": -48.22370147705078, "loss": 0.4115, "rewards/accuracies": 0.0, "rewards/chosen": 2.222111940383911, "rewards/margins": -0.21761512756347656, "rewards/rejected": 2.4397270679473877, "step": 8228 }, { "epoch": 1.34, "learning_rate": 5.153743475445276e-07, "logits/chosen": -0.7324753999710083, "logits/rejected": -0.7057167291641235, "logps/chosen": -99.69886779785156, "logps/rejected": -62.966224670410156, "loss": 1.33, "rewards/accuracies": 1.0, "rewards/chosen": 1.8286590576171875, "rewards/margins": 0.8311362862586975, "rewards/rejected": 0.99752277135849, "step": 8229 }, { "epoch": 1.34, "learning_rate": 5.152429837643297e-07, "logits/chosen": -0.589322566986084, "logits/rejected": -0.589322566986084, "logps/chosen": -25.837154388427734, "logps/rejected": -25.837154388427734, "loss": 0.359, "rewards/accuracies": 0.0, "rewards/chosen": 1.097333550453186, "rewards/margins": 0.0, "rewards/rejected": 1.097333550453186, "step": 8230 }, { "epoch": 1.34, "learning_rate": 5.151116189309859e-07, "logits/chosen": -0.6052852272987366, "logits/rejected": -0.6052852272987366, "logps/chosen": -26.818897247314453, "logps/rejected": -26.818897247314453, "loss": 0.3579, "rewards/accuracies": 0.0, "rewards/chosen": 1.655435562133789, "rewards/margins": 0.0, "rewards/rejected": 1.655435562133789, "step": 8231 }, { "epoch": 1.34, "learning_rate": 5.149802530535723e-07, "logits/chosen": -0.4610324203968048, "logits/rejected": -0.505891740322113, "logps/chosen": -80.9701156616211, "logps/rejected": -69.67051696777344, "loss": 1.5185, "rewards/accuracies": 0.0, "rewards/chosen": 1.6468498706817627, "rewards/margins": -0.9150130748748779, "rewards/rejected": 2.5618629455566406, "step": 8232 }, { "epoch": 1.34, "learning_rate": 5.148488861411648e-07, "logits/chosen": -0.6983985304832458, "logits/rejected": -0.6573786735534668, "logps/chosen": -55.238807678222656, "logps/rejected": -100.11054229736328, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": 3.5805702209472656, "rewards/margins": 0.6622116565704346, "rewards/rejected": 2.918358564376831, "step": 8233 }, { "epoch": 1.34, "learning_rate": 5.1471751820284e-07, "logits/chosen": -0.12859587371349335, "logits/rejected": -0.16765320301055908, "logps/chosen": -19.66326141357422, "logps/rejected": -47.75924301147461, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": -0.01248016394674778, "rewards/margins": 0.20926666259765625, "rewards/rejected": -0.22174683213233948, "step": 8234 }, { "epoch": 1.34, "learning_rate": 5.145861492476739e-07, "logits/chosen": -0.558383047580719, "logits/rejected": -0.5486712455749512, "logps/chosen": -9.745063781738281, "logps/rejected": -19.410919189453125, "loss": 0.5179, "rewards/accuracies": 1.0, "rewards/chosen": 0.305234432220459, "rewards/margins": 0.035726070404052734, "rewards/rejected": 0.26950836181640625, "step": 8235 }, { "epoch": 1.34, "learning_rate": 5.144547792847427e-07, "logits/chosen": -0.7564899325370789, "logits/rejected": -0.7022069096565247, "logps/chosen": -67.49240112304688, "logps/rejected": -101.701904296875, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 0.9132171869277954, "rewards/margins": 0.11793369054794312, "rewards/rejected": 0.7952834963798523, "step": 8236 }, { "epoch": 1.34, "learning_rate": 5.14323408323123e-07, "logits/chosen": -0.7034382224082947, "logits/rejected": -0.5085936784744263, "logps/chosen": -180.0284881591797, "logps/rejected": -65.27312469482422, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": 5.128297328948975, "rewards/margins": 2.6289846897125244, "rewards/rejected": 2.49931263923645, "step": 8237 }, { "epoch": 1.34, "learning_rate": 5.141920363718916e-07, "logits/chosen": -0.5381854772567749, "logits/rejected": -0.5362587571144104, "logps/chosen": -53.55964660644531, "logps/rejected": -88.73735809326172, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 1.6438102722167969, "rewards/margins": 1.0393531322479248, "rewards/rejected": 0.6044570803642273, "step": 8238 }, { "epoch": 1.34, "learning_rate": 5.140606634401246e-07, "logits/chosen": -0.6964846849441528, "logits/rejected": -0.6912478804588318, "logps/chosen": -44.13578796386719, "logps/rejected": -142.69192504882812, "loss": 0.3386, "rewards/accuracies": 1.0, "rewards/chosen": 2.090747117996216, "rewards/margins": 1.6120483875274658, "rewards/rejected": 0.47869873046875, "step": 8239 }, { "epoch": 1.34, "learning_rate": 5.139292895368988e-07, "logits/chosen": -0.31050604581832886, "logits/rejected": -0.2903241813182831, "logps/chosen": -19.904348373413086, "logps/rejected": -0.7148718237876892, "loss": 0.6593, "rewards/accuracies": 0.0, "rewards/chosen": -0.16087742149829865, "rewards/margins": -0.35046374797821045, "rewards/rejected": 0.1895863264799118, "step": 8240 }, { "epoch": 1.34, "learning_rate": 5.13797914671291e-07, "logits/chosen": -0.5009751319885254, "logits/rejected": -0.4923006594181061, "logps/chosen": -66.92005157470703, "logps/rejected": -112.03599548339844, "loss": 0.2766, "rewards/accuracies": 1.0, "rewards/chosen": 2.1720123291015625, "rewards/margins": 0.5501967668533325, "rewards/rejected": 1.62181556224823, "step": 8241 }, { "epoch": 1.34, "learning_rate": 5.136665388523777e-07, "logits/chosen": -0.8578247427940369, "logits/rejected": -0.8431028127670288, "logps/chosen": -134.9481658935547, "logps/rejected": -189.92715454101562, "loss": 0.1355, "rewards/accuracies": 1.0, "rewards/chosen": 5.254998683929443, "rewards/margins": 1.2166805267333984, "rewards/rejected": 4.038318157196045, "step": 8242 }, { "epoch": 1.34, "learning_rate": 5.13535162089236e-07, "logits/chosen": -0.8426232933998108, "logits/rejected": -0.7638791799545288, "logps/chosen": -76.79035186767578, "logps/rejected": -63.433677673339844, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": 2.4979865550994873, "rewards/margins": 0.6515364646911621, "rewards/rejected": 1.8464500904083252, "step": 8243 }, { "epoch": 1.34, "learning_rate": 5.134037843909427e-07, "logits/chosen": -0.782119631767273, "logits/rejected": -0.782119631767273, "logps/chosen": -0.644972026348114, "logps/rejected": -0.644972026348114, "loss": 0.5883, "rewards/accuracies": 0.0, "rewards/chosen": 0.3116603195667267, "rewards/margins": 0.0, "rewards/rejected": 0.3116603195667267, "step": 8244 }, { "epoch": 1.34, "learning_rate": 5.132724057665746e-07, "logits/chosen": -0.6303388476371765, "logits/rejected": -0.6571246981620789, "logps/chosen": -61.92609405517578, "logps/rejected": -141.56993103027344, "loss": 1.6236, "rewards/accuracies": 0.0, "rewards/chosen": 1.4868263006210327, "rewards/margins": -1.704058051109314, "rewards/rejected": 3.1908843517303467, "step": 8245 }, { "epoch": 1.34, "learning_rate": 5.13141026225209e-07, "logits/chosen": -0.39455127716064453, "logits/rejected": -0.4726436138153076, "logps/chosen": -76.70963287353516, "logps/rejected": -95.97197723388672, "loss": 1.9723, "rewards/accuracies": 0.0, "rewards/chosen": 1.8160728216171265, "rewards/margins": -3.445256233215332, "rewards/rejected": 5.261329174041748, "step": 8246 }, { "epoch": 1.34, "learning_rate": 5.130096457759227e-07, "logits/chosen": -0.6191126108169556, "logits/rejected": -0.4496407210826874, "logps/chosen": -145.772216796875, "logps/rejected": -104.56405639648438, "loss": 0.5198, "rewards/accuracies": 1.0, "rewards/chosen": 5.215109348297119, "rewards/margins": 1.6242339611053467, "rewards/rejected": 3.5908753871917725, "step": 8247 }, { "epoch": 1.34, "learning_rate": 5.12878264427793e-07, "logits/chosen": -0.4603930711746216, "logits/rejected": -0.4603930711746216, "logps/chosen": -82.26040649414062, "logps/rejected": -82.26040649414062, "loss": 0.6297, "rewards/accuracies": 0.0, "rewards/chosen": 0.7300392389297485, "rewards/margins": 0.0, "rewards/rejected": 0.7300392389297485, "step": 8248 }, { "epoch": 1.34, "learning_rate": 5.127468821898971e-07, "logits/chosen": -0.6857340931892395, "logits/rejected": -0.5698575973510742, "logps/chosen": -99.38874053955078, "logps/rejected": -111.07666015625, "loss": 0.516, "rewards/accuracies": 1.0, "rewards/chosen": 4.946681976318359, "rewards/margins": 0.7143149375915527, "rewards/rejected": 4.232367038726807, "step": 8249 }, { "epoch": 1.34, "learning_rate": 5.126154990713122e-07, "logits/chosen": -0.38519611954689026, "logits/rejected": -0.250789612531662, "logps/chosen": -53.45826721191406, "logps/rejected": -46.29119873046875, "loss": 0.3961, "rewards/accuracies": 1.0, "rewards/chosen": 2.084851026535034, "rewards/margins": 0.8443102836608887, "rewards/rejected": 1.2405407428741455, "step": 8250 }, { "epoch": 1.34, "learning_rate": 5.124841150811157e-07, "logits/chosen": -0.46820929646492004, "logits/rejected": -0.6154682040214539, "logps/chosen": -123.84740447998047, "logps/rejected": -142.73684692382812, "loss": 1.8852, "rewards/accuracies": 0.0, "rewards/chosen": 0.43501052260398865, "rewards/margins": -2.235170841217041, "rewards/rejected": 2.6701812744140625, "step": 8251 }, { "epoch": 1.34, "learning_rate": 5.12352730228385e-07, "logits/chosen": -0.8571836948394775, "logits/rejected": -0.7840155363082886, "logps/chosen": -120.90523529052734, "logps/rejected": -69.78943634033203, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": 3.1785104274749756, "rewards/margins": 0.6054964065551758, "rewards/rejected": 2.5730140209198, "step": 8252 }, { "epoch": 1.34, "learning_rate": 5.122213445221976e-07, "logits/chosen": -0.3200138211250305, "logits/rejected": -0.3156323730945587, "logps/chosen": -52.70051574707031, "logps/rejected": -84.242431640625, "loss": 0.5578, "rewards/accuracies": 0.0, "rewards/chosen": -0.025891875848174095, "rewards/margins": -0.6520622372627258, "rewards/rejected": 0.6261703372001648, "step": 8253 }, { "epoch": 1.34, "learning_rate": 5.120899579716308e-07, "logits/chosen": -0.8112704753875732, "logits/rejected": -0.786560595035553, "logps/chosen": -83.61895751953125, "logps/rejected": -77.27854919433594, "loss": 0.798, "rewards/accuracies": 0.0, "rewards/chosen": 1.2218109369277954, "rewards/margins": -0.8325835466384888, "rewards/rejected": 2.054394483566284, "step": 8254 }, { "epoch": 1.34, "learning_rate": 5.119585705857624e-07, "logits/chosen": -0.7920332551002502, "logits/rejected": -0.7246791124343872, "logps/chosen": -60.22639465332031, "logps/rejected": -88.8554916381836, "loss": 0.419, "rewards/accuracies": 0.0, "rewards/chosen": 1.6408509016036987, "rewards/margins": -0.14893412590026855, "rewards/rejected": 1.7897850275039673, "step": 8255 }, { "epoch": 1.34, "learning_rate": 5.118271823736698e-07, "logits/chosen": -0.8592019081115723, "logits/rejected": -0.8282004594802856, "logps/chosen": -71.7918701171875, "logps/rejected": -52.229766845703125, "loss": 0.9911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9283004999160767, "rewards/margins": 0.27711790800094604, "rewards/rejected": 0.6511825919151306, "step": 8256 }, { "epoch": 1.34, "learning_rate": 5.11695793344431e-07, "logits/chosen": -0.6343391537666321, "logits/rejected": -0.5614474415779114, "logps/chosen": -72.18885803222656, "logps/rejected": -72.74807739257812, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 1.2287925481796265, "rewards/margins": -0.605139970779419, "rewards/rejected": 1.8339325189590454, "step": 8257 }, { "epoch": 1.34, "learning_rate": 5.115644035071233e-07, "logits/chosen": -0.7754858136177063, "logits/rejected": -0.8651531338691711, "logps/chosen": -57.740142822265625, "logps/rejected": -77.37024688720703, "loss": 2.4902, "rewards/accuracies": 0.0, "rewards/chosen": 0.8876350522041321, "rewards/margins": -2.5161468982696533, "rewards/rejected": 3.4037818908691406, "step": 8258 }, { "epoch": 1.34, "learning_rate": 5.114330128708249e-07, "logits/chosen": -0.5821483135223389, "logits/rejected": -0.4934634864330292, "logps/chosen": -68.25065612792969, "logps/rejected": -57.19654083251953, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": 3.2689590454101562, "rewards/margins": 0.6998825073242188, "rewards/rejected": 2.5690765380859375, "step": 8259 }, { "epoch": 1.34, "learning_rate": 5.113016214446136e-07, "logits/chosen": -0.48751139640808105, "logits/rejected": -0.48751139640808105, "logps/chosen": -0.9264479875564575, "logps/rejected": -0.9264479875564575, "loss": 0.4675, "rewards/accuracies": 0.0, "rewards/chosen": 0.37041494250297546, "rewards/margins": 0.0, "rewards/rejected": 0.37041494250297546, "step": 8260 }, { "epoch": 1.34, "learning_rate": 5.11170229237567e-07, "logits/chosen": -0.8402186036109924, "logits/rejected": -0.7004554271697998, "logps/chosen": -143.02603149414062, "logps/rejected": -78.12669372558594, "loss": 0.1634, "rewards/accuracies": 1.0, "rewards/chosen": 3.8311753273010254, "rewards/margins": 1.0197389125823975, "rewards/rejected": 2.811436414718628, "step": 8261 }, { "epoch": 1.34, "learning_rate": 5.110388362587633e-07, "logits/chosen": -0.7716299891471863, "logits/rejected": -0.6916732788085938, "logps/chosen": -156.37530517578125, "logps/rejected": -52.336917877197266, "loss": 0.1202, "rewards/accuracies": 1.0, "rewards/chosen": 4.131053447723389, "rewards/margins": 1.3206870555877686, "rewards/rejected": 2.81036639213562, "step": 8262 }, { "epoch": 1.34, "learning_rate": 5.109074425172805e-07, "logits/chosen": -0.5442154407501221, "logits/rejected": -0.5101629495620728, "logps/chosen": -68.01124572753906, "logps/rejected": -80.11912536621094, "loss": 0.1602, "rewards/accuracies": 1.0, "rewards/chosen": 2.135183811187744, "rewards/margins": 1.189045786857605, "rewards/rejected": 0.9461380243301392, "step": 8263 }, { "epoch": 1.34, "learning_rate": 5.107760480221967e-07, "logits/chosen": -1.0112659931182861, "logits/rejected": -1.2748832702636719, "logps/chosen": -88.48837280273438, "logps/rejected": -36.43832778930664, "loss": 0.4944, "rewards/accuracies": 1.0, "rewards/chosen": 2.495382070541382, "rewards/margins": 2.2646963596343994, "rewards/rejected": 0.23068581521511078, "step": 8264 }, { "epoch": 1.34, "learning_rate": 5.106446527825898e-07, "logits/chosen": -0.43504440784454346, "logits/rejected": -0.4092741012573242, "logps/chosen": -65.94490814208984, "logps/rejected": -89.09780883789062, "loss": 1.5803, "rewards/accuracies": 1.0, "rewards/chosen": 1.6457023620605469, "rewards/margins": 0.5381218194961548, "rewards/rejected": 1.107580542564392, "step": 8265 }, { "epoch": 1.34, "learning_rate": 5.105132568075382e-07, "logits/chosen": -0.5231670141220093, "logits/rejected": -0.4520472288131714, "logps/chosen": -63.18719482421875, "logps/rejected": -58.421104431152344, "loss": 0.5894, "rewards/accuracies": 0.0, "rewards/chosen": 0.8359237909317017, "rewards/margins": -0.7153899669647217, "rewards/rejected": 1.5513137578964233, "step": 8266 }, { "epoch": 1.34, "learning_rate": 5.103818601061201e-07, "logits/chosen": -0.8144846558570862, "logits/rejected": -0.6596932411193848, "logps/chosen": -128.4155731201172, "logps/rejected": -183.850341796875, "loss": 0.2395, "rewards/accuracies": 1.0, "rewards/chosen": 7.990962505340576, "rewards/margins": 0.48708057403564453, "rewards/rejected": 7.503881931304932, "step": 8267 }, { "epoch": 1.34, "learning_rate": 5.102504626874136e-07, "logits/chosen": -0.7693830728530884, "logits/rejected": -0.6178498864173889, "logps/chosen": -207.03704833984375, "logps/rejected": -95.31837463378906, "loss": 0.114, "rewards/accuracies": 1.0, "rewards/chosen": 4.650909423828125, "rewards/margins": 1.5679748058319092, "rewards/rejected": 3.082934617996216, "step": 8268 }, { "epoch": 1.34, "learning_rate": 5.101190645604971e-07, "logits/chosen": -0.7955760955810547, "logits/rejected": -0.6908590197563171, "logps/chosen": -172.50799560546875, "logps/rejected": -65.97965240478516, "loss": 1.0824, "rewards/accuracies": 0.0, "rewards/chosen": 0.9728302359580994, "rewards/margins": -0.9758842587471008, "rewards/rejected": 1.9487144947052002, "step": 8269 }, { "epoch": 1.34, "learning_rate": 5.09987665734449e-07, "logits/chosen": -0.330118864774704, "logits/rejected": -0.330118864774704, "logps/chosen": -47.04994583129883, "logps/rejected": -47.04994583129883, "loss": 0.805, "rewards/accuracies": 0.0, "rewards/chosen": 0.585932195186615, "rewards/margins": 0.0, "rewards/rejected": 0.585932195186615, "step": 8270 }, { "epoch": 1.34, "learning_rate": 5.098562662183477e-07, "logits/chosen": -0.6131731271743774, "logits/rejected": -0.5446276664733887, "logps/chosen": -46.01670837402344, "logps/rejected": -18.491758346557617, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 0.7435699701309204, "rewards/margins": 0.36404743790626526, "rewards/rejected": 0.37952253222465515, "step": 8271 }, { "epoch": 1.34, "learning_rate": 5.097248660212716e-07, "logits/chosen": -0.7162089347839355, "logits/rejected": -0.6179125308990479, "logps/chosen": -50.22160339355469, "logps/rejected": -116.57656860351562, "loss": 0.7197, "rewards/accuracies": 0.0, "rewards/chosen": 1.1320209503173828, "rewards/margins": -0.3625713586807251, "rewards/rejected": 1.494592308998108, "step": 8272 }, { "epoch": 1.34, "learning_rate": 5.095934651522994e-07, "logits/chosen": -1.052573561668396, "logits/rejected": -0.9593885540962219, "logps/chosen": -129.65753173828125, "logps/rejected": -25.563308715820312, "loss": 0.3071, "rewards/accuracies": 1.0, "rewards/chosen": 5.49513578414917, "rewards/margins": 5.0693793296813965, "rewards/rejected": 0.42575666308403015, "step": 8273 }, { "epoch": 1.34, "learning_rate": 5.094620636205095e-07, "logits/chosen": -0.7855477333068848, "logits/rejected": -0.7839378118515015, "logps/chosen": -83.99270629882812, "logps/rejected": -44.995140075683594, "loss": 0.3149, "rewards/accuracies": 1.0, "rewards/chosen": 1.4787269830703735, "rewards/margins": 0.28813624382019043, "rewards/rejected": 1.190590739250183, "step": 8274 }, { "epoch": 1.34, "learning_rate": 5.093306614349805e-07, "logits/chosen": -1.1363654136657715, "logits/rejected": -1.0059502124786377, "logps/chosen": -134.6239013671875, "logps/rejected": -99.94544982910156, "loss": 0.1335, "rewards/accuracies": 1.0, "rewards/chosen": 6.955893039703369, "rewards/margins": 1.3987212181091309, "rewards/rejected": 5.557171821594238, "step": 8275 }, { "epoch": 1.34, "learning_rate": 5.091992586047911e-07, "logits/chosen": -0.7295628190040588, "logits/rejected": -0.7116177082061768, "logps/chosen": -68.01573944091797, "logps/rejected": -101.45655059814453, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.787591576576233, "rewards/margins": 1.2921874523162842, "rewards/rejected": 0.49540406465530396, "step": 8276 }, { "epoch": 1.34, "learning_rate": 5.0906785513902e-07, "logits/chosen": -0.19342491030693054, "logits/rejected": -0.12332100421190262, "logps/chosen": -79.20272827148438, "logps/rejected": -76.90730285644531, "loss": 0.834, "rewards/accuracies": 0.0, "rewards/chosen": 0.8040542602539062, "rewards/margins": -1.4550468921661377, "rewards/rejected": 2.259101152420044, "step": 8277 }, { "epoch": 1.34, "learning_rate": 5.089364510467458e-07, "logits/chosen": -0.19028133153915405, "logits/rejected": -0.19028133153915405, "logps/chosen": -48.27922058105469, "logps/rejected": -48.27922058105469, "loss": 0.6607, "rewards/accuracies": 0.0, "rewards/chosen": 1.652197241783142, "rewards/margins": 0.0, "rewards/rejected": 1.652197241783142, "step": 8278 }, { "epoch": 1.34, "learning_rate": 5.088050463370475e-07, "logits/chosen": -0.6595238447189331, "logits/rejected": -0.6588024497032166, "logps/chosen": -70.50312805175781, "logps/rejected": -86.86468505859375, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 1.5668877363204956, "rewards/margins": 0.19519197940826416, "rewards/rejected": 1.3716957569122314, "step": 8279 }, { "epoch": 1.34, "learning_rate": 5.086736410190039e-07, "logits/chosen": -0.44336172938346863, "logits/rejected": -0.3970291018486023, "logps/chosen": -55.4183235168457, "logps/rejected": -1.9660000801086426, "loss": 0.6037, "rewards/accuracies": 0.0, "rewards/chosen": -0.13889427483081818, "rewards/margins": -0.6228364706039429, "rewards/rejected": 0.4839421808719635, "step": 8280 }, { "epoch": 1.34, "learning_rate": 5.085422351016937e-07, "logits/chosen": -0.848645806312561, "logits/rejected": -0.7686340808868408, "logps/chosen": -96.87120056152344, "logps/rejected": -93.39881896972656, "loss": 0.5559, "rewards/accuracies": 0.0, "rewards/chosen": 1.3075379133224487, "rewards/margins": -0.4353523254394531, "rewards/rejected": 1.7428902387619019, "step": 8281 }, { "epoch": 1.34, "learning_rate": 5.084108285941958e-07, "logits/chosen": -0.3763476014137268, "logits/rejected": -0.33600372076034546, "logps/chosen": -5.049454212188721, "logps/rejected": -17.748023986816406, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9377560019493103, "rewards/margins": 0.5428221225738525, "rewards/rejected": 0.39493390917778015, "step": 8282 }, { "epoch": 1.34, "learning_rate": 5.082794215055894e-07, "logits/chosen": -0.5641742944717407, "logits/rejected": -0.5641742944717407, "logps/chosen": -91.38900756835938, "logps/rejected": -91.38900756835938, "loss": 1.1996, "rewards/accuracies": 0.0, "rewards/chosen": 1.9216011762619019, "rewards/margins": 0.0, "rewards/rejected": 1.9216011762619019, "step": 8283 }, { "epoch": 1.34, "learning_rate": 5.081480138449531e-07, "logits/chosen": -0.8924497365951538, "logits/rejected": -0.7304714322090149, "logps/chosen": -112.7691421508789, "logps/rejected": -168.749267578125, "loss": 0.2522, "rewards/accuracies": 1.0, "rewards/chosen": 5.149538516998291, "rewards/margins": 0.628206729888916, "rewards/rejected": 4.521331787109375, "step": 8284 }, { "epoch": 1.34, "learning_rate": 5.080166056213663e-07, "logits/chosen": -0.8544882535934448, "logits/rejected": -0.8365814089775085, "logps/chosen": -94.87226104736328, "logps/rejected": -62.55670928955078, "loss": 0.433, "rewards/accuracies": 0.0, "rewards/chosen": 2.0501372814178467, "rewards/margins": -0.2075340747833252, "rewards/rejected": 2.257671356201172, "step": 8285 }, { "epoch": 1.34, "learning_rate": 5.078851968439077e-07, "logits/chosen": -1.1510554552078247, "logits/rejected": -1.1125422716140747, "logps/chosen": -110.00857543945312, "logps/rejected": -67.00981903076172, "loss": 1.035, "rewards/accuracies": 0.0, "rewards/chosen": 1.5332481861114502, "rewards/margins": -0.3840789794921875, "rewards/rejected": 1.9173271656036377, "step": 8286 }, { "epoch": 1.35, "learning_rate": 5.077537875216568e-07, "logits/chosen": -0.7688966393470764, "logits/rejected": -0.7929427027702332, "logps/chosen": -64.94454193115234, "logps/rejected": -107.74270629882812, "loss": 0.5562, "rewards/accuracies": 0.0, "rewards/chosen": 1.9277015924453735, "rewards/margins": -0.536389946937561, "rewards/rejected": 2.4640915393829346, "step": 8287 }, { "epoch": 1.35, "learning_rate": 5.076223776636925e-07, "logits/chosen": -1.0322128534317017, "logits/rejected": -1.1335736513137817, "logps/chosen": -88.45478820800781, "logps/rejected": -168.96202087402344, "loss": 3.8526, "rewards/accuracies": 0.0, "rewards/chosen": 0.7453522086143494, "rewards/margins": -7.684007167816162, "rewards/rejected": 8.429359436035156, "step": 8288 }, { "epoch": 1.35, "learning_rate": 5.07490967279094e-07, "logits/chosen": -0.5504222512245178, "logits/rejected": -0.5845534205436707, "logps/chosen": -93.63391876220703, "logps/rejected": -107.5886001586914, "loss": 0.4641, "rewards/accuracies": 0.0, "rewards/chosen": 1.247209906578064, "rewards/margins": -0.07106709480285645, "rewards/rejected": 1.3182770013809204, "step": 8289 }, { "epoch": 1.35, "learning_rate": 5.073595563769406e-07, "logits/chosen": -0.6730419397354126, "logits/rejected": -0.6730419397354126, "logps/chosen": -187.05902099609375, "logps/rejected": -187.05902099609375, "loss": 0.6141, "rewards/accuracies": 0.0, "rewards/chosen": 4.410003662109375, "rewards/margins": 0.0, "rewards/rejected": 4.410003662109375, "step": 8290 }, { "epoch": 1.35, "learning_rate": 5.072281449663115e-07, "logits/chosen": -0.28537800908088684, "logits/rejected": -0.24783901870250702, "logps/chosen": -52.54778289794922, "logps/rejected": -107.02009582519531, "loss": 0.6519, "rewards/accuracies": 1.0, "rewards/chosen": 1.0460869073867798, "rewards/margins": 0.2810981869697571, "rewards/rejected": 0.7649887204170227, "step": 8291 }, { "epoch": 1.35, "learning_rate": 5.070967330562859e-07, "logits/chosen": -0.48647168278694153, "logits/rejected": -0.44958797097206116, "logps/chosen": -68.483642578125, "logps/rejected": -85.41178894042969, "loss": 0.5385, "rewards/accuracies": 0.0, "rewards/chosen": 1.5761154890060425, "rewards/margins": -0.4016585350036621, "rewards/rejected": 1.9777740240097046, "step": 8292 }, { "epoch": 1.35, "learning_rate": 5.069653206559432e-07, "logits/chosen": -0.6042669415473938, "logits/rejected": -0.5978251099586487, "logps/chosen": -0.9614425301551819, "logps/rejected": -8.753692626953125, "loss": 0.7833, "rewards/accuracies": 1.0, "rewards/chosen": 0.35262686014175415, "rewards/margins": 0.29588258266448975, "rewards/rejected": 0.0567442886531353, "step": 8293 }, { "epoch": 1.35, "learning_rate": 5.068339077743628e-07, "logits/chosen": -0.6103448271751404, "logits/rejected": -0.6009950637817383, "logps/chosen": -228.37442016601562, "logps/rejected": -184.9468994140625, "loss": 0.8646, "rewards/accuracies": 0.0, "rewards/chosen": 4.2476959228515625, "rewards/margins": -1.4018068313598633, "rewards/rejected": 5.649502754211426, "step": 8294 }, { "epoch": 1.35, "learning_rate": 5.067024944206241e-07, "logits/chosen": -0.5788228511810303, "logits/rejected": -0.5257319211959839, "logps/chosen": -108.96330261230469, "logps/rejected": -70.57996368408203, "loss": 0.2971, "rewards/accuracies": 1.0, "rewards/chosen": 1.3579987287521362, "rewards/margins": 0.3268517255783081, "rewards/rejected": 1.0311470031738281, "step": 8295 }, { "epoch": 1.35, "learning_rate": 5.065710806038062e-07, "logits/chosen": -0.5573729872703552, "logits/rejected": -0.4268844723701477, "logps/chosen": -63.91978454589844, "logps/rejected": -39.97357177734375, "loss": 0.1902, "rewards/accuracies": 1.0, "rewards/chosen": 1.496759057044983, "rewards/margins": 0.9330520629882812, "rewards/rejected": 0.5637069940567017, "step": 8296 }, { "epoch": 1.35, "learning_rate": 5.064396663329891e-07, "logits/chosen": -0.6725326180458069, "logits/rejected": -0.5895039439201355, "logps/chosen": -74.54574584960938, "logps/rejected": -134.01882934570312, "loss": 0.2156, "rewards/accuracies": 1.0, "rewards/chosen": 2.10530161857605, "rewards/margins": 0.8237677812576294, "rewards/rejected": 1.2815338373184204, "step": 8297 }, { "epoch": 1.35, "learning_rate": 5.063082516172519e-07, "logits/chosen": -0.4072231948375702, "logits/rejected": -0.36845773458480835, "logps/chosen": -99.66890716552734, "logps/rejected": -64.89814758300781, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 3.052910566329956, "rewards/margins": 1.437772274017334, "rewards/rejected": 1.615138292312622, "step": 8298 }, { "epoch": 1.35, "learning_rate": 5.061768364656741e-07, "logits/chosen": -0.549044668674469, "logits/rejected": -0.47577032446861267, "logps/chosen": -31.898635864257812, "logps/rejected": -16.995676040649414, "loss": 0.5805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0676491260528564, "rewards/margins": 0.5395966172218323, "rewards/rejected": 0.5280525088310242, "step": 8299 }, { "epoch": 1.35, "learning_rate": 5.060454208873353e-07, "logits/chosen": -0.1736276000738144, "logits/rejected": -0.17928147315979004, "logps/chosen": -5.8778767585754395, "logps/rejected": -2.229029655456543, "loss": 0.7044, "rewards/accuracies": 0.0, "rewards/chosen": 0.023754263296723366, "rewards/margins": -0.24052530527114868, "rewards/rejected": 0.2642795741558075, "step": 8300 }, { "epoch": 1.35, "learning_rate": 5.059140048913152e-07, "logits/chosen": -0.6994196772575378, "logits/rejected": -0.6891918778419495, "logps/chosen": -62.679847717285156, "logps/rejected": -83.73945617675781, "loss": 0.7996, "rewards/accuracies": 1.0, "rewards/chosen": 2.3442955017089844, "rewards/margins": 0.40895307064056396, "rewards/rejected": 1.9353424310684204, "step": 8301 }, { "epoch": 1.35, "learning_rate": 5.057825884866934e-07, "logits/chosen": -0.4619743525981903, "logits/rejected": -0.4593350887298584, "logps/chosen": -88.63465118408203, "logps/rejected": -88.21463012695312, "loss": 0.7561, "rewards/accuracies": 0.0, "rewards/chosen": 2.575209856033325, "rewards/margins": -0.9694809913635254, "rewards/rejected": 3.5446908473968506, "step": 8302 }, { "epoch": 1.35, "learning_rate": 5.056511716825494e-07, "logits/chosen": -0.8626195192337036, "logits/rejected": -0.8595820665359497, "logps/chosen": -55.0174674987793, "logps/rejected": -76.52207946777344, "loss": 1.3531, "rewards/accuracies": 1.0, "rewards/chosen": 3.104015827178955, "rewards/margins": 0.9940807819366455, "rewards/rejected": 2.1099350452423096, "step": 8303 }, { "epoch": 1.35, "learning_rate": 5.055197544879629e-07, "logits/chosen": -0.8580037355422974, "logits/rejected": -0.85915607213974, "logps/chosen": -112.37621307373047, "logps/rejected": -131.20529174804688, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 5.299234867095947, "rewards/margins": 1.0668344497680664, "rewards/rejected": 4.232400417327881, "step": 8304 }, { "epoch": 1.35, "learning_rate": 5.053883369120136e-07, "logits/chosen": -0.6023373007774353, "logits/rejected": -0.6209628582000732, "logps/chosen": -10.991170883178711, "logps/rejected": -2.7327487468719482, "loss": 0.8133, "rewards/accuracies": 0.0, "rewards/chosen": 0.2877984941005707, "rewards/margins": -0.17631399631500244, "rewards/rejected": 0.4641124904155731, "step": 8305 }, { "epoch": 1.35, "learning_rate": 5.052569189637812e-07, "logits/chosen": -0.818879246711731, "logits/rejected": -0.6945194005966187, "logps/chosen": -51.317901611328125, "logps/rejected": -65.2781982421875, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 1.603554606437683, "rewards/margins": 0.7073212265968323, "rewards/rejected": 0.8962333798408508, "step": 8306 }, { "epoch": 1.35, "learning_rate": 5.051255006523455e-07, "logits/chosen": -0.8668709993362427, "logits/rejected": -0.8554907441139221, "logps/chosen": -103.0203857421875, "logps/rejected": -76.16847229003906, "loss": 0.6189, "rewards/accuracies": 0.0, "rewards/chosen": 1.974677324295044, "rewards/margins": -0.10436487197875977, "rewards/rejected": 2.0790421962738037, "step": 8307 }, { "epoch": 1.35, "learning_rate": 5.049940819867861e-07, "logits/chosen": -1.2166029214859009, "logits/rejected": -1.012032389640808, "logps/chosen": -159.3136749267578, "logps/rejected": -109.51029968261719, "loss": 0.093, "rewards/accuracies": 1.0, "rewards/chosen": 5.146484375, "rewards/margins": 1.73409104347229, "rewards/rejected": 3.41239333152771, "step": 8308 }, { "epoch": 1.35, "learning_rate": 5.048626629761829e-07, "logits/chosen": -0.6316033601760864, "logits/rejected": -0.44079819321632385, "logps/chosen": -80.23928833007812, "logps/rejected": -111.91752624511719, "loss": 0.2217, "rewards/accuracies": 1.0, "rewards/chosen": 3.091963291168213, "rewards/margins": 1.2497864961624146, "rewards/rejected": 1.8421767950057983, "step": 8309 }, { "epoch": 1.35, "learning_rate": 5.047312436296158e-07, "logits/chosen": -0.36557772755622864, "logits/rejected": -0.3968467116355896, "logps/chosen": -65.35070037841797, "logps/rejected": -100.85287475585938, "loss": 0.955, "rewards/accuracies": 0.0, "rewards/chosen": 1.2599724531173706, "rewards/margins": -0.8782051801681519, "rewards/rejected": 2.1381776332855225, "step": 8310 }, { "epoch": 1.35, "learning_rate": 5.045998239561646e-07, "logits/chosen": -0.792256772518158, "logits/rejected": -0.6288262605667114, "logps/chosen": -130.61427307128906, "logps/rejected": -92.41119384765625, "loss": 0.1736, "rewards/accuracies": 1.0, "rewards/chosen": 3.7504653930664062, "rewards/margins": 1.4272964000701904, "rewards/rejected": 2.323168992996216, "step": 8311 }, { "epoch": 1.35, "learning_rate": 5.044684039649089e-07, "logits/chosen": -0.9259839057922363, "logits/rejected": -0.8096418976783752, "logps/chosen": -117.72767639160156, "logps/rejected": -184.823486328125, "loss": 0.5792, "rewards/accuracies": 0.0, "rewards/chosen": 5.233758449554443, "rewards/margins": -0.189788818359375, "rewards/rejected": 5.423547267913818, "step": 8312 }, { "epoch": 1.35, "learning_rate": 5.043369836649288e-07, "logits/chosen": -0.6510307192802429, "logits/rejected": -0.6011763215065002, "logps/chosen": -72.419189453125, "logps/rejected": -112.16020965576172, "loss": 0.5022, "rewards/accuracies": 1.0, "rewards/chosen": 1.8879212141036987, "rewards/margins": 0.4640854597091675, "rewards/rejected": 1.4238357543945312, "step": 8313 }, { "epoch": 1.35, "learning_rate": 5.042055630653042e-07, "logits/chosen": -0.3940589725971222, "logits/rejected": -0.4001178443431854, "logps/chosen": -3.984858989715576, "logps/rejected": -2.950813055038452, "loss": 0.6447, "rewards/accuracies": 0.0, "rewards/chosen": 0.15494294464588165, "rewards/margins": -0.14785335958003998, "rewards/rejected": 0.30279630422592163, "step": 8314 }, { "epoch": 1.35, "learning_rate": 5.04074142175115e-07, "logits/chosen": -0.5134324431419373, "logits/rejected": -0.6698437929153442, "logps/chosen": -58.00835418701172, "logps/rejected": -111.1903076171875, "loss": 1.8607, "rewards/accuracies": 0.0, "rewards/chosen": 2.2567505836486816, "rewards/margins": -3.686789035797119, "rewards/rejected": 5.943539619445801, "step": 8315 }, { "epoch": 1.35, "learning_rate": 5.039427210034411e-07, "logits/chosen": -0.7446070909500122, "logits/rejected": -0.7034014463424683, "logps/chosen": -116.30233001708984, "logps/rejected": -103.78144836425781, "loss": 0.9299, "rewards/accuracies": 0.0, "rewards/chosen": 0.9391639828681946, "rewards/margins": -1.6344506740570068, "rewards/rejected": 2.5736145973205566, "step": 8316 }, { "epoch": 1.35, "learning_rate": 5.038112995593625e-07, "logits/chosen": -0.3687942922115326, "logits/rejected": -0.336238294839859, "logps/chosen": -88.59963989257812, "logps/rejected": -73.69001770019531, "loss": 0.3147, "rewards/accuracies": 1.0, "rewards/chosen": 1.4796288013458252, "rewards/margins": 0.9656891226768494, "rewards/rejected": 0.5139396786689758, "step": 8317 }, { "epoch": 1.35, "learning_rate": 5.036798778519591e-07, "logits/chosen": -0.5586283206939697, "logits/rejected": -0.4547661244869232, "logps/chosen": -63.11457824707031, "logps/rejected": -54.66508483886719, "loss": 0.2155, "rewards/accuracies": 1.0, "rewards/chosen": 1.7730331420898438, "rewards/margins": 1.4730781316757202, "rewards/rejected": 0.29995498061180115, "step": 8318 }, { "epoch": 1.35, "learning_rate": 5.035484558903111e-07, "logits/chosen": -0.6210334897041321, "logits/rejected": -0.6994450688362122, "logps/chosen": -128.1577911376953, "logps/rejected": -118.08802795410156, "loss": 1.6433, "rewards/accuracies": 0.0, "rewards/chosen": 0.4877380430698395, "rewards/margins": -1.7742751836776733, "rewards/rejected": 2.2620131969451904, "step": 8319 }, { "epoch": 1.35, "learning_rate": 5.034170336834983e-07, "logits/chosen": -0.5922071933746338, "logits/rejected": -0.6676192879676819, "logps/chosen": -88.44612121582031, "logps/rejected": -168.68878173828125, "loss": 0.285, "rewards/accuracies": 1.0, "rewards/chosen": 5.193202495574951, "rewards/margins": 0.2795124053955078, "rewards/rejected": 4.913690090179443, "step": 8320 }, { "epoch": 1.35, "learning_rate": 5.032856112406009e-07, "logits/chosen": -0.43239399790763855, "logits/rejected": -0.43882331252098083, "logps/chosen": -64.12088012695312, "logps/rejected": -58.284690856933594, "loss": 0.4791, "rewards/accuracies": 1.0, "rewards/chosen": 1.7077499628067017, "rewards/margins": 0.09521865844726562, "rewards/rejected": 1.612531304359436, "step": 8321 }, { "epoch": 1.35, "learning_rate": 5.031541885706987e-07, "logits/chosen": -0.32133597135543823, "logits/rejected": -0.32133597135543823, "logps/chosen": -16.951091766357422, "logps/rejected": -16.951091766357422, "loss": 0.3997, "rewards/accuracies": 0.0, "rewards/chosen": 0.09887237846851349, "rewards/margins": 0.0, "rewards/rejected": 0.09887237846851349, "step": 8322 }, { "epoch": 1.35, "learning_rate": 5.030227656828719e-07, "logits/chosen": -0.8315978050231934, "logits/rejected": -0.815015971660614, "logps/chosen": -76.42083740234375, "logps/rejected": -110.22525024414062, "loss": 1.5338, "rewards/accuracies": 0.0, "rewards/chosen": 2.3783371448516846, "rewards/margins": -1.7045881748199463, "rewards/rejected": 4.082925319671631, "step": 8323 }, { "epoch": 1.35, "learning_rate": 5.028913425862007e-07, "logits/chosen": -0.676657497882843, "logits/rejected": -0.6340287327766418, "logps/chosen": -76.97258758544922, "logps/rejected": -40.028358459472656, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 2.180454969406128, "rewards/margins": 0.14707183837890625, "rewards/rejected": 2.0333831310272217, "step": 8324 }, { "epoch": 1.35, "learning_rate": 5.027599192897651e-07, "logits/chosen": -0.6448518633842468, "logits/rejected": -0.6645174026489258, "logps/chosen": -61.78417205810547, "logps/rejected": -60.11347961425781, "loss": 0.6266, "rewards/accuracies": 1.0, "rewards/chosen": 2.193108320236206, "rewards/margins": 0.6578208208084106, "rewards/rejected": 1.5352874994277954, "step": 8325 }, { "epoch": 1.35, "learning_rate": 5.026284958026451e-07, "logits/chosen": -0.5591058135032654, "logits/rejected": -0.5503239035606384, "logps/chosen": -49.40664291381836, "logps/rejected": -18.609342575073242, "loss": 0.4755, "rewards/accuracies": 0.0, "rewards/chosen": 0.19246673583984375, "rewards/margins": -0.16585826873779297, "rewards/rejected": 0.3583250045776367, "step": 8326 }, { "epoch": 1.35, "learning_rate": 5.024970721339209e-07, "logits/chosen": -0.6634572744369507, "logits/rejected": -0.636908233165741, "logps/chosen": -53.559288024902344, "logps/rejected": -83.42056274414062, "loss": 0.2587, "rewards/accuracies": 1.0, "rewards/chosen": 1.1546547412872314, "rewards/margins": 1.0576133728027344, "rewards/rejected": 0.09704132378101349, "step": 8327 }, { "epoch": 1.35, "learning_rate": 5.023656482926727e-07, "logits/chosen": -1.0746495723724365, "logits/rejected": -1.0876861810684204, "logps/chosen": -142.38558959960938, "logps/rejected": -93.53053283691406, "loss": 0.8861, "rewards/accuracies": 0.0, "rewards/chosen": 3.8154799938201904, "rewards/margins": -0.5464937686920166, "rewards/rejected": 4.361973762512207, "step": 8328 }, { "epoch": 1.35, "learning_rate": 5.022342242879805e-07, "logits/chosen": -0.6597123742103577, "logits/rejected": -0.8066036105155945, "logps/chosen": -68.287841796875, "logps/rejected": -61.19451141357422, "loss": 0.4165, "rewards/accuracies": 0.0, "rewards/chosen": 1.584130883216858, "rewards/margins": -0.13906323909759521, "rewards/rejected": 1.7231941223144531, "step": 8329 }, { "epoch": 1.35, "learning_rate": 5.021028001289246e-07, "logits/chosen": -0.6178569197654724, "logits/rejected": -0.5340920686721802, "logps/chosen": -89.0937728881836, "logps/rejected": -84.0865478515625, "loss": 0.0291, "rewards/accuracies": 1.0, "rewards/chosen": 3.939878225326538, "rewards/margins": 2.854416847229004, "rewards/rejected": 1.0854614973068237, "step": 8330 }, { "epoch": 1.35, "learning_rate": 5.019713758245851e-07, "logits/chosen": -0.7871258854866028, "logits/rejected": -0.7167178988456726, "logps/chosen": -138.93460083007812, "logps/rejected": -150.09982299804688, "loss": 1.48, "rewards/accuracies": 0.0, "rewards/chosen": 2.8068130016326904, "rewards/margins": -2.7365310192108154, "rewards/rejected": 5.543344020843506, "step": 8331 }, { "epoch": 1.35, "learning_rate": 5.018399513840422e-07, "logits/chosen": -0.8852441906929016, "logits/rejected": -0.8575000166893005, "logps/chosen": -77.30668640136719, "logps/rejected": -126.71542358398438, "loss": 0.5031, "rewards/accuracies": 1.0, "rewards/chosen": 1.9478012323379517, "rewards/margins": 0.14712214469909668, "rewards/rejected": 1.800679087638855, "step": 8332 }, { "epoch": 1.35, "learning_rate": 5.017085268163761e-07, "logits/chosen": -0.8643838167190552, "logits/rejected": -0.4510840177536011, "logps/chosen": -153.21661376953125, "logps/rejected": -103.99331665039062, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": 6.56447172164917, "rewards/margins": 3.6944918632507324, "rewards/rejected": 2.8699798583984375, "step": 8333 }, { "epoch": 1.35, "learning_rate": 5.015771021306669e-07, "logits/chosen": -0.8962260484695435, "logits/rejected": -0.9661227464675903, "logps/chosen": -171.53585815429688, "logps/rejected": -10.207169532775879, "loss": 0.0316, "rewards/accuracies": 1.0, "rewards/chosen": 3.9893295764923096, "rewards/margins": 3.0319433212280273, "rewards/rejected": 0.9573861360549927, "step": 8334 }, { "epoch": 1.35, "learning_rate": 5.01445677335995e-07, "logits/chosen": -1.280490756034851, "logits/rejected": -1.3228641748428345, "logps/chosen": -246.47422790527344, "logps/rejected": -21.24860382080078, "loss": 0.0559, "rewards/accuracies": 1.0, "rewards/chosen": 2.528193712234497, "rewards/margins": 2.384333848953247, "rewards/rejected": 0.14385986328125, "step": 8335 }, { "epoch": 1.35, "learning_rate": 5.013142524414403e-07, "logits/chosen": -1.175550103187561, "logits/rejected": -1.2165918350219727, "logps/chosen": -117.4938735961914, "logps/rejected": -40.4538459777832, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": 4.695864200592041, "rewards/margins": 4.471810817718506, "rewards/rejected": 0.22405357658863068, "step": 8336 }, { "epoch": 1.35, "learning_rate": 5.011828274560833e-07, "logits/chosen": -1.1320228576660156, "logits/rejected": -1.0679395198822021, "logps/chosen": -80.38481140136719, "logps/rejected": -19.2593994140625, "loss": 2.4796, "rewards/accuracies": 1.0, "rewards/chosen": 1.6933869123458862, "rewards/margins": 1.4386959075927734, "rewards/rejected": 0.254690945148468, "step": 8337 }, { "epoch": 1.35, "learning_rate": 5.01051402389004e-07, "logits/chosen": -0.7787581086158752, "logits/rejected": -0.6046549081802368, "logps/chosen": -80.43236541748047, "logps/rejected": -44.069847106933594, "loss": 0.0501, "rewards/accuracies": 1.0, "rewards/chosen": 4.588708400726318, "rewards/margins": 2.948538064956665, "rewards/rejected": 1.6401703357696533, "step": 8338 }, { "epoch": 1.35, "learning_rate": 5.009199772492829e-07, "logits/chosen": -0.8083129525184631, "logits/rejected": -0.7580340504646301, "logps/chosen": -139.4931640625, "logps/rejected": -116.93086242675781, "loss": 0.7501, "rewards/accuracies": 1.0, "rewards/chosen": 5.509101867675781, "rewards/margins": 0.5921387672424316, "rewards/rejected": 4.91696310043335, "step": 8339 }, { "epoch": 1.35, "learning_rate": 5.00788552046e-07, "logits/chosen": -0.6114012002944946, "logits/rejected": -0.5957674980163574, "logps/chosen": -40.73999786376953, "logps/rejected": -28.766435623168945, "loss": 0.3544, "rewards/accuracies": 1.0, "rewards/chosen": 1.8162673711776733, "rewards/margins": 0.06959891319274902, "rewards/rejected": 1.7466684579849243, "step": 8340 }, { "epoch": 1.35, "learning_rate": 5.006571267882356e-07, "logits/chosen": -0.6216351389884949, "logits/rejected": -0.6046521067619324, "logps/chosen": -68.82719421386719, "logps/rejected": -82.39312744140625, "loss": 2.7238, "rewards/accuracies": 0.0, "rewards/chosen": 2.194157361984253, "rewards/margins": -3.2649643421173096, "rewards/rejected": 5.4591217041015625, "step": 8341 }, { "epoch": 1.35, "learning_rate": 5.0052570148507e-07, "logits/chosen": -1.0340827703475952, "logits/rejected": -1.0340827703475952, "logps/chosen": -62.32951736450195, "logps/rejected": -62.32951736450195, "loss": 0.594, "rewards/accuracies": 0.0, "rewards/chosen": 1.0941082239151, "rewards/margins": 0.0, "rewards/rejected": 1.0941082239151, "step": 8342 }, { "epoch": 1.35, "learning_rate": 5.003942761455834e-07, "logits/chosen": -0.019806567579507828, "logits/rejected": 0.013267411850392818, "logps/chosen": -6.435762405395508, "logps/rejected": -11.37235164642334, "loss": 1.0862, "rewards/accuracies": 0.0, "rewards/chosen": 0.5550693869590759, "rewards/margins": -0.2587857246398926, "rewards/rejected": 0.8138551115989685, "step": 8343 }, { "epoch": 1.35, "learning_rate": 5.00262850778856e-07, "logits/chosen": -0.23126265406608582, "logits/rejected": -0.24763549864292145, "logps/chosen": -4.383342266082764, "logps/rejected": -1.452595829963684, "loss": 1.0608, "rewards/accuracies": 0.0, "rewards/chosen": 0.1776803582906723, "rewards/margins": -0.07170328497886658, "rewards/rejected": 0.24938364326953888, "step": 8344 }, { "epoch": 1.35, "learning_rate": 5.001314253939681e-07, "logits/chosen": -0.7561153769493103, "logits/rejected": -0.8396936655044556, "logps/chosen": -71.17379760742188, "logps/rejected": -87.17875671386719, "loss": 1.6729, "rewards/accuracies": 0.0, "rewards/chosen": 2.971792697906494, "rewards/margins": -1.9605989456176758, "rewards/rejected": 4.93239164352417, "step": 8345 }, { "epoch": 1.35, "learning_rate": 5e-07, "logits/chosen": -0.36246922612190247, "logits/rejected": -0.3322467803955078, "logps/chosen": -62.127593994140625, "logps/rejected": -111.67952728271484, "loss": 0.7782, "rewards/accuracies": 1.0, "rewards/chosen": 1.313855767250061, "rewards/margins": 1.4341744184494019, "rewards/rejected": -0.12031860649585724, "step": 8346 }, { "epoch": 1.35, "learning_rate": 4.998685746060319e-07, "logits/chosen": -0.7445271015167236, "logits/rejected": -0.6351377964019775, "logps/chosen": -53.84181594848633, "logps/rejected": -57.684364318847656, "loss": 0.3087, "rewards/accuracies": 1.0, "rewards/chosen": 1.2722851037979126, "rewards/margins": 0.19233667850494385, "rewards/rejected": 1.0799484252929688, "step": 8347 }, { "epoch": 1.35, "learning_rate": 4.99737149221144e-07, "logits/chosen": -0.6087514758110046, "logits/rejected": -0.6275587677955627, "logps/chosen": -6.778507232666016, "logps/rejected": -2.8412160873413086, "loss": 2.1942, "rewards/accuracies": 0.0, "rewards/chosen": -0.010776424780488014, "rewards/margins": -0.3617478609085083, "rewards/rejected": 0.35097143054008484, "step": 8348 }, { "epoch": 1.36, "learning_rate": 4.996057238544166e-07, "logits/chosen": -0.3940063416957855, "logits/rejected": -0.27022209763526917, "logps/chosen": -46.04624557495117, "logps/rejected": -35.0269775390625, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 1.2035255432128906, "rewards/margins": 1.3146820068359375, "rewards/rejected": -0.11115646362304688, "step": 8349 }, { "epoch": 1.36, "learning_rate": 4.994742985149301e-07, "logits/chosen": -0.5389379262924194, "logits/rejected": -0.5302258729934692, "logps/chosen": -64.6730728149414, "logps/rejected": -51.41261672973633, "loss": 0.4079, "rewards/accuracies": 1.0, "rewards/chosen": 0.04745331034064293, "rewards/margins": 0.15864066779613495, "rewards/rejected": -0.11118736118078232, "step": 8350 }, { "epoch": 1.36, "learning_rate": 4.993428732117644e-07, "logits/chosen": -0.20061317086219788, "logits/rejected": -0.29574349522590637, "logps/chosen": -105.72379302978516, "logps/rejected": -106.72964477539062, "loss": 2.5194, "rewards/accuracies": 0.0, "rewards/chosen": 0.5888809561729431, "rewards/margins": -4.1026811599731445, "rewards/rejected": 4.691562175750732, "step": 8351 }, { "epoch": 1.36, "learning_rate": 4.99211447954e-07, "logits/chosen": -0.8280495405197144, "logits/rejected": -0.7932809591293335, "logps/chosen": -34.5008544921875, "logps/rejected": -30.77855682373047, "loss": 0.3937, "rewards/accuracies": 0.0, "rewards/chosen": 1.118585228919983, "rewards/margins": -0.014586210250854492, "rewards/rejected": 1.1331714391708374, "step": 8352 }, { "epoch": 1.36, "learning_rate": 4.990800227507171e-07, "logits/chosen": -0.7239159941673279, "logits/rejected": -0.6833920478820801, "logps/chosen": -100.76509094238281, "logps/rejected": -96.03996276855469, "loss": 0.9039, "rewards/accuracies": 0.0, "rewards/chosen": 0.6463150382041931, "rewards/margins": -1.6232178211212158, "rewards/rejected": 2.2695329189300537, "step": 8353 }, { "epoch": 1.36, "learning_rate": 4.98948597610996e-07, "logits/chosen": -0.6016311645507812, "logits/rejected": -0.6195383667945862, "logps/chosen": -62.3685188293457, "logps/rejected": -60.062782287597656, "loss": 0.9373, "rewards/accuracies": 0.0, "rewards/chosen": 0.7783123254776001, "rewards/margins": -1.205849051475525, "rewards/rejected": 1.984161376953125, "step": 8354 }, { "epoch": 1.36, "learning_rate": 4.988171725439167e-07, "logits/chosen": -0.5609144568443298, "logits/rejected": -0.5374836325645447, "logps/chosen": -101.46576690673828, "logps/rejected": -101.2933349609375, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": 3.0657737255096436, "rewards/margins": 0.7209784984588623, "rewards/rejected": 2.3447952270507812, "step": 8355 }, { "epoch": 1.36, "learning_rate": 4.986857475585597e-07, "logits/chosen": -0.9811119437217712, "logits/rejected": -0.9403352737426758, "logps/chosen": -75.32527160644531, "logps/rejected": -63.15532684326172, "loss": 0.8127, "rewards/accuracies": 0.0, "rewards/chosen": 2.2130706310272217, "rewards/margins": -0.22247099876403809, "rewards/rejected": 2.4355416297912598, "step": 8356 }, { "epoch": 1.36, "learning_rate": 4.985543226640051e-07, "logits/chosen": -1.069244623184204, "logits/rejected": -1.062239646911621, "logps/chosen": -90.01339721679688, "logps/rejected": -60.74293518066406, "loss": 0.3729, "rewards/accuracies": 1.0, "rewards/chosen": 1.7490707635879517, "rewards/margins": 1.3164870738983154, "rewards/rejected": 0.43258363008499146, "step": 8357 }, { "epoch": 1.36, "learning_rate": 4.984228978693331e-07, "logits/chosen": -0.582655131816864, "logits/rejected": -0.6401579976081848, "logps/chosen": -72.56752014160156, "logps/rejected": -87.58317565917969, "loss": 0.6478, "rewards/accuracies": 1.0, "rewards/chosen": 0.9921402335166931, "rewards/margins": 0.1902969479560852, "rewards/rejected": 0.8018432855606079, "step": 8358 }, { "epoch": 1.36, "learning_rate": 4.982914731836239e-07, "logits/chosen": -0.5270516276359558, "logits/rejected": -0.3834545612335205, "logps/chosen": -109.6588134765625, "logps/rejected": -87.51905822753906, "loss": 0.1747, "rewards/accuracies": 1.0, "rewards/chosen": 5.694572448730469, "rewards/margins": 3.0714704990386963, "rewards/rejected": 2.6231019496917725, "step": 8359 }, { "epoch": 1.36, "learning_rate": 4.981600486159578e-07, "logits/chosen": -0.6548071503639221, "logits/rejected": -0.6159701943397522, "logps/chosen": -40.91126251220703, "logps/rejected": -64.17218017578125, "loss": 0.4204, "rewards/accuracies": 0.0, "rewards/chosen": 1.729434609413147, "rewards/margins": -0.2050342559814453, "rewards/rejected": 1.9344688653945923, "step": 8360 }, { "epoch": 1.36, "learning_rate": 4.980286241754149e-07, "logits/chosen": -0.4261934459209442, "logits/rejected": -0.38640403747558594, "logps/chosen": -53.324310302734375, "logps/rejected": -20.631105422973633, "loss": 0.6304, "rewards/accuracies": 0.0, "rewards/chosen": 0.06613502651453018, "rewards/margins": -0.1526767611503601, "rewards/rejected": 0.2188117951154709, "step": 8361 }, { "epoch": 1.36, "learning_rate": 4.978971998710754e-07, "logits/chosen": -0.35346412658691406, "logits/rejected": -0.35346412658691406, "logps/chosen": -48.001739501953125, "logps/rejected": -48.001739501953125, "loss": 1.3186, "rewards/accuracies": 0.0, "rewards/chosen": -0.40448760986328125, "rewards/margins": 0.0, "rewards/rejected": -0.40448760986328125, "step": 8362 }, { "epoch": 1.36, "learning_rate": 4.977657757120196e-07, "logits/chosen": -0.7770464420318604, "logits/rejected": -0.726614773273468, "logps/chosen": -102.82687377929688, "logps/rejected": -78.80357360839844, "loss": 2.0257, "rewards/accuracies": 0.0, "rewards/chosen": 4.434858798980713, "rewards/margins": -1.2192716598510742, "rewards/rejected": 5.654130458831787, "step": 8363 }, { "epoch": 1.36, "learning_rate": 4.976343517073274e-07, "logits/chosen": -0.32994604110717773, "logits/rejected": -0.29039397835731506, "logps/chosen": -112.41461944580078, "logps/rejected": -74.48493957519531, "loss": 2.3043, "rewards/accuracies": 0.0, "rewards/chosen": -0.03135833889245987, "rewards/margins": -2.5423834323883057, "rewards/rejected": 2.5110251903533936, "step": 8364 }, { "epoch": 1.36, "learning_rate": 4.975029278660791e-07, "logits/chosen": -0.9756861329078674, "logits/rejected": -0.9690565466880798, "logps/chosen": -100.42691040039062, "logps/rejected": -127.20475769042969, "loss": 0.3539, "rewards/accuracies": 1.0, "rewards/chosen": 1.7659400701522827, "rewards/margins": 0.014540791511535645, "rewards/rejected": 1.751399278640747, "step": 8365 }, { "epoch": 1.36, "learning_rate": 4.973715041973549e-07, "logits/chosen": -1.0101802349090576, "logits/rejected": -0.9189510345458984, "logps/chosen": -167.21182250976562, "logps/rejected": -39.51654052734375, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 2.352465867996216, "rewards/margins": 2.078371047973633, "rewards/rejected": 0.2740947902202606, "step": 8366 }, { "epoch": 1.36, "learning_rate": 4.97240080710235e-07, "logits/chosen": -0.5964706540107727, "logits/rejected": -0.5870970487594604, "logps/chosen": -62.992820739746094, "logps/rejected": -54.45103073120117, "loss": 0.6567, "rewards/accuracies": 0.0, "rewards/chosen": 0.6899658441543579, "rewards/margins": -0.6204227209091187, "rewards/rejected": 1.3103885650634766, "step": 8367 }, { "epoch": 1.36, "learning_rate": 4.971086574137993e-07, "logits/chosen": -0.5765132904052734, "logits/rejected": -0.5739948153495789, "logps/chosen": -95.53466033935547, "logps/rejected": -68.8902587890625, "loss": 0.5458, "rewards/accuracies": 0.0, "rewards/chosen": 0.4318962097167969, "rewards/margins": -0.6599282026290894, "rewards/rejected": 1.0918244123458862, "step": 8368 }, { "epoch": 1.36, "learning_rate": 4.96977234317128e-07, "logits/chosen": -0.8226577043533325, "logits/rejected": -0.9497368335723877, "logps/chosen": -175.30126953125, "logps/rejected": -177.367431640625, "loss": 0.6745, "rewards/accuracies": 0.0, "rewards/chosen": 3.79337477684021, "rewards/margins": -1.0434386730194092, "rewards/rejected": 4.836813449859619, "step": 8369 }, { "epoch": 1.36, "learning_rate": 4.968458114293013e-07, "logits/chosen": -0.6721422076225281, "logits/rejected": -0.6216471195220947, "logps/chosen": -94.42076110839844, "logps/rejected": -136.03427124023438, "loss": 1.2788, "rewards/accuracies": 0.0, "rewards/chosen": 1.5996971130371094, "rewards/margins": -2.4354472160339355, "rewards/rejected": 4.035144329071045, "step": 8370 }, { "epoch": 1.36, "learning_rate": 4.967143887593992e-07, "logits/chosen": -1.1315773725509644, "logits/rejected": -1.083276391029358, "logps/chosen": -110.55487060546875, "logps/rejected": -202.9544219970703, "loss": 1.1855, "rewards/accuracies": 0.0, "rewards/chosen": 3.3396224975585938, "rewards/margins": -2.0629305839538574, "rewards/rejected": 5.402553081512451, "step": 8371 }, { "epoch": 1.36, "learning_rate": 4.965829663165017e-07, "logits/chosen": -0.46784502267837524, "logits/rejected": -0.4815579950809479, "logps/chosen": -27.159866333007812, "logps/rejected": -67.83624267578125, "loss": 1.1382, "rewards/accuracies": 0.0, "rewards/chosen": 0.340436190366745, "rewards/margins": -0.1744968593120575, "rewards/rejected": 0.5149330496788025, "step": 8372 }, { "epoch": 1.36, "learning_rate": 4.964515441096889e-07, "logits/chosen": -0.7457366585731506, "logits/rejected": -0.7397211194038391, "logps/chosen": -123.40194702148438, "logps/rejected": -55.007137298583984, "loss": 0.0487, "rewards/accuracies": 1.0, "rewards/chosen": 2.939199924468994, "rewards/margins": 2.3456220626831055, "rewards/rejected": 0.5935779809951782, "step": 8373 }, { "epoch": 1.36, "learning_rate": 4.963201221480408e-07, "logits/chosen": -0.15258939564228058, "logits/rejected": -0.17547573149204254, "logps/chosen": -3.50866436958313, "logps/rejected": -26.43749237060547, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 0.2168518751859665, "rewards/margins": 0.3201367259025574, "rewards/rejected": -0.10328483581542969, "step": 8374 }, { "epoch": 1.36, "learning_rate": 4.961887004406374e-07, "logits/chosen": -0.7367877960205078, "logits/rejected": -0.6759874224662781, "logps/chosen": -67.73187255859375, "logps/rejected": -71.12895202636719, "loss": 0.2121, "rewards/accuracies": 1.0, "rewards/chosen": 4.046089172363281, "rewards/margins": 0.6471602916717529, "rewards/rejected": 3.3989288806915283, "step": 8375 }, { "epoch": 1.36, "learning_rate": 4.960572789965588e-07, "logits/chosen": -0.6392231583595276, "logits/rejected": -0.6129885315895081, "logps/chosen": -67.29156494140625, "logps/rejected": -80.07785034179688, "loss": 0.5001, "rewards/accuracies": 0.0, "rewards/chosen": 2.241870164871216, "rewards/margins": -0.2671928405761719, "rewards/rejected": 2.5090630054473877, "step": 8376 }, { "epoch": 1.36, "learning_rate": 4.959258578248849e-07, "logits/chosen": -0.8622018694877625, "logits/rejected": -0.8558535575866699, "logps/chosen": -34.52000045776367, "logps/rejected": -115.58151245117188, "loss": 0.0708, "rewards/accuracies": 1.0, "rewards/chosen": 1.1956814527511597, "rewards/margins": 2.021428346633911, "rewards/rejected": -0.8257469534873962, "step": 8377 }, { "epoch": 1.36, "learning_rate": 4.957944369346957e-07, "logits/chosen": -0.8926846981048584, "logits/rejected": -0.8788288235664368, "logps/chosen": -115.11869812011719, "logps/rejected": -83.6864013671875, "loss": 2.9878, "rewards/accuracies": 0.0, "rewards/chosen": 0.9545181393623352, "rewards/margins": -2.25639271736145, "rewards/rejected": 3.2109107971191406, "step": 8378 }, { "epoch": 1.36, "learning_rate": 4.956630163350712e-07, "logits/chosen": -0.7184848785400391, "logits/rejected": -0.7126302123069763, "logps/chosen": -72.50033569335938, "logps/rejected": -60.67682647705078, "loss": 1.0333, "rewards/accuracies": 0.0, "rewards/chosen": 1.1678284406661987, "rewards/margins": -1.9225105047225952, "rewards/rejected": 3.090338945388794, "step": 8379 }, { "epoch": 1.36, "learning_rate": 4.95531596035091e-07, "logits/chosen": -0.6135056614875793, "logits/rejected": -0.6179962754249573, "logps/chosen": -3.0106565952301025, "logps/rejected": -10.629022598266602, "loss": 0.3544, "rewards/accuracies": 0.0, "rewards/chosen": 0.10044290870428085, "rewards/margins": -0.004976153373718262, "rewards/rejected": 0.10541906207799911, "step": 8380 }, { "epoch": 1.36, "learning_rate": 4.954001760438354e-07, "logits/chosen": -0.779459536075592, "logits/rejected": -0.7791004180908203, "logps/chosen": -88.91152954101562, "logps/rejected": -100.9559326171875, "loss": 0.8735, "rewards/accuracies": 0.0, "rewards/chosen": 1.1910942792892456, "rewards/margins": -1.5229071378707886, "rewards/rejected": 2.714001417160034, "step": 8381 }, { "epoch": 1.36, "learning_rate": 4.952687563703841e-07, "logits/chosen": -0.7532634139060974, "logits/rejected": -0.7822019457817078, "logps/chosen": -13.12660026550293, "logps/rejected": -62.766014099121094, "loss": 1.723, "rewards/accuracies": 0.0, "rewards/chosen": 0.30845680832862854, "rewards/margins": -0.021917343139648438, "rewards/rejected": 0.330374151468277, "step": 8382 }, { "epoch": 1.36, "learning_rate": 4.951373370238169e-07, "logits/chosen": -0.28559157252311707, "logits/rejected": -0.3402385115623474, "logps/chosen": -82.11870574951172, "logps/rejected": -63.1217041015625, "loss": 0.4451, "rewards/accuracies": 0.0, "rewards/chosen": 1.7735321521759033, "rewards/margins": -0.2883460521697998, "rewards/rejected": 2.061878204345703, "step": 8383 }, { "epoch": 1.36, "learning_rate": 4.950059180132138e-07, "logits/chosen": -0.7662692070007324, "logits/rejected": -0.743040144443512, "logps/chosen": -130.7191925048828, "logps/rejected": -126.7735595703125, "loss": 0.5109, "rewards/accuracies": 1.0, "rewards/chosen": 5.414746284484863, "rewards/margins": 0.0901336669921875, "rewards/rejected": 5.324612617492676, "step": 8384 }, { "epoch": 1.36, "learning_rate": 4.948744993476545e-07, "logits/chosen": -0.41115275025367737, "logits/rejected": -0.2759615480899811, "logps/chosen": -101.2398681640625, "logps/rejected": -1.0160465240478516, "loss": 0.4809, "rewards/accuracies": 0.0, "rewards/chosen": 0.14917679131031036, "rewards/margins": -0.40103578567504883, "rewards/rejected": 0.550212562084198, "step": 8385 }, { "epoch": 1.36, "learning_rate": 4.947430810362187e-07, "logits/chosen": -0.5375238060951233, "logits/rejected": -0.5649014115333557, "logps/chosen": -60.38469696044922, "logps/rejected": -108.21743774414062, "loss": 1.2683, "rewards/accuracies": 0.0, "rewards/chosen": 0.8902503848075867, "rewards/margins": -1.9631431102752686, "rewards/rejected": 2.8533935546875, "step": 8386 }, { "epoch": 1.36, "learning_rate": 4.946116630879863e-07, "logits/chosen": -0.7901363372802734, "logits/rejected": -0.46163925528526306, "logps/chosen": -120.5051040649414, "logps/rejected": -60.053497314453125, "loss": 0.6846, "rewards/accuracies": 0.0, "rewards/chosen": 3.225527286529541, "rewards/margins": -0.2741560935974121, "rewards/rejected": 3.499683380126953, "step": 8387 }, { "epoch": 1.36, "learning_rate": 4.94480245512037e-07, "logits/chosen": -0.9079427123069763, "logits/rejected": -0.8918657898902893, "logps/chosen": -123.8143310546875, "logps/rejected": -164.69998168945312, "loss": 0.3491, "rewards/accuracies": 1.0, "rewards/chosen": 3.752612352371216, "rewards/margins": 0.7296905517578125, "rewards/rejected": 3.0229218006134033, "step": 8388 }, { "epoch": 1.36, "learning_rate": 4.943488283174505e-07, "logits/chosen": -0.8249257206916809, "logits/rejected": -0.5888382196426392, "logps/chosen": -104.05268859863281, "logps/rejected": -71.79789733886719, "loss": 0.0852, "rewards/accuracies": 1.0, "rewards/chosen": 3.3434417247772217, "rewards/margins": 1.837467908859253, "rewards/rejected": 1.5059738159179688, "step": 8389 }, { "epoch": 1.36, "learning_rate": 4.942174115133065e-07, "logits/chosen": -0.8692967891693115, "logits/rejected": -0.8705955147743225, "logps/chosen": -126.08680725097656, "logps/rejected": -86.4036865234375, "loss": 1.0737, "rewards/accuracies": 0.0, "rewards/chosen": 2.5913193225860596, "rewards/margins": -1.9430434703826904, "rewards/rejected": 4.53436279296875, "step": 8390 }, { "epoch": 1.36, "learning_rate": 4.940859951086847e-07, "logits/chosen": -0.49064886569976807, "logits/rejected": -0.23367194831371307, "logps/chosen": -93.9267578125, "logps/rejected": -23.06905746459961, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 3.6832306385040283, "rewards/margins": 3.458470582962036, "rewards/rejected": 0.2247600555419922, "step": 8391 }, { "epoch": 1.36, "learning_rate": 4.939545791126646e-07, "logits/chosen": -0.7126710414886475, "logits/rejected": -0.5981096029281616, "logps/chosen": -60.285606384277344, "logps/rejected": -21.0555362701416, "loss": 0.3321, "rewards/accuracies": 1.0, "rewards/chosen": 2.3352067470550537, "rewards/margins": 1.711092233657837, "rewards/rejected": 0.624114453792572, "step": 8392 }, { "epoch": 1.36, "learning_rate": 4.93823163534326e-07, "logits/chosen": -0.7848271727561951, "logits/rejected": -0.7041359543800354, "logps/chosen": -44.49688720703125, "logps/rejected": -52.45092010498047, "loss": 0.7248, "rewards/accuracies": 1.0, "rewards/chosen": 2.4797990322113037, "rewards/margins": 0.4089226722717285, "rewards/rejected": 2.070876359939575, "step": 8393 }, { "epoch": 1.36, "learning_rate": 4.936917483827482e-07, "logits/chosen": -0.38802292943000793, "logits/rejected": -0.38802292943000793, "logps/chosen": -71.93600463867188, "logps/rejected": -71.93600463867188, "loss": 0.6583, "rewards/accuracies": 0.0, "rewards/chosen": 1.834564208984375, "rewards/margins": 0.0, "rewards/rejected": 1.834564208984375, "step": 8394 }, { "epoch": 1.36, "learning_rate": 4.935603336670109e-07, "logits/chosen": -0.25415799021720886, "logits/rejected": -0.25415799021720886, "logps/chosen": -26.87029266357422, "logps/rejected": -26.87029266357422, "loss": 0.6357, "rewards/accuracies": 0.0, "rewards/chosen": 0.46206170320510864, "rewards/margins": 0.0, "rewards/rejected": 0.46206170320510864, "step": 8395 }, { "epoch": 1.36, "learning_rate": 4.934289193961937e-07, "logits/chosen": -0.8544391393661499, "logits/rejected": -0.7072560787200928, "logps/chosen": -63.988677978515625, "logps/rejected": -18.32632064819336, "loss": 0.0965, "rewards/accuracies": 1.0, "rewards/chosen": 3.5573883056640625, "rewards/margins": 3.1234264373779297, "rewards/rejected": 0.4339618682861328, "step": 8396 }, { "epoch": 1.36, "learning_rate": 4.93297505579376e-07, "logits/chosen": -0.8784631490707397, "logits/rejected": -0.6190686225891113, "logps/chosen": -171.51071166992188, "logps/rejected": -75.24020385742188, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 3.6853790283203125, "rewards/margins": 0.4130065441131592, "rewards/rejected": 3.2723724842071533, "step": 8397 }, { "epoch": 1.36, "learning_rate": 4.931660922256372e-07, "logits/chosen": -0.5634251832962036, "logits/rejected": -0.4937274754047394, "logps/chosen": -56.538551330566406, "logps/rejected": -60.00275421142578, "loss": 0.3621, "rewards/accuracies": 1.0, "rewards/chosen": 2.3281121253967285, "rewards/margins": 1.81622314453125, "rewards/rejected": 0.5118889212608337, "step": 8398 }, { "epoch": 1.36, "learning_rate": 4.930346793440568e-07, "logits/chosen": -0.2434389442205429, "logits/rejected": -0.1576778143644333, "logps/chosen": -50.45220184326172, "logps/rejected": -45.917015075683594, "loss": 0.9152, "rewards/accuracies": 1.0, "rewards/chosen": 1.5059417486190796, "rewards/margins": 0.13482356071472168, "rewards/rejected": 1.371118187904358, "step": 8399 }, { "epoch": 1.36, "learning_rate": 4.929032669437142e-07, "logits/chosen": -0.9053986072540283, "logits/rejected": -0.8443319201469421, "logps/chosen": -50.086273193359375, "logps/rejected": -20.865354537963867, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": 2.1548097133636475, "rewards/margins": 0.8412612676620483, "rewards/rejected": 1.3135484457015991, "step": 8400 }, { "epoch": 1.36, "learning_rate": 4.927718550336887e-07, "logits/chosen": -0.6612065434455872, "logits/rejected": -0.6612065434455872, "logps/chosen": -62.943058013916016, "logps/rejected": -62.943058013916016, "loss": 0.3813, "rewards/accuracies": 0.0, "rewards/chosen": 2.511385679244995, "rewards/margins": 0.0, "rewards/rejected": 2.511385679244995, "step": 8401 }, { "epoch": 1.36, "learning_rate": 4.926404436230596e-07, "logits/chosen": -0.645566999912262, "logits/rejected": -0.5126590132713318, "logps/chosen": -62.86579895019531, "logps/rejected": -18.38709259033203, "loss": 0.2853, "rewards/accuracies": 1.0, "rewards/chosen": 1.496630072593689, "rewards/margins": 1.1966944932937622, "rewards/rejected": 0.29993554949760437, "step": 8402 }, { "epoch": 1.36, "learning_rate": 4.925090327209061e-07, "logits/chosen": -0.9256680607795715, "logits/rejected": -0.8176817893981934, "logps/chosen": -114.71632385253906, "logps/rejected": -67.21102905273438, "loss": 0.2154, "rewards/accuracies": 1.0, "rewards/chosen": 4.674388408660889, "rewards/margins": 1.7300066947937012, "rewards/rejected": 2.9443817138671875, "step": 8403 }, { "epoch": 1.36, "learning_rate": 4.923776223363076e-07, "logits/chosen": -1.2859846353530884, "logits/rejected": -1.2818070650100708, "logps/chosen": -100.02650451660156, "logps/rejected": -122.09513854980469, "loss": 1.7636, "rewards/accuracies": 0.0, "rewards/chosen": 1.1340179443359375, "rewards/margins": -3.250807285308838, "rewards/rejected": 4.384825229644775, "step": 8404 }, { "epoch": 1.36, "learning_rate": 4.922462124783433e-07, "logits/chosen": -0.8174017667770386, "logits/rejected": -0.8076527714729309, "logps/chosen": -10.604793548583984, "logps/rejected": -3.7036890983581543, "loss": 0.3858, "rewards/accuracies": 0.0, "rewards/chosen": 0.011883926577866077, "rewards/margins": -0.08805479854345322, "rewards/rejected": 0.09993872791528702, "step": 8405 }, { "epoch": 1.36, "learning_rate": 4.921148031560924e-07, "logits/chosen": -0.8485659956932068, "logits/rejected": -0.7595070600509644, "logps/chosen": -89.96692657470703, "logps/rejected": -31.750619888305664, "loss": 1.3737, "rewards/accuracies": 1.0, "rewards/chosen": 1.312334418296814, "rewards/margins": 1.4328168630599976, "rewards/rejected": -0.1204824447631836, "step": 8406 }, { "epoch": 1.36, "learning_rate": 4.919833943786339e-07, "logits/chosen": -0.7965940833091736, "logits/rejected": -0.6522665023803711, "logps/chosen": -138.61517333984375, "logps/rejected": -53.851829528808594, "loss": 0.0375, "rewards/accuracies": 1.0, "rewards/chosen": 5.360558986663818, "rewards/margins": 3.4077606201171875, "rewards/rejected": 1.9527984857559204, "step": 8407 }, { "epoch": 1.36, "learning_rate": 4.918519861550471e-07, "logits/chosen": -0.9377873539924622, "logits/rejected": -0.9635263681411743, "logps/chosen": -269.0860290527344, "logps/rejected": -67.3394775390625, "loss": 1.9935, "rewards/accuracies": 1.0, "rewards/chosen": 4.826587200164795, "rewards/margins": 0.821418285369873, "rewards/rejected": 4.005168914794922, "step": 8408 }, { "epoch": 1.36, "learning_rate": 4.917205784944108e-07, "logits/chosen": -0.5235751271247864, "logits/rejected": -0.5450128316879272, "logps/chosen": -118.4128646850586, "logps/rejected": -81.58355712890625, "loss": 0.4299, "rewards/accuracies": 0.0, "rewards/chosen": 0.8959831595420837, "rewards/margins": -0.0633651614189148, "rewards/rejected": 0.9593483209609985, "step": 8409 }, { "epoch": 1.37, "learning_rate": 4.915891714058043e-07, "logits/chosen": -0.6952618956565857, "logits/rejected": -0.6966686248779297, "logps/chosen": -128.08090209960938, "logps/rejected": -57.19901657104492, "loss": 0.562, "rewards/accuracies": 0.0, "rewards/chosen": 1.3418549299240112, "rewards/margins": -0.6270275115966797, "rewards/rejected": 1.968882441520691, "step": 8410 }, { "epoch": 1.37, "learning_rate": 4.914577648983065e-07, "logits/chosen": -0.6374272704124451, "logits/rejected": -0.6374272704124451, "logps/chosen": -47.958099365234375, "logps/rejected": -47.958099365234375, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": 2.4081878662109375, "rewards/margins": 0.0, "rewards/rejected": 2.4081878662109375, "step": 8411 }, { "epoch": 1.37, "learning_rate": 4.913263589809963e-07, "logits/chosen": -0.8504832983016968, "logits/rejected": -0.9125262498855591, "logps/chosen": -163.37908935546875, "logps/rejected": -130.14956665039062, "loss": 0.9818, "rewards/accuracies": 0.0, "rewards/chosen": 2.792471408843994, "rewards/margins": -1.7095489501953125, "rewards/rejected": 4.502020359039307, "step": 8412 }, { "epoch": 1.37, "learning_rate": 4.911949536629525e-07, "logits/chosen": -0.9408074617385864, "logits/rejected": -0.9613085389137268, "logps/chosen": -76.07787322998047, "logps/rejected": -137.76519775390625, "loss": 0.8614, "rewards/accuracies": 0.0, "rewards/chosen": 1.771826982498169, "rewards/margins": -0.5109825134277344, "rewards/rejected": 2.2828094959259033, "step": 8413 }, { "epoch": 1.37, "learning_rate": 4.910635489532542e-07, "logits/chosen": -0.28975409269332886, "logits/rejected": -0.29886800050735474, "logps/chosen": -50.95856857299805, "logps/rejected": -45.841087341308594, "loss": 0.6262, "rewards/accuracies": 0.0, "rewards/chosen": 1.2166500091552734, "rewards/margins": -0.9112133979797363, "rewards/rejected": 2.1278634071350098, "step": 8414 }, { "epoch": 1.37, "learning_rate": 4.909321448609801e-07, "logits/chosen": -0.6650012731552124, "logits/rejected": -0.662765622138977, "logps/chosen": -58.98418426513672, "logps/rejected": -63.342506408691406, "loss": 0.6253, "rewards/accuracies": 1.0, "rewards/chosen": 1.7103630304336548, "rewards/margins": 0.2929474115371704, "rewards/rejected": 1.4174156188964844, "step": 8415 }, { "epoch": 1.37, "learning_rate": 4.90800741395209e-07, "logits/chosen": -0.6552445292472839, "logits/rejected": -0.6700395345687866, "logps/chosen": -46.469749450683594, "logps/rejected": -106.50360107421875, "loss": 1.0374, "rewards/accuracies": 1.0, "rewards/chosen": 1.7394790649414062, "rewards/margins": 0.40519940853118896, "rewards/rejected": 1.3342796564102173, "step": 8416 }, { "epoch": 1.37, "learning_rate": 4.906693385650195e-07, "logits/chosen": -0.4701651334762573, "logits/rejected": -0.5180222392082214, "logps/chosen": -5.41760778427124, "logps/rejected": -48.15277099609375, "loss": 0.672, "rewards/accuracies": 0.0, "rewards/chosen": 0.41125941276550293, "rewards/margins": -0.5251545310020447, "rewards/rejected": 0.9364139437675476, "step": 8417 }, { "epoch": 1.37, "learning_rate": 4.905379363794906e-07, "logits/chosen": -1.2202759981155396, "logits/rejected": -1.035595417022705, "logps/chosen": -109.46587371826172, "logps/rejected": -23.971351623535156, "loss": 0.0149, "rewards/accuracies": 1.0, "rewards/chosen": 5.310950756072998, "rewards/margins": 5.006821155548096, "rewards/rejected": 0.30412960052490234, "step": 8418 }, { "epoch": 1.37, "learning_rate": 4.904065348477007e-07, "logits/chosen": -0.5834429860115051, "logits/rejected": -0.5071344375610352, "logps/chosen": -56.833213806152344, "logps/rejected": -40.88674545288086, "loss": 0.3124, "rewards/accuracies": 1.0, "rewards/chosen": 2.0844674110412598, "rewards/margins": 0.4032829999923706, "rewards/rejected": 1.6811844110488892, "step": 8419 }, { "epoch": 1.37, "learning_rate": 4.902751339787284e-07, "logits/chosen": -0.5942502617835999, "logits/rejected": -0.611316442489624, "logps/chosen": -74.08538818359375, "logps/rejected": -111.277587890625, "loss": 0.2152, "rewards/accuracies": 1.0, "rewards/chosen": 3.46415114402771, "rewards/margins": 0.9390931129455566, "rewards/rejected": 2.5250580310821533, "step": 8420 }, { "epoch": 1.37, "learning_rate": 4.901437337816523e-07, "logits/chosen": -0.8045855164527893, "logits/rejected": -0.8985147476196289, "logps/chosen": -68.27693939208984, "logps/rejected": -151.69190979003906, "loss": 2.4294, "rewards/accuracies": 0.0, "rewards/chosen": 0.9263618588447571, "rewards/margins": -4.568230152130127, "rewards/rejected": 5.494592189788818, "step": 8421 }, { "epoch": 1.37, "learning_rate": 4.900123342655511e-07, "logits/chosen": -0.6244114637374878, "logits/rejected": -0.6382185220718384, "logps/chosen": -63.35200500488281, "logps/rejected": -96.73116302490234, "loss": 1.7361, "rewards/accuracies": 0.0, "rewards/chosen": 0.28358229994773865, "rewards/margins": -0.5918244123458862, "rewards/rejected": 0.8754066824913025, "step": 8422 }, { "epoch": 1.37, "learning_rate": 4.89880935439503e-07, "logits/chosen": -0.5276436805725098, "logits/rejected": -0.5881451368331909, "logps/chosen": -113.22525787353516, "logps/rejected": -104.90525817871094, "loss": 2.2053, "rewards/accuracies": 0.0, "rewards/chosen": 1.5143593549728394, "rewards/margins": -4.235275745391846, "rewards/rejected": 5.749635219573975, "step": 8423 }, { "epoch": 1.37, "learning_rate": 4.897495373125865e-07, "logits/chosen": -1.1408958435058594, "logits/rejected": -1.1730796098709106, "logps/chosen": -124.56980895996094, "logps/rejected": -52.665042877197266, "loss": 0.9181, "rewards/accuracies": 1.0, "rewards/chosen": 1.8465255498886108, "rewards/margins": 0.05044126510620117, "rewards/rejected": 1.7960842847824097, "step": 8424 }, { "epoch": 1.37, "learning_rate": 4.8961813989388e-07, "logits/chosen": -1.1254725456237793, "logits/rejected": -1.117749810218811, "logps/chosen": -132.631103515625, "logps/rejected": -112.66264343261719, "loss": 0.8187, "rewards/accuracies": 0.0, "rewards/chosen": 0.6589813232421875, "rewards/margins": -0.35448455810546875, "rewards/rejected": 1.0134658813476562, "step": 8425 }, { "epoch": 1.37, "learning_rate": 4.894867431924618e-07, "logits/chosen": -0.5905885100364685, "logits/rejected": -0.35724154114723206, "logps/chosen": -158.06179809570312, "logps/rejected": -83.89971923828125, "loss": 0.3533, "rewards/accuracies": 1.0, "rewards/chosen": 3.5040009021759033, "rewards/margins": 1.3012313842773438, "rewards/rejected": 2.2027695178985596, "step": 8426 }, { "epoch": 1.37, "learning_rate": 4.893553472174102e-07, "logits/chosen": -0.7422720193862915, "logits/rejected": -0.7477025985717773, "logps/chosen": -79.87513732910156, "logps/rejected": -58.07300567626953, "loss": 0.7324, "rewards/accuracies": 0.0, "rewards/chosen": 1.2257355451583862, "rewards/margins": -1.1677871942520142, "rewards/rejected": 2.3935227394104004, "step": 8427 }, { "epoch": 1.37, "learning_rate": 4.892239519778034e-07, "logits/chosen": -0.8122910261154175, "logits/rejected": -0.7318617105484009, "logps/chosen": -65.08020782470703, "logps/rejected": -62.98807144165039, "loss": 0.5573, "rewards/accuracies": 0.0, "rewards/chosen": 0.7398216128349304, "rewards/margins": -0.0006519556045532227, "rewards/rejected": 0.7404735684394836, "step": 8428 }, { "epoch": 1.37, "learning_rate": 4.890925574827194e-07, "logits/chosen": -0.3600897192955017, "logits/rejected": -0.337108314037323, "logps/chosen": -13.68429946899414, "logps/rejected": -21.447032928466797, "loss": 0.7143, "rewards/accuracies": 1.0, "rewards/chosen": 0.8212777972221375, "rewards/margins": 0.48500344157218933, "rewards/rejected": 0.3362743556499481, "step": 8429 }, { "epoch": 1.37, "learning_rate": 4.889611637412367e-07, "logits/chosen": -0.7013195753097534, "logits/rejected": -0.5940704345703125, "logps/chosen": -102.6151123046875, "logps/rejected": -81.89789581298828, "loss": 0.4153, "rewards/accuracies": 0.0, "rewards/chosen": 1.214837670326233, "rewards/margins": -0.20631635189056396, "rewards/rejected": 1.4211540222167969, "step": 8430 }, { "epoch": 1.37, "learning_rate": 4.88829770762433e-07, "logits/chosen": -0.8411160707473755, "logits/rejected": -0.9031018018722534, "logps/chosen": -112.68125915527344, "logps/rejected": -126.42555236816406, "loss": 2.9179, "rewards/accuracies": 0.0, "rewards/chosen": 1.0033706426620483, "rewards/margins": -0.26479196548461914, "rewards/rejected": 1.2681626081466675, "step": 8431 }, { "epoch": 1.37, "learning_rate": 4.886983785553864e-07, "logits/chosen": -0.7065481543540955, "logits/rejected": -0.6976194977760315, "logps/chosen": -91.20252990722656, "logps/rejected": -48.83745193481445, "loss": 0.2407, "rewards/accuracies": 1.0, "rewards/chosen": 2.242199659347534, "rewards/margins": 0.9500285387039185, "rewards/rejected": 1.2921711206436157, "step": 8432 }, { "epoch": 1.37, "learning_rate": 4.88566987129175e-07, "logits/chosen": -0.8250060677528381, "logits/rejected": -0.7692179083824158, "logps/chosen": -91.3255844116211, "logps/rejected": -60.25180435180664, "loss": 0.4674, "rewards/accuracies": 0.0, "rewards/chosen": 0.5459205508232117, "rewards/margins": -0.39337581396102905, "rewards/rejected": 0.9392963647842407, "step": 8433 }, { "epoch": 1.37, "learning_rate": 4.884355964928766e-07, "logits/chosen": -0.9038053750991821, "logits/rejected": -0.6515887379646301, "logps/chosen": -172.00436401367188, "logps/rejected": -24.114355087280273, "loss": 0.075, "rewards/accuracies": 1.0, "rewards/chosen": 5.226873874664307, "rewards/margins": 4.871396541595459, "rewards/rejected": 0.35547715425491333, "step": 8434 }, { "epoch": 1.37, "learning_rate": 4.88304206655569e-07, "logits/chosen": -0.6841174960136414, "logits/rejected": -0.49645116925239563, "logps/chosen": -129.73599243164062, "logps/rejected": -52.421173095703125, "loss": 0.3297, "rewards/accuracies": 1.0, "rewards/chosen": 5.003476142883301, "rewards/margins": 2.050865411758423, "rewards/rejected": 2.952610731124878, "step": 8435 }, { "epoch": 1.37, "learning_rate": 4.881728176263302e-07, "logits/chosen": -0.6921533346176147, "logits/rejected": -0.722673773765564, "logps/chosen": -63.047027587890625, "logps/rejected": -87.19354248046875, "loss": 0.8028, "rewards/accuracies": 0.0, "rewards/chosen": 0.8989952206611633, "rewards/margins": -0.6143631339073181, "rewards/rejected": 1.5133583545684814, "step": 8436 }, { "epoch": 1.37, "learning_rate": 4.880414294142376e-07, "logits/chosen": -0.45784276723861694, "logits/rejected": -0.46560636162757874, "logps/chosen": -68.79725646972656, "logps/rejected": -58.00454330444336, "loss": 0.7596, "rewards/accuracies": 0.0, "rewards/chosen": 1.9571975469589233, "rewards/margins": -0.2503713369369507, "rewards/rejected": 2.207568883895874, "step": 8437 }, { "epoch": 1.37, "learning_rate": 4.879100420283692e-07, "logits/chosen": -0.3018365800380707, "logits/rejected": -0.3018365800380707, "logps/chosen": -36.549102783203125, "logps/rejected": -36.549102783203125, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 0.3721511960029602, "rewards/margins": 0.0, "rewards/rejected": 0.3721511960029602, "step": 8438 }, { "epoch": 1.37, "learning_rate": 4.877786554778025e-07, "logits/chosen": -1.298109769821167, "logits/rejected": -1.2968131303787231, "logps/chosen": -71.26810455322266, "logps/rejected": -120.3712387084961, "loss": 1.2438, "rewards/accuracies": 0.0, "rewards/chosen": 1.0261436700820923, "rewards/margins": -1.726582407951355, "rewards/rejected": 2.7527260780334473, "step": 8439 }, { "epoch": 1.37, "learning_rate": 4.87647269771615e-07, "logits/chosen": -0.8922834992408752, "logits/rejected": -0.8512526154518127, "logps/chosen": -67.5748291015625, "logps/rejected": -239.09182739257812, "loss": 2.3672, "rewards/accuracies": 0.0, "rewards/chosen": 2.934856414794922, "rewards/margins": -4.2216315269470215, "rewards/rejected": 7.156487941741943, "step": 8440 }, { "epoch": 1.37, "learning_rate": 4.875158849188843e-07, "logits/chosen": -0.4177551865577698, "logits/rejected": -0.47162315249443054, "logps/chosen": -271.86822509765625, "logps/rejected": -146.1234130859375, "loss": 0.7935, "rewards/accuracies": 0.0, "rewards/chosen": 5.01942777633667, "rewards/margins": -0.47791099548339844, "rewards/rejected": 5.497338771820068, "step": 8441 }, { "epoch": 1.37, "learning_rate": 4.873845009286879e-07, "logits/chosen": -0.6982647180557251, "logits/rejected": -0.7614728212356567, "logps/chosen": -99.20697021484375, "logps/rejected": -70.63512420654297, "loss": 1.3054, "rewards/accuracies": 0.0, "rewards/chosen": 1.4273582696914673, "rewards/margins": -0.28011929988861084, "rewards/rejected": 1.7074775695800781, "step": 8442 }, { "epoch": 1.37, "learning_rate": 4.87253117810103e-07, "logits/chosen": -0.8163173198699951, "logits/rejected": -0.8163173198699951, "logps/chosen": -96.39739227294922, "logps/rejected": -96.39739227294922, "loss": 0.3802, "rewards/accuracies": 0.0, "rewards/chosen": 2.575702667236328, "rewards/margins": 0.0, "rewards/rejected": 2.575702667236328, "step": 8443 }, { "epoch": 1.37, "learning_rate": 4.87121735572207e-07, "logits/chosen": -0.5934560298919678, "logits/rejected": -0.6094368696212769, "logps/chosen": -44.3641357421875, "logps/rejected": -55.00292205810547, "loss": 1.3252, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630210995674133, "rewards/margins": 0.34906232357025146, "rewards/rejected": 0.6139587759971619, "step": 8444 }, { "epoch": 1.37, "learning_rate": 4.869903542240774e-07, "logits/chosen": -0.3916980028152466, "logits/rejected": -0.3916980028152466, "logps/chosen": -63.28486633300781, "logps/rejected": -63.28486633300781, "loss": 0.5319, "rewards/accuracies": 0.0, "rewards/chosen": 1.1800063848495483, "rewards/margins": 0.0, "rewards/rejected": 1.1800063848495483, "step": 8445 }, { "epoch": 1.37, "learning_rate": 4.868589737747911e-07, "logits/chosen": -0.2651766240596771, "logits/rejected": -0.2651766240596771, "logps/chosen": -0.8242524266242981, "logps/rejected": -0.8242524266242981, "loss": 0.3687, "rewards/accuracies": 0.0, "rewards/chosen": 0.16494135558605194, "rewards/margins": 0.0, "rewards/rejected": 0.16494135558605194, "step": 8446 }, { "epoch": 1.37, "learning_rate": 4.867275942334254e-07, "logits/chosen": -0.6691746115684509, "logits/rejected": -0.747897207736969, "logps/chosen": -45.62138748168945, "logps/rejected": -140.8491668701172, "loss": 2.8641, "rewards/accuracies": 0.0, "rewards/chosen": 0.9370312094688416, "rewards/margins": -3.654581069946289, "rewards/rejected": 4.591612339019775, "step": 8447 }, { "epoch": 1.37, "learning_rate": 4.865962156090574e-07, "logits/chosen": -0.46075645089149475, "logits/rejected": -0.49253419041633606, "logps/chosen": -63.808433532714844, "logps/rejected": -52.72075653076172, "loss": 1.2884, "rewards/accuracies": 0.0, "rewards/chosen": 1.2439545392990112, "rewards/margins": -1.1545096635818481, "rewards/rejected": 2.3984642028808594, "step": 8448 }, { "epoch": 1.37, "learning_rate": 4.864648379107641e-07, "logits/chosen": -0.6687163710594177, "logits/rejected": -0.5880607962608337, "logps/chosen": -97.53449249267578, "logps/rejected": -81.38618469238281, "loss": 0.545, "rewards/accuracies": 0.0, "rewards/chosen": 1.0786354541778564, "rewards/margins": -0.03383791446685791, "rewards/rejected": 1.1124733686447144, "step": 8449 }, { "epoch": 1.37, "learning_rate": 4.863334611476223e-07, "logits/chosen": -0.9207768440246582, "logits/rejected": -0.7930148839950562, "logps/chosen": -168.12847900390625, "logps/rejected": -63.47856140136719, "loss": 0.0444, "rewards/accuracies": 1.0, "rewards/chosen": 5.6095428466796875, "rewards/margins": 2.51593017578125, "rewards/rejected": 3.0936126708984375, "step": 8450 }, { "epoch": 1.37, "learning_rate": 4.862020853287091e-07, "logits/chosen": -0.5944327712059021, "logits/rejected": -0.6217813491821289, "logps/chosen": -72.42579650878906, "logps/rejected": -82.25865173339844, "loss": 0.5627, "rewards/accuracies": 0.0, "rewards/chosen": 2.0218186378479004, "rewards/margins": -0.5823180675506592, "rewards/rejected": 2.6041367053985596, "step": 8451 }, { "epoch": 1.37, "learning_rate": 4.860707104631012e-07, "logits/chosen": -0.2221221774816513, "logits/rejected": -0.3852826654911041, "logps/chosen": -56.250797271728516, "logps/rejected": -89.67106628417969, "loss": 1.2573, "rewards/accuracies": 0.0, "rewards/chosen": 1.3865588903427124, "rewards/margins": -1.779943585395813, "rewards/rejected": 3.1665024757385254, "step": 8452 }, { "epoch": 1.37, "learning_rate": 4.859393365598755e-07, "logits/chosen": -0.6176317930221558, "logits/rejected": -0.3884775638580322, "logps/chosen": -233.89767456054688, "logps/rejected": -52.91197967529297, "loss": 0.8545, "rewards/accuracies": 1.0, "rewards/chosen": 0.5733093619346619, "rewards/margins": 0.6094948053359985, "rewards/rejected": -0.036185454577207565, "step": 8453 }, { "epoch": 1.37, "learning_rate": 4.858079636281084e-07, "logits/chosen": -0.9380318522453308, "logits/rejected": -0.929531991481781, "logps/chosen": -39.922096252441406, "logps/rejected": -56.087913513183594, "loss": 0.3906, "rewards/accuracies": 1.0, "rewards/chosen": 1.4680465459823608, "rewards/margins": 0.4594680070877075, "rewards/rejected": 1.0085785388946533, "step": 8454 }, { "epoch": 1.37, "learning_rate": 4.85676591676877e-07, "logits/chosen": -0.8184688091278076, "logits/rejected": -0.7381396889686584, "logps/chosen": -122.96080017089844, "logps/rejected": -89.85771942138672, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 5.141859531402588, "rewards/margins": 2.270395040512085, "rewards/rejected": 2.871464490890503, "step": 8455 }, { "epoch": 1.37, "learning_rate": 4.855452207152573e-07, "logits/chosen": -0.6562842726707458, "logits/rejected": -0.41577091813087463, "logps/chosen": -108.62725830078125, "logps/rejected": -17.946720123291016, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 6.651358127593994, "rewards/margins": 6.218493938446045, "rewards/rejected": 0.43286439776420593, "step": 8456 }, { "epoch": 1.37, "learning_rate": 4.854138507523262e-07, "logits/chosen": -0.6681617498397827, "logits/rejected": -0.6375014185905457, "logps/chosen": -106.28659057617188, "logps/rejected": -80.86341857910156, "loss": 1.4983, "rewards/accuracies": 0.0, "rewards/chosen": -0.012867736630141735, "rewards/margins": -1.2345763444900513, "rewards/rejected": 1.2217086553573608, "step": 8457 }, { "epoch": 1.37, "learning_rate": 4.8528248179716e-07, "logits/chosen": -0.5570476055145264, "logits/rejected": -0.5638798475265503, "logps/chosen": -70.58442687988281, "logps/rejected": -53.112152099609375, "loss": 0.4139, "rewards/accuracies": 1.0, "rewards/chosen": 2.616628408432007, "rewards/margins": 0.5386009216308594, "rewards/rejected": 2.0780274868011475, "step": 8458 }, { "epoch": 1.37, "learning_rate": 4.851511138588351e-07, "logits/chosen": -0.4631745219230652, "logits/rejected": -0.44844502210617065, "logps/chosen": -42.631614685058594, "logps/rejected": -40.21563720703125, "loss": 0.638, "rewards/accuracies": 0.0, "rewards/chosen": 1.552316665649414, "rewards/margins": -0.8488521575927734, "rewards/rejected": 2.4011688232421875, "step": 8459 }, { "epoch": 1.37, "learning_rate": 4.850197469464277e-07, "logits/chosen": -1.335358738899231, "logits/rejected": -1.3724066019058228, "logps/chosen": -55.26852035522461, "logps/rejected": -29.7872371673584, "loss": 1.6791, "rewards/accuracies": 1.0, "rewards/chosen": 1.779684066772461, "rewards/margins": 0.7978143692016602, "rewards/rejected": 0.9818696975708008, "step": 8460 }, { "epoch": 1.37, "learning_rate": 4.848883810690141e-07, "logits/chosen": -0.672391414642334, "logits/rejected": -0.5047004222869873, "logps/chosen": -58.437461853027344, "logps/rejected": -26.828561782836914, "loss": 1.1851, "rewards/accuracies": 1.0, "rewards/chosen": 1.70512855052948, "rewards/margins": 1.642191767692566, "rewards/rejected": 0.06293678283691406, "step": 8461 }, { "epoch": 1.37, "learning_rate": 4.847570162356703e-07, "logits/chosen": -0.6943281292915344, "logits/rejected": -0.7238062024116516, "logps/chosen": -127.40738677978516, "logps/rejected": -76.75321960449219, "loss": 0.1307, "rewards/accuracies": 1.0, "rewards/chosen": 2.94587779045105, "rewards/margins": 1.477146029472351, "rewards/rejected": 1.4687317609786987, "step": 8462 }, { "epoch": 1.37, "learning_rate": 4.846256524554725e-07, "logits/chosen": -0.9167537689208984, "logits/rejected": -0.9069506525993347, "logps/chosen": -46.386959075927734, "logps/rejected": -109.83232879638672, "loss": 0.5809, "rewards/accuracies": 0.0, "rewards/chosen": 2.2907025814056396, "rewards/margins": -0.09369611740112305, "rewards/rejected": 2.3843986988067627, "step": 8463 }, { "epoch": 1.37, "learning_rate": 4.844942897374967e-07, "logits/chosen": -0.5041535496711731, "logits/rejected": -0.5777774453163147, "logps/chosen": -93.81916046142578, "logps/rejected": -91.72325134277344, "loss": 0.1008, "rewards/accuracies": 1.0, "rewards/chosen": 3.6420860290527344, "rewards/margins": 2.0125999450683594, "rewards/rejected": 1.629486083984375, "step": 8464 }, { "epoch": 1.37, "learning_rate": 4.843629280908186e-07, "logits/chosen": -0.7457154393196106, "logits/rejected": -0.5798009037971497, "logps/chosen": -89.85073852539062, "logps/rejected": -80.97525024414062, "loss": 0.0295, "rewards/accuracies": 1.0, "rewards/chosen": 5.199188232421875, "rewards/margins": 3.34527587890625, "rewards/rejected": 1.853912353515625, "step": 8465 }, { "epoch": 1.37, "learning_rate": 4.842315675245143e-07, "logits/chosen": -0.42024335265159607, "logits/rejected": -0.42024335265159607, "logps/chosen": -59.0948600769043, "logps/rejected": -59.0948600769043, "loss": 2.2138, "rewards/accuracies": 0.0, "rewards/chosen": 0.11092986911535263, "rewards/margins": 0.0, "rewards/rejected": 0.11092986911535263, "step": 8466 }, { "epoch": 1.37, "learning_rate": 4.841002080476594e-07, "logits/chosen": -0.8652697205543518, "logits/rejected": -0.9396371841430664, "logps/chosen": -47.16582489013672, "logps/rejected": -84.86901092529297, "loss": 2.4049, "rewards/accuracies": 0.0, "rewards/chosen": 1.5112972259521484, "rewards/margins": -2.048557758331299, "rewards/rejected": 3.5598549842834473, "step": 8467 }, { "epoch": 1.37, "learning_rate": 4.839688496693298e-07, "logits/chosen": -0.822008490562439, "logits/rejected": -0.7729048132896423, "logps/chosen": -212.9897918701172, "logps/rejected": -43.76055908203125, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": 7.227330207824707, "rewards/margins": 6.06618070602417, "rewards/rejected": 1.1611496210098267, "step": 8468 }, { "epoch": 1.37, "learning_rate": 4.83837492398601e-07, "logits/chosen": -0.855100691318512, "logits/rejected": -0.7827075123786926, "logps/chosen": -142.51583862304688, "logps/rejected": -171.82916259765625, "loss": 2.0057, "rewards/accuracies": 0.0, "rewards/chosen": 4.249053955078125, "rewards/margins": -2.4649901390075684, "rewards/rejected": 6.714044094085693, "step": 8469 }, { "epoch": 1.37, "learning_rate": 4.837061362445484e-07, "logits/chosen": -0.5618956089019775, "logits/rejected": -0.5042139291763306, "logps/chosen": -63.44353485107422, "logps/rejected": -46.63166046142578, "loss": 0.1047, "rewards/accuracies": 1.0, "rewards/chosen": 2.586805820465088, "rewards/margins": 1.6123112440109253, "rewards/rejected": 0.9744945764541626, "step": 8470 }, { "epoch": 1.37, "learning_rate": 4.835747812162477e-07, "logits/chosen": -0.7959712147712708, "logits/rejected": -0.8906210660934448, "logps/chosen": -85.19564056396484, "logps/rejected": -135.7442626953125, "loss": 2.49, "rewards/accuracies": 0.0, "rewards/chosen": 1.2266441583633423, "rewards/margins": -1.9932137727737427, "rewards/rejected": 3.219857931137085, "step": 8471 }, { "epoch": 1.38, "learning_rate": 4.834434273227743e-07, "logits/chosen": -0.34774941205978394, "logits/rejected": -0.4251617193222046, "logps/chosen": -77.52653503417969, "logps/rejected": -144.56594848632812, "loss": 0.1162, "rewards/accuracies": 1.0, "rewards/chosen": 1.2873306274414062, "rewards/margins": 1.4262313842773438, "rewards/rejected": -0.1389007568359375, "step": 8472 }, { "epoch": 1.38, "learning_rate": 4.833120745732032e-07, "logits/chosen": -0.7767781019210815, "logits/rejected": -0.7133650183677673, "logps/chosen": -103.41223907470703, "logps/rejected": -114.26485443115234, "loss": 0.314, "rewards/accuracies": 1.0, "rewards/chosen": 2.2814903259277344, "rewards/margins": 0.22168421745300293, "rewards/rejected": 2.0598061084747314, "step": 8473 }, { "epoch": 1.38, "learning_rate": 4.831807229766101e-07, "logits/chosen": -0.8629299402236938, "logits/rejected": -0.9494196772575378, "logps/chosen": -125.3941650390625, "logps/rejected": -71.33485412597656, "loss": 0.712, "rewards/accuracies": 1.0, "rewards/chosen": 2.526144504547119, "rewards/margins": 0.8474847078323364, "rewards/rejected": 1.6786597967147827, "step": 8474 }, { "epoch": 1.38, "learning_rate": 4.830493725420697e-07, "logits/chosen": -0.621997058391571, "logits/rejected": -0.6827584505081177, "logps/chosen": -11.608080863952637, "logps/rejected": -90.93781280517578, "loss": 0.9596, "rewards/accuracies": 0.0, "rewards/chosen": 0.5683138966560364, "rewards/margins": -0.05740708112716675, "rewards/rejected": 0.6257209777832031, "step": 8475 }, { "epoch": 1.38, "learning_rate": 4.829180232786573e-07, "logits/chosen": -1.3243860006332397, "logits/rejected": -1.2679272890090942, "logps/chosen": -77.82423400878906, "logps/rejected": -149.60415649414062, "loss": 0.5525, "rewards/accuracies": 0.0, "rewards/chosen": 4.3863677978515625, "rewards/margins": -0.34934234619140625, "rewards/rejected": 4.735710144042969, "step": 8476 }, { "epoch": 1.38, "learning_rate": 4.827866751954479e-07, "logits/chosen": -0.9176648259162903, "logits/rejected": -0.9029027819633484, "logps/chosen": -178.74957275390625, "logps/rejected": -190.19683837890625, "loss": 0.4817, "rewards/accuracies": 0.0, "rewards/chosen": 5.203559875488281, "rewards/margins": -0.45982837677001953, "rewards/rejected": 5.663388252258301, "step": 8477 }, { "epoch": 1.38, "learning_rate": 4.826553283015164e-07, "logits/chosen": -0.6567736268043518, "logits/rejected": -0.5344641208648682, "logps/chosen": -142.03659057617188, "logps/rejected": -67.49585723876953, "loss": 0.6353, "rewards/accuracies": 0.0, "rewards/chosen": 1.6112884283065796, "rewards/margins": -0.4008995294570923, "rewards/rejected": 2.012187957763672, "step": 8478 }, { "epoch": 1.38, "learning_rate": 4.825239826059375e-07, "logits/chosen": -0.5040210485458374, "logits/rejected": -0.4518049955368042, "logps/chosen": -114.55015563964844, "logps/rejected": -56.95653533935547, "loss": 0.4956, "rewards/accuracies": 1.0, "rewards/chosen": 3.755824327468872, "rewards/margins": 1.3460662364959717, "rewards/rejected": 2.4097580909729004, "step": 8479 }, { "epoch": 1.38, "learning_rate": 4.823926381177861e-07, "logits/chosen": -0.2871828079223633, "logits/rejected": -0.2862454354763031, "logps/chosen": -100.57244873046875, "logps/rejected": -30.87516212463379, "loss": 1.0966, "rewards/accuracies": 0.0, "rewards/chosen": 0.6172394156455994, "rewards/margins": -0.9911275506019592, "rewards/rejected": 1.6083669662475586, "step": 8480 }, { "epoch": 1.38, "learning_rate": 4.822612948461369e-07, "logits/chosen": -0.7904881238937378, "logits/rejected": -0.7904881238937378, "logps/chosen": -39.433448791503906, "logps/rejected": -39.433448791503906, "loss": 0.3759, "rewards/accuracies": 0.0, "rewards/chosen": 1.6443318128585815, "rewards/margins": 0.0, "rewards/rejected": 1.6443318128585815, "step": 8481 }, { "epoch": 1.38, "learning_rate": 4.821299528000643e-07, "logits/chosen": -0.8155081272125244, "logits/rejected": -0.7165486812591553, "logps/chosen": -162.934326171875, "logps/rejected": -22.402324676513672, "loss": 1.0767, "rewards/accuracies": 0.0, "rewards/chosen": 0.366799920797348, "rewards/margins": -0.6024143695831299, "rewards/rejected": 0.9692142605781555, "step": 8482 }, { "epoch": 1.38, "learning_rate": 4.819986119886428e-07, "logits/chosen": -0.6346511244773865, "logits/rejected": -0.6734601259231567, "logps/chosen": -36.245697021484375, "logps/rejected": -25.91693878173828, "loss": 0.4621, "rewards/accuracies": 0.0, "rewards/chosen": 1.6010849475860596, "rewards/margins": -0.37307047843933105, "rewards/rejected": 1.9741554260253906, "step": 8483 }, { "epoch": 1.38, "learning_rate": 4.81867272420947e-07, "logits/chosen": -0.6655699610710144, "logits/rejected": -0.6834844946861267, "logps/chosen": -68.62149047851562, "logps/rejected": -72.0624771118164, "loss": 0.2817, "rewards/accuracies": 1.0, "rewards/chosen": 1.9900649785995483, "rewards/margins": 0.42086708545684814, "rewards/rejected": 1.5691978931427002, "step": 8484 }, { "epoch": 1.38, "learning_rate": 4.81735934106051e-07, "logits/chosen": -0.6262868642807007, "logits/rejected": -0.6288796663284302, "logps/chosen": -108.45838928222656, "logps/rejected": -46.23260498046875, "loss": 0.6304, "rewards/accuracies": 0.0, "rewards/chosen": 0.8416542410850525, "rewards/margins": -0.720690906047821, "rewards/rejected": 1.5623451471328735, "step": 8485 }, { "epoch": 1.38, "learning_rate": 4.816045970530292e-07, "logits/chosen": -0.7498264908790588, "logits/rejected": -0.7823951244354248, "logps/chosen": -252.9966278076172, "logps/rejected": -96.94844055175781, "loss": 0.0809, "rewards/accuracies": 1.0, "rewards/chosen": 3.9014358520507812, "rewards/margins": 1.9180022478103638, "rewards/rejected": 1.9834336042404175, "step": 8486 }, { "epoch": 1.38, "learning_rate": 4.814732612709556e-07, "logits/chosen": -0.6039565801620483, "logits/rejected": -0.5210880637168884, "logps/chosen": -65.66239929199219, "logps/rejected": -42.49037170410156, "loss": 0.1514, "rewards/accuracies": 1.0, "rewards/chosen": 2.421917676925659, "rewards/margins": 1.0578986406326294, "rewards/rejected": 1.3640190362930298, "step": 8487 }, { "epoch": 1.38, "learning_rate": 4.813419267689044e-07, "logits/chosen": -0.6599854826927185, "logits/rejected": -0.6898582577705383, "logps/chosen": -37.98123550415039, "logps/rejected": -114.5162124633789, "loss": 0.6058, "rewards/accuracies": 0.0, "rewards/chosen": 1.2274048328399658, "rewards/margins": -0.6635520458221436, "rewards/rejected": 1.8909568786621094, "step": 8488 }, { "epoch": 1.38, "learning_rate": 4.812105935559495e-07, "logits/chosen": -0.9121999144554138, "logits/rejected": -0.6180312633514404, "logps/chosen": -72.19599151611328, "logps/rejected": -233.416259765625, "loss": 2.4092, "rewards/accuracies": 0.0, "rewards/chosen": 0.7382980585098267, "rewards/margins": -4.267951965332031, "rewards/rejected": 5.006249904632568, "step": 8489 }, { "epoch": 1.38, "learning_rate": 4.810792616411649e-07, "logits/chosen": -0.9842641949653625, "logits/rejected": -0.8592056632041931, "logps/chosen": -82.15162658691406, "logps/rejected": -61.556697845458984, "loss": 0.4173, "rewards/accuracies": 0.0, "rewards/chosen": 1.313496470451355, "rewards/margins": -0.031099319458007812, "rewards/rejected": 1.3445957899093628, "step": 8490 }, { "epoch": 1.38, "learning_rate": 4.809479310336241e-07, "logits/chosen": -0.4978746175765991, "logits/rejected": -0.4774678349494934, "logps/chosen": -30.484642028808594, "logps/rejected": -22.155567169189453, "loss": 1.177, "rewards/accuracies": 1.0, "rewards/chosen": 0.3446899354457855, "rewards/margins": 0.322771817445755, "rewards/rejected": 0.021918106824159622, "step": 8491 }, { "epoch": 1.38, "learning_rate": 4.808166017424011e-07, "logits/chosen": -0.6578925848007202, "logits/rejected": -0.6731062531471252, "logps/chosen": -66.01937103271484, "logps/rejected": -70.29713439941406, "loss": 1.4162, "rewards/accuracies": 0.0, "rewards/chosen": 0.7681236267089844, "rewards/margins": -0.7117255926132202, "rewards/rejected": 1.4798492193222046, "step": 8492 }, { "epoch": 1.38, "learning_rate": 4.806852737765694e-07, "logits/chosen": -0.8239175081253052, "logits/rejected": -0.8086088299751282, "logps/chosen": -81.04377746582031, "logps/rejected": -113.71640014648438, "loss": 0.3971, "rewards/accuracies": 0.0, "rewards/chosen": 1.9158066511154175, "rewards/margins": -0.17772209644317627, "rewards/rejected": 2.0935287475585938, "step": 8493 }, { "epoch": 1.38, "learning_rate": 4.805539471452025e-07, "logits/chosen": -0.6394258141517639, "logits/rejected": -0.5900008082389832, "logps/chosen": -91.01272583007812, "logps/rejected": -89.32276916503906, "loss": 0.4088, "rewards/accuracies": 0.0, "rewards/chosen": 2.1261353492736816, "rewards/margins": -0.11132502555847168, "rewards/rejected": 2.2374603748321533, "step": 8494 }, { "epoch": 1.38, "learning_rate": 4.804226218573739e-07, "logits/chosen": -0.44661957025527954, "logits/rejected": -0.44661957025527954, "logps/chosen": -45.374752044677734, "logps/rejected": -45.374752044677734, "loss": 0.3478, "rewards/accuracies": 0.0, "rewards/chosen": 1.753665566444397, "rewards/margins": 0.0, "rewards/rejected": 1.753665566444397, "step": 8495 }, { "epoch": 1.38, "learning_rate": 4.802912979221569e-07, "logits/chosen": -0.6957094669342041, "logits/rejected": -0.7206432819366455, "logps/chosen": -49.463478088378906, "logps/rejected": -50.648590087890625, "loss": 0.9233, "rewards/accuracies": 0.0, "rewards/chosen": 1.5730125904083252, "rewards/margins": -0.7743422985076904, "rewards/rejected": 2.3473548889160156, "step": 8496 }, { "epoch": 1.38, "learning_rate": 4.801599753486247e-07, "logits/chosen": -1.245293140411377, "logits/rejected": -1.2568098306655884, "logps/chosen": -10.329446792602539, "logps/rejected": -21.877029418945312, "loss": 0.3833, "rewards/accuracies": 1.0, "rewards/chosen": 0.21728897094726562, "rewards/margins": 0.058226391673088074, "rewards/rejected": 0.15906257927417755, "step": 8497 }, { "epoch": 1.38, "learning_rate": 4.800286541458504e-07, "logits/chosen": -0.632928729057312, "logits/rejected": -0.6591472029685974, "logps/chosen": -138.35232543945312, "logps/rejected": -78.17478942871094, "loss": 1.6333, "rewards/accuracies": 0.0, "rewards/chosen": 0.7968826293945312, "rewards/margins": -1.863050937652588, "rewards/rejected": 2.659933567047119, "step": 8498 }, { "epoch": 1.38, "learning_rate": 4.798973343229073e-07, "logits/chosen": -0.7319429516792297, "logits/rejected": -0.6580016613006592, "logps/chosen": -86.76515197753906, "logps/rejected": -42.43159484863281, "loss": 1.4567, "rewards/accuracies": 1.0, "rewards/chosen": 2.420552968978882, "rewards/margins": 2.093207597732544, "rewards/rejected": 0.3273452818393707, "step": 8499 }, { "epoch": 1.38, "learning_rate": 4.79766015888868e-07, "logits/chosen": -0.4987662732601166, "logits/rejected": -0.5292662978172302, "logps/chosen": -38.77262496948242, "logps/rejected": -53.68519973754883, "loss": 1.0278, "rewards/accuracies": 0.0, "rewards/chosen": 1.433256983757019, "rewards/margins": -1.291124701499939, "rewards/rejected": 2.724381685256958, "step": 8500 }, { "epoch": 1.38, "learning_rate": 4.796346988528057e-07, "logits/chosen": -0.7019062042236328, "logits/rejected": -0.6588594913482666, "logps/chosen": -87.27885437011719, "logps/rejected": -100.17654418945312, "loss": 0.5334, "rewards/accuracies": 1.0, "rewards/chosen": 1.8523025512695312, "rewards/margins": 0.21918177604675293, "rewards/rejected": 1.6331207752227783, "step": 8501 }, { "epoch": 1.38, "learning_rate": 4.795033832237929e-07, "logits/chosen": -0.6569821238517761, "logits/rejected": -0.7141916751861572, "logps/chosen": -62.46758270263672, "logps/rejected": -184.80685424804688, "loss": 2.182, "rewards/accuracies": 0.0, "rewards/chosen": 3.0133919715881348, "rewards/margins": -1.4293999671936035, "rewards/rejected": 4.442791938781738, "step": 8502 }, { "epoch": 1.38, "learning_rate": 4.793720690109024e-07, "logits/chosen": -0.9603238105773926, "logits/rejected": -0.9603238105773926, "logps/chosen": -46.088836669921875, "logps/rejected": -46.088836669921875, "loss": 0.5576, "rewards/accuracies": 0.0, "rewards/chosen": 1.8141602277755737, "rewards/margins": 0.0, "rewards/rejected": 1.8141602277755737, "step": 8503 }, { "epoch": 1.38, "learning_rate": 4.792407562232068e-07, "logits/chosen": -0.6610056161880493, "logits/rejected": -0.48576149344444275, "logps/chosen": -178.13250732421875, "logps/rejected": -47.49623107910156, "loss": 1.5115, "rewards/accuracies": 1.0, "rewards/chosen": 1.0319550037384033, "rewards/margins": 1.017285943031311, "rewards/rejected": 0.01466903742402792, "step": 8504 }, { "epoch": 1.38, "learning_rate": 4.791094448697785e-07, "logits/chosen": -0.8256753087043762, "logits/rejected": -0.6318146586418152, "logps/chosen": -110.61878967285156, "logps/rejected": -94.92926025390625, "loss": 1.1585, "rewards/accuracies": 1.0, "rewards/chosen": 6.885807991027832, "rewards/margins": 1.1489825248718262, "rewards/rejected": 5.736825466156006, "step": 8505 }, { "epoch": 1.38, "learning_rate": 4.789781349596899e-07, "logits/chosen": -0.5203866362571716, "logits/rejected": -0.550917387008667, "logps/chosen": -52.569252014160156, "logps/rejected": -32.28857421875, "loss": 0.6329, "rewards/accuracies": 1.0, "rewards/chosen": 0.9743690490722656, "rewards/margins": 0.25744324922561646, "rewards/rejected": 0.7169257998466492, "step": 8506 }, { "epoch": 1.38, "learning_rate": 4.788468265020135e-07, "logits/chosen": -0.9635257124900818, "logits/rejected": -1.090219259262085, "logps/chosen": -89.04582977294922, "logps/rejected": -106.73921203613281, "loss": 1.5349, "rewards/accuracies": 0.0, "rewards/chosen": 3.2542672157287598, "rewards/margins": -1.9307336807250977, "rewards/rejected": 5.185000896453857, "step": 8507 }, { "epoch": 1.38, "learning_rate": 4.787155195058209e-07, "logits/chosen": -0.5599223971366882, "logits/rejected": 0.3035862445831299, "logps/chosen": -62.1408805847168, "logps/rejected": -36.30604553222656, "loss": 2.0233, "rewards/accuracies": 1.0, "rewards/chosen": 1.9751697778701782, "rewards/margins": 1.4738876819610596, "rewards/rejected": 0.5012821555137634, "step": 8508 }, { "epoch": 1.38, "learning_rate": 4.785842139801847e-07, "logits/chosen": -0.7184725999832153, "logits/rejected": -0.633200466632843, "logps/chosen": -131.04798889160156, "logps/rejected": -56.40388488769531, "loss": 0.6853, "rewards/accuracies": 0.0, "rewards/chosen": 0.872528076171875, "rewards/margins": -0.26508939266204834, "rewards/rejected": 1.1376174688339233, "step": 8509 }, { "epoch": 1.38, "learning_rate": 4.784529099341766e-07, "logits/chosen": -0.6699808239936829, "logits/rejected": -0.4173903465270996, "logps/chosen": -50.15315246582031, "logps/rejected": -53.41545867919922, "loss": 1.1765, "rewards/accuracies": 0.0, "rewards/chosen": 1.1332149505615234, "rewards/margins": -2.1090047359466553, "rewards/rejected": 3.2422196865081787, "step": 8510 }, { "epoch": 1.38, "learning_rate": 4.783216073768684e-07, "logits/chosen": -0.7460910081863403, "logits/rejected": -0.7331048846244812, "logps/chosen": -65.61093139648438, "logps/rejected": -96.22591400146484, "loss": 0.8496, "rewards/accuracies": 0.0, "rewards/chosen": 0.74285888671875, "rewards/margins": -0.5758720636367798, "rewards/rejected": 1.3187309503555298, "step": 8511 }, { "epoch": 1.38, "learning_rate": 4.78190306317332e-07, "logits/chosen": -0.04833924025297165, "logits/rejected": -0.04833924025297165, "logps/chosen": -1.3019893169403076, "logps/rejected": -1.3019893169403076, "loss": 1.0507, "rewards/accuracies": 0.0, "rewards/chosen": 0.336162269115448, "rewards/margins": 0.0, "rewards/rejected": 0.336162269115448, "step": 8512 }, { "epoch": 1.38, "learning_rate": 4.780590067646393e-07, "logits/chosen": -0.3407282829284668, "logits/rejected": -0.37727099657058716, "logps/chosen": -45.64651870727539, "logps/rejected": -87.94888305664062, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": 0.8195461630821228, "rewards/margins": 0.7629414200782776, "rewards/rejected": 0.056604769080877304, "step": 8513 }, { "epoch": 1.38, "learning_rate": 4.779277087278614e-07, "logits/chosen": -0.8913140296936035, "logits/rejected": -0.8913140296936035, "logps/chosen": -44.8364372253418, "logps/rejected": -44.8364372253418, "loss": 0.8892, "rewards/accuracies": 0.0, "rewards/chosen": 1.6786552667617798, "rewards/margins": 0.0, "rewards/rejected": 1.6786552667617798, "step": 8514 }, { "epoch": 1.38, "learning_rate": 4.777964122160702e-07, "logits/chosen": -1.2470557689666748, "logits/rejected": -1.2491183280944824, "logps/chosen": -51.34581756591797, "logps/rejected": -39.09368133544922, "loss": 0.1138, "rewards/accuracies": 1.0, "rewards/chosen": 1.9237464666366577, "rewards/margins": 1.5692870616912842, "rewards/rejected": 0.35445937514305115, "step": 8515 }, { "epoch": 1.38, "learning_rate": 4.776651172383366e-07, "logits/chosen": -0.8358530402183533, "logits/rejected": -0.8059438467025757, "logps/chosen": -84.26864624023438, "logps/rejected": -77.5019760131836, "loss": 0.1847, "rewards/accuracies": 1.0, "rewards/chosen": 3.8317222595214844, "rewards/margins": 0.8217215538024902, "rewards/rejected": 3.010000705718994, "step": 8516 }, { "epoch": 1.38, "learning_rate": 4.775338238037322e-07, "logits/chosen": -0.48991861939430237, "logits/rejected": -0.4643688201904297, "logps/chosen": -72.22601318359375, "logps/rejected": -68.6256103515625, "loss": 1.0104, "rewards/accuracies": 1.0, "rewards/chosen": 2.058339834213257, "rewards/margins": 0.9545754194259644, "rewards/rejected": 1.1037644147872925, "step": 8517 }, { "epoch": 1.38, "learning_rate": 4.77402531921328e-07, "logits/chosen": -0.7611066102981567, "logits/rejected": -0.5463928580284119, "logps/chosen": -109.6688461303711, "logps/rejected": -74.44963073730469, "loss": 2.1095, "rewards/accuracies": 1.0, "rewards/chosen": 4.063086986541748, "rewards/margins": 2.336003303527832, "rewards/rejected": 1.7270835638046265, "step": 8518 }, { "epoch": 1.38, "learning_rate": 4.77271241600195e-07, "logits/chosen": -0.38937491178512573, "logits/rejected": -0.3644917905330658, "logps/chosen": -91.49114990234375, "logps/rejected": -56.84729766845703, "loss": 1.3266, "rewards/accuracies": 1.0, "rewards/chosen": 2.182563066482544, "rewards/margins": 0.07217550277709961, "rewards/rejected": 2.1103875637054443, "step": 8519 }, { "epoch": 1.38, "learning_rate": 4.771399528494041e-07, "logits/chosen": -0.7727078795433044, "logits/rejected": -0.6535143256187439, "logps/chosen": -122.21368408203125, "logps/rejected": -86.5631103515625, "loss": 2.2534, "rewards/accuracies": 0.0, "rewards/chosen": 1.3459793329238892, "rewards/margins": -1.9216643571853638, "rewards/rejected": 3.267643690109253, "step": 8520 }, { "epoch": 1.38, "learning_rate": 4.770086656780262e-07, "logits/chosen": -0.6734839081764221, "logits/rejected": -0.6832467317581177, "logps/chosen": -79.82239532470703, "logps/rejected": -64.02159881591797, "loss": 0.3867, "rewards/accuracies": 1.0, "rewards/chosen": 1.6055786609649658, "rewards/margins": 0.021398186683654785, "rewards/rejected": 1.584180474281311, "step": 8521 }, { "epoch": 1.38, "learning_rate": 4.768773800951319e-07, "logits/chosen": -0.7410092949867249, "logits/rejected": -0.6465479731559753, "logps/chosen": -66.98597717285156, "logps/rejected": -41.89530944824219, "loss": 0.9199, "rewards/accuracies": 0.0, "rewards/chosen": 0.9414810538291931, "rewards/margins": -0.40672487020492554, "rewards/rejected": 1.3482059240341187, "step": 8522 }, { "epoch": 1.38, "learning_rate": 4.7674609610979194e-07, "logits/chosen": -0.7924015522003174, "logits/rejected": -0.7326553463935852, "logps/chosen": -72.68019104003906, "logps/rejected": -66.57705688476562, "loss": 0.5092, "rewards/accuracies": 0.0, "rewards/chosen": 1.7059906721115112, "rewards/margins": -0.2989562749862671, "rewards/rejected": 2.0049469470977783, "step": 8523 }, { "epoch": 1.38, "learning_rate": 4.766148137310767e-07, "logits/chosen": -0.4937322735786438, "logits/rejected": -0.48961979150772095, "logps/chosen": -2.7704954147338867, "logps/rejected": -2.0026354789733887, "loss": 0.7613, "rewards/accuracies": 0.0, "rewards/chosen": 0.21920756995677948, "rewards/margins": -0.12723831832408905, "rewards/rejected": 0.34644588828086853, "step": 8524 }, { "epoch": 1.38, "learning_rate": 4.764835329680565e-07, "logits/chosen": -0.9366745948791504, "logits/rejected": -1.0057512521743774, "logps/chosen": -109.96660614013672, "logps/rejected": -110.50279235839844, "loss": 1.348, "rewards/accuracies": 0.0, "rewards/chosen": 4.0137248039245605, "rewards/margins": -2.6112828254699707, "rewards/rejected": 6.625007629394531, "step": 8525 }, { "epoch": 1.38, "learning_rate": 4.763522538298017e-07, "logits/chosen": -0.6003693342208862, "logits/rejected": -0.5892797708511353, "logps/chosen": -56.939273834228516, "logps/rejected": -4.27296257019043, "loss": 0.5173, "rewards/accuracies": 0.0, "rewards/chosen": 0.014025116339325905, "rewards/margins": -0.3221149444580078, "rewards/rejected": 0.33614006638526917, "step": 8526 }, { "epoch": 1.38, "learning_rate": 4.7622097632538237e-07, "logits/chosen": -0.7858319282531738, "logits/rejected": -0.7027156352996826, "logps/chosen": -89.23103332519531, "logps/rejected": -49.3621826171875, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 4.070286750793457, "rewards/margins": 1.4478113651275635, "rewards/rejected": 2.6224753856658936, "step": 8527 }, { "epoch": 1.38, "learning_rate": 4.760897004638686e-07, "logits/chosen": -0.4896342158317566, "logits/rejected": -0.47362270951271057, "logps/chosen": -2.9954655170440674, "logps/rejected": -19.867034912109375, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": 0.3578265607357025, "rewards/margins": 0.1960643231868744, "rewards/rejected": 0.16176223754882812, "step": 8528 }, { "epoch": 1.38, "learning_rate": 4.759584262543303e-07, "logits/chosen": -0.859565019607544, "logits/rejected": -0.8688856959342957, "logps/chosen": -109.41256713867188, "logps/rejected": -80.02555084228516, "loss": 0.3844, "rewards/accuracies": 1.0, "rewards/chosen": 2.837939500808716, "rewards/margins": 0.8429062366485596, "rewards/rejected": 1.9950332641601562, "step": 8529 }, { "epoch": 1.38, "learning_rate": 4.758271537058372e-07, "logits/chosen": -0.5601840019226074, "logits/rejected": -0.4830480217933655, "logps/chosen": -73.1708984375, "logps/rejected": -66.42176818847656, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 1.9968292713165283, "rewards/margins": 0.23403394222259521, "rewards/rejected": 1.762795329093933, "step": 8530 }, { "epoch": 1.38, "learning_rate": 4.7569588282745907e-07, "logits/chosen": -0.783132016658783, "logits/rejected": -0.7123146057128906, "logps/chosen": -84.44290161132812, "logps/rejected": -28.19110679626465, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 1.5859466791152954, "rewards/margins": 1.4582945108413696, "rewards/rejected": 0.12765216827392578, "step": 8531 }, { "epoch": 1.38, "learning_rate": 4.755646136282656e-07, "logits/chosen": -0.43382444977760315, "logits/rejected": -0.428660124540329, "logps/chosen": -24.899059295654297, "logps/rejected": -5.289963722229004, "loss": 0.8403, "rewards/accuracies": 0.0, "rewards/chosen": 0.12853126227855682, "rewards/margins": -0.04305344820022583, "rewards/rejected": 0.17158471047878265, "step": 8532 }, { "epoch": 1.39, "learning_rate": 4.7543334611732595e-07, "logits/chosen": -0.6811762452125549, "logits/rejected": -0.5096579790115356, "logps/chosen": -162.08639526367188, "logps/rejected": -53.455718994140625, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": 4.565687656402588, "rewards/margins": 3.0884079933166504, "rewards/rejected": 1.4772796630859375, "step": 8533 }, { "epoch": 1.39, "learning_rate": 4.753020803037097e-07, "logits/chosen": -0.6830424666404724, "logits/rejected": -0.6861885786056519, "logps/chosen": -84.10104370117188, "logps/rejected": -100.67984008789062, "loss": 0.5029, "rewards/accuracies": 0.0, "rewards/chosen": 0.7045059204101562, "rewards/margins": -0.4445136785507202, "rewards/rejected": 1.1490195989608765, "step": 8534 }, { "epoch": 1.39, "learning_rate": 4.75170816196486e-07, "logits/chosen": -0.9842597246170044, "logits/rejected": -0.9426085948944092, "logps/chosen": -21.476900100708008, "logps/rejected": -66.59278106689453, "loss": 1.0541, "rewards/accuracies": 0.0, "rewards/chosen": 0.7288299798965454, "rewards/margins": -1.1201438903808594, "rewards/rejected": 1.8489738702774048, "step": 8535 }, { "epoch": 1.39, "learning_rate": 4.7503955380472393e-07, "logits/chosen": -0.5602636337280273, "logits/rejected": -0.5593732595443726, "logps/chosen": -4.811699867248535, "logps/rejected": -3.085298538208008, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 0.13107624650001526, "rewards/margins": -0.14905476570129395, "rewards/rejected": 0.2801310122013092, "step": 8536 }, { "epoch": 1.39, "learning_rate": 4.7490829313749257e-07, "logits/chosen": -0.6504940390586853, "logits/rejected": -0.5270717740058899, "logps/chosen": -110.22126770019531, "logps/rejected": -57.90080642700195, "loss": 0.4858, "rewards/accuracies": 1.0, "rewards/chosen": 0.730402410030365, "rewards/margins": 0.6666065454483032, "rewards/rejected": 0.06379585713148117, "step": 8537 }, { "epoch": 1.39, "learning_rate": 4.747770342038607e-07, "logits/chosen": -0.43319836258888245, "logits/rejected": -0.4319606423377991, "logps/chosen": -0.7613201141357422, "logps/rejected": -2.0498554706573486, "loss": 0.5673, "rewards/accuracies": 1.0, "rewards/chosen": 0.1738080084323883, "rewards/margins": 0.08306119590997696, "rewards/rejected": 0.09074681252241135, "step": 8538 }, { "epoch": 1.39, "learning_rate": 4.7464577701289715e-07, "logits/chosen": -0.5142067670822144, "logits/rejected": -0.5142067670822144, "logps/chosen": -52.63283157348633, "logps/rejected": -52.63283157348633, "loss": 0.8915, "rewards/accuracies": 0.0, "rewards/chosen": 0.770886242389679, "rewards/margins": 0.0, "rewards/rejected": 0.770886242389679, "step": 8539 }, { "epoch": 1.39, "learning_rate": 4.7451452157367045e-07, "logits/chosen": -0.9001120924949646, "logits/rejected": -0.8839075565338135, "logps/chosen": -84.9136962890625, "logps/rejected": -91.4520263671875, "loss": 0.3239, "rewards/accuracies": 1.0, "rewards/chosen": 3.0709474086761475, "rewards/margins": 0.30782103538513184, "rewards/rejected": 2.7631263732910156, "step": 8540 }, { "epoch": 1.39, "learning_rate": 4.7438326789524917e-07, "logits/chosen": -0.7397221922874451, "logits/rejected": -0.7808921337127686, "logps/chosen": -41.31768798828125, "logps/rejected": -106.02870178222656, "loss": 1.4, "rewards/accuracies": 0.0, "rewards/chosen": 3.35172438621521, "rewards/margins": -1.894160509109497, "rewards/rejected": 5.245884895324707, "step": 8541 }, { "epoch": 1.39, "learning_rate": 4.7425201598670175e-07, "logits/chosen": -1.1534533500671387, "logits/rejected": -1.0455039739608765, "logps/chosen": -144.94821166992188, "logps/rejected": -194.9381103515625, "loss": 0.7788, "rewards/accuracies": 0.0, "rewards/chosen": 5.061718940734863, "rewards/margins": -1.27543306350708, "rewards/rejected": 6.337152004241943, "step": 8542 }, { "epoch": 1.39, "learning_rate": 4.7412076585709637e-07, "logits/chosen": -0.8106207847595215, "logits/rejected": -0.781087338924408, "logps/chosen": -83.18472290039062, "logps/rejected": -99.16632080078125, "loss": 0.4199, "rewards/accuracies": 0.0, "rewards/chosen": 3.644413709640503, "rewards/margins": -0.01273202896118164, "rewards/rejected": 3.6571457386016846, "step": 8543 }, { "epoch": 1.39, "learning_rate": 4.7398951751550115e-07, "logits/chosen": -0.8430332541465759, "logits/rejected": -0.8909189105033875, "logps/chosen": -49.33338928222656, "logps/rejected": -120.0387191772461, "loss": 0.561, "rewards/accuracies": 1.0, "rewards/chosen": 1.7141770124435425, "rewards/margins": 0.07147598266601562, "rewards/rejected": 1.6427010297775269, "step": 8544 }, { "epoch": 1.39, "learning_rate": 4.738582709709842e-07, "logits/chosen": -0.7347099184989929, "logits/rejected": -0.5211588740348816, "logps/chosen": -62.6050910949707, "logps/rejected": -26.813255310058594, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": 1.853213906288147, "rewards/margins": 1.6015678644180298, "rewards/rejected": 0.2516460418701172, "step": 8545 }, { "epoch": 1.39, "learning_rate": 4.7372702623261336e-07, "logits/chosen": -0.6747910976409912, "logits/rejected": -0.6747910976409912, "logps/chosen": -24.11268424987793, "logps/rejected": -24.11268424987793, "loss": 0.6594, "rewards/accuracies": 0.0, "rewards/chosen": 0.09482784569263458, "rewards/margins": 0.0, "rewards/rejected": 0.09482784569263458, "step": 8546 }, { "epoch": 1.39, "learning_rate": 4.7359578330945624e-07, "logits/chosen": -0.7418293356895447, "logits/rejected": -0.648098349571228, "logps/chosen": -68.37377166748047, "logps/rejected": -7.683553695678711, "loss": 0.6427, "rewards/accuracies": 1.0, "rewards/chosen": 0.846539318561554, "rewards/margins": 0.36679887771606445, "rewards/rejected": 0.4797404408454895, "step": 8547 }, { "epoch": 1.39, "learning_rate": 4.734645422105809e-07, "logits/chosen": -0.7399862408638, "logits/rejected": -0.64573734998703, "logps/chosen": -77.48407745361328, "logps/rejected": -35.2054557800293, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 3.5365235805511475, "rewards/margins": 2.2027435302734375, "rewards/rejected": 1.3337799310684204, "step": 8548 }, { "epoch": 1.39, "learning_rate": 4.7333330294505454e-07, "logits/chosen": -0.14794813096523285, "logits/rejected": -0.17983108758926392, "logps/chosen": -32.23139953613281, "logps/rejected": -56.84172821044922, "loss": 0.5463, "rewards/accuracies": 0.0, "rewards/chosen": 2.1470611095428467, "rewards/margins": -0.2347252368927002, "rewards/rejected": 2.381786346435547, "step": 8549 }, { "epoch": 1.39, "learning_rate": 4.7320206552194456e-07, "logits/chosen": -0.9471992254257202, "logits/rejected": -0.9156774282455444, "logps/chosen": -122.14661407470703, "logps/rejected": -73.09719848632812, "loss": 0.9234, "rewards/accuracies": 0.0, "rewards/chosen": 0.7991844415664673, "rewards/margins": -1.3231269121170044, "rewards/rejected": 2.1223113536834717, "step": 8550 }, { "epoch": 1.39, "learning_rate": 4.730708299503184e-07, "logits/chosen": -0.6352420449256897, "logits/rejected": -0.6869844794273376, "logps/chosen": -56.700836181640625, "logps/rejected": -187.3077850341797, "loss": 0.9728, "rewards/accuracies": 0.0, "rewards/chosen": 0.8370750546455383, "rewards/margins": -0.9504128098487854, "rewards/rejected": 1.7874878644943237, "step": 8551 }, { "epoch": 1.39, "learning_rate": 4.72939596239243e-07, "logits/chosen": -0.851905345916748, "logits/rejected": -0.8537870645523071, "logps/chosen": -80.43060302734375, "logps/rejected": -74.19300842285156, "loss": 1.2012, "rewards/accuracies": 0.0, "rewards/chosen": 1.5316978693008423, "rewards/margins": -1.2896095514297485, "rewards/rejected": 2.821307420730591, "step": 8552 }, { "epoch": 1.39, "learning_rate": 4.7280836439778547e-07, "logits/chosen": -0.27599668502807617, "logits/rejected": -0.2592654526233673, "logps/chosen": -86.80352020263672, "logps/rejected": -94.68470764160156, "loss": 0.577, "rewards/accuracies": 1.0, "rewards/chosen": 1.16374671459198, "rewards/margins": 0.22344672679901123, "rewards/rejected": 0.9402999877929688, "step": 8553 }, { "epoch": 1.39, "learning_rate": 4.7267713443501267e-07, "logits/chosen": -0.31468501687049866, "logits/rejected": -0.31468501687049866, "logps/chosen": -108.320068359375, "logps/rejected": -108.320068359375, "loss": 1.0899, "rewards/accuracies": 0.0, "rewards/chosen": 1.412920355796814, "rewards/margins": 0.0, "rewards/rejected": 1.412920355796814, "step": 8554 }, { "epoch": 1.39, "learning_rate": 4.725459063599914e-07, "logits/chosen": -1.0011460781097412, "logits/rejected": -0.9561994671821594, "logps/chosen": -103.98626708984375, "logps/rejected": -24.961380004882812, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 2.295905351638794, "rewards/margins": 1.9740509986877441, "rewards/rejected": 0.3218544125556946, "step": 8555 }, { "epoch": 1.39, "learning_rate": 4.724146801817882e-07, "logits/chosen": -0.7799879312515259, "logits/rejected": -0.7879527807235718, "logps/chosen": -76.24066162109375, "logps/rejected": -62.348934173583984, "loss": 0.3577, "rewards/accuracies": 0.0, "rewards/chosen": 1.3705796003341675, "rewards/margins": -0.03704559803009033, "rewards/rejected": 1.4076251983642578, "step": 8556 }, { "epoch": 1.39, "learning_rate": 4.722834559094696e-07, "logits/chosen": -0.6990416646003723, "logits/rejected": -0.7299131155014038, "logps/chosen": -81.70993041992188, "logps/rejected": -92.40544128417969, "loss": 0.2539, "rewards/accuracies": 1.0, "rewards/chosen": 1.9101943969726562, "rewards/margins": 0.5645965337753296, "rewards/rejected": 1.3455978631973267, "step": 8557 }, { "epoch": 1.39, "learning_rate": 4.721522335521019e-07, "logits/chosen": -0.4902521073818207, "logits/rejected": -0.4405151307582855, "logps/chosen": -114.13160705566406, "logps/rejected": -49.916133880615234, "loss": 0.2471, "rewards/accuracies": 1.0, "rewards/chosen": 2.8175065517425537, "rewards/margins": 1.0146268606185913, "rewards/rejected": 1.8028796911239624, "step": 8558 }, { "epoch": 1.39, "learning_rate": 4.7202101311875136e-07, "logits/chosen": -0.4291686415672302, "logits/rejected": -0.4291686415672302, "logps/chosen": -60.700096130371094, "logps/rejected": -60.700096130371094, "loss": 0.5043, "rewards/accuracies": 0.0, "rewards/chosen": 1.1678017377853394, "rewards/margins": 0.0, "rewards/rejected": 1.1678017377853394, "step": 8559 }, { "epoch": 1.39, "learning_rate": 4.718897946184841e-07, "logits/chosen": -0.7518221735954285, "logits/rejected": -0.7518221735954285, "logps/chosen": -110.87042236328125, "logps/rejected": -110.87042236328125, "loss": 0.3979, "rewards/accuracies": 0.0, "rewards/chosen": 2.2766754627227783, "rewards/margins": 0.0, "rewards/rejected": 2.2766754627227783, "step": 8560 }, { "epoch": 1.39, "learning_rate": 4.71758578060366e-07, "logits/chosen": -0.4478347599506378, "logits/rejected": -0.4455876052379608, "logps/chosen": -80.57070922851562, "logps/rejected": -55.17512893676758, "loss": 1.5197, "rewards/accuracies": 0.0, "rewards/chosen": 1.0435447692871094, "rewards/margins": -0.9352512359619141, "rewards/rejected": 1.9787960052490234, "step": 8561 }, { "epoch": 1.39, "learning_rate": 4.7162736345346296e-07, "logits/chosen": -0.7872878909111023, "logits/rejected": -0.823304295539856, "logps/chosen": -118.52816009521484, "logps/rejected": -72.3839111328125, "loss": 1.4444, "rewards/accuracies": 0.0, "rewards/chosen": 0.539904773235321, "rewards/margins": -1.9792885780334473, "rewards/rejected": 2.519193410873413, "step": 8562 }, { "epoch": 1.39, "learning_rate": 4.7149615080684067e-07, "logits/chosen": -0.4228256940841675, "logits/rejected": -0.41890421509742737, "logps/chosen": -85.0333251953125, "logps/rejected": -79.25752258300781, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 0.9173492789268494, "rewards/margins": 0.06747132539749146, "rewards/rejected": 0.8498779535293579, "step": 8563 }, { "epoch": 1.39, "learning_rate": 4.713649401295647e-07, "logits/chosen": -0.7198268175125122, "logits/rejected": -0.6137946844100952, "logps/chosen": -58.85296630859375, "logps/rejected": -23.51369285583496, "loss": 0.5593, "rewards/accuracies": 0.0, "rewards/chosen": 0.4022460877895355, "rewards/margins": -0.044853806495666504, "rewards/rejected": 0.447099894285202, "step": 8564 }, { "epoch": 1.39, "learning_rate": 4.712337314307003e-07, "logits/chosen": -0.6880768537521362, "logits/rejected": -0.6196362376213074, "logps/chosen": -75.91673278808594, "logps/rejected": -63.44294357299805, "loss": 0.6183, "rewards/accuracies": 1.0, "rewards/chosen": 2.1135780811309814, "rewards/margins": 0.32583510875701904, "rewards/rejected": 1.7877429723739624, "step": 8565 }, { "epoch": 1.39, "learning_rate": 4.7110252471931295e-07, "logits/chosen": -0.6614325642585754, "logits/rejected": -0.5862801671028137, "logps/chosen": -82.4361343383789, "logps/rejected": -86.41943359375, "loss": 0.3292, "rewards/accuracies": 1.0, "rewards/chosen": 1.8967918157577515, "rewards/margins": 0.8045463562011719, "rewards/rejected": 1.0922454595565796, "step": 8566 }, { "epoch": 1.39, "learning_rate": 4.709713200044677e-07, "logits/chosen": -0.9464095234870911, "logits/rejected": -0.9865285158157349, "logps/chosen": -46.76033401489258, "logps/rejected": -76.00038146972656, "loss": 2.6304, "rewards/accuracies": 0.0, "rewards/chosen": 2.0289013385772705, "rewards/margins": -0.28658032417297363, "rewards/rejected": 2.315481662750244, "step": 8567 }, { "epoch": 1.39, "learning_rate": 4.708401172952295e-07, "logits/chosen": -0.5152138471603394, "logits/rejected": -0.5379468202590942, "logps/chosen": -1.3843684196472168, "logps/rejected": -28.99982261657715, "loss": 0.3239, "rewards/accuracies": 1.0, "rewards/chosen": 0.4533160626888275, "rewards/margins": 0.23720525205135345, "rewards/rejected": 0.21611081063747406, "step": 8568 }, { "epoch": 1.39, "learning_rate": 4.7070891660066335e-07, "logits/chosen": -0.70173579454422, "logits/rejected": -0.7318000793457031, "logps/chosen": -71.84283447265625, "logps/rejected": -101.34080505371094, "loss": 0.4176, "rewards/accuracies": 0.0, "rewards/chosen": 5.346780300140381, "rewards/margins": -0.2324204444885254, "rewards/rejected": 5.579200744628906, "step": 8569 }, { "epoch": 1.39, "learning_rate": 4.705777179298339e-07, "logits/chosen": -0.5637801289558411, "logits/rejected": -0.5770009756088257, "logps/chosen": -75.35108947753906, "logps/rejected": -93.67735290527344, "loss": 0.9014, "rewards/accuracies": 0.0, "rewards/chosen": 0.8530693054199219, "rewards/margins": -1.1138511896133423, "rewards/rejected": 1.9669204950332642, "step": 8570 }, { "epoch": 1.39, "learning_rate": 4.704465212918058e-07, "logits/chosen": -0.9794066548347473, "logits/rejected": -0.961850643157959, "logps/chosen": -55.75932312011719, "logps/rejected": -60.06871032714844, "loss": 0.3229, "rewards/accuracies": 1.0, "rewards/chosen": 0.6249458193778992, "rewards/margins": 0.2612224519252777, "rewards/rejected": 0.36372336745262146, "step": 8571 }, { "epoch": 1.39, "learning_rate": 4.703153266956433e-07, "logits/chosen": -0.9311126470565796, "logits/rejected": -0.9430282711982727, "logps/chosen": -108.2921142578125, "logps/rejected": -49.922603607177734, "loss": 0.5265, "rewards/accuracies": 0.0, "rewards/chosen": 1.5694847106933594, "rewards/margins": -0.6204419136047363, "rewards/rejected": 2.1899266242980957, "step": 8572 }, { "epoch": 1.39, "learning_rate": 4.70184134150411e-07, "logits/chosen": -0.8714004158973694, "logits/rejected": -0.9273635745048523, "logps/chosen": -127.7305908203125, "logps/rejected": -154.93222045898438, "loss": 3.2776, "rewards/accuracies": 0.0, "rewards/chosen": 1.1977875232696533, "rewards/margins": -4.422918319702148, "rewards/rejected": 5.620706081390381, "step": 8573 }, { "epoch": 1.39, "learning_rate": 4.7005294366517284e-07, "logits/chosen": -0.9754708409309387, "logits/rejected": -0.9476694464683533, "logps/chosen": -95.07420349121094, "logps/rejected": -84.91497039794922, "loss": 0.9391, "rewards/accuracies": 0.0, "rewards/chosen": 0.34638139605522156, "rewards/margins": -1.701673984527588, "rewards/rejected": 2.048055410385132, "step": 8574 }, { "epoch": 1.39, "learning_rate": 4.6992175524899293e-07, "logits/chosen": -0.7685351371765137, "logits/rejected": -0.7715725898742676, "logps/chosen": -91.62525939941406, "logps/rejected": -92.70223236083984, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 3.967167615890503, "rewards/margins": 0.9115989208221436, "rewards/rejected": 3.0555686950683594, "step": 8575 }, { "epoch": 1.39, "learning_rate": 4.6979056891093504e-07, "logits/chosen": -0.1957986056804657, "logits/rejected": -0.19561205804347992, "logps/chosen": -3.224611759185791, "logps/rejected": -1.465317726135254, "loss": 0.4794, "rewards/accuracies": 0.0, "rewards/chosen": 0.04540841653943062, "rewards/margins": -0.3331339657306671, "rewards/rejected": 0.37854239344596863, "step": 8576 }, { "epoch": 1.39, "learning_rate": 4.696593846600631e-07, "logits/chosen": -0.5483316779136658, "logits/rejected": -0.5416105389595032, "logps/chosen": -65.31200408935547, "logps/rejected": -58.32283020019531, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": 1.896807074546814, "rewards/margins": 1.037116289138794, "rewards/rejected": 0.8596908450126648, "step": 8577 }, { "epoch": 1.39, "learning_rate": 4.695282025054406e-07, "logits/chosen": -0.45143595337867737, "logits/rejected": -0.5380746126174927, "logps/chosen": -19.985034942626953, "logps/rejected": -73.5602798461914, "loss": 1.0873, "rewards/accuracies": 0.0, "rewards/chosen": 0.7850807309150696, "rewards/margins": -2.0511436462402344, "rewards/rejected": 2.836224317550659, "step": 8578 }, { "epoch": 1.39, "learning_rate": 4.693970224561309e-07, "logits/chosen": -0.60783451795578, "logits/rejected": -0.5854081511497498, "logps/chosen": -84.8191146850586, "logps/rejected": -81.37303161621094, "loss": 0.7596, "rewards/accuracies": 1.0, "rewards/chosen": 1.6125015020370483, "rewards/margins": 0.09492409229278564, "rewards/rejected": 1.5175774097442627, "step": 8579 }, { "epoch": 1.39, "learning_rate": 4.692658445211974e-07, "logits/chosen": -0.5776355266571045, "logits/rejected": -0.5555447936058044, "logps/chosen": -137.77352905273438, "logps/rejected": -111.71495056152344, "loss": 1.675, "rewards/accuracies": 0.0, "rewards/chosen": 1.6620773077011108, "rewards/margins": -3.300869941711426, "rewards/rejected": 4.962947368621826, "step": 8580 }, { "epoch": 1.39, "learning_rate": 4.6913466870970324e-07, "logits/chosen": -0.9462521076202393, "logits/rejected": -0.8224667906761169, "logps/chosen": -172.5055389404297, "logps/rejected": -54.01887512207031, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 5.904466152191162, "rewards/margins": 4.1002888679504395, "rewards/rejected": 1.804177165031433, "step": 8581 }, { "epoch": 1.39, "learning_rate": 4.6900349503071146e-07, "logits/chosen": -0.6887701153755188, "logits/rejected": -0.5193389058113098, "logps/chosen": -158.43699645996094, "logps/rejected": -16.334964752197266, "loss": 0.822, "rewards/accuracies": 1.0, "rewards/chosen": 6.097607612609863, "rewards/margins": 5.730679035186768, "rewards/rejected": 0.36692848801612854, "step": 8582 }, { "epoch": 1.39, "learning_rate": 4.688723234932847e-07, "logits/chosen": -0.7134568691253662, "logits/rejected": -0.6774699687957764, "logps/chosen": -80.19493103027344, "logps/rejected": -73.23713684082031, "loss": 2.2798, "rewards/accuracies": 1.0, "rewards/chosen": 0.681683361530304, "rewards/margins": 0.25776368379592896, "rewards/rejected": 0.423919677734375, "step": 8583 }, { "epoch": 1.39, "learning_rate": 4.68741154106486e-07, "logits/chosen": -0.4751128852367401, "logits/rejected": -0.3724132776260376, "logps/chosen": -29.316295623779297, "logps/rejected": -45.64048767089844, "loss": 2.3589, "rewards/accuracies": 1.0, "rewards/chosen": 2.3264145851135254, "rewards/margins": 0.5655735731124878, "rewards/rejected": 1.7608410120010376, "step": 8584 }, { "epoch": 1.39, "learning_rate": 4.686099868793778e-07, "logits/chosen": -1.1119297742843628, "logits/rejected": -1.2220507860183716, "logps/chosen": -117.5038070678711, "logps/rejected": -97.28468322753906, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 5.862372875213623, "rewards/margins": 2.6247339248657227, "rewards/rejected": 3.2376389503479004, "step": 8585 }, { "epoch": 1.39, "learning_rate": 4.684788218210225e-07, "logits/chosen": -0.23208729922771454, "logits/rejected": -0.24436217546463013, "logps/chosen": -1.0916582345962524, "logps/rejected": -24.02396011352539, "loss": 0.5126, "rewards/accuracies": 1.0, "rewards/chosen": 0.37814170122146606, "rewards/margins": 0.23042823374271393, "rewards/rejected": 0.14771346747875214, "step": 8586 }, { "epoch": 1.39, "learning_rate": 4.6834765894048234e-07, "logits/chosen": -0.3500215411186218, "logits/rejected": -0.31404152512550354, "logps/chosen": -132.16355895996094, "logps/rejected": -85.85955810546875, "loss": 0.7646, "rewards/accuracies": 0.0, "rewards/chosen": 1.154107689857483, "rewards/margins": -0.5672485828399658, "rewards/rejected": 1.7213562726974487, "step": 8587 }, { "epoch": 1.39, "learning_rate": 4.682164982468194e-07, "logits/chosen": -0.6059593558311462, "logits/rejected": -0.6493228673934937, "logps/chosen": -76.82364654541016, "logps/rejected": -98.07874298095703, "loss": 2.3887, "rewards/accuracies": 0.0, "rewards/chosen": 1.0063056945800781, "rewards/margins": -4.086452007293701, "rewards/rejected": 5.092757701873779, "step": 8588 }, { "epoch": 1.39, "learning_rate": 4.6808533974909575e-07, "logits/chosen": -0.6773228049278259, "logits/rejected": -0.6012821197509766, "logps/chosen": -101.35562133789062, "logps/rejected": -98.93397521972656, "loss": 1.271, "rewards/accuracies": 1.0, "rewards/chosen": 6.441293239593506, "rewards/margins": 3.2418105602264404, "rewards/rejected": 3.1994826793670654, "step": 8589 }, { "epoch": 1.39, "learning_rate": 4.679541834563732e-07, "logits/chosen": -0.6200823187828064, "logits/rejected": -0.620614230632782, "logps/chosen": -1.834417700767517, "logps/rejected": -1.1377792358398438, "loss": 0.4603, "rewards/accuracies": 1.0, "rewards/chosen": 0.2662765085697174, "rewards/margins": 0.02252359688282013, "rewards/rejected": 0.24375291168689728, "step": 8590 }, { "epoch": 1.39, "learning_rate": 4.678230293777132e-07, "logits/chosen": -0.9115990996360779, "logits/rejected": -0.8548464179039001, "logps/chosen": -108.88105773925781, "logps/rejected": -43.535430908203125, "loss": 0.5923, "rewards/accuracies": 0.0, "rewards/chosen": 1.1067276000976562, "rewards/margins": -0.702717661857605, "rewards/rejected": 1.8094452619552612, "step": 8591 }, { "epoch": 1.39, "learning_rate": 4.6769187752217755e-07, "logits/chosen": -0.5408562421798706, "logits/rejected": -0.6072993874549866, "logps/chosen": -69.2171630859375, "logps/rejected": -64.35869598388672, "loss": 2.1098, "rewards/accuracies": 0.0, "rewards/chosen": 1.4936035871505737, "rewards/margins": -0.3556632995605469, "rewards/rejected": 1.8492668867111206, "step": 8592 }, { "epoch": 1.39, "learning_rate": 4.675607278988274e-07, "logits/chosen": -0.8513889312744141, "logits/rejected": -0.8626773357391357, "logps/chosen": -80.66543579101562, "logps/rejected": -63.93817901611328, "loss": 1.0596, "rewards/accuracies": 0.0, "rewards/chosen": 0.46104738116264343, "rewards/margins": -0.9140616655349731, "rewards/rejected": 1.375109076499939, "step": 8593 }, { "epoch": 1.39, "learning_rate": 4.67429580516724e-07, "logits/chosen": -0.7720667719841003, "logits/rejected": -0.7537500262260437, "logps/chosen": -94.10694122314453, "logps/rejected": -99.01502990722656, "loss": 1.3376, "rewards/accuracies": 0.0, "rewards/chosen": 1.0113266706466675, "rewards/margins": -2.2468061447143555, "rewards/rejected": 3.2581329345703125, "step": 8594 }, { "epoch": 1.4, "learning_rate": 4.672984353849284e-07, "logits/chosen": -0.6617624759674072, "logits/rejected": -0.6728402376174927, "logps/chosen": -64.87039184570312, "logps/rejected": -69.00791931152344, "loss": 0.5386, "rewards/accuracies": 0.0, "rewards/chosen": 2.8059990406036377, "rewards/margins": -0.07666325569152832, "rewards/rejected": 2.882662296295166, "step": 8595 }, { "epoch": 1.4, "learning_rate": 4.671672925125015e-07, "logits/chosen": -0.4919321537017822, "logits/rejected": -0.5104920268058777, "logps/chosen": -3.032433032989502, "logps/rejected": -36.79896545410156, "loss": 0.8747, "rewards/accuracies": 1.0, "rewards/chosen": 0.30326375365257263, "rewards/margins": 0.0496695339679718, "rewards/rejected": 0.25359421968460083, "step": 8596 }, { "epoch": 1.4, "learning_rate": 4.6703615190850403e-07, "logits/chosen": -0.6821116209030151, "logits/rejected": -0.7042855620384216, "logps/chosen": -26.545785903930664, "logps/rejected": -54.237205505371094, "loss": 0.7988, "rewards/accuracies": 0.0, "rewards/chosen": 1.0128469467163086, "rewards/margins": -0.3532625436782837, "rewards/rejected": 1.3661094903945923, "step": 8597 }, { "epoch": 1.4, "learning_rate": 4.6690501358199655e-07, "logits/chosen": -0.8698195219039917, "logits/rejected": -0.7508339285850525, "logps/chosen": -105.77747344970703, "logps/rejected": -245.4735870361328, "loss": 1.0648, "rewards/accuracies": 0.0, "rewards/chosen": 5.36066198348999, "rewards/margins": -1.5375452041625977, "rewards/rejected": 6.898207187652588, "step": 8598 }, { "epoch": 1.4, "learning_rate": 4.667738775420395e-07, "logits/chosen": -0.2803208529949188, "logits/rejected": -0.32656943798065186, "logps/chosen": -93.60609436035156, "logps/rejected": -59.67969512939453, "loss": 1.2317, "rewards/accuracies": 0.0, "rewards/chosen": 0.7677398920059204, "rewards/margins": -1.1434990167617798, "rewards/rejected": 1.9112389087677002, "step": 8599 }, { "epoch": 1.4, "learning_rate": 4.6664274379769307e-07, "logits/chosen": -0.41754499077796936, "logits/rejected": -0.48216134309768677, "logps/chosen": -68.89912414550781, "logps/rejected": -148.78958129882812, "loss": 0.818, "rewards/accuracies": 0.0, "rewards/chosen": 0.9847640991210938, "rewards/margins": -0.03379368782043457, "rewards/rejected": 1.0185577869415283, "step": 8600 }, { "epoch": 1.4, "learning_rate": 4.665116123580175e-07, "logits/chosen": -0.6109914183616638, "logits/rejected": -0.5713804364204407, "logps/chosen": -121.88809967041016, "logps/rejected": -111.25048828125, "loss": 0.1719, "rewards/accuracies": 1.0, "rewards/chosen": 4.39763879776001, "rewards/margins": 1.1709296703338623, "rewards/rejected": 3.2267091274261475, "step": 8601 }, { "epoch": 1.4, "learning_rate": 4.6638048323207255e-07, "logits/chosen": -0.5037308931350708, "logits/rejected": -0.513053297996521, "logps/chosen": -16.942209243774414, "logps/rejected": -3.260268449783325, "loss": 0.4852, "rewards/accuracies": 0.0, "rewards/chosen": 0.08710117638111115, "rewards/margins": -0.20217959582805634, "rewards/rejected": 0.2892807722091675, "step": 8602 }, { "epoch": 1.4, "learning_rate": 4.6624935642891814e-07, "logits/chosen": -0.9494530558586121, "logits/rejected": -0.985279381275177, "logps/chosen": -72.12991333007812, "logps/rejected": -168.00521850585938, "loss": 1.1003, "rewards/accuracies": 0.0, "rewards/chosen": 1.2097519636154175, "rewards/margins": -1.8582900762557983, "rewards/rejected": 3.068042039871216, "step": 8603 }, { "epoch": 1.4, "learning_rate": 4.6611823195761393e-07, "logits/chosen": -0.8855962753295898, "logits/rejected": -0.6687872409820557, "logps/chosen": -85.45028686523438, "logps/rejected": -32.62800979614258, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 2.6588592529296875, "rewards/margins": 2.3711018562316895, "rewards/rejected": 0.2877574861049652, "step": 8604 }, { "epoch": 1.4, "learning_rate": 4.6598710982721915e-07, "logits/chosen": -0.6996314525604248, "logits/rejected": -0.6816521883010864, "logps/chosen": -63.1562614440918, "logps/rejected": -62.27194595336914, "loss": 0.2954, "rewards/accuracies": 1.0, "rewards/chosen": 2.3454625606536865, "rewards/margins": 0.22708749771118164, "rewards/rejected": 2.118375062942505, "step": 8605 }, { "epoch": 1.4, "learning_rate": 4.6585599004679336e-07, "logits/chosen": -0.47152620553970337, "logits/rejected": -0.41318660974502563, "logps/chosen": -46.3686408996582, "logps/rejected": -86.17412567138672, "loss": 3.2651, "rewards/accuracies": 0.0, "rewards/chosen": 1.2632839679718018, "rewards/margins": -0.8766429424285889, "rewards/rejected": 2.1399269104003906, "step": 8606 }, { "epoch": 1.4, "learning_rate": 4.6572487262539556e-07, "logits/chosen": -0.7270999550819397, "logits/rejected": -0.6926918625831604, "logps/chosen": -66.07770538330078, "logps/rejected": -43.02879333496094, "loss": 0.3177, "rewards/accuracies": 1.0, "rewards/chosen": 1.8856887817382812, "rewards/margins": 0.7830146551132202, "rewards/rejected": 1.102674126625061, "step": 8607 }, { "epoch": 1.4, "learning_rate": 4.6559375757208473e-07, "logits/chosen": -0.46533453464508057, "logits/rejected": -0.4367184340953827, "logps/chosen": -67.62492370605469, "logps/rejected": -65.81057739257812, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 2.0521163940429688, "rewards/margins": 0.31045377254486084, "rewards/rejected": 1.741662621498108, "step": 8608 }, { "epoch": 1.4, "learning_rate": 4.6546264489591966e-07, "logits/chosen": -1.486659288406372, "logits/rejected": -1.5117119550704956, "logps/chosen": -52.8575439453125, "logps/rejected": -129.51527404785156, "loss": 0.9392, "rewards/accuracies": 0.0, "rewards/chosen": 0.7983142733573914, "rewards/margins": -1.6699001789093018, "rewards/rejected": 2.468214511871338, "step": 8609 }, { "epoch": 1.4, "learning_rate": 4.6533153460595906e-07, "logits/chosen": -0.4931293725967407, "logits/rejected": -0.4997117519378662, "logps/chosen": -59.012535095214844, "logps/rejected": -75.60826873779297, "loss": 0.8335, "rewards/accuracies": 1.0, "rewards/chosen": 3.0437209606170654, "rewards/margins": 0.9307975769042969, "rewards/rejected": 2.1129233837127686, "step": 8610 }, { "epoch": 1.4, "learning_rate": 4.652004267112614e-07, "logits/chosen": -0.583710789680481, "logits/rejected": -0.5547717809677124, "logps/chosen": -60.03300094604492, "logps/rejected": -34.20648193359375, "loss": 0.5971, "rewards/accuracies": 0.0, "rewards/chosen": 1.3818973302841187, "rewards/margins": -0.5709682703018188, "rewards/rejected": 1.9528656005859375, "step": 8611 }, { "epoch": 1.4, "learning_rate": 4.65069321220885e-07, "logits/chosen": -0.7577833533287048, "logits/rejected": -0.63913893699646, "logps/chosen": -145.89767456054688, "logps/rejected": -85.65437316894531, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 1.5211578607559204, "rewards/margins": 0.1842285394668579, "rewards/rejected": 1.3369293212890625, "step": 8612 }, { "epoch": 1.4, "learning_rate": 4.649382181438881e-07, "logits/chosen": -0.7752176523208618, "logits/rejected": -0.6566762328147888, "logps/chosen": -192.64366149902344, "logps/rejected": -159.21957397460938, "loss": 0.155, "rewards/accuracies": 1.0, "rewards/chosen": 5.0881028175354, "rewards/margins": 1.061173915863037, "rewards/rejected": 4.026928901672363, "step": 8613 }, { "epoch": 1.4, "learning_rate": 4.6480711748932846e-07, "logits/chosen": -0.664685845375061, "logits/rejected": -0.5683107972145081, "logps/chosen": -50.96599578857422, "logps/rejected": -84.41425323486328, "loss": 0.5909, "rewards/accuracies": 0.0, "rewards/chosen": 2.279578447341919, "rewards/margins": -0.35787510871887207, "rewards/rejected": 2.637453556060791, "step": 8614 }, { "epoch": 1.4, "learning_rate": 4.646760192662639e-07, "logits/chosen": -0.9235860705375671, "logits/rejected": -0.920324981212616, "logps/chosen": -120.9734115600586, "logps/rejected": -78.15091705322266, "loss": 0.4491, "rewards/accuracies": 1.0, "rewards/chosen": 2.258861541748047, "rewards/margins": 1.3474364280700684, "rewards/rejected": 0.9114250540733337, "step": 8615 }, { "epoch": 1.4, "learning_rate": 4.645449234837523e-07, "logits/chosen": -0.6129562854766846, "logits/rejected": -0.5091282725334167, "logps/chosen": -60.084320068359375, "logps/rejected": -31.697891235351562, "loss": 0.0329, "rewards/accuracies": 1.0, "rewards/chosen": 3.410714864730835, "rewards/margins": 3.300420045852661, "rewards/rejected": 0.11029472202062607, "step": 8616 }, { "epoch": 1.4, "learning_rate": 4.644138301508509e-07, "logits/chosen": -0.44567620754241943, "logits/rejected": -0.4061310291290283, "logps/chosen": -117.36019897460938, "logps/rejected": -63.1912956237793, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 2.041879415512085, "rewards/margins": -0.20244312286376953, "rewards/rejected": 2.2443225383758545, "step": 8617 }, { "epoch": 1.4, "learning_rate": 4.642827392766172e-07, "logits/chosen": -0.9238097667694092, "logits/rejected": -0.899876594543457, "logps/chosen": -88.61027526855469, "logps/rejected": -60.283843994140625, "loss": 0.5163, "rewards/accuracies": 0.0, "rewards/chosen": 2.7032837867736816, "rewards/margins": -0.481689453125, "rewards/rejected": 3.1849732398986816, "step": 8618 }, { "epoch": 1.4, "learning_rate": 4.6415165087010823e-07, "logits/chosen": -1.028563141822815, "logits/rejected": -0.9885804057121277, "logps/chosen": -239.46434020996094, "logps/rejected": -43.81596374511719, "loss": 0.0449, "rewards/accuracies": 1.0, "rewards/chosen": 5.815614223480225, "rewards/margins": 5.547797679901123, "rewards/rejected": 0.26781654357910156, "step": 8619 }, { "epoch": 1.4, "learning_rate": 4.6402056494038093e-07, "logits/chosen": -0.7381734848022461, "logits/rejected": -0.669671893119812, "logps/chosen": -155.2628936767578, "logps/rejected": -96.33678436279297, "loss": 0.1277, "rewards/accuracies": 1.0, "rewards/chosen": 4.422615051269531, "rewards/margins": 1.7591774463653564, "rewards/rejected": 2.663437604904175, "step": 8620 }, { "epoch": 1.4, "learning_rate": 4.638894814964922e-07, "logits/chosen": -0.19458609819412231, "logits/rejected": -0.19458609819412231, "logps/chosen": -54.29961013793945, "logps/rejected": -54.29961013793945, "loss": 1.8001, "rewards/accuracies": 0.0, "rewards/chosen": 1.0427913665771484, "rewards/margins": 0.0, "rewards/rejected": 1.0427913665771484, "step": 8621 }, { "epoch": 1.4, "learning_rate": 4.6375840054749863e-07, "logits/chosen": -0.7131072878837585, "logits/rejected": -0.7065241932868958, "logps/chosen": -54.75584411621094, "logps/rejected": -67.97474670410156, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 1.469702959060669, "rewards/margins": -1.0822257995605469, "rewards/rejected": 2.551928758621216, "step": 8622 }, { "epoch": 1.4, "learning_rate": 4.636273221024567e-07, "logits/chosen": -0.33257240056991577, "logits/rejected": -0.2887983024120331, "logps/chosen": -103.24767303466797, "logps/rejected": -121.54244995117188, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": 1.5619949102401733, "rewards/margins": 0.324809193611145, "rewards/rejected": 1.2371857166290283, "step": 8623 }, { "epoch": 1.4, "learning_rate": 4.6349624617042253e-07, "logits/chosen": -0.6284887790679932, "logits/rejected": -0.5482259392738342, "logps/chosen": -75.49311065673828, "logps/rejected": -53.979736328125, "loss": 0.5725, "rewards/accuracies": 0.0, "rewards/chosen": 1.3449805974960327, "rewards/margins": -0.6489235162734985, "rewards/rejected": 1.9939041137695312, "step": 8624 }, { "epoch": 1.4, "learning_rate": 4.633651727604524e-07, "logits/chosen": -0.8515663743019104, "logits/rejected": -0.8370183706283569, "logps/chosen": -71.11338806152344, "logps/rejected": -74.12097930908203, "loss": 0.579, "rewards/accuracies": 0.0, "rewards/chosen": 0.934100329875946, "rewards/margins": -0.5468025803565979, "rewards/rejected": 1.480902910232544, "step": 8625 }, { "epoch": 1.4, "learning_rate": 4.6323410188160227e-07, "logits/chosen": -0.3989785611629486, "logits/rejected": -0.4086683392524719, "logps/chosen": -60.001991271972656, "logps/rejected": -81.64889526367188, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 2.088542938232422, "rewards/margins": 0.4029731750488281, "rewards/rejected": 1.6855697631835938, "step": 8626 }, { "epoch": 1.4, "learning_rate": 4.6310303354292774e-07, "logits/chosen": -0.2376307249069214, "logits/rejected": -0.2376307249069214, "logps/chosen": -21.663463592529297, "logps/rejected": -21.663463592529297, "loss": 0.3633, "rewards/accuracies": 0.0, "rewards/chosen": 0.11848487704992294, "rewards/margins": 0.0, "rewards/rejected": 0.11848487704992294, "step": 8627 }, { "epoch": 1.4, "learning_rate": 4.629719677534845e-07, "logits/chosen": -0.6981971859931946, "logits/rejected": -0.7027345299720764, "logps/chosen": -61.79161071777344, "logps/rejected": -65.82164001464844, "loss": 1.2512, "rewards/accuracies": 0.0, "rewards/chosen": 2.024728536605835, "rewards/margins": -1.3779478073120117, "rewards/rejected": 3.4026763439178467, "step": 8628 }, { "epoch": 1.4, "learning_rate": 4.6284090452232793e-07, "logits/chosen": -0.8128045201301575, "logits/rejected": -1.0759958028793335, "logps/chosen": -47.29210662841797, "logps/rejected": -76.43351745605469, "loss": 2.4229, "rewards/accuracies": 0.0, "rewards/chosen": 2.5232155323028564, "rewards/margins": -4.313869476318359, "rewards/rejected": 6.837085247039795, "step": 8629 }, { "epoch": 1.4, "learning_rate": 4.627098438585132e-07, "logits/chosen": -0.4361734688282013, "logits/rejected": -0.4307877719402313, "logps/chosen": -33.96399688720703, "logps/rejected": -53.515682220458984, "loss": 1.2829, "rewards/accuracies": 0.0, "rewards/chosen": 1.2437946796417236, "rewards/margins": -1.2905616760253906, "rewards/rejected": 2.5343563556671143, "step": 8630 }, { "epoch": 1.4, "learning_rate": 4.625787857710955e-07, "logits/chosen": -0.6342705488204956, "logits/rejected": -0.8037237524986267, "logps/chosen": -74.37762451171875, "logps/rejected": -127.6399154663086, "loss": 1.5412, "rewards/accuracies": 0.0, "rewards/chosen": 1.7432632446289062, "rewards/margins": -2.2533929347991943, "rewards/rejected": 3.9966561794281006, "step": 8631 }, { "epoch": 1.4, "learning_rate": 4.624477302691295e-07, "logits/chosen": -0.5211985111236572, "logits/rejected": -0.4495121240615845, "logps/chosen": -101.66386413574219, "logps/rejected": -120.57635498046875, "loss": 1.158, "rewards/accuracies": 1.0, "rewards/chosen": 3.09757399559021, "rewards/margins": 0.0035371780395507812, "rewards/rejected": 3.094036817550659, "step": 8632 }, { "epoch": 1.4, "learning_rate": 4.6231667736167014e-07, "logits/chosen": -0.20460280776023865, "logits/rejected": -0.20460280776023865, "logps/chosen": -6.429748058319092, "logps/rejected": -6.429748058319092, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 0.5980929136276245, "rewards/margins": 0.0, "rewards/rejected": 0.5980929136276245, "step": 8633 }, { "epoch": 1.4, "learning_rate": 4.621856270577718e-07, "logits/chosen": -0.9312650561332703, "logits/rejected": -0.9377676248550415, "logps/chosen": -161.19937133789062, "logps/rejected": -77.56378173828125, "loss": 0.5856, "rewards/accuracies": 0.0, "rewards/chosen": 1.2950439453125, "rewards/margins": -0.6539443731307983, "rewards/rejected": 1.9489883184432983, "step": 8634 }, { "epoch": 1.4, "learning_rate": 4.6205457936648874e-07, "logits/chosen": -0.44398513436317444, "logits/rejected": -0.44398513436317444, "logps/chosen": -95.67344665527344, "logps/rejected": -95.67344665527344, "loss": 0.9208, "rewards/accuracies": 0.0, "rewards/chosen": 0.8141250610351562, "rewards/margins": 0.0, "rewards/rejected": 0.8141250610351562, "step": 8635 }, { "epoch": 1.4, "learning_rate": 4.6192353429687526e-07, "logits/chosen": -0.7654255628585815, "logits/rejected": -0.7345883250236511, "logps/chosen": -71.01010131835938, "logps/rejected": -13.997353553771973, "loss": 0.2349, "rewards/accuracies": 1.0, "rewards/chosen": 1.0577682256698608, "rewards/margins": 0.5383911728858948, "rewards/rejected": 0.5193770527839661, "step": 8636 }, { "epoch": 1.4, "learning_rate": 4.6179249185798523e-07, "logits/chosen": -0.7075590491294861, "logits/rejected": -0.6879332065582275, "logps/chosen": -41.72254180908203, "logps/rejected": -39.04308319091797, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 0.878644585609436, "rewards/margins": 0.5505645871162415, "rewards/rejected": 0.3280799984931946, "step": 8637 }, { "epoch": 1.4, "learning_rate": 4.616614520588726e-07, "logits/chosen": -0.6586619019508362, "logits/rejected": -0.6586619019508362, "logps/chosen": -0.23458153009414673, "logps/rejected": -0.23458153009414673, "loss": 0.4728, "rewards/accuracies": 0.0, "rewards/chosen": 0.06441312283277512, "rewards/margins": 0.0, "rewards/rejected": 0.06441312283277512, "step": 8638 }, { "epoch": 1.4, "learning_rate": 4.6153041490859064e-07, "logits/chosen": -0.3428541421890259, "logits/rejected": -0.3828408718109131, "logps/chosen": -15.093502044677734, "logps/rejected": -28.966732025146484, "loss": 0.5916, "rewards/accuracies": 0.0, "rewards/chosen": 0.8227106332778931, "rewards/margins": -0.09789520502090454, "rewards/rejected": 0.9206058382987976, "step": 8639 }, { "epoch": 1.4, "learning_rate": 4.613993804161932e-07, "logits/chosen": -0.8155993819236755, "logits/rejected": -0.8155993819236755, "logps/chosen": -92.0392837524414, "logps/rejected": -92.0392837524414, "loss": 0.7722, "rewards/accuracies": 0.0, "rewards/chosen": 1.113287329673767, "rewards/margins": 0.0, "rewards/rejected": 1.113287329673767, "step": 8640 }, { "epoch": 1.4, "learning_rate": 4.6126834859073326e-07, "logits/chosen": -0.8134709596633911, "logits/rejected": -0.7402708530426025, "logps/chosen": -88.79492950439453, "logps/rejected": -48.93841552734375, "loss": 1.2336, "rewards/accuracies": 1.0, "rewards/chosen": 3.7447898387908936, "rewards/margins": 1.346489667892456, "rewards/rejected": 2.3983001708984375, "step": 8641 }, { "epoch": 1.4, "learning_rate": 4.61137319441264e-07, "logits/chosen": -0.8955543637275696, "logits/rejected": -0.9613337516784668, "logps/chosen": -237.27438354492188, "logps/rejected": -38.32147216796875, "loss": 0.0593, "rewards/accuracies": 1.0, "rewards/chosen": 5.5731353759765625, "rewards/margins": 5.378694534301758, "rewards/rejected": 0.1944408416748047, "step": 8642 }, { "epoch": 1.4, "learning_rate": 4.610062929768382e-07, "logits/chosen": -0.4747985601425171, "logits/rejected": -0.49725431203842163, "logps/chosen": -47.28276062011719, "logps/rejected": -62.52756881713867, "loss": 0.5239, "rewards/accuracies": 0.0, "rewards/chosen": 1.4018253087997437, "rewards/margins": -0.5594367980957031, "rewards/rejected": 1.9612621068954468, "step": 8643 }, { "epoch": 1.4, "learning_rate": 4.6087526920650867e-07, "logits/chosen": -0.8126580715179443, "logits/rejected": -0.7335626482963562, "logps/chosen": -78.39849090576172, "logps/rejected": -93.30580139160156, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 6.15569543838501, "rewards/margins": 2.864485263824463, "rewards/rejected": 3.291210174560547, "step": 8644 }, { "epoch": 1.4, "learning_rate": 4.6074424813932775e-07, "logits/chosen": -0.8421468138694763, "logits/rejected": -0.783694863319397, "logps/chosen": -46.856807708740234, "logps/rejected": -12.510392189025879, "loss": 1.181, "rewards/accuracies": 1.0, "rewards/chosen": 1.4448429346084595, "rewards/margins": 0.8666216731071472, "rewards/rejected": 0.5782212615013123, "step": 8645 }, { "epoch": 1.4, "learning_rate": 4.606132297843479e-07, "logits/chosen": -0.23632405698299408, "logits/rejected": -0.2469443529844284, "logps/chosen": -33.67783737182617, "logps/rejected": -35.07257843017578, "loss": 2.0978, "rewards/accuracies": 0.0, "rewards/chosen": -0.4386138916015625, "rewards/margins": -0.38991624116897583, "rewards/rejected": -0.048697661608457565, "step": 8646 }, { "epoch": 1.4, "learning_rate": 4.604822141506212e-07, "logits/chosen": -0.7234545946121216, "logits/rejected": -0.7269318103790283, "logps/chosen": -72.01202392578125, "logps/rejected": -203.97677612304688, "loss": 0.3074, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532668828964233, "rewards/margins": 0.39616847038269043, "rewards/rejected": 1.157098412513733, "step": 8647 }, { "epoch": 1.4, "learning_rate": 4.603512012471995e-07, "logits/chosen": -0.49837249517440796, "logits/rejected": -0.5416297912597656, "logps/chosen": -1.5454626083374023, "logps/rejected": -48.330848693847656, "loss": 0.4989, "rewards/accuracies": 1.0, "rewards/chosen": 0.3797883689403534, "rewards/margins": 0.08565345406532288, "rewards/rejected": 0.2941349148750305, "step": 8648 }, { "epoch": 1.4, "learning_rate": 4.602201910831347e-07, "logits/chosen": -0.4103038012981415, "logits/rejected": -0.32160940766334534, "logps/chosen": -39.53438949584961, "logps/rejected": -13.482400894165039, "loss": 1.2575, "rewards/accuracies": 1.0, "rewards/chosen": 1.8005183935165405, "rewards/margins": 0.5954066514968872, "rewards/rejected": 1.2051117420196533, "step": 8649 }, { "epoch": 1.4, "learning_rate": 4.600891836674783e-07, "logits/chosen": -0.7619093656539917, "logits/rejected": -0.680987536907196, "logps/chosen": -72.34271240234375, "logps/rejected": -72.54621887207031, "loss": 0.8396, "rewards/accuracies": 1.0, "rewards/chosen": 2.7066636085510254, "rewards/margins": 0.1532905101776123, "rewards/rejected": 2.553373098373413, "step": 8650 }, { "epoch": 1.4, "learning_rate": 4.599581790092817e-07, "logits/chosen": -0.895654559135437, "logits/rejected": -0.7804194092750549, "logps/chosen": -46.07868194580078, "logps/rejected": -25.01228141784668, "loss": 1.2518, "rewards/accuracies": 1.0, "rewards/chosen": 1.2992218732833862, "rewards/margins": 1.386391282081604, "rewards/rejected": -0.08716946095228195, "step": 8651 }, { "epoch": 1.4, "learning_rate": 4.5982717711759595e-07, "logits/chosen": -0.5123209357261658, "logits/rejected": -0.3559722900390625, "logps/chosen": -133.23809814453125, "logps/rejected": -58.18034362792969, "loss": 0.1176, "rewards/accuracies": 1.0, "rewards/chosen": 6.994389533996582, "rewards/margins": 4.620083808898926, "rewards/rejected": 2.3743057250976562, "step": 8652 }, { "epoch": 1.4, "learning_rate": 4.596961780014722e-07, "logits/chosen": -0.844832718372345, "logits/rejected": -0.8934094905853271, "logps/chosen": -230.58123779296875, "logps/rejected": -83.44491577148438, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 3.8632447719573975, "rewards/margins": 0.7389748096466064, "rewards/rejected": 3.124269962310791, "step": 8653 }, { "epoch": 1.4, "learning_rate": 4.595651816699612e-07, "logits/chosen": -0.7617949843406677, "logits/rejected": -0.7842475771903992, "logps/chosen": -84.37391662597656, "logps/rejected": -59.61054229736328, "loss": 1.2242, "rewards/accuracies": 0.0, "rewards/chosen": 0.1649932861328125, "rewards/margins": -1.6441749334335327, "rewards/rejected": 1.8091682195663452, "step": 8654 }, { "epoch": 1.4, "learning_rate": 4.5943418813211356e-07, "logits/chosen": -0.14156867563724518, "logits/rejected": -0.14171834290027618, "logps/chosen": -3.6503183841705322, "logps/rejected": -1.5526219606399536, "loss": 0.356, "rewards/accuracies": 0.0, "rewards/chosen": 0.2915860116481781, "rewards/margins": -0.0365297794342041, "rewards/rejected": 0.3281157910823822, "step": 8655 }, { "epoch": 1.4, "learning_rate": 4.5930319739697967e-07, "logits/chosen": -0.40672534704208374, "logits/rejected": -0.3415979743003845, "logps/chosen": -49.346397399902344, "logps/rejected": -39.0655632019043, "loss": 0.9268, "rewards/accuracies": 1.0, "rewards/chosen": 1.9132041931152344, "rewards/margins": 0.12366902828216553, "rewards/rejected": 1.7895351648330688, "step": 8656 }, { "epoch": 1.41, "learning_rate": 4.5917220947360976e-07, "logits/chosen": -0.768834114074707, "logits/rejected": -0.5457000732421875, "logps/chosen": -176.13072204589844, "logps/rejected": -84.32124328613281, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 5.441522121429443, "rewards/margins": 1.164964199066162, "rewards/rejected": 4.276557922363281, "step": 8657 }, { "epoch": 1.41, "learning_rate": 4.590412243710538e-07, "logits/chosen": -0.8918130993843079, "logits/rejected": -0.8559706807136536, "logps/chosen": -124.8878402709961, "logps/rejected": -88.50679016113281, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 0.9645866751670837, "rewards/margins": -1.1356024742126465, "rewards/rejected": 2.100189208984375, "step": 8658 }, { "epoch": 1.41, "learning_rate": 4.5891024209836176e-07, "logits/chosen": -0.4509138762950897, "logits/rejected": -0.3875195384025574, "logps/chosen": -97.82179260253906, "logps/rejected": -39.522254943847656, "loss": 0.5739, "rewards/accuracies": 0.0, "rewards/chosen": 1.3177566528320312, "rewards/margins": -0.6406639814376831, "rewards/rejected": 1.9584206342697144, "step": 8659 }, { "epoch": 1.41, "learning_rate": 4.5877926266458317e-07, "logits/chosen": -1.1708433628082275, "logits/rejected": -1.2066656351089478, "logps/chosen": -110.57276916503906, "logps/rejected": -77.54899597167969, "loss": 0.8676, "rewards/accuracies": 1.0, "rewards/chosen": 4.504803657531738, "rewards/margins": 1.3957092761993408, "rewards/rejected": 3.1090943813323975, "step": 8660 }, { "epoch": 1.41, "learning_rate": 4.5864828607876747e-07, "logits/chosen": -0.810553789138794, "logits/rejected": -0.7380568385124207, "logps/chosen": -68.1981430053711, "logps/rejected": -61.8563232421875, "loss": 0.8159, "rewards/accuracies": 1.0, "rewards/chosen": 3.5167641639709473, "rewards/margins": 1.3063819408416748, "rewards/rejected": 2.2103822231292725, "step": 8661 }, { "epoch": 1.41, "learning_rate": 4.5851731234996394e-07, "logits/chosen": -0.7476934194564819, "logits/rejected": -0.7712556719779968, "logps/chosen": -99.64818572998047, "logps/rejected": -136.40521240234375, "loss": 0.2881, "rewards/accuracies": 1.0, "rewards/chosen": 5.179535865783691, "rewards/margins": 0.407991886138916, "rewards/rejected": 4.771543979644775, "step": 8662 }, { "epoch": 1.41, "learning_rate": 4.5838634148722164e-07, "logits/chosen": -0.3990381062030792, "logits/rejected": -0.4643547236919403, "logps/chosen": -87.88845825195312, "logps/rejected": -89.03425598144531, "loss": 0.6863, "rewards/accuracies": 0.0, "rewards/chosen": 1.1313087940216064, "rewards/margins": -0.3435325622558594, "rewards/rejected": 1.4748413562774658, "step": 8663 }, { "epoch": 1.41, "learning_rate": 4.582553734995894e-07, "logits/chosen": -0.4868532419204712, "logits/rejected": -0.47490647435188293, "logps/chosen": -46.29358673095703, "logps/rejected": -75.0063705444336, "loss": 0.831, "rewards/accuracies": 0.0, "rewards/chosen": 3.1940605640411377, "rewards/margins": -0.8908350467681885, "rewards/rejected": 4.084895610809326, "step": 8664 }, { "epoch": 1.41, "learning_rate": 4.5812440839611584e-07, "logits/chosen": -0.7716318964958191, "logits/rejected": -0.7364495396614075, "logps/chosen": -49.82623291015625, "logps/rejected": -18.420730590820312, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.5725368857383728, "rewards/margins": 0.24028074741363525, "rewards/rejected": 0.33225613832473755, "step": 8665 }, { "epoch": 1.41, "learning_rate": 4.579934461858494e-07, "logits/chosen": -0.4367802143096924, "logits/rejected": -0.4793914556503296, "logps/chosen": -95.43897247314453, "logps/rejected": -68.37213897705078, "loss": 0.3994, "rewards/accuracies": 0.0, "rewards/chosen": 1.5219932794570923, "rewards/margins": -0.14210665225982666, "rewards/rejected": 1.664099931716919, "step": 8666 }, { "epoch": 1.41, "learning_rate": 4.578624868778384e-07, "logits/chosen": -0.8741226196289062, "logits/rejected": -0.9634612202644348, "logps/chosen": -88.73887634277344, "logps/rejected": -202.322021484375, "loss": 1.2581, "rewards/accuracies": 0.0, "rewards/chosen": 1.1021744012832642, "rewards/margins": -2.3860459327697754, "rewards/rejected": 3.48822021484375, "step": 8667 }, { "epoch": 1.41, "learning_rate": 4.577315304811308e-07, "logits/chosen": -0.7295253872871399, "logits/rejected": -0.7280154824256897, "logps/chosen": -60.67279052734375, "logps/rejected": -49.61328125, "loss": 1.9988, "rewards/accuracies": 0.0, "rewards/chosen": 1.9532371759414673, "rewards/margins": -0.18614590167999268, "rewards/rejected": 2.13938307762146, "step": 8668 }, { "epoch": 1.41, "learning_rate": 4.5760057700477456e-07, "logits/chosen": -0.4992468059062958, "logits/rejected": -0.37197446823120117, "logps/chosen": -59.40378952026367, "logps/rejected": -20.072181701660156, "loss": 0.608, "rewards/accuracies": 1.0, "rewards/chosen": 1.4564952850341797, "rewards/margins": 1.2589876651763916, "rewards/rejected": 0.19750766456127167, "step": 8669 }, { "epoch": 1.41, "learning_rate": 4.5746962645781723e-07, "logits/chosen": -0.8406555652618408, "logits/rejected": -0.7861805558204651, "logps/chosen": -121.53782653808594, "logps/rejected": -68.7734603881836, "loss": 1.1198, "rewards/accuracies": 0.0, "rewards/chosen": 0.7269043326377869, "rewards/margins": -1.9587852954864502, "rewards/rejected": 2.685689687728882, "step": 8670 }, { "epoch": 1.41, "learning_rate": 4.573386788493063e-07, "logits/chosen": -0.7353251576423645, "logits/rejected": -0.697966456413269, "logps/chosen": -21.623193740844727, "logps/rejected": -39.566123962402344, "loss": 1.8375, "rewards/accuracies": 0.0, "rewards/chosen": 1.248489260673523, "rewards/margins": -1.9542771577835083, "rewards/rejected": 3.2027664184570312, "step": 8671 }, { "epoch": 1.41, "learning_rate": 4.5720773418828895e-07, "logits/chosen": -0.3424813449382782, "logits/rejected": -0.3624509572982788, "logps/chosen": -77.08598327636719, "logps/rejected": -68.21754455566406, "loss": 0.5152, "rewards/accuracies": 0.0, "rewards/chosen": 1.9048378467559814, "rewards/margins": -0.5249412059783936, "rewards/rejected": 2.429779052734375, "step": 8672 }, { "epoch": 1.41, "learning_rate": 4.570767924838123e-07, "logits/chosen": -0.39575058221817017, "logits/rejected": -0.39575058221817017, "logps/chosen": -1.6624348163604736, "logps/rejected": -1.6624348163604736, "loss": 0.8295, "rewards/accuracies": 0.0, "rewards/chosen": 0.20071308314800262, "rewards/margins": 0.0, "rewards/rejected": 0.20071308314800262, "step": 8673 }, { "epoch": 1.41, "learning_rate": 4.5694585374492304e-07, "logits/chosen": -0.529318630695343, "logits/rejected": -0.6094415187835693, "logps/chosen": -111.51006317138672, "logps/rejected": -116.90724182128906, "loss": 2.0029, "rewards/accuracies": 0.0, "rewards/chosen": 1.473615288734436, "rewards/margins": -3.013914108276367, "rewards/rejected": 4.487529277801514, "step": 8674 }, { "epoch": 1.41, "learning_rate": 4.5681491798066803e-07, "logits/chosen": -0.260990172624588, "logits/rejected": -0.18709716200828552, "logps/chosen": -68.3072509765625, "logps/rejected": -49.62361145019531, "loss": 0.6386, "rewards/accuracies": 1.0, "rewards/chosen": 2.1561081409454346, "rewards/margins": 0.36983418464660645, "rewards/rejected": 1.7862739562988281, "step": 8675 }, { "epoch": 1.41, "learning_rate": 4.566839852000935e-07, "logits/chosen": -0.5746940970420837, "logits/rejected": -0.655616819858551, "logps/chosen": -58.12868118286133, "logps/rejected": -98.40283966064453, "loss": 0.4001, "rewards/accuracies": 0.0, "rewards/chosen": 0.8893070220947266, "rewards/margins": -0.20074963569641113, "rewards/rejected": 1.0900566577911377, "step": 8676 }, { "epoch": 1.41, "learning_rate": 4.565530554122458e-07, "logits/chosen": -0.8077675700187683, "logits/rejected": -0.793258547782898, "logps/chosen": -80.49759674072266, "logps/rejected": -61.99810028076172, "loss": 1.5568, "rewards/accuracies": 0.0, "rewards/chosen": 1.1980087757110596, "rewards/margins": -0.3526496887207031, "rewards/rejected": 1.5506584644317627, "step": 8677 }, { "epoch": 1.41, "learning_rate": 4.5642212862617085e-07, "logits/chosen": -0.543160080909729, "logits/rejected": -0.543160080909729, "logps/chosen": -45.529151916503906, "logps/rejected": -45.529151916503906, "loss": 0.3734, "rewards/accuracies": 0.0, "rewards/chosen": 0.9123153686523438, "rewards/margins": 0.0, "rewards/rejected": 0.9123153686523438, "step": 8678 }, { "epoch": 1.41, "learning_rate": 4.5629120485091447e-07, "logits/chosen": -0.36295175552368164, "logits/rejected": -0.37568584084510803, "logps/chosen": -167.77957153320312, "logps/rejected": -43.736724853515625, "loss": 0.096, "rewards/accuracies": 1.0, "rewards/chosen": 6.904614448547363, "rewards/margins": 4.7732720375061035, "rewards/rejected": 2.1313424110412598, "step": 8679 }, { "epoch": 1.41, "learning_rate": 4.561602840955223e-07, "logits/chosen": -0.7574809789657593, "logits/rejected": -0.7948395609855652, "logps/chosen": -77.81282043457031, "logps/rejected": -62.41009521484375, "loss": 2.097, "rewards/accuracies": 0.0, "rewards/chosen": 1.1223411560058594, "rewards/margins": -1.3625938892364502, "rewards/rejected": 2.4849350452423096, "step": 8680 }, { "epoch": 1.41, "learning_rate": 4.560293663690396e-07, "logits/chosen": -0.3964043855667114, "logits/rejected": -0.457375705242157, "logps/chosen": -63.37648010253906, "logps/rejected": -73.29476165771484, "loss": 1.0078, "rewards/accuracies": 0.0, "rewards/chosen": 1.487836480140686, "rewards/margins": -0.3721557855606079, "rewards/rejected": 1.859992265701294, "step": 8681 }, { "epoch": 1.41, "learning_rate": 4.558984516805117e-07, "logits/chosen": -0.695534884929657, "logits/rejected": -0.754122257232666, "logps/chosen": -90.54979705810547, "logps/rejected": -123.30325317382812, "loss": 1.5781, "rewards/accuracies": 0.0, "rewards/chosen": 0.6607368588447571, "rewards/margins": -3.0654168128967285, "rewards/rejected": 3.726153612136841, "step": 8682 }, { "epoch": 1.41, "learning_rate": 4.557675400389835e-07, "logits/chosen": -0.5570064783096313, "logits/rejected": -0.5268604755401611, "logps/chosen": -68.63092803955078, "logps/rejected": -106.03097534179688, "loss": 1.106, "rewards/accuracies": 1.0, "rewards/chosen": 1.4750328063964844, "rewards/margins": 0.15466690063476562, "rewards/rejected": 1.3203659057617188, "step": 8683 }, { "epoch": 1.41, "learning_rate": 4.5563663145349967e-07, "logits/chosen": -0.895485520362854, "logits/rejected": -0.8812428116798401, "logps/chosen": -119.60383605957031, "logps/rejected": -46.78864288330078, "loss": 0.8408, "rewards/accuracies": 0.0, "rewards/chosen": 0.514874279499054, "rewards/margins": -1.4660179615020752, "rewards/rejected": 1.9808921813964844, "step": 8684 }, { "epoch": 1.41, "learning_rate": 4.555057259331049e-07, "logits/chosen": -0.8605493307113647, "logits/rejected": -0.6750127673149109, "logps/chosen": -121.17872619628906, "logps/rejected": -148.4983673095703, "loss": 0.3988, "rewards/accuracies": 1.0, "rewards/chosen": 4.867692470550537, "rewards/margins": 0.223907470703125, "rewards/rejected": 4.643784999847412, "step": 8685 }, { "epoch": 1.41, "learning_rate": 4.5537482348684347e-07, "logits/chosen": -0.22980962693691254, "logits/rejected": -0.23521141707897186, "logps/chosen": -8.198921203613281, "logps/rejected": -2.333158254623413, "loss": 0.5756, "rewards/accuracies": 0.0, "rewards/chosen": 0.13956986367702484, "rewards/margins": -0.1741231232881546, "rewards/rejected": 0.31369298696517944, "step": 8686 }, { "epoch": 1.41, "learning_rate": 4.5524392412375944e-07, "logits/chosen": -0.6465445160865784, "logits/rejected": -0.5888029336929321, "logps/chosen": -81.11603546142578, "logps/rejected": -75.60785675048828, "loss": 0.4055, "rewards/accuracies": 0.0, "rewards/chosen": 2.7040398120880127, "rewards/margins": -0.2181687355041504, "rewards/rejected": 2.922208547592163, "step": 8687 }, { "epoch": 1.41, "learning_rate": 4.5511302785289677e-07, "logits/chosen": -0.8768582940101624, "logits/rejected": -0.8389391303062439, "logps/chosen": -156.08267211914062, "logps/rejected": -126.71572875976562, "loss": 0.8805, "rewards/accuracies": 1.0, "rewards/chosen": 5.549099922180176, "rewards/margins": 0.8165040016174316, "rewards/rejected": 4.732595920562744, "step": 8688 }, { "epoch": 1.41, "learning_rate": 4.5498213468329923e-07, "logits/chosen": -0.6797164082527161, "logits/rejected": -0.5579784512519836, "logps/chosen": -96.26013946533203, "logps/rejected": -56.929786682128906, "loss": 0.7167, "rewards/accuracies": 1.0, "rewards/chosen": 3.306476593017578, "rewards/margins": 2.6086113452911377, "rewards/rejected": 0.6978653073310852, "step": 8689 }, { "epoch": 1.41, "learning_rate": 4.5485124462401016e-07, "logits/chosen": -0.533118724822998, "logits/rejected": -0.4495089650154114, "logps/chosen": -43.733863830566406, "logps/rejected": -48.51311111450195, "loss": 2.9717, "rewards/accuracies": 0.0, "rewards/chosen": 1.5685104131698608, "rewards/margins": -1.0091153383255005, "rewards/rejected": 2.5776257514953613, "step": 8690 }, { "epoch": 1.41, "learning_rate": 4.547203576840729e-07, "logits/chosen": -0.825422465801239, "logits/rejected": -0.825422465801239, "logps/chosen": -136.72146606445312, "logps/rejected": -136.72146606445312, "loss": 0.7914, "rewards/accuracies": 0.0, "rewards/chosen": 4.536825656890869, "rewards/margins": 0.0, "rewards/rejected": 4.536825656890869, "step": 8691 }, { "epoch": 1.41, "learning_rate": 4.5458947387253043e-07, "logits/chosen": -0.727748453617096, "logits/rejected": -0.7252210378646851, "logps/chosen": -116.5290756225586, "logps/rejected": -85.52447509765625, "loss": 1.8898, "rewards/accuracies": 1.0, "rewards/chosen": 1.274492621421814, "rewards/margins": 0.4485008120536804, "rewards/rejected": 0.8259918093681335, "step": 8692 }, { "epoch": 1.41, "learning_rate": 4.544585931984257e-07, "logits/chosen": -0.4354805648326874, "logits/rejected": -0.43260037899017334, "logps/chosen": -19.237319946289062, "logps/rejected": -6.479473114013672, "loss": 0.4494, "rewards/accuracies": 0.0, "rewards/chosen": 0.08376121520996094, "rewards/margins": -0.08540049195289612, "rewards/rejected": 0.16916170716285706, "step": 8693 }, { "epoch": 1.41, "learning_rate": 4.5432771567080127e-07, "logits/chosen": -0.8074394464492798, "logits/rejected": -0.7848204970359802, "logps/chosen": -31.65154457092285, "logps/rejected": -77.4636459350586, "loss": 1.3682, "rewards/accuracies": 0.0, "rewards/chosen": 1.178632140159607, "rewards/margins": -1.8662241697311401, "rewards/rejected": 3.044856309890747, "step": 8694 }, { "epoch": 1.41, "learning_rate": 4.541968412986995e-07, "logits/chosen": -0.5638329386711121, "logits/rejected": -0.1330009400844574, "logps/chosen": -113.58683776855469, "logps/rejected": -71.23269653320312, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 6.422386169433594, "rewards/margins": 3.714723825454712, "rewards/rejected": 2.707662343978882, "step": 8695 }, { "epoch": 1.41, "learning_rate": 4.540659700911626e-07, "logits/chosen": -0.671931266784668, "logits/rejected": -0.5619054436683655, "logps/chosen": -124.71817016601562, "logps/rejected": -61.52893829345703, "loss": 0.736, "rewards/accuracies": 1.0, "rewards/chosen": 4.752081394195557, "rewards/margins": 1.8264825344085693, "rewards/rejected": 2.9255988597869873, "step": 8696 }, { "epoch": 1.41, "learning_rate": 4.5393510205723255e-07, "logits/chosen": -0.23628154397010803, "logits/rejected": -0.2663070559501648, "logps/chosen": -46.66661071777344, "logps/rejected": -51.6989631652832, "loss": 1.0713, "rewards/accuracies": 0.0, "rewards/chosen": 1.2154353857040405, "rewards/margins": -0.5948059558868408, "rewards/rejected": 1.8102413415908813, "step": 8697 }, { "epoch": 1.41, "learning_rate": 4.5380423720595105e-07, "logits/chosen": -0.29820311069488525, "logits/rejected": -0.29820311069488525, "logps/chosen": -3.170416831970215, "logps/rejected": -3.170416831970215, "loss": 0.94, "rewards/accuracies": 0.0, "rewards/chosen": 0.27864503860473633, "rewards/margins": 0.0, "rewards/rejected": 0.27864503860473633, "step": 8698 }, { "epoch": 1.41, "learning_rate": 4.5367337554635973e-07, "logits/chosen": -0.5242993831634521, "logits/rejected": -0.4978190064430237, "logps/chosen": -97.18367004394531, "logps/rejected": -45.79878234863281, "loss": 0.6625, "rewards/accuracies": 1.0, "rewards/chosen": 1.4769012928009033, "rewards/margins": 0.22758984565734863, "rewards/rejected": 1.2493114471435547, "step": 8699 }, { "epoch": 1.41, "learning_rate": 4.5354251708749977e-07, "logits/chosen": -0.7172635197639465, "logits/rejected": -0.7094908952713013, "logps/chosen": -108.63501739501953, "logps/rejected": -78.48350524902344, "loss": 0.6123, "rewards/accuracies": 0.0, "rewards/chosen": 1.5708427429199219, "rewards/margins": -0.47149658203125, "rewards/rejected": 2.042339324951172, "step": 8700 }, { "epoch": 1.41, "learning_rate": 4.5341166183841224e-07, "logits/chosen": -0.6962865591049194, "logits/rejected": -0.5785015225410461, "logps/chosen": -92.61249542236328, "logps/rejected": -17.592397689819336, "loss": 0.8063, "rewards/accuracies": 1.0, "rewards/chosen": 1.0770752429962158, "rewards/margins": 0.5462955832481384, "rewards/rejected": 0.5307796597480774, "step": 8701 }, { "epoch": 1.41, "learning_rate": 4.532808098081381e-07, "logits/chosen": -0.5377246141433716, "logits/rejected": -0.8674480319023132, "logps/chosen": -86.11172485351562, "logps/rejected": -25.495731353759766, "loss": 0.9322, "rewards/accuracies": 1.0, "rewards/chosen": 1.3614548444747925, "rewards/margins": 0.7569231390953064, "rewards/rejected": 0.6045317053794861, "step": 8702 }, { "epoch": 1.41, "learning_rate": 4.5314996100571797e-07, "logits/chosen": -0.6884831786155701, "logits/rejected": -0.7019990086555481, "logps/chosen": -12.207013130187988, "logps/rejected": -1.74277663230896, "loss": 0.7787, "rewards/accuracies": 0.0, "rewards/chosen": -0.08866625279188156, "rewards/margins": -0.4680382311344147, "rewards/rejected": 0.3793719708919525, "step": 8703 }, { "epoch": 1.41, "learning_rate": 4.5301911544019215e-07, "logits/chosen": -0.8866146206855774, "logits/rejected": -0.8866146206855774, "logps/chosen": -63.423072814941406, "logps/rejected": -63.423072814941406, "loss": 0.3953, "rewards/accuracies": 0.0, "rewards/chosen": 1.6362686157226562, "rewards/margins": 0.0, "rewards/rejected": 1.6362686157226562, "step": 8704 }, { "epoch": 1.41, "learning_rate": 4.5288827312060104e-07, "logits/chosen": -0.3855006992816925, "logits/rejected": -0.37710317969322205, "logps/chosen": -17.34387969970703, "logps/rejected": -8.065765380859375, "loss": 0.7729, "rewards/accuracies": 1.0, "rewards/chosen": 0.7817638516426086, "rewards/margins": 0.5213945508003235, "rewards/rejected": 0.26036930084228516, "step": 8705 }, { "epoch": 1.41, "learning_rate": 4.5275743405598437e-07, "logits/chosen": -0.9494797587394714, "logits/rejected": -0.9798811674118042, "logps/chosen": -18.84977149963379, "logps/rejected": -88.91986083984375, "loss": 1.6233, "rewards/accuracies": 0.0, "rewards/chosen": 0.4816949963569641, "rewards/margins": -3.1722373962402344, "rewards/rejected": 3.6539323329925537, "step": 8706 }, { "epoch": 1.41, "learning_rate": 4.52626598255382e-07, "logits/chosen": -0.6256400942802429, "logits/rejected": -0.6942927837371826, "logps/chosen": -113.51559448242188, "logps/rejected": -104.24087524414062, "loss": 1.7041, "rewards/accuracies": 0.0, "rewards/chosen": 3.9698486328125, "rewards/margins": -3.0629043579101562, "rewards/rejected": 7.032752990722656, "step": 8707 }, { "epoch": 1.41, "learning_rate": 4.524957657278335e-07, "logits/chosen": -0.8973143696784973, "logits/rejected": -0.8762489557266235, "logps/chosen": -46.08290481567383, "logps/rejected": -47.50205993652344, "loss": 0.6161, "rewards/accuracies": 1.0, "rewards/chosen": 2.087379217147827, "rewards/margins": 0.9484597444534302, "rewards/rejected": 1.138919472694397, "step": 8708 }, { "epoch": 1.41, "learning_rate": 4.5236493648237803e-07, "logits/chosen": -0.4505104124546051, "logits/rejected": -0.44631296396255493, "logps/chosen": -47.656978607177734, "logps/rejected": -80.97969055175781, "loss": 0.4669, "rewards/accuracies": 1.0, "rewards/chosen": 1.3442379236221313, "rewards/margins": 0.5960552096366882, "rewards/rejected": 0.7481827139854431, "step": 8709 }, { "epoch": 1.41, "learning_rate": 4.5223411052805474e-07, "logits/chosen": -0.7956382036209106, "logits/rejected": -0.6617471575737, "logps/chosen": -210.25328063964844, "logps/rejected": -162.85488891601562, "loss": 0.503, "rewards/accuracies": 0.0, "rewards/chosen": 5.82705545425415, "rewards/margins": -0.281951904296875, "rewards/rejected": 6.109007358551025, "step": 8710 }, { "epoch": 1.41, "learning_rate": 4.5210328787390246e-07, "logits/chosen": -0.6948676109313965, "logits/rejected": -0.6652592420578003, "logps/chosen": -88.90829467773438, "logps/rejected": -102.23228454589844, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 2.4150283336639404, "rewards/margins": 1.186285376548767, "rewards/rejected": 1.2287429571151733, "step": 8711 }, { "epoch": 1.41, "learning_rate": 4.5197246852895974e-07, "logits/chosen": -0.747546374797821, "logits/rejected": -0.7346748113632202, "logps/chosen": -49.80254364013672, "logps/rejected": -64.93929290771484, "loss": 0.1442, "rewards/accuracies": 1.0, "rewards/chosen": 2.320852041244507, "rewards/margins": 1.3487130403518677, "rewards/rejected": 0.9721390008926392, "step": 8712 }, { "epoch": 1.41, "learning_rate": 4.518416525022651e-07, "logits/chosen": -0.8169794678688049, "logits/rejected": -0.5051845908164978, "logps/chosen": -190.24246215820312, "logps/rejected": -21.479373931884766, "loss": 0.1646, "rewards/accuracies": 1.0, "rewards/chosen": 4.574206829071045, "rewards/margins": 4.196278095245361, "rewards/rejected": 0.3779289424419403, "step": 8713 }, { "epoch": 1.41, "learning_rate": 4.517108398028565e-07, "logits/chosen": -0.8005756735801697, "logits/rejected": -0.7560255527496338, "logps/chosen": -58.28057861328125, "logps/rejected": -72.3037338256836, "loss": 0.1214, "rewards/accuracies": 1.0, "rewards/chosen": 2.278045654296875, "rewards/margins": 1.5896461009979248, "rewards/rejected": 0.6883994936943054, "step": 8714 }, { "epoch": 1.41, "learning_rate": 4.5158003043977207e-07, "logits/chosen": -0.8848152160644531, "logits/rejected": -0.8873777389526367, "logps/chosen": -53.327144622802734, "logps/rejected": -56.759971618652344, "loss": 1.8206, "rewards/accuracies": 0.0, "rewards/chosen": 1.807963252067566, "rewards/margins": -0.019478917121887207, "rewards/rejected": 1.8274421691894531, "step": 8715 }, { "epoch": 1.41, "learning_rate": 4.514492244220493e-07, "logits/chosen": -1.236074447631836, "logits/rejected": -1.2472172975540161, "logps/chosen": -150.5460205078125, "logps/rejected": -45.98040008544922, "loss": 0.6238, "rewards/accuracies": 0.0, "rewards/chosen": 1.235510230064392, "rewards/margins": -0.7258538007736206, "rewards/rejected": 1.9613640308380127, "step": 8716 }, { "epoch": 1.41, "learning_rate": 4.513184217587258e-07, "logits/chosen": -0.5109317898750305, "logits/rejected": -0.5165753960609436, "logps/chosen": -1.7988532781600952, "logps/rejected": -19.18284797668457, "loss": 0.7522, "rewards/accuracies": 1.0, "rewards/chosen": 0.590140700340271, "rewards/margins": 0.3421924114227295, "rewards/rejected": 0.2479482740163803, "step": 8717 }, { "epoch": 1.42, "learning_rate": 4.511876224588386e-07, "logits/chosen": -0.2768111824989319, "logits/rejected": -0.2767537236213684, "logps/chosen": -4.663508892059326, "logps/rejected": -7.805919647216797, "loss": 0.4551, "rewards/accuracies": 1.0, "rewards/chosen": 0.20773163437843323, "rewards/margins": 0.24528875946998596, "rewards/rejected": -0.037557125091552734, "step": 8718 }, { "epoch": 1.42, "learning_rate": 4.510568265314249e-07, "logits/chosen": -0.5735297203063965, "logits/rejected": -0.571007251739502, "logps/chosen": -61.18708038330078, "logps/rejected": -75.8641357421875, "loss": 0.7436, "rewards/accuracies": 1.0, "rewards/chosen": 1.6593674421310425, "rewards/margins": 0.4545936584472656, "rewards/rejected": 1.2047737836837769, "step": 8719 }, { "epoch": 1.42, "learning_rate": 4.5092603398552165e-07, "logits/chosen": -0.8431312441825867, "logits/rejected": -0.8382418751716614, "logps/chosen": -159.84776306152344, "logps/rejected": -106.87454223632812, "loss": 0.4101, "rewards/accuracies": 0.0, "rewards/chosen": 1.6670913696289062, "rewards/margins": -0.10560917854309082, "rewards/rejected": 1.772700548171997, "step": 8720 }, { "epoch": 1.42, "learning_rate": 4.507952448301648e-07, "logits/chosen": -0.9355642199516296, "logits/rejected": -0.8616376519203186, "logps/chosen": -48.10758972167969, "logps/rejected": -68.91398620605469, "loss": 0.8477, "rewards/accuracies": 0.0, "rewards/chosen": 2.8629701137542725, "rewards/margins": -0.12802577018737793, "rewards/rejected": 2.9909958839416504, "step": 8721 }, { "epoch": 1.42, "learning_rate": 4.50664459074391e-07, "logits/chosen": -0.3615094721317291, "logits/rejected": -0.3677537441253662, "logps/chosen": -38.327999114990234, "logps/rejected": -36.35127258300781, "loss": 2.4757, "rewards/accuracies": 0.0, "rewards/chosen": 0.99462890625, "rewards/margins": -0.10879552364349365, "rewards/rejected": 1.1034244298934937, "step": 8722 }, { "epoch": 1.42, "learning_rate": 4.505336767272362e-07, "logits/chosen": -0.6121605038642883, "logits/rejected": -0.6981635689735413, "logps/chosen": -187.45596313476562, "logps/rejected": -70.095458984375, "loss": 1.1372, "rewards/accuracies": 1.0, "rewards/chosen": 3.4942901134490967, "rewards/margins": 0.8482489585876465, "rewards/rejected": 2.64604115486145, "step": 8723 }, { "epoch": 1.42, "learning_rate": 4.5040289779773635e-07, "logits/chosen": -0.9400933384895325, "logits/rejected": -0.7547763586044312, "logps/chosen": -112.79638671875, "logps/rejected": -131.47154235839844, "loss": 0.3423, "rewards/accuracies": 1.0, "rewards/chosen": 6.877292156219482, "rewards/margins": 0.03956031799316406, "rewards/rejected": 6.837731838226318, "step": 8724 }, { "epoch": 1.42, "learning_rate": 4.50272122294927e-07, "logits/chosen": -0.7006343007087708, "logits/rejected": -0.5964481830596924, "logps/chosen": -68.0793228149414, "logps/rejected": -88.01576232910156, "loss": 0.193, "rewards/accuracies": 1.0, "rewards/chosen": 4.035370826721191, "rewards/margins": 0.8534829616546631, "rewards/rejected": 3.1818878650665283, "step": 8725 }, { "epoch": 1.42, "learning_rate": 4.5014135022784345e-07, "logits/chosen": -0.4081733524799347, "logits/rejected": -0.4081733524799347, "logps/chosen": -56.31138610839844, "logps/rejected": -56.31138610839844, "loss": 0.8765, "rewards/accuracies": 0.0, "rewards/chosen": 0.025853348895907402, "rewards/margins": 0.0, "rewards/rejected": 0.025853348895907402, "step": 8726 }, { "epoch": 1.42, "learning_rate": 4.500105816055208e-07, "logits/chosen": -0.7809559106826782, "logits/rejected": -0.6640770435333252, "logps/chosen": -99.99004364013672, "logps/rejected": -95.47174072265625, "loss": 0.6146, "rewards/accuracies": 0.0, "rewards/chosen": 1.7801315784454346, "rewards/margins": -0.8710379600524902, "rewards/rejected": 2.651169538497925, "step": 8727 }, { "epoch": 1.42, "learning_rate": 4.4987981643699396e-07, "logits/chosen": -0.635898768901825, "logits/rejected": -0.5567152500152588, "logps/chosen": -132.5602264404297, "logps/rejected": -75.66943359375, "loss": 0.0286, "rewards/accuracies": 1.0, "rewards/chosen": 6.49507474899292, "rewards/margins": 4.069208145141602, "rewards/rejected": 2.4258668422698975, "step": 8728 }, { "epoch": 1.42, "learning_rate": 4.4974905473129756e-07, "logits/chosen": -0.7112732529640198, "logits/rejected": -0.7272734045982361, "logps/chosen": -100.27711486816406, "logps/rejected": -88.32148742675781, "loss": 0.7033, "rewards/accuracies": 0.0, "rewards/chosen": 2.193042039871216, "rewards/margins": -0.4972960948944092, "rewards/rejected": 2.690338134765625, "step": 8729 }, { "epoch": 1.42, "learning_rate": 4.4961829649746593e-07, "logits/chosen": -0.6055271625518799, "logits/rejected": -0.584098219871521, "logps/chosen": -97.07533264160156, "logps/rejected": -129.60693359375, "loss": 1.3268, "rewards/accuracies": 1.0, "rewards/chosen": 2.2868118286132812, "rewards/margins": 0.3112426996231079, "rewards/rejected": 1.9755691289901733, "step": 8730 }, { "epoch": 1.42, "learning_rate": 4.4948754174453334e-07, "logits/chosen": -0.5447694063186646, "logits/rejected": -0.45312806963920593, "logps/chosen": -74.36112976074219, "logps/rejected": -52.708778381347656, "loss": 0.4766, "rewards/accuracies": 1.0, "rewards/chosen": 1.6105927228927612, "rewards/margins": 0.8073582053184509, "rewards/rejected": 0.8032345175743103, "step": 8731 }, { "epoch": 1.42, "learning_rate": 4.493567904815337e-07, "logits/chosen": -0.8617404103279114, "logits/rejected": -0.8568805456161499, "logps/chosen": -164.0255889892578, "logps/rejected": -64.54374694824219, "loss": 0.4778, "rewards/accuracies": 0.0, "rewards/chosen": 2.7978057861328125, "rewards/margins": -0.005697727203369141, "rewards/rejected": 2.8035035133361816, "step": 8732 }, { "epoch": 1.42, "learning_rate": 4.4922604271750064e-07, "logits/chosen": -0.4905139207839966, "logits/rejected": -0.3780595362186432, "logps/chosen": -56.73242950439453, "logps/rejected": -55.107852935791016, "loss": 0.1067, "rewards/accuracies": 1.0, "rewards/chosen": 3.238889455795288, "rewards/margins": 1.4502202272415161, "rewards/rejected": 1.788669228553772, "step": 8733 }, { "epoch": 1.42, "learning_rate": 4.490952984614676e-07, "logits/chosen": -1.13754141330719, "logits/rejected": -0.8320376873016357, "logps/chosen": -140.3390655517578, "logps/rejected": -157.03399658203125, "loss": 0.2004, "rewards/accuracies": 1.0, "rewards/chosen": 7.506211757659912, "rewards/margins": 2.624546527862549, "rewards/rejected": 4.881665229797363, "step": 8734 }, { "epoch": 1.42, "learning_rate": 4.489645577224678e-07, "logits/chosen": -0.9320559501647949, "logits/rejected": -0.8242597579956055, "logps/chosen": -71.0750961303711, "logps/rejected": -57.88232421875, "loss": 0.5445, "rewards/accuracies": 1.0, "rewards/chosen": 1.6031837463378906, "rewards/margins": 0.33097612857818604, "rewards/rejected": 1.2722076177597046, "step": 8735 }, { "epoch": 1.42, "learning_rate": 4.488338205095341e-07, "logits/chosen": -0.576075553894043, "logits/rejected": -0.5776475667953491, "logps/chosen": -72.051025390625, "logps/rejected": -102.12753295898438, "loss": 1.0139, "rewards/accuracies": 0.0, "rewards/chosen": 2.1466972827911377, "rewards/margins": -0.7543435096740723, "rewards/rejected": 2.90104079246521, "step": 8736 }, { "epoch": 1.42, "learning_rate": 4.4870308683169933e-07, "logits/chosen": -0.525505781173706, "logits/rejected": -0.46503111720085144, "logps/chosen": -24.724687576293945, "logps/rejected": -9.575745582580566, "loss": 0.2901, "rewards/accuracies": 1.0, "rewards/chosen": 2.1439363956451416, "rewards/margins": 1.3895525932312012, "rewards/rejected": 0.7543837428092957, "step": 8737 }, { "epoch": 1.42, "learning_rate": 4.485723566979959e-07, "logits/chosen": -1.4622048139572144, "logits/rejected": -0.8421868681907654, "logps/chosen": -105.57228088378906, "logps/rejected": -125.57476043701172, "loss": 0.7898, "rewards/accuracies": 0.0, "rewards/chosen": 1.5653923749923706, "rewards/margins": -0.4399780035018921, "rewards/rejected": 2.0053703784942627, "step": 8738 }, { "epoch": 1.42, "learning_rate": 4.484416301174559e-07, "logits/chosen": -0.8025301694869995, "logits/rejected": -0.8025301694869995, "logps/chosen": -49.272064208984375, "logps/rejected": -49.272064208984375, "loss": 1.064, "rewards/accuracies": 0.0, "rewards/chosen": 2.4275290966033936, "rewards/margins": 0.0, "rewards/rejected": 2.4275290966033936, "step": 8739 }, { "epoch": 1.42, "learning_rate": 4.4831090709911146e-07, "logits/chosen": -0.642052173614502, "logits/rejected": -0.6878019571304321, "logps/chosen": -83.41262817382812, "logps/rejected": -43.79166793823242, "loss": 1.0433, "rewards/accuracies": 0.0, "rewards/chosen": 0.47840118408203125, "rewards/margins": -1.2821186780929565, "rewards/rejected": 1.7605198621749878, "step": 8740 }, { "epoch": 1.42, "learning_rate": 4.4818018765199426e-07, "logits/chosen": -0.906294584274292, "logits/rejected": -0.9064822793006897, "logps/chosen": -62.370399475097656, "logps/rejected": -78.0264892578125, "loss": 0.9717, "rewards/accuracies": 1.0, "rewards/chosen": 2.636234998703003, "rewards/margins": 0.5819334983825684, "rewards/rejected": 2.0543015003204346, "step": 8741 }, { "epoch": 1.42, "learning_rate": 4.480494717851358e-07, "logits/chosen": -0.8066943883895874, "logits/rejected": -0.72292560338974, "logps/chosen": -66.80139923095703, "logps/rejected": -81.49861907958984, "loss": 0.9677, "rewards/accuracies": 0.0, "rewards/chosen": 1.564675211906433, "rewards/margins": -1.2659317255020142, "rewards/rejected": 2.8306069374084473, "step": 8742 }, { "epoch": 1.42, "learning_rate": 4.4791875950756724e-07, "logits/chosen": -0.8529240489006042, "logits/rejected": -0.8972797393798828, "logps/chosen": -64.71173858642578, "logps/rejected": -99.02269744873047, "loss": 3.0379, "rewards/accuracies": 0.0, "rewards/chosen": 1.6031166315078735, "rewards/margins": -3.318441867828369, "rewards/rejected": 4.921558380126953, "step": 8743 }, { "epoch": 1.42, "learning_rate": 4.477880508283196e-07, "logits/chosen": -0.880840003490448, "logits/rejected": -0.905093252658844, "logps/chosen": -55.083778381347656, "logps/rejected": -133.14073181152344, "loss": 1.5656, "rewards/accuracies": 0.0, "rewards/chosen": 1.073358178138733, "rewards/margins": -2.7224411964416504, "rewards/rejected": 3.7957992553710938, "step": 8744 }, { "epoch": 1.42, "learning_rate": 4.4765734575642364e-07, "logits/chosen": -0.9939652681350708, "logits/rejected": -0.9329077005386353, "logps/chosen": -113.17583465576172, "logps/rejected": -80.5291519165039, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 6.34493350982666, "rewards/margins": 3.0964577198028564, "rewards/rejected": 3.2484757900238037, "step": 8745 }, { "epoch": 1.42, "learning_rate": 4.475266443009098e-07, "logits/chosen": -0.1270556002855301, "logits/rejected": -0.1270556002855301, "logps/chosen": -42.439605712890625, "logps/rejected": -42.439605712890625, "loss": 1.64, "rewards/accuracies": 0.0, "rewards/chosen": 1.1925338506698608, "rewards/margins": 0.0, "rewards/rejected": 1.1925338506698608, "step": 8746 }, { "epoch": 1.42, "learning_rate": 4.4739594647080834e-07, "logits/chosen": -0.8564923405647278, "logits/rejected": -0.8564923405647278, "logps/chosen": -48.93217849731445, "logps/rejected": -48.93217849731445, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 2.4235081672668457, "rewards/margins": 0.0, "rewards/rejected": 2.4235081672668457, "step": 8747 }, { "epoch": 1.42, "learning_rate": 4.472652522751492e-07, "logits/chosen": -0.6878069639205933, "logits/rejected": -0.7048913836479187, "logps/chosen": -63.878631591796875, "logps/rejected": -70.1323013305664, "loss": 0.6285, "rewards/accuracies": 0.0, "rewards/chosen": 0.8066375851631165, "rewards/margins": -0.8517387509346008, "rewards/rejected": 1.6583763360977173, "step": 8748 }, { "epoch": 1.42, "learning_rate": 4.471345617229622e-07, "logits/chosen": -0.6969367861747742, "logits/rejected": -0.7210763096809387, "logps/chosen": -48.104530334472656, "logps/rejected": -46.81563949584961, "loss": 0.684, "rewards/accuracies": 0.0, "rewards/chosen": 1.2621891498565674, "rewards/margins": -0.8489577770233154, "rewards/rejected": 2.111146926879883, "step": 8749 }, { "epoch": 1.42, "learning_rate": 4.4700387482327673e-07, "logits/chosen": -0.812166690826416, "logits/rejected": -0.7561947107315063, "logps/chosen": -158.61056518554688, "logps/rejected": -118.5115966796875, "loss": 0.6543, "rewards/accuracies": 0.0, "rewards/chosen": 1.613104224205017, "rewards/margins": -0.9629243612289429, "rewards/rejected": 2.57602858543396, "step": 8750 }, { "epoch": 1.42, "learning_rate": 4.468731915851221e-07, "logits/chosen": -0.3213343322277069, "logits/rejected": -0.3213343322277069, "logps/chosen": -35.80305480957031, "logps/rejected": -35.80305480957031, "loss": 0.7151, "rewards/accuracies": 0.0, "rewards/chosen": 1.2582329511642456, "rewards/margins": 0.0, "rewards/rejected": 1.2582329511642456, "step": 8751 }, { "epoch": 1.42, "learning_rate": 4.467425120175272e-07, "logits/chosen": -0.6239932775497437, "logits/rejected": -0.6071063876152039, "logps/chosen": -81.89857482910156, "logps/rejected": -75.82597351074219, "loss": 0.8565, "rewards/accuracies": 0.0, "rewards/chosen": 1.0735015869140625, "rewards/margins": -1.4665610790252686, "rewards/rejected": 2.540062665939331, "step": 8752 }, { "epoch": 1.42, "learning_rate": 4.466118361295208e-07, "logits/chosen": -0.7963589429855347, "logits/rejected": -0.719456136226654, "logps/chosen": -39.324188232421875, "logps/rejected": -7.2169389724731445, "loss": 0.081, "rewards/accuracies": 1.0, "rewards/chosen": 2.630697011947632, "rewards/margins": 1.7745051383972168, "rewards/rejected": 0.8561919331550598, "step": 8753 }, { "epoch": 1.42, "learning_rate": 4.464811639301314e-07, "logits/chosen": -0.672512948513031, "logits/rejected": -0.672512948513031, "logps/chosen": -77.388671875, "logps/rejected": -77.388671875, "loss": 0.6411, "rewards/accuracies": 0.0, "rewards/chosen": 0.8424301147460938, "rewards/margins": 0.0, "rewards/rejected": 0.8424301147460938, "step": 8754 }, { "epoch": 1.42, "learning_rate": 4.463504954283871e-07, "logits/chosen": -0.7618984580039978, "logits/rejected": -0.783179521560669, "logps/chosen": -50.385772705078125, "logps/rejected": -48.486328125, "loss": 0.4233, "rewards/accuracies": 0.0, "rewards/chosen": 1.5085762739181519, "rewards/margins": -0.28459930419921875, "rewards/rejected": 1.7931755781173706, "step": 8755 }, { "epoch": 1.42, "learning_rate": 4.46219830633316e-07, "logits/chosen": -0.544775664806366, "logits/rejected": -0.5666306614875793, "logps/chosen": -92.5645751953125, "logps/rejected": -110.36711120605469, "loss": 0.2975, "rewards/accuracies": 1.0, "rewards/chosen": 0.9928222894668579, "rewards/margins": 0.23999178409576416, "rewards/rejected": 0.7528305053710938, "step": 8756 }, { "epoch": 1.42, "learning_rate": 4.4608916955394567e-07, "logits/chosen": -1.1296414136886597, "logits/rejected": -1.1126729249954224, "logps/chosen": -73.78665161132812, "logps/rejected": -92.1093978881836, "loss": 0.7107, "rewards/accuracies": 0.0, "rewards/chosen": 2.3134446144104004, "rewards/margins": -0.963599443435669, "rewards/rejected": 3.2770440578460693, "step": 8757 }, { "epoch": 1.42, "learning_rate": 4.4595851219930363e-07, "logits/chosen": -0.4348123073577881, "logits/rejected": -0.4348123073577881, "logps/chosen": -80.06722259521484, "logps/rejected": -80.06722259521484, "loss": 0.3991, "rewards/accuracies": 0.0, "rewards/chosen": 1.1231110095977783, "rewards/margins": 0.0, "rewards/rejected": 1.1231110095977783, "step": 8758 }, { "epoch": 1.42, "learning_rate": 4.4582785857841707e-07, "logits/chosen": -0.5718514919281006, "logits/rejected": -0.5390093922615051, "logps/chosen": -26.68621063232422, "logps/rejected": -52.14545440673828, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 1.6972858905792236, "rewards/margins": 0.14303016662597656, "rewards/rejected": 1.554255723953247, "step": 8759 }, { "epoch": 1.42, "learning_rate": 4.4569720870031293e-07, "logits/chosen": -0.8393805623054504, "logits/rejected": -0.8506525158882141, "logps/chosen": -130.39126586914062, "logps/rejected": -109.51251983642578, "loss": 1.0212, "rewards/accuracies": 0.0, "rewards/chosen": 2.077597141265869, "rewards/margins": -1.437441110610962, "rewards/rejected": 3.515038251876831, "step": 8760 }, { "epoch": 1.42, "learning_rate": 4.455665625740178e-07, "logits/chosen": -0.743471622467041, "logits/rejected": -0.6264467239379883, "logps/chosen": -106.15321350097656, "logps/rejected": -78.5865478515625, "loss": 0.5542, "rewards/accuracies": 1.0, "rewards/chosen": 5.1187744140625, "rewards/margins": 2.2225341796875, "rewards/rejected": 2.896240234375, "step": 8761 }, { "epoch": 1.42, "learning_rate": 4.454359202085581e-07, "logits/chosen": -0.7009618282318115, "logits/rejected": -0.7951165437698364, "logps/chosen": -85.77261352539062, "logps/rejected": -139.70822143554688, "loss": 2.0567, "rewards/accuracies": 0.0, "rewards/chosen": 2.531079053878784, "rewards/margins": -2.976483106613159, "rewards/rejected": 5.507562160491943, "step": 8762 }, { "epoch": 1.42, "learning_rate": 4.453052816129601e-07, "logits/chosen": -0.7880409359931946, "logits/rejected": -0.6328808069229126, "logps/chosen": -131.29391479492188, "logps/rejected": -55.77257537841797, "loss": 0.091, "rewards/accuracies": 1.0, "rewards/chosen": 4.553454875946045, "rewards/margins": 2.1197938919067383, "rewards/rejected": 2.4336609840393066, "step": 8763 }, { "epoch": 1.42, "learning_rate": 4.451746467962496e-07, "logits/chosen": -0.6561727523803711, "logits/rejected": -0.7113334536552429, "logps/chosen": -113.15360260009766, "logps/rejected": -117.0577392578125, "loss": 1.6504, "rewards/accuracies": 0.0, "rewards/chosen": 3.3745276927948, "rewards/margins": -2.5993034839630127, "rewards/rejected": 5.9738311767578125, "step": 8764 }, { "epoch": 1.42, "learning_rate": 4.4504401576745227e-07, "logits/chosen": -0.3777986764907837, "logits/rejected": -0.37207746505737305, "logps/chosen": -7.975395202636719, "logps/rejected": -5.417099475860596, "loss": 1.0071, "rewards/accuracies": 0.0, "rewards/chosen": 0.5761581659317017, "rewards/margins": -0.23285293579101562, "rewards/rejected": 0.8090111017227173, "step": 8765 }, { "epoch": 1.42, "learning_rate": 4.449133885355933e-07, "logits/chosen": -0.6927621960639954, "logits/rejected": -0.7057441473007202, "logps/chosen": -53.05784225463867, "logps/rejected": -91.76337432861328, "loss": 0.3671, "rewards/accuracies": 1.0, "rewards/chosen": 1.2477550506591797, "rewards/margins": 0.08695864677429199, "rewards/rejected": 1.1607964038848877, "step": 8766 }, { "epoch": 1.42, "learning_rate": 4.447827651096981e-07, "logits/chosen": -0.6566106081008911, "logits/rejected": -0.613972008228302, "logps/chosen": -63.74042510986328, "logps/rejected": -30.15485382080078, "loss": 0.3623, "rewards/accuracies": 0.0, "rewards/chosen": 2.253594160079956, "rewards/margins": -0.04206371307373047, "rewards/rejected": 2.2956578731536865, "step": 8767 }, { "epoch": 1.42, "learning_rate": 4.446521454987913e-07, "logits/chosen": 0.03447349742054939, "logits/rejected": 0.03398240730166435, "logps/chosen": -7.604648590087891, "logps/rejected": -5.969285011291504, "loss": 0.7091, "rewards/accuracies": 0.0, "rewards/chosen": 0.13750039041042328, "rewards/margins": -0.308441162109375, "rewards/rejected": 0.4459415376186371, "step": 8768 }, { "epoch": 1.42, "learning_rate": 4.445215297118975e-07, "logits/chosen": -0.23000679910182953, "logits/rejected": -0.2370636761188507, "logps/chosen": -4.245811939239502, "logps/rejected": -0.8990526795387268, "loss": 0.4538, "rewards/accuracies": 0.0, "rewards/chosen": 0.14636249840259552, "rewards/margins": -0.0515778511762619, "rewards/rejected": 0.19794034957885742, "step": 8769 }, { "epoch": 1.42, "learning_rate": 4.4439091775804114e-07, "logits/chosen": -0.8000648617744446, "logits/rejected": -0.677225649356842, "logps/chosen": -134.57638549804688, "logps/rejected": -62.51681137084961, "loss": 0.9543, "rewards/accuracies": 1.0, "rewards/chosen": 3.3007187843322754, "rewards/margins": 0.4783933162689209, "rewards/rejected": 2.8223254680633545, "step": 8770 }, { "epoch": 1.42, "learning_rate": 4.442603096462462e-07, "logits/chosen": -0.9801309704780579, "logits/rejected": -0.86323082447052, "logps/chosen": -183.61483764648438, "logps/rejected": -22.906238555908203, "loss": 0.0864, "rewards/accuracies": 1.0, "rewards/chosen": 3.8873138427734375, "rewards/margins": 3.4878547191619873, "rewards/rejected": 0.3994590938091278, "step": 8771 }, { "epoch": 1.42, "learning_rate": 4.4412970538553645e-07, "logits/chosen": -0.3729616105556488, "logits/rejected": -0.3877342939376831, "logps/chosen": -2.888252019882202, "logps/rejected": -3.309051990509033, "loss": 0.3516, "rewards/accuracies": 0.0, "rewards/chosen": 0.16853554546833038, "rewards/margins": -0.015019148588180542, "rewards/rejected": 0.18355469405651093, "step": 8772 }, { "epoch": 1.42, "learning_rate": 4.4399910498493535e-07, "logits/chosen": -0.6231879591941833, "logits/rejected": -0.6125874519348145, "logps/chosen": -91.85844421386719, "logps/rejected": -65.04022216796875, "loss": 1.6789, "rewards/accuracies": 0.0, "rewards/chosen": 1.5119781494140625, "rewards/margins": -0.3377937078475952, "rewards/rejected": 1.8497718572616577, "step": 8773 }, { "epoch": 1.42, "learning_rate": 4.4386850845346624e-07, "logits/chosen": -0.759391188621521, "logits/rejected": -0.7530149817466736, "logps/chosen": -88.44297790527344, "logps/rejected": -87.01945495605469, "loss": 1.2197, "rewards/accuracies": 0.0, "rewards/chosen": 0.4979759156703949, "rewards/margins": -1.975324273109436, "rewards/rejected": 2.4733002185821533, "step": 8774 }, { "epoch": 1.42, "learning_rate": 4.437379158001521e-07, "logits/chosen": -1.0797280073165894, "logits/rejected": -1.07181715965271, "logps/chosen": -76.49870300292969, "logps/rejected": -80.48800659179688, "loss": 1.2219, "rewards/accuracies": 0.0, "rewards/chosen": 1.2642700672149658, "rewards/margins": -1.3906638622283936, "rewards/rejected": 2.6549339294433594, "step": 8775 }, { "epoch": 1.42, "learning_rate": 4.436073270340156e-07, "logits/chosen": -0.6619633436203003, "logits/rejected": -0.5211767554283142, "logps/chosen": -156.09373474121094, "logps/rejected": -82.41404724121094, "loss": 0.5607, "rewards/accuracies": 0.0, "rewards/chosen": 0.994305431842804, "rewards/margins": -0.652148425579071, "rewards/rejected": 1.646453857421875, "step": 8776 }, { "epoch": 1.42, "learning_rate": 4.4347674216407916e-07, "logits/chosen": -0.702368438243866, "logits/rejected": -0.5897911787033081, "logps/chosen": -116.72584533691406, "logps/rejected": -52.09779357910156, "loss": 0.3756, "rewards/accuracies": 0.0, "rewards/chosen": -0.15479432046413422, "rewards/margins": -0.10816346108913422, "rewards/rejected": -0.046630859375, "step": 8777 }, { "epoch": 1.42, "learning_rate": 4.433461611993651e-07, "logits/chosen": -0.3449020981788635, "logits/rejected": -0.3432214558124542, "logps/chosen": -6.652242660522461, "logps/rejected": -1.7761348485946655, "loss": 0.4743, "rewards/accuracies": 0.0, "rewards/chosen": 0.16385392844676971, "rewards/margins": -0.3889075517654419, "rewards/rejected": 0.5527614951133728, "step": 8778 }, { "epoch": 1.42, "learning_rate": 4.4321558414889515e-07, "logits/chosen": -0.3374203145503998, "logits/rejected": -0.3374203145503998, "logps/chosen": -83.97908782958984, "logps/rejected": -83.97908782958984, "loss": 0.3545, "rewards/accuracies": 0.0, "rewards/chosen": 1.0522164106369019, "rewards/margins": 0.0, "rewards/rejected": 1.0522164106369019, "step": 8779 }, { "epoch": 1.43, "learning_rate": 4.430850110216911e-07, "logits/chosen": -0.7528897523880005, "logits/rejected": -0.7599740624427795, "logps/chosen": -64.82122802734375, "logps/rejected": -106.05978393554688, "loss": 1.2237, "rewards/accuracies": 1.0, "rewards/chosen": 2.2672698497772217, "rewards/margins": 0.28885793685913086, "rewards/rejected": 1.9784119129180908, "step": 8780 }, { "epoch": 1.43, "learning_rate": 4.429544418267742e-07, "logits/chosen": -0.49248456954956055, "logits/rejected": -0.4532725512981415, "logps/chosen": -54.760616302490234, "logps/rejected": -105.76834869384766, "loss": 1.0803, "rewards/accuracies": 0.0, "rewards/chosen": 2.257596969604492, "rewards/margins": -1.2060954570770264, "rewards/rejected": 3.4636924266815186, "step": 8781 }, { "epoch": 1.43, "learning_rate": 4.4282387657316567e-07, "logits/chosen": -0.7097375988960266, "logits/rejected": -0.5470359325408936, "logps/chosen": -102.22163391113281, "logps/rejected": -42.20550537109375, "loss": 0.0916, "rewards/accuracies": 1.0, "rewards/chosen": 5.678384304046631, "rewards/margins": 4.356012344360352, "rewards/rejected": 1.3223720788955688, "step": 8782 }, { "epoch": 1.43, "learning_rate": 4.426933152698862e-07, "logits/chosen": -1.000941276550293, "logits/rejected": -0.9648924469947815, "logps/chosen": -110.79975128173828, "logps/rejected": -97.22510528564453, "loss": 0.5095, "rewards/accuracies": 0.0, "rewards/chosen": 1.921242594718933, "rewards/margins": -0.5587555170059204, "rewards/rejected": 2.4799981117248535, "step": 8783 }, { "epoch": 1.43, "learning_rate": 4.425627579259564e-07, "logits/chosen": -0.7646183371543884, "logits/rejected": -0.6526731252670288, "logps/chosen": -117.77296447753906, "logps/rejected": -50.45133972167969, "loss": 0.5332, "rewards/accuracies": 1.0, "rewards/chosen": 4.328880310058594, "rewards/margins": 2.4078032970428467, "rewards/rejected": 1.921077013015747, "step": 8784 }, { "epoch": 1.43, "learning_rate": 4.424322045503966e-07, "logits/chosen": -0.4661218523979187, "logits/rejected": -0.47413158416748047, "logps/chosen": -4.077055931091309, "logps/rejected": -2.021833658218384, "loss": 0.4717, "rewards/accuracies": 0.0, "rewards/chosen": 0.15967045724391937, "rewards/margins": -0.1994449943304062, "rewards/rejected": 0.35911545157432556, "step": 8785 }, { "epoch": 1.43, "learning_rate": 4.423016551522267e-07, "logits/chosen": -0.9543681144714355, "logits/rejected": -0.6713773608207703, "logps/chosen": -120.95767974853516, "logps/rejected": -18.98951530456543, "loss": 0.4153, "rewards/accuracies": 1.0, "rewards/chosen": 4.661818981170654, "rewards/margins": 4.2889533042907715, "rewards/rejected": 0.3728656768798828, "step": 8786 }, { "epoch": 1.43, "learning_rate": 4.421711097404665e-07, "logits/chosen": -0.26196783781051636, "logits/rejected": -0.26196783781051636, "logps/chosen": -75.84835815429688, "logps/rejected": -75.84835815429688, "loss": 0.3483, "rewards/accuracies": 0.0, "rewards/chosen": 0.2691917419433594, "rewards/margins": 0.0, "rewards/rejected": 0.2691917419433594, "step": 8787 }, { "epoch": 1.43, "learning_rate": 4.420405683241355e-07, "logits/chosen": -1.0456736087799072, "logits/rejected": -1.051020622253418, "logps/chosen": -142.748291015625, "logps/rejected": -91.80299377441406, "loss": 1.1229, "rewards/accuracies": 0.0, "rewards/chosen": 1.2887848615646362, "rewards/margins": -0.6251860857009888, "rewards/rejected": 1.913970947265625, "step": 8788 }, { "epoch": 1.43, "learning_rate": 4.4191003091225276e-07, "logits/chosen": -0.7447340488433838, "logits/rejected": -0.7392837405204773, "logps/chosen": -56.47649383544922, "logps/rejected": -120.01844787597656, "loss": 0.7657, "rewards/accuracies": 1.0, "rewards/chosen": 1.2484734058380127, "rewards/margins": 0.7547386884689331, "rewards/rejected": 0.493734747171402, "step": 8789 }, { "epoch": 1.43, "learning_rate": 4.417794975138372e-07, "logits/chosen": -0.7510175108909607, "logits/rejected": -0.7510175108909607, "logps/chosen": -76.96090698242188, "logps/rejected": -76.96090698242188, "loss": 0.3472, "rewards/accuracies": 0.0, "rewards/chosen": 2.5390853881835938, "rewards/margins": 0.0, "rewards/rejected": 2.5390853881835938, "step": 8790 }, { "epoch": 1.43, "learning_rate": 4.4164896813790755e-07, "logits/chosen": -0.8486239910125732, "logits/rejected": -0.7218790650367737, "logps/chosen": -199.82568359375, "logps/rejected": -44.063575744628906, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": 6.379748821258545, "rewards/margins": 4.175167083740234, "rewards/rejected": 2.2045814990997314, "step": 8791 }, { "epoch": 1.43, "learning_rate": 4.41518442793482e-07, "logits/chosen": -0.9484376311302185, "logits/rejected": -0.9795629382133484, "logps/chosen": -170.02467346191406, "logps/rejected": -81.24913787841797, "loss": 1.3169, "rewards/accuracies": 1.0, "rewards/chosen": 4.6581711769104, "rewards/margins": 2.129422903060913, "rewards/rejected": 2.5287482738494873, "step": 8792 }, { "epoch": 1.43, "learning_rate": 4.4138792148957873e-07, "logits/chosen": -0.4352481961250305, "logits/rejected": -0.43984147906303406, "logps/chosen": -3.794555425643921, "logps/rejected": -0.6849479079246521, "loss": 1.769, "rewards/accuracies": 0.0, "rewards/chosen": 0.17041023075580597, "rewards/margins": -0.08954991400241852, "rewards/rejected": 0.2599601447582245, "step": 8793 }, { "epoch": 1.43, "learning_rate": 4.412574042352155e-07, "logits/chosen": -0.6850215792655945, "logits/rejected": -0.7327939867973328, "logps/chosen": -100.4732666015625, "logps/rejected": -170.28515625, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 1.7629280090332031, "rewards/margins": 0.041761040687561035, "rewards/rejected": 1.721166968345642, "step": 8794 }, { "epoch": 1.43, "learning_rate": 4.4112689103940983e-07, "logits/chosen": -0.6453040242195129, "logits/rejected": -0.6525890827178955, "logps/chosen": -9.767657279968262, "logps/rejected": -24.758893966674805, "loss": 0.2497, "rewards/accuracies": 1.0, "rewards/chosen": 0.8496186137199402, "rewards/margins": 0.5281790494918823, "rewards/rejected": 0.32143956422805786, "step": 8795 }, { "epoch": 1.43, "learning_rate": 4.4099638191117885e-07, "logits/chosen": -0.8845332860946655, "logits/rejected": -0.829521656036377, "logps/chosen": -143.6405792236328, "logps/rejected": -110.3426513671875, "loss": 0.5096, "rewards/accuracies": 0.0, "rewards/chosen": 1.7501556873321533, "rewards/margins": -0.5642974376678467, "rewards/rejected": 2.314453125, "step": 8796 }, { "epoch": 1.43, "learning_rate": 4.408658768595396e-07, "logits/chosen": -0.901160478591919, "logits/rejected": -0.8100296854972839, "logps/chosen": -107.04560852050781, "logps/rejected": -68.29607391357422, "loss": 1.6698, "rewards/accuracies": 0.0, "rewards/chosen": 0.5975639224052429, "rewards/margins": -0.3453209400177002, "rewards/rejected": 0.9428848624229431, "step": 8797 }, { "epoch": 1.43, "learning_rate": 4.4073537589350876e-07, "logits/chosen": -0.5224196910858154, "logits/rejected": -0.5846126079559326, "logps/chosen": -66.46580505371094, "logps/rejected": -95.6003189086914, "loss": 1.8555, "rewards/accuracies": 0.0, "rewards/chosen": 2.0095489025115967, "rewards/margins": -2.6548120975494385, "rewards/rejected": 4.664361000061035, "step": 8798 }, { "epoch": 1.43, "learning_rate": 4.4060487902210267e-07, "logits/chosen": -0.615462601184845, "logits/rejected": -0.498574435710907, "logps/chosen": -50.939414978027344, "logps/rejected": -36.54344177246094, "loss": 1.7499, "rewards/accuracies": 1.0, "rewards/chosen": 2.283834218978882, "rewards/margins": 0.5543843507766724, "rewards/rejected": 1.7294498682022095, "step": 8799 }, { "epoch": 1.43, "learning_rate": 4.4047438625433747e-07, "logits/chosen": -0.6374718546867371, "logits/rejected": -0.5477222800254822, "logps/chosen": -71.28939819335938, "logps/rejected": -66.87197875976562, "loss": 1.6801, "rewards/accuracies": 1.0, "rewards/chosen": 2.318255662918091, "rewards/margins": 0.2656867504119873, "rewards/rejected": 2.0525689125061035, "step": 8800 }, { "epoch": 1.43, "learning_rate": 4.4034389759922884e-07, "logits/chosen": -0.45856764912605286, "logits/rejected": -0.4256822168827057, "logps/chosen": -50.416595458984375, "logps/rejected": -19.25733757019043, "loss": 0.9175, "rewards/accuracies": 1.0, "rewards/chosen": 2.064253330230713, "rewards/margins": 1.5267465114593506, "rewards/rejected": 0.5375068783760071, "step": 8801 }, { "epoch": 1.43, "learning_rate": 4.402134130657925e-07, "logits/chosen": -0.6580349802970886, "logits/rejected": -0.6580349802970886, "logps/chosen": -106.05030059814453, "logps/rejected": -106.05030059814453, "loss": 0.4716, "rewards/accuracies": 0.0, "rewards/chosen": 1.3564690351486206, "rewards/margins": 0.0, "rewards/rejected": 1.3564690351486206, "step": 8802 }, { "epoch": 1.43, "learning_rate": 4.400829326630436e-07, "logits/chosen": -0.3191312253475189, "logits/rejected": -0.3191312253475189, "logps/chosen": -110.29196166992188, "logps/rejected": -110.29196166992188, "loss": 0.7857, "rewards/accuracies": 0.0, "rewards/chosen": 2.4506165981292725, "rewards/margins": 0.0, "rewards/rejected": 2.4506165981292725, "step": 8803 }, { "epoch": 1.43, "learning_rate": 4.399524563999971e-07, "logits/chosen": -0.8038402199745178, "logits/rejected": -0.7139049768447876, "logps/chosen": -85.10028076171875, "logps/rejected": -45.86146545410156, "loss": 1.1624, "rewards/accuracies": 1.0, "rewards/chosen": 2.07598876953125, "rewards/margins": 0.8478343486785889, "rewards/rejected": 1.2281544208526611, "step": 8804 }, { "epoch": 1.43, "learning_rate": 4.398219842856677e-07, "logits/chosen": -0.6309691071510315, "logits/rejected": -0.5889240503311157, "logps/chosen": -183.66720581054688, "logps/rejected": -77.02008056640625, "loss": 1.2531, "rewards/accuracies": 1.0, "rewards/chosen": 3.7692201137542725, "rewards/margins": 1.6836700439453125, "rewards/rejected": 2.08555006980896, "step": 8805 }, { "epoch": 1.43, "learning_rate": 4.3969151632906974e-07, "logits/chosen": -0.734477162361145, "logits/rejected": -0.701367199420929, "logps/chosen": -69.36778259277344, "logps/rejected": -115.13748168945312, "loss": 0.5927, "rewards/accuracies": 1.0, "rewards/chosen": 1.4330612421035767, "rewards/margins": 0.36889803409576416, "rewards/rejected": 1.0641632080078125, "step": 8806 }, { "epoch": 1.43, "learning_rate": 4.395610525392174e-07, "logits/chosen": -1.0413800477981567, "logits/rejected": -0.95346599817276, "logps/chosen": -114.7862548828125, "logps/rejected": -131.2264862060547, "loss": 1.1791, "rewards/accuracies": 0.0, "rewards/chosen": 3.7852447032928467, "rewards/margins": -2.218947172164917, "rewards/rejected": 6.004191875457764, "step": 8807 }, { "epoch": 1.43, "learning_rate": 4.3943059292512445e-07, "logits/chosen": -0.7825959324836731, "logits/rejected": -0.7316206097602844, "logps/chosen": -121.72189331054688, "logps/rejected": -97.26695251464844, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 1.6528351306915283, "rewards/margins": 0.34432756900787354, "rewards/rejected": 1.3085075616836548, "step": 8808 }, { "epoch": 1.43, "learning_rate": 4.393001374958044e-07, "logits/chosen": -0.7491993308067322, "logits/rejected": -0.7627482414245605, "logps/chosen": -73.80229949951172, "logps/rejected": -66.04488372802734, "loss": 0.7884, "rewards/accuracies": 0.0, "rewards/chosen": 2.1752068996429443, "rewards/margins": -0.15087342262268066, "rewards/rejected": 2.326080322265625, "step": 8809 }, { "epoch": 1.43, "learning_rate": 4.3916968626027057e-07, "logits/chosen": -0.5959911346435547, "logits/rejected": -0.4379192590713501, "logps/chosen": -175.37002563476562, "logps/rejected": -87.40434265136719, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": 4.246068000793457, "rewards/margins": 0.9274232387542725, "rewards/rejected": 3.3186447620391846, "step": 8810 }, { "epoch": 1.43, "learning_rate": 4.390392392275358e-07, "logits/chosen": -1.0143996477127075, "logits/rejected": -0.8031572103500366, "logps/chosen": -81.1398696899414, "logps/rejected": -73.83226013183594, "loss": 0.2756, "rewards/accuracies": 1.0, "rewards/chosen": 1.5566673278808594, "rewards/margins": 0.46316754817962646, "rewards/rejected": 1.093499779701233, "step": 8811 }, { "epoch": 1.43, "learning_rate": 4.389087964066127e-07, "logits/chosen": -0.5603762269020081, "logits/rejected": -0.6029276847839355, "logps/chosen": -67.52852630615234, "logps/rejected": -124.63459014892578, "loss": 0.74, "rewards/accuracies": 1.0, "rewards/chosen": 0.5217033624649048, "rewards/margins": 0.21378633379936218, "rewards/rejected": 0.3079170286655426, "step": 8812 }, { "epoch": 1.43, "learning_rate": 4.3877835780651384e-07, "logits/chosen": -0.7597126364707947, "logits/rejected": -0.7513231635093689, "logps/chosen": -68.12919616699219, "logps/rejected": -68.97125244140625, "loss": 1.5391, "rewards/accuracies": 0.0, "rewards/chosen": 0.37401124835014343, "rewards/margins": -1.2463150024414062, "rewards/rejected": 1.620326280593872, "step": 8813 }, { "epoch": 1.43, "learning_rate": 4.3864792343625115e-07, "logits/chosen": -0.9699711203575134, "logits/rejected": -0.9482048153877258, "logps/chosen": -129.7225799560547, "logps/rejected": -111.15185546875, "loss": 0.9827, "rewards/accuracies": 1.0, "rewards/chosen": 2.1142029762268066, "rewards/margins": 0.40373694896698, "rewards/rejected": 1.7104660272598267, "step": 8814 }, { "epoch": 1.43, "learning_rate": 4.3851749330483635e-07, "logits/chosen": -0.8854652047157288, "logits/rejected": -0.8511074185371399, "logps/chosen": -124.27645874023438, "logps/rejected": -197.98240661621094, "loss": 0.489, "rewards/accuracies": 0.0, "rewards/chosen": 5.511294841766357, "rewards/margins": -0.5051131248474121, "rewards/rejected": 6.0164079666137695, "step": 8815 }, { "epoch": 1.43, "learning_rate": 4.3838706742128105e-07, "logits/chosen": -0.6015410423278809, "logits/rejected": -0.552668035030365, "logps/chosen": -74.87483215332031, "logps/rejected": -135.4105224609375, "loss": 0.589, "rewards/accuracies": 0.0, "rewards/chosen": 1.1422127485275269, "rewards/margins": -0.015431880950927734, "rewards/rejected": 1.1576446294784546, "step": 8816 }, { "epoch": 1.43, "learning_rate": 4.3825664579459643e-07, "logits/chosen": -1.1791143417358398, "logits/rejected": -1.167129397392273, "logps/chosen": -86.14523315429688, "logps/rejected": -118.59212493896484, "loss": 1.5261, "rewards/accuracies": 0.0, "rewards/chosen": 1.3414353132247925, "rewards/margins": -1.5465811491012573, "rewards/rejected": 2.88801646232605, "step": 8817 }, { "epoch": 1.43, "learning_rate": 4.381262284337933e-07, "logits/chosen": -0.6493269801139832, "logits/rejected": -0.6051800847053528, "logps/chosen": -33.991329193115234, "logps/rejected": -18.14727210998535, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.3398609161376953, "rewards/margins": 0.061887919902801514, "rewards/rejected": 0.2779729962348938, "step": 8818 }, { "epoch": 1.43, "learning_rate": 4.379958153478824e-07, "logits/chosen": -0.6290152668952942, "logits/rejected": -0.6587421298027039, "logps/chosen": -104.5624771118164, "logps/rejected": -63.443721771240234, "loss": 0.7687, "rewards/accuracies": 0.0, "rewards/chosen": 0.770185112953186, "rewards/margins": -0.28252220153808594, "rewards/rejected": 1.052707314491272, "step": 8819 }, { "epoch": 1.43, "learning_rate": 4.3786540654587393e-07, "logits/chosen": -0.8848446011543274, "logits/rejected": -0.7569498419761658, "logps/chosen": -70.0283203125, "logps/rejected": -14.834471702575684, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 1.5140762329101562, "rewards/margins": 0.7686894536018372, "rewards/rejected": 0.7453867793083191, "step": 8820 }, { "epoch": 1.43, "learning_rate": 4.37735002036778e-07, "logits/chosen": -1.1410208940505981, "logits/rejected": -1.0841457843780518, "logps/chosen": -101.06532287597656, "logps/rejected": -23.75800323486328, "loss": 0.7468, "rewards/accuracies": 1.0, "rewards/chosen": 1.2400192022323608, "rewards/margins": 1.3710566759109497, "rewards/rejected": -0.13103751838207245, "step": 8821 }, { "epoch": 1.43, "learning_rate": 4.3760460182960425e-07, "logits/chosen": -0.8192731738090515, "logits/rejected": -0.77367103099823, "logps/chosen": -68.0545654296875, "logps/rejected": -67.13262939453125, "loss": 0.3886, "rewards/accuracies": 0.0, "rewards/chosen": 1.1349433660507202, "rewards/margins": -0.009514689445495605, "rewards/rejected": 1.1444580554962158, "step": 8822 }, { "epoch": 1.43, "learning_rate": 4.3747420593336204e-07, "logits/chosen": -0.5458645820617676, "logits/rejected": -0.5631452798843384, "logps/chosen": -66.93441772460938, "logps/rejected": -69.37136840820312, "loss": 1.039, "rewards/accuracies": 0.0, "rewards/chosen": 0.9124191403388977, "rewards/margins": -0.2474861741065979, "rewards/rejected": 1.1599053144454956, "step": 8823 }, { "epoch": 1.43, "learning_rate": 4.373438143570607e-07, "logits/chosen": -1.1833029985427856, "logits/rejected": -1.2155709266662598, "logps/chosen": -80.51271057128906, "logps/rejected": -22.101438522338867, "loss": 0.5962, "rewards/accuracies": 1.0, "rewards/chosen": 5.790965557098389, "rewards/margins": 5.160656452178955, "rewards/rejected": 0.6303091049194336, "step": 8824 }, { "epoch": 1.43, "learning_rate": 4.3721342710970884e-07, "logits/chosen": -0.5810636878013611, "logits/rejected": -0.5520188808441162, "logps/chosen": -48.04388427734375, "logps/rejected": -125.47418975830078, "loss": 0.9591, "rewards/accuracies": 1.0, "rewards/chosen": 1.6319206953048706, "rewards/margins": 1.1292359828948975, "rewards/rejected": 0.5026847720146179, "step": 8825 }, { "epoch": 1.43, "learning_rate": 4.370830442003151e-07, "logits/chosen": -0.45433467626571655, "logits/rejected": -0.45433467626571655, "logps/chosen": -41.92267990112305, "logps/rejected": -41.92267990112305, "loss": 0.3847, "rewards/accuracies": 0.0, "rewards/chosen": 2.1008057594299316, "rewards/margins": 0.0, "rewards/rejected": 2.1008057594299316, "step": 8826 }, { "epoch": 1.43, "learning_rate": 4.369526656378878e-07, "logits/chosen": -0.8266274333000183, "logits/rejected": -0.8375607132911682, "logps/chosen": -70.18571472167969, "logps/rejected": -91.54022216796875, "loss": 3.4542, "rewards/accuracies": 0.0, "rewards/chosen": 2.4025466442108154, "rewards/margins": -2.3304717540740967, "rewards/rejected": 4.733018398284912, "step": 8827 }, { "epoch": 1.43, "learning_rate": 4.368222914314346e-07, "logits/chosen": -1.1979022026062012, "logits/rejected": -1.1370328664779663, "logps/chosen": -166.39312744140625, "logps/rejected": -165.32073974609375, "loss": 1.3661, "rewards/accuracies": 0.0, "rewards/chosen": 3.9894332885742188, "rewards/margins": -2.3160109519958496, "rewards/rejected": 6.305444240570068, "step": 8828 }, { "epoch": 1.43, "learning_rate": 4.366919215899633e-07, "logits/chosen": -0.5577927827835083, "logits/rejected": -0.48068925738334656, "logps/chosen": -30.318923950195312, "logps/rejected": -89.25297546386719, "loss": 0.9535, "rewards/accuracies": 0.0, "rewards/chosen": 2.190837860107422, "rewards/margins": -1.7027549743652344, "rewards/rejected": 3.8935928344726562, "step": 8829 }, { "epoch": 1.43, "learning_rate": 4.3656155612248125e-07, "logits/chosen": -0.9717175364494324, "logits/rejected": -0.8827618956565857, "logps/chosen": -96.45201110839844, "logps/rejected": -37.95466232299805, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": 1.7966049909591675, "rewards/margins": 1.6273480653762817, "rewards/rejected": 0.16925697028636932, "step": 8830 }, { "epoch": 1.43, "learning_rate": 4.364311950379953e-07, "logits/chosen": -1.1029480695724487, "logits/rejected": -1.0199792385101318, "logps/chosen": -259.8173828125, "logps/rejected": -93.42533111572266, "loss": 0.3556, "rewards/accuracies": 0.0, "rewards/chosen": 5.105481147766113, "rewards/margins": -0.020714282989501953, "rewards/rejected": 5.126195430755615, "step": 8831 }, { "epoch": 1.43, "learning_rate": 4.3630083834551234e-07, "logits/chosen": -0.4862571060657501, "logits/rejected": -0.4678000807762146, "logps/chosen": -36.0490837097168, "logps/rejected": -25.22903060913086, "loss": 0.5147, "rewards/accuracies": 0.0, "rewards/chosen": -0.1746620237827301, "rewards/margins": -0.4611322581768036, "rewards/rejected": 0.2864702343940735, "step": 8832 }, { "epoch": 1.43, "learning_rate": 4.3617048605403873e-07, "logits/chosen": -0.8425519466400146, "logits/rejected": -0.8271902203559875, "logps/chosen": -119.42903137207031, "logps/rejected": -212.63931274414062, "loss": 1.0915, "rewards/accuracies": 0.0, "rewards/chosen": 5.393214702606201, "rewards/margins": -2.0441389083862305, "rewards/rejected": 7.437353610992432, "step": 8833 }, { "epoch": 1.43, "learning_rate": 4.360401381725806e-07, "logits/chosen": -0.5699955821037292, "logits/rejected": -0.5246810913085938, "logps/chosen": -59.25876235961914, "logps/rejected": -76.03345489501953, "loss": 0.4925, "rewards/accuracies": 1.0, "rewards/chosen": 1.824276328086853, "rewards/margins": 0.9632160067558289, "rewards/rejected": 0.8610603213310242, "step": 8834 }, { "epoch": 1.43, "learning_rate": 4.3590979471014367e-07, "logits/chosen": -0.3679041862487793, "logits/rejected": -0.3895767331123352, "logps/chosen": -52.730987548828125, "logps/rejected": -75.85951232910156, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 2.5502870082855225, "rewards/margins": 0.6570267677307129, "rewards/rejected": 1.8932602405548096, "step": 8835 }, { "epoch": 1.43, "learning_rate": 4.357794556757335e-07, "logits/chosen": -0.7202765941619873, "logits/rejected": -0.6952803730964661, "logps/chosen": -87.30827331542969, "logps/rejected": -150.10452270507812, "loss": 0.2698, "rewards/accuracies": 1.0, "rewards/chosen": 1.5174369812011719, "rewards/margins": 0.4990501403808594, "rewards/rejected": 1.0183868408203125, "step": 8836 }, { "epoch": 1.43, "learning_rate": 4.3564912107835526e-07, "logits/chosen": -0.2818427085876465, "logits/rejected": -0.24319401383399963, "logps/chosen": -34.42078399658203, "logps/rejected": -21.29256248474121, "loss": 0.3516, "rewards/accuracies": 0.0, "rewards/chosen": 0.1607128232717514, "rewards/margins": -0.0026254653930664062, "rewards/rejected": 0.1633382886648178, "step": 8837 }, { "epoch": 1.43, "learning_rate": 4.3551879092701396e-07, "logits/chosen": -0.7916741967201233, "logits/rejected": -0.6850801110267639, "logps/chosen": -88.91546630859375, "logps/rejected": -71.45452880859375, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": 3.331829786300659, "rewards/margins": 0.5373077392578125, "rewards/rejected": 2.7945220470428467, "step": 8838 }, { "epoch": 1.43, "learning_rate": 4.3538846523071396e-07, "logits/chosen": -0.6773947477340698, "logits/rejected": -0.6896965503692627, "logps/chosen": -40.67784118652344, "logps/rejected": -76.62084197998047, "loss": 0.2833, "rewards/accuracies": 1.0, "rewards/chosen": 1.686883568763733, "rewards/margins": 0.41234052181243896, "rewards/rejected": 1.274543046951294, "step": 8839 }, { "epoch": 1.43, "learning_rate": 4.352581439984597e-07, "logits/chosen": -0.7187978625297546, "logits/rejected": -0.609518826007843, "logps/chosen": -111.8346939086914, "logps/rejected": -131.29800415039062, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": 4.263301849365234, "rewards/margins": 3.0147438049316406, "rewards/rejected": 1.2485580444335938, "step": 8840 }, { "epoch": 1.43, "learning_rate": 4.351278272392551e-07, "logits/chosen": -0.5819042325019836, "logits/rejected": -0.3651866018772125, "logps/chosen": -70.38765716552734, "logps/rejected": -15.583373069763184, "loss": 0.6014, "rewards/accuracies": 1.0, "rewards/chosen": 1.4029892683029175, "rewards/margins": 1.1706663370132446, "rewards/rejected": 0.23232297599315643, "step": 8841 }, { "epoch": 1.44, "learning_rate": 4.3499751496210385e-07, "logits/chosen": -0.692362368106842, "logits/rejected": -0.64805006980896, "logps/chosen": -68.21375274658203, "logps/rejected": -62.38302993774414, "loss": 0.7795, "rewards/accuracies": 0.0, "rewards/chosen": 0.8144325613975525, "rewards/margins": -0.9064128994941711, "rewards/rejected": 1.7208454608917236, "step": 8842 }, { "epoch": 1.44, "learning_rate": 4.348672071760092e-07, "logits/chosen": -0.7408919930458069, "logits/rejected": -0.7263538241386414, "logps/chosen": -80.34255981445312, "logps/rejected": -90.93153381347656, "loss": 0.2272, "rewards/accuracies": 1.0, "rewards/chosen": 2.9269020557403564, "rewards/margins": 0.5821502208709717, "rewards/rejected": 2.3447518348693848, "step": 8843 }, { "epoch": 1.44, "learning_rate": 4.347369038899743e-07, "logits/chosen": -0.6716372966766357, "logits/rejected": -0.5811535120010376, "logps/chosen": -61.71833801269531, "logps/rejected": -51.674434661865234, "loss": 1.2229, "rewards/accuracies": 0.0, "rewards/chosen": 1.373857855796814, "rewards/margins": -0.9252041578292847, "rewards/rejected": 2.2990620136260986, "step": 8844 }, { "epoch": 1.44, "learning_rate": 4.346066051130018e-07, "logits/chosen": -0.193306103348732, "logits/rejected": -0.22731882333755493, "logps/chosen": -86.81820678710938, "logps/rejected": -63.45934295654297, "loss": 1.9206, "rewards/accuracies": 1.0, "rewards/chosen": 1.2427963018417358, "rewards/margins": 0.3356117010116577, "rewards/rejected": 0.9071846008300781, "step": 8845 }, { "epoch": 1.44, "learning_rate": 4.344763108540941e-07, "logits/chosen": -0.6898059844970703, "logits/rejected": -0.6247496604919434, "logps/chosen": -119.65379333496094, "logps/rejected": -49.65267562866211, "loss": 0.7706, "rewards/accuracies": 0.0, "rewards/chosen": 1.2364243268966675, "rewards/margins": -1.2679904699325562, "rewards/rejected": 2.5044147968292236, "step": 8846 }, { "epoch": 1.44, "learning_rate": 4.343460211222533e-07, "logits/chosen": -0.9042125344276428, "logits/rejected": -0.8051422834396362, "logps/chosen": -76.09539794921875, "logps/rejected": -71.64485168457031, "loss": 2.4435, "rewards/accuracies": 1.0, "rewards/chosen": 2.7646234035491943, "rewards/margins": 0.28684163093566895, "rewards/rejected": 2.4777817726135254, "step": 8847 }, { "epoch": 1.44, "learning_rate": 4.342157359264813e-07, "logits/chosen": -0.7526320815086365, "logits/rejected": -0.6154475808143616, "logps/chosen": -79.39204406738281, "logps/rejected": -47.94077682495117, "loss": 2.9533, "rewards/accuracies": 1.0, "rewards/chosen": 2.3439393043518066, "rewards/margins": 0.8020988702774048, "rewards/rejected": 1.5418404340744019, "step": 8848 }, { "epoch": 1.44, "learning_rate": 4.3408545527577944e-07, "logits/chosen": -0.5558081865310669, "logits/rejected": -0.5864079594612122, "logps/chosen": -36.02206802368164, "logps/rejected": -66.97923278808594, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 0.7217205166816711, "rewards/margins": 0.46183282136917114, "rewards/rejected": 0.2598876953125, "step": 8849 }, { "epoch": 1.44, "learning_rate": 4.3395517917914894e-07, "logits/chosen": -0.9552685618400574, "logits/rejected": -0.9505832195281982, "logps/chosen": -19.29956817626953, "logps/rejected": -21.486021041870117, "loss": 0.4984, "rewards/accuracies": 1.0, "rewards/chosen": 1.5935157537460327, "rewards/margins": 1.0363166332244873, "rewards/rejected": 0.5571991205215454, "step": 8850 }, { "epoch": 1.44, "learning_rate": 4.3382490764559063e-07, "logits/chosen": -1.0706273317337036, "logits/rejected": -1.0734584331512451, "logps/chosen": -61.1990966796875, "logps/rejected": -102.56602478027344, "loss": 1.5145, "rewards/accuracies": 0.0, "rewards/chosen": 2.658734083175659, "rewards/margins": -2.972843885421753, "rewards/rejected": 5.631577968597412, "step": 8851 }, { "epoch": 1.44, "learning_rate": 4.3369464068410506e-07, "logits/chosen": -0.5162298083305359, "logits/rejected": -0.6461847424507141, "logps/chosen": -54.07884216308594, "logps/rejected": -92.40634155273438, "loss": 1.9823, "rewards/accuracies": 0.0, "rewards/chosen": 0.6092147827148438, "rewards/margins": -1.0918854475021362, "rewards/rejected": 1.70110023021698, "step": 8852 }, { "epoch": 1.44, "learning_rate": 4.335643783036924e-07, "logits/chosen": -0.3337530493736267, "logits/rejected": -0.3337530493736267, "logps/chosen": -54.697296142578125, "logps/rejected": -54.697296142578125, "loss": 0.3529, "rewards/accuracies": 0.0, "rewards/chosen": 0.5422435998916626, "rewards/margins": 0.0, "rewards/rejected": 0.5422435998916626, "step": 8853 }, { "epoch": 1.44, "learning_rate": 4.334341205133526e-07, "logits/chosen": -0.8934774398803711, "logits/rejected": -0.8567951917648315, "logps/chosen": -119.12747955322266, "logps/rejected": -160.8943328857422, "loss": 1.9096, "rewards/accuracies": 0.0, "rewards/chosen": 1.682665228843689, "rewards/margins": -3.695326805114746, "rewards/rejected": 5.377992153167725, "step": 8854 }, { "epoch": 1.44, "learning_rate": 4.333038673220852e-07, "logits/chosen": -0.8189016580581665, "logits/rejected": -0.8346829414367676, "logps/chosen": -69.78501892089844, "logps/rejected": -60.260318756103516, "loss": 0.3223, "rewards/accuracies": 1.0, "rewards/chosen": 3.0294244289398193, "rewards/margins": 0.12250638008117676, "rewards/rejected": 2.9069180488586426, "step": 8855 }, { "epoch": 1.44, "learning_rate": 4.3317361873888955e-07, "logits/chosen": -0.8439896702766418, "logits/rejected": -0.866864800453186, "logps/chosen": -63.217041015625, "logps/rejected": -73.07418823242188, "loss": 0.4903, "rewards/accuracies": 0.0, "rewards/chosen": 2.1157760620117188, "rewards/margins": -0.09347391128540039, "rewards/rejected": 2.209249973297119, "step": 8856 }, { "epoch": 1.44, "learning_rate": 4.330433747727644e-07, "logits/chosen": -0.28661462664604187, "logits/rejected": -0.29272669553756714, "logps/chosen": -24.408082962036133, "logps/rejected": -24.50725555419922, "loss": 0.6076, "rewards/accuracies": 1.0, "rewards/chosen": 0.10304546356201172, "rewards/margins": 0.005433268845081329, "rewards/rejected": 0.09761219471693039, "step": 8857 }, { "epoch": 1.44, "learning_rate": 4.329131354327086e-07, "logits/chosen": -0.6413180232048035, "logits/rejected": -0.6047511696815491, "logps/chosen": -37.769935607910156, "logps/rejected": -9.32969856262207, "loss": 0.3168, "rewards/accuracies": 1.0, "rewards/chosen": 1.332693099975586, "rewards/margins": 0.2475423812866211, "rewards/rejected": 1.0851507186889648, "step": 8858 }, { "epoch": 1.44, "learning_rate": 4.3278290072772034e-07, "logits/chosen": -1.0662065744400024, "logits/rejected": -1.0040900707244873, "logps/chosen": -134.57461547851562, "logps/rejected": -107.22437286376953, "loss": 0.6575, "rewards/accuracies": 0.0, "rewards/chosen": 5.357298374176025, "rewards/margins": -0.5829062461853027, "rewards/rejected": 5.940204620361328, "step": 8859 }, { "epoch": 1.44, "learning_rate": 4.326526706667976e-07, "logits/chosen": -1.050396203994751, "logits/rejected": -1.0537924766540527, "logps/chosen": -81.02144622802734, "logps/rejected": -65.35812377929688, "loss": 1.022, "rewards/accuracies": 0.0, "rewards/chosen": 2.076113224029541, "rewards/margins": -1.051110029220581, "rewards/rejected": 3.127223253250122, "step": 8860 }, { "epoch": 1.44, "learning_rate": 4.325224452589381e-07, "logits/chosen": -0.06473346799612045, "logits/rejected": -0.044381238520145416, "logps/chosen": -25.345378875732422, "logps/rejected": -16.337688446044922, "loss": 0.6501, "rewards/accuracies": 0.0, "rewards/chosen": 0.0050178528763353825, "rewards/margins": -0.6359149813652039, "rewards/rejected": 0.6409328579902649, "step": 8861 }, { "epoch": 1.44, "learning_rate": 4.3239222451313917e-07, "logits/chosen": -0.4914563298225403, "logits/rejected": -0.5500171184539795, "logps/chosen": -32.28221130371094, "logps/rejected": -50.12508773803711, "loss": 0.6353, "rewards/accuracies": 0.0, "rewards/chosen": 0.5395858883857727, "rewards/margins": -0.1691669225692749, "rewards/rejected": 0.7087528109550476, "step": 8862 }, { "epoch": 1.44, "learning_rate": 4.3226200843839785e-07, "logits/chosen": -0.7033978700637817, "logits/rejected": -0.6714688539505005, "logps/chosen": -47.68402862548828, "logps/rejected": -27.412254333496094, "loss": 0.7255, "rewards/accuracies": 0.0, "rewards/chosen": 1.5916874408721924, "rewards/margins": -0.9655070304870605, "rewards/rejected": 2.557194471359253, "step": 8863 }, { "epoch": 1.44, "learning_rate": 4.321317970437107e-07, "logits/chosen": -0.33773142099380493, "logits/rejected": -0.33773142099380493, "logps/chosen": -35.22532653808594, "logps/rejected": -35.22532653808594, "loss": 1.0115, "rewards/accuracies": 0.0, "rewards/chosen": 0.12006073445081711, "rewards/margins": 0.0, "rewards/rejected": 0.12006073445081711, "step": 8864 }, { "epoch": 1.44, "learning_rate": 4.3200159033807425e-07, "logits/chosen": -0.4201521575450897, "logits/rejected": -0.4419083595275879, "logps/chosen": -20.16625213623047, "logps/rejected": -68.81974792480469, "loss": 1.4935, "rewards/accuracies": 0.0, "rewards/chosen": 0.6929588317871094, "rewards/margins": -1.2865715026855469, "rewards/rejected": 1.9795303344726562, "step": 8865 }, { "epoch": 1.44, "learning_rate": 4.3187138833048454e-07, "logits/chosen": -0.8993862867355347, "logits/rejected": -0.908494234085083, "logps/chosen": -131.842041015625, "logps/rejected": -100.28042602539062, "loss": 0.1154, "rewards/accuracies": 1.0, "rewards/chosen": 5.740614414215088, "rewards/margins": 3.824047088623047, "rewards/rejected": 1.9165672063827515, "step": 8866 }, { "epoch": 1.44, "learning_rate": 4.317411910299372e-07, "logits/chosen": -0.9088947772979736, "logits/rejected": -0.5596288442611694, "logps/chosen": -152.73971557617188, "logps/rejected": -102.09901428222656, "loss": 0.1186, "rewards/accuracies": 1.0, "rewards/chosen": 4.299108982086182, "rewards/margins": 2.459498643875122, "rewards/rejected": 1.8396103382110596, "step": 8867 }, { "epoch": 1.44, "learning_rate": 4.3161099844542774e-07, "logits/chosen": -0.7740733027458191, "logits/rejected": -0.7710273861885071, "logps/chosen": -92.0721435546875, "logps/rejected": -118.70964813232422, "loss": 0.7533, "rewards/accuracies": 0.0, "rewards/chosen": 2.8295578956604004, "rewards/margins": -1.0458335876464844, "rewards/rejected": 3.8753914833068848, "step": 8868 }, { "epoch": 1.44, "learning_rate": 4.314808105859512e-07, "logits/chosen": -1.0126829147338867, "logits/rejected": -0.9368712902069092, "logps/chosen": -68.24539184570312, "logps/rejected": -49.60275650024414, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 1.4229415655136108, "rewards/margins": 0.47775721549987793, "rewards/rejected": 0.9451843500137329, "step": 8869 }, { "epoch": 1.44, "learning_rate": 4.313506274605022e-07, "logits/chosen": -0.9371922016143799, "logits/rejected": -0.9735615849494934, "logps/chosen": -129.34873962402344, "logps/rejected": -103.10790252685547, "loss": 0.794, "rewards/accuracies": 0.0, "rewards/chosen": 1.6549514532089233, "rewards/margins": -1.3143211603164673, "rewards/rejected": 2.9692726135253906, "step": 8870 }, { "epoch": 1.44, "learning_rate": 4.3122044907807543e-07, "logits/chosen": -0.36638015508651733, "logits/rejected": -0.34989845752716064, "logps/chosen": -68.23412322998047, "logps/rejected": -73.38945770263672, "loss": 0.412, "rewards/accuracies": 0.0, "rewards/chosen": 0.6556144952774048, "rewards/margins": -0.0880584716796875, "rewards/rejected": 0.7436729669570923, "step": 8871 }, { "epoch": 1.44, "learning_rate": 4.310902754476647e-07, "logits/chosen": -0.7945640087127686, "logits/rejected": -0.754116952419281, "logps/chosen": -91.85301208496094, "logps/rejected": -80.18177795410156, "loss": 1.3984, "rewards/accuracies": 0.0, "rewards/chosen": 1.0225311517715454, "rewards/margins": -2.305111885070801, "rewards/rejected": 3.3276429176330566, "step": 8872 }, { "epoch": 1.44, "learning_rate": 4.30960106578264e-07, "logits/chosen": -0.6890634298324585, "logits/rejected": -0.5625495314598083, "logps/chosen": -83.98761749267578, "logps/rejected": -55.69135665893555, "loss": 0.0932, "rewards/accuracies": 1.0, "rewards/chosen": 3.539585828781128, "rewards/margins": 1.7792963981628418, "rewards/rejected": 1.7602894306182861, "step": 8873 }, { "epoch": 1.44, "learning_rate": 4.3082994247886664e-07, "logits/chosen": -0.4896373450756073, "logits/rejected": -0.44105973839759827, "logps/chosen": -35.722293853759766, "logps/rejected": -56.66483688354492, "loss": 0.3915, "rewards/accuracies": 1.0, "rewards/chosen": 1.4343303442001343, "rewards/margins": 0.05317068099975586, "rewards/rejected": 1.3811596632003784, "step": 8874 }, { "epoch": 1.44, "learning_rate": 4.3069978315846575e-07, "logits/chosen": -0.5819785594940186, "logits/rejected": -0.3658449649810791, "logps/chosen": -194.8870849609375, "logps/rejected": -153.4261932373047, "loss": 0.1273, "rewards/accuracies": 1.0, "rewards/chosen": 5.737783908843994, "rewards/margins": 1.590287685394287, "rewards/rejected": 4.147496223449707, "step": 8875 }, { "epoch": 1.44, "learning_rate": 4.3056962862605407e-07, "logits/chosen": -0.8183698058128357, "logits/rejected": -0.8210868239402771, "logps/chosen": -131.83929443359375, "logps/rejected": -133.59591674804688, "loss": 1.2923, "rewards/accuracies": 1.0, "rewards/chosen": 1.4881439208984375, "rewards/margins": 0.42318570613861084, "rewards/rejected": 1.0649582147598267, "step": 8876 }, { "epoch": 1.44, "learning_rate": 4.3043947889062417e-07, "logits/chosen": -0.6900777220726013, "logits/rejected": -0.6986387968063354, "logps/chosen": -95.6623764038086, "logps/rejected": -72.76449584960938, "loss": 0.7239, "rewards/accuracies": 0.0, "rewards/chosen": 2.2281272411346436, "rewards/margins": -0.8949806690216064, "rewards/rejected": 3.12310791015625, "step": 8877 }, { "epoch": 1.44, "learning_rate": 4.303093339611681e-07, "logits/chosen": -0.632439136505127, "logits/rejected": -0.6002012491226196, "logps/chosen": -64.87315368652344, "logps/rejected": -8.440818786621094, "loss": 0.4961, "rewards/accuracies": 1.0, "rewards/chosen": 0.33865052461624146, "rewards/margins": 0.1268211454153061, "rewards/rejected": 0.21182937920093536, "step": 8878 }, { "epoch": 1.44, "learning_rate": 4.3017919384667755e-07, "logits/chosen": -0.6347185373306274, "logits/rejected": -0.7447680830955505, "logps/chosen": -114.7095718383789, "logps/rejected": -130.17098999023438, "loss": 0.8699, "rewards/accuracies": 0.0, "rewards/chosen": 3.791374921798706, "rewards/margins": -1.0034525394439697, "rewards/rejected": 4.794827461242676, "step": 8879 }, { "epoch": 1.44, "learning_rate": 4.3004905855614414e-07, "logits/chosen": -0.8599395155906677, "logits/rejected": -0.858867347240448, "logps/chosen": -47.1430549621582, "logps/rejected": -77.42958068847656, "loss": 1.3944, "rewards/accuracies": 0.0, "rewards/chosen": 1.5044125318527222, "rewards/margins": -0.27156710624694824, "rewards/rejected": 1.7759796380996704, "step": 8880 }, { "epoch": 1.44, "learning_rate": 4.2991892809855885e-07, "logits/chosen": -1.129673957824707, "logits/rejected": -1.1268755197525024, "logps/chosen": -75.55668640136719, "logps/rejected": -132.70826721191406, "loss": 1.0691, "rewards/accuracies": 0.0, "rewards/chosen": 4.028696537017822, "rewards/margins": -1.990170955657959, "rewards/rejected": 6.018867492675781, "step": 8881 }, { "epoch": 1.44, "learning_rate": 4.297888024829125e-07, "logits/chosen": -0.32847076654434204, "logits/rejected": -0.32847076654434204, "logps/chosen": -2.052492380142212, "logps/rejected": -2.052492380142212, "loss": 0.5238, "rewards/accuracies": 0.0, "rewards/chosen": 0.1831447333097458, "rewards/margins": 0.0, "rewards/rejected": 0.1831447333097458, "step": 8882 }, { "epoch": 1.44, "learning_rate": 4.296586817181956e-07, "logits/chosen": -0.24790412187576294, "logits/rejected": -0.24790412187576294, "logps/chosen": -39.41254425048828, "logps/rejected": -39.41254425048828, "loss": 0.652, "rewards/accuracies": 0.0, "rewards/chosen": 1.2353638410568237, "rewards/margins": 0.0, "rewards/rejected": 1.2353638410568237, "step": 8883 }, { "epoch": 1.44, "learning_rate": 4.2952856581339827e-07, "logits/chosen": -0.9788202047348022, "logits/rejected": -0.9355305433273315, "logps/chosen": -32.82461929321289, "logps/rejected": -80.88044738769531, "loss": 0.7279, "rewards/accuracies": 1.0, "rewards/chosen": 0.9233364462852478, "rewards/margins": 0.4698452353477478, "rewards/rejected": 0.4534912109375, "step": 8884 }, { "epoch": 1.44, "learning_rate": 4.2939845477751015e-07, "logits/chosen": -0.8781017065048218, "logits/rejected": -0.8340412974357605, "logps/chosen": -97.98924255371094, "logps/rejected": -91.90247344970703, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 3.8737564086914062, "rewards/margins": 0.531029462814331, "rewards/rejected": 3.342726945877075, "step": 8885 }, { "epoch": 1.44, "learning_rate": 4.2926834861952077e-07, "logits/chosen": -0.6129512786865234, "logits/rejected": -0.610369086265564, "logps/chosen": -89.16462707519531, "logps/rejected": -89.56097412109375, "loss": 0.3356, "rewards/accuracies": 1.0, "rewards/chosen": 2.921234130859375, "rewards/margins": 0.7189009189605713, "rewards/rejected": 2.2023332118988037, "step": 8886 }, { "epoch": 1.44, "learning_rate": 4.2913824734841926e-07, "logits/chosen": -0.7099332213401794, "logits/rejected": -0.6084113717079163, "logps/chosen": -96.20065307617188, "logps/rejected": -92.72299194335938, "loss": 0.3627, "rewards/accuracies": 1.0, "rewards/chosen": 2.0638978481292725, "rewards/margins": 0.039000749588012695, "rewards/rejected": 2.0248970985412598, "step": 8887 }, { "epoch": 1.44, "learning_rate": 4.2900815097319434e-07, "logits/chosen": -0.7826416492462158, "logits/rejected": -0.8115736246109009, "logps/chosen": -117.29910278320312, "logps/rejected": -88.95429229736328, "loss": 0.5588, "rewards/accuracies": 0.0, "rewards/chosen": 1.2062408924102783, "rewards/margins": -0.5937469005584717, "rewards/rejected": 1.79998779296875, "step": 8888 }, { "epoch": 1.44, "learning_rate": 4.2887805950283445e-07, "logits/chosen": -0.8763746619224548, "logits/rejected": -0.8657968640327454, "logps/chosen": -98.33189392089844, "logps/rejected": -106.70912170410156, "loss": 0.7183, "rewards/accuracies": 0.0, "rewards/chosen": 1.8300575017929077, "rewards/margins": -1.1616157293319702, "rewards/rejected": 2.991673231124878, "step": 8889 }, { "epoch": 1.44, "learning_rate": 4.287479729463277e-07, "logits/chosen": -1.0384780168533325, "logits/rejected": -1.016635537147522, "logps/chosen": -113.77140808105469, "logps/rejected": -93.46178436279297, "loss": 0.8116, "rewards/accuracies": 0.0, "rewards/chosen": 0.9970367550849915, "rewards/margins": -1.342026710510254, "rewards/rejected": 2.3390634059906006, "step": 8890 }, { "epoch": 1.44, "learning_rate": 4.2861789131266184e-07, "logits/chosen": -0.8536697030067444, "logits/rejected": -0.8592211604118347, "logps/chosen": -92.84298706054688, "logps/rejected": -132.98497009277344, "loss": 1.2331, "rewards/accuracies": 0.0, "rewards/chosen": 3.3199760913848877, "rewards/margins": -2.3731682300567627, "rewards/rejected": 5.69314432144165, "step": 8891 }, { "epoch": 1.44, "learning_rate": 4.2848781461082426e-07, "logits/chosen": -0.38993486762046814, "logits/rejected": -0.3916124105453491, "logps/chosen": -97.886962890625, "logps/rejected": -91.83787536621094, "loss": 1.0576, "rewards/accuracies": 0.0, "rewards/chosen": 1.06640625, "rewards/margins": -0.5720726251602173, "rewards/rejected": 1.6384788751602173, "step": 8892 }, { "epoch": 1.44, "learning_rate": 4.283577428498019e-07, "logits/chosen": -0.4973470866680145, "logits/rejected": -0.42056137323379517, "logps/chosen": -35.98793029785156, "logps/rejected": -11.484489440917969, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 1.3841816186904907, "rewards/margins": 0.35633397102355957, "rewards/rejected": 1.0278476476669312, "step": 8893 }, { "epoch": 1.44, "learning_rate": 4.2822767603858177e-07, "logits/chosen": -0.9145081639289856, "logits/rejected": -0.86819988489151, "logps/chosen": -93.71018981933594, "logps/rejected": -97.0411376953125, "loss": 0.9847, "rewards/accuracies": 1.0, "rewards/chosen": 2.5619354248046875, "rewards/margins": 0.8126510381698608, "rewards/rejected": 1.7492843866348267, "step": 8894 }, { "epoch": 1.44, "learning_rate": 4.280976141861501e-07, "logits/chosen": -0.7926419973373413, "logits/rejected": -0.7226159572601318, "logps/chosen": -81.87095642089844, "logps/rejected": -73.62842559814453, "loss": 1.2545, "rewards/accuracies": 0.0, "rewards/chosen": 0.7398773431777954, "rewards/margins": -0.808197021484375, "rewards/rejected": 1.5480743646621704, "step": 8895 }, { "epoch": 1.44, "learning_rate": 4.279675573014929e-07, "logits/chosen": -0.7707746624946594, "logits/rejected": -0.609658420085907, "logps/chosen": -144.00479125976562, "logps/rejected": -127.25155639648438, "loss": 1.2741, "rewards/accuracies": 0.0, "rewards/chosen": 3.901686191558838, "rewards/margins": -1.7969894409179688, "rewards/rejected": 5.698675632476807, "step": 8896 }, { "epoch": 1.44, "learning_rate": 4.27837505393596e-07, "logits/chosen": -0.8115215301513672, "logits/rejected": -0.8346341848373413, "logps/chosen": -62.98091506958008, "logps/rejected": -55.81897735595703, "loss": 0.6398, "rewards/accuracies": 0.0, "rewards/chosen": 1.0543758869171143, "rewards/margins": -0.35282623767852783, "rewards/rejected": 1.407202124595642, "step": 8897 }, { "epoch": 1.44, "learning_rate": 4.277074584714446e-07, "logits/chosen": -0.8746722936630249, "logits/rejected": -0.8499025106430054, "logps/chosen": -63.97599792480469, "logps/rejected": -90.61567687988281, "loss": 1.1957, "rewards/accuracies": 1.0, "rewards/chosen": 1.467315673828125, "rewards/margins": 0.28518521785736084, "rewards/rejected": 1.1821304559707642, "step": 8898 }, { "epoch": 1.44, "learning_rate": 4.275774165440238e-07, "logits/chosen": -0.450894832611084, "logits/rejected": -0.4600744843482971, "logps/chosen": -8.37209701538086, "logps/rejected": -2.9583849906921387, "loss": 0.8218, "rewards/accuracies": 0.0, "rewards/chosen": 0.30984431505203247, "rewards/margins": -0.023746192455291748, "rewards/rejected": 0.3335905075073242, "step": 8899 }, { "epoch": 1.44, "learning_rate": 4.2744737962031823e-07, "logits/chosen": -0.6802050471305847, "logits/rejected": -0.5460068583488464, "logps/chosen": -60.700416564941406, "logps/rejected": -52.315887451171875, "loss": 0.8888, "rewards/accuracies": 1.0, "rewards/chosen": 2.443753957748413, "rewards/margins": 0.11332178115844727, "rewards/rejected": 2.330432176589966, "step": 8900 }, { "epoch": 1.44, "learning_rate": 4.273173477093123e-07, "logits/chosen": -0.45976415276527405, "logits/rejected": -0.35610857605934143, "logps/chosen": -49.850502014160156, "logps/rejected": -22.048078536987305, "loss": 1.1144, "rewards/accuracies": 1.0, "rewards/chosen": 2.2939553260803223, "rewards/margins": 1.2064623832702637, "rewards/rejected": 1.0874929428100586, "step": 8901 }, { "epoch": 1.44, "learning_rate": 4.271873208199898e-07, "logits/chosen": -0.48157966136932373, "logits/rejected": -0.4947652816772461, "logps/chosen": -31.167713165283203, "logps/rejected": -2.140803575515747, "loss": 1.3192, "rewards/accuracies": 0.0, "rewards/chosen": -0.10344944149255753, "rewards/margins": -0.5842527747154236, "rewards/rejected": 0.48080331087112427, "step": 8902 }, { "epoch": 1.45, "learning_rate": 4.270572989613346e-07, "logits/chosen": -1.0386494398117065, "logits/rejected": -1.0151748657226562, "logps/chosen": -101.6484146118164, "logps/rejected": -85.23472595214844, "loss": 0.1953, "rewards/accuracies": 1.0, "rewards/chosen": 1.8501030206680298, "rewards/margins": 0.8975822329521179, "rewards/rejected": 0.9525207877159119, "step": 8903 }, { "epoch": 1.45, "learning_rate": 4.2692728214232976e-07, "logits/chosen": -0.49862420558929443, "logits/rejected": -0.5760137438774109, "logps/chosen": -76.8002700805664, "logps/rejected": -71.52430725097656, "loss": 1.0453, "rewards/accuracies": 0.0, "rewards/chosen": 1.5261421203613281, "rewards/margins": -1.895414113998413, "rewards/rejected": 3.421556234359741, "step": 8904 }, { "epoch": 1.45, "learning_rate": 4.2679727037195835e-07, "logits/chosen": -1.2759233713150024, "logits/rejected": -1.0931040048599243, "logps/chosen": -73.78338623046875, "logps/rejected": -97.18414306640625, "loss": 0.0652, "rewards/accuracies": 1.0, "rewards/chosen": 5.699832439422607, "rewards/margins": 2.1169803142547607, "rewards/rejected": 3.5828521251678467, "step": 8905 }, { "epoch": 1.45, "learning_rate": 4.266672636592029e-07, "logits/chosen": -0.7515246868133545, "logits/rejected": -0.7927887439727783, "logps/chosen": -90.4078369140625, "logps/rejected": -114.34542083740234, "loss": 1.4937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8612884879112244, "rewards/margins": -1.4552772045135498, "rewards/rejected": 2.316565752029419, "step": 8906 }, { "epoch": 1.45, "learning_rate": 4.2653726201304555e-07, "logits/chosen": -1.052923321723938, "logits/rejected": -1.052923321723938, "logps/chosen": -69.93914794921875, "logps/rejected": -69.93914794921875, "loss": 0.3864, "rewards/accuracies": 0.0, "rewards/chosen": 0.9521949887275696, "rewards/margins": 0.0, "rewards/rejected": 0.9521949887275696, "step": 8907 }, { "epoch": 1.45, "learning_rate": 4.264072654424684e-07, "logits/chosen": -0.3085992634296417, "logits/rejected": -0.24966056644916534, "logps/chosen": -71.38321685791016, "logps/rejected": -78.76031494140625, "loss": 1.1925, "rewards/accuracies": 1.0, "rewards/chosen": 2.0221238136291504, "rewards/margins": 0.48869478702545166, "rewards/rejected": 1.5334290266036987, "step": 8908 }, { "epoch": 1.45, "learning_rate": 4.2627727395645286e-07, "logits/chosen": -0.393703818321228, "logits/rejected": -0.44584566354751587, "logps/chosen": -73.90829467773438, "logps/rejected": -110.20179748535156, "loss": 1.7564, "rewards/accuracies": 0.0, "rewards/chosen": 0.962451159954071, "rewards/margins": -2.4892821311950684, "rewards/rejected": 3.451733350753784, "step": 8909 }, { "epoch": 1.45, "learning_rate": 4.2614728756398005e-07, "logits/chosen": -1.1328332424163818, "logits/rejected": -1.207882285118103, "logps/chosen": -112.25348663330078, "logps/rejected": -243.03927612304688, "loss": 0.4415, "rewards/accuracies": 0.0, "rewards/chosen": 1.8042229413986206, "rewards/margins": -0.3457130193710327, "rewards/rejected": 2.1499359607696533, "step": 8910 }, { "epoch": 1.45, "learning_rate": 4.2601730627403095e-07, "logits/chosen": -0.7287348508834839, "logits/rejected": -0.7911853790283203, "logps/chosen": -85.28960418701172, "logps/rejected": -118.65906524658203, "loss": 1.8533, "rewards/accuracies": 0.0, "rewards/chosen": 0.9246239066123962, "rewards/margins": -1.9886107444763184, "rewards/rejected": 2.9132347106933594, "step": 8911 }, { "epoch": 1.45, "learning_rate": 4.258873300955859e-07, "logits/chosen": -0.2452075332403183, "logits/rejected": -0.276073694229126, "logps/chosen": -127.7767333984375, "logps/rejected": -68.14208984375, "loss": 0.4002, "rewards/accuracies": 0.0, "rewards/chosen": 1.905053734779358, "rewards/margins": -0.14834749698638916, "rewards/rejected": 2.053401231765747, "step": 8912 }, { "epoch": 1.45, "learning_rate": 4.257573590376251e-07, "logits/chosen": -0.3530396521091461, "logits/rejected": -0.3662722706794739, "logps/chosen": -19.367401123046875, "logps/rejected": -76.82588195800781, "loss": 2.2126, "rewards/accuracies": 0.0, "rewards/chosen": 0.16175976395606995, "rewards/margins": -3.0638201236724854, "rewards/rejected": 3.2255799770355225, "step": 8913 }, { "epoch": 1.45, "learning_rate": 4.2562739310912834e-07, "logits/chosen": -0.5913475155830383, "logits/rejected": -0.5884616374969482, "logps/chosen": -3.8653461933135986, "logps/rejected": -5.4883294105529785, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.30306437611579895, "rewards/margins": 0.06170061230659485, "rewards/rejected": 0.2413637638092041, "step": 8914 }, { "epoch": 1.45, "learning_rate": 4.2549743231907487e-07, "logits/chosen": -0.8153026700019836, "logits/rejected": -0.8405030369758606, "logps/chosen": -78.52413940429688, "logps/rejected": -105.03230285644531, "loss": 0.3725, "rewards/accuracies": 0.0, "rewards/chosen": 2.959439992904663, "rewards/margins": -0.059058189392089844, "rewards/rejected": 3.018498182296753, "step": 8915 }, { "epoch": 1.45, "learning_rate": 4.25367476676444e-07, "logits/chosen": -1.0157498121261597, "logits/rejected": -0.9285722970962524, "logps/chosen": -74.90220642089844, "logps/rejected": -30.844898223876953, "loss": 0.4089, "rewards/accuracies": 1.0, "rewards/chosen": 1.2735542058944702, "rewards/margins": 0.9831409454345703, "rewards/rejected": 0.2904132902622223, "step": 8916 }, { "epoch": 1.45, "learning_rate": 4.2523752619021426e-07, "logits/chosen": -0.6344817280769348, "logits/rejected": -0.6000112295150757, "logps/chosen": -140.7825927734375, "logps/rejected": -95.94776916503906, "loss": 2.0702, "rewards/accuracies": 0.0, "rewards/chosen": 0.8456863760948181, "rewards/margins": -1.3532211780548096, "rewards/rejected": 2.1989076137542725, "step": 8917 }, { "epoch": 1.45, "learning_rate": 4.2510758086936405e-07, "logits/chosen": -0.6960399150848389, "logits/rejected": -0.6375933885574341, "logps/chosen": -27.324485778808594, "logps/rejected": -51.88054656982422, "loss": 1.1796, "rewards/accuracies": 1.0, "rewards/chosen": 1.7385025024414062, "rewards/margins": 0.800686240196228, "rewards/rejected": 0.9378162622451782, "step": 8918 }, { "epoch": 1.45, "learning_rate": 4.249776407228714e-07, "logits/chosen": -0.7799471616744995, "logits/rejected": -0.8943678736686707, "logps/chosen": -108.66958618164062, "logps/rejected": -90.42202758789062, "loss": 2.9875, "rewards/accuracies": 0.0, "rewards/chosen": 0.9271240234375, "rewards/margins": -5.613254070281982, "rewards/rejected": 6.540378093719482, "step": 8919 }, { "epoch": 1.45, "learning_rate": 4.2484770575971384e-07, "logits/chosen": -0.6548371911048889, "logits/rejected": -0.6923730373382568, "logps/chosen": -27.36834716796875, "logps/rejected": -113.78805541992188, "loss": 1.362, "rewards/accuracies": 0.0, "rewards/chosen": 2.330601453781128, "rewards/margins": -1.7221300601959229, "rewards/rejected": 4.052731513977051, "step": 8920 }, { "epoch": 1.45, "learning_rate": 4.247177759888688e-07, "logits/chosen": -0.8261867165565491, "logits/rejected": -0.8036848902702332, "logps/chosen": -99.93028259277344, "logps/rejected": -73.2720718383789, "loss": 1.2354, "rewards/accuracies": 0.0, "rewards/chosen": 1.4045838117599487, "rewards/margins": -0.8025702238082886, "rewards/rejected": 2.2071540355682373, "step": 8921 }, { "epoch": 1.45, "learning_rate": 4.245878514193131e-07, "logits/chosen": -0.9314063787460327, "logits/rejected": -0.5875399708747864, "logps/chosen": -218.96124267578125, "logps/rejected": -138.83990478515625, "loss": 0.2263, "rewards/accuracies": 1.0, "rewards/chosen": 5.3778076171875, "rewards/margins": 0.595576286315918, "rewards/rejected": 4.782231330871582, "step": 8922 }, { "epoch": 1.45, "learning_rate": 4.2445793206002324e-07, "logits/chosen": -0.7100856304168701, "logits/rejected": -0.7115105986595154, "logps/chosen": -61.497291564941406, "logps/rejected": -58.27958679199219, "loss": 0.441, "rewards/accuracies": 0.0, "rewards/chosen": 1.8730560541152954, "rewards/margins": -0.2739936113357544, "rewards/rejected": 2.14704966545105, "step": 8923 }, { "epoch": 1.45, "learning_rate": 4.243280179199755e-07, "logits/chosen": -0.7104063630104065, "logits/rejected": -0.7429401278495789, "logps/chosen": -61.158931732177734, "logps/rejected": -63.34940719604492, "loss": 0.7667, "rewards/accuracies": 1.0, "rewards/chosen": 1.7798312902450562, "rewards/margins": 0.22919762134552002, "rewards/rejected": 1.5506336688995361, "step": 8924 }, { "epoch": 1.45, "learning_rate": 4.241981090081458e-07, "logits/chosen": -0.8320872783660889, "logits/rejected": -0.8383405208587646, "logps/chosen": -73.63993835449219, "logps/rejected": -116.33821105957031, "loss": 0.7056, "rewards/accuracies": 1.0, "rewards/chosen": 1.256683349609375, "rewards/margins": 0.20332026481628418, "rewards/rejected": 1.0533630847930908, "step": 8925 }, { "epoch": 1.45, "learning_rate": 4.2406820533350944e-07, "logits/chosen": -0.5976170301437378, "logits/rejected": -0.49151137471199036, "logps/chosen": -98.51970672607422, "logps/rejected": -14.88991641998291, "loss": 0.261, "rewards/accuracies": 1.0, "rewards/chosen": 1.0733574628829956, "rewards/margins": 0.44661766290664673, "rewards/rejected": 0.6267397999763489, "step": 8926 }, { "epoch": 1.45, "learning_rate": 4.2393830690504165e-07, "logits/chosen": -0.4960172474384308, "logits/rejected": -0.4960172474384308, "logps/chosen": -41.81957244873047, "logps/rejected": -41.81957244873047, "loss": 0.3866, "rewards/accuracies": 0.0, "rewards/chosen": 1.6322669982910156, "rewards/margins": 0.0, "rewards/rejected": 1.6322669982910156, "step": 8927 }, { "epoch": 1.45, "learning_rate": 4.2380841373171703e-07, "logits/chosen": -0.5204960703849792, "logits/rejected": -0.49826744198799133, "logps/chosen": -70.7020263671875, "logps/rejected": -79.22998046875, "loss": 0.5713, "rewards/accuracies": 1.0, "rewards/chosen": 1.1220085620880127, "rewards/margins": 0.34363865852355957, "rewards/rejected": 0.7783699035644531, "step": 8928 }, { "epoch": 1.45, "learning_rate": 4.2367852582251017e-07, "logits/chosen": -0.7498569488525391, "logits/rejected": -0.5904009342193604, "logps/chosen": -164.28915405273438, "logps/rejected": -72.90838623046875, "loss": 1.4753, "rewards/accuracies": 0.0, "rewards/chosen": 1.325648546218872, "rewards/margins": -2.854088544845581, "rewards/rejected": 4.179737091064453, "step": 8929 }, { "epoch": 1.45, "learning_rate": 4.2354864318639506e-07, "logits/chosen": -0.8615736365318298, "logits/rejected": -1.2040220499038696, "logps/chosen": -45.740692138671875, "logps/rejected": -34.38022994995117, "loss": 0.2942, "rewards/accuracies": 1.0, "rewards/chosen": 1.84514319896698, "rewards/margins": 1.5778560638427734, "rewards/rejected": 0.26728707551956177, "step": 8930 }, { "epoch": 1.45, "learning_rate": 4.234187658323453e-07, "logits/chosen": -0.5425424575805664, "logits/rejected": -0.4007466435432434, "logps/chosen": -69.91419982910156, "logps/rejected": -61.176326751708984, "loss": 1.829, "rewards/accuracies": 0.0, "rewards/chosen": 2.785513401031494, "rewards/margins": -2.4781274795532227, "rewards/rejected": 5.263640880584717, "step": 8931 }, { "epoch": 1.45, "learning_rate": 4.2328889376933417e-07, "logits/chosen": -0.7151467204093933, "logits/rejected": -0.6395373940467834, "logps/chosen": -66.37391662597656, "logps/rejected": -68.24803924560547, "loss": 0.326, "rewards/accuracies": 1.0, "rewards/chosen": 1.9338188171386719, "rewards/margins": 0.24858009815216064, "rewards/rejected": 1.6852387189865112, "step": 8932 }, { "epoch": 1.45, "learning_rate": 4.231590270063349e-07, "logits/chosen": -0.6368422508239746, "logits/rejected": -0.6368422508239746, "logps/chosen": -39.950592041015625, "logps/rejected": -39.950592041015625, "loss": 0.56, "rewards/accuracies": 0.0, "rewards/chosen": 0.3150653839111328, "rewards/margins": 0.0, "rewards/rejected": 0.3150653839111328, "step": 8933 }, { "epoch": 1.45, "learning_rate": 4.230291655523196e-07, "logits/chosen": -0.5482937693595886, "logits/rejected": -0.480589896440506, "logps/chosen": -34.11188507080078, "logps/rejected": -31.94499969482422, "loss": 0.3244, "rewards/accuracies": 1.0, "rewards/chosen": 1.638013482093811, "rewards/margins": 0.8263694643974304, "rewards/rejected": 0.8116440176963806, "step": 8934 }, { "epoch": 1.45, "learning_rate": 4.228993094162606e-07, "logits/chosen": -1.0171923637390137, "logits/rejected": -1.0221956968307495, "logps/chosen": -85.43817138671875, "logps/rejected": -95.36241149902344, "loss": 1.3667, "rewards/accuracies": 0.0, "rewards/chosen": 1.7147819995880127, "rewards/margins": -2.5512659549713135, "rewards/rejected": 4.266047954559326, "step": 8935 }, { "epoch": 1.45, "learning_rate": 4.227694586071298e-07, "logits/chosen": -0.6886821985244751, "logits/rejected": -0.6469437479972839, "logps/chosen": -69.24778747558594, "logps/rejected": -53.45330810546875, "loss": 0.9776, "rewards/accuracies": 0.0, "rewards/chosen": 1.5214614868164062, "rewards/margins": -0.007764458656311035, "rewards/rejected": 1.5292259454727173, "step": 8936 }, { "epoch": 1.45, "learning_rate": 4.226396131338986e-07, "logits/chosen": -0.7626060843467712, "logits/rejected": -0.7463821172714233, "logps/chosen": -60.62614440917969, "logps/rejected": -66.24211120605469, "loss": 1.218, "rewards/accuracies": 0.0, "rewards/chosen": 0.5333641171455383, "rewards/margins": -2.3372788429260254, "rewards/rejected": 2.870642900466919, "step": 8937 }, { "epoch": 1.45, "learning_rate": 4.225097730055382e-07, "logits/chosen": -0.8517211079597473, "logits/rejected": -0.8125953674316406, "logps/chosen": -60.4737548828125, "logps/rejected": -86.50068664550781, "loss": 0.6811, "rewards/accuracies": 0.0, "rewards/chosen": 1.653045654296875, "rewards/margins": -0.8838639259338379, "rewards/rejected": 2.536909580230713, "step": 8938 }, { "epoch": 1.45, "learning_rate": 4.223799382310192e-07, "logits/chosen": -0.6811664700508118, "logits/rejected": -0.6933292746543884, "logps/chosen": -77.84117126464844, "logps/rejected": -62.37366485595703, "loss": 0.3005, "rewards/accuracies": 1.0, "rewards/chosen": 1.4391326904296875, "rewards/margins": 0.24517822265625, "rewards/rejected": 1.1939544677734375, "step": 8939 }, { "epoch": 1.45, "learning_rate": 4.2225010881931213e-07, "logits/chosen": -0.45664703845977783, "logits/rejected": -0.4371439814567566, "logps/chosen": -131.87863159179688, "logps/rejected": -116.5313491821289, "loss": 1.0476, "rewards/accuracies": 0.0, "rewards/chosen": 1.0300400257110596, "rewards/margins": -1.9369010925292969, "rewards/rejected": 2.9669411182403564, "step": 8940 }, { "epoch": 1.45, "learning_rate": 4.2212028477938683e-07, "logits/chosen": -0.4559788405895233, "logits/rejected": -0.47724124789237976, "logps/chosen": -42.81399154663086, "logps/rejected": -63.70618438720703, "loss": 0.3635, "rewards/accuracies": 1.0, "rewards/chosen": 1.8333019018173218, "rewards/margins": 0.3715869188308716, "rewards/rejected": 1.4617149829864502, "step": 8941 }, { "epoch": 1.45, "learning_rate": 4.2199046612021296e-07, "logits/chosen": -0.6294055581092834, "logits/rejected": -0.6963140368461609, "logps/chosen": -83.41482543945312, "logps/rejected": -105.66510772705078, "loss": 1.3236, "rewards/accuracies": 0.0, "rewards/chosen": 0.8842529654502869, "rewards/margins": -1.9621620178222656, "rewards/rejected": 2.8464150428771973, "step": 8942 }, { "epoch": 1.45, "learning_rate": 4.2186065285075966e-07, "logits/chosen": -0.5293835997581482, "logits/rejected": -0.5269994139671326, "logps/chosen": -27.676090240478516, "logps/rejected": -7.643043518066406, "loss": 1.0802, "rewards/accuracies": 1.0, "rewards/chosen": 1.413386583328247, "rewards/margins": 0.8076750040054321, "rewards/rejected": 0.6057115793228149, "step": 8943 }, { "epoch": 1.45, "learning_rate": 4.2173084497999594e-07, "logits/chosen": -0.6810215711593628, "logits/rejected": -0.6645322442054749, "logps/chosen": -35.90561294555664, "logps/rejected": -34.249149322509766, "loss": 0.5606, "rewards/accuracies": 0.0, "rewards/chosen": 0.2181755155324936, "rewards/margins": -0.4686233401298523, "rewards/rejected": 0.6867988705635071, "step": 8944 }, { "epoch": 1.45, "learning_rate": 4.216010425168902e-07, "logits/chosen": -0.8822736740112305, "logits/rejected": -0.9176226854324341, "logps/chosen": -150.13499450683594, "logps/rejected": -86.003173828125, "loss": 0.2876, "rewards/accuracies": 1.0, "rewards/chosen": 3.2182602882385254, "rewards/margins": 0.49936842918395996, "rewards/rejected": 2.7188918590545654, "step": 8945 }, { "epoch": 1.45, "learning_rate": 4.214712454704107e-07, "logits/chosen": -1.1241726875305176, "logits/rejected": -1.077639102935791, "logps/chosen": -108.72071838378906, "logps/rejected": -102.37377166748047, "loss": 0.1578, "rewards/accuracies": 1.0, "rewards/chosen": 2.6441237926483154, "rewards/margins": 1.2388137578964233, "rewards/rejected": 1.405310034751892, "step": 8946 }, { "epoch": 1.45, "learning_rate": 4.2134145384952504e-07, "logits/chosen": -0.832577109336853, "logits/rejected": -0.7247610688209534, "logps/chosen": -143.83694458007812, "logps/rejected": -69.73167419433594, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": 5.108577251434326, "rewards/margins": 2.7561426162719727, "rewards/rejected": 2.3524346351623535, "step": 8947 }, { "epoch": 1.45, "learning_rate": 4.212116676632006e-07, "logits/chosen": -0.684796154499054, "logits/rejected": -0.6235622763633728, "logps/chosen": -85.81330871582031, "logps/rejected": -66.46006774902344, "loss": 0.7179, "rewards/accuracies": 1.0, "rewards/chosen": 2.7883012294769287, "rewards/margins": 0.4192543029785156, "rewards/rejected": 2.369046926498413, "step": 8948 }, { "epoch": 1.45, "learning_rate": 4.2108188692040434e-07, "logits/chosen": -0.6631994247436523, "logits/rejected": -0.6518986225128174, "logps/chosen": -38.36564254760742, "logps/rejected": -6.959606170654297, "loss": 0.8672, "rewards/accuracies": 1.0, "rewards/chosen": 0.9544891715049744, "rewards/margins": 0.11062955856323242, "rewards/rejected": 0.8438596129417419, "step": 8949 }, { "epoch": 1.45, "learning_rate": 4.2095211163010315e-07, "logits/chosen": -0.4685485363006592, "logits/rejected": -0.4685485363006592, "logps/chosen": -9.163925170898438, "logps/rejected": -9.163925170898438, "loss": 0.3736, "rewards/accuracies": 0.0, "rewards/chosen": 0.07343731075525284, "rewards/margins": 0.0, "rewards/rejected": 0.07343731075525284, "step": 8950 }, { "epoch": 1.45, "learning_rate": 4.20822341801263e-07, "logits/chosen": -0.4017373323440552, "logits/rejected": -0.4017373323440552, "logps/chosen": -77.03016662597656, "logps/rejected": -77.03016662597656, "loss": 0.9997, "rewards/accuracies": 0.0, "rewards/chosen": 1.3683685064315796, "rewards/margins": 0.0, "rewards/rejected": 1.3683685064315796, "step": 8951 }, { "epoch": 1.45, "learning_rate": 4.2069257744284986e-07, "logits/chosen": -0.7210092544555664, "logits/rejected": -0.7885034084320068, "logps/chosen": -60.22068405151367, "logps/rejected": -54.894432067871094, "loss": 1.2101, "rewards/accuracies": 0.0, "rewards/chosen": 1.3223621845245361, "rewards/margins": -0.8178126811981201, "rewards/rejected": 2.1401748657226562, "step": 8952 }, { "epoch": 1.45, "learning_rate": 4.2056281856382924e-07, "logits/chosen": -0.9466091394424438, "logits/rejected": -0.9707948565483093, "logps/chosen": -191.66400146484375, "logps/rejected": -112.75042724609375, "loss": 0.4651, "rewards/accuracies": 1.0, "rewards/chosen": 3.7582428455352783, "rewards/margins": 2.540797472000122, "rewards/rejected": 1.2174453735351562, "step": 8953 }, { "epoch": 1.45, "learning_rate": 4.2043306517316616e-07, "logits/chosen": -0.5479834675788879, "logits/rejected": -0.5058117508888245, "logps/chosen": -63.405208587646484, "logps/rejected": -63.80577850341797, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 1.3155078887939453, "rewards/margins": 0.03889572620391846, "rewards/rejected": 1.2766121625900269, "step": 8954 }, { "epoch": 1.45, "learning_rate": 4.203033172798255e-07, "logits/chosen": -0.8565108776092529, "logits/rejected": -0.6544122099876404, "logps/chosen": -141.146484375, "logps/rejected": -53.14809036254883, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 4.993933200836182, "rewards/margins": 3.156439781188965, "rewards/rejected": 1.8374935388565063, "step": 8955 }, { "epoch": 1.45, "learning_rate": 4.201735748927714e-07, "logits/chosen": -0.28735119104385376, "logits/rejected": -0.35508787631988525, "logps/chosen": -70.76423645019531, "logps/rejected": -62.970306396484375, "loss": 0.4067, "rewards/accuracies": 1.0, "rewards/chosen": 1.5277007818222046, "rewards/margins": 0.16227412223815918, "rewards/rejected": 1.3654266595840454, "step": 8956 }, { "epoch": 1.45, "learning_rate": 4.20043838020968e-07, "logits/chosen": -0.662584662437439, "logits/rejected": -0.7946545481681824, "logps/chosen": -99.6761474609375, "logps/rejected": -75.8401107788086, "loss": 1.3386, "rewards/accuracies": 0.0, "rewards/chosen": 0.5919235348701477, "rewards/margins": -2.602264404296875, "rewards/rejected": 3.194187879562378, "step": 8957 }, { "epoch": 1.45, "learning_rate": 4.199141066733789e-07, "logits/chosen": -0.9882648587226868, "logits/rejected": -0.9667589068412781, "logps/chosen": -142.4661865234375, "logps/rejected": -131.9686279296875, "loss": 0.5389, "rewards/accuracies": 0.0, "rewards/chosen": 1.468017578125, "rewards/margins": -0.41554415225982666, "rewards/rejected": 1.8835617303848267, "step": 8958 }, { "epoch": 1.45, "learning_rate": 4.197843808589672e-07, "logits/chosen": -1.1774593591690063, "logits/rejected": -0.7629406452178955, "logps/chosen": -130.27890014648438, "logps/rejected": -90.90362548828125, "loss": 0.9138, "rewards/accuracies": 0.0, "rewards/chosen": 1.3932541608810425, "rewards/margins": -0.5458755493164062, "rewards/rejected": 1.9391297101974487, "step": 8959 }, { "epoch": 1.45, "learning_rate": 4.196546605866957e-07, "logits/chosen": -0.589161217212677, "logits/rejected": -0.5310252904891968, "logps/chosen": -67.471923828125, "logps/rejected": -25.86121368408203, "loss": 0.3057, "rewards/accuracies": 1.0, "rewards/chosen": 2.4191606044769287, "rewards/margins": 1.2884103059768677, "rewards/rejected": 1.130750298500061, "step": 8960 }, { "epoch": 1.45, "learning_rate": 4.19524945865527e-07, "logits/chosen": -0.7687280774116516, "logits/rejected": -0.6122181415557861, "logps/chosen": -93.09904479980469, "logps/rejected": -43.79703140258789, "loss": 0.1421, "rewards/accuracies": 1.0, "rewards/chosen": 3.637197971343994, "rewards/margins": 2.0518202781677246, "rewards/rejected": 1.58537757396698, "step": 8961 }, { "epoch": 1.45, "learning_rate": 4.1939523670442313e-07, "logits/chosen": -0.3950765132904053, "logits/rejected": -0.3944718837738037, "logps/chosen": -151.84527587890625, "logps/rejected": -126.44036102294922, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.2289642095565796, "rewards/margins": 0.025023579597473145, "rewards/rejected": 1.2039406299591064, "step": 8962 }, { "epoch": 1.45, "learning_rate": 4.1926553311234565e-07, "logits/chosen": -0.8697323203086853, "logits/rejected": -0.7687986493110657, "logps/chosen": -97.61064147949219, "logps/rejected": -45.29820251464844, "loss": 0.9392, "rewards/accuracies": 1.0, "rewards/chosen": 0.8340522646903992, "rewards/margins": 0.24924236536026, "rewards/rejected": 0.5848098993301392, "step": 8963 }, { "epoch": 1.45, "learning_rate": 4.191358350982559e-07, "logits/chosen": -0.783948540687561, "logits/rejected": -0.7686856985092163, "logps/chosen": -86.10516357421875, "logps/rejected": -71.06652069091797, "loss": 0.5317, "rewards/accuracies": 1.0, "rewards/chosen": 2.569549560546875, "rewards/margins": 0.8027404546737671, "rewards/rejected": 1.766809105873108, "step": 8964 }, { "epoch": 1.46, "learning_rate": 4.190061426711148e-07, "logits/chosen": -0.7043737769126892, "logits/rejected": -0.6829280853271484, "logps/chosen": -101.48707580566406, "logps/rejected": -79.13777160644531, "loss": 0.7417, "rewards/accuracies": 0.0, "rewards/chosen": 2.498378038406372, "rewards/margins": -1.162614345550537, "rewards/rejected": 3.660992383956909, "step": 8965 }, { "epoch": 1.46, "learning_rate": 4.188764558398829e-07, "logits/chosen": -0.9862487316131592, "logits/rejected": -1.006852626800537, "logps/chosen": -91.08560180664062, "logps/rejected": -87.08685302734375, "loss": 0.806, "rewards/accuracies": 1.0, "rewards/chosen": 0.6271697878837585, "rewards/margins": 0.19776609539985657, "rewards/rejected": 0.429403692483902, "step": 8966 }, { "epoch": 1.46, "learning_rate": 4.187467746135203e-07, "logits/chosen": -0.3524477183818817, "logits/rejected": -0.3524477183818817, "logps/chosen": -53.55473327636719, "logps/rejected": -53.55473327636719, "loss": 0.4293, "rewards/accuracies": 0.0, "rewards/chosen": 2.6581101417541504, "rewards/margins": 0.0, "rewards/rejected": 2.6581101417541504, "step": 8967 }, { "epoch": 1.46, "learning_rate": 4.186170990009867e-07, "logits/chosen": -0.9703991413116455, "logits/rejected": -0.925642728805542, "logps/chosen": -87.21746063232422, "logps/rejected": -86.61874389648438, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.7630699276924133, "rewards/margins": 0.2757469117641449, "rewards/rejected": 0.48732301592826843, "step": 8968 }, { "epoch": 1.46, "learning_rate": 4.1848742901124164e-07, "logits/chosen": -1.085294485092163, "logits/rejected": -1.0700008869171143, "logps/chosen": -120.17098999023438, "logps/rejected": -74.97198486328125, "loss": 1.1576, "rewards/accuracies": 0.0, "rewards/chosen": 1.1790390014648438, "rewards/margins": -1.178037405014038, "rewards/rejected": 2.357076406478882, "step": 8969 }, { "epoch": 1.46, "learning_rate": 4.1835776465324386e-07, "logits/chosen": -0.8958755135536194, "logits/rejected": -0.7505336999893188, "logps/chosen": -108.25439453125, "logps/rejected": -93.54098510742188, "loss": 0.2815, "rewards/accuracies": 1.0, "rewards/chosen": 3.7228639125823975, "rewards/margins": 0.2962465286254883, "rewards/rejected": 3.426617383956909, "step": 8970 }, { "epoch": 1.46, "learning_rate": 4.1822810593595205e-07, "logits/chosen": -0.6024031043052673, "logits/rejected": -0.508091390132904, "logps/chosen": -77.30577087402344, "logps/rejected": -30.283843994140625, "loss": 0.8741, "rewards/accuracies": 1.0, "rewards/chosen": 2.005098819732666, "rewards/margins": 1.3453309535980225, "rewards/rejected": 0.6597679257392883, "step": 8971 }, { "epoch": 1.46, "learning_rate": 4.1809845286832435e-07, "logits/chosen": -1.1827173233032227, "logits/rejected": -1.4057186841964722, "logps/chosen": -89.519775390625, "logps/rejected": -34.29399108886719, "loss": 0.3353, "rewards/accuracies": 1.0, "rewards/chosen": 1.2397644519805908, "rewards/margins": 0.8587211966514587, "rewards/rejected": 0.3810432553291321, "step": 8972 }, { "epoch": 1.46, "learning_rate": 4.1796880545931863e-07, "logits/chosen": -0.8342461585998535, "logits/rejected": -0.8603057861328125, "logps/chosen": -58.99486541748047, "logps/rejected": -133.0248260498047, "loss": 1.8876, "rewards/accuracies": 0.0, "rewards/chosen": 2.331972599029541, "rewards/margins": -3.353713035583496, "rewards/rejected": 5.685685634613037, "step": 8973 }, { "epoch": 1.46, "learning_rate": 4.1783916371789226e-07, "logits/chosen": -1.1029218435287476, "logits/rejected": -1.2043712139129639, "logps/chosen": -255.66397094726562, "logps/rejected": -167.1021728515625, "loss": 0.2436, "rewards/accuracies": 1.0, "rewards/chosen": 5.578244209289551, "rewards/margins": 0.9753632545471191, "rewards/rejected": 4.602880954742432, "step": 8974 }, { "epoch": 1.46, "learning_rate": 4.177095276530023e-07, "logits/chosen": -0.6365829110145569, "logits/rejected": -0.7763215899467468, "logps/chosen": -73.78439331054688, "logps/rejected": -117.31402587890625, "loss": 2.1582, "rewards/accuracies": 0.0, "rewards/chosen": 1.9663467407226562, "rewards/margins": -3.9793992042541504, "rewards/rejected": 5.945745944976807, "step": 8975 }, { "epoch": 1.46, "learning_rate": 4.1757989727360526e-07, "logits/chosen": -0.615863561630249, "logits/rejected": -0.5255716443061829, "logps/chosen": -55.62837600708008, "logps/rejected": -50.69345474243164, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 1.843474268913269, "rewards/margins": 1.021005630493164, "rewards/rejected": 0.8224685788154602, "step": 8976 }, { "epoch": 1.46, "learning_rate": 4.174502725886575e-07, "logits/chosen": -1.214451551437378, "logits/rejected": -1.1819580793380737, "logps/chosen": -68.21847534179688, "logps/rejected": -44.28171920776367, "loss": 1.3554, "rewards/accuracies": 0.0, "rewards/chosen": 2.0582542419433594, "rewards/margins": -1.1834371089935303, "rewards/rejected": 3.2416913509368896, "step": 8977 }, { "epoch": 1.46, "learning_rate": 4.173206536071148e-07, "logits/chosen": -0.710375964641571, "logits/rejected": -0.6635199189186096, "logps/chosen": -107.084716796875, "logps/rejected": -56.67035675048828, "loss": 0.0717, "rewards/accuracies": 1.0, "rewards/chosen": 2.489262342453003, "rewards/margins": 2.003965377807617, "rewards/rejected": 0.4852970242500305, "step": 8978 }, { "epoch": 1.46, "learning_rate": 4.1719104033793264e-07, "logits/chosen": -0.5521311163902283, "logits/rejected": -0.5178003907203674, "logps/chosen": -67.85899353027344, "logps/rejected": -87.97803497314453, "loss": 1.4998, "rewards/accuracies": 1.0, "rewards/chosen": 2.346644639968872, "rewards/margins": 1.157710313796997, "rewards/rejected": 1.188934326171875, "step": 8979 }, { "epoch": 1.46, "learning_rate": 4.170614327900661e-07, "logits/chosen": -1.005528211593628, "logits/rejected": -1.005528211593628, "logps/chosen": -45.1624755859375, "logps/rejected": -45.1624755859375, "loss": 0.4047, "rewards/accuracies": 0.0, "rewards/chosen": 1.249153971672058, "rewards/margins": 0.0, "rewards/rejected": 1.249153971672058, "step": 8980 }, { "epoch": 1.46, "learning_rate": 4.1693183097246967e-07, "logits/chosen": -0.42144277691841125, "logits/rejected": -0.3186646103858948, "logps/chosen": -37.46088790893555, "logps/rejected": -56.719451904296875, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 2.5381276607513428, "rewards/margins": 1.5279752016067505, "rewards/rejected": 1.0101524591445923, "step": 8981 }, { "epoch": 1.46, "learning_rate": 4.168022348940978e-07, "logits/chosen": -0.6505721211433411, "logits/rejected": -0.607208788394928, "logps/chosen": -71.42101287841797, "logps/rejected": -76.88773345947266, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 1.4269462823867798, "rewards/margins": 0.15596163272857666, "rewards/rejected": 1.2709846496582031, "step": 8982 }, { "epoch": 1.46, "learning_rate": 4.1667264456390424e-07, "logits/chosen": -0.4758738875389099, "logits/rejected": -0.4758738875389099, "logps/chosen": -67.47021484375, "logps/rejected": -67.47021484375, "loss": 0.4345, "rewards/accuracies": 0.0, "rewards/chosen": 2.241504669189453, "rewards/margins": 0.0, "rewards/rejected": 2.241504669189453, "step": 8983 }, { "epoch": 1.46, "learning_rate": 4.1654305999084234e-07, "logits/chosen": -0.7577362060546875, "logits/rejected": -0.7082440853118896, "logps/chosen": -108.07599639892578, "logps/rejected": -70.77647399902344, "loss": 0.0861, "rewards/accuracies": 1.0, "rewards/chosen": 2.003643035888672, "rewards/margins": 1.8092674016952515, "rewards/rejected": 0.19437561929225922, "step": 8984 }, { "epoch": 1.46, "learning_rate": 4.164134811838655e-07, "logits/chosen": -0.6793474555015564, "logits/rejected": -0.6793474555015564, "logps/chosen": -66.38641357421875, "logps/rejected": -66.38641357421875, "loss": 0.7063, "rewards/accuracies": 0.0, "rewards/chosen": 0.9618896842002869, "rewards/margins": 0.0, "rewards/rejected": 0.9618896842002869, "step": 8985 }, { "epoch": 1.46, "learning_rate": 4.1628390815192617e-07, "logits/chosen": -0.7022143602371216, "logits/rejected": -0.6180431842803955, "logps/chosen": -115.13863372802734, "logps/rejected": -93.78556823730469, "loss": 0.2073, "rewards/accuracies": 1.0, "rewards/chosen": 5.068038463592529, "rewards/margins": 1.3693766593933105, "rewards/rejected": 3.6986618041992188, "step": 8986 }, { "epoch": 1.46, "learning_rate": 4.1615434090397664e-07, "logits/chosen": -0.7900627255439758, "logits/rejected": -0.8772136569023132, "logps/chosen": -73.46556091308594, "logps/rejected": -118.945556640625, "loss": 1.7304, "rewards/accuracies": 0.0, "rewards/chosen": 4.884228706359863, "rewards/margins": -1.07757568359375, "rewards/rejected": 5.961804389953613, "step": 8987 }, { "epoch": 1.46, "learning_rate": 4.160247794489689e-07, "logits/chosen": -0.7971056699752808, "logits/rejected": -0.8167453408241272, "logps/chosen": -122.88609313964844, "logps/rejected": -61.40137481689453, "loss": 1.2092, "rewards/accuracies": 0.0, "rewards/chosen": 0.12375564873218536, "rewards/margins": -2.264753818511963, "rewards/rejected": 2.388509511947632, "step": 8988 }, { "epoch": 1.46, "learning_rate": 4.1589522379585415e-07, "logits/chosen": -0.5553614497184753, "logits/rejected": -0.523911714553833, "logps/chosen": -37.468421936035156, "logps/rejected": -4.811740398406982, "loss": 1.0265, "rewards/accuracies": 0.0, "rewards/chosen": 0.33740997314453125, "rewards/margins": -0.05953440070152283, "rewards/rejected": 0.3969443738460541, "step": 8989 }, { "epoch": 1.46, "learning_rate": 4.1576567395358376e-07, "logits/chosen": -0.4386695921421051, "logits/rejected": -0.36969777941703796, "logps/chosen": -54.27058410644531, "logps/rejected": -67.55439758300781, "loss": 1.9502, "rewards/accuracies": 1.0, "rewards/chosen": 1.3154197931289673, "rewards/margins": 0.3183441162109375, "rewards/rejected": 0.9970756769180298, "step": 8990 }, { "epoch": 1.46, "learning_rate": 4.156361299311082e-07, "logits/chosen": -0.6039385795593262, "logits/rejected": -0.5024566650390625, "logps/chosen": -101.67052459716797, "logps/rejected": -39.25833511352539, "loss": 0.2362, "rewards/accuracies": 1.0, "rewards/chosen": 0.8667594790458679, "rewards/margins": 0.9795150756835938, "rewards/rejected": -0.11275558918714523, "step": 8991 }, { "epoch": 1.46, "learning_rate": 4.155065917373778e-07, "logits/chosen": -0.8458782434463501, "logits/rejected": -1.0264443159103394, "logps/chosen": -131.89102172851562, "logps/rejected": -120.58674621582031, "loss": 1.7168, "rewards/accuracies": 0.0, "rewards/chosen": 1.96391761302948, "rewards/margins": -3.09928560256958, "rewards/rejected": 5.06320333480835, "step": 8992 }, { "epoch": 1.46, "learning_rate": 4.1537705938134247e-07, "logits/chosen": -0.8392694592475891, "logits/rejected": -0.697098970413208, "logps/chosen": -100.6817626953125, "logps/rejected": -36.13098907470703, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 2.225119113922119, "rewards/margins": 2.0156967639923096, "rewards/rejected": 0.209422305226326, "step": 8993 }, { "epoch": 1.46, "learning_rate": 4.152475328719516e-07, "logits/chosen": -0.4382507801055908, "logits/rejected": -0.5080448389053345, "logps/chosen": -117.13269805908203, "logps/rejected": -113.59317016601562, "loss": 1.9999, "rewards/accuracies": 0.0, "rewards/chosen": 1.2659904956817627, "rewards/margins": -3.977762460708618, "rewards/rejected": 5.243752956390381, "step": 8994 }, { "epoch": 1.46, "learning_rate": 4.1511801221815436e-07, "logits/chosen": -0.37291496992111206, "logits/rejected": -0.3080924451351166, "logps/chosen": -24.769012451171875, "logps/rejected": -4.112031936645508, "loss": 0.2783, "rewards/accuracies": 1.0, "rewards/chosen": 0.7705551385879517, "rewards/margins": 0.30417290329933167, "rewards/rejected": 0.46638223528862, "step": 8995 }, { "epoch": 1.46, "learning_rate": 4.1498849742889925e-07, "logits/chosen": -1.3368444442749023, "logits/rejected": -1.3540725708007812, "logps/chosen": -99.96919250488281, "logps/rejected": -40.22679901123047, "loss": 2.2755, "rewards/accuracies": 1.0, "rewards/chosen": 1.4135932922363281, "rewards/margins": 1.1429065465927124, "rewards/rejected": 0.27068671584129333, "step": 8996 }, { "epoch": 1.46, "learning_rate": 4.1485898851313455e-07, "logits/chosen": -0.8239641189575195, "logits/rejected": -0.8782112002372742, "logps/chosen": -54.622379302978516, "logps/rejected": -104.52265930175781, "loss": 1.4231, "rewards/accuracies": 0.0, "rewards/chosen": 2.122610092163086, "rewards/margins": -1.7939929962158203, "rewards/rejected": 3.9166030883789062, "step": 8997 }, { "epoch": 1.46, "learning_rate": 4.147294854798081e-07, "logits/chosen": -0.5988284945487976, "logits/rejected": -0.5751636624336243, "logps/chosen": -100.54579162597656, "logps/rejected": -106.35094451904297, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 0.4890884459018707, "rewards/margins": 0.08642807602882385, "rewards/rejected": 0.4026603698730469, "step": 8998 }, { "epoch": 1.46, "learning_rate": 4.1459998833786744e-07, "logits/chosen": -0.5243982672691345, "logits/rejected": -0.43126460909843445, "logps/chosen": -64.7239761352539, "logps/rejected": -55.30145263671875, "loss": 1.7336, "rewards/accuracies": 0.0, "rewards/chosen": 2.2453629970550537, "rewards/margins": -0.5056822299957275, "rewards/rejected": 2.7510452270507812, "step": 8999 }, { "epoch": 1.46, "learning_rate": 4.144704970962596e-07, "logits/chosen": -0.818384051322937, "logits/rejected": -0.7727527618408203, "logps/chosen": -69.55577087402344, "logps/rejected": -53.93524932861328, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 2.4593262672424316, "rewards/margins": 1.1132690906524658, "rewards/rejected": 1.3460571765899658, "step": 9000 }, { "epoch": 1.46, "learning_rate": 4.1434101176393097e-07, "logits/chosen": -0.7530348300933838, "logits/rejected": -0.6750343441963196, "logps/chosen": -140.90040588378906, "logps/rejected": -93.65478515625, "loss": 0.0585, "rewards/accuracies": 1.0, "rewards/chosen": 4.706111431121826, "rewards/margins": 2.265747308731079, "rewards/rejected": 2.440364122390747, "step": 9001 }, { "epoch": 1.46, "learning_rate": 4.14211532349828e-07, "logits/chosen": -0.7149673104286194, "logits/rejected": -0.6887895464897156, "logps/chosen": -76.75871276855469, "logps/rejected": -70.69332885742188, "loss": 0.9777, "rewards/accuracies": 0.0, "rewards/chosen": 1.722480058670044, "rewards/margins": -1.7174575328826904, "rewards/rejected": 3.4399375915527344, "step": 9002 }, { "epoch": 1.46, "learning_rate": 4.1408205886289635e-07, "logits/chosen": -0.6138072609901428, "logits/rejected": -0.4816499650478363, "logps/chosen": -75.74590301513672, "logps/rejected": -23.64919090270996, "loss": 0.1945, "rewards/accuracies": 1.0, "rewards/chosen": 1.594001054763794, "rewards/margins": 1.443524956703186, "rewards/rejected": 0.15047608315944672, "step": 9003 }, { "epoch": 1.46, "learning_rate": 4.139525913120815e-07, "logits/chosen": -0.7989681363105774, "logits/rejected": -0.7812016010284424, "logps/chosen": -57.74943923950195, "logps/rejected": -89.12188720703125, "loss": 0.2201, "rewards/accuracies": 1.0, "rewards/chosen": 2.9463956356048584, "rewards/margins": 0.6391339302062988, "rewards/rejected": 2.3072617053985596, "step": 9004 }, { "epoch": 1.46, "learning_rate": 4.1382312970632844e-07, "logits/chosen": -1.016215205192566, "logits/rejected": -0.9257537126541138, "logps/chosen": -131.20855712890625, "logps/rejected": -62.21059799194336, "loss": 0.218, "rewards/accuracies": 1.0, "rewards/chosen": 2.6180176734924316, "rewards/margins": 0.7970196008682251, "rewards/rejected": 1.8209980726242065, "step": 9005 }, { "epoch": 1.46, "learning_rate": 4.1369367405458167e-07, "logits/chosen": -0.7408359050750732, "logits/rejected": -0.7408359050750732, "logps/chosen": -67.76729583740234, "logps/rejected": -67.76729583740234, "loss": 0.4917, "rewards/accuracies": 0.0, "rewards/chosen": 2.3756942749023438, "rewards/margins": 0.0, "rewards/rejected": 2.3756942749023438, "step": 9006 }, { "epoch": 1.46, "learning_rate": 4.135642243657854e-07, "logits/chosen": -0.5056148767471313, "logits/rejected": -0.38450804352760315, "logps/chosen": -38.979042053222656, "logps/rejected": -10.28656005859375, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 1.0953384637832642, "rewards/margins": 0.759524941444397, "rewards/rejected": 0.3358135223388672, "step": 9007 }, { "epoch": 1.46, "learning_rate": 4.134347806488833e-07, "logits/chosen": -0.49495673179626465, "logits/rejected": -0.4561907649040222, "logps/chosen": -56.92987823486328, "logps/rejected": -52.24881362915039, "loss": 0.5155, "rewards/accuracies": 0.0, "rewards/chosen": 1.2633682489395142, "rewards/margins": -0.445023775100708, "rewards/rejected": 1.7083920240402222, "step": 9008 }, { "epoch": 1.46, "learning_rate": 4.133053429128188e-07, "logits/chosen": -0.3950340747833252, "logits/rejected": -0.39205625653266907, "logps/chosen": -11.988863945007324, "logps/rejected": -7.922850131988525, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 0.9604304432868958, "rewards/margins": 0.005190551280975342, "rewards/rejected": 0.9552398920059204, "step": 9009 }, { "epoch": 1.46, "learning_rate": 4.131759111665348e-07, "logits/chosen": -0.8004940748214722, "logits/rejected": -0.7460910081863403, "logps/chosen": -125.06232452392578, "logps/rejected": -38.90232849121094, "loss": 0.5313, "rewards/accuracies": 1.0, "rewards/chosen": 1.174691081047058, "rewards/margins": 0.5920273661613464, "rewards/rejected": 0.5826637148857117, "step": 9010 }, { "epoch": 1.46, "learning_rate": 4.1304648541897385e-07, "logits/chosen": -0.3780435025691986, "logits/rejected": -0.46856531500816345, "logps/chosen": -85.46708679199219, "logps/rejected": -96.57173156738281, "loss": 1.3891, "rewards/accuracies": 0.0, "rewards/chosen": 1.4522705078125, "rewards/margins": -1.1251189708709717, "rewards/rejected": 2.5773894786834717, "step": 9011 }, { "epoch": 1.46, "learning_rate": 4.129170656790779e-07, "logits/chosen": -0.5636153817176819, "logits/rejected": -0.5147115588188171, "logps/chosen": -37.51599884033203, "logps/rejected": -75.48007202148438, "loss": 0.8011, "rewards/accuracies": 0.0, "rewards/chosen": 1.6151989698410034, "rewards/margins": -1.1640468835830688, "rewards/rejected": 2.7792458534240723, "step": 9012 }, { "epoch": 1.46, "learning_rate": 4.127876519557888e-07, "logits/chosen": -0.7317179441452026, "logits/rejected": -0.6755772829055786, "logps/chosen": -63.188541412353516, "logps/rejected": -78.0899887084961, "loss": 0.3595, "rewards/accuracies": 1.0, "rewards/chosen": 2.28135347366333, "rewards/margins": 0.08407402038574219, "rewards/rejected": 2.197279453277588, "step": 9013 }, { "epoch": 1.46, "learning_rate": 4.126582442580477e-07, "logits/chosen": -0.6010626554489136, "logits/rejected": -0.48942551016807556, "logps/chosen": -65.91230010986328, "logps/rejected": -63.016075134277344, "loss": 0.2068, "rewards/accuracies": 1.0, "rewards/chosen": 3.6975486278533936, "rewards/margins": 1.4249067306518555, "rewards/rejected": 2.272641897201538, "step": 9014 }, { "epoch": 1.46, "learning_rate": 4.125288425947955e-07, "logits/chosen": -0.9998763203620911, "logits/rejected": -0.724213719367981, "logps/chosen": -118.5414047241211, "logps/rejected": -102.37298583984375, "loss": 0.1068, "rewards/accuracies": 1.0, "rewards/chosen": 3.799837589263916, "rewards/margins": 2.0334906578063965, "rewards/rejected": 1.76634681224823, "step": 9015 }, { "epoch": 1.46, "learning_rate": 4.1239944697497263e-07, "logits/chosen": -0.41820213198661804, "logits/rejected": -0.4142719507217407, "logps/chosen": -38.105255126953125, "logps/rejected": -49.81244659423828, "loss": 0.7159, "rewards/accuracies": 1.0, "rewards/chosen": 0.8285682797431946, "rewards/margins": 0.18169784545898438, "rewards/rejected": 0.6468704342842102, "step": 9016 }, { "epoch": 1.46, "learning_rate": 4.122700574075191e-07, "logits/chosen": -0.7182180285453796, "logits/rejected": -0.6645215749740601, "logps/chosen": -174.7566375732422, "logps/rejected": -51.8167724609375, "loss": 0.0309, "rewards/accuracies": 1.0, "rewards/chosen": 6.280052185058594, "rewards/margins": 3.774531602859497, "rewards/rejected": 2.5055205821990967, "step": 9017 }, { "epoch": 1.46, "learning_rate": 4.1214067390137455e-07, "logits/chosen": -0.5626064538955688, "logits/rejected": -0.5716273188591003, "logps/chosen": -46.827735900878906, "logps/rejected": -72.68915557861328, "loss": 0.2832, "rewards/accuracies": 1.0, "rewards/chosen": 2.3064072132110596, "rewards/margins": 0.6159034967422485, "rewards/rejected": 1.690503716468811, "step": 9018 }, { "epoch": 1.46, "learning_rate": 4.1201129646547794e-07, "logits/chosen": -0.5034898519515991, "logits/rejected": -0.4596770703792572, "logps/chosen": -254.7969970703125, "logps/rejected": -52.93077087402344, "loss": 0.1096, "rewards/accuracies": 1.0, "rewards/chosen": 5.477670192718506, "rewards/margins": 2.1094634532928467, "rewards/rejected": 3.368206739425659, "step": 9019 }, { "epoch": 1.46, "learning_rate": 4.118819251087682e-07, "logits/chosen": -1.0574026107788086, "logits/rejected": -0.9856706261634827, "logps/chosen": -68.14461517333984, "logps/rejected": -30.71015739440918, "loss": 0.8036, "rewards/accuracies": 1.0, "rewards/chosen": 2.3906471729278564, "rewards/margins": 1.922975778579712, "rewards/rejected": 0.46767139434814453, "step": 9020 }, { "epoch": 1.46, "learning_rate": 4.1175255984018377e-07, "logits/chosen": -0.6082541346549988, "logits/rejected": -0.6678606867790222, "logps/chosen": -27.586090087890625, "logps/rejected": -46.39719009399414, "loss": 0.8339, "rewards/accuracies": 0.0, "rewards/chosen": 1.2952152490615845, "rewards/margins": -1.3304160833358765, "rewards/rejected": 2.625631332397461, "step": 9021 }, { "epoch": 1.46, "learning_rate": 4.1162320066866236e-07, "logits/chosen": -0.8395767211914062, "logits/rejected": -0.8834335803985596, "logps/chosen": -54.27642059326172, "logps/rejected": -62.66734313964844, "loss": 0.9378, "rewards/accuracies": 0.0, "rewards/chosen": 1.8939964771270752, "rewards/margins": -0.5274658203125, "rewards/rejected": 2.421462297439575, "step": 9022 }, { "epoch": 1.46, "learning_rate": 4.114938476031416e-07, "logits/chosen": -0.5726124048233032, "logits/rejected": -0.5165255069732666, "logps/chosen": -79.87043762207031, "logps/rejected": -43.60501480102539, "loss": 0.6294, "rewards/accuracies": 0.0, "rewards/chosen": 1.141871690750122, "rewards/margins": -0.07630264759063721, "rewards/rejected": 1.2181743383407593, "step": 9023 }, { "epoch": 1.46, "learning_rate": 4.113645006525585e-07, "logits/chosen": -0.766545295715332, "logits/rejected": -0.6632651686668396, "logps/chosen": -92.62445068359375, "logps/rejected": -62.06338119506836, "loss": 1.2267, "rewards/accuracies": 0.0, "rewards/chosen": 1.8965896368026733, "rewards/margins": -0.025268197059631348, "rewards/rejected": 1.9218578338623047, "step": 9024 }, { "epoch": 1.46, "learning_rate": 4.112351598258498e-07, "logits/chosen": -0.8165294528007507, "logits/rejected": -0.6678172945976257, "logps/chosen": -104.76943969726562, "logps/rejected": -42.03474044799805, "loss": 0.0446, "rewards/accuracies": 1.0, "rewards/chosen": 4.317090034484863, "rewards/margins": 4.0382490158081055, "rewards/rejected": 0.2788410186767578, "step": 9025 }, { "epoch": 1.47, "learning_rate": 4.1110582513195156e-07, "logits/chosen": -1.2336722612380981, "logits/rejected": -0.8630671501159668, "logps/chosen": -206.40892028808594, "logps/rejected": -28.677734375, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": 5.751747131347656, "rewards/margins": 5.452269554138184, "rewards/rejected": 0.29947739839553833, "step": 9026 }, { "epoch": 1.47, "learning_rate": 4.109764965797997e-07, "logits/chosen": -0.6266915202140808, "logits/rejected": -0.6266915202140808, "logps/chosen": -22.420316696166992, "logps/rejected": -22.420316696166992, "loss": 0.8943, "rewards/accuracies": 0.0, "rewards/chosen": 0.7396535873413086, "rewards/margins": 0.0, "rewards/rejected": 0.7396535873413086, "step": 9027 }, { "epoch": 1.47, "learning_rate": 4.1084717417832963e-07, "logits/chosen": -0.6546446084976196, "logits/rejected": -0.6546446084976196, "logps/chosen": -37.929927825927734, "logps/rejected": -37.929927825927734, "loss": 0.3588, "rewards/accuracies": 0.0, "rewards/chosen": 0.7930805087089539, "rewards/margins": 0.0, "rewards/rejected": 0.7930805087089539, "step": 9028 }, { "epoch": 1.47, "learning_rate": 4.107178579364762e-07, "logits/chosen": -0.7099127173423767, "logits/rejected": -0.6768242716789246, "logps/chosen": -57.15531921386719, "logps/rejected": -51.39805603027344, "loss": 0.2785, "rewards/accuracies": 1.0, "rewards/chosen": 2.867169141769409, "rewards/margins": 0.3043212890625, "rewards/rejected": 2.562847852706909, "step": 9029 }, { "epoch": 1.47, "learning_rate": 4.1058854786317406e-07, "logits/chosen": -0.6962305903434753, "logits/rejected": -0.6322013139724731, "logps/chosen": -56.21263122558594, "logps/rejected": -90.17037200927734, "loss": 0.7866, "rewards/accuracies": 0.0, "rewards/chosen": 0.974560558795929, "rewards/margins": -1.3400521278381348, "rewards/rejected": 2.314612627029419, "step": 9030 }, { "epoch": 1.47, "learning_rate": 4.104592439673572e-07, "logits/chosen": -0.42082417011260986, "logits/rejected": -0.2964862287044525, "logps/chosen": -50.175960540771484, "logps/rejected": -15.675320625305176, "loss": 0.1, "rewards/accuracies": 1.0, "rewards/chosen": 2.2907567024230957, "rewards/margins": 2.293363094329834, "rewards/rejected": -0.0026064873673021793, "step": 9031 }, { "epoch": 1.47, "learning_rate": 4.103299462579593e-07, "logits/chosen": -0.6973066926002502, "logits/rejected": -0.6008632779121399, "logps/chosen": -46.06782531738281, "logps/rejected": -18.46538543701172, "loss": 0.2336, "rewards/accuracies": 1.0, "rewards/chosen": 1.1323654651641846, "rewards/margins": 0.8054226636886597, "rewards/rejected": 0.3269428312778473, "step": 9032 }, { "epoch": 1.47, "learning_rate": 4.1020065474391367e-07, "logits/chosen": -0.4017272889614105, "logits/rejected": -0.4017272889614105, "logps/chosen": -45.187679290771484, "logps/rejected": -45.187679290771484, "loss": 0.4008, "rewards/accuracies": 0.0, "rewards/chosen": 0.7214061617851257, "rewards/margins": 0.0, "rewards/rejected": 0.7214061617851257, "step": 9033 }, { "epoch": 1.47, "learning_rate": 4.1007136943415315e-07, "logits/chosen": -0.4574786424636841, "logits/rejected": -0.4547300636768341, "logps/chosen": -2.1996889114379883, "logps/rejected": -8.87035846710205, "loss": 1.4963, "rewards/accuracies": 0.0, "rewards/chosen": 0.22344909608364105, "rewards/margins": -0.08286814391613007, "rewards/rejected": 0.3063172399997711, "step": 9034 }, { "epoch": 1.47, "learning_rate": 4.0994209033761006e-07, "logits/chosen": -0.6887506246566772, "logits/rejected": -0.6522114872932434, "logps/chosen": -152.2874298095703, "logps/rejected": -206.68539428710938, "loss": 0.8644, "rewards/accuracies": 0.0, "rewards/chosen": 2.6144073009490967, "rewards/margins": -1.4657046794891357, "rewards/rejected": 4.080111980438232, "step": 9035 }, { "epoch": 1.47, "learning_rate": 4.0981281746321637e-07, "logits/chosen": -0.6683558225631714, "logits/rejected": -0.6888191103935242, "logps/chosen": -16.164981842041016, "logps/rejected": -2.2846157550811768, "loss": 0.6343, "rewards/accuracies": 0.0, "rewards/chosen": -0.18356285989284515, "rewards/margins": -0.491335391998291, "rewards/rejected": 0.30777251720428467, "step": 9036 }, { "epoch": 1.47, "learning_rate": 4.096835508199037e-07, "logits/chosen": -0.6623834371566772, "logits/rejected": -0.6883392930030823, "logps/chosen": -70.89940643310547, "logps/rejected": -89.39702606201172, "loss": 0.2995, "rewards/accuracies": 1.0, "rewards/chosen": 1.9491432905197144, "rewards/margins": 0.23957371711730957, "rewards/rejected": 1.7095695734024048, "step": 9037 }, { "epoch": 1.47, "learning_rate": 4.0955429041660305e-07, "logits/chosen": -0.8283186554908752, "logits/rejected": -0.6183528304100037, "logps/chosen": -152.63861083984375, "logps/rejected": -19.1990909576416, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 4.4374237060546875, "rewards/margins": 3.660543918609619, "rewards/rejected": 0.7768799066543579, "step": 9038 }, { "epoch": 1.47, "learning_rate": 4.0942503626224513e-07, "logits/chosen": -1.280263900756836, "logits/rejected": -1.2026957273483276, "logps/chosen": -60.25188446044922, "logps/rejected": -74.65361785888672, "loss": 1.2633, "rewards/accuracies": 0.0, "rewards/chosen": 2.6972146034240723, "rewards/margins": -2.420759677886963, "rewards/rejected": 5.117974281311035, "step": 9039 }, { "epoch": 1.47, "learning_rate": 4.092957883657603e-07, "logits/chosen": -0.5826462507247925, "logits/rejected": -0.5833044648170471, "logps/chosen": -9.47367000579834, "logps/rejected": -2.8315107822418213, "loss": 0.4782, "rewards/accuracies": 0.0, "rewards/chosen": 0.05935230478644371, "rewards/margins": -0.34880152344703674, "rewards/rejected": 0.40815383195877075, "step": 9040 }, { "epoch": 1.47, "learning_rate": 4.091665467360781e-07, "logits/chosen": -1.1091594696044922, "logits/rejected": -0.9805023074150085, "logps/chosen": -159.5147705078125, "logps/rejected": -101.80017852783203, "loss": 0.4814, "rewards/accuracies": 1.0, "rewards/chosen": 5.522075176239014, "rewards/margins": 1.6448938846588135, "rewards/rejected": 3.8771812915802, "step": 9041 }, { "epoch": 1.47, "learning_rate": 4.0903731138212806e-07, "logits/chosen": -0.5508765578269958, "logits/rejected": -0.5919551253318787, "logps/chosen": -45.783836364746094, "logps/rejected": -95.89649963378906, "loss": 0.2595, "rewards/accuracies": 1.0, "rewards/chosen": 1.1760540008544922, "rewards/margins": 0.39763450622558594, "rewards/rejected": 0.7784194946289062, "step": 9042 }, { "epoch": 1.47, "learning_rate": 4.089080823128391e-07, "logits/chosen": -0.8554225564002991, "logits/rejected": -0.8825320601463318, "logps/chosen": -127.1124267578125, "logps/rejected": -116.70069885253906, "loss": 0.5758, "rewards/accuracies": 0.0, "rewards/chosen": 1.5445770025253296, "rewards/margins": -0.7466858625411987, "rewards/rejected": 2.2912628650665283, "step": 9043 }, { "epoch": 1.47, "learning_rate": 4.0877885953713967e-07, "logits/chosen": -0.8832389712333679, "logits/rejected": -0.904634416103363, "logps/chosen": -104.31024169921875, "logps/rejected": -175.37326049804688, "loss": 3.5864, "rewards/accuracies": 0.0, "rewards/chosen": 2.9027786254882812, "rewards/margins": -4.550276279449463, "rewards/rejected": 7.453054904937744, "step": 9044 }, { "epoch": 1.47, "learning_rate": 4.08649643063958e-07, "logits/chosen": -0.626697838306427, "logits/rejected": -0.4293166399002075, "logps/chosen": -104.37893676757812, "logps/rejected": -48.75507736206055, "loss": 0.5428, "rewards/accuracies": 0.0, "rewards/chosen": 0.7507354617118835, "rewards/margins": -0.04276585578918457, "rewards/rejected": 0.7935013175010681, "step": 9045 }, { "epoch": 1.47, "learning_rate": 4.085204329022216e-07, "logits/chosen": -0.550828218460083, "logits/rejected": -0.33629292249679565, "logps/chosen": -107.62847900390625, "logps/rejected": -43.06709289550781, "loss": 1.3833, "rewards/accuracies": 1.0, "rewards/chosen": 1.8834656476974487, "rewards/margins": 1.3926124572753906, "rewards/rejected": 0.49085313081741333, "step": 9046 }, { "epoch": 1.47, "learning_rate": 4.083912290608577e-07, "logits/chosen": -0.7067597508430481, "logits/rejected": -0.6159966588020325, "logps/chosen": -107.82701110839844, "logps/rejected": -107.44825744628906, "loss": 1.3161, "rewards/accuracies": 0.0, "rewards/chosen": 0.5337554812431335, "rewards/margins": -2.4952774047851562, "rewards/rejected": 3.0290329456329346, "step": 9047 }, { "epoch": 1.47, "learning_rate": 4.0826203154879306e-07, "logits/chosen": -0.6734803915023804, "logits/rejected": -0.5426070690155029, "logps/chosen": -54.562496185302734, "logps/rejected": -72.51300048828125, "loss": 0.9426, "rewards/accuracies": 1.0, "rewards/chosen": 1.612833023071289, "rewards/margins": 0.4696735143661499, "rewards/rejected": 1.1431595087051392, "step": 9048 }, { "epoch": 1.47, "learning_rate": 4.08132840374954e-07, "logits/chosen": -0.8669558763504028, "logits/rejected": -0.9110099673271179, "logps/chosen": -161.04031372070312, "logps/rejected": -133.94595336914062, "loss": 1.8209, "rewards/accuracies": 0.0, "rewards/chosen": 2.4290711879730225, "rewards/margins": -3.611825704574585, "rewards/rejected": 6.040896892547607, "step": 9049 }, { "epoch": 1.47, "learning_rate": 4.0800365554826645e-07, "logits/chosen": -0.6705454587936401, "logits/rejected": -0.6110395789146423, "logps/chosen": -80.42105102539062, "logps/rejected": -101.74927520751953, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 5.932278633117676, "rewards/margins": 1.7286386489868164, "rewards/rejected": 4.203639984130859, "step": 9050 }, { "epoch": 1.47, "learning_rate": 4.0787447707765577e-07, "logits/chosen": -0.4124366044998169, "logits/rejected": -0.4321536719799042, "logps/chosen": -24.05848503112793, "logps/rejected": -43.61760330200195, "loss": 0.8658, "rewards/accuracies": 1.0, "rewards/chosen": 0.6779980063438416, "rewards/margins": 0.5451362729072571, "rewards/rejected": 0.13286171853542328, "step": 9051 }, { "epoch": 1.47, "learning_rate": 4.0774530497204706e-07, "logits/chosen": -0.4632437527179718, "logits/rejected": -0.45957422256469727, "logps/chosen": -97.5895004272461, "logps/rejected": -57.64519500732422, "loss": 0.0687, "rewards/accuracies": 1.0, "rewards/chosen": 4.028878211975098, "rewards/margins": 2.298672676086426, "rewards/rejected": 1.7302055358886719, "step": 9052 }, { "epoch": 1.47, "learning_rate": 4.076161392403649e-07, "logits/chosen": -0.8052428960800171, "logits/rejected": -0.4971664547920227, "logps/chosen": -138.8482208251953, "logps/rejected": -112.23876953125, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": 4.209971904754639, "rewards/margins": 1.0409777164459229, "rewards/rejected": 3.168994188308716, "step": 9053 }, { "epoch": 1.47, "learning_rate": 4.0748697989153326e-07, "logits/chosen": -0.4483944773674011, "logits/rejected": -0.3822961747646332, "logps/chosen": -44.15772247314453, "logps/rejected": -54.06349182128906, "loss": 0.4401, "rewards/accuracies": 0.0, "rewards/chosen": 1.6968529224395752, "rewards/margins": -0.13076698780059814, "rewards/rejected": 1.8276199102401733, "step": 9054 }, { "epoch": 1.47, "learning_rate": 4.07357826934476e-07, "logits/chosen": -0.70647794008255, "logits/rejected": -0.7006745934486389, "logps/chosen": -62.56844711303711, "logps/rejected": -75.73641967773438, "loss": 0.8369, "rewards/accuracies": 0.0, "rewards/chosen": 1.5136280059814453, "rewards/margins": -0.4021381139755249, "rewards/rejected": 1.9157661199569702, "step": 9055 }, { "epoch": 1.47, "learning_rate": 4.072286803781163e-07, "logits/chosen": -0.8847244381904602, "logits/rejected": -0.6583537459373474, "logps/chosen": -111.08416748046875, "logps/rejected": -31.148883819580078, "loss": 0.0255, "rewards/accuracies": 1.0, "rewards/chosen": 3.1354920864105225, "rewards/margins": 2.955151081085205, "rewards/rejected": 0.1803409606218338, "step": 9056 }, { "epoch": 1.47, "learning_rate": 4.07099540231377e-07, "logits/chosen": -0.9858579635620117, "logits/rejected": -1.0066745281219482, "logps/chosen": -161.05197143554688, "logps/rejected": -218.62957763671875, "loss": 0.5927, "rewards/accuracies": 0.0, "rewards/chosen": 5.522915840148926, "rewards/margins": -0.7754635810852051, "rewards/rejected": 6.298379421234131, "step": 9057 }, { "epoch": 1.47, "learning_rate": 4.0697040650318044e-07, "logits/chosen": -1.097306728363037, "logits/rejected": -1.0892250537872314, "logps/chosen": -41.102989196777344, "logps/rejected": -9.58243179321289, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 0.969073474407196, "rewards/margins": 0.4447789788246155, "rewards/rejected": 0.5242944955825806, "step": 9058 }, { "epoch": 1.47, "learning_rate": 4.068412792024485e-07, "logits/chosen": -0.524837851524353, "logits/rejected": -0.524837851524353, "logps/chosen": -3.1965537071228027, "logps/rejected": -3.1965537071228027, "loss": 0.8927, "rewards/accuracies": 0.0, "rewards/chosen": 1.3634699583053589, "rewards/margins": 0.0, "rewards/rejected": 1.3634699583053589, "step": 9059 }, { "epoch": 1.47, "learning_rate": 4.067121583381027e-07, "logits/chosen": -0.8220536708831787, "logits/rejected": -0.7979562282562256, "logps/chosen": -90.82437896728516, "logps/rejected": -47.286659240722656, "loss": 0.3807, "rewards/accuracies": 0.0, "rewards/chosen": 1.9827438592910767, "rewards/margins": -0.03802025318145752, "rewards/rejected": 2.020764112472534, "step": 9060 }, { "epoch": 1.47, "learning_rate": 4.0658304391906405e-07, "logits/chosen": -0.7828171849250793, "logits/rejected": -0.7250691056251526, "logps/chosen": -99.56359100341797, "logps/rejected": -71.8709716796875, "loss": 0.4212, "rewards/accuracies": 1.0, "rewards/chosen": 1.8874610662460327, "rewards/margins": 0.10888290405273438, "rewards/rejected": 1.7785781621932983, "step": 9061 }, { "epoch": 1.47, "learning_rate": 4.064539359542532e-07, "logits/chosen": -0.6966555118560791, "logits/rejected": -0.5909314751625061, "logps/chosen": -109.85688018798828, "logps/rejected": -83.57743835449219, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 2.966829776763916, "rewards/margins": 1.4032219648361206, "rewards/rejected": 1.5636078119277954, "step": 9062 }, { "epoch": 1.47, "learning_rate": 4.063248344525902e-07, "logits/chosen": -0.3887943625450134, "logits/rejected": -0.3703085780143738, "logps/chosen": -77.68208312988281, "logps/rejected": -98.68013000488281, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 1.4190078973770142, "rewards/margins": 0.3664054870605469, "rewards/rejected": 1.0526024103164673, "step": 9063 }, { "epoch": 1.47, "learning_rate": 4.061957394229948e-07, "logits/chosen": -0.7373276352882385, "logits/rejected": -0.7373276352882385, "logps/chosen": -47.18098068237305, "logps/rejected": -47.18098068237305, "loss": 0.6743, "rewards/accuracies": 0.0, "rewards/chosen": 3.193666458129883, "rewards/margins": 0.0, "rewards/rejected": 3.193666458129883, "step": 9064 }, { "epoch": 1.47, "learning_rate": 4.060666508743862e-07, "logits/chosen": -0.791928768157959, "logits/rejected": -0.6527336239814758, "logps/chosen": -107.67755889892578, "logps/rejected": -57.741172790527344, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 6.37465763092041, "rewards/margins": 3.8686234951019287, "rewards/rejected": 2.5060341358184814, "step": 9065 }, { "epoch": 1.47, "learning_rate": 4.059375688156832e-07, "logits/chosen": -0.5477898716926575, "logits/rejected": -0.5132866501808167, "logps/chosen": -37.52785110473633, "logps/rejected": -56.49770736694336, "loss": 1.0037, "rewards/accuracies": 0.0, "rewards/chosen": 2.4404561519622803, "rewards/margins": -0.42100048065185547, "rewards/rejected": 2.8614566326141357, "step": 9066 }, { "epoch": 1.47, "learning_rate": 4.058084932558042e-07, "logits/chosen": -0.8042612671852112, "logits/rejected": -0.8573605418205261, "logps/chosen": -86.80729675292969, "logps/rejected": -102.89514923095703, "loss": 0.6088, "rewards/accuracies": 0.0, "rewards/chosen": 2.745854139328003, "rewards/margins": -0.812079668045044, "rewards/rejected": 3.557933807373047, "step": 9067 }, { "epoch": 1.47, "learning_rate": 4.0567942420366705e-07, "logits/chosen": -1.2926498651504517, "logits/rejected": -1.2473098039627075, "logps/chosen": -62.2548828125, "logps/rejected": -35.16595458984375, "loss": 0.5324, "rewards/accuracies": 1.0, "rewards/chosen": 2.6209304332733154, "rewards/margins": 2.368328809738159, "rewards/rejected": 0.25260162353515625, "step": 9068 }, { "epoch": 1.47, "learning_rate": 4.055503616681892e-07, "logits/chosen": -1.1335405111312866, "logits/rejected": -1.1284576654434204, "logps/chosen": -82.45912170410156, "logps/rejected": -102.59220123291016, "loss": 1.6816, "rewards/accuracies": 0.0, "rewards/chosen": 4.306020259857178, "rewards/margins": -1.18184232711792, "rewards/rejected": 5.487862586975098, "step": 9069 }, { "epoch": 1.47, "learning_rate": 4.054213056582877e-07, "logits/chosen": -0.6707059741020203, "logits/rejected": -0.6707059741020203, "logps/chosen": -97.20890045166016, "logps/rejected": -97.20890045166016, "loss": 0.3519, "rewards/accuracies": 0.0, "rewards/chosen": 1.7601760625839233, "rewards/margins": 0.0, "rewards/rejected": 1.7601760625839233, "step": 9070 }, { "epoch": 1.47, "learning_rate": 4.052922561828791e-07, "logits/chosen": -0.587710976600647, "logits/rejected": -0.4959185719490051, "logps/chosen": -55.985816955566406, "logps/rejected": -98.63822174072266, "loss": 0.1761, "rewards/accuracies": 1.0, "rewards/chosen": 2.273571014404297, "rewards/margins": 0.8657699823379517, "rewards/rejected": 1.4078010320663452, "step": 9071 }, { "epoch": 1.47, "learning_rate": 4.051632132508794e-07, "logits/chosen": -0.5009031295776367, "logits/rejected": -0.6101566553115845, "logps/chosen": -100.36390686035156, "logps/rejected": -137.84555053710938, "loss": 0.9854, "rewards/accuracies": 0.0, "rewards/chosen": 1.7763077020645142, "rewards/margins": -1.7721420526504517, "rewards/rejected": 3.548449754714966, "step": 9072 }, { "epoch": 1.47, "learning_rate": 4.0503417687120433e-07, "logits/chosen": -0.5028899908065796, "logits/rejected": -0.6180834174156189, "logps/chosen": -95.88096618652344, "logps/rejected": -113.46501159667969, "loss": 1.3213, "rewards/accuracies": 0.0, "rewards/chosen": 2.0768158435821533, "rewards/margins": -2.522456407546997, "rewards/rejected": 4.59927225112915, "step": 9073 }, { "epoch": 1.47, "learning_rate": 4.0490514705276906e-07, "logits/chosen": -0.7789983153343201, "logits/rejected": -0.751548171043396, "logps/chosen": -161.74864196777344, "logps/rejected": -151.8447265625, "loss": 2.1008, "rewards/accuracies": 0.0, "rewards/chosen": 3.5400466918945312, "rewards/margins": -2.426483154296875, "rewards/rejected": 5.966529846191406, "step": 9074 }, { "epoch": 1.47, "learning_rate": 4.0477612380448836e-07, "logits/chosen": -0.8967476487159729, "logits/rejected": -0.8752449750900269, "logps/chosen": -37.47978591918945, "logps/rejected": -54.32563018798828, "loss": 0.5266, "rewards/accuracies": 0.0, "rewards/chosen": 1.5341824293136597, "rewards/margins": -0.564820408821106, "rewards/rejected": 2.0990028381347656, "step": 9075 }, { "epoch": 1.47, "learning_rate": 4.046471071352764e-07, "logits/chosen": -0.7094306349754333, "logits/rejected": -0.582289457321167, "logps/chosen": -51.273563385009766, "logps/rejected": -49.032745361328125, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 1.9057483673095703, "rewards/margins": 0.4997127056121826, "rewards/rejected": 1.4060356616973877, "step": 9076 }, { "epoch": 1.47, "learning_rate": 4.0451809705404716e-07, "logits/chosen": -0.6429018378257751, "logits/rejected": -0.6509014368057251, "logps/chosen": -30.97286033630371, "logps/rejected": -69.3380355834961, "loss": 1.8283, "rewards/accuracies": 0.0, "rewards/chosen": 1.995072364807129, "rewards/margins": -1.3604400157928467, "rewards/rejected": 3.3555123805999756, "step": 9077 }, { "epoch": 1.47, "learning_rate": 4.0438909356971397e-07, "logits/chosen": -0.535085141658783, "logits/rejected": -0.4985453188419342, "logps/chosen": -90.39651489257812, "logps/rejected": -62.479671478271484, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": 1.9475654363632202, "rewards/margins": 0.30379676818847656, "rewards/rejected": 1.6437686681747437, "step": 9078 }, { "epoch": 1.47, "learning_rate": 4.0426009669118966e-07, "logits/chosen": -0.8008214831352234, "logits/rejected": -0.7396408319473267, "logps/chosen": -68.92988586425781, "logps/rejected": -42.7796516418457, "loss": 0.7204, "rewards/accuracies": 0.0, "rewards/chosen": 1.1547783613204956, "rewards/margins": -1.0939167737960815, "rewards/rejected": 2.248695135116577, "step": 9079 }, { "epoch": 1.47, "learning_rate": 4.0413110642738683e-07, "logits/chosen": -0.8032070994377136, "logits/rejected": -0.8037427663803101, "logps/chosen": -27.077754974365234, "logps/rejected": -61.194007873535156, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 1.8675743341445923, "rewards/margins": 0.6775658130645752, "rewards/rejected": 1.190008521080017, "step": 9080 }, { "epoch": 1.47, "learning_rate": 4.040021227872175e-07, "logits/chosen": -0.5895198583602905, "logits/rejected": -0.6426429748535156, "logps/chosen": -84.14250183105469, "logps/rejected": -115.05768585205078, "loss": 0.379, "rewards/accuracies": 1.0, "rewards/chosen": 1.1692001819610596, "rewards/margins": 0.5721222162246704, "rewards/rejected": 0.5970779657363892, "step": 9081 }, { "epoch": 1.47, "learning_rate": 4.038731457795931e-07, "logits/chosen": -0.7472631931304932, "logits/rejected": -0.7004808783531189, "logps/chosen": -130.88565063476562, "logps/rejected": -42.49662780761719, "loss": 1.8231, "rewards/accuracies": 1.0, "rewards/chosen": 0.3255722224712372, "rewards/margins": 0.05842667818069458, "rewards/rejected": 0.2671455442905426, "step": 9082 }, { "epoch": 1.47, "learning_rate": 4.037441754134247e-07, "logits/chosen": -0.48857566714286804, "logits/rejected": -0.4645597040653229, "logps/chosen": -92.71852111816406, "logps/rejected": -81.88365936279297, "loss": 2.4298, "rewards/accuracies": 0.0, "rewards/chosen": 1.458991289138794, "rewards/margins": -0.3686683177947998, "rewards/rejected": 1.8276596069335938, "step": 9083 }, { "epoch": 1.47, "learning_rate": 4.03615211697623e-07, "logits/chosen": -0.9268806576728821, "logits/rejected": -1.0633947849273682, "logps/chosen": -54.753257751464844, "logps/rejected": -142.541748046875, "loss": 2.2502, "rewards/accuracies": 0.0, "rewards/chosen": 1.1181999444961548, "rewards/margins": -3.5528788566589355, "rewards/rejected": 4.671078681945801, "step": 9084 }, { "epoch": 1.47, "learning_rate": 4.0348625464109826e-07, "logits/chosen": -0.8409255146980286, "logits/rejected": -0.7577897310256958, "logps/chosen": -86.69481658935547, "logps/rejected": -30.034692764282227, "loss": 0.3655, "rewards/accuracies": 1.0, "rewards/chosen": 1.6921783685684204, "rewards/margins": 1.4631727933883667, "rewards/rejected": 0.2290056198835373, "step": 9085 }, { "epoch": 1.47, "learning_rate": 4.0335730425276e-07, "logits/chosen": -0.8869245052337646, "logits/rejected": -0.8025788068771362, "logps/chosen": -183.08218383789062, "logps/rejected": -98.7884750366211, "loss": 0.0447, "rewards/accuracies": 1.0, "rewards/chosen": 6.220878601074219, "rewards/margins": 3.0014548301696777, "rewards/rejected": 3.219423770904541, "step": 9086 }, { "epoch": 1.47, "learning_rate": 4.0322836054151766e-07, "logits/chosen": -0.7739489078521729, "logits/rejected": -0.6969649195671082, "logps/chosen": -82.4106216430664, "logps/rejected": -51.954856872558594, "loss": 0.3803, "rewards/accuracies": 1.0, "rewards/chosen": 3.091278076171875, "rewards/margins": 2.661734104156494, "rewards/rejected": 0.429544061422348, "step": 9087 }, { "epoch": 1.48, "learning_rate": 4.0309942351627995e-07, "logits/chosen": -0.45982396602630615, "logits/rejected": -0.45982396602630615, "logps/chosen": -45.50544738769531, "logps/rejected": -45.50544738769531, "loss": 0.6234, "rewards/accuracies": 0.0, "rewards/chosen": 0.23451653122901917, "rewards/margins": 0.0, "rewards/rejected": 0.23451653122901917, "step": 9088 }, { "epoch": 1.48, "learning_rate": 4.029704931859552e-07, "logits/chosen": -0.9195278882980347, "logits/rejected": -0.9995019435882568, "logps/chosen": -84.405029296875, "logps/rejected": -112.50210571289062, "loss": 0.9703, "rewards/accuracies": 0.0, "rewards/chosen": 0.6487327814102173, "rewards/margins": -1.542258381843567, "rewards/rejected": 2.190991163253784, "step": 9089 }, { "epoch": 1.48, "learning_rate": 4.0284156955945113e-07, "logits/chosen": -0.9053624272346497, "logits/rejected": -0.9288448691368103, "logps/chosen": -156.93612670898438, "logps/rejected": -93.0291748046875, "loss": 0.7381, "rewards/accuracies": 0.0, "rewards/chosen": 3.9334137439727783, "rewards/margins": -1.1433870792388916, "rewards/rejected": 5.07680082321167, "step": 9090 }, { "epoch": 1.48, "learning_rate": 4.0271265264567545e-07, "logits/chosen": -0.5409818887710571, "logits/rejected": -0.6641530394554138, "logps/chosen": -94.6642074584961, "logps/rejected": -131.58897399902344, "loss": 1.4186, "rewards/accuracies": 0.0, "rewards/chosen": 2.1475448608398438, "rewards/margins": -2.6777987480163574, "rewards/rejected": 4.825343608856201, "step": 9091 }, { "epoch": 1.48, "learning_rate": 4.025837424535348e-07, "logits/chosen": -0.5021708607673645, "logits/rejected": -0.4915212392807007, "logps/chosen": -59.05934524536133, "logps/rejected": -61.69242858886719, "loss": 1.3168, "rewards/accuracies": 1.0, "rewards/chosen": 1.9532283544540405, "rewards/margins": 0.3964778184890747, "rewards/rejected": 1.5567505359649658, "step": 9092 }, { "epoch": 1.48, "learning_rate": 4.0245483899193586e-07, "logits/chosen": -0.2122393250465393, "logits/rejected": -0.20392312109470367, "logps/chosen": -53.4991455078125, "logps/rejected": -42.30297088623047, "loss": 0.3857, "rewards/accuracies": 1.0, "rewards/chosen": 0.31381914019584656, "rewards/margins": 0.10927774012088776, "rewards/rejected": 0.2045414000749588, "step": 9093 }, { "epoch": 1.48, "learning_rate": 4.0232594226978457e-07, "logits/chosen": -0.4558328688144684, "logits/rejected": -0.46711984276771545, "logps/chosen": -71.80148315429688, "logps/rejected": -59.519004821777344, "loss": 0.6202, "rewards/accuracies": 0.0, "rewards/chosen": 2.3536407947540283, "rewards/margins": -0.6343376636505127, "rewards/rejected": 2.987978458404541, "step": 9094 }, { "epoch": 1.48, "learning_rate": 4.0219705229598644e-07, "logits/chosen": -0.6713957190513611, "logits/rejected": -0.5516135096549988, "logps/chosen": -76.36700439453125, "logps/rejected": -9.384566307067871, "loss": 0.8048, "rewards/accuracies": 1.0, "rewards/chosen": 1.6839096546173096, "rewards/margins": 0.9865281581878662, "rewards/rejected": 0.6973814964294434, "step": 9095 }, { "epoch": 1.48, "learning_rate": 4.0206816907944664e-07, "logits/chosen": -0.8941863775253296, "logits/rejected": -0.8835805058479309, "logps/chosen": -80.84823608398438, "logps/rejected": -62.81342697143555, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 1.2591736316680908, "rewards/margins": 0.11647069454193115, "rewards/rejected": 1.1427029371261597, "step": 9096 }, { "epoch": 1.48, "learning_rate": 4.019392926290696e-07, "logits/chosen": -0.8666519522666931, "logits/rejected": -0.8502100706100464, "logps/chosen": -65.65242004394531, "logps/rejected": -47.25661087036133, "loss": 1.4379, "rewards/accuracies": 0.0, "rewards/chosen": 0.9241897463798523, "rewards/margins": -0.005589306354522705, "rewards/rejected": 0.929779052734375, "step": 9097 }, { "epoch": 1.48, "learning_rate": 4.0181042295375965e-07, "logits/chosen": -0.9948462247848511, "logits/rejected": -0.9430758953094482, "logps/chosen": -52.027950286865234, "logps/rejected": -59.427791595458984, "loss": 0.4608, "rewards/accuracies": 0.0, "rewards/chosen": 2.5721194744110107, "rewards/margins": -0.16042423248291016, "rewards/rejected": 2.732543706893921, "step": 9098 }, { "epoch": 1.48, "learning_rate": 4.016815600624204e-07, "logits/chosen": -0.741686999797821, "logits/rejected": -0.7547622323036194, "logps/chosen": -86.732177734375, "logps/rejected": -43.567832946777344, "loss": 0.9763, "rewards/accuracies": 0.0, "rewards/chosen": 1.0397751331329346, "rewards/margins": -1.1328835487365723, "rewards/rejected": 2.172658681869507, "step": 9099 }, { "epoch": 1.48, "learning_rate": 4.01552703963955e-07, "logits/chosen": -0.4509037733078003, "logits/rejected": -0.48210036754608154, "logps/chosen": -72.52873229980469, "logps/rejected": -137.68116760253906, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 2.014101505279541, "rewards/margins": 1.4256706237792969, "rewards/rejected": 0.5884308218955994, "step": 9100 }, { "epoch": 1.48, "learning_rate": 4.014238546672663e-07, "logits/chosen": -0.2699982821941376, "logits/rejected": -0.2724655270576477, "logps/chosen": -11.4879150390625, "logps/rejected": -2.2646918296813965, "loss": 0.646, "rewards/accuracies": 0.0, "rewards/chosen": -0.04717559739947319, "rewards/margins": -0.36203083395957947, "rewards/rejected": 0.3148552477359772, "step": 9101 }, { "epoch": 1.48, "learning_rate": 4.012950121812565e-07, "logits/chosen": -0.6387159824371338, "logits/rejected": -0.6323772668838501, "logps/chosen": -21.25383949279785, "logps/rejected": -0.5484440922737122, "loss": 1.9037, "rewards/accuracies": 0.0, "rewards/chosen": -0.09679947048425674, "rewards/margins": -0.2270316183567047, "rewards/rejected": 0.13023214042186737, "step": 9102 }, { "epoch": 1.48, "learning_rate": 4.011661765148274e-07, "logits/chosen": -0.6587079763412476, "logits/rejected": -0.68869948387146, "logps/chosen": -57.58966827392578, "logps/rejected": -68.98200988769531, "loss": 0.5132, "rewards/accuracies": 0.0, "rewards/chosen": 1.7168045043945312, "rewards/margins": -0.5682206153869629, "rewards/rejected": 2.285025119781494, "step": 9103 }, { "epoch": 1.48, "learning_rate": 4.010373476768803e-07, "logits/chosen": -0.8310398459434509, "logits/rejected": -0.8414778709411621, "logps/chosen": -112.57466888427734, "logps/rejected": -161.6164093017578, "loss": 2.4588, "rewards/accuracies": 0.0, "rewards/chosen": 4.964920997619629, "rewards/margins": -0.8348608016967773, "rewards/rejected": 5.799781799316406, "step": 9104 }, { "epoch": 1.48, "learning_rate": 4.009085256763161e-07, "logits/chosen": -0.7491438388824463, "logits/rejected": -0.8038623332977295, "logps/chosen": -158.72828674316406, "logps/rejected": -156.7645263671875, "loss": 2.3505, "rewards/accuracies": 0.0, "rewards/chosen": 5.012071132659912, "rewards/margins": -1.1285324096679688, "rewards/rejected": 6.140603542327881, "step": 9105 }, { "epoch": 1.48, "learning_rate": 4.0077971052203517e-07, "logits/chosen": -0.7175871133804321, "logits/rejected": -0.676297128200531, "logps/chosen": -72.77313995361328, "logps/rejected": -7.404173374176025, "loss": 0.9558, "rewards/accuracies": 1.0, "rewards/chosen": 1.9079993963241577, "rewards/margins": 1.402382493019104, "rewards/rejected": 0.5056169033050537, "step": 9106 }, { "epoch": 1.48, "learning_rate": 4.006509022229374e-07, "logits/chosen": -0.4138377606868744, "logits/rejected": -0.4138377606868744, "logps/chosen": -45.42140197753906, "logps/rejected": -45.42140197753906, "loss": 0.3762, "rewards/accuracies": 0.0, "rewards/chosen": 1.1915801763534546, "rewards/margins": 0.0, "rewards/rejected": 1.1915801763534546, "step": 9107 }, { "epoch": 1.48, "learning_rate": 4.0052210078792226e-07, "logits/chosen": -0.8857426047325134, "logits/rejected": -0.8538134694099426, "logps/chosen": -71.02232360839844, "logps/rejected": -89.2378158569336, "loss": 0.7797, "rewards/accuracies": 0.0, "rewards/chosen": 1.2552658319473267, "rewards/margins": -0.9331551790237427, "rewards/rejected": 2.1884210109710693, "step": 9108 }, { "epoch": 1.48, "learning_rate": 4.003933062258887e-07, "logits/chosen": -0.9434788823127747, "logits/rejected": -0.9376904964447021, "logps/chosen": -95.36801147460938, "logps/rejected": -101.4443359375, "loss": 3.1408, "rewards/accuracies": 0.0, "rewards/chosen": 1.2452659606933594, "rewards/margins": -2.677938222885132, "rewards/rejected": 3.923204183578491, "step": 9109 }, { "epoch": 1.48, "learning_rate": 4.002645185457351e-07, "logits/chosen": -0.8143261075019836, "logits/rejected": -0.7530678510665894, "logps/chosen": -58.22632598876953, "logps/rejected": -32.722713470458984, "loss": 0.8279, "rewards/accuracies": 0.0, "rewards/chosen": 1.1795578002929688, "rewards/margins": -0.48157691955566406, "rewards/rejected": 1.6611347198486328, "step": 9110 }, { "epoch": 1.48, "learning_rate": 4.001357377563596e-07, "logits/chosen": -0.6426871418952942, "logits/rejected": -0.5861188173294067, "logps/chosen": -80.96084594726562, "logps/rejected": -63.36664581298828, "loss": 1.2177, "rewards/accuracies": 0.0, "rewards/chosen": 0.6959205865859985, "rewards/margins": -2.3136696815490723, "rewards/rejected": 3.0095901489257812, "step": 9111 }, { "epoch": 1.48, "learning_rate": 4.0000696386665964e-07, "logits/chosen": -0.9932435154914856, "logits/rejected": -0.9956218004226685, "logps/chosen": -119.53477478027344, "logps/rejected": -47.43896484375, "loss": 1.7196, "rewards/accuracies": 0.0, "rewards/chosen": 0.7225112915039062, "rewards/margins": -1.629159688949585, "rewards/rejected": 2.351670980453491, "step": 9112 }, { "epoch": 1.48, "learning_rate": 3.998781968855324e-07, "logits/chosen": -0.685135543346405, "logits/rejected": -0.7536861896514893, "logps/chosen": -49.07738494873047, "logps/rejected": -80.55157470703125, "loss": 0.5033, "rewards/accuracies": 0.0, "rewards/chosen": 0.7510024905204773, "rewards/margins": -0.2104538083076477, "rewards/rejected": 0.961456298828125, "step": 9113 }, { "epoch": 1.48, "learning_rate": 3.9974943682187446e-07, "logits/chosen": -0.6583206057548523, "logits/rejected": -0.5237182974815369, "logps/chosen": -119.90838623046875, "logps/rejected": -75.0477066040039, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 2.89628005027771, "rewards/margins": 0.4574394226074219, "rewards/rejected": 2.438840627670288, "step": 9114 }, { "epoch": 1.48, "learning_rate": 3.996206836845818e-07, "logits/chosen": -0.8198456168174744, "logits/rejected": -0.8524345755577087, "logps/chosen": -71.09041595458984, "logps/rejected": -70.94046020507812, "loss": 1.1753, "rewards/accuracies": 0.0, "rewards/chosen": 1.7865486145019531, "rewards/margins": -2.207839250564575, "rewards/rejected": 3.9943878650665283, "step": 9115 }, { "epoch": 1.48, "learning_rate": 3.9949193748255004e-07, "logits/chosen": -0.20571984350681305, "logits/rejected": -0.20571984350681305, "logps/chosen": -73.14414978027344, "logps/rejected": -73.14414978027344, "loss": 0.7191, "rewards/accuracies": 0.0, "rewards/chosen": 0.3686660826206207, "rewards/margins": 0.0, "rewards/rejected": 0.3686660826206207, "step": 9116 }, { "epoch": 1.48, "learning_rate": 3.993631982246745e-07, "logits/chosen": -0.3886394202709198, "logits/rejected": -0.3072943091392517, "logps/chosen": -34.19921875, "logps/rejected": -57.295040130615234, "loss": 0.4108, "rewards/accuracies": 1.0, "rewards/chosen": 2.142143726348877, "rewards/margins": 1.6714482307434082, "rewards/rejected": 0.47069549560546875, "step": 9117 }, { "epoch": 1.48, "learning_rate": 3.992344659198497e-07, "logits/chosen": -0.6709410548210144, "logits/rejected": -0.7123286128044128, "logps/chosen": -116.92665100097656, "logps/rejected": -133.90328979492188, "loss": 0.1537, "rewards/accuracies": 1.0, "rewards/chosen": 1.5011001825332642, "rewards/margins": 1.338958740234375, "rewards/rejected": 0.16214142739772797, "step": 9118 }, { "epoch": 1.48, "learning_rate": 3.9910574057696976e-07, "logits/chosen": -0.9218870401382446, "logits/rejected": -0.9563711881637573, "logps/chosen": -105.76162719726562, "logps/rejected": -81.11302185058594, "loss": 0.6063, "rewards/accuracies": 0.0, "rewards/chosen": 0.8929252624511719, "rewards/margins": -0.5003334283828735, "rewards/rejected": 1.3932586908340454, "step": 9119 }, { "epoch": 1.48, "learning_rate": 3.9897702220492854e-07, "logits/chosen": -0.7313904762268066, "logits/rejected": -0.7066567540168762, "logps/chosen": -75.75732421875, "logps/rejected": -128.05694580078125, "loss": 0.0791, "rewards/accuracies": 1.0, "rewards/chosen": 4.086463928222656, "rewards/margins": 1.869990587234497, "rewards/rejected": 2.216473340988159, "step": 9120 }, { "epoch": 1.48, "learning_rate": 3.988483108126193e-07, "logits/chosen": -0.5511377453804016, "logits/rejected": -0.5669499635696411, "logps/chosen": -72.54449462890625, "logps/rejected": -104.7586669921875, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": -0.07662735134363174, "rewards/margins": -1.0117805004119873, "rewards/rejected": 0.9351531863212585, "step": 9121 }, { "epoch": 1.48, "learning_rate": 3.9871960640893455e-07, "logits/chosen": -0.6747174263000488, "logits/rejected": -0.6994944214820862, "logps/chosen": -52.58732604980469, "logps/rejected": -87.99935913085938, "loss": 0.7283, "rewards/accuracies": 1.0, "rewards/chosen": 2.082383871078491, "rewards/margins": 0.8614853620529175, "rewards/rejected": 1.2208985090255737, "step": 9122 }, { "epoch": 1.48, "learning_rate": 3.985909090027667e-07, "logits/chosen": -0.41317853331565857, "logits/rejected": -0.41157883405685425, "logps/chosen": -43.77128219604492, "logps/rejected": -43.41028594970703, "loss": 1.0547, "rewards/accuracies": 0.0, "rewards/chosen": 1.4266891479492188, "rewards/margins": -0.047591447830200195, "rewards/rejected": 1.474280595779419, "step": 9123 }, { "epoch": 1.48, "learning_rate": 3.9846221860300745e-07, "logits/chosen": -0.5318660736083984, "logits/rejected": -0.5461583733558655, "logps/chosen": -72.97947692871094, "logps/rejected": -60.31163024902344, "loss": 0.7781, "rewards/accuracies": 0.0, "rewards/chosen": 2.015376329421997, "rewards/margins": -1.2153220176696777, "rewards/rejected": 3.230698347091675, "step": 9124 }, { "epoch": 1.48, "learning_rate": 3.9833353521854816e-07, "logits/chosen": -0.7144429087638855, "logits/rejected": -0.7569072842597961, "logps/chosen": -60.20866775512695, "logps/rejected": -92.61412048339844, "loss": 0.6819, "rewards/accuracies": 0.0, "rewards/chosen": 1.008352279663086, "rewards/margins": -0.23151588439941406, "rewards/rejected": 1.2398681640625, "step": 9125 }, { "epoch": 1.48, "learning_rate": 3.982048588582797e-07, "logits/chosen": -0.4057134687900543, "logits/rejected": -0.4128880500793457, "logps/chosen": -73.56981658935547, "logps/rejected": -78.20413208007812, "loss": 0.3794, "rewards/accuracies": 1.0, "rewards/chosen": 2.0325615406036377, "rewards/margins": 0.15077292919158936, "rewards/rejected": 1.8817886114120483, "step": 9126 }, { "epoch": 1.48, "learning_rate": 3.980761895310922e-07, "logits/chosen": -0.5620923042297363, "logits/rejected": -0.49382510781288147, "logps/chosen": -67.48590850830078, "logps/rejected": -18.055936813354492, "loss": 0.3234, "rewards/accuracies": 1.0, "rewards/chosen": 0.8521873354911804, "rewards/margins": 0.16567456722259521, "rewards/rejected": 0.6865127682685852, "step": 9127 }, { "epoch": 1.48, "learning_rate": 3.979475272458757e-07, "logits/chosen": -0.9041147232055664, "logits/rejected": -0.8511695265769958, "logps/chosen": -104.05813598632812, "logps/rejected": -58.40845489501953, "loss": 0.5567, "rewards/accuracies": 0.0, "rewards/chosen": 1.79887855052948, "rewards/margins": -0.24545824527740479, "rewards/rejected": 2.0443367958068848, "step": 9128 }, { "epoch": 1.48, "learning_rate": 3.978188720115193e-07, "logits/chosen": -0.8724051713943481, "logits/rejected": -0.8724051713943481, "logps/chosen": -126.45410919189453, "logps/rejected": -126.45410919189453, "loss": 0.4432, "rewards/accuracies": 0.0, "rewards/chosen": 3.596843719482422, "rewards/margins": 0.0, "rewards/rejected": 3.596843719482422, "step": 9129 }, { "epoch": 1.48, "learning_rate": 3.9769022383691204e-07, "logits/chosen": -0.6119790077209473, "logits/rejected": -0.47782814502716064, "logps/chosen": -46.16929244995117, "logps/rejected": -22.529098510742188, "loss": 0.5161, "rewards/accuracies": 1.0, "rewards/chosen": 0.7984394431114197, "rewards/margins": 0.0007122159004211426, "rewards/rejected": 0.7977272272109985, "step": 9130 }, { "epoch": 1.48, "learning_rate": 3.9756158273094225e-07, "logits/chosen": -0.9101955890655518, "logits/rejected": -0.9828537106513977, "logps/chosen": -122.54766082763672, "logps/rejected": -170.80352783203125, "loss": 1.3606, "rewards/accuracies": 0.0, "rewards/chosen": 2.3491432666778564, "rewards/margins": -2.6013967990875244, "rewards/rejected": 4.950540065765381, "step": 9131 }, { "epoch": 1.48, "learning_rate": 3.974329487024978e-07, "logits/chosen": -0.6116807460784912, "logits/rejected": -0.719810962677002, "logps/chosen": -117.49775695800781, "logps/rejected": -96.70941162109375, "loss": 2.4134, "rewards/accuracies": 0.0, "rewards/chosen": 0.8373581171035767, "rewards/margins": -4.510472297668457, "rewards/rejected": 5.347830295562744, "step": 9132 }, { "epoch": 1.48, "learning_rate": 3.973043217604662e-07, "logits/chosen": -0.8041737675666809, "logits/rejected": -0.7735729813575745, "logps/chosen": -119.9794692993164, "logps/rejected": -95.73567962646484, "loss": 1.016, "rewards/accuracies": 0.0, "rewards/chosen": 1.9707390069961548, "rewards/margins": -0.1650146245956421, "rewards/rejected": 2.135753631591797, "step": 9133 }, { "epoch": 1.48, "learning_rate": 3.971757019137342e-07, "logits/chosen": -0.6108120083808899, "logits/rejected": -0.5626890063285828, "logps/chosen": -101.27326965332031, "logps/rejected": -82.09176635742188, "loss": 0.7121, "rewards/accuracies": 1.0, "rewards/chosen": 1.4046601057052612, "rewards/margins": 1.1225433349609375, "rewards/rejected": 0.28211671113967896, "step": 9134 }, { "epoch": 1.48, "learning_rate": 3.9704708917118816e-07, "logits/chosen": -1.2113568782806396, "logits/rejected": -1.3717652559280396, "logps/chosen": -140.8919677734375, "logps/rejected": -124.18840026855469, "loss": 0.0778, "rewards/accuracies": 1.0, "rewards/chosen": 8.001764297485352, "rewards/margins": 4.086153030395508, "rewards/rejected": 3.9156112670898438, "step": 9135 }, { "epoch": 1.48, "learning_rate": 3.969184835417142e-07, "logits/chosen": -0.3787028193473816, "logits/rejected": -0.30900201201438904, "logps/chosen": -60.37675476074219, "logps/rejected": -52.21002960205078, "loss": 0.8246, "rewards/accuracies": 0.0, "rewards/chosen": 1.3951996564865112, "rewards/margins": -1.4116233587265015, "rewards/rejected": 2.8068230152130127, "step": 9136 }, { "epoch": 1.48, "learning_rate": 3.9678988503419763e-07, "logits/chosen": -0.5910496115684509, "logits/rejected": -0.5965805649757385, "logps/chosen": -52.63603591918945, "logps/rejected": -108.68502807617188, "loss": 0.7643, "rewards/accuracies": 1.0, "rewards/chosen": 0.9512519836425781, "rewards/margins": 0.12973862886428833, "rewards/rejected": 0.8215133547782898, "step": 9137 }, { "epoch": 1.48, "learning_rate": 3.966612936575235e-07, "logits/chosen": -0.5271927714347839, "logits/rejected": -0.5438055992126465, "logps/chosen": -39.54179382324219, "logps/rejected": -42.774906158447266, "loss": 0.5422, "rewards/accuracies": 0.0, "rewards/chosen": 1.100253701210022, "rewards/margins": -0.062197089195251465, "rewards/rejected": 1.1624507904052734, "step": 9138 }, { "epoch": 1.48, "learning_rate": 3.965327094205761e-07, "logits/chosen": -0.588333010673523, "logits/rejected": -0.4761940836906433, "logps/chosen": -70.37383270263672, "logps/rejected": -63.504695892333984, "loss": 0.3191, "rewards/accuracies": 1.0, "rewards/chosen": 1.9842430353164673, "rewards/margins": 0.4861034154891968, "rewards/rejected": 1.4981396198272705, "step": 9139 }, { "epoch": 1.48, "learning_rate": 3.9640413233223945e-07, "logits/chosen": -0.6042488813400269, "logits/rejected": -0.6425122022628784, "logps/chosen": -74.60128784179688, "logps/rejected": -76.31301879882812, "loss": 2.055, "rewards/accuracies": 0.0, "rewards/chosen": 2.137913465499878, "rewards/margins": -3.8603670597076416, "rewards/rejected": 5.9982805252075195, "step": 9140 }, { "epoch": 1.48, "learning_rate": 3.962755624013971e-07, "logits/chosen": -0.37252989411354065, "logits/rejected": -0.39029160141944885, "logps/chosen": -34.56285095214844, "logps/rejected": -54.29342269897461, "loss": 0.6045, "rewards/accuracies": 0.0, "rewards/chosen": 1.1709091663360596, "rewards/margins": -0.1651172637939453, "rewards/rejected": 1.3360264301300049, "step": 9141 }, { "epoch": 1.48, "learning_rate": 3.9614699963693185e-07, "logits/chosen": -1.0533490180969238, "logits/rejected": -1.0224206447601318, "logps/chosen": -93.26168060302734, "logps/rejected": -83.77783203125, "loss": 2.029, "rewards/accuracies": 0.0, "rewards/chosen": 1.488166093826294, "rewards/margins": -0.8467812538146973, "rewards/rejected": 2.334947347640991, "step": 9142 }, { "epoch": 1.48, "learning_rate": 3.960184440477263e-07, "logits/chosen": -0.5137799382209778, "logits/rejected": -0.4696890115737915, "logps/chosen": -73.0301513671875, "logps/rejected": -43.71552276611328, "loss": 1.0669, "rewards/accuracies": 0.0, "rewards/chosen": 0.5948936343193054, "rewards/margins": -0.480707585811615, "rewards/rejected": 1.0756012201309204, "step": 9143 }, { "epoch": 1.48, "learning_rate": 3.958898956426624e-07, "logits/chosen": -0.7336900234222412, "logits/rejected": -0.8300766944885254, "logps/chosen": -113.13414001464844, "logps/rejected": -119.81095886230469, "loss": 2.2956, "rewards/accuracies": 0.0, "rewards/chosen": 2.0276641845703125, "rewards/margins": -4.56582498550415, "rewards/rejected": 6.593489170074463, "step": 9144 }, { "epoch": 1.48, "learning_rate": 3.957613544306216e-07, "logits/chosen": -0.4520425796508789, "logits/rejected": -0.3235175311565399, "logps/chosen": -68.08428955078125, "logps/rejected": -47.22624206542969, "loss": 0.2682, "rewards/accuracies": 1.0, "rewards/chosen": 0.5936905145645142, "rewards/margins": 0.5204674005508423, "rewards/rejected": 0.07322311401367188, "step": 9145 }, { "epoch": 1.48, "learning_rate": 3.956328204204849e-07, "logits/chosen": -1.050960659980774, "logits/rejected": -1.0117799043655396, "logps/chosen": -153.73028564453125, "logps/rejected": -113.783447265625, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": 3.054274082183838, "rewards/margins": 1.7951569557189941, "rewards/rejected": 1.2591171264648438, "step": 9146 }, { "epoch": 1.48, "learning_rate": 3.9550429362113283e-07, "logits/chosen": -0.41521039605140686, "logits/rejected": -0.330870121717453, "logps/chosen": -85.84002685546875, "logps/rejected": -84.27682495117188, "loss": 1.0934, "rewards/accuracies": 0.0, "rewards/chosen": 1.475062608718872, "rewards/margins": -1.6407577991485596, "rewards/rejected": 3.1158204078674316, "step": 9147 }, { "epoch": 1.48, "learning_rate": 3.953757740414453e-07, "logits/chosen": -0.6061698794364929, "logits/rejected": -0.608241856098175, "logps/chosen": -46.063724517822266, "logps/rejected": -37.36761474609375, "loss": 0.5876, "rewards/accuracies": 1.0, "rewards/chosen": 1.2525848150253296, "rewards/margins": 0.5330547094345093, "rewards/rejected": 0.7195301055908203, "step": 9148 }, { "epoch": 1.48, "learning_rate": 3.9524726169030174e-07, "logits/chosen": -0.7327955961227417, "logits/rejected": -0.830439567565918, "logps/chosen": -63.15583801269531, "logps/rejected": -84.29379272460938, "loss": 1.5529, "rewards/accuracies": 0.0, "rewards/chosen": 1.0579299926757812, "rewards/margins": -2.486494541168213, "rewards/rejected": 3.544424533843994, "step": 9149 }, { "epoch": 1.49, "learning_rate": 3.951187565765813e-07, "logits/chosen": -0.7290652394294739, "logits/rejected": -0.7291415929794312, "logps/chosen": -5.413449287414551, "logps/rejected": -12.738628387451172, "loss": 0.3037, "rewards/accuracies": 1.0, "rewards/chosen": 0.5892737507820129, "rewards/margins": 0.23893997073173523, "rewards/rejected": 0.3503337800502777, "step": 9150 }, { "epoch": 1.49, "learning_rate": 3.9499025870916235e-07, "logits/chosen": -0.615169107913971, "logits/rejected": -0.5555798411369324, "logps/chosen": -100.96688842773438, "logps/rejected": -73.17652893066406, "loss": 0.4673, "rewards/accuracies": 0.0, "rewards/chosen": 1.111883521080017, "rewards/margins": -0.37197113037109375, "rewards/rejected": 1.4838546514511108, "step": 9151 }, { "epoch": 1.49, "learning_rate": 3.948617680969229e-07, "logits/chosen": -0.7181751132011414, "logits/rejected": -0.7309575080871582, "logps/chosen": -89.32398223876953, "logps/rejected": -113.21541595458984, "loss": 1.0839, "rewards/accuracies": 1.0, "rewards/chosen": 1.2000579833984375, "rewards/margins": 0.34664762020111084, "rewards/rejected": 0.8534103631973267, "step": 9152 }, { "epoch": 1.49, "learning_rate": 3.947332847487405e-07, "logits/chosen": -0.5558714866638184, "logits/rejected": -0.5150691866874695, "logps/chosen": -49.98700714111328, "logps/rejected": -55.56220626831055, "loss": 0.6189, "rewards/accuracies": 1.0, "rewards/chosen": 1.7035034894943237, "rewards/margins": 0.44319117069244385, "rewards/rejected": 1.2603123188018799, "step": 9153 }, { "epoch": 1.49, "learning_rate": 3.94604808673492e-07, "logits/chosen": -0.9456616044044495, "logits/rejected": -0.9146885871887207, "logps/chosen": -73.73872375488281, "logps/rejected": -56.541412353515625, "loss": 2.3, "rewards/accuracies": 0.0, "rewards/chosen": 1.1885032653808594, "rewards/margins": -0.6940666437149048, "rewards/rejected": 1.8825699090957642, "step": 9154 }, { "epoch": 1.49, "learning_rate": 3.94476339880054e-07, "logits/chosen": -0.7327145338058472, "logits/rejected": -0.5879763960838318, "logps/chosen": -133.6480712890625, "logps/rejected": -84.81184387207031, "loss": 0.1611, "rewards/accuracies": 1.0, "rewards/chosen": 4.707482814788818, "rewards/margins": 3.1396484375, "rewards/rejected": 1.567834496498108, "step": 9155 }, { "epoch": 1.49, "learning_rate": 3.943478783773024e-07, "logits/chosen": -0.6797139644622803, "logits/rejected": -0.5891231298446655, "logps/chosen": -119.19290161132812, "logps/rejected": -78.08584594726562, "loss": 0.4453, "rewards/accuracies": 0.0, "rewards/chosen": 2.004159688949585, "rewards/margins": -0.3114180564880371, "rewards/rejected": 2.315577745437622, "step": 9156 }, { "epoch": 1.49, "learning_rate": 3.942194241741127e-07, "logits/chosen": -0.6842843294143677, "logits/rejected": -0.6523011922836304, "logps/chosen": -92.1614990234375, "logps/rejected": -174.466064453125, "loss": 0.8692, "rewards/accuracies": 0.0, "rewards/chosen": 4.418449401855469, "rewards/margins": -1.3837265968322754, "rewards/rejected": 5.802175998687744, "step": 9157 }, { "epoch": 1.49, "learning_rate": 3.9409097727936004e-07, "logits/chosen": -1.0770504474639893, "logits/rejected": -1.122446894645691, "logps/chosen": -45.46533203125, "logps/rejected": -117.62333679199219, "loss": 1.4295, "rewards/accuracies": 0.0, "rewards/chosen": 1.5848087072372437, "rewards/margins": -1.7128292322158813, "rewards/rejected": 3.297637939453125, "step": 9158 }, { "epoch": 1.49, "learning_rate": 3.939625377019186e-07, "logits/chosen": -0.9793359637260437, "logits/rejected": -1.002324104309082, "logps/chosen": -185.27828979492188, "logps/rejected": -40.24018096923828, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 4.706851482391357, "rewards/margins": 4.645657539367676, "rewards/rejected": 0.06119384989142418, "step": 9159 }, { "epoch": 1.49, "learning_rate": 3.938341054506625e-07, "logits/chosen": -0.8381272554397583, "logits/rejected": -0.7888350486755371, "logps/chosen": -113.89358520507812, "logps/rejected": -51.39636993408203, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 4.276170253753662, "rewards/margins": 2.5315420627593994, "rewards/rejected": 1.7446281909942627, "step": 9160 }, { "epoch": 1.49, "learning_rate": 3.9370568053446513e-07, "logits/chosen": -0.6303039193153381, "logits/rejected": -0.45695793628692627, "logps/chosen": -135.70166015625, "logps/rejected": -59.99181365966797, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": 4.728079319000244, "rewards/margins": 3.183159828186035, "rewards/rejected": 1.5449196100234985, "step": 9161 }, { "epoch": 1.49, "learning_rate": 3.935772629621995e-07, "logits/chosen": -1.15240478515625, "logits/rejected": -1.148569107055664, "logps/chosen": -145.464111328125, "logps/rejected": -203.90277099609375, "loss": 1.8605, "rewards/accuracies": 0.0, "rewards/chosen": 0.9796814322471619, "rewards/margins": -3.685891628265381, "rewards/rejected": 4.6655731201171875, "step": 9162 }, { "epoch": 1.49, "learning_rate": 3.93448852742738e-07, "logits/chosen": -0.6242310404777527, "logits/rejected": -0.5240197777748108, "logps/chosen": -122.75031280517578, "logps/rejected": -78.85709381103516, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 6.204456329345703, "rewards/margins": 3.415602922439575, "rewards/rejected": 2.788853406906128, "step": 9163 }, { "epoch": 1.49, "learning_rate": 3.933204498849526e-07, "logits/chosen": -0.594041109085083, "logits/rejected": -0.646079957485199, "logps/chosen": -71.32945251464844, "logps/rejected": -91.30059051513672, "loss": 0.2513, "rewards/accuracies": 1.0, "rewards/chosen": 1.2146819829940796, "rewards/margins": 0.5203597545623779, "rewards/rejected": 0.6943222284317017, "step": 9164 }, { "epoch": 1.49, "learning_rate": 3.931920543977147e-07, "logits/chosen": -0.24884863197803497, "logits/rejected": -0.22217729687690735, "logps/chosen": -18.166357040405273, "logps/rejected": -0.7950640916824341, "loss": 0.8553, "rewards/accuracies": 0.0, "rewards/chosen": 0.46036338806152344, "rewards/margins": -0.020459264516830444, "rewards/rejected": 0.4808226525783539, "step": 9165 }, { "epoch": 1.49, "learning_rate": 3.930636662898952e-07, "logits/chosen": -0.8980427980422974, "logits/rejected": -0.8593018054962158, "logps/chosen": -59.63787841796875, "logps/rejected": -136.34109497070312, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": 2.926748752593994, "rewards/margins": 0.8100221157073975, "rewards/rejected": 2.1167266368865967, "step": 9166 }, { "epoch": 1.49, "learning_rate": 3.929352855703644e-07, "logits/chosen": -0.6384537816047668, "logits/rejected": -0.5855579376220703, "logps/chosen": -98.76558685302734, "logps/rejected": -74.7204818725586, "loss": 0.8134, "rewards/accuracies": 0.0, "rewards/chosen": 2.8008453845977783, "rewards/margins": -1.2712624073028564, "rewards/rejected": 4.072107791900635, "step": 9167 }, { "epoch": 1.49, "learning_rate": 3.9280691224799246e-07, "logits/chosen": -0.4650754928588867, "logits/rejected": -0.3684663474559784, "logps/chosen": -79.09151458740234, "logps/rejected": -62.296669006347656, "loss": 0.566, "rewards/accuracies": 0.0, "rewards/chosen": 1.2947129011154175, "rewards/margins": -0.3243751525878906, "rewards/rejected": 1.619088053703308, "step": 9168 }, { "epoch": 1.49, "learning_rate": 3.926785463316486e-07, "logits/chosen": -0.692472517490387, "logits/rejected": -0.6406046152114868, "logps/chosen": -94.18899536132812, "logps/rejected": -93.58818054199219, "loss": 0.9825, "rewards/accuracies": 0.0, "rewards/chosen": 0.9908737540245056, "rewards/margins": -1.691502332687378, "rewards/rejected": 2.6823761463165283, "step": 9169 }, { "epoch": 1.49, "learning_rate": 3.9255018783020166e-07, "logits/chosen": -0.9822280406951904, "logits/rejected": -0.9215776324272156, "logps/chosen": -88.37422943115234, "logps/rejected": -59.324684143066406, "loss": 1.1235, "rewards/accuracies": 1.0, "rewards/chosen": 2.4248948097229004, "rewards/margins": 0.40975499153137207, "rewards/rejected": 2.0151398181915283, "step": 9170 }, { "epoch": 1.49, "learning_rate": 3.924218367525201e-07, "logits/chosen": -0.6849044561386108, "logits/rejected": -0.23670966923236847, "logps/chosen": -174.94337463378906, "logps/rejected": -130.42169189453125, "loss": 2.4039, "rewards/accuracies": 0.0, "rewards/chosen": 0.5276764035224915, "rewards/margins": -1.6994431018829346, "rewards/rejected": 2.2271194458007812, "step": 9171 }, { "epoch": 1.49, "learning_rate": 3.922934931074717e-07, "logits/chosen": -0.725695788860321, "logits/rejected": -0.6383082270622253, "logps/chosen": -53.3061637878418, "logps/rejected": -53.277587890625, "loss": 0.1819, "rewards/accuracies": 1.0, "rewards/chosen": 2.741647720336914, "rewards/margins": 1.717359185218811, "rewards/rejected": 1.024288535118103, "step": 9172 }, { "epoch": 1.49, "learning_rate": 3.9216515690392376e-07, "logits/chosen": -0.33051538467407227, "logits/rejected": -0.3224339187145233, "logps/chosen": -89.81302642822266, "logps/rejected": -138.5905303955078, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 1.0830596685409546, "rewards/margins": -0.08025670051574707, "rewards/rejected": 1.1633163690567017, "step": 9173 }, { "epoch": 1.49, "learning_rate": 3.920368281507431e-07, "logits/chosen": -0.7070522904396057, "logits/rejected": -0.7074733376502991, "logps/chosen": -5.038484573364258, "logps/rejected": -15.872530937194824, "loss": 4.0449, "rewards/accuracies": 1.0, "rewards/chosen": 0.35253411531448364, "rewards/margins": 3.4809112548828125e-05, "rewards/rejected": 0.3524993062019348, "step": 9174 }, { "epoch": 1.49, "learning_rate": 3.9190850685679617e-07, "logits/chosen": -0.6403971314430237, "logits/rejected": -0.6383908987045288, "logps/chosen": -57.95684051513672, "logps/rejected": -85.15673828125, "loss": 1.7738, "rewards/accuracies": 1.0, "rewards/chosen": 1.149797797203064, "rewards/margins": 0.021282196044921875, "rewards/rejected": 1.128515601158142, "step": 9175 }, { "epoch": 1.49, "learning_rate": 3.9178019303094856e-07, "logits/chosen": -0.41218605637550354, "logits/rejected": -0.38313624262809753, "logps/chosen": -105.7675552368164, "logps/rejected": -65.23518371582031, "loss": 0.6422, "rewards/accuracies": 0.0, "rewards/chosen": 0.5066978335380554, "rewards/margins": -0.043318986892700195, "rewards/rejected": 0.5500168204307556, "step": 9176 }, { "epoch": 1.49, "learning_rate": 3.9165188668206566e-07, "logits/chosen": -0.8846073150634766, "logits/rejected": -0.8845259547233582, "logps/chosen": -105.06401062011719, "logps/rejected": -48.87739562988281, "loss": 0.2099, "rewards/accuracies": 1.0, "rewards/chosen": 3.0680558681488037, "rewards/margins": 0.8042199611663818, "rewards/rejected": 2.263835906982422, "step": 9177 }, { "epoch": 1.49, "learning_rate": 3.915235878190122e-07, "logits/chosen": -0.646425187587738, "logits/rejected": -0.646425187587738, "logps/chosen": -60.80280685424805, "logps/rejected": -60.80280685424805, "loss": 0.353, "rewards/accuracies": 0.0, "rewards/chosen": 1.8808544874191284, "rewards/margins": 0.0, "rewards/rejected": 1.8808544874191284, "step": 9178 }, { "epoch": 1.49, "learning_rate": 3.913952964506524e-07, "logits/chosen": -1.19016695022583, "logits/rejected": -1.0691273212432861, "logps/chosen": -88.86702728271484, "logps/rejected": -67.02644348144531, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 6.5578742027282715, "rewards/margins": 4.13144588470459, "rewards/rejected": 2.4264283180236816, "step": 9179 }, { "epoch": 1.49, "learning_rate": 3.9126701258584996e-07, "logits/chosen": -1.24973464012146, "logits/rejected": -1.1451669931411743, "logps/chosen": -139.15626525878906, "logps/rejected": -46.37405014038086, "loss": 0.1055, "rewards/accuracies": 1.0, "rewards/chosen": 4.829415798187256, "rewards/margins": 3.8123955726623535, "rewards/rejected": 1.0170201063156128, "step": 9180 }, { "epoch": 1.49, "learning_rate": 3.911387362334682e-07, "logits/chosen": -0.6049637198448181, "logits/rejected": -0.41017842292785645, "logps/chosen": -26.705278396606445, "logps/rejected": -97.49166870117188, "loss": 1.9526, "rewards/accuracies": 0.0, "rewards/chosen": 0.7557344436645508, "rewards/margins": -3.8737120628356934, "rewards/rejected": 4.629446506500244, "step": 9181 }, { "epoch": 1.49, "learning_rate": 3.910104674023696e-07, "logits/chosen": -0.7329900860786438, "logits/rejected": -0.7420730590820312, "logps/chosen": -58.3873291015625, "logps/rejected": -59.46513748168945, "loss": 1.0671, "rewards/accuracies": 1.0, "rewards/chosen": 2.0979630947113037, "rewards/margins": 0.0346989631652832, "rewards/rejected": 2.0632641315460205, "step": 9182 }, { "epoch": 1.49, "learning_rate": 3.908822061014165e-07, "logits/chosen": -0.4104085862636566, "logits/rejected": -0.4083397686481476, "logps/chosen": -61.334129333496094, "logps/rejected": -114.75286865234375, "loss": 1.4937, "rewards/accuracies": 0.0, "rewards/chosen": 1.8924843072891235, "rewards/margins": -2.716430187225342, "rewards/rejected": 4.608914375305176, "step": 9183 }, { "epoch": 1.49, "learning_rate": 3.907539523394704e-07, "logits/chosen": -0.9307522177696228, "logits/rejected": -0.669600784778595, "logps/chosen": -205.49826049804688, "logps/rejected": -25.953210830688477, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 3.7579803466796875, "rewards/margins": 3.3114871978759766, "rewards/rejected": 0.44649314880371094, "step": 9184 }, { "epoch": 1.49, "learning_rate": 3.9062570612539257e-07, "logits/chosen": -0.9532821178436279, "logits/rejected": -0.8340689539909363, "logps/chosen": -204.21658325195312, "logps/rejected": -13.6140775680542, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 4.916806221008301, "rewards/margins": 4.372082233428955, "rewards/rejected": 0.5447239875793457, "step": 9185 }, { "epoch": 1.49, "learning_rate": 3.904974674680435e-07, "logits/chosen": -0.8130156993865967, "logits/rejected": -0.7636575102806091, "logps/chosen": -102.33897399902344, "logps/rejected": -44.17036437988281, "loss": 0.4869, "rewards/accuracies": 1.0, "rewards/chosen": 1.4945954084396362, "rewards/margins": 0.29319918155670166, "rewards/rejected": 1.2013962268829346, "step": 9186 }, { "epoch": 1.49, "learning_rate": 3.903692363762833e-07, "logits/chosen": -1.1017184257507324, "logits/rejected": -1.0266449451446533, "logps/chosen": -120.4600830078125, "logps/rejected": -39.986839294433594, "loss": 0.2067, "rewards/accuracies": 1.0, "rewards/chosen": 1.455043077468872, "rewards/margins": 1.201978325843811, "rewards/rejected": 0.25306472182273865, "step": 9187 }, { "epoch": 1.49, "learning_rate": 3.902410128589716e-07, "logits/chosen": -0.8951634168624878, "logits/rejected": -0.8755595684051514, "logps/chosen": -86.73751831054688, "logps/rejected": -113.03840637207031, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 1.1687606573104858, "rewards/margins": 0.5271972417831421, "rewards/rejected": 0.6415634155273438, "step": 9188 }, { "epoch": 1.49, "learning_rate": 3.9011279692496736e-07, "logits/chosen": -0.6137470006942749, "logits/rejected": -0.5161790251731873, "logps/chosen": -50.62200164794922, "logps/rejected": -8.848785400390625, "loss": 0.1719, "rewards/accuracies": 1.0, "rewards/chosen": 2.3408432006835938, "rewards/margins": 1.3791497945785522, "rewards/rejected": 0.9616934061050415, "step": 9189 }, { "epoch": 1.49, "learning_rate": 3.89984588583129e-07, "logits/chosen": -0.7401942014694214, "logits/rejected": -0.7450754642486572, "logps/chosen": -121.3570556640625, "logps/rejected": -62.406768798828125, "loss": 0.4566, "rewards/accuracies": 0.0, "rewards/chosen": 1.4697494506835938, "rewards/margins": -0.15256118774414062, "rewards/rejected": 1.6223106384277344, "step": 9190 }, { "epoch": 1.49, "learning_rate": 3.8985638784231465e-07, "logits/chosen": -0.8710904717445374, "logits/rejected": -0.8620815873146057, "logps/chosen": -165.37237548828125, "logps/rejected": -101.74446105957031, "loss": 0.5402, "rewards/accuracies": 0.0, "rewards/chosen": 2.853463888168335, "rewards/margins": -0.6296005249023438, "rewards/rejected": 3.4830644130706787, "step": 9191 }, { "epoch": 1.49, "learning_rate": 3.897281947113817e-07, "logits/chosen": -0.5171758532524109, "logits/rejected": -0.3652268350124359, "logps/chosen": -115.64014434814453, "logps/rejected": -72.16871643066406, "loss": 0.0882, "rewards/accuracies": 1.0, "rewards/chosen": 3.398120880126953, "rewards/margins": 1.6635528802871704, "rewards/rejected": 1.7345679998397827, "step": 9192 }, { "epoch": 1.49, "learning_rate": 3.8960000919918713e-07, "logits/chosen": -0.7682496309280396, "logits/rejected": -0.7352158427238464, "logps/chosen": -66.49642944335938, "logps/rejected": -92.73617553710938, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": 3.117732286453247, "rewards/margins": 0.9787240028381348, "rewards/rejected": 2.1390082836151123, "step": 9193 }, { "epoch": 1.49, "learning_rate": 3.894718313145872e-07, "logits/chosen": -0.4896641671657562, "logits/rejected": -0.4903916120529175, "logps/chosen": -95.07887268066406, "logps/rejected": -81.7105941772461, "loss": 1.2427, "rewards/accuracies": 0.0, "rewards/chosen": 0.9103042483329773, "rewards/margins": -1.5185966491699219, "rewards/rejected": 2.428900957107544, "step": 9194 }, { "epoch": 1.49, "learning_rate": 3.89343661066438e-07, "logits/chosen": -0.7925365567207336, "logits/rejected": -0.7464592456817627, "logps/chosen": -62.543174743652344, "logps/rejected": -64.02979278564453, "loss": 0.4074, "rewards/accuracies": 0.0, "rewards/chosen": 1.6854546070098877, "rewards/margins": -0.2046874761581421, "rewards/rejected": 1.8901420831680298, "step": 9195 }, { "epoch": 1.49, "learning_rate": 3.8921549846359475e-07, "logits/chosen": -0.6892961263656616, "logits/rejected": -0.7717104554176331, "logps/chosen": -86.75663757324219, "logps/rejected": -117.89903259277344, "loss": 0.441, "rewards/accuracies": 0.0, "rewards/chosen": 1.3529938459396362, "rewards/margins": -0.3128570318222046, "rewards/rejected": 1.6658508777618408, "step": 9196 }, { "epoch": 1.49, "learning_rate": 3.8908734351491233e-07, "logits/chosen": -1.0415680408477783, "logits/rejected": -0.9314557909965515, "logps/chosen": -103.14143371582031, "logps/rejected": -20.16184425354004, "loss": 0.7631, "rewards/accuracies": 1.0, "rewards/chosen": 4.907383918762207, "rewards/margins": 4.508580207824707, "rewards/rejected": 0.3988039195537567, "step": 9197 }, { "epoch": 1.49, "learning_rate": 3.88959196229245e-07, "logits/chosen": -0.809603750705719, "logits/rejected": -0.6995055675506592, "logps/chosen": -149.87240600585938, "logps/rejected": -50.31343078613281, "loss": 0.2609, "rewards/accuracies": 1.0, "rewards/chosen": 3.778002977371216, "rewards/margins": 1.285832166671753, "rewards/rejected": 2.492170810699463, "step": 9198 }, { "epoch": 1.49, "learning_rate": 3.8883105661544645e-07, "logits/chosen": -0.7450310587882996, "logits/rejected": -0.6872192025184631, "logps/chosen": -95.42268371582031, "logps/rejected": -72.17894744873047, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 5.017125129699707, "rewards/margins": 2.8296501636505127, "rewards/rejected": 2.1874749660491943, "step": 9199 }, { "epoch": 1.49, "learning_rate": 3.887029246823701e-07, "logits/chosen": 0.0402250736951828, "logits/rejected": 0.0402250736951828, "logps/chosen": -49.36643600463867, "logps/rejected": -49.36643600463867, "loss": 0.7501, "rewards/accuracies": 0.0, "rewards/chosen": 0.1577068418264389, "rewards/margins": 0.0, "rewards/rejected": 0.1577068418264389, "step": 9200 }, { "epoch": 1.49, "learning_rate": 3.885748004388685e-07, "logits/chosen": -0.8895453810691833, "logits/rejected": -0.6967215538024902, "logps/chosen": -222.79275512695312, "logps/rejected": -72.02395629882812, "loss": 0.1492, "rewards/accuracies": 1.0, "rewards/chosen": 1.681921362876892, "rewards/margins": 1.2497893571853638, "rewards/rejected": 0.43213197588920593, "step": 9201 }, { "epoch": 1.49, "learning_rate": 3.8844668389379393e-07, "logits/chosen": -0.6982597708702087, "logits/rejected": -0.6982597708702087, "logps/chosen": -70.29866790771484, "logps/rejected": -70.29866790771484, "loss": 0.3736, "rewards/accuracies": 0.0, "rewards/chosen": 1.9438087940216064, "rewards/margins": 0.0, "rewards/rejected": 1.9438087940216064, "step": 9202 }, { "epoch": 1.49, "learning_rate": 3.883185750559978e-07, "logits/chosen": -0.897071897983551, "logits/rejected": -0.7903367280960083, "logps/chosen": -98.68074035644531, "logps/rejected": -17.90135955810547, "loss": 0.1885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8755539059638977, "rewards/margins": 0.7910465598106384, "rewards/rejected": 0.08450736850500107, "step": 9203 }, { "epoch": 1.49, "learning_rate": 3.8819047393433156e-07, "logits/chosen": -0.7996545433998108, "logits/rejected": -0.7265851497650146, "logps/chosen": -90.08491516113281, "logps/rejected": -30.37145233154297, "loss": 0.1525, "rewards/accuracies": 1.0, "rewards/chosen": 1.4592026472091675, "rewards/margins": 1.2094639539718628, "rewards/rejected": 0.2497386932373047, "step": 9204 }, { "epoch": 1.49, "learning_rate": 3.880623805376456e-07, "logits/chosen": -0.1341562271118164, "logits/rejected": -0.07391638308763504, "logps/chosen": -56.145751953125, "logps/rejected": -1.4338399171829224, "loss": 0.4505, "rewards/accuracies": 0.0, "rewards/chosen": 0.2264305204153061, "rewards/margins": -0.1838105171918869, "rewards/rejected": 0.410241037607193, "step": 9205 }, { "epoch": 1.49, "learning_rate": 3.8793429487478997e-07, "logits/chosen": -0.5175990462303162, "logits/rejected": -0.4906800091266632, "logps/chosen": -58.51861572265625, "logps/rejected": -59.026851654052734, "loss": 0.8262, "rewards/accuracies": 0.0, "rewards/chosen": 2.1939544677734375, "rewards/margins": -0.01905035972595215, "rewards/rejected": 2.2130048274993896, "step": 9206 }, { "epoch": 1.49, "learning_rate": 3.878062169546142e-07, "logits/chosen": -0.8185654878616333, "logits/rejected": -0.7370501160621643, "logps/chosen": -87.17201232910156, "logps/rejected": -45.71559143066406, "loss": 0.1156, "rewards/accuracies": 1.0, "rewards/chosen": 1.2361602783203125, "rewards/margins": 1.3835922479629517, "rewards/rejected": -0.14743195474147797, "step": 9207 }, { "epoch": 1.49, "learning_rate": 3.876781467859672e-07, "logits/chosen": -1.1109893321990967, "logits/rejected": -0.9316175580024719, "logps/chosen": -140.83961486816406, "logps/rejected": -37.9774169921875, "loss": 0.02, "rewards/accuracies": 1.0, "rewards/chosen": 4.341746807098389, "rewards/margins": 4.042901039123535, "rewards/rejected": 0.29884567856788635, "step": 9208 }, { "epoch": 1.49, "learning_rate": 3.8755008437769754e-07, "logits/chosen": -0.5639035701751709, "logits/rejected": -0.4800279140472412, "logps/chosen": -42.6841926574707, "logps/rejected": -69.139404296875, "loss": 0.568, "rewards/accuracies": 0.0, "rewards/chosen": 2.1524930000305176, "rewards/margins": -0.7045314311981201, "rewards/rejected": 2.8570244312286377, "step": 9209 }, { "epoch": 1.49, "learning_rate": 3.87422029738653e-07, "logits/chosen": -0.6444395184516907, "logits/rejected": -0.6153914332389832, "logps/chosen": -103.6375961303711, "logps/rejected": -92.5869140625, "loss": 0.5222, "rewards/accuracies": 1.0, "rewards/chosen": 1.1274932622909546, "rewards/margins": 1.193939208984375, "rewards/rejected": -0.06644592434167862, "step": 9210 }, { "epoch": 1.5, "learning_rate": 3.8729398287768094e-07, "logits/chosen": -0.742573082447052, "logits/rejected": -0.6571233868598938, "logps/chosen": -90.88348388671875, "logps/rejected": -100.00059509277344, "loss": 0.1203, "rewards/accuracies": 1.0, "rewards/chosen": 4.977084636688232, "rewards/margins": 2.050255060195923, "rewards/rejected": 2.9268295764923096, "step": 9211 }, { "epoch": 1.5, "learning_rate": 3.8716594380362827e-07, "logits/chosen": -0.9488999247550964, "logits/rejected": -0.9860779643058777, "logps/chosen": -65.16717529296875, "logps/rejected": -161.31869506835938, "loss": 0.5697, "rewards/accuracies": 0.0, "rewards/chosen": 5.302820682525635, "rewards/margins": -0.7516622543334961, "rewards/rejected": 6.054482936859131, "step": 9212 }, { "epoch": 1.5, "learning_rate": 3.870379125253412e-07, "logits/chosen": -0.5111910700798035, "logits/rejected": -0.4161599278450012, "logps/chosen": -58.39265060424805, "logps/rejected": -37.175392150878906, "loss": 0.4511, "rewards/accuracies": 0.0, "rewards/chosen": 1.531576156616211, "rewards/margins": -0.3658936023712158, "rewards/rejected": 1.8974697589874268, "step": 9213 }, { "epoch": 1.5, "learning_rate": 3.869098890516656e-07, "logits/chosen": -0.7298460006713867, "logits/rejected": -0.7207040786743164, "logps/chosen": -45.22304916381836, "logps/rejected": -72.90978240966797, "loss": 0.7616, "rewards/accuracies": 1.0, "rewards/chosen": 1.376699447631836, "rewards/margins": 0.11895942687988281, "rewards/rejected": 1.2577400207519531, "step": 9214 }, { "epoch": 1.5, "learning_rate": 3.867818733914466e-07, "logits/chosen": -0.2901815176010132, "logits/rejected": -0.2878457009792328, "logps/chosen": -39.22100067138672, "logps/rejected": -44.14830017089844, "loss": 0.5331, "rewards/accuracies": 0.0, "rewards/chosen": 1.143225908279419, "rewards/margins": -0.6124950647354126, "rewards/rejected": 1.7557209730148315, "step": 9215 }, { "epoch": 1.5, "learning_rate": 3.8665386555352875e-07, "logits/chosen": -0.938618540763855, "logits/rejected": -0.7760571837425232, "logps/chosen": -82.98693084716797, "logps/rejected": -70.72711944580078, "loss": 0.7187, "rewards/accuracies": 1.0, "rewards/chosen": 2.6333580017089844, "rewards/margins": 0.9798355102539062, "rewards/rejected": 1.6535224914550781, "step": 9216 }, { "epoch": 1.5, "learning_rate": 3.8652586554675637e-07, "logits/chosen": -0.613007128238678, "logits/rejected": -0.5163911581039429, "logps/chosen": -57.95348358154297, "logps/rejected": -53.703041076660156, "loss": 0.2279, "rewards/accuracies": 1.0, "rewards/chosen": 1.2813042402267456, "rewards/margins": 0.6845856308937073, "rewards/rejected": 0.5967186093330383, "step": 9217 }, { "epoch": 1.5, "learning_rate": 3.86397873379973e-07, "logits/chosen": -0.43387097120285034, "logits/rejected": -0.37701669335365295, "logps/chosen": -53.40388488769531, "logps/rejected": -62.648502349853516, "loss": 0.1895, "rewards/accuracies": 1.0, "rewards/chosen": 1.6527084112167358, "rewards/margins": 0.9601176977157593, "rewards/rejected": 0.6925907135009766, "step": 9218 }, { "epoch": 1.5, "learning_rate": 3.862698890620216e-07, "logits/chosen": -0.3783048391342163, "logits/rejected": -0.337510347366333, "logps/chosen": -40.72362518310547, "logps/rejected": -24.508926391601562, "loss": 0.8559, "rewards/accuracies": 0.0, "rewards/chosen": 1.2197998762130737, "rewards/margins": -0.8510452508926392, "rewards/rejected": 2.070845127105713, "step": 9219 }, { "epoch": 1.5, "learning_rate": 3.8614191260174485e-07, "logits/chosen": -0.6059946417808533, "logits/rejected": -0.579167366027832, "logps/chosen": -50.418636322021484, "logps/rejected": -19.099346160888672, "loss": 0.6267, "rewards/accuracies": 1.0, "rewards/chosen": 1.3310989141464233, "rewards/margins": 1.072226881980896, "rewards/rejected": 0.25887203216552734, "step": 9220 }, { "epoch": 1.5, "learning_rate": 3.860139440079845e-07, "logits/chosen": -1.1973724365234375, "logits/rejected": -1.0540536642074585, "logps/chosen": -132.54241943359375, "logps/rejected": -20.871654510498047, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 8.212717056274414, "rewards/margins": 7.679820537567139, "rewards/rejected": 0.5328964591026306, "step": 9221 }, { "epoch": 1.5, "learning_rate": 3.8588598328958215e-07, "logits/chosen": -1.7487138509750366, "logits/rejected": -1.8078681230545044, "logps/chosen": -54.590492248535156, "logps/rejected": -23.05948257446289, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": 1.4131546020507812, "rewards/margins": 1.1297540664672852, "rewards/rejected": 0.2834005355834961, "step": 9222 }, { "epoch": 1.5, "learning_rate": 3.857580304553786e-07, "logits/chosen": -1.032273530960083, "logits/rejected": -0.9531127214431763, "logps/chosen": -48.09795379638672, "logps/rejected": -45.4219970703125, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 2.4606850147247314, "rewards/margins": 0.2942543029785156, "rewards/rejected": 2.166430711746216, "step": 9223 }, { "epoch": 1.5, "learning_rate": 3.856300855142141e-07, "logits/chosen": -0.9897327423095703, "logits/rejected": -0.9607491493225098, "logps/chosen": -221.9731903076172, "logps/rejected": -128.58578491210938, "loss": 1.0738, "rewards/accuracies": 0.0, "rewards/chosen": 2.689863681793213, "rewards/margins": -1.980787754058838, "rewards/rejected": 4.670651435852051, "step": 9224 }, { "epoch": 1.5, "learning_rate": 3.855021484749286e-07, "logits/chosen": -1.1900238990783691, "logits/rejected": -1.1572836637496948, "logps/chosen": -119.67517852783203, "logps/rejected": -37.833011627197266, "loss": 0.318, "rewards/accuracies": 1.0, "rewards/chosen": 1.2742912769317627, "rewards/margins": 1.008357286453247, "rewards/rejected": 0.2659339904785156, "step": 9225 }, { "epoch": 1.5, "learning_rate": 3.8537421934636116e-07, "logits/chosen": -1.0455775260925293, "logits/rejected": -1.0343575477600098, "logps/chosen": -45.7525749206543, "logps/rejected": -49.4356689453125, "loss": 0.8663, "rewards/accuracies": 1.0, "rewards/chosen": 1.5830109119415283, "rewards/margins": 0.034847259521484375, "rewards/rejected": 1.548163652420044, "step": 9226 }, { "epoch": 1.5, "learning_rate": 3.852462981373506e-07, "logits/chosen": -0.871806263923645, "logits/rejected": -0.8333057761192322, "logps/chosen": -156.45770263671875, "logps/rejected": -168.72418212890625, "loss": 2.6843, "rewards/accuracies": 0.0, "rewards/chosen": 0.20206299424171448, "rewards/margins": -5.021675109863281, "rewards/rejected": 5.223738193511963, "step": 9227 }, { "epoch": 1.5, "learning_rate": 3.8511838485673503e-07, "logits/chosen": -0.7832803130149841, "logits/rejected": -0.7652430534362793, "logps/chosen": -54.56536102294922, "logps/rejected": -65.18108367919922, "loss": 0.3177, "rewards/accuracies": 1.0, "rewards/chosen": 2.073667287826538, "rewards/margins": 0.1319664716720581, "rewards/rejected": 1.94170081615448, "step": 9228 }, { "epoch": 1.5, "learning_rate": 3.8499047951335205e-07, "logits/chosen": -0.4739719331264496, "logits/rejected": -0.4034121036529541, "logps/chosen": -75.34390258789062, "logps/rejected": -148.6912384033203, "loss": 0.4369, "rewards/accuracies": 1.0, "rewards/chosen": 0.748760998249054, "rewards/margins": 1.0341583490371704, "rewards/rejected": -0.28539735078811646, "step": 9229 }, { "epoch": 1.5, "learning_rate": 3.848625821160387e-07, "logits/chosen": -0.8720961809158325, "logits/rejected": -0.7771822810173035, "logps/chosen": -40.046443939208984, "logps/rejected": -53.99871063232422, "loss": 0.7269, "rewards/accuracies": 1.0, "rewards/chosen": 0.8713222742080688, "rewards/margins": 0.24545329809188843, "rewards/rejected": 0.6258689761161804, "step": 9230 }, { "epoch": 1.5, "learning_rate": 3.847346926736315e-07, "logits/chosen": -0.8470531105995178, "logits/rejected": -0.8589715957641602, "logps/chosen": -75.93537902832031, "logps/rejected": -83.9684829711914, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 2.3391997814178467, "rewards/margins": -0.90561842918396, "rewards/rejected": 3.2448182106018066, "step": 9231 }, { "epoch": 1.5, "learning_rate": 3.846068111949664e-07, "logits/chosen": -0.8337647318840027, "logits/rejected": -0.7580335140228271, "logps/chosen": -41.2964973449707, "logps/rejected": -22.520246505737305, "loss": 0.2523, "rewards/accuracies": 1.0, "rewards/chosen": 2.234575033187866, "rewards/margins": 1.7175939083099365, "rewards/rejected": 0.5169811248779297, "step": 9232 }, { "epoch": 1.5, "learning_rate": 3.8447893768887873e-07, "logits/chosen": -1.3743600845336914, "logits/rejected": -1.321305751800537, "logps/chosen": -91.61469268798828, "logps/rejected": -65.27943420410156, "loss": 0.874, "rewards/accuracies": 0.0, "rewards/chosen": 2.0519912242889404, "rewards/margins": -0.36631011962890625, "rewards/rejected": 2.4183013439178467, "step": 9233 }, { "epoch": 1.5, "learning_rate": 3.8435107216420347e-07, "logits/chosen": -0.7995764017105103, "logits/rejected": -0.7273313999176025, "logps/chosen": -155.34117126464844, "logps/rejected": -129.149658203125, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 6.73388671875, "rewards/margins": 3.3892898559570312, "rewards/rejected": 3.3445968627929688, "step": 9234 }, { "epoch": 1.5, "learning_rate": 3.842232146297748e-07, "logits/chosen": -1.1531025171279907, "logits/rejected": -1.0784316062927246, "logps/chosen": -283.82098388671875, "logps/rejected": -77.94401550292969, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 6.254931926727295, "rewards/margins": 3.175459623336792, "rewards/rejected": 3.079472303390503, "step": 9235 }, { "epoch": 1.5, "learning_rate": 3.840953650944265e-07, "logits/chosen": -0.3633961081504822, "logits/rejected": -0.4230410158634186, "logps/chosen": -52.27965545654297, "logps/rejected": -81.88262939453125, "loss": 0.82, "rewards/accuracies": 1.0, "rewards/chosen": 0.5315384268760681, "rewards/margins": 0.6370697617530823, "rewards/rejected": -0.10553131252527237, "step": 9236 }, { "epoch": 1.5, "learning_rate": 3.839675235669918e-07, "logits/chosen": -0.774436891078949, "logits/rejected": -0.6605539321899414, "logps/chosen": -160.56906127929688, "logps/rejected": -38.033424377441406, "loss": 0.3915, "rewards/accuracies": 1.0, "rewards/chosen": 4.207159519195557, "rewards/margins": 3.2658543586730957, "rewards/rejected": 0.9413051605224609, "step": 9237 }, { "epoch": 1.5, "learning_rate": 3.838396900563032e-07, "logits/chosen": -0.4723682999610901, "logits/rejected": -0.45510461926460266, "logps/chosen": -45.20744705200195, "logps/rejected": -64.17710876464844, "loss": 0.3542, "rewards/accuracies": 1.0, "rewards/chosen": 1.738104224205017, "rewards/margins": 0.11607122421264648, "rewards/rejected": 1.6220329999923706, "step": 9238 }, { "epoch": 1.5, "learning_rate": 3.83711864571193e-07, "logits/chosen": -0.7781736850738525, "logits/rejected": -0.8629860281944275, "logps/chosen": -176.8038787841797, "logps/rejected": -126.05057525634766, "loss": 2.2085, "rewards/accuracies": 0.0, "rewards/chosen": 0.8110061883926392, "rewards/margins": -4.356657028198242, "rewards/rejected": 5.167663097381592, "step": 9239 }, { "epoch": 1.5, "learning_rate": 3.8358404712049257e-07, "logits/chosen": -0.6585121154785156, "logits/rejected": -0.5680190324783325, "logps/chosen": -57.28522491455078, "logps/rejected": -17.43816375732422, "loss": 0.986, "rewards/accuracies": 1.0, "rewards/chosen": 2.0773651599884033, "rewards/margins": 1.8680641651153564, "rewards/rejected": 0.20930099487304688, "step": 9240 }, { "epoch": 1.5, "learning_rate": 3.8345623771303305e-07, "logits/chosen": -0.7235291600227356, "logits/rejected": -0.37495461106300354, "logps/chosen": -245.7691650390625, "logps/rejected": -137.080078125, "loss": 0.5592, "rewards/accuracies": 1.0, "rewards/chosen": 5.039306640625, "rewards/margins": 0.43221282958984375, "rewards/rejected": 4.607093811035156, "step": 9241 }, { "epoch": 1.5, "learning_rate": 3.8332843635764465e-07, "logits/chosen": -0.6533486247062683, "logits/rejected": -0.7003201246261597, "logps/chosen": -48.735145568847656, "logps/rejected": -49.945228576660156, "loss": 0.9, "rewards/accuracies": 0.0, "rewards/chosen": 0.6596130728721619, "rewards/margins": -1.2040619850158691, "rewards/rejected": 1.8636749982833862, "step": 9242 }, { "epoch": 1.5, "learning_rate": 3.8320064306315747e-07, "logits/chosen": -0.6032822132110596, "logits/rejected": -0.4578585624694824, "logps/chosen": -132.64950561523438, "logps/rejected": -20.39673614501953, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 0.5728195309638977, "rewards/margins": 0.07600250840187073, "rewards/rejected": 0.496817022562027, "step": 9243 }, { "epoch": 1.5, "learning_rate": 3.830728578384006e-07, "logits/chosen": -0.6725866198539734, "logits/rejected": -0.47338488698005676, "logps/chosen": -178.94845581054688, "logps/rejected": -88.59715270996094, "loss": 0.8104, "rewards/accuracies": 1.0, "rewards/chosen": 6.6979265213012695, "rewards/margins": 3.3124544620513916, "rewards/rejected": 3.385472059249878, "step": 9244 }, { "epoch": 1.5, "learning_rate": 3.8294508069220286e-07, "logits/chosen": -0.5561017394065857, "logits/rejected": -0.5822011828422546, "logps/chosen": -96.2396240234375, "logps/rejected": -70.820556640625, "loss": 0.976, "rewards/accuracies": 0.0, "rewards/chosen": 1.0377899408340454, "rewards/margins": -0.9752243757247925, "rewards/rejected": 2.013014316558838, "step": 9245 }, { "epoch": 1.5, "learning_rate": 3.8281731163339247e-07, "logits/chosen": -1.1463220119476318, "logits/rejected": -1.1814645528793335, "logps/chosen": -121.74604034423828, "logps/rejected": -80.60841369628906, "loss": 0.5394, "rewards/accuracies": 0.0, "rewards/chosen": 1.1638405323028564, "rewards/margins": -0.34545814990997314, "rewards/rejected": 1.5092986822128296, "step": 9246 }, { "epoch": 1.5, "learning_rate": 3.826895506707971e-07, "logits/chosen": -0.5905168056488037, "logits/rejected": -0.41943472623825073, "logps/chosen": -102.8131332397461, "logps/rejected": -110.39293670654297, "loss": 1.061, "rewards/accuracies": 0.0, "rewards/chosen": 2.625934600830078, "rewards/margins": -1.9912567138671875, "rewards/rejected": 4.617191314697266, "step": 9247 }, { "epoch": 1.5, "learning_rate": 3.8256179781324376e-07, "logits/chosen": -0.7501330971717834, "logits/rejected": -0.8202295899391174, "logps/chosen": -300.5863037109375, "logps/rejected": -73.66407012939453, "loss": 1.0818, "rewards/accuracies": 1.0, "rewards/chosen": 2.996746778488159, "rewards/margins": 0.646080732345581, "rewards/rejected": 2.350666046142578, "step": 9248 }, { "epoch": 1.5, "learning_rate": 3.82434053069559e-07, "logits/chosen": -0.7350068688392639, "logits/rejected": -0.677157461643219, "logps/chosen": -131.32083129882812, "logps/rejected": -181.21682739257812, "loss": 0.6279, "rewards/accuracies": 0.0, "rewards/chosen": 5.879126071929932, "rewards/margins": -0.07704448699951172, "rewards/rejected": 5.956170558929443, "step": 9249 }, { "epoch": 1.5, "learning_rate": 3.823063164485687e-07, "logits/chosen": -0.8499850630760193, "logits/rejected": -0.9106261730194092, "logps/chosen": -291.2978210449219, "logps/rejected": -203.28604125976562, "loss": 1.2175, "rewards/accuracies": 0.0, "rewards/chosen": 5.05584716796875, "rewards/margins": -2.328967571258545, "rewards/rejected": 7.384814739227295, "step": 9250 }, { "epoch": 1.5, "learning_rate": 3.821785879590984e-07, "logits/chosen": -0.9413585662841797, "logits/rejected": -0.9372779130935669, "logps/chosen": -222.14306640625, "logps/rejected": -167.7455291748047, "loss": 0.5125, "rewards/accuracies": 1.0, "rewards/chosen": 5.147571086883545, "rewards/margins": 1.2814288139343262, "rewards/rejected": 3.8661422729492188, "step": 9251 }, { "epoch": 1.5, "learning_rate": 3.8205086760997276e-07, "logits/chosen": -0.3566586971282959, "logits/rejected": -0.3097761273384094, "logps/chosen": -63.56157302856445, "logps/rejected": -43.65595245361328, "loss": 0.5507, "rewards/accuracies": 0.0, "rewards/chosen": 1.4093456268310547, "rewards/margins": -0.40533947944641113, "rewards/rejected": 1.8146851062774658, "step": 9252 }, { "epoch": 1.5, "learning_rate": 3.819231554100161e-07, "logits/chosen": -0.9424208998680115, "logits/rejected": -0.8648868799209595, "logps/chosen": -64.68827056884766, "logps/rejected": -84.39972686767578, "loss": 0.4432, "rewards/accuracies": 0.0, "rewards/chosen": 1.9316627979278564, "rewards/margins": -0.2967560291290283, "rewards/rejected": 2.2284188270568848, "step": 9253 }, { "epoch": 1.5, "learning_rate": 3.817954513680524e-07, "logits/chosen": -0.5530639886856079, "logits/rejected": -0.624919056892395, "logps/chosen": -80.12256622314453, "logps/rejected": -91.6798095703125, "loss": 0.9216, "rewards/accuracies": 0.0, "rewards/chosen": 1.67314612865448, "rewards/margins": -1.6359528303146362, "rewards/rejected": 3.309098958969116, "step": 9254 }, { "epoch": 1.5, "learning_rate": 3.816677554929043e-07, "logits/chosen": -1.3749808073043823, "logits/rejected": -1.4043539762496948, "logps/chosen": -187.35748291015625, "logps/rejected": -57.28887939453125, "loss": 0.2568, "rewards/accuracies": 1.0, "rewards/chosen": 0.9858734011650085, "rewards/margins": 0.5064010620117188, "rewards/rejected": 0.4794723689556122, "step": 9255 }, { "epoch": 1.5, "learning_rate": 3.815400677933947e-07, "logits/chosen": -0.8296961784362793, "logits/rejected": -0.7296993136405945, "logps/chosen": -95.50648498535156, "logps/rejected": -44.21204376220703, "loss": 1.049, "rewards/accuracies": 1.0, "rewards/chosen": 4.157823085784912, "rewards/margins": 1.2937192916870117, "rewards/rejected": 2.8641037940979004, "step": 9256 }, { "epoch": 1.5, "learning_rate": 3.8141238827834546e-07, "logits/chosen": -0.81766277551651, "logits/rejected": -0.81766277551651, "logps/chosen": -1.8541938066482544, "logps/rejected": -1.8541938066482544, "loss": 0.6658, "rewards/accuracies": 0.0, "rewards/chosen": 0.6688739657402039, "rewards/margins": 0.0, "rewards/rejected": 0.6688739657402039, "step": 9257 }, { "epoch": 1.5, "learning_rate": 3.812847169565782e-07, "logits/chosen": -0.8017877340316772, "logits/rejected": -0.8149999976158142, "logps/chosen": -133.93142700195312, "logps/rejected": -84.1892318725586, "loss": 1.1271, "rewards/accuracies": 0.0, "rewards/chosen": 3.549368381500244, "rewards/margins": -2.1322364807128906, "rewards/rejected": 5.681604862213135, "step": 9258 }, { "epoch": 1.5, "learning_rate": 3.811570538369135e-07, "logits/chosen": -0.22890324890613556, "logits/rejected": -0.27725088596343994, "logps/chosen": -22.64291000366211, "logps/rejected": -48.44763946533203, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 0.5337499976158142, "rewards/margins": 0.3101850748062134, "rewards/rejected": 0.22356490790843964, "step": 9259 }, { "epoch": 1.5, "learning_rate": 3.81029398928172e-07, "logits/chosen": -0.6039168834686279, "logits/rejected": -0.5830188989639282, "logps/chosen": -20.227680206298828, "logps/rejected": -20.704986572265625, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1021515130996704, "rewards/margins": 0.2052200436592102, "rewards/rejected": 0.8969314694404602, "step": 9260 }, { "epoch": 1.5, "learning_rate": 3.809017522391734e-07, "logits/chosen": -1.057177186012268, "logits/rejected": -1.0951566696166992, "logps/chosen": -77.23918151855469, "logps/rejected": -90.91905212402344, "loss": 2.3298, "rewards/accuracies": 0.0, "rewards/chosen": 2.1290206909179688, "rewards/margins": -4.234625339508057, "rewards/rejected": 6.363646030426025, "step": 9261 }, { "epoch": 1.5, "learning_rate": 3.807741137787367e-07, "logits/chosen": -0.3919730484485626, "logits/rejected": -0.3888913094997406, "logps/chosen": -6.550205707550049, "logps/rejected": -1.5571237802505493, "loss": 1.1259, "rewards/accuracies": 0.0, "rewards/chosen": 0.3601597249507904, "rewards/margins": -0.16102656722068787, "rewards/rejected": 0.5211862921714783, "step": 9262 }, { "epoch": 1.5, "learning_rate": 3.806464835556806e-07, "logits/chosen": -0.43854135274887085, "logits/rejected": -0.5967606902122498, "logps/chosen": -75.04708099365234, "logps/rejected": -130.1267547607422, "loss": 1.6045, "rewards/accuracies": 0.0, "rewards/chosen": 2.9591147899627686, "rewards/margins": -0.8016242980957031, "rewards/rejected": 3.7607390880584717, "step": 9263 }, { "epoch": 1.5, "learning_rate": 3.8051886157882327e-07, "logits/chosen": -0.6954123973846436, "logits/rejected": -0.6744985580444336, "logps/chosen": -97.17719268798828, "logps/rejected": -126.34847259521484, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": 2.0364837646484375, "rewards/margins": 0.4367408752441406, "rewards/rejected": 1.5997428894042969, "step": 9264 }, { "epoch": 1.5, "learning_rate": 3.80391247856982e-07, "logits/chosen": -0.6778455376625061, "logits/rejected": -0.6834543347358704, "logps/chosen": -89.66300964355469, "logps/rejected": -111.15103149414062, "loss": 0.8047, "rewards/accuracies": 0.0, "rewards/chosen": 1.6372970342636108, "rewards/margins": -0.006683349609375, "rewards/rejected": 1.6439803838729858, "step": 9265 }, { "epoch": 1.5, "learning_rate": 3.802636423989737e-07, "logits/chosen": -0.66297447681427, "logits/rejected": -0.6569448709487915, "logps/chosen": -99.2273178100586, "logps/rejected": -118.61912536621094, "loss": 1.7078, "rewards/accuracies": 0.0, "rewards/chosen": 2.101612091064453, "rewards/margins": -2.1329355239868164, "rewards/rejected": 4.2345476150512695, "step": 9266 }, { "epoch": 1.5, "learning_rate": 3.8013604521361484e-07, "logits/chosen": -0.2930378019809723, "logits/rejected": -0.29857927560806274, "logps/chosen": -11.673955917358398, "logps/rejected": -1.2486019134521484, "loss": 0.4256, "rewards/accuracies": 0.0, "rewards/chosen": 0.05516338348388672, "rewards/margins": -0.2927328050136566, "rewards/rejected": 0.34789618849754333, "step": 9267 }, { "epoch": 1.5, "learning_rate": 3.8000845630972113e-07, "logits/chosen": -0.47859564423561096, "logits/rejected": -0.47859564423561096, "logps/chosen": -8.393875122070312, "logps/rejected": -8.393875122070312, "loss": 0.8035, "rewards/accuracies": 0.0, "rewards/chosen": 0.8176586031913757, "rewards/margins": 0.0, "rewards/rejected": 0.8176586031913757, "step": 9268 }, { "epoch": 1.5, "learning_rate": 3.798808756961076e-07, "logits/chosen": -0.45618632435798645, "logits/rejected": -0.4410455822944641, "logps/chosen": -56.674957275390625, "logps/rejected": -130.925537109375, "loss": 2.2098, "rewards/accuracies": 0.0, "rewards/chosen": 2.076141357421875, "rewards/margins": -1.703033447265625, "rewards/rejected": 3.7791748046875, "step": 9269 }, { "epoch": 1.5, "learning_rate": 3.7975330338158904e-07, "logits/chosen": -0.6416309475898743, "logits/rejected": -0.6423226594924927, "logps/chosen": -7.927094459533691, "logps/rejected": -6.828409671783447, "loss": 0.9483, "rewards/accuracies": 0.0, "rewards/chosen": 0.02726583555340767, "rewards/margins": -0.3060082793235779, "rewards/rejected": 0.33327412605285645, "step": 9270 }, { "epoch": 1.5, "learning_rate": 3.7962573937497944e-07, "logits/chosen": -0.9414961934089661, "logits/rejected": -0.9175240993499756, "logps/chosen": -37.101905822753906, "logps/rejected": -62.21246337890625, "loss": 1.2264, "rewards/accuracies": 0.0, "rewards/chosen": 0.7270004153251648, "rewards/margins": -1.5930848121643066, "rewards/rejected": 2.320085287094116, "step": 9271 }, { "epoch": 1.5, "learning_rate": 3.794981836850922e-07, "logits/chosen": -0.8539915680885315, "logits/rejected": -0.8385422825813293, "logps/chosen": -45.331459045410156, "logps/rejected": -58.952728271484375, "loss": 0.4718, "rewards/accuracies": 0.0, "rewards/chosen": 1.9722877740859985, "rewards/margins": -0.4384254217147827, "rewards/rejected": 2.4107131958007812, "step": 9272 }, { "epoch": 1.51, "learning_rate": 3.793706363207403e-07, "logits/chosen": -0.8036388158798218, "logits/rejected": -0.7231810092926025, "logps/chosen": -126.645263671875, "logps/rejected": -55.31640625, "loss": 0.0974, "rewards/accuracies": 1.0, "rewards/chosen": 3.1346070766448975, "rewards/margins": 1.6999787092208862, "rewards/rejected": 1.4346283674240112, "step": 9273 }, { "epoch": 1.51, "learning_rate": 3.7924309729073613e-07, "logits/chosen": -0.9469101428985596, "logits/rejected": -0.9676110148429871, "logps/chosen": -215.0430908203125, "logps/rejected": -209.82461547851562, "loss": 0.5957, "rewards/accuracies": 0.0, "rewards/chosen": 4.792334079742432, "rewards/margins": -0.8249754905700684, "rewards/rejected": 5.6173095703125, "step": 9274 }, { "epoch": 1.51, "learning_rate": 3.791155666038912e-07, "logits/chosen": -0.7695642113685608, "logits/rejected": -0.7847139239311218, "logps/chosen": -95.1125259399414, "logps/rejected": -97.50520324707031, "loss": 0.7907, "rewards/accuracies": 1.0, "rewards/chosen": 3.7273507118225098, "rewards/margins": 0.42850565910339355, "rewards/rejected": 3.298845052719116, "step": 9275 }, { "epoch": 1.51, "learning_rate": 3.789880442690168e-07, "logits/chosen": -0.4779372215270996, "logits/rejected": -0.6681590676307678, "logps/chosen": -79.71114349365234, "logps/rejected": -161.55276489257812, "loss": 2.2035, "rewards/accuracies": 0.0, "rewards/chosen": 0.7534446716308594, "rewards/margins": -3.689185619354248, "rewards/rejected": 4.442630290985107, "step": 9276 }, { "epoch": 1.51, "learning_rate": 3.7886053029492354e-07, "logits/chosen": -0.9169570207595825, "logits/rejected": -0.6839357018470764, "logps/chosen": -160.48251342773438, "logps/rejected": -20.84755516052246, "loss": 0.2841, "rewards/accuracies": 1.0, "rewards/chosen": 4.893069744110107, "rewards/margins": 4.530467510223389, "rewards/rejected": 0.36260244250297546, "step": 9277 }, { "epoch": 1.51, "learning_rate": 3.7873302469042144e-07, "logits/chosen": -0.9086787700653076, "logits/rejected": -0.8581095337867737, "logps/chosen": -253.62615966796875, "logps/rejected": -62.23035430908203, "loss": 0.1511, "rewards/accuracies": 1.0, "rewards/chosen": 4.978146553039551, "rewards/margins": 1.057826280593872, "rewards/rejected": 3.9203202724456787, "step": 9278 }, { "epoch": 1.51, "learning_rate": 3.7860552746431985e-07, "logits/chosen": -0.3611179292201996, "logits/rejected": -0.32518264651298523, "logps/chosen": -98.16659545898438, "logps/rejected": -90.80427551269531, "loss": 0.7088, "rewards/accuracies": 0.0, "rewards/chosen": 1.012346625328064, "rewards/margins": -0.5898064374923706, "rewards/rejected": 1.6021530628204346, "step": 9279 }, { "epoch": 1.51, "learning_rate": 3.784780386254277e-07, "logits/chosen": -0.41887134313583374, "logits/rejected": -0.48041069507598877, "logps/chosen": -58.23234939575195, "logps/rejected": -45.13330078125, "loss": 1.2337, "rewards/accuracies": 0.0, "rewards/chosen": 1.0493954420089722, "rewards/margins": -1.7117518186569214, "rewards/rejected": 2.7611472606658936, "step": 9280 }, { "epoch": 1.51, "learning_rate": 3.783505581825532e-07, "logits/chosen": -0.8468227982521057, "logits/rejected": -0.836665689945221, "logps/chosen": -146.04345703125, "logps/rejected": -114.73185729980469, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 6.533995151519775, "rewards/margins": 1.4592771530151367, "rewards/rejected": 5.074717998504639, "step": 9281 }, { "epoch": 1.51, "learning_rate": 3.7822308614450406e-07, "logits/chosen": -0.6959300637245178, "logits/rejected": -0.6812627911567688, "logps/chosen": -84.14463806152344, "logps/rejected": -60.549896240234375, "loss": 1.0234, "rewards/accuracies": 0.0, "rewards/chosen": 0.900360107421875, "rewards/margins": -1.854743242263794, "rewards/rejected": 2.755103349685669, "step": 9282 }, { "epoch": 1.51, "learning_rate": 3.780956225200874e-07, "logits/chosen": -0.335891991853714, "logits/rejected": -0.284053236246109, "logps/chosen": -71.16157531738281, "logps/rejected": -33.03158187866211, "loss": 0.3358, "rewards/accuracies": 1.0, "rewards/chosen": 1.891412377357483, "rewards/margins": 0.5401828289031982, "rewards/rejected": 1.3512295484542847, "step": 9283 }, { "epoch": 1.51, "learning_rate": 3.779681673181098e-07, "logits/chosen": -0.9600235223770142, "logits/rejected": -0.8145130276679993, "logps/chosen": -79.6598129272461, "logps/rejected": -157.42178344726562, "loss": 1.9615, "rewards/accuracies": 0.0, "rewards/chosen": 1.3850120306015015, "rewards/margins": -3.55289363861084, "rewards/rejected": 4.937905788421631, "step": 9284 }, { "epoch": 1.51, "learning_rate": 3.7784072054737717e-07, "logits/chosen": -1.1653276681900024, "logits/rejected": -1.0935094356536865, "logps/chosen": -185.68386840820312, "logps/rejected": -130.34510803222656, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 4.268760681152344, "rewards/margins": 0.4364655017852783, "rewards/rejected": 3.8322951793670654, "step": 9285 }, { "epoch": 1.51, "learning_rate": 3.7771328221669486e-07, "logits/chosen": -0.8474697470664978, "logits/rejected": -0.8402629494667053, "logps/chosen": -70.15384674072266, "logps/rejected": -69.69757080078125, "loss": 0.7293, "rewards/accuracies": 0.0, "rewards/chosen": 2.362285614013672, "rewards/margins": -0.13963866233825684, "rewards/rejected": 2.5019242763519287, "step": 9286 }, { "epoch": 1.51, "learning_rate": 3.775858523348676e-07, "logits/chosen": -1.060347557067871, "logits/rejected": -1.002591848373413, "logps/chosen": -67.06986999511719, "logps/rejected": -217.15472412109375, "loss": 2.9917, "rewards/accuracies": 0.0, "rewards/chosen": 1.2766640186309814, "rewards/margins": -4.565199851989746, "rewards/rejected": 5.841864109039307, "step": 9287 }, { "epoch": 1.51, "learning_rate": 3.774584309106997e-07, "logits/chosen": -0.7890558838844299, "logits/rejected": -0.5456504821777344, "logps/chosen": -228.21707153320312, "logps/rejected": -78.14427947998047, "loss": 0.0646, "rewards/accuracies": 1.0, "rewards/chosen": 5.117315769195557, "rewards/margins": 2.1159980297088623, "rewards/rejected": 3.0013177394866943, "step": 9288 }, { "epoch": 1.51, "learning_rate": 3.773310179529947e-07, "logits/chosen": -0.6163687705993652, "logits/rejected": -0.6005703806877136, "logps/chosen": -100.43673706054688, "logps/rejected": -71.70240783691406, "loss": 2.1228, "rewards/accuracies": 0.0, "rewards/chosen": 0.04341430589556694, "rewards/margins": -0.9559341669082642, "rewards/rejected": 0.9993484616279602, "step": 9289 }, { "epoch": 1.51, "learning_rate": 3.772036134705557e-07, "logits/chosen": -1.194026231765747, "logits/rejected": -1.1643288135528564, "logps/chosen": -167.2061767578125, "logps/rejected": -79.04310607910156, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 4.875012397766113, "rewards/margins": 2.5753939151763916, "rewards/rejected": 2.2996184825897217, "step": 9290 }, { "epoch": 1.51, "learning_rate": 3.7707621747218506e-07, "logits/chosen": -0.6940405368804932, "logits/rejected": -0.6538631319999695, "logps/chosen": -53.40278625488281, "logps/rejected": -79.29673767089844, "loss": 1.0264, "rewards/accuracies": 0.0, "rewards/chosen": 1.3784462213516235, "rewards/margins": -0.28982317447662354, "rewards/rejected": 1.668269395828247, "step": 9291 }, { "epoch": 1.51, "learning_rate": 3.769488299666847e-07, "logits/chosen": -0.6201995015144348, "logits/rejected": -0.7044296860694885, "logps/chosen": -58.294578552246094, "logps/rejected": -99.673583984375, "loss": 1.6803, "rewards/accuracies": 0.0, "rewards/chosen": 2.2961976528167725, "rewards/margins": -1.7256591320037842, "rewards/rejected": 4.021856784820557, "step": 9292 }, { "epoch": 1.51, "learning_rate": 3.7682145096285584e-07, "logits/chosen": -0.5111376643180847, "logits/rejected": -0.5090407729148865, "logps/chosen": -161.43226623535156, "logps/rejected": -145.60137939453125, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 5.679878234863281, "rewards/margins": 4.570796012878418, "rewards/rejected": 1.1090821027755737, "step": 9293 }, { "epoch": 1.51, "learning_rate": 3.7669408046949914e-07, "logits/chosen": -1.0133568048477173, "logits/rejected": -0.9859750270843506, "logps/chosen": -154.44525146484375, "logps/rejected": -181.99147033691406, "loss": 1.5711, "rewards/accuracies": 0.0, "rewards/chosen": 5.417196750640869, "rewards/margins": -3.0738606452941895, "rewards/rejected": 8.491057395935059, "step": 9294 }, { "epoch": 1.51, "learning_rate": 3.765667184954148e-07, "logits/chosen": -0.2137204110622406, "logits/rejected": -0.2137204110622406, "logps/chosen": -68.03753662109375, "logps/rejected": -68.03753662109375, "loss": 0.3545, "rewards/accuracies": 0.0, "rewards/chosen": 0.5881943106651306, "rewards/margins": 0.0, "rewards/rejected": 0.5881943106651306, "step": 9295 }, { "epoch": 1.51, "learning_rate": 3.7643936504940227e-07, "logits/chosen": -0.7375574111938477, "logits/rejected": -0.7639464735984802, "logps/chosen": -61.01917266845703, "logps/rejected": -101.42304992675781, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": 1.136705756187439, "rewards/margins": 1.1296393871307373, "rewards/rejected": 0.0070663453079760075, "step": 9296 }, { "epoch": 1.51, "learning_rate": 3.7631202014026054e-07, "logits/chosen": -0.9227220416069031, "logits/rejected": -0.9256303310394287, "logps/chosen": -98.9301986694336, "logps/rejected": -58.42391586303711, "loss": 1.4151, "rewards/accuracies": 0.0, "rewards/chosen": 1.3270149230957031, "rewards/margins": -0.9257206916809082, "rewards/rejected": 2.2527356147766113, "step": 9297 }, { "epoch": 1.51, "learning_rate": 3.761846837767878e-07, "logits/chosen": -0.5917920470237732, "logits/rejected": -0.5745935440063477, "logps/chosen": -64.00856018066406, "logps/rejected": -84.91549682617188, "loss": 1.8845, "rewards/accuracies": 0.0, "rewards/chosen": 2.2217025756835938, "rewards/margins": -0.570960283279419, "rewards/rejected": 2.7926628589630127, "step": 9298 }, { "epoch": 1.51, "learning_rate": 3.7605735596778187e-07, "logits/chosen": -0.4234517216682434, "logits/rejected": -0.3223571479320526, "logps/chosen": -85.25132751464844, "logps/rejected": -64.33629608154297, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 4.887925624847412, "rewards/margins": 3.518944263458252, "rewards/rejected": 1.3689812421798706, "step": 9299 }, { "epoch": 1.51, "learning_rate": 3.759300367220399e-07, "logits/chosen": -0.9426396489143372, "logits/rejected": -1.0461087226867676, "logps/chosen": -54.180419921875, "logps/rejected": -119.5262222290039, "loss": 1.9816, "rewards/accuracies": 0.0, "rewards/chosen": 2.6737496852874756, "rewards/margins": -3.6678054332733154, "rewards/rejected": 6.341555118560791, "step": 9300 }, { "epoch": 1.51, "learning_rate": 3.7580272604835844e-07, "logits/chosen": -0.5325074195861816, "logits/rejected": -0.5993110537528992, "logps/chosen": -90.653564453125, "logps/rejected": -122.71255493164062, "loss": 0.4467, "rewards/accuracies": 0.0, "rewards/chosen": 0.3007919490337372, "rewards/margins": -0.05627897381782532, "rewards/rejected": 0.3570709228515625, "step": 9301 }, { "epoch": 1.51, "learning_rate": 3.7567542395553343e-07, "logits/chosen": -0.8586857914924622, "logits/rejected": -0.7424739599227905, "logps/chosen": -194.69488525390625, "logps/rejected": -246.21246337890625, "loss": 1.3907, "rewards/accuracies": 0.0, "rewards/chosen": 3.7469863891601562, "rewards/margins": -2.679490566253662, "rewards/rejected": 6.426476955413818, "step": 9302 }, { "epoch": 1.51, "learning_rate": 3.7554813045236027e-07, "logits/chosen": -0.20868414640426636, "logits/rejected": -0.16067089140415192, "logps/chosen": -21.283010482788086, "logps/rejected": -17.83431625366211, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 0.6956819891929626, "rewards/margins": 0.4658184349536896, "rewards/rejected": 0.22986355423927307, "step": 9303 }, { "epoch": 1.51, "learning_rate": 3.754208455476337e-07, "logits/chosen": -0.577451765537262, "logits/rejected": -0.577451765537262, "logps/chosen": -0.642286479473114, "logps/rejected": -0.642286479473114, "loss": 0.3873, "rewards/accuracies": 0.0, "rewards/chosen": 0.2222878783941269, "rewards/margins": 0.0, "rewards/rejected": 0.2222878783941269, "step": 9304 }, { "epoch": 1.51, "learning_rate": 3.75293569250148e-07, "logits/chosen": -0.8475014567375183, "logits/rejected": -0.9238470196723938, "logps/chosen": -78.79910278320312, "logps/rejected": -176.50527954101562, "loss": 2.6464, "rewards/accuracies": 0.0, "rewards/chosen": 2.034411668777466, "rewards/margins": -4.375551223754883, "rewards/rejected": 6.4099626541137695, "step": 9305 }, { "epoch": 1.51, "learning_rate": 3.7516630156869656e-07, "logits/chosen": -0.7408556938171387, "logits/rejected": -0.6964367032051086, "logps/chosen": -55.43308639526367, "logps/rejected": -64.92716217041016, "loss": 0.8774, "rewards/accuracies": 0.0, "rewards/chosen": 1.4142849445343018, "rewards/margins": -0.18475759029388428, "rewards/rejected": 1.599042534828186, "step": 9306 }, { "epoch": 1.51, "learning_rate": 3.7503904251207257e-07, "logits/chosen": -1.0567151308059692, "logits/rejected": -0.8781540989875793, "logps/chosen": -181.35562133789062, "logps/rejected": -179.12954711914062, "loss": 0.5471, "rewards/accuracies": 0.0, "rewards/chosen": 4.4872941970825195, "rewards/margins": -0.39853668212890625, "rewards/rejected": 4.885830879211426, "step": 9307 }, { "epoch": 1.51, "learning_rate": 3.749117920890683e-07, "logits/chosen": -0.8493016958236694, "logits/rejected": -0.7606171369552612, "logps/chosen": -131.10958862304688, "logps/rejected": -107.94792175292969, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 6.005154609680176, "rewards/margins": 1.0166549682617188, "rewards/rejected": 4.988499641418457, "step": 9308 }, { "epoch": 1.51, "learning_rate": 3.7478455030847565e-07, "logits/chosen": -0.6185157895088196, "logits/rejected": -0.6534430384635925, "logps/chosen": -69.12590789794922, "logps/rejected": -115.8266830444336, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 1.2266937494277954, "rewards/margins": 0.08087003231048584, "rewards/rejected": 1.1458237171173096, "step": 9309 }, { "epoch": 1.51, "learning_rate": 3.746573171790858e-07, "logits/chosen": -0.6235032677650452, "logits/rejected": -0.5240463018417358, "logps/chosen": -78.79056549072266, "logps/rejected": -63.317138671875, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": 1.0306007862091064, "rewards/margins": 0.10241323709487915, "rewards/rejected": 0.9281875491142273, "step": 9310 }, { "epoch": 1.51, "learning_rate": 3.745300927096893e-07, "logits/chosen": -0.6834889054298401, "logits/rejected": -0.6683202385902405, "logps/chosen": -68.1911392211914, "logps/rejected": -99.5748062133789, "loss": 0.569, "rewards/accuracies": 0.0, "rewards/chosen": 0.48427507281303406, "rewards/margins": -0.16193237900733948, "rewards/rejected": 0.6462074518203735, "step": 9311 }, { "epoch": 1.51, "learning_rate": 3.744028769090761e-07, "logits/chosen": -1.049423336982727, "logits/rejected": -1.0130287408828735, "logps/chosen": -80.99939727783203, "logps/rejected": -41.423580169677734, "loss": 0.1931, "rewards/accuracies": 1.0, "rewards/chosen": 1.0540474653244019, "rewards/margins": 0.942564070224762, "rewards/rejected": 0.1114833876490593, "step": 9312 }, { "epoch": 1.51, "learning_rate": 3.742756697860359e-07, "logits/chosen": -0.9944909811019897, "logits/rejected": -0.9282260537147522, "logps/chosen": -70.93649291992188, "logps/rejected": -34.44959259033203, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 2.571917772293091, "rewards/margins": 2.3541476726531982, "rewards/rejected": 0.21777001023292542, "step": 9313 }, { "epoch": 1.51, "learning_rate": 3.741484713493571e-07, "logits/chosen": -0.678169310092926, "logits/rejected": -0.631546139717102, "logps/chosen": -85.17866516113281, "logps/rejected": -58.01622009277344, "loss": 0.8606, "rewards/accuracies": 0.0, "rewards/chosen": 0.8751266598701477, "rewards/margins": -1.2755486965179443, "rewards/rejected": 2.1506752967834473, "step": 9314 }, { "epoch": 1.51, "learning_rate": 3.740212816078282e-07, "logits/chosen": -0.4742186367511749, "logits/rejected": -0.4571021497249603, "logps/chosen": -80.62081909179688, "logps/rejected": -31.3348388671875, "loss": 0.5074, "rewards/accuracies": 1.0, "rewards/chosen": 1.9625732898712158, "rewards/margins": 0.9692986011505127, "rewards/rejected": 0.9932746887207031, "step": 9315 }, { "epoch": 1.51, "learning_rate": 3.738941005702367e-07, "logits/chosen": -1.0451598167419434, "logits/rejected": -0.9769750833511353, "logps/chosen": -127.61663818359375, "logps/rejected": -162.72518920898438, "loss": 1.7551, "rewards/accuracies": 0.0, "rewards/chosen": 3.815861463546753, "rewards/margins": -3.294761896133423, "rewards/rejected": 7.110623359680176, "step": 9316 }, { "epoch": 1.51, "learning_rate": 3.737669282453696e-07, "logits/chosen": -0.48016440868377686, "logits/rejected": -0.48016440868377686, "logps/chosen": -0.47603708505630493, "logps/rejected": -0.47603708505630493, "loss": 2.816, "rewards/accuracies": 0.0, "rewards/chosen": 0.22681331634521484, "rewards/margins": 0.0, "rewards/rejected": 0.22681331634521484, "step": 9317 }, { "epoch": 1.51, "learning_rate": 3.7363976464201343e-07, "logits/chosen": -0.7035925984382629, "logits/rejected": -0.6241157650947571, "logps/chosen": -149.4945526123047, "logps/rejected": -78.12886047363281, "loss": 0.4979, "rewards/accuracies": 0.0, "rewards/chosen": 0.17113342881202698, "rewards/margins": -0.37629011273384094, "rewards/rejected": 0.5474235415458679, "step": 9318 }, { "epoch": 1.51, "learning_rate": 3.7351260976895377e-07, "logits/chosen": -0.895056962966919, "logits/rejected": -0.7846724390983582, "logps/chosen": -194.3838348388672, "logps/rejected": -123.14320373535156, "loss": 0.9458, "rewards/accuracies": 0.0, "rewards/chosen": 1.0300766229629517, "rewards/margins": -1.0012344121932983, "rewards/rejected": 2.03131103515625, "step": 9319 }, { "epoch": 1.51, "learning_rate": 3.73385463634976e-07, "logits/chosen": -0.667400062084198, "logits/rejected": -0.6352319121360779, "logps/chosen": -40.69319534301758, "logps/rejected": -64.72640991210938, "loss": 0.1386, "rewards/accuracies": 1.0, "rewards/chosen": 2.3825900554656982, "rewards/margins": 1.2695269584655762, "rewards/rejected": 1.113063097000122, "step": 9320 }, { "epoch": 1.51, "learning_rate": 3.7325832624886464e-07, "logits/chosen": -0.9371627569198608, "logits/rejected": -0.9326887130737305, "logps/chosen": -65.3373794555664, "logps/rejected": -19.270092010498047, "loss": 0.5619, "rewards/accuracies": 0.0, "rewards/chosen": 0.06358108669519424, "rewards/margins": -0.702117919921875, "rewards/rejected": 0.765699028968811, "step": 9321 }, { "epoch": 1.51, "learning_rate": 3.731311976194037e-07, "logits/chosen": -0.8612033724784851, "logits/rejected": -0.7234482169151306, "logps/chosen": -141.19430541992188, "logps/rejected": -105.48605346679688, "loss": 0.0462, "rewards/accuracies": 1.0, "rewards/chosen": 3.9878907203674316, "rewards/margins": 2.384478807449341, "rewards/rejected": 1.6034119129180908, "step": 9322 }, { "epoch": 1.51, "learning_rate": 3.7300407775537656e-07, "logits/chosen": -0.34327709674835205, "logits/rejected": -0.3551039397716522, "logps/chosen": -39.56310272216797, "logps/rejected": -26.88528823852539, "loss": 0.5603, "rewards/accuracies": 0.0, "rewards/chosen": 0.6371769309043884, "rewards/margins": -0.47284120321273804, "rewards/rejected": 1.1100181341171265, "step": 9323 }, { "epoch": 1.51, "learning_rate": 3.7287696666556603e-07, "logits/chosen": -0.23920851945877075, "logits/rejected": -0.24204912781715393, "logps/chosen": -2.496340751647949, "logps/rejected": -9.534687042236328, "loss": 1.4148, "rewards/accuracies": 1.0, "rewards/chosen": 0.18785062432289124, "rewards/margins": 0.06335730850696564, "rewards/rejected": 0.1244933158159256, "step": 9324 }, { "epoch": 1.51, "learning_rate": 3.727498643587542e-07, "logits/chosen": -0.44122180342674255, "logits/rejected": -0.44122180342674255, "logps/chosen": -46.93377685546875, "logps/rejected": -46.93377685546875, "loss": 0.87, "rewards/accuracies": 0.0, "rewards/chosen": 0.504925549030304, "rewards/margins": 0.0, "rewards/rejected": 0.504925549030304, "step": 9325 }, { "epoch": 1.51, "learning_rate": 3.7262277084372274e-07, "logits/chosen": -0.8235068321228027, "logits/rejected": -0.7367630004882812, "logps/chosen": -155.53701782226562, "logps/rejected": -163.53472900390625, "loss": 0.2644, "rewards/accuracies": 1.0, "rewards/chosen": 4.229779243469238, "rewards/margins": 0.36391305923461914, "rewards/rejected": 3.865866184234619, "step": 9326 }, { "epoch": 1.51, "learning_rate": 3.724956861292525e-07, "logits/chosen": -0.7035083174705505, "logits/rejected": -0.6743241548538208, "logps/chosen": -67.64006042480469, "logps/rejected": -47.90545654296875, "loss": 0.4732, "rewards/accuracies": 0.0, "rewards/chosen": 0.565411388874054, "rewards/margins": -0.443364679813385, "rewards/rejected": 1.008776068687439, "step": 9327 }, { "epoch": 1.51, "learning_rate": 3.723686102241239e-07, "logits/chosen": -0.6917827725410461, "logits/rejected": -0.6892970204353333, "logps/chosen": -115.99270629882812, "logps/rejected": -89.38905334472656, "loss": 1.4944, "rewards/accuracies": 0.0, "rewards/chosen": 0.9131134152412415, "rewards/margins": -0.001284778118133545, "rewards/rejected": 0.914398193359375, "step": 9328 }, { "epoch": 1.51, "learning_rate": 3.7224154313711673e-07, "logits/chosen": -0.2621462941169739, "logits/rejected": -0.2621462941169739, "logps/chosen": -84.53229522705078, "logps/rejected": -84.53229522705078, "loss": 1.4565, "rewards/accuracies": 0.0, "rewards/chosen": 2.1683144569396973, "rewards/margins": 0.0, "rewards/rejected": 2.1683144569396973, "step": 9329 }, { "epoch": 1.51, "learning_rate": 3.7211448487700995e-07, "logits/chosen": -0.6842371225357056, "logits/rejected": -0.7359424233436584, "logps/chosen": -218.10443115234375, "logps/rejected": -68.3296127319336, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": 4.751946926116943, "rewards/margins": 3.2975943088531494, "rewards/rejected": 1.454352617263794, "step": 9330 }, { "epoch": 1.51, "learning_rate": 3.7198743545258225e-07, "logits/chosen": -0.6731358766555786, "logits/rejected": -0.6626133322715759, "logps/chosen": -64.35455322265625, "logps/rejected": -78.4189453125, "loss": 0.8414, "rewards/accuracies": 0.0, "rewards/chosen": 2.6372344493865967, "rewards/margins": -1.4044053554534912, "rewards/rejected": 4.041639804840088, "step": 9331 }, { "epoch": 1.51, "learning_rate": 3.7186039487261154e-07, "logits/chosen": -0.5360919237136841, "logits/rejected": -0.5844708681106567, "logps/chosen": -50.46391296386719, "logps/rejected": -67.90542602539062, "loss": 0.8865, "rewards/accuracies": 0.0, "rewards/chosen": 2.2958977222442627, "rewards/margins": -0.6650002002716064, "rewards/rejected": 2.960897922515869, "step": 9332 }, { "epoch": 1.51, "learning_rate": 3.717333631458751e-07, "logits/chosen": -0.37127959728240967, "logits/rejected": -0.3900376260280609, "logps/chosen": -79.27204895019531, "logps/rejected": -76.42945861816406, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 2.1858177185058594, "rewards/margins": 0.09986042976379395, "rewards/rejected": 2.0859572887420654, "step": 9333 }, { "epoch": 1.52, "learning_rate": 3.7160634028114955e-07, "logits/chosen": -0.815858781337738, "logits/rejected": -0.7146664261817932, "logps/chosen": -93.64048767089844, "logps/rejected": -67.14352416992188, "loss": 0.4303, "rewards/accuracies": 1.0, "rewards/chosen": 5.054362773895264, "rewards/margins": 2.2938647270202637, "rewards/rejected": 2.760498046875, "step": 9334 }, { "epoch": 1.52, "learning_rate": 3.7147932628721106e-07, "logits/chosen": -0.523581326007843, "logits/rejected": -0.5184478759765625, "logps/chosen": -63.86643981933594, "logps/rejected": -34.41944122314453, "loss": 1.7517, "rewards/accuracies": 0.0, "rewards/chosen": 1.1540825366973877, "rewards/margins": -0.8992302417755127, "rewards/rejected": 2.0533127784729004, "step": 9335 }, { "epoch": 1.52, "learning_rate": 3.7135232117283506e-07, "logits/chosen": -0.5304237604141235, "logits/rejected": -0.5231208801269531, "logps/chosen": -94.29934692382812, "logps/rejected": -64.8615951538086, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": 1.113073706626892, "rewards/margins": 0.00241696834564209, "rewards/rejected": 1.11065673828125, "step": 9336 }, { "epoch": 1.52, "learning_rate": 3.712253249467964e-07, "logits/chosen": -0.4596255421638489, "logits/rejected": -0.4596278667449951, "logps/chosen": -85.45701599121094, "logps/rejected": -45.4471321105957, "loss": 0.818, "rewards/accuracies": 0.0, "rewards/chosen": 0.619921863079071, "rewards/margins": -1.1947896480560303, "rewards/rejected": 1.8147114515304565, "step": 9337 }, { "epoch": 1.52, "learning_rate": 3.710983376178692e-07, "logits/chosen": -0.7110756635665894, "logits/rejected": -0.6729215383529663, "logps/chosen": -115.40544891357422, "logps/rejected": -91.59864807128906, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": 4.062179088592529, "rewards/margins": 1.6526720523834229, "rewards/rejected": 2.4095070362091064, "step": 9338 }, { "epoch": 1.52, "learning_rate": 3.7097135919482726e-07, "logits/chosen": -1.2283210754394531, "logits/rejected": -1.2482435703277588, "logps/chosen": -91.12692260742188, "logps/rejected": -128.28701782226562, "loss": 1.5748, "rewards/accuracies": 0.0, "rewards/chosen": 1.5862747430801392, "rewards/margins": -2.6562790870666504, "rewards/rejected": 4.2425537109375, "step": 9339 }, { "epoch": 1.52, "learning_rate": 3.708443896864435e-07, "logits/chosen": -0.7044384479522705, "logits/rejected": -0.5247344970703125, "logps/chosen": -105.42317199707031, "logps/rejected": -74.52792358398438, "loss": 0.8156, "rewards/accuracies": 1.0, "rewards/chosen": 4.214725017547607, "rewards/margins": 0.4176661968231201, "rewards/rejected": 3.7970588207244873, "step": 9340 }, { "epoch": 1.52, "learning_rate": 3.7071742910149034e-07, "logits/chosen": -0.524875819683075, "logits/rejected": -0.3638777434825897, "logps/chosen": -52.41610336303711, "logps/rejected": -52.309505462646484, "loss": 0.4761, "rewards/accuracies": 1.0, "rewards/chosen": 1.6182087659835815, "rewards/margins": 0.08863306045532227, "rewards/rejected": 1.5295757055282593, "step": 9341 }, { "epoch": 1.52, "learning_rate": 3.7059047744873955e-07, "logits/chosen": -0.7512341737747192, "logits/rejected": -0.6936742663383484, "logps/chosen": -59.04553985595703, "logps/rejected": -65.24368286132812, "loss": 0.2661, "rewards/accuracies": 1.0, "rewards/chosen": 2.367429494857788, "rewards/margins": 0.7089723348617554, "rewards/rejected": 1.6584571599960327, "step": 9342 }, { "epoch": 1.52, "learning_rate": 3.704635347369623e-07, "logits/chosen": -0.3866432309150696, "logits/rejected": -0.7229389548301697, "logps/chosen": -47.36167907714844, "logps/rejected": -40.827796936035156, "loss": 0.4213, "rewards/accuracies": 1.0, "rewards/chosen": 1.9684219360351562, "rewards/margins": 1.1428985595703125, "rewards/rejected": 0.8255233764648438, "step": 9343 }, { "epoch": 1.52, "learning_rate": 3.703366009749291e-07, "logits/chosen": -0.7729613780975342, "logits/rejected": -0.6529037356376648, "logps/chosen": -111.55413818359375, "logps/rejected": -132.2061309814453, "loss": 2.5432, "rewards/accuracies": 0.0, "rewards/chosen": 4.755377292633057, "rewards/margins": -3.0286455154418945, "rewards/rejected": 7.784022808074951, "step": 9344 }, { "epoch": 1.52, "learning_rate": 3.7020967617140984e-07, "logits/chosen": -1.0946729183197021, "logits/rejected": -1.0488768815994263, "logps/chosen": -89.16954040527344, "logps/rejected": -117.20510864257812, "loss": 1.1667, "rewards/accuracies": 0.0, "rewards/chosen": 1.4091980457305908, "rewards/margins": -1.446171522140503, "rewards/rejected": 2.8553695678710938, "step": 9345 }, { "epoch": 1.52, "learning_rate": 3.7008276033517395e-07, "logits/chosen": -0.5976284742355347, "logits/rejected": -0.4336237609386444, "logps/chosen": -123.8851547241211, "logps/rejected": -88.25473022460938, "loss": 0.1111, "rewards/accuracies": 1.0, "rewards/chosen": 6.918598175048828, "rewards/margins": 3.718402862548828, "rewards/rejected": 3.2001953125, "step": 9346 }, { "epoch": 1.52, "learning_rate": 3.6995585347499003e-07, "logits/chosen": -0.013959977775812149, "logits/rejected": -0.033187247812747955, "logps/chosen": -3.238309144973755, "logps/rejected": -30.192729949951172, "loss": 0.7893, "rewards/accuracies": 1.0, "rewards/chosen": 0.38160350918769836, "rewards/margins": 0.04218471050262451, "rewards/rejected": 0.33941879868507385, "step": 9347 }, { "epoch": 1.52, "learning_rate": 3.698289555996261e-07, "logits/chosen": -0.7165191769599915, "logits/rejected": -0.6792961955070496, "logps/chosen": -81.39541625976562, "logps/rejected": -60.67587661743164, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 4.326634883880615, "rewards/margins": 1.9704158306121826, "rewards/rejected": 2.3562190532684326, "step": 9348 }, { "epoch": 1.52, "learning_rate": 3.6970206671784963e-07, "logits/chosen": -0.5212846398353577, "logits/rejected": -0.5004951357841492, "logps/chosen": -62.09161376953125, "logps/rejected": -98.1303482055664, "loss": 0.6148, "rewards/accuracies": 0.0, "rewards/chosen": 0.48755112290382385, "rewards/margins": -0.6601883172988892, "rewards/rejected": 1.1477394104003906, "step": 9349 }, { "epoch": 1.52, "learning_rate": 3.695751868384275e-07, "logits/chosen": -0.7961288094520569, "logits/rejected": -0.7838778495788574, "logps/chosen": -86.48342895507812, "logps/rejected": -125.8297348022461, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": 1.3899163007736206, "rewards/margins": 0.6747971177101135, "rewards/rejected": 0.7151191830635071, "step": 9350 }, { "epoch": 1.52, "learning_rate": 3.694483159701259e-07, "logits/chosen": -0.20839376747608185, "logits/rejected": -0.20839376747608185, "logps/chosen": -1.0849741697311401, "logps/rejected": -1.0849741697311401, "loss": 0.3699, "rewards/accuracies": 0.0, "rewards/chosen": 0.2184312790632248, "rewards/margins": 0.0, "rewards/rejected": 0.2184312790632248, "step": 9351 }, { "epoch": 1.52, "learning_rate": 3.6932145412171033e-07, "logits/chosen": -0.45542508363723755, "logits/rejected": -0.4808739423751831, "logps/chosen": -33.1320686340332, "logps/rejected": -80.34619140625, "loss": 0.7711, "rewards/accuracies": 0.0, "rewards/chosen": 0.394308865070343, "rewards/margins": -0.2892879247665405, "rewards/rejected": 0.6835967898368835, "step": 9352 }, { "epoch": 1.52, "learning_rate": 3.691946013019458e-07, "logits/chosen": -0.7574009299278259, "logits/rejected": -0.7592134475708008, "logps/chosen": -142.79519653320312, "logps/rejected": -186.25094604492188, "loss": 2.6458, "rewards/accuracies": 0.0, "rewards/chosen": 2.979013204574585, "rewards/margins": -5.275097846984863, "rewards/rejected": 8.254111289978027, "step": 9353 }, { "epoch": 1.52, "learning_rate": 3.690677575195966e-07, "logits/chosen": -0.7955902218818665, "logits/rejected": -0.7938514351844788, "logps/chosen": -79.48417663574219, "logps/rejected": -136.6099090576172, "loss": 0.5881, "rewards/accuracies": 0.0, "rewards/chosen": 1.3886489868164062, "rewards/margins": -0.41439056396484375, "rewards/rejected": 1.80303955078125, "step": 9354 }, { "epoch": 1.52, "learning_rate": 3.6894092278342647e-07, "logits/chosen": -0.8631914258003235, "logits/rejected": -0.7545461654663086, "logps/chosen": -122.50833129882812, "logps/rejected": -62.74148178100586, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": 5.07986307144165, "rewards/margins": 2.425053119659424, "rewards/rejected": 2.6548099517822266, "step": 9355 }, { "epoch": 1.52, "learning_rate": 3.6881409710219846e-07, "logits/chosen": -0.6766822934150696, "logits/rejected": -0.6613647937774658, "logps/chosen": -56.30976104736328, "logps/rejected": -84.18905639648438, "loss": 0.5872, "rewards/accuracies": 1.0, "rewards/chosen": 2.6467933654785156, "rewards/margins": 1.2929191589355469, "rewards/rejected": 1.3538742065429688, "step": 9356 }, { "epoch": 1.52, "learning_rate": 3.6868728048467515e-07, "logits/chosen": -0.4793466329574585, "logits/rejected": -0.36912664771080017, "logps/chosen": -87.47366333007812, "logps/rejected": -47.076927185058594, "loss": 0.5754, "rewards/accuracies": 1.0, "rewards/chosen": 1.1476410627365112, "rewards/margins": 0.16977089643478394, "rewards/rejected": 0.9778701663017273, "step": 9357 }, { "epoch": 1.52, "learning_rate": 3.685604729396182e-07, "logits/chosen": -0.5667369961738586, "logits/rejected": -0.5575421452522278, "logps/chosen": -18.717248916625977, "logps/rejected": -3.6492257118225098, "loss": 0.5225, "rewards/accuracies": 1.0, "rewards/chosen": 0.6262826919555664, "rewards/margins": 0.07510942220687866, "rewards/rejected": 0.5511732697486877, "step": 9358 }, { "epoch": 1.52, "learning_rate": 3.684336744757889e-07, "logits/chosen": -0.6315008401870728, "logits/rejected": -0.5057801008224487, "logps/chosen": -81.81053161621094, "logps/rejected": -45.17884063720703, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 2.113481283187866, "rewards/margins": 0.26322031021118164, "rewards/rejected": 1.8502609729766846, "step": 9359 }, { "epoch": 1.52, "learning_rate": 3.6830688510194785e-07, "logits/chosen": -1.0113017559051514, "logits/rejected": -0.8511002063751221, "logps/chosen": -162.25035095214844, "logps/rejected": -40.3194694519043, "loss": 0.1925, "rewards/accuracies": 1.0, "rewards/chosen": 5.913737773895264, "rewards/margins": 4.649367332458496, "rewards/rejected": 1.264370322227478, "step": 9360 }, { "epoch": 1.52, "learning_rate": 3.6818010482685483e-07, "logits/chosen": -0.861135721206665, "logits/rejected": -0.8446504473686218, "logps/chosen": -203.52145385742188, "logps/rejected": -187.4199981689453, "loss": 0.3958, "rewards/accuracies": 0.0, "rewards/chosen": 6.723752021789551, "rewards/margins": -0.15156078338623047, "rewards/rejected": 6.875312805175781, "step": 9361 }, { "epoch": 1.52, "learning_rate": 3.680533336592694e-07, "logits/chosen": -0.5653039216995239, "logits/rejected": -0.5574060082435608, "logps/chosen": -46.415225982666016, "logps/rejected": -83.28449249267578, "loss": 0.4825, "rewards/accuracies": 1.0, "rewards/chosen": 2.7018895149230957, "rewards/margins": 1.1773258447647095, "rewards/rejected": 1.5245636701583862, "step": 9362 }, { "epoch": 1.52, "learning_rate": 3.6792657160795005e-07, "logits/chosen": -0.8161588907241821, "logits/rejected": -0.7308324575424194, "logps/chosen": -117.55892944335938, "logps/rejected": -89.27264404296875, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 5.257382392883301, "rewards/margins": 2.8431947231292725, "rewards/rejected": 2.4141876697540283, "step": 9363 }, { "epoch": 1.52, "learning_rate": 3.6779981868165487e-07, "logits/chosen": -0.3415967524051666, "logits/rejected": -0.34144842624664307, "logps/chosen": -92.90573120117188, "logps/rejected": -127.45443725585938, "loss": 0.4031, "rewards/accuracies": 1.0, "rewards/chosen": 1.2200698852539062, "rewards/margins": 1.283207654953003, "rewards/rejected": -0.06313782185316086, "step": 9364 }, { "epoch": 1.52, "learning_rate": 3.6767307488914133e-07, "logits/chosen": -0.6742115616798401, "logits/rejected": -0.5998812317848206, "logps/chosen": -106.55778503417969, "logps/rejected": -62.38672637939453, "loss": 0.5458, "rewards/accuracies": 0.0, "rewards/chosen": 1.9604393243789673, "rewards/margins": -0.5730553865432739, "rewards/rejected": 2.533494710922241, "step": 9365 }, { "epoch": 1.52, "learning_rate": 3.675463402391663e-07, "logits/chosen": -0.5268975496292114, "logits/rejected": -0.5537373423576355, "logps/chosen": -109.5130615234375, "logps/rejected": -114.7331771850586, "loss": 0.3099, "rewards/accuracies": 1.0, "rewards/chosen": 0.8526397943496704, "rewards/margins": 0.36905062198638916, "rewards/rejected": 0.48358917236328125, "step": 9366 }, { "epoch": 1.52, "learning_rate": 3.674196147404858e-07, "logits/chosen": -0.3050389587879181, "logits/rejected": -0.29923924803733826, "logps/chosen": -84.67017364501953, "logps/rejected": -41.22642135620117, "loss": 0.7256, "rewards/accuracies": 0.0, "rewards/chosen": 0.8481483459472656, "rewards/margins": -1.1385173797607422, "rewards/rejected": 1.9866657257080078, "step": 9367 }, { "epoch": 1.52, "learning_rate": 3.672928984018555e-07, "logits/chosen": -0.5993526577949524, "logits/rejected": -0.48975908756256104, "logps/chosen": -83.17997741699219, "logps/rejected": -28.578161239624023, "loss": 0.1463, "rewards/accuracies": 1.0, "rewards/chosen": 2.5974578857421875, "rewards/margins": 2.0765302181243896, "rewards/rejected": 0.5209276080131531, "step": 9368 }, { "epoch": 1.52, "learning_rate": 3.671661912320302e-07, "logits/chosen": -0.7193921208381653, "logits/rejected": -0.5457112789154053, "logps/chosen": -145.71983337402344, "logps/rejected": -34.077796936035156, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 1.2597945928573608, "rewards/margins": 1.0754814147949219, "rewards/rejected": 0.18431320786476135, "step": 9369 }, { "epoch": 1.52, "learning_rate": 3.6703949323976416e-07, "logits/chosen": -0.6110945343971252, "logits/rejected": -0.5757772922515869, "logps/chosen": -75.60871887207031, "logps/rejected": -48.219871520996094, "loss": 0.8989, "rewards/accuracies": 0.0, "rewards/chosen": 2.1543776988983154, "rewards/margins": -0.7020561695098877, "rewards/rejected": 2.856433868408203, "step": 9370 }, { "epoch": 1.52, "learning_rate": 3.669128044338111e-07, "logits/chosen": -0.6991646885871887, "logits/rejected": -0.7026674151420593, "logps/chosen": -66.3481674194336, "logps/rejected": -57.79373550415039, "loss": 0.5282, "rewards/accuracies": 0.0, "rewards/chosen": 0.6220085024833679, "rewards/margins": -0.5506741404533386, "rewards/rejected": 1.1726826429367065, "step": 9371 }, { "epoch": 1.52, "learning_rate": 3.66786124822924e-07, "logits/chosen": -0.7384218573570251, "logits/rejected": -0.8195273876190186, "logps/chosen": -66.57823944091797, "logps/rejected": -133.5537872314453, "loss": 0.7644, "rewards/accuracies": 0.0, "rewards/chosen": 2.3444840908050537, "rewards/margins": -1.0726637840270996, "rewards/rejected": 3.4171478748321533, "step": 9372 }, { "epoch": 1.52, "learning_rate": 3.6665945441585517e-07, "logits/chosen": -0.42531710863113403, "logits/rejected": -0.41216760873794556, "logps/chosen": -44.16746139526367, "logps/rejected": -92.92362976074219, "loss": 0.978, "rewards/accuracies": 0.0, "rewards/chosen": 0.9939869046211243, "rewards/margins": -1.3381054401397705, "rewards/rejected": 2.33209228515625, "step": 9373 }, { "epoch": 1.52, "learning_rate": 3.6653279322135635e-07, "logits/chosen": -0.9318108558654785, "logits/rejected": -0.9071359634399414, "logps/chosen": -165.18800354003906, "logps/rejected": -80.64451599121094, "loss": 0.503, "rewards/accuracies": 1.0, "rewards/chosen": 6.438439846038818, "rewards/margins": 3.657679557800293, "rewards/rejected": 2.7807602882385254, "step": 9374 }, { "epoch": 1.52, "learning_rate": 3.664061412481786e-07, "logits/chosen": -0.5894399881362915, "logits/rejected": -0.28678151965141296, "logps/chosen": -91.84688568115234, "logps/rejected": -51.741825103759766, "loss": 1.2493, "rewards/accuracies": 0.0, "rewards/chosen": 0.6667999625205994, "rewards/margins": -1.1510264873504639, "rewards/rejected": 1.817826509475708, "step": 9375 }, { "epoch": 1.52, "learning_rate": 3.6627949850507246e-07, "logits/chosen": -0.8805267810821533, "logits/rejected": -0.7478978633880615, "logps/chosen": -76.49125671386719, "logps/rejected": -35.195533752441406, "loss": 0.2215, "rewards/accuracies": 1.0, "rewards/chosen": 3.4716477394104004, "rewards/margins": 1.2623136043548584, "rewards/rejected": 2.209334135055542, "step": 9376 }, { "epoch": 1.52, "learning_rate": 3.661528650007877e-07, "logits/chosen": -0.7487775683403015, "logits/rejected": -0.8284046649932861, "logps/chosen": -107.06900024414062, "logps/rejected": -171.5185546875, "loss": 2.5848, "rewards/accuracies": 0.0, "rewards/chosen": 1.9586273431777954, "rewards/margins": -4.330535888671875, "rewards/rejected": 6.289163112640381, "step": 9377 }, { "epoch": 1.52, "learning_rate": 3.660262407440735e-07, "logits/chosen": -0.935001790523529, "logits/rejected": -1.0028672218322754, "logps/chosen": -91.47972106933594, "logps/rejected": -76.84586334228516, "loss": 0.9469, "rewards/accuracies": 0.0, "rewards/chosen": 1.6086090803146362, "rewards/margins": -0.5076972246170044, "rewards/rejected": 2.1163063049316406, "step": 9378 }, { "epoch": 1.52, "learning_rate": 3.658996257436784e-07, "logits/chosen": -0.5300091505050659, "logits/rejected": -0.5337395071983337, "logps/chosen": -7.386914253234863, "logps/rejected": -2.4341235160827637, "loss": 0.7183, "rewards/accuracies": 0.0, "rewards/chosen": 0.026716042309999466, "rewards/margins": -0.09825870394706726, "rewards/rejected": 0.12497474998235703, "step": 9379 }, { "epoch": 1.52, "learning_rate": 3.6577302000835023e-07, "logits/chosen": -0.4973522126674652, "logits/rejected": -0.4903339743614197, "logps/chosen": -64.09327697753906, "logps/rejected": -57.58643341064453, "loss": 0.6601, "rewards/accuracies": 0.0, "rewards/chosen": 1.1031837463378906, "rewards/margins": -0.9256653785705566, "rewards/rejected": 2.0288491249084473, "step": 9380 }, { "epoch": 1.52, "learning_rate": 3.656464235468364e-07, "logits/chosen": -0.7127599120140076, "logits/rejected": -0.8497263193130493, "logps/chosen": -75.14826965332031, "logps/rejected": -125.26885986328125, "loss": 1.0627, "rewards/accuracies": 0.0, "rewards/chosen": 2.561666965484619, "rewards/margins": -1.7076539993286133, "rewards/rejected": 4.269320964813232, "step": 9381 }, { "epoch": 1.52, "learning_rate": 3.6551983636788334e-07, "logits/chosen": -0.4687407910823822, "logits/rejected": -0.3975217342376709, "logps/chosen": -84.48333740234375, "logps/rejected": -78.59918975830078, "loss": 0.2939, "rewards/accuracies": 1.0, "rewards/chosen": 1.3721030950546265, "rewards/margins": 0.23469388484954834, "rewards/rejected": 1.1374092102050781, "step": 9382 }, { "epoch": 1.52, "learning_rate": 3.6539325848023717e-07, "logits/chosen": -0.7615095376968384, "logits/rejected": -0.7843623161315918, "logps/chosen": -86.66157531738281, "logps/rejected": -80.90975189208984, "loss": 0.6656, "rewards/accuracies": 0.0, "rewards/chosen": 1.7715507745742798, "rewards/margins": -0.21969068050384521, "rewards/rejected": 1.991241455078125, "step": 9383 }, { "epoch": 1.52, "learning_rate": 3.652666898926432e-07, "logits/chosen": -0.669243335723877, "logits/rejected": -0.742001473903656, "logps/chosen": -75.01673126220703, "logps/rejected": -102.065673828125, "loss": 1.1485, "rewards/accuracies": 0.0, "rewards/chosen": 2.1932594776153564, "rewards/margins": -2.073679208755493, "rewards/rejected": 4.26693868637085, "step": 9384 }, { "epoch": 1.52, "learning_rate": 3.651401306138461e-07, "logits/chosen": -0.6801169514656067, "logits/rejected": -0.7925721406936646, "logps/chosen": -91.74758911132812, "logps/rejected": -140.7262420654297, "loss": 0.7638, "rewards/accuracies": 0.0, "rewards/chosen": 1.6716583967208862, "rewards/margins": -0.2651686668395996, "rewards/rejected": 1.9368270635604858, "step": 9385 }, { "epoch": 1.52, "learning_rate": 3.6501358065258973e-07, "logits/chosen": -0.8760507106781006, "logits/rejected": -0.8651865124702454, "logps/chosen": -103.42063903808594, "logps/rejected": -222.97952270507812, "loss": 2.663, "rewards/accuracies": 0.0, "rewards/chosen": 1.029211401939392, "rewards/margins": -5.133847236633301, "rewards/rejected": 6.163058757781982, "step": 9386 }, { "epoch": 1.52, "learning_rate": 3.648870400176179e-07, "logits/chosen": -0.8049139976501465, "logits/rejected": -0.7909403443336487, "logps/chosen": -86.08345031738281, "logps/rejected": -35.69342803955078, "loss": 2.0639, "rewards/accuracies": 0.0, "rewards/chosen": 0.6282302737236023, "rewards/margins": -0.7764355540275574, "rewards/rejected": 1.4046658277511597, "step": 9387 }, { "epoch": 1.52, "learning_rate": 3.6476050871767305e-07, "logits/chosen": -0.4562118947505951, "logits/rejected": -0.349496454000473, "logps/chosen": -56.218971252441406, "logps/rejected": -28.7237606048584, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 2.3263566493988037, "rewards/margins": 2.286848783493042, "rewards/rejected": 0.03950786590576172, "step": 9388 }, { "epoch": 1.52, "learning_rate": 3.646339867614975e-07, "logits/chosen": -0.6774376630783081, "logits/rejected": -0.7069344520568848, "logps/chosen": -80.59877014160156, "logps/rejected": -116.49427032470703, "loss": 0.7527, "rewards/accuracies": 0.0, "rewards/chosen": 2.074298143386841, "rewards/margins": -1.2316093444824219, "rewards/rejected": 3.3059074878692627, "step": 9389 }, { "epoch": 1.52, "learning_rate": 3.6450747415783254e-07, "logits/chosen": -1.03354811668396, "logits/rejected": -0.9899842143058777, "logps/chosen": -147.94583129882812, "logps/rejected": -43.42314910888672, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 3.586761474609375, "rewards/margins": 0.56819748878479, "rewards/rejected": 3.018563985824585, "step": 9390 }, { "epoch": 1.52, "learning_rate": 3.643809709154191e-07, "logits/chosen": -0.7034496665000916, "logits/rejected": -0.7034496665000916, "logps/chosen": -95.31094360351562, "logps/rejected": -95.31094360351562, "loss": 0.4672, "rewards/accuracies": 0.0, "rewards/chosen": 2.905796766281128, "rewards/margins": 0.0, "rewards/rejected": 2.905796766281128, "step": 9391 }, { "epoch": 1.52, "learning_rate": 3.642544770429974e-07, "logits/chosen": -0.6735120415687561, "logits/rejected": -0.5666055083274841, "logps/chosen": -69.15843963623047, "logps/rejected": -76.0981674194336, "loss": 0.1733, "rewards/accuracies": 1.0, "rewards/chosen": 1.8849449157714844, "rewards/margins": 0.9124007821083069, "rewards/rejected": 0.9725441336631775, "step": 9392 }, { "epoch": 1.52, "learning_rate": 3.6412799254930684e-07, "logits/chosen": -0.4898470640182495, "logits/rejected": -0.49198704957962036, "logps/chosen": -16.712448120117188, "logps/rejected": -2.9165706634521484, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.23847655951976776, "rewards/margins": 0.1198662742972374, "rewards/rejected": 0.11861028522253036, "step": 9393 }, { "epoch": 1.52, "learning_rate": 3.640015174430864e-07, "logits/chosen": -0.6750447750091553, "logits/rejected": -0.687709629535675, "logps/chosen": -45.24134063720703, "logps/rejected": -67.42298889160156, "loss": 2.7553, "rewards/accuracies": 0.0, "rewards/chosen": 1.314164400100708, "rewards/margins": -0.5033618211746216, "rewards/rejected": 1.8175262212753296, "step": 9394 }, { "epoch": 1.52, "learning_rate": 3.638750517330742e-07, "logits/chosen": -0.5094062089920044, "logits/rejected": -0.5678739547729492, "logps/chosen": -71.59605407714844, "logps/rejected": -97.51107025146484, "loss": 0.7537, "rewards/accuracies": 0.0, "rewards/chosen": 1.0494316816329956, "rewards/margins": -0.09922099113464355, "rewards/rejected": 1.1486526727676392, "step": 9395 }, { "epoch": 1.53, "learning_rate": 3.6374859542800797e-07, "logits/chosen": -0.9080405235290527, "logits/rejected": -0.9490788578987122, "logps/chosen": -46.51753234863281, "logps/rejected": -92.7375259399414, "loss": 0.5231, "rewards/accuracies": 0.0, "rewards/chosen": 0.37146759033203125, "rewards/margins": -0.5672569274902344, "rewards/rejected": 0.9387245178222656, "step": 9396 }, { "epoch": 1.53, "learning_rate": 3.6362214853662447e-07, "logits/chosen": -0.5332550406455994, "logits/rejected": -0.4907093346118927, "logps/chosen": -35.60016632080078, "logps/rejected": -49.924598693847656, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": 2.1405277252197266, "rewards/margins": 0.7669093608856201, "rewards/rejected": 1.3736183643341064, "step": 9397 }, { "epoch": 1.53, "learning_rate": 3.634957110676602e-07, "logits/chosen": -0.6517212986946106, "logits/rejected": -0.5842897295951843, "logps/chosen": -110.02886962890625, "logps/rejected": -56.132408142089844, "loss": 0.2332, "rewards/accuracies": 1.0, "rewards/chosen": 3.8279144763946533, "rewards/margins": 0.5518996715545654, "rewards/rejected": 3.276014804840088, "step": 9398 }, { "epoch": 1.53, "learning_rate": 3.633692830298506e-07, "logits/chosen": -0.934547483921051, "logits/rejected": -0.7791581749916077, "logps/chosen": -123.98228454589844, "logps/rejected": -99.9382095336914, "loss": 0.118, "rewards/accuracies": 1.0, "rewards/chosen": 6.484571933746338, "rewards/margins": 2.7677299976348877, "rewards/rejected": 3.71684193611145, "step": 9399 }, { "epoch": 1.53, "learning_rate": 3.6324286443193077e-07, "logits/chosen": -0.2799466848373413, "logits/rejected": -0.31948965787887573, "logps/chosen": -140.65423583984375, "logps/rejected": -63.63111877441406, "loss": 0.4634, "rewards/accuracies": 1.0, "rewards/chosen": 3.8677613735198975, "rewards/margins": 2.824392795562744, "rewards/rejected": 1.0433685779571533, "step": 9400 }, { "epoch": 1.53, "learning_rate": 3.6311645528263503e-07, "logits/chosen": -0.790238082408905, "logits/rejected": -0.6384965777397156, "logps/chosen": -123.8938980102539, "logps/rejected": -83.04165649414062, "loss": 0.1512, "rewards/accuracies": 1.0, "rewards/chosen": 5.625051021575928, "rewards/margins": 2.1816213130950928, "rewards/rejected": 3.443429708480835, "step": 9401 }, { "epoch": 1.53, "learning_rate": 3.6299005559069694e-07, "logits/chosen": -0.8325220346450806, "logits/rejected": -0.7949203252792358, "logps/chosen": -97.60972595214844, "logps/rejected": -117.00946807861328, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": 2.26310133934021, "rewards/margins": 2.7728676795959473, "rewards/rejected": -0.5097663998603821, "step": 9402 }, { "epoch": 1.53, "learning_rate": 3.628636653648497e-07, "logits/chosen": -0.701384425163269, "logits/rejected": -0.5411812663078308, "logps/chosen": -138.64488220214844, "logps/rejected": -65.9080810546875, "loss": 0.5968, "rewards/accuracies": 1.0, "rewards/chosen": 5.0016679763793945, "rewards/margins": 2.8987085819244385, "rewards/rejected": 2.102959394454956, "step": 9403 }, { "epoch": 1.53, "learning_rate": 3.6273728461382553e-07, "logits/chosen": -1.0132161378860474, "logits/rejected": -1.0116024017333984, "logps/chosen": -133.11978149414062, "logps/rejected": -86.10477447509766, "loss": 0.7459, "rewards/accuracies": 0.0, "rewards/chosen": 3.7186920642852783, "rewards/margins": -0.7283651828765869, "rewards/rejected": 4.447057247161865, "step": 9404 }, { "epoch": 1.53, "learning_rate": 3.626109133463562e-07, "logits/chosen": -0.6197583675384521, "logits/rejected": -0.5617904663085938, "logps/chosen": -86.14938354492188, "logps/rejected": -17.340282440185547, "loss": 0.0697, "rewards/accuracies": 1.0, "rewards/chosen": 3.14512038230896, "rewards/margins": 2.010662078857422, "rewards/rejected": 1.1344581842422485, "step": 9405 }, { "epoch": 1.53, "learning_rate": 3.624845515711728e-07, "logits/chosen": -0.48899659514427185, "logits/rejected": -0.48255378007888794, "logps/chosen": -6.489933967590332, "logps/rejected": -5.810058116912842, "loss": 0.9997, "rewards/accuracies": 0.0, "rewards/chosen": -0.08324327319860458, "rewards/margins": -0.19422058761119843, "rewards/rejected": 0.11097731441259384, "step": 9406 }, { "epoch": 1.53, "learning_rate": 3.6235819929700575e-07, "logits/chosen": -0.5146589875221252, "logits/rejected": -0.5445291996002197, "logps/chosen": -52.03990173339844, "logps/rejected": -112.2083740234375, "loss": 0.4169, "rewards/accuracies": 0.0, "rewards/chosen": 1.7316421270370483, "rewards/margins": -0.2595353126525879, "rewards/rejected": 1.9911774396896362, "step": 9407 }, { "epoch": 1.53, "learning_rate": 3.6223185653258466e-07, "logits/chosen": -0.5197800397872925, "logits/rejected": -0.5162490606307983, "logps/chosen": -64.21575164794922, "logps/rejected": -99.96955108642578, "loss": 0.9231, "rewards/accuracies": 0.0, "rewards/chosen": 1.5608665943145752, "rewards/margins": -0.462644100189209, "rewards/rejected": 2.023510694503784, "step": 9408 }, { "epoch": 1.53, "learning_rate": 3.6210552328663867e-07, "logits/chosen": -0.8622415661811829, "logits/rejected": -0.8293755650520325, "logps/chosen": -39.67916488647461, "logps/rejected": -81.27783203125, "loss": 1.0165, "rewards/accuracies": 1.0, "rewards/chosen": 2.2076289653778076, "rewards/margins": 1.0436686277389526, "rewards/rejected": 1.163960337638855, "step": 9409 }, { "epoch": 1.53, "learning_rate": 3.6197919956789633e-07, "logits/chosen": -0.49577847123146057, "logits/rejected": -0.3988911807537079, "logps/chosen": -106.25471496582031, "logps/rejected": -60.415245056152344, "loss": 0.9599, "rewards/accuracies": 1.0, "rewards/chosen": 1.3047469854354858, "rewards/margins": 0.4409019351005554, "rewards/rejected": 0.8638450503349304, "step": 9410 }, { "epoch": 1.53, "learning_rate": 3.618528853850854e-07, "logits/chosen": -0.9257696270942688, "logits/rejected": -0.9599962830543518, "logps/chosen": -87.33216857910156, "logps/rejected": -123.59730529785156, "loss": 2.2284, "rewards/accuracies": 0.0, "rewards/chosen": 1.5774002075195312, "rewards/margins": -3.70430326461792, "rewards/rejected": 5.281703472137451, "step": 9411 }, { "epoch": 1.53, "learning_rate": 3.6172658074693285e-07, "logits/chosen": -0.8477513194084167, "logits/rejected": -0.6240096688270569, "logps/chosen": -111.21622467041016, "logps/rejected": -77.59056854248047, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": 7.713985443115234, "rewards/margins": 4.607146263122559, "rewards/rejected": 3.1068389415740967, "step": 9412 }, { "epoch": 1.53, "learning_rate": 3.616002856621652e-07, "logits/chosen": -0.9536168575286865, "logits/rejected": -0.9531603455543518, "logps/chosen": -83.91333770751953, "logps/rejected": -44.08998107910156, "loss": 1.581, "rewards/accuracies": 1.0, "rewards/chosen": 1.510506510734558, "rewards/margins": 0.45761728286743164, "rewards/rejected": 1.0528892278671265, "step": 9413 }, { "epoch": 1.53, "learning_rate": 3.614740001395083e-07, "logits/chosen": -0.9942983388900757, "logits/rejected": -0.8908655643463135, "logps/chosen": -122.04209899902344, "logps/rejected": -66.90481567382812, "loss": 0.9136, "rewards/accuracies": 0.0, "rewards/chosen": 1.057154893875122, "rewards/margins": -0.5656462907791138, "rewards/rejected": 1.6228011846542358, "step": 9414 }, { "epoch": 1.53, "learning_rate": 3.613477241876872e-07, "logits/chosen": -0.7744637131690979, "logits/rejected": -0.804359495639801, "logps/chosen": -127.57994079589844, "logps/rejected": -134.5414276123047, "loss": 0.4055, "rewards/accuracies": 0.0, "rewards/chosen": 5.913201808929443, "rewards/margins": -0.1536850929260254, "rewards/rejected": 6.066886901855469, "step": 9415 }, { "epoch": 1.53, "learning_rate": 3.6122145781542643e-07, "logits/chosen": -0.9527637362480164, "logits/rejected": -0.9440281987190247, "logps/chosen": -148.53466796875, "logps/rejected": -82.17721557617188, "loss": 1.0213, "rewards/accuracies": 1.0, "rewards/chosen": 5.1152496337890625, "rewards/margins": 2.9661171436309814, "rewards/rejected": 2.149132490158081, "step": 9416 }, { "epoch": 1.53, "learning_rate": 3.6109520103144984e-07, "logits/chosen": -0.5052932500839233, "logits/rejected": -0.41708141565322876, "logps/chosen": -48.5534782409668, "logps/rejected": -68.16177368164062, "loss": 0.186, "rewards/accuracies": 1.0, "rewards/chosen": 2.645840883255005, "rewards/margins": 0.8216968774795532, "rewards/rejected": 1.8241440057754517, "step": 9417 }, { "epoch": 1.53, "learning_rate": 3.609689538444805e-07, "logits/chosen": -1.0120296478271484, "logits/rejected": -1.0791630744934082, "logps/chosen": -72.05311584472656, "logps/rejected": -139.04075622558594, "loss": 0.7834, "rewards/accuracies": 0.0, "rewards/chosen": 1.7663239240646362, "rewards/margins": -1.3294249773025513, "rewards/rejected": 3.0957489013671875, "step": 9418 }, { "epoch": 1.53, "learning_rate": 3.608427162632409e-07, "logits/chosen": -0.7420729398727417, "logits/rejected": -0.5415914058685303, "logps/chosen": -93.38426208496094, "logps/rejected": -19.35722541809082, "loss": 0.0386, "rewards/accuracies": 1.0, "rewards/chosen": 3.3418824672698975, "rewards/margins": 3.2199857234954834, "rewards/rejected": 0.12189674377441406, "step": 9419 }, { "epoch": 1.53, "learning_rate": 3.6071648829645294e-07, "logits/chosen": -0.569717288017273, "logits/rejected": -0.6011186242103577, "logps/chosen": -96.38616180419922, "logps/rejected": -137.5538330078125, "loss": 0.7734, "rewards/accuracies": 1.0, "rewards/chosen": 1.2210105657577515, "rewards/margins": 0.21638870239257812, "rewards/rejected": 1.0046218633651733, "step": 9420 }, { "epoch": 1.53, "learning_rate": 3.605902699528376e-07, "logits/chosen": -0.45767825841903687, "logits/rejected": -0.41573911905288696, "logps/chosen": -69.88320922851562, "logps/rejected": -63.200469970703125, "loss": 0.2726, "rewards/accuracies": 1.0, "rewards/chosen": 2.5391128063201904, "rewards/margins": 0.5254087448120117, "rewards/rejected": 2.0137040615081787, "step": 9421 }, { "epoch": 1.53, "learning_rate": 3.6046406124111555e-07, "logits/chosen": -0.4530417323112488, "logits/rejected": -0.4530417323112488, "logps/chosen": -51.562896728515625, "logps/rejected": -51.562896728515625, "loss": 0.5679, "rewards/accuracies": 0.0, "rewards/chosen": 2.646249532699585, "rewards/margins": 0.0, "rewards/rejected": 2.646249532699585, "step": 9422 }, { "epoch": 1.53, "learning_rate": 3.603378621700066e-07, "logits/chosen": -0.9417046308517456, "logits/rejected": -0.7681903839111328, "logps/chosen": -176.32034301757812, "logps/rejected": -37.49869155883789, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 2.37424635887146, "rewards/margins": 2.098548650741577, "rewards/rejected": 0.2756977081298828, "step": 9423 }, { "epoch": 1.53, "learning_rate": 3.6021167274822994e-07, "logits/chosen": -0.44718137383461, "logits/rejected": -0.4563562273979187, "logps/chosen": -2.558441400527954, "logps/rejected": -1.3604748249053955, "loss": 0.7118, "rewards/accuracies": 0.0, "rewards/chosen": 0.30996355414390564, "rewards/margins": -0.006821244955062866, "rewards/rejected": 0.3167847990989685, "step": 9424 }, { "epoch": 1.53, "learning_rate": 3.6008549298450396e-07, "logits/chosen": -0.4302810728549957, "logits/rejected": -0.41488248109817505, "logps/chosen": -18.118274688720703, "logps/rejected": -19.022350311279297, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 0.446641743183136, "rewards/margins": 0.1090940535068512, "rewards/rejected": 0.3375476896762848, "step": 9425 }, { "epoch": 1.53, "learning_rate": 3.599593228875465e-07, "logits/chosen": -0.35539117455482483, "logits/rejected": -0.22434264421463013, "logps/chosen": -97.46919250488281, "logps/rejected": -32.976226806640625, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 1.6139236688613892, "rewards/margins": 1.321629285812378, "rewards/rejected": 0.29229432344436646, "step": 9426 }, { "epoch": 1.53, "learning_rate": 3.5983316246607474e-07, "logits/chosen": -0.8578493595123291, "logits/rejected": -0.828946053981781, "logps/chosen": -84.13480377197266, "logps/rejected": -61.46860885620117, "loss": 1.7073, "rewards/accuracies": 0.0, "rewards/chosen": 1.4424965381622314, "rewards/margins": -0.5920085906982422, "rewards/rejected": 2.0345051288604736, "step": 9427 }, { "epoch": 1.53, "learning_rate": 3.5970701172880525e-07, "logits/chosen": -0.785624086856842, "logits/rejected": -0.8033318519592285, "logps/chosen": -33.90003967285156, "logps/rejected": -78.77384948730469, "loss": 0.7784, "rewards/accuracies": 0.0, "rewards/chosen": 0.896554172039032, "rewards/margins": -1.1953036785125732, "rewards/rejected": 2.09185791015625, "step": 9428 }, { "epoch": 1.53, "learning_rate": 3.5958087068445374e-07, "logits/chosen": -0.5519090294837952, "logits/rejected": -0.525153636932373, "logps/chosen": -55.332515716552734, "logps/rejected": -40.190269470214844, "loss": 0.6011, "rewards/accuracies": 0.0, "rewards/chosen": 0.9283611178398132, "rewards/margins": -0.06212502717971802, "rewards/rejected": 0.9904861450195312, "step": 9429 }, { "epoch": 1.53, "learning_rate": 3.5945473934173543e-07, "logits/chosen": -0.7477438449859619, "logits/rejected": -0.7035093307495117, "logps/chosen": -153.38441467285156, "logps/rejected": -35.70939636230469, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": 4.329748630523682, "rewards/margins": 4.091968059539795, "rewards/rejected": 0.2377803772687912, "step": 9430 }, { "epoch": 1.53, "learning_rate": 3.593286177093648e-07, "logits/chosen": -0.8409988880157471, "logits/rejected": -0.8064189553260803, "logps/chosen": -71.43061828613281, "logps/rejected": -42.007530212402344, "loss": 0.4632, "rewards/accuracies": 0.0, "rewards/chosen": 2.7758872509002686, "rewards/margins": -0.1534433364868164, "rewards/rejected": 2.929330587387085, "step": 9431 }, { "epoch": 1.53, "learning_rate": 3.592025057960556e-07, "logits/chosen": -0.8422797322273254, "logits/rejected": -0.9852803945541382, "logps/chosen": -112.73109436035156, "logps/rejected": -131.27413940429688, "loss": 0.8638, "rewards/accuracies": 0.0, "rewards/chosen": 2.1474106311798096, "rewards/margins": -1.5128417015075684, "rewards/rejected": 3.660252332687378, "step": 9432 }, { "epoch": 1.53, "learning_rate": 3.5907640361052106e-07, "logits/chosen": -1.055703043937683, "logits/rejected": -0.948194682598114, "logps/chosen": -67.2031478881836, "logps/rejected": -31.69257354736328, "loss": 0.1143, "rewards/accuracies": 1.0, "rewards/chosen": 1.6409523487091064, "rewards/margins": 1.5366867780685425, "rewards/rejected": 0.10426559299230576, "step": 9433 }, { "epoch": 1.53, "learning_rate": 3.5895031116147353e-07, "logits/chosen": -1.0206449031829834, "logits/rejected": -0.9501296877861023, "logps/chosen": -141.51072692871094, "logps/rejected": -85.05792236328125, "loss": 0.0535, "rewards/accuracies": 1.0, "rewards/chosen": 6.2329301834106445, "rewards/margins": 3.2985384464263916, "rewards/rejected": 2.934391736984253, "step": 9434 }, { "epoch": 1.53, "learning_rate": 3.5882422845762487e-07, "logits/chosen": -0.8964567184448242, "logits/rejected": -0.9998287558555603, "logps/chosen": -190.26416015625, "logps/rejected": -138.01708984375, "loss": 0.0427, "rewards/accuracies": 1.0, "rewards/chosen": 3.574018955230713, "rewards/margins": 2.5651841163635254, "rewards/rejected": 1.0088348388671875, "step": 9435 }, { "epoch": 1.53, "learning_rate": 3.586981555076862e-07, "logits/chosen": -0.9056307077407837, "logits/rejected": -0.8644688725471497, "logps/chosen": -93.13802337646484, "logps/rejected": -63.1893424987793, "loss": 0.5622, "rewards/accuracies": 0.0, "rewards/chosen": 2.3081016540527344, "rewards/margins": -0.7159068584442139, "rewards/rejected": 3.0240085124969482, "step": 9436 }, { "epoch": 1.53, "learning_rate": 3.585720923203681e-07, "logits/chosen": -0.5854862928390503, "logits/rejected": -0.5035849809646606, "logps/chosen": -61.356712341308594, "logps/rejected": -48.18871307373047, "loss": 1.3221, "rewards/accuracies": 1.0, "rewards/chosen": 1.9179176092147827, "rewards/margins": 0.1989912986755371, "rewards/rejected": 1.7189263105392456, "step": 9437 }, { "epoch": 1.53, "learning_rate": 3.5844603890438005e-07, "logits/chosen": -0.24990524351596832, "logits/rejected": -0.2581145763397217, "logps/chosen": -5.118017673492432, "logps/rejected": -4.154481410980225, "loss": 0.7779, "rewards/accuracies": 0.0, "rewards/chosen": 0.23149099946022034, "rewards/margins": -0.049782127141952515, "rewards/rejected": 0.28127312660217285, "step": 9438 }, { "epoch": 1.53, "learning_rate": 3.583199952684315e-07, "logits/chosen": -0.9807314276695251, "logits/rejected": -0.9803350567817688, "logps/chosen": -105.80621337890625, "logps/rejected": -69.7420883178711, "loss": 2.1153, "rewards/accuracies": 0.0, "rewards/chosen": 1.3232635259628296, "rewards/margins": -2.3672614097595215, "rewards/rejected": 3.6905250549316406, "step": 9439 }, { "epoch": 1.53, "learning_rate": 3.5819396142123057e-07, "logits/chosen": -0.8692057728767395, "logits/rejected": -0.7679698467254639, "logps/chosen": -87.55347442626953, "logps/rejected": -52.0649299621582, "loss": 0.1151, "rewards/accuracies": 1.0, "rewards/chosen": 3.878746747970581, "rewards/margins": 2.428586959838867, "rewards/rejected": 1.4501599073410034, "step": 9440 }, { "epoch": 1.53, "learning_rate": 3.580679373714851e-07, "logits/chosen": -0.7296832203865051, "logits/rejected": -0.6970661282539368, "logps/chosen": -22.277549743652344, "logps/rejected": -104.92201232910156, "loss": 1.9751, "rewards/accuracies": 0.0, "rewards/chosen": 0.5832973718643188, "rewards/margins": -3.6081714630126953, "rewards/rejected": 4.191468715667725, "step": 9441 }, { "epoch": 1.53, "learning_rate": 3.5794192312790227e-07, "logits/chosen": -0.163260817527771, "logits/rejected": -0.1584698110818863, "logps/chosen": -8.938577651977539, "logps/rejected": -1.563307762145996, "loss": 0.3688, "rewards/accuracies": 0.0, "rewards/chosen": 0.2437165230512619, "rewards/margins": -0.020417168736457825, "rewards/rejected": 0.2641336917877197, "step": 9442 }, { "epoch": 1.53, "learning_rate": 3.5781591869918835e-07, "logits/chosen": -0.6042748093605042, "logits/rejected": -0.4290482997894287, "logps/chosen": -118.06314086914062, "logps/rejected": -38.14759826660156, "loss": 0.0759, "rewards/accuracies": 1.0, "rewards/chosen": 3.6221680641174316, "rewards/margins": 1.8227760791778564, "rewards/rejected": 1.7993919849395752, "step": 9443 }, { "epoch": 1.53, "learning_rate": 3.576899240940491e-07, "logits/chosen": -1.0182056427001953, "logits/rejected": -0.9367044568061829, "logps/chosen": -153.22543334960938, "logps/rejected": -50.1682243347168, "loss": 0.8615, "rewards/accuracies": 1.0, "rewards/chosen": 0.841815173625946, "rewards/margins": 0.7254619598388672, "rewards/rejected": 0.11635322868824005, "step": 9444 }, { "epoch": 1.53, "learning_rate": 3.575639393211895e-07, "logits/chosen": -0.7819949984550476, "logits/rejected": -0.6827114224433899, "logps/chosen": -93.2636489868164, "logps/rejected": -15.33963680267334, "loss": 0.7787, "rewards/accuracies": 1.0, "rewards/chosen": 0.5992302298545837, "rewards/margins": 0.0975348949432373, "rewards/rejected": 0.5016953349113464, "step": 9445 }, { "epoch": 1.53, "learning_rate": 3.5743796438931395e-07, "logits/chosen": -0.9403374195098877, "logits/rejected": -0.9617140889167786, "logps/chosen": -30.74997329711914, "logps/rejected": -59.58486557006836, "loss": 1.5711, "rewards/accuracies": 1.0, "rewards/chosen": 0.7609134912490845, "rewards/margins": 0.47700807452201843, "rewards/rejected": 0.28390541672706604, "step": 9446 }, { "epoch": 1.53, "learning_rate": 3.5731199930712617e-07, "logits/chosen": -0.9117005467414856, "logits/rejected": -0.9033826589584351, "logps/chosen": -101.93952941894531, "logps/rejected": -162.93157958984375, "loss": 0.597, "rewards/accuracies": 1.0, "rewards/chosen": 3.8475723266601562, "rewards/margins": 0.08868551254272461, "rewards/rejected": 3.7588868141174316, "step": 9447 }, { "epoch": 1.53, "learning_rate": 3.5718604408332907e-07, "logits/chosen": -0.37682443857192993, "logits/rejected": -0.3745126724243164, "logps/chosen": -70.043212890625, "logps/rejected": -78.74822998046875, "loss": 0.515, "rewards/accuracies": 0.0, "rewards/chosen": 1.189958930015564, "rewards/margins": -0.5125541687011719, "rewards/rejected": 1.7025130987167358, "step": 9448 }, { "epoch": 1.53, "learning_rate": 3.57060098726625e-07, "logits/chosen": -0.8899775147438049, "logits/rejected": -0.8453356623649597, "logps/chosen": -104.52923583984375, "logps/rejected": -66.66559600830078, "loss": 1.7716, "rewards/accuracies": 0.0, "rewards/chosen": 0.5653716921806335, "rewards/margins": -0.46843796968460083, "rewards/rejected": 1.0338096618652344, "step": 9449 }, { "epoch": 1.53, "learning_rate": 3.569341632457157e-07, "logits/chosen": -0.9003878235816956, "logits/rejected": -0.855114758014679, "logps/chosen": -65.0379638671875, "logps/rejected": -57.499855041503906, "loss": 0.1735, "rewards/accuracies": 1.0, "rewards/chosen": 2.3561768531799316, "rewards/margins": 0.954147458076477, "rewards/rejected": 1.4020293951034546, "step": 9450 }, { "epoch": 1.53, "learning_rate": 3.568082376493019e-07, "logits/chosen": -0.5374038815498352, "logits/rejected": -0.40438467264175415, "logps/chosen": -72.63723754882812, "logps/rejected": -139.17581176757812, "loss": 1.5934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9407127499580383, "rewards/margins": -1.9609200954437256, "rewards/rejected": 2.901632785797119, "step": 9451 }, { "epoch": 1.53, "learning_rate": 3.5668232194608407e-07, "logits/chosen": -0.6120738983154297, "logits/rejected": -0.6412381529808044, "logps/chosen": -116.71258544921875, "logps/rejected": -106.48963165283203, "loss": 0.6012, "rewards/accuracies": 1.0, "rewards/chosen": 3.3623931407928467, "rewards/margins": 2.7592551708221436, "rewards/rejected": 0.6031379699707031, "step": 9452 }, { "epoch": 1.53, "learning_rate": 3.5655641614476167e-07, "logits/chosen": -0.9049655795097351, "logits/rejected": -0.8951296210289001, "logps/chosen": -127.48898315429688, "logps/rejected": -146.39324951171875, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 1.9945862293243408, "rewards/margins": 0.1262756586074829, "rewards/rejected": 1.868310570716858, "step": 9453 }, { "epoch": 1.53, "learning_rate": 3.5643052025403365e-07, "logits/chosen": -1.0997765064239502, "logits/rejected": -1.101821780204773, "logps/chosen": -113.40811157226562, "logps/rejected": -158.86729431152344, "loss": 0.2882, "rewards/accuracies": 1.0, "rewards/chosen": 2.056182861328125, "rewards/margins": 0.319132924079895, "rewards/rejected": 1.73704993724823, "step": 9454 }, { "epoch": 1.53, "learning_rate": 3.5630463428259815e-07, "logits/chosen": -0.4938099682331085, "logits/rejected": -0.46295270323753357, "logps/chosen": -60.315093994140625, "logps/rejected": -88.2235107421875, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 3.784871816635132, "rewards/margins": 1.4758827686309814, "rewards/rejected": 2.3089890480041504, "step": 9455 }, { "epoch": 1.53, "learning_rate": 3.5617875823915287e-07, "logits/chosen": -0.6892808675765991, "logits/rejected": -0.7314494848251343, "logps/chosen": -61.658729553222656, "logps/rejected": -63.80733871459961, "loss": 1.5965, "rewards/accuracies": 0.0, "rewards/chosen": 1.7572624683380127, "rewards/margins": -1.5532069206237793, "rewards/rejected": 3.310469388961792, "step": 9456 }, { "epoch": 1.53, "learning_rate": 3.5605289213239434e-07, "logits/chosen": -0.8380004167556763, "logits/rejected": -0.8613023161888123, "logps/chosen": -126.34347534179688, "logps/rejected": -176.74169921875, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 1.5526779890060425, "rewards/margins": 0.6941086649894714, "rewards/rejected": 0.858569324016571, "step": 9457 }, { "epoch": 1.54, "learning_rate": 3.559270359710191e-07, "logits/chosen": -0.5025743842124939, "logits/rejected": -0.5011418461799622, "logps/chosen": -9.677007675170898, "logps/rejected": -2.5768239498138428, "loss": 1.0404, "rewards/accuracies": 0.0, "rewards/chosen": -0.06388922035694122, "rewards/margins": -0.36188840866088867, "rewards/rejected": 0.29799917340278625, "step": 9458 }, { "epoch": 1.54, "learning_rate": 3.558011897637224e-07, "logits/chosen": -1.0651984214782715, "logits/rejected": -1.0181307792663574, "logps/chosen": -71.06988525390625, "logps/rejected": -81.36630249023438, "loss": 0.7817, "rewards/accuracies": 0.0, "rewards/chosen": 2.373730421066284, "rewards/margins": -1.1743690967559814, "rewards/rejected": 3.5480995178222656, "step": 9459 }, { "epoch": 1.54, "learning_rate": 3.5567535351919906e-07, "logits/chosen": -0.5958255529403687, "logits/rejected": -1.0093680620193481, "logps/chosen": -94.15287780761719, "logps/rejected": -37.930870056152344, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 2.1323564052581787, "rewards/margins": 1.8089876174926758, "rewards/rejected": 0.3233688473701477, "step": 9460 }, { "epoch": 1.54, "learning_rate": 3.555495272461433e-07, "logits/chosen": -0.710827112197876, "logits/rejected": -0.6994801759719849, "logps/chosen": -88.17064666748047, "logps/rejected": -142.4295654296875, "loss": 3.0409, "rewards/accuracies": 0.0, "rewards/chosen": 1.4906891584396362, "rewards/margins": -4.555469989776611, "rewards/rejected": 6.046159267425537, "step": 9461 }, { "epoch": 1.54, "learning_rate": 3.554237109532483e-07, "logits/chosen": -0.7727981209754944, "logits/rejected": -0.6384212374687195, "logps/chosen": -83.00685119628906, "logps/rejected": -68.41718292236328, "loss": 0.3683, "rewards/accuracies": 0.0, "rewards/chosen": 1.999707818031311, "rewards/margins": -0.05531156063079834, "rewards/rejected": 2.0550193786621094, "step": 9462 }, { "epoch": 1.54, "learning_rate": 3.5529790464920685e-07, "logits/chosen": -0.540474534034729, "logits/rejected": -0.49840784072875977, "logps/chosen": -64.10980224609375, "logps/rejected": -53.84026336669922, "loss": 1.0863, "rewards/accuracies": 1.0, "rewards/chosen": 1.2717323303222656, "rewards/margins": 0.01101076602935791, "rewards/rejected": 1.2607215642929077, "step": 9463 }, { "epoch": 1.54, "learning_rate": 3.5517210834271106e-07, "logits/chosen": -0.2663228511810303, "logits/rejected": -0.27160510420799255, "logps/chosen": -84.75502014160156, "logps/rejected": -52.80246353149414, "loss": 0.5188, "rewards/accuracies": 1.0, "rewards/chosen": 1.156476616859436, "rewards/margins": 0.2596660852432251, "rewards/rejected": 0.8968105316162109, "step": 9464 }, { "epoch": 1.54, "learning_rate": 3.550463220424522e-07, "logits/chosen": -0.9813416004180908, "logits/rejected": -1.0295922756195068, "logps/chosen": -56.139869689941406, "logps/rejected": -139.91128540039062, "loss": 0.8372, "rewards/accuracies": 0.0, "rewards/chosen": 2.0649986267089844, "rewards/margins": -0.9745004177093506, "rewards/rejected": 3.039499044418335, "step": 9465 }, { "epoch": 1.54, "learning_rate": 3.549205457571209e-07, "logits/chosen": -0.6946633458137512, "logits/rejected": -0.6946633458137512, "logps/chosen": -114.42524719238281, "logps/rejected": -114.42524719238281, "loss": 0.7229, "rewards/accuracies": 0.0, "rewards/chosen": 2.831822156906128, "rewards/margins": 0.0, "rewards/rejected": 2.831822156906128, "step": 9466 }, { "epoch": 1.54, "learning_rate": 3.547947794954074e-07, "logits/chosen": -0.5485588312149048, "logits/rejected": -0.46749240159988403, "logps/chosen": -44.97618865966797, "logps/rejected": -34.672088623046875, "loss": 1.5355, "rewards/accuracies": 1.0, "rewards/chosen": 1.5729068517684937, "rewards/margins": 0.23407971858978271, "rewards/rejected": 1.338827133178711, "step": 9467 }, { "epoch": 1.54, "learning_rate": 3.546690232660004e-07, "logits/chosen": -0.9228872060775757, "logits/rejected": -0.956653356552124, "logps/chosen": -56.746944427490234, "logps/rejected": -104.77569580078125, "loss": 1.7096, "rewards/accuracies": 0.0, "rewards/chosen": 2.855701208114624, "rewards/margins": -3.0792691707611084, "rewards/rejected": 5.934970378875732, "step": 9468 }, { "epoch": 1.54, "learning_rate": 3.5454327707758877e-07, "logits/chosen": -1.0622185468673706, "logits/rejected": -1.102617859840393, "logps/chosen": -104.11153411865234, "logps/rejected": -149.47979736328125, "loss": 2.4318, "rewards/accuracies": 0.0, "rewards/chosen": 1.5035438537597656, "rewards/margins": -4.145412445068359, "rewards/rejected": 5.648956298828125, "step": 9469 }, { "epoch": 1.54, "learning_rate": 3.5441754093886047e-07, "logits/chosen": -0.552049458026886, "logits/rejected": -0.4634416699409485, "logps/chosen": -54.81999969482422, "logps/rejected": -39.44358825683594, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 3.1281166076660156, "rewards/margins": 0.7503352165222168, "rewards/rejected": 2.377781391143799, "step": 9470 }, { "epoch": 1.54, "learning_rate": 3.5429181485850246e-07, "logits/chosen": -0.778732180595398, "logits/rejected": -0.6480875015258789, "logps/chosen": -149.0980224609375, "logps/rejected": -81.34591674804688, "loss": 1.9238, "rewards/accuracies": 0.0, "rewards/chosen": 0.8379486203193665, "rewards/margins": -1.2506470680236816, "rewards/rejected": 2.0885956287384033, "step": 9471 }, { "epoch": 1.54, "learning_rate": 3.541660988452014e-07, "logits/chosen": -0.4569491147994995, "logits/rejected": -0.4569491147994995, "logps/chosen": -30.677555084228516, "logps/rejected": -30.677555084228516, "loss": 0.4137, "rewards/accuracies": 0.0, "rewards/chosen": 0.6029769778251648, "rewards/margins": 0.0, "rewards/rejected": 0.6029769778251648, "step": 9472 }, { "epoch": 1.54, "learning_rate": 3.54040392907643e-07, "logits/chosen": -0.6985426545143127, "logits/rejected": -0.7093279361724854, "logps/chosen": -26.72689437866211, "logps/rejected": -69.75785827636719, "loss": 0.8727, "rewards/accuracies": 0.0, "rewards/chosen": 1.9028037786483765, "rewards/margins": -0.4807473421096802, "rewards/rejected": 2.3835511207580566, "step": 9473 }, { "epoch": 1.54, "learning_rate": 3.5391469705451236e-07, "logits/chosen": -0.7831602096557617, "logits/rejected": -0.5850800275802612, "logps/chosen": -82.60340881347656, "logps/rejected": -123.4706039428711, "loss": 1.8972, "rewards/accuracies": 0.0, "rewards/chosen": 1.4449844360351562, "rewards/margins": -1.1557457447052002, "rewards/rejected": 2.6007301807403564, "step": 9474 }, { "epoch": 1.54, "learning_rate": 3.537890112944939e-07, "logits/chosen": -0.5935494303703308, "logits/rejected": -0.4402012825012207, "logps/chosen": -120.701171875, "logps/rejected": -66.01006317138672, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 4.423623561859131, "rewards/margins": 3.1963348388671875, "rewards/rejected": 1.227288842201233, "step": 9475 }, { "epoch": 1.54, "learning_rate": 3.536633356362713e-07, "logits/chosen": -0.6805550456047058, "logits/rejected": -1.1216647624969482, "logps/chosen": -101.2159423828125, "logps/rejected": -34.73497009277344, "loss": 0.6083, "rewards/accuracies": 1.0, "rewards/chosen": 2.0675370693206787, "rewards/margins": 1.8126164674758911, "rewards/rejected": 0.2549205720424652, "step": 9476 }, { "epoch": 1.54, "learning_rate": 3.5353767008852743e-07, "logits/chosen": -0.472694993019104, "logits/rejected": -0.472694993019104, "logps/chosen": -89.31155395507812, "logps/rejected": -89.31155395507812, "loss": 1.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.34403687715530396, "rewards/margins": 0.0, "rewards/rejected": 0.34403687715530396, "step": 9477 }, { "epoch": 1.54, "learning_rate": 3.534120146599448e-07, "logits/chosen": -0.4308617413043976, "logits/rejected": -0.41908925771713257, "logps/chosen": -49.872535705566406, "logps/rejected": -67.41088104248047, "loss": 1.0798, "rewards/accuracies": 0.0, "rewards/chosen": 1.6143760681152344, "rewards/margins": -0.30197298526763916, "rewards/rejected": 1.9163490533828735, "step": 9478 }, { "epoch": 1.54, "learning_rate": 3.53286369359205e-07, "logits/chosen": -0.9063133001327515, "logits/rejected": -0.9063133001327515, "logps/chosen": -88.69322204589844, "logps/rejected": -88.69322204589844, "loss": 0.4909, "rewards/accuracies": 0.0, "rewards/chosen": 0.8098037838935852, "rewards/margins": 0.0, "rewards/rejected": 0.8098037838935852, "step": 9479 }, { "epoch": 1.54, "learning_rate": 3.531607341949888e-07, "logits/chosen": -0.5959312319755554, "logits/rejected": -0.590469479560852, "logps/chosen": -115.724609375, "logps/rejected": -85.2189712524414, "loss": 0.7683, "rewards/accuracies": 1.0, "rewards/chosen": 2.681042432785034, "rewards/margins": 0.17451167106628418, "rewards/rejected": 2.50653076171875, "step": 9480 }, { "epoch": 1.54, "learning_rate": 3.530351091759765e-07, "logits/chosen": -0.7235406041145325, "logits/rejected": -0.653308629989624, "logps/chosen": -72.52934265136719, "logps/rejected": -21.90389060974121, "loss": 0.3328, "rewards/accuracies": 1.0, "rewards/chosen": 1.76335608959198, "rewards/margins": 1.6239756345748901, "rewards/rejected": 0.13938045501708984, "step": 9481 }, { "epoch": 1.54, "learning_rate": 3.529094943108475e-07, "logits/chosen": -0.5938329100608826, "logits/rejected": -0.5938329100608826, "logps/chosen": -68.46023559570312, "logps/rejected": -68.46023559570312, "loss": 2.2466, "rewards/accuracies": 0.0, "rewards/chosen": 1.6522018909454346, "rewards/margins": 0.0, "rewards/rejected": 1.6522018909454346, "step": 9482 }, { "epoch": 1.54, "learning_rate": 3.5278388960828076e-07, "logits/chosen": -0.9974541068077087, "logits/rejected": -0.994236409664154, "logps/chosen": -85.8367919921875, "logps/rejected": -55.064815521240234, "loss": 0.8354, "rewards/accuracies": 0.0, "rewards/chosen": 2.094738721847534, "rewards/margins": -0.9868383407592773, "rewards/rejected": 3.0815770626068115, "step": 9483 }, { "epoch": 1.54, "learning_rate": 3.526582950769542e-07, "logits/chosen": -1.1627811193466187, "logits/rejected": -1.0492010116577148, "logps/chosen": -130.19683837890625, "logps/rejected": -140.85635375976562, "loss": 0.3684, "rewards/accuracies": 1.0, "rewards/chosen": 6.254300117492676, "rewards/margins": 0.6048216819763184, "rewards/rejected": 5.649478435516357, "step": 9484 }, { "epoch": 1.54, "learning_rate": 3.525327107255453e-07, "logits/chosen": -0.94692462682724, "logits/rejected": -0.7790360450744629, "logps/chosen": -89.3634033203125, "logps/rejected": -31.21508026123047, "loss": 0.0466, "rewards/accuracies": 1.0, "rewards/chosen": 3.815035343170166, "rewards/margins": 3.326573371887207, "rewards/rejected": 0.48846206068992615, "step": 9485 }, { "epoch": 1.54, "learning_rate": 3.5240713656273074e-07, "logits/chosen": -0.6309714913368225, "logits/rejected": -0.6536346077919006, "logps/chosen": -69.93362426757812, "logps/rejected": -135.00022888183594, "loss": 0.1227, "rewards/accuracies": 1.0, "rewards/chosen": 1.6083236932754517, "rewards/margins": 1.6242599487304688, "rewards/rejected": -0.01593627966940403, "step": 9486 }, { "epoch": 1.54, "learning_rate": 3.522815725971865e-07, "logits/chosen": -0.4503662586212158, "logits/rejected": -0.4296985864639282, "logps/chosen": -101.57854461669922, "logps/rejected": -74.96784210205078, "loss": 1.4514, "rewards/accuracies": 0.0, "rewards/chosen": 0.34896087646484375, "rewards/margins": -0.7927703857421875, "rewards/rejected": 1.1417312622070312, "step": 9487 }, { "epoch": 1.54, "learning_rate": 3.5215601883758795e-07, "logits/chosen": -0.8542454242706299, "logits/rejected": -0.848780632019043, "logps/chosen": -92.80420684814453, "logps/rejected": -110.03457641601562, "loss": 0.4413, "rewards/accuracies": 0.0, "rewards/chosen": 0.8038474917411804, "rewards/margins": -0.30893629789352417, "rewards/rejected": 1.1127837896347046, "step": 9488 }, { "epoch": 1.54, "learning_rate": 3.520304752926095e-07, "logits/chosen": -0.6489207148551941, "logits/rejected": -0.7001739144325256, "logps/chosen": -134.79949951171875, "logps/rejected": -99.02240753173828, "loss": 0.9056, "rewards/accuracies": 0.0, "rewards/chosen": 5.843453884124756, "rewards/margins": -0.11458683013916016, "rewards/rejected": 5.958040714263916, "step": 9489 }, { "epoch": 1.54, "learning_rate": 3.519049419709251e-07, "logits/chosen": -0.5392937660217285, "logits/rejected": -0.458177775144577, "logps/chosen": -45.19125747680664, "logps/rejected": -42.07548141479492, "loss": 0.172, "rewards/accuracies": 1.0, "rewards/chosen": 2.3167026042938232, "rewards/margins": 1.8658874034881592, "rewards/rejected": 0.45081520080566406, "step": 9490 }, { "epoch": 1.54, "learning_rate": 3.5177941888120797e-07, "logits/chosen": -0.35489872097969055, "logits/rejected": -0.3735564351081848, "logps/chosen": -10.943561553955078, "logps/rejected": -22.483232498168945, "loss": 1.1159, "rewards/accuracies": 0.0, "rewards/chosen": -0.2527231276035309, "rewards/margins": -0.49850788712501526, "rewards/rejected": 0.24578475952148438, "step": 9491 }, { "epoch": 1.54, "learning_rate": 3.5165390603213054e-07, "logits/chosen": -0.9571158289909363, "logits/rejected": -1.0807421207427979, "logps/chosen": -285.8245849609375, "logps/rejected": -160.86898803710938, "loss": 0.7915, "rewards/accuracies": 0.0, "rewards/chosen": 4.834527492523193, "rewards/margins": -1.2240142822265625, "rewards/rejected": 6.058541774749756, "step": 9492 }, { "epoch": 1.54, "learning_rate": 3.515284034323645e-07, "logits/chosen": -0.9533073306083679, "logits/rejected": -0.7709510922431946, "logps/chosen": -175.35714721679688, "logps/rejected": -32.15354537963867, "loss": 0.1572, "rewards/accuracies": 1.0, "rewards/chosen": 1.01861572265625, "rewards/margins": 1.0293941497802734, "rewards/rejected": -0.010778427124023438, "step": 9493 }, { "epoch": 1.54, "learning_rate": 3.5140291109058086e-07, "logits/chosen": -0.6804512739181519, "logits/rejected": -0.5903725624084473, "logps/chosen": -75.91495513916016, "logps/rejected": -19.0468692779541, "loss": 0.2379, "rewards/accuracies": 1.0, "rewards/chosen": 2.031662702560425, "rewards/margins": 1.8906378746032715, "rewards/rejected": 0.14102478325366974, "step": 9494 }, { "epoch": 1.54, "learning_rate": 3.512774290154501e-07, "logits/chosen": -1.0650243759155273, "logits/rejected": -1.098415732383728, "logps/chosen": -105.3467788696289, "logps/rejected": -130.957763671875, "loss": 1.8309, "rewards/accuracies": 0.0, "rewards/chosen": 1.3827781677246094, "rewards/margins": -2.8885674476623535, "rewards/rejected": 4.271345615386963, "step": 9495 }, { "epoch": 1.54, "learning_rate": 3.511519572156418e-07, "logits/chosen": -0.6161975264549255, "logits/rejected": -0.6161975264549255, "logps/chosen": -1.813679575920105, "logps/rejected": -1.813679575920105, "loss": 0.5382, "rewards/accuracies": 0.0, "rewards/chosen": 0.3077041804790497, "rewards/margins": 0.0, "rewards/rejected": 0.3077041804790497, "step": 9496 }, { "epoch": 1.54, "learning_rate": 3.510264956998248e-07, "logits/chosen": -0.6608256697654724, "logits/rejected": -0.703921914100647, "logps/chosen": -87.89168548583984, "logps/rejected": -119.20347595214844, "loss": 0.7453, "rewards/accuracies": 0.0, "rewards/chosen": 2.449160099029541, "rewards/margins": -1.2102317810058594, "rewards/rejected": 3.6593918800354004, "step": 9497 }, { "epoch": 1.54, "learning_rate": 3.5090104447666734e-07, "logits/chosen": -0.8670099377632141, "logits/rejected": -0.840264081954956, "logps/chosen": -78.43095397949219, "logps/rejected": -136.28993225097656, "loss": 0.3854, "rewards/accuracies": 1.0, "rewards/chosen": 1.1224106550216675, "rewards/margins": 1.1272873878479004, "rewards/rejected": -0.0048767090775072575, "step": 9498 }, { "epoch": 1.54, "learning_rate": 3.507756035548369e-07, "logits/chosen": -0.862468421459198, "logits/rejected": -0.8574032783508301, "logps/chosen": -123.3633041381836, "logps/rejected": -109.43374633789062, "loss": 1.0325, "rewards/accuracies": 0.0, "rewards/chosen": 0.8550682067871094, "rewards/margins": -0.09012454748153687, "rewards/rejected": 0.9451927542686462, "step": 9499 }, { "epoch": 1.54, "learning_rate": 3.5065017294300037e-07, "logits/chosen": -0.8357109427452087, "logits/rejected": -0.8800634145736694, "logps/chosen": -66.61813354492188, "logps/rejected": -107.02093505859375, "loss": 0.2999, "rewards/accuracies": 1.0, "rewards/chosen": 1.1127747297286987, "rewards/margins": 0.24241948127746582, "rewards/rejected": 0.8703552484512329, "step": 9500 }, { "epoch": 1.54, "learning_rate": 3.5052475264982364e-07, "logits/chosen": -0.2412496656179428, "logits/rejected": -0.2412496656179428, "logps/chosen": -6.22204065322876, "logps/rejected": -6.22204065322876, "loss": 0.6304, "rewards/accuracies": 0.0, "rewards/chosen": 0.7223247289657593, "rewards/margins": 0.0, "rewards/rejected": 0.7223247289657593, "step": 9501 }, { "epoch": 1.54, "learning_rate": 3.503993426839722e-07, "logits/chosen": -0.8624864220619202, "logits/rejected": -0.9351704716682434, "logps/chosen": -200.97618103027344, "logps/rejected": -87.93843078613281, "loss": 1.0923, "rewards/accuracies": 0.0, "rewards/chosen": 3.75732421875, "rewards/margins": -2.012782573699951, "rewards/rejected": 5.770106792449951, "step": 9502 }, { "epoch": 1.54, "learning_rate": 3.502739430541106e-07, "logits/chosen": -1.0112699270248413, "logits/rejected": -0.9550628066062927, "logps/chosen": -72.68619537353516, "logps/rejected": -39.254154205322266, "loss": 0.1439, "rewards/accuracies": 1.0, "rewards/chosen": 1.5013580322265625, "rewards/margins": 1.2420116662979126, "rewards/rejected": 0.2593463957309723, "step": 9503 }, { "epoch": 1.54, "learning_rate": 3.5014855376890283e-07, "logits/chosen": -0.8915215730667114, "logits/rejected": -1.0922627449035645, "logps/chosen": -440.616455078125, "logps/rejected": -122.83148193359375, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 5.559472560882568, "rewards/margins": 0.8316922187805176, "rewards/rejected": 4.727780342102051, "step": 9504 }, { "epoch": 1.54, "learning_rate": 3.5002317483701216e-07, "logits/chosen": -0.4735300838947296, "logits/rejected": -0.4735300838947296, "logps/chosen": -67.91925048828125, "logps/rejected": -67.91925048828125, "loss": 0.9122, "rewards/accuracies": 0.0, "rewards/chosen": 1.32359778881073, "rewards/margins": 0.0, "rewards/rejected": 1.32359778881073, "step": 9505 }, { "epoch": 1.54, "learning_rate": 3.4989780626710095e-07, "logits/chosen": -0.8663950562477112, "logits/rejected": -0.8374884128570557, "logps/chosen": -118.6734390258789, "logps/rejected": -129.5680389404297, "loss": 1.8379, "rewards/accuracies": 0.0, "rewards/chosen": 0.19333267211914062, "rewards/margins": -1.646521806716919, "rewards/rejected": 1.8398544788360596, "step": 9506 }, { "epoch": 1.54, "learning_rate": 3.49772448067831e-07, "logits/chosen": -1.806296944618225, "logits/rejected": -1.8353534936904907, "logps/chosen": -158.2705078125, "logps/rejected": -159.66403198242188, "loss": 1.312, "rewards/accuracies": 0.0, "rewards/chosen": 5.445990085601807, "rewards/margins": -0.41260671615600586, "rewards/rejected": 5.8585968017578125, "step": 9507 }, { "epoch": 1.54, "learning_rate": 3.4964710024786347e-07, "logits/chosen": -0.7907215356826782, "logits/rejected": -0.7290037274360657, "logps/chosen": -150.55062866210938, "logps/rejected": -69.6472396850586, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 4.655208110809326, "rewards/margins": 2.9750216007232666, "rewards/rejected": 1.6801865100860596, "step": 9508 }, { "epoch": 1.54, "learning_rate": 3.495217628158587e-07, "logits/chosen": -0.7487667798995972, "logits/rejected": -0.6289169788360596, "logps/chosen": -93.9879379272461, "logps/rejected": -54.026939392089844, "loss": 0.5573, "rewards/accuracies": 0.0, "rewards/chosen": 2.2327592372894287, "rewards/margins": -0.6127593517303467, "rewards/rejected": 2.8455185890197754, "step": 9509 }, { "epoch": 1.54, "learning_rate": 3.493964357804763e-07, "logits/chosen": -0.5120337605476379, "logits/rejected": -0.5222653150558472, "logps/chosen": -52.5057373046875, "logps/rejected": -58.78011703491211, "loss": 1.4588, "rewards/accuracies": 0.0, "rewards/chosen": 1.7041893005371094, "rewards/margins": -1.2354915142059326, "rewards/rejected": 2.939680814743042, "step": 9510 }, { "epoch": 1.54, "learning_rate": 3.492711191503751e-07, "logits/chosen": -0.44955185055732727, "logits/rejected": -0.43956509232521057, "logps/chosen": -57.53398132324219, "logps/rejected": -94.83290100097656, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 1.1453338861465454, "rewards/margins": 0.6319076418876648, "rewards/rejected": 0.5134262442588806, "step": 9511 }, { "epoch": 1.54, "learning_rate": 3.491458129342133e-07, "logits/chosen": -0.29440751671791077, "logits/rejected": -0.2875555455684662, "logps/chosen": -57.52874755859375, "logps/rejected": -99.43805694580078, "loss": 0.5065, "rewards/accuracies": 1.0, "rewards/chosen": 0.7124786376953125, "rewards/margins": 0.9755172729492188, "rewards/rejected": -0.26303863525390625, "step": 9512 }, { "epoch": 1.54, "learning_rate": 3.490205171406484e-07, "logits/chosen": -0.8597267270088196, "logits/rejected": -0.7852645516395569, "logps/chosen": -40.711978912353516, "logps/rejected": -158.75634765625, "loss": 2.6699, "rewards/accuracies": 0.0, "rewards/chosen": 1.729990005493164, "rewards/margins": -5.250582695007324, "rewards/rejected": 6.980572700500488, "step": 9513 }, { "epoch": 1.54, "learning_rate": 3.488952317783374e-07, "logits/chosen": -0.888874888420105, "logits/rejected": -0.838366687297821, "logps/chosen": -88.17976379394531, "logps/rejected": -32.297489166259766, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 0.8983734250068665, "rewards/margins": 0.5345897674560547, "rewards/rejected": 0.36378365755081177, "step": 9514 }, { "epoch": 1.54, "learning_rate": 3.487699568559359e-07, "logits/chosen": -0.5045928359031677, "logits/rejected": -0.5045928359031677, "logps/chosen": -52.38676071166992, "logps/rejected": -52.38676071166992, "loss": 0.4306, "rewards/accuracies": 0.0, "rewards/chosen": 1.1864551305770874, "rewards/margins": 0.0, "rewards/rejected": 1.1864551305770874, "step": 9515 }, { "epoch": 1.54, "learning_rate": 3.4864469238209953e-07, "logits/chosen": -0.4888817071914673, "logits/rejected": -0.47500014305114746, "logps/chosen": -22.125688552856445, "logps/rejected": -1.8597005605697632, "loss": 1.3097, "rewards/accuracies": 0.0, "rewards/chosen": -0.003147697541862726, "rewards/margins": -0.4021923542022705, "rewards/rejected": 0.3990446627140045, "step": 9516 }, { "epoch": 1.54, "learning_rate": 3.4851943836548283e-07, "logits/chosen": -0.4555743336677551, "logits/rejected": -0.45228472352027893, "logps/chosen": -11.263755798339844, "logps/rejected": -8.071166038513184, "loss": 1.0844, "rewards/accuracies": 0.0, "rewards/chosen": 0.24099703133106232, "rewards/margins": -0.005469039082527161, "rewards/rejected": 0.24646607041358948, "step": 9517 }, { "epoch": 1.54, "learning_rate": 3.4839419481473953e-07, "logits/chosen": -0.8291386961936951, "logits/rejected": -0.6991226077079773, "logps/chosen": -116.97217559814453, "logps/rejected": -62.68336486816406, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": 4.598484039306641, "rewards/margins": 2.1281890869140625, "rewards/rejected": 2.470294952392578, "step": 9518 }, { "epoch": 1.55, "learning_rate": 3.482689617385229e-07, "logits/chosen": -0.08826931565999985, "logits/rejected": -0.08826931565999985, "logps/chosen": -4.520127296447754, "logps/rejected": -4.520127296447754, "loss": 0.3557, "rewards/accuracies": 0.0, "rewards/chosen": 0.5590214729309082, "rewards/margins": 0.0, "rewards/rejected": 0.5590214729309082, "step": 9519 }, { "epoch": 1.55, "learning_rate": 3.481437391454853e-07, "logits/chosen": -0.8567919731140137, "logits/rejected": -0.7876336574554443, "logps/chosen": -57.398807525634766, "logps/rejected": -43.68671798706055, "loss": 1.6245, "rewards/accuracies": 1.0, "rewards/chosen": 1.3004940748214722, "rewards/margins": 0.06746184825897217, "rewards/rejected": 1.2330322265625, "step": 9520 }, { "epoch": 1.55, "learning_rate": 3.4801852704427845e-07, "logits/chosen": -0.7356400489807129, "logits/rejected": -0.7743350267410278, "logps/chosen": -81.18529510498047, "logps/rejected": -65.46876525878906, "loss": 1.6861, "rewards/accuracies": 0.0, "rewards/chosen": 0.5809044241905212, "rewards/margins": -1.8632986545562744, "rewards/rejected": 2.4442031383514404, "step": 9521 }, { "epoch": 1.55, "learning_rate": 3.478933254435534e-07, "logits/chosen": -0.6874272227287292, "logits/rejected": -0.7319613099098206, "logps/chosen": -62.559051513671875, "logps/rejected": -37.593589782714844, "loss": 0.6064, "rewards/accuracies": 0.0, "rewards/chosen": 0.9628036618232727, "rewards/margins": -0.44891661405563354, "rewards/rejected": 1.4117202758789062, "step": 9522 }, { "epoch": 1.55, "learning_rate": 3.4776813435196025e-07, "logits/chosen": -0.8963873982429504, "logits/rejected": -0.8209782838821411, "logps/chosen": -78.03385925292969, "logps/rejected": -134.785888671875, "loss": 0.7771, "rewards/accuracies": 1.0, "rewards/chosen": 5.727330207824707, "rewards/margins": 0.24950265884399414, "rewards/rejected": 5.477827548980713, "step": 9523 }, { "epoch": 1.55, "learning_rate": 3.4764295377814855e-07, "logits/chosen": -0.9373795986175537, "logits/rejected": -0.8924715518951416, "logps/chosen": -66.80140686035156, "logps/rejected": -21.387311935424805, "loss": 0.5592, "rewards/accuracies": 1.0, "rewards/chosen": 0.9532577395439148, "rewards/margins": 0.19263511896133423, "rewards/rejected": 0.7606226205825806, "step": 9524 }, { "epoch": 1.55, "learning_rate": 3.475177837307671e-07, "logits/chosen": -0.5859277248382568, "logits/rejected": -0.5501390099525452, "logps/chosen": -90.96751403808594, "logps/rejected": -60.84046173095703, "loss": 0.3739, "rewards/accuracies": 1.0, "rewards/chosen": 2.4513099193573, "rewards/margins": 0.3667325973510742, "rewards/rejected": 2.0845773220062256, "step": 9525 }, { "epoch": 1.55, "learning_rate": 3.473926242184642e-07, "logits/chosen": -0.496371865272522, "logits/rejected": -0.6280784606933594, "logps/chosen": -74.85850524902344, "logps/rejected": -120.04104614257812, "loss": 2.7574, "rewards/accuracies": 0.0, "rewards/chosen": 1.175689697265625, "rewards/margins": -3.652099609375, "rewards/rejected": 4.827789306640625, "step": 9526 }, { "epoch": 1.55, "learning_rate": 3.4726747524988675e-07, "logits/chosen": -0.9775102734565735, "logits/rejected": -0.8684404492378235, "logps/chosen": -91.7088623046875, "logps/rejected": -66.45293426513672, "loss": 0.9818, "rewards/accuracies": 0.0, "rewards/chosen": 1.6544907093048096, "rewards/margins": -0.0779639482498169, "rewards/rejected": 1.7324546575546265, "step": 9527 }, { "epoch": 1.55, "learning_rate": 3.4714233683368167e-07, "logits/chosen": -0.4151306748390198, "logits/rejected": -0.4151306748390198, "logps/chosen": -39.76220703125, "logps/rejected": -39.76220703125, "loss": 0.3629, "rewards/accuracies": 0.0, "rewards/chosen": 1.5529282093048096, "rewards/margins": 0.0, "rewards/rejected": 1.5529282093048096, "step": 9528 }, { "epoch": 1.55, "learning_rate": 3.4701720897849477e-07, "logits/chosen": -0.6465986371040344, "logits/rejected": -0.6586521863937378, "logps/chosen": -47.68994140625, "logps/rejected": -86.61689758300781, "loss": 1.1707, "rewards/accuracies": 0.0, "rewards/chosen": 1.0104515552520752, "rewards/margins": -1.3227622509002686, "rewards/rejected": 2.3332138061523438, "step": 9529 }, { "epoch": 1.55, "learning_rate": 3.4689209169297117e-07, "logits/chosen": -0.860192596912384, "logits/rejected": -0.7523695230484009, "logps/chosen": -108.49581146240234, "logps/rejected": -21.188278198242188, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 5.9634013175964355, "rewards/margins": 5.5328497886657715, "rewards/rejected": 0.43055152893066406, "step": 9530 }, { "epoch": 1.55, "learning_rate": 3.467669849857554e-07, "logits/chosen": -1.0724982023239136, "logits/rejected": -1.0713706016540527, "logps/chosen": -78.2497329711914, "logps/rejected": -104.69033813476562, "loss": 1.0445, "rewards/accuracies": 0.0, "rewards/chosen": 1.6010102033615112, "rewards/margins": -1.327189564704895, "rewards/rejected": 2.9281997680664062, "step": 9531 }, { "epoch": 1.55, "learning_rate": 3.46641888865491e-07, "logits/chosen": -0.8829189538955688, "logits/rejected": -0.8431136012077332, "logps/chosen": -60.52257537841797, "logps/rejected": -24.05988121032715, "loss": 0.4176, "rewards/accuracies": 0.0, "rewards/chosen": 0.3107261657714844, "rewards/margins": -0.10956001281738281, "rewards/rejected": 0.4202861785888672, "step": 9532 }, { "epoch": 1.55, "learning_rate": 3.46516803340821e-07, "logits/chosen": -1.034852147102356, "logits/rejected": -0.8927087187767029, "logps/chosen": -161.78408813476562, "logps/rejected": -62.55181884765625, "loss": 0.3176, "rewards/accuracies": 1.0, "rewards/chosen": 4.7300310134887695, "rewards/margins": 2.453270196914673, "rewards/rejected": 2.2767608165740967, "step": 9533 }, { "epoch": 1.55, "learning_rate": 3.4639172842038763e-07, "logits/chosen": -0.32376202940940857, "logits/rejected": -0.3745153844356537, "logps/chosen": -25.153318405151367, "logps/rejected": -99.05847930908203, "loss": 0.6796, "rewards/accuracies": 0.0, "rewards/chosen": -0.13870945572853088, "rewards/margins": -0.7661501169204712, "rewards/rejected": 0.6274406313896179, "step": 9534 }, { "epoch": 1.55, "learning_rate": 3.462666641128323e-07, "logits/chosen": -0.6212755441665649, "logits/rejected": -0.6212755441665649, "logps/chosen": -120.76515197753906, "logps/rejected": -120.76515197753906, "loss": 0.4292, "rewards/accuracies": 0.0, "rewards/chosen": 3.266798496246338, "rewards/margins": 0.0, "rewards/rejected": 3.266798496246338, "step": 9535 }, { "epoch": 1.55, "learning_rate": 3.461416104267959e-07, "logits/chosen": -0.2595919966697693, "logits/rejected": -0.25902408361434937, "logps/chosen": -2.7512497901916504, "logps/rejected": -1.1870752573013306, "loss": 0.4826, "rewards/accuracies": 0.0, "rewards/chosen": 0.2711930274963379, "rewards/margins": -0.1683397889137268, "rewards/rejected": 0.4395328164100647, "step": 9536 }, { "epoch": 1.55, "learning_rate": 3.4601656737091844e-07, "logits/chosen": -0.5894801616668701, "logits/rejected": -0.5126442313194275, "logps/chosen": -65.54750061035156, "logps/rejected": -43.871864318847656, "loss": 1.7338, "rewards/accuracies": 0.0, "rewards/chosen": 1.8130470514297485, "rewards/margins": -0.1353515386581421, "rewards/rejected": 1.9483985900878906, "step": 9537 }, { "epoch": 1.55, "learning_rate": 3.458915349538391e-07, "logits/chosen": -0.9522665739059448, "logits/rejected": -0.9602212905883789, "logps/chosen": -108.77839660644531, "logps/rejected": -48.08045959472656, "loss": 1.3228, "rewards/accuracies": 0.0, "rewards/chosen": -0.147715762257576, "rewards/margins": -1.5485154390335083, "rewards/rejected": 1.4007996320724487, "step": 9538 }, { "epoch": 1.55, "learning_rate": 3.4576651318419653e-07, "logits/chosen": -0.3072817027568817, "logits/rejected": -0.20378321409225464, "logps/chosen": -79.25798034667969, "logps/rejected": -1.5091724395751953, "loss": 0.3022, "rewards/accuracies": 1.0, "rewards/chosen": 0.6654991507530212, "rewards/margins": 0.4285547733306885, "rewards/rejected": 0.23694436252117157, "step": 9539 }, { "epoch": 1.55, "learning_rate": 3.456415020706285e-07, "logits/chosen": -0.9092223644256592, "logits/rejected": -0.9092223644256592, "logps/chosen": -84.1856689453125, "logps/rejected": -84.1856689453125, "loss": 1.0081, "rewards/accuracies": 0.0, "rewards/chosen": 1.6275665760040283, "rewards/margins": 0.0, "rewards/rejected": 1.6275665760040283, "step": 9540 }, { "epoch": 1.55, "learning_rate": 3.4551650162177216e-07, "logits/chosen": -0.6861236691474915, "logits/rejected": -0.5601093769073486, "logps/chosen": -90.32994079589844, "logps/rejected": -189.83314514160156, "loss": 1.3377, "rewards/accuracies": 0.0, "rewards/chosen": 1.045843482017517, "rewards/margins": -2.526153564453125, "rewards/rejected": 3.5719971656799316, "step": 9541 }, { "epoch": 1.55, "learning_rate": 3.453915118462638e-07, "logits/chosen": -0.9479281306266785, "logits/rejected": -1.2180081605911255, "logps/chosen": -83.3145751953125, "logps/rejected": -34.811275482177734, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 2.1012465953826904, "rewards/margins": 1.780745267868042, "rewards/rejected": 0.32050132751464844, "step": 9542 }, { "epoch": 1.55, "learning_rate": 3.452665327527391e-07, "logits/chosen": -0.41063952445983887, "logits/rejected": -0.4432840943336487, "logps/chosen": -4.8659257888793945, "logps/rejected": -40.29050064086914, "loss": 0.8692, "rewards/accuracies": 1.0, "rewards/chosen": 0.6603955626487732, "rewards/margins": 0.4646027684211731, "rewards/rejected": 0.1957927793264389, "step": 9543 }, { "epoch": 1.55, "learning_rate": 3.451415643498328e-07, "logits/chosen": -0.7514692544937134, "logits/rejected": -0.7108387351036072, "logps/chosen": -172.254150390625, "logps/rejected": -122.99922180175781, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 4.584252834320068, "rewards/margins": 2.921613931655884, "rewards/rejected": 1.6626389026641846, "step": 9544 }, { "epoch": 1.55, "learning_rate": 3.450166066461792e-07, "logits/chosen": -0.2552620470523834, "logits/rejected": -0.2893921434879303, "logps/chosen": -108.38335418701172, "logps/rejected": -80.3734130859375, "loss": 1.0791, "rewards/accuracies": 1.0, "rewards/chosen": 1.3347336053848267, "rewards/margins": 0.2045806646347046, "rewards/rejected": 1.130152940750122, "step": 9545 }, { "epoch": 1.55, "learning_rate": 3.4489165965041156e-07, "logits/chosen": -0.648017942905426, "logits/rejected": -0.5958664417266846, "logps/chosen": -74.00021362304688, "logps/rejected": -63.765869140625, "loss": 0.5967, "rewards/accuracies": 1.0, "rewards/chosen": 2.3041465282440186, "rewards/margins": 0.43190300464630127, "rewards/rejected": 1.8722435235977173, "step": 9546 }, { "epoch": 1.55, "learning_rate": 3.4476672337116264e-07, "logits/chosen": -0.06364553421735764, "logits/rejected": -0.07341643422842026, "logps/chosen": -11.499691009521484, "logps/rejected": -2.123898506164551, "loss": 0.8248, "rewards/accuracies": 0.0, "rewards/chosen": 0.18225251138210297, "rewards/margins": -0.014328330755233765, "rewards/rejected": 0.19658084213733673, "step": 9547 }, { "epoch": 1.55, "learning_rate": 3.4464179781706413e-07, "logits/chosen": -0.6807134747505188, "logits/rejected": -0.6627008318901062, "logps/chosen": -185.94561767578125, "logps/rejected": -59.81911087036133, "loss": 0.1018, "rewards/accuracies": 1.0, "rewards/chosen": 4.185415744781494, "rewards/margins": 2.424994945526123, "rewards/rejected": 1.7604206800460815, "step": 9548 }, { "epoch": 1.55, "learning_rate": 3.4451688299674754e-07, "logits/chosen": -0.7325038909912109, "logits/rejected": -0.7084056735038757, "logps/chosen": -30.092121124267578, "logps/rejected": -33.510345458984375, "loss": 0.4434, "rewards/accuracies": 0.0, "rewards/chosen": 1.840037226676941, "rewards/margins": -0.04019355773925781, "rewards/rejected": 1.8802307844161987, "step": 9549 }, { "epoch": 1.55, "learning_rate": 3.4439197891884313e-07, "logits/chosen": -0.6640268564224243, "logits/rejected": -0.6661614775657654, "logps/chosen": -50.78797149658203, "logps/rejected": -102.6190185546875, "loss": 0.4411, "rewards/accuracies": 1.0, "rewards/chosen": 1.9010947942733765, "rewards/margins": 0.2843475341796875, "rewards/rejected": 1.616747260093689, "step": 9550 }, { "epoch": 1.55, "learning_rate": 3.442670855919806e-07, "logits/chosen": -0.6734236478805542, "logits/rejected": -0.5833849906921387, "logps/chosen": -55.67275619506836, "logps/rejected": -70.635986328125, "loss": 0.8385, "rewards/accuracies": 0.0, "rewards/chosen": 2.346419095993042, "rewards/margins": -0.3157644271850586, "rewards/rejected": 2.6621835231781006, "step": 9551 }, { "epoch": 1.55, "learning_rate": 3.441422030247889e-07, "logits/chosen": -0.7429150938987732, "logits/rejected": -0.8527172207832336, "logps/chosen": -112.14252471923828, "logps/rejected": -72.21726989746094, "loss": 0.0766, "rewards/accuracies": 1.0, "rewards/chosen": 3.188190460205078, "rewards/margins": 2.2652077674865723, "rewards/rejected": 0.9229828119277954, "step": 9552 }, { "epoch": 1.55, "learning_rate": 3.440173312258962e-07, "logits/chosen": -0.7938204407691956, "logits/rejected": -0.7703947424888611, "logps/chosen": -144.49636840820312, "logps/rejected": -70.31044006347656, "loss": 0.3828, "rewards/accuracies": 0.0, "rewards/chosen": 1.130273461341858, "rewards/margins": -0.04198765754699707, "rewards/rejected": 1.172261118888855, "step": 9553 }, { "epoch": 1.55, "learning_rate": 3.4389247020393e-07, "logits/chosen": -0.6242270469665527, "logits/rejected": -0.6807536482810974, "logps/chosen": -164.60760498046875, "logps/rejected": -51.156742095947266, "loss": 0.2195, "rewards/accuracies": 1.0, "rewards/chosen": 3.501486301422119, "rewards/margins": 1.1786091327667236, "rewards/rejected": 2.3228771686553955, "step": 9554 }, { "epoch": 1.55, "learning_rate": 3.4376761996751704e-07, "logits/chosen": -0.6257117390632629, "logits/rejected": -0.5931715369224548, "logps/chosen": -38.94615173339844, "logps/rejected": -53.49920654296875, "loss": 0.9101, "rewards/accuracies": 0.0, "rewards/chosen": 1.2087013721466064, "rewards/margins": -0.030335187911987305, "rewards/rejected": 1.2390365600585938, "step": 9555 }, { "epoch": 1.55, "learning_rate": 3.436427805252833e-07, "logits/chosen": -0.9421244263648987, "logits/rejected": -0.7237961888313293, "logps/chosen": -104.67583465576172, "logps/rejected": -88.23162078857422, "loss": 0.5817, "rewards/accuracies": 1.0, "rewards/chosen": 6.1989359855651855, "rewards/margins": 3.883094310760498, "rewards/rejected": 2.3158416748046875, "step": 9556 }, { "epoch": 1.55, "learning_rate": 3.4351795188585387e-07, "logits/chosen": -0.4998133182525635, "logits/rejected": -0.5302832126617432, "logps/chosen": -34.32765197753906, "logps/rejected": -76.72273254394531, "loss": 1.0512, "rewards/accuracies": 0.0, "rewards/chosen": 1.4742740392684937, "rewards/margins": -1.1289783716201782, "rewards/rejected": 2.603252410888672, "step": 9557 }, { "epoch": 1.55, "learning_rate": 3.433931340578533e-07, "logits/chosen": -0.7104575037956238, "logits/rejected": -0.7104575037956238, "logps/chosen": -74.13291931152344, "logps/rejected": -74.13291931152344, "loss": 0.6375, "rewards/accuracies": 0.0, "rewards/chosen": 1.3791595697402954, "rewards/margins": 0.0, "rewards/rejected": 1.3791595697402954, "step": 9558 }, { "epoch": 1.55, "learning_rate": 3.432683270499054e-07, "logits/chosen": -0.7170116305351257, "logits/rejected": -0.6870935559272766, "logps/chosen": -162.50772094726562, "logps/rejected": -56.205726623535156, "loss": 0.1571, "rewards/accuracies": 1.0, "rewards/chosen": 3.795126438140869, "rewards/margins": 1.8831650018692017, "rewards/rejected": 1.9119614362716675, "step": 9559 }, { "epoch": 1.55, "learning_rate": 3.4314353087063307e-07, "logits/chosen": -0.1131749302148819, "logits/rejected": -0.11540685594081879, "logps/chosen": -3.491400957107544, "logps/rejected": -1.7374978065490723, "loss": 0.489, "rewards/accuracies": 0.0, "rewards/chosen": 0.17803648114204407, "rewards/margins": -0.10849511623382568, "rewards/rejected": 0.28653159737586975, "step": 9560 }, { "epoch": 1.55, "learning_rate": 3.430187455286586e-07, "logits/chosen": -0.49227631092071533, "logits/rejected": -0.47974893450737, "logps/chosen": -69.08787536621094, "logps/rejected": -69.04510498046875, "loss": 0.7315, "rewards/accuracies": 0.0, "rewards/chosen": 1.8536865711212158, "rewards/margins": -0.4426102638244629, "rewards/rejected": 2.2962968349456787, "step": 9561 }, { "epoch": 1.55, "learning_rate": 3.4289397103260344e-07, "logits/chosen": -0.7868579030036926, "logits/rejected": -0.6971458196640015, "logps/chosen": -135.9004364013672, "logps/rejected": -59.90508270263672, "loss": 0.8574, "rewards/accuracies": 1.0, "rewards/chosen": 2.5800812244415283, "rewards/margins": 0.4856085777282715, "rewards/rejected": 2.094472646713257, "step": 9562 }, { "epoch": 1.55, "learning_rate": 3.4276920739108827e-07, "logits/chosen": -0.9341633319854736, "logits/rejected": -0.9239158630371094, "logps/chosen": -111.28412628173828, "logps/rejected": -101.68402862548828, "loss": 1.4492, "rewards/accuracies": 0.0, "rewards/chosen": 0.8490837216377258, "rewards/margins": -2.1497802734375, "rewards/rejected": 2.998863935470581, "step": 9563 }, { "epoch": 1.55, "learning_rate": 3.4264445461273317e-07, "logits/chosen": -0.7745971083641052, "logits/rejected": -0.6949083805084229, "logps/chosen": -77.55465698242188, "logps/rejected": -77.37789916992188, "loss": 0.3373, "rewards/accuracies": 1.0, "rewards/chosen": 1.9238404035568237, "rewards/margins": 0.4100792407989502, "rewards/rejected": 1.5137611627578735, "step": 9564 }, { "epoch": 1.55, "learning_rate": 3.425197127061573e-07, "logits/chosen": -0.786777913570404, "logits/rejected": -0.6876291036605835, "logps/chosen": -63.831302642822266, "logps/rejected": -59.913360595703125, "loss": 0.2815, "rewards/accuracies": 1.0, "rewards/chosen": 2.1566083431243896, "rewards/margins": 1.2448331117630005, "rewards/rejected": 0.9117752313613892, "step": 9565 }, { "epoch": 1.55, "learning_rate": 3.4239498167997926e-07, "logits/chosen": -1.0833553075790405, "logits/rejected": -1.1217297315597534, "logps/chosen": -136.26336669921875, "logps/rejected": -37.49308395385742, "loss": 0.0749, "rewards/accuracies": 1.0, "rewards/chosen": 3.8211593627929688, "rewards/margins": 2.106813907623291, "rewards/rejected": 1.7143455743789673, "step": 9566 }, { "epoch": 1.55, "learning_rate": 3.4227026154281667e-07, "logits/chosen": -0.7458534240722656, "logits/rejected": -0.835028886795044, "logps/chosen": -206.2660675048828, "logps/rejected": -245.1118621826172, "loss": 1.1053, "rewards/accuracies": 0.0, "rewards/chosen": 5.461461067199707, "rewards/margins": -1.6801362037658691, "rewards/rejected": 7.141597270965576, "step": 9567 }, { "epoch": 1.55, "learning_rate": 3.4214555230328654e-07, "logits/chosen": -0.6562351584434509, "logits/rejected": -0.6050323843955994, "logps/chosen": -79.85613250732422, "logps/rejected": -40.94391632080078, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 3.381990909576416, "rewards/margins": 2.0581889152526855, "rewards/rejected": 1.323801875114441, "step": 9568 }, { "epoch": 1.55, "learning_rate": 3.4202085397000526e-07, "logits/chosen": -0.9035137295722961, "logits/rejected": -0.8115475177764893, "logps/chosen": -101.4488525390625, "logps/rejected": -52.404823303222656, "loss": 0.1338, "rewards/accuracies": 1.0, "rewards/chosen": 6.157472133636475, "rewards/margins": 3.8510825634002686, "rewards/rejected": 2.306389570236206, "step": 9569 }, { "epoch": 1.55, "learning_rate": 3.41896166551588e-07, "logits/chosen": -0.8857393264770508, "logits/rejected": -0.8230994939804077, "logps/chosen": -138.077880859375, "logps/rejected": -114.40283966064453, "loss": 0.0944, "rewards/accuracies": 1.0, "rewards/chosen": 6.275961399078369, "rewards/margins": 1.7680549621582031, "rewards/rejected": 4.507906436920166, "step": 9570 }, { "epoch": 1.55, "learning_rate": 3.417714900566497e-07, "logits/chosen": -0.644684374332428, "logits/rejected": -0.4589693546295166, "logps/chosen": -80.6436767578125, "logps/rejected": -43.04030990600586, "loss": 0.1124, "rewards/accuracies": 1.0, "rewards/chosen": 3.637387990951538, "rewards/margins": 1.8705750703811646, "rewards/rejected": 1.7668129205703735, "step": 9571 }, { "epoch": 1.55, "learning_rate": 3.4164682449380423e-07, "logits/chosen": -0.9564345479011536, "logits/rejected": -0.922566831111908, "logps/chosen": -33.30271911621094, "logps/rejected": -36.84066390991211, "loss": 0.2649, "rewards/accuracies": 1.0, "rewards/chosen": 2.056220293045044, "rewards/margins": 0.42091596126556396, "rewards/rejected": 1.63530433177948, "step": 9572 }, { "epoch": 1.55, "learning_rate": 3.415221698716648e-07, "logits/chosen": -0.707609236240387, "logits/rejected": -0.6323628425598145, "logps/chosen": -61.15340042114258, "logps/rejected": -83.12557983398438, "loss": 0.4834, "rewards/accuracies": 0.0, "rewards/chosen": 2.049436569213867, "rewards/margins": -0.3173940181732178, "rewards/rejected": 2.366830587387085, "step": 9573 }, { "epoch": 1.55, "learning_rate": 3.413975261988441e-07, "logits/chosen": -0.6027939319610596, "logits/rejected": -0.5559049248695374, "logps/chosen": -82.15926361083984, "logps/rejected": -56.99095153808594, "loss": 1.1255, "rewards/accuracies": 0.0, "rewards/chosen": 0.5946678519248962, "rewards/margins": -0.7802078127861023, "rewards/rejected": 1.3748756647109985, "step": 9574 }, { "epoch": 1.55, "learning_rate": 3.412728934839535e-07, "logits/chosen": -0.5605971217155457, "logits/rejected": -0.5779492855072021, "logps/chosen": -67.14815521240234, "logps/rejected": -96.62637329101562, "loss": 0.6302, "rewards/accuracies": 0.0, "rewards/chosen": 2.717005968093872, "rewards/margins": -0.8984940052032471, "rewards/rejected": 3.615499973297119, "step": 9575 }, { "epoch": 1.55, "learning_rate": 3.41148271735604e-07, "logits/chosen": -0.7074317932128906, "logits/rejected": -0.6921582818031311, "logps/chosen": -93.02609252929688, "logps/rejected": -139.2965850830078, "loss": 0.3502, "rewards/accuracies": 1.0, "rewards/chosen": 1.5386756658554077, "rewards/margins": 0.033739447593688965, "rewards/rejected": 1.5049362182617188, "step": 9576 }, { "epoch": 1.55, "learning_rate": 3.41023660962406e-07, "logits/chosen": -0.8814786672592163, "logits/rejected": -0.7931711673736572, "logps/chosen": -189.3475341796875, "logps/rejected": -84.52997589111328, "loss": 0.7262, "rewards/accuracies": 1.0, "rewards/chosen": 1.9113037586212158, "rewards/margins": 0.9381828904151917, "rewards/rejected": 0.9731208682060242, "step": 9577 }, { "epoch": 1.55, "learning_rate": 3.4089906117296865e-07, "logits/chosen": -0.6867831349372864, "logits/rejected": -0.6410408020019531, "logps/chosen": -36.192771911621094, "logps/rejected": -40.36796188354492, "loss": 0.797, "rewards/accuracies": 1.0, "rewards/chosen": 1.4254673719406128, "rewards/margins": 0.015242815017700195, "rewards/rejected": 1.4102245569229126, "step": 9578 }, { "epoch": 1.55, "learning_rate": 3.4077447237590075e-07, "logits/chosen": -0.522395670413971, "logits/rejected": -0.511141836643219, "logps/chosen": -58.2997932434082, "logps/rejected": -95.9813003540039, "loss": 0.954, "rewards/accuracies": 1.0, "rewards/chosen": 2.0403149127960205, "rewards/margins": 0.5416004657745361, "rewards/rejected": 1.4987144470214844, "step": 9579 }, { "epoch": 1.55, "learning_rate": 3.4064989457981026e-07, "logits/chosen": -0.8389900922775269, "logits/rejected": -0.7989628911018372, "logps/chosen": -45.394386291503906, "logps/rejected": -80.22657775878906, "loss": 1.5046, "rewards/accuracies": 1.0, "rewards/chosen": 1.463405966758728, "rewards/margins": 0.46466708183288574, "rewards/rejected": 0.9987388849258423, "step": 9580 }, { "epoch": 1.56, "learning_rate": 3.4052532779330425e-07, "logits/chosen": -0.41475144028663635, "logits/rejected": -0.4263923168182373, "logps/chosen": -70.3922348022461, "logps/rejected": -116.10264587402344, "loss": 0.6309, "rewards/accuracies": 0.0, "rewards/chosen": 0.7354736328125, "rewards/margins": -0.16784363985061646, "rewards/rejected": 0.9033172726631165, "step": 9581 }, { "epoch": 1.56, "learning_rate": 3.404007720249891e-07, "logits/chosen": -0.6320773363113403, "logits/rejected": -0.4874025583267212, "logps/chosen": -62.883888244628906, "logps/rejected": -67.64495849609375, "loss": 0.1988, "rewards/accuracies": 1.0, "rewards/chosen": 1.8244857788085938, "rewards/margins": 0.7322204113006592, "rewards/rejected": 1.0922653675079346, "step": 9582 }, { "epoch": 1.56, "learning_rate": 3.402762272834705e-07, "logits/chosen": -0.5717529058456421, "logits/rejected": -0.544143795967102, "logps/chosen": -86.90921020507812, "logps/rejected": -151.63494873046875, "loss": 0.2967, "rewards/accuracies": 1.0, "rewards/chosen": 1.7769638299942017, "rewards/margins": 0.22073817253112793, "rewards/rejected": 1.5562256574630737, "step": 9583 }, { "epoch": 1.56, "learning_rate": 3.401516935773533e-07, "logits/chosen": -0.5701218843460083, "logits/rejected": -0.5945348143577576, "logps/chosen": -56.643043518066406, "logps/rejected": -54.87456512451172, "loss": 1.1657, "rewards/accuracies": 0.0, "rewards/chosen": 0.5726291537284851, "rewards/margins": -1.0590686798095703, "rewards/rejected": 1.6316978931427002, "step": 9584 }, { "epoch": 1.56, "learning_rate": 3.4002717091524145e-07, "logits/chosen": -0.3518224060535431, "logits/rejected": -0.3605703115463257, "logps/chosen": -18.509281158447266, "logps/rejected": -21.91246795654297, "loss": 2.2042, "rewards/accuracies": 1.0, "rewards/chosen": 0.3443855345249176, "rewards/margins": 0.38978272676467896, "rewards/rejected": -0.045397188514471054, "step": 9585 }, { "epoch": 1.56, "learning_rate": 3.399026593057386e-07, "logits/chosen": -1.3504148721694946, "logits/rejected": -1.1602537631988525, "logps/chosen": -112.20703887939453, "logps/rejected": -20.21110725402832, "loss": 2.2999, "rewards/accuracies": 1.0, "rewards/chosen": 7.1342339515686035, "rewards/margins": 6.6607489585876465, "rewards/rejected": 0.4734848141670227, "step": 9586 }, { "epoch": 1.56, "learning_rate": 3.3977815875744696e-07, "logits/chosen": -0.92270827293396, "logits/rejected": -0.9049558639526367, "logps/chosen": -110.96881103515625, "logps/rejected": -92.25727081298828, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 1.0546554327011108, "rewards/margins": 0.25497812032699585, "rewards/rejected": 0.799677312374115, "step": 9587 }, { "epoch": 1.56, "learning_rate": 3.396536692789686e-07, "logits/chosen": -0.7992443442344666, "logits/rejected": -0.6973413228988647, "logps/chosen": -142.09066772460938, "logps/rejected": -126.72589874267578, "loss": 1.8664, "rewards/accuracies": 0.0, "rewards/chosen": 1.1311829090118408, "rewards/margins": -3.246422529220581, "rewards/rejected": 4.377605438232422, "step": 9588 }, { "epoch": 1.56, "learning_rate": 3.3952919087890446e-07, "logits/chosen": -0.7367273569107056, "logits/rejected": -0.5950189232826233, "logps/chosen": -195.92161560058594, "logps/rejected": -93.48124694824219, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 7.479569911956787, "rewards/margins": 3.0922513008117676, "rewards/rejected": 4.3873186111450195, "step": 9589 }, { "epoch": 1.56, "learning_rate": 3.3940472356585487e-07, "logits/chosen": -0.4449622631072998, "logits/rejected": -0.42018741369247437, "logps/chosen": -52.75390625, "logps/rejected": -89.45736694335938, "loss": 0.5831, "rewards/accuracies": 1.0, "rewards/chosen": 1.9200317859649658, "rewards/margins": 0.8260971307754517, "rewards/rejected": 1.0939346551895142, "step": 9590 }, { "epoch": 1.56, "learning_rate": 3.392802673484193e-07, "logits/chosen": -0.5555399060249329, "logits/rejected": -0.5698171854019165, "logps/chosen": -30.037057876586914, "logps/rejected": -51.200889587402344, "loss": 0.4662, "rewards/accuracies": 0.0, "rewards/chosen": 0.31035366654396057, "rewards/margins": -0.4005899727344513, "rewards/rejected": 0.7109436392784119, "step": 9591 }, { "epoch": 1.56, "learning_rate": 3.391558222351965e-07, "logits/chosen": -0.6573485136032104, "logits/rejected": -0.6995394825935364, "logps/chosen": -67.12432861328125, "logps/rejected": -86.04503631591797, "loss": 0.9826, "rewards/accuracies": 0.0, "rewards/chosen": 1.682137370109558, "rewards/margins": -0.46092450618743896, "rewards/rejected": 2.143061876296997, "step": 9592 }, { "epoch": 1.56, "learning_rate": 3.390313882347845e-07, "logits/chosen": -0.8268502950668335, "logits/rejected": -0.7445166707038879, "logps/chosen": -69.99966430664062, "logps/rejected": -67.85063934326172, "loss": 0.5746, "rewards/accuracies": 0.0, "rewards/chosen": 2.4899277687072754, "rewards/margins": -0.1899261474609375, "rewards/rejected": 2.679853916168213, "step": 9593 }, { "epoch": 1.56, "learning_rate": 3.389069653557804e-07, "logits/chosen": -0.5234478116035461, "logits/rejected": -0.4983660578727722, "logps/chosen": -89.66777038574219, "logps/rejected": -101.10591125488281, "loss": 0.6158, "rewards/accuracies": 0.0, "rewards/chosen": 2.0423126220703125, "rewards/margins": -0.17468571662902832, "rewards/rejected": 2.216998338699341, "step": 9594 }, { "epoch": 1.56, "learning_rate": 3.3878255360678075e-07, "logits/chosen": -0.5900562405586243, "logits/rejected": -0.5900562405586243, "logps/chosen": -103.09587097167969, "logps/rejected": -103.09587097167969, "loss": 0.3627, "rewards/accuracies": 0.0, "rewards/chosen": 1.9713042974472046, "rewards/margins": 0.0, "rewards/rejected": 1.9713042974472046, "step": 9595 }, { "epoch": 1.56, "learning_rate": 3.386581529963812e-07, "logits/chosen": -0.6981551051139832, "logits/rejected": -0.683983564376831, "logps/chosen": -127.30901336669922, "logps/rejected": -109.25460052490234, "loss": 0.1384, "rewards/accuracies": 1.0, "rewards/chosen": 5.85297155380249, "rewards/margins": 1.9452955722808838, "rewards/rejected": 3.9076759815216064, "step": 9596 }, { "epoch": 1.56, "learning_rate": 3.385337635331767e-07, "logits/chosen": -1.1230517625808716, "logits/rejected": -1.0430161952972412, "logps/chosen": -72.7042007446289, "logps/rejected": -93.28315734863281, "loss": 0.2374, "rewards/accuracies": 1.0, "rewards/chosen": 4.149688243865967, "rewards/margins": 0.6649088859558105, "rewards/rejected": 3.4847793579101562, "step": 9597 }, { "epoch": 1.56, "learning_rate": 3.384093852257613e-07, "logits/chosen": -0.7164077758789062, "logits/rejected": -0.6569709181785583, "logps/chosen": -52.096195220947266, "logps/rejected": -56.53083801269531, "loss": 0.3886, "rewards/accuracies": 1.0, "rewards/chosen": 1.6462124586105347, "rewards/margins": 0.6185222864151001, "rewards/rejected": 1.0276901721954346, "step": 9598 }, { "epoch": 1.56, "learning_rate": 3.3828501808272836e-07, "logits/chosen": -0.7123439311981201, "logits/rejected": -0.6535733342170715, "logps/chosen": -104.95458221435547, "logps/rejected": -309.6971435546875, "loss": 1.7958, "rewards/accuracies": 0.0, "rewards/chosen": 2.967479705810547, "rewards/margins": -3.3464698791503906, "rewards/rejected": 6.3139495849609375, "step": 9599 }, { "epoch": 1.56, "learning_rate": 3.3816066211267055e-07, "logits/chosen": -0.4348476231098175, "logits/rejected": -0.38039058446884155, "logps/chosen": -63.95625305175781, "logps/rejected": -45.304866790771484, "loss": 1.6565, "rewards/accuracies": 0.0, "rewards/chosen": 0.7246856689453125, "rewards/margins": -0.09359246492385864, "rewards/rejected": 0.8182781338691711, "step": 9600 }, { "epoch": 1.56, "learning_rate": 3.380363173241796e-07, "logits/chosen": -0.899101197719574, "logits/rejected": -0.9602684378623962, "logps/chosen": -119.04098510742188, "logps/rejected": -83.85594177246094, "loss": 0.7166, "rewards/accuracies": 0.0, "rewards/chosen": 5.934582710266113, "rewards/margins": -1.0573911666870117, "rewards/rejected": 6.991973876953125, "step": 9601 }, { "epoch": 1.56, "learning_rate": 3.379119837258466e-07, "logits/chosen": -0.29924166202545166, "logits/rejected": -0.30903103947639465, "logps/chosen": -97.7236328125, "logps/rejected": -42.83165740966797, "loss": 1.1571, "rewards/accuracies": 0.0, "rewards/chosen": -0.3900924623012543, "rewards/margins": -2.182180404663086, "rewards/rejected": 1.7920879125595093, "step": 9602 }, { "epoch": 1.56, "learning_rate": 3.377876613262619e-07, "logits/chosen": -0.639531135559082, "logits/rejected": -0.7034118175506592, "logps/chosen": -51.105018615722656, "logps/rejected": -90.01014709472656, "loss": 1.3735, "rewards/accuracies": 0.0, "rewards/chosen": 1.8773812055587769, "rewards/margins": -2.1344552040100098, "rewards/rejected": 4.011836528778076, "step": 9603 }, { "epoch": 1.56, "learning_rate": 3.376633501340148e-07, "logits/chosen": -0.6256460547447205, "logits/rejected": -0.6513810157775879, "logps/chosen": -85.18948364257812, "logps/rejected": -98.11164855957031, "loss": 0.6428, "rewards/accuracies": 0.0, "rewards/chosen": 0.19877243041992188, "rewards/margins": -0.5776848196983337, "rewards/rejected": 0.7764572501182556, "step": 9604 }, { "epoch": 1.56, "learning_rate": 3.375390501576943e-07, "logits/chosen": -0.8350669741630554, "logits/rejected": -0.771978497505188, "logps/chosen": -79.86964416503906, "logps/rejected": -90.88493347167969, "loss": 1.2572, "rewards/accuracies": 0.0, "rewards/chosen": 1.2786842584609985, "rewards/margins": -0.7933510541915894, "rewards/rejected": 2.072035312652588, "step": 9605 }, { "epoch": 1.56, "learning_rate": 3.374147614058882e-07, "logits/chosen": -1.1084694862365723, "logits/rejected": -0.9997771382331848, "logps/chosen": -76.52259826660156, "logps/rejected": -113.58920288085938, "loss": 0.7423, "rewards/accuracies": 0.0, "rewards/chosen": 4.131664276123047, "rewards/margins": -1.1007075309753418, "rewards/rejected": 5.232371807098389, "step": 9606 }, { "epoch": 1.56, "learning_rate": 3.372904838871836e-07, "logits/chosen": -0.5380375385284424, "logits/rejected": -0.6416534185409546, "logps/chosen": -43.19037628173828, "logps/rejected": -73.55940246582031, "loss": 0.5685, "rewards/accuracies": 0.0, "rewards/chosen": 1.4224525690078735, "rewards/margins": -0.6898080110549927, "rewards/rejected": 2.112260580062866, "step": 9607 }, { "epoch": 1.56, "learning_rate": 3.371662176101671e-07, "logits/chosen": -0.8290218114852905, "logits/rejected": -0.8479938507080078, "logps/chosen": -62.995147705078125, "logps/rejected": -61.41889190673828, "loss": 0.6446, "rewards/accuracies": 0.0, "rewards/chosen": 1.9539588689804077, "rewards/margins": -0.2862457036972046, "rewards/rejected": 2.2402045726776123, "step": 9608 }, { "epoch": 1.56, "learning_rate": 3.370419625834242e-07, "logits/chosen": -0.8804934024810791, "logits/rejected": -0.7370959520339966, "logps/chosen": -64.38634490966797, "logps/rejected": -50.131717681884766, "loss": 0.4642, "rewards/accuracies": 0.0, "rewards/chosen": 3.139472246170044, "rewards/margins": -0.001991748809814453, "rewards/rejected": 3.1414639949798584, "step": 9609 }, { "epoch": 1.56, "learning_rate": 3.3691771881553974e-07, "logits/chosen": -0.4320111870765686, "logits/rejected": -0.41185298562049866, "logps/chosen": -63.710479736328125, "logps/rejected": -78.21038818359375, "loss": 0.3441, "rewards/accuracies": 1.0, "rewards/chosen": 1.7218490839004517, "rewards/margins": 0.01768803596496582, "rewards/rejected": 1.7041610479354858, "step": 9610 }, { "epoch": 1.56, "learning_rate": 3.367934863150979e-07, "logits/chosen": -0.8822422623634338, "logits/rejected": -0.8330563902854919, "logps/chosen": -69.51364135742188, "logps/rejected": -76.2598648071289, "loss": 1.278, "rewards/accuracies": 0.0, "rewards/chosen": 0.9654304385185242, "rewards/margins": -1.848717451095581, "rewards/rejected": 2.81414794921875, "step": 9611 }, { "epoch": 1.56, "learning_rate": 3.366692650906817e-07, "logits/chosen": -0.8656340837478638, "logits/rejected": -0.8385069966316223, "logps/chosen": -223.85516357421875, "logps/rejected": -85.19684600830078, "loss": 0.0917, "rewards/accuracies": 1.0, "rewards/chosen": 4.391659736633301, "rewards/margins": 1.693021535873413, "rewards/rejected": 2.6986382007598877, "step": 9612 }, { "epoch": 1.56, "learning_rate": 3.365450551508739e-07, "logits/chosen": -0.7281138896942139, "logits/rejected": -0.6389197707176208, "logps/chosen": -36.374454498291016, "logps/rejected": -9.532815933227539, "loss": 0.1204, "rewards/accuracies": 1.0, "rewards/chosen": 1.941745400428772, "rewards/margins": 1.3304095268249512, "rewards/rejected": 0.6113358736038208, "step": 9613 }, { "epoch": 1.56, "learning_rate": 3.364208565042562e-07, "logits/chosen": -1.1050519943237305, "logits/rejected": -1.0959831476211548, "logps/chosen": -183.78366088867188, "logps/rejected": -136.9869384765625, "loss": 1.4704, "rewards/accuracies": 0.0, "rewards/chosen": 5.169633388519287, "rewards/margins": -0.9850921630859375, "rewards/rejected": 6.154725551605225, "step": 9614 }, { "epoch": 1.56, "learning_rate": 3.362966691594096e-07, "logits/chosen": -0.4459393322467804, "logits/rejected": -0.4601035416126251, "logps/chosen": -60.205413818359375, "logps/rejected": -44.78810119628906, "loss": 0.6437, "rewards/accuracies": 1.0, "rewards/chosen": 1.9129012823104858, "rewards/margins": 0.5522170066833496, "rewards/rejected": 1.3606842756271362, "step": 9615 }, { "epoch": 1.56, "learning_rate": 3.3617249312491403e-07, "logits/chosen": -0.4660159647464752, "logits/rejected": -0.4660159647464752, "logps/chosen": -34.64098358154297, "logps/rejected": -34.64098358154297, "loss": 0.9818, "rewards/accuracies": 0.0, "rewards/chosen": 2.1789443492889404, "rewards/margins": 0.0, "rewards/rejected": 2.1789443492889404, "step": 9616 }, { "epoch": 1.56, "learning_rate": 3.360483284093491e-07, "logits/chosen": -1.0409760475158691, "logits/rejected": -1.0145496129989624, "logps/chosen": -131.35507202148438, "logps/rejected": -76.147705078125, "loss": 0.5464, "rewards/accuracies": 1.0, "rewards/chosen": 4.458639621734619, "rewards/margins": 1.9213836193084717, "rewards/rejected": 2.5372560024261475, "step": 9617 }, { "epoch": 1.56, "learning_rate": 3.359241750212933e-07, "logits/chosen": -0.5568696856498718, "logits/rejected": -0.4982403814792633, "logps/chosen": -76.90765380859375, "logps/rejected": -104.59452819824219, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 1.6848976612091064, "rewards/margins": 1.3136863708496094, "rewards/rejected": 0.3712112605571747, "step": 9618 }, { "epoch": 1.56, "learning_rate": 3.358000329693246e-07, "logits/chosen": -0.794144332408905, "logits/rejected": -0.6751219034194946, "logps/chosen": -38.18280029296875, "logps/rejected": -6.22683048248291, "loss": 0.0611, "rewards/accuracies": 1.0, "rewards/chosen": 2.871532440185547, "rewards/margins": 2.1633293628692627, "rewards/rejected": 0.708203136920929, "step": 9619 }, { "epoch": 1.56, "learning_rate": 3.356759022620199e-07, "logits/chosen": -0.8451735377311707, "logits/rejected": -0.7821053862571716, "logps/chosen": -108.51744079589844, "logps/rejected": -92.25281524658203, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 2.8364975452423096, "rewards/margins": 0.49236536026000977, "rewards/rejected": 2.3441321849823, "step": 9620 }, { "epoch": 1.56, "learning_rate": 3.355517829079555e-07, "logits/chosen": -0.9109523892402649, "logits/rejected": -0.882617175579071, "logps/chosen": -58.31586456298828, "logps/rejected": -66.78659057617188, "loss": 0.8024, "rewards/accuracies": 0.0, "rewards/chosen": 1.5286277532577515, "rewards/margins": -1.2275367975234985, "rewards/rejected": 2.75616455078125, "step": 9621 }, { "epoch": 1.56, "learning_rate": 3.354276749157069e-07, "logits/chosen": -0.7169577479362488, "logits/rejected": -0.7685362696647644, "logps/chosen": -111.23469543457031, "logps/rejected": -107.6610107421875, "loss": 0.776, "rewards/accuracies": 0.0, "rewards/chosen": 4.383955478668213, "rewards/margins": -0.7021393775939941, "rewards/rejected": 5.086094856262207, "step": 9622 }, { "epoch": 1.56, "learning_rate": 3.3530357829384875e-07, "logits/chosen": -0.88983154296875, "logits/rejected": -0.8971171379089355, "logps/chosen": -27.070796966552734, "logps/rejected": -34.90530776977539, "loss": 0.7802, "rewards/accuracies": 1.0, "rewards/chosen": 0.39150142669677734, "rewards/margins": 0.03830242156982422, "rewards/rejected": 0.3531990051269531, "step": 9623 }, { "epoch": 1.56, "learning_rate": 3.3517949305095494e-07, "logits/chosen": -0.9309608936309814, "logits/rejected": -1.1111146211624146, "logps/chosen": -176.4524383544922, "logps/rejected": -201.56439208984375, "loss": 0.9142, "rewards/accuracies": 0.0, "rewards/chosen": 4.172061443328857, "rewards/margins": -1.5651273727416992, "rewards/rejected": 5.737188816070557, "step": 9624 }, { "epoch": 1.56, "learning_rate": 3.3505541919559864e-07, "logits/chosen": -0.417823851108551, "logits/rejected": -0.34184831380844116, "logps/chosen": -70.0904541015625, "logps/rejected": -52.15210723876953, "loss": 0.4021, "rewards/accuracies": 0.0, "rewards/chosen": 1.89727783203125, "rewards/margins": -0.20511937141418457, "rewards/rejected": 2.1023972034454346, "step": 9625 }, { "epoch": 1.56, "learning_rate": 3.349313567363522e-07, "logits/chosen": -1.164981722831726, "logits/rejected": -1.1076136827468872, "logps/chosen": -82.45829772949219, "logps/rejected": -57.95298767089844, "loss": 1.3073, "rewards/accuracies": 0.0, "rewards/chosen": 1.3713897466659546, "rewards/margins": -2.154752254486084, "rewards/rejected": 3.526142120361328, "step": 9626 }, { "epoch": 1.56, "learning_rate": 3.3480730568178707e-07, "logits/chosen": -1.144240379333496, "logits/rejected": -1.0105684995651245, "logps/chosen": -90.40205383300781, "logps/rejected": -90.18099212646484, "loss": 0.0834, "rewards/accuracies": 1.0, "rewards/chosen": 5.0236406326293945, "rewards/margins": 1.9314172267913818, "rewards/rejected": 3.0922234058380127, "step": 9627 }, { "epoch": 1.56, "learning_rate": 3.3468326604047406e-07, "logits/chosen": -0.7818411588668823, "logits/rejected": -0.7708740830421448, "logps/chosen": -88.26591491699219, "logps/rejected": -77.33070373535156, "loss": 1.6055, "rewards/accuracies": 1.0, "rewards/chosen": 3.2679977416992188, "rewards/margins": 0.5896697044372559, "rewards/rejected": 2.678328037261963, "step": 9628 }, { "epoch": 1.56, "learning_rate": 3.345592378209831e-07, "logits/chosen": -0.6623765230178833, "logits/rejected": -0.6844723224639893, "logps/chosen": -138.09866333007812, "logps/rejected": -78.99153137207031, "loss": 0.915, "rewards/accuracies": 0.0, "rewards/chosen": 2.893894910812378, "rewards/margins": -1.5607712268829346, "rewards/rejected": 4.4546661376953125, "step": 9629 }, { "epoch": 1.56, "learning_rate": 3.344352210318834e-07, "logits/chosen": -1.0167299509048462, "logits/rejected": -0.9145480394363403, "logps/chosen": -73.95166015625, "logps/rejected": -214.20579528808594, "loss": 1.9196, "rewards/accuracies": 0.0, "rewards/chosen": 2.161299228668213, "rewards/margins": -3.6595396995544434, "rewards/rejected": 5.820838928222656, "step": 9630 }, { "epoch": 1.56, "learning_rate": 3.3431121568174336e-07, "logits/chosen": -0.8014445900917053, "logits/rejected": -0.8093652725219727, "logps/chosen": -81.33140563964844, "logps/rejected": -75.39225006103516, "loss": 1.1488, "rewards/accuracies": 0.0, "rewards/chosen": 1.436151146888733, "rewards/margins": -0.9643310308456421, "rewards/rejected": 2.400482177734375, "step": 9631 }, { "epoch": 1.56, "learning_rate": 3.341872217791305e-07, "logits/chosen": -0.9456924200057983, "logits/rejected": -1.009079933166504, "logps/chosen": -150.2264404296875, "logps/rejected": -168.0626678466797, "loss": 1.1779, "rewards/accuracies": 0.0, "rewards/chosen": 4.2109527587890625, "rewards/margins": -1.654890537261963, "rewards/rejected": 5.865843296051025, "step": 9632 }, { "epoch": 1.56, "learning_rate": 3.3406323933261174e-07, "logits/chosen": -0.5121592283248901, "logits/rejected": -0.4735688865184784, "logps/chosen": -40.98961639404297, "logps/rejected": -45.947391510009766, "loss": 0.4008, "rewards/accuracies": 0.0, "rewards/chosen": 1.152241587638855, "rewards/margins": -0.030125737190246582, "rewards/rejected": 1.1823673248291016, "step": 9633 }, { "epoch": 1.56, "learning_rate": 3.3393926835075304e-07, "logits/chosen": -0.862553060054779, "logits/rejected": -0.862363874912262, "logps/chosen": -196.85348510742188, "logps/rejected": -142.55520629882812, "loss": 1.0365, "rewards/accuracies": 1.0, "rewards/chosen": 4.318973064422607, "rewards/margins": 0.7303438186645508, "rewards/rejected": 3.5886292457580566, "step": 9634 }, { "epoch": 1.56, "learning_rate": 3.338153088421196e-07, "logits/chosen": -0.45485666394233704, "logits/rejected": -0.5104977488517761, "logps/chosen": -80.74400329589844, "logps/rejected": -58.142547607421875, "loss": 0.4594, "rewards/accuracies": 0.0, "rewards/chosen": 0.9650352597236633, "rewards/margins": -0.35259097814559937, "rewards/rejected": 1.3176262378692627, "step": 9635 }, { "epoch": 1.56, "learning_rate": 3.336913608152758e-07, "logits/chosen": -0.5218068361282349, "logits/rejected": -0.5218068361282349, "logps/chosen": -96.61222839355469, "logps/rejected": -96.61222839355469, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.5694671869277954, "rewards/margins": 0.0, "rewards/rejected": 0.5694671869277954, "step": 9636 }, { "epoch": 1.56, "learning_rate": 3.3356742427878545e-07, "logits/chosen": -0.09571606665849686, "logits/rejected": -0.11925816535949707, "logps/chosen": -11.305069923400879, "logps/rejected": -54.4940185546875, "loss": 0.3722, "rewards/accuracies": 0.0, "rewards/chosen": 0.1989274024963379, "rewards/margins": -0.08731794357299805, "rewards/rejected": 0.28624534606933594, "step": 9637 }, { "epoch": 1.56, "learning_rate": 3.334434992412112e-07, "logits/chosen": -0.982570469379425, "logits/rejected": -0.9169425964355469, "logps/chosen": -89.80270385742188, "logps/rejected": -69.35990905761719, "loss": 0.7346, "rewards/accuracies": 1.0, "rewards/chosen": 4.718937873840332, "rewards/margins": 1.0551164150238037, "rewards/rejected": 3.6638214588165283, "step": 9638 }, { "epoch": 1.56, "learning_rate": 3.3331958571111523e-07, "logits/chosen": -1.035335898399353, "logits/rejected": -0.9819395542144775, "logps/chosen": -85.52470397949219, "logps/rejected": -44.21118927001953, "loss": 0.6194, "rewards/accuracies": 1.0, "rewards/chosen": 1.1933525800704956, "rewards/margins": 0.6877586841583252, "rewards/rejected": 0.5055938959121704, "step": 9639 }, { "epoch": 1.56, "learning_rate": 3.331956836970586e-07, "logits/chosen": -0.8643916249275208, "logits/rejected": -0.8660829663276672, "logps/chosen": -74.6932144165039, "logps/rejected": -63.11199188232422, "loss": 0.6321, "rewards/accuracies": 0.0, "rewards/chosen": 2.1520631313323975, "rewards/margins": -0.06991958618164062, "rewards/rejected": 2.221982717514038, "step": 9640 }, { "epoch": 1.56, "learning_rate": 3.3307179320760205e-07, "logits/chosen": -0.8364993929862976, "logits/rejected": -0.7638860940933228, "logps/chosen": -148.1933135986328, "logps/rejected": -92.88397979736328, "loss": 0.4238, "rewards/accuracies": 1.0, "rewards/chosen": 4.699493408203125, "rewards/margins": 2.2486846446990967, "rewards/rejected": 2.4508087635040283, "step": 9641 }, { "epoch": 1.57, "learning_rate": 3.3294791425130507e-07, "logits/chosen": -0.8755444288253784, "logits/rejected": -0.8990728855133057, "logps/chosen": -101.02599334716797, "logps/rejected": -61.66019058227539, "loss": 1.0822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0626763105392456, "rewards/margins": 0.07209515571594238, "rewards/rejected": 0.9905811548233032, "step": 9642 }, { "epoch": 1.57, "learning_rate": 3.328240468367266e-07, "logits/chosen": -0.7638511657714844, "logits/rejected": -0.7638511657714844, "logps/chosen": -74.1994857788086, "logps/rejected": -74.1994857788086, "loss": 0.5885, "rewards/accuracies": 0.0, "rewards/chosen": 2.981879472732544, "rewards/margins": 0.0, "rewards/rejected": 2.981879472732544, "step": 9643 }, { "epoch": 1.57, "learning_rate": 3.3270019097242464e-07, "logits/chosen": -0.7859988808631897, "logits/rejected": -0.7189258337020874, "logps/chosen": -128.69082641601562, "logps/rejected": -64.27075958251953, "loss": 0.3189, "rewards/accuracies": 1.0, "rewards/chosen": 3.1103317737579346, "rewards/margins": 1.3542991876602173, "rewards/rejected": 1.7560325860977173, "step": 9644 }, { "epoch": 1.57, "learning_rate": 3.3257634666695645e-07, "logits/chosen": -0.32431772351264954, "logits/rejected": -0.354879230260849, "logps/chosen": -6.653526306152344, "logps/rejected": -80.84065246582031, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 0.19793395698070526, "rewards/margins": 0.1578224152326584, "rewards/rejected": 0.040111541748046875, "step": 9645 }, { "epoch": 1.57, "learning_rate": 3.324525139288785e-07, "logits/chosen": -0.9086135029792786, "logits/rejected": -0.7567790150642395, "logps/chosen": -63.625709533691406, "logps/rejected": -112.31253051757812, "loss": 0.7997, "rewards/accuracies": 0.0, "rewards/chosen": 1.4759620428085327, "rewards/margins": -0.4410911798477173, "rewards/rejected": 1.91705322265625, "step": 9646 }, { "epoch": 1.57, "learning_rate": 3.323286927667466e-07, "logits/chosen": -0.6814551949501038, "logits/rejected": -0.6814551949501038, "logps/chosen": -59.1408805847168, "logps/rejected": -59.1408805847168, "loss": 0.6716, "rewards/accuracies": 0.0, "rewards/chosen": 1.7038754224777222, "rewards/margins": 0.0, "rewards/rejected": 1.7038754224777222, "step": 9647 }, { "epoch": 1.57, "learning_rate": 3.322048831891154e-07, "logits/chosen": -0.4860377311706543, "logits/rejected": -0.555593729019165, "logps/chosen": -89.61825561523438, "logps/rejected": -72.90692138671875, "loss": 1.4289, "rewards/accuracies": 0.0, "rewards/chosen": 0.42605286836624146, "rewards/margins": -1.1237945556640625, "rewards/rejected": 1.5498474836349487, "step": 9648 }, { "epoch": 1.57, "learning_rate": 3.320810852045391e-07, "logits/chosen": -0.38101136684417725, "logits/rejected": -0.34762370586395264, "logps/chosen": -37.180213928222656, "logps/rejected": -8.85647964477539, "loss": 2.8979, "rewards/accuracies": 1.0, "rewards/chosen": 0.4293983578681946, "rewards/margins": 0.02901497483253479, "rewards/rejected": 0.4003833830356598, "step": 9649 }, { "epoch": 1.57, "learning_rate": 3.31957298821571e-07, "logits/chosen": -0.7457564473152161, "logits/rejected": -0.6688351631164551, "logps/chosen": -53.01002502441406, "logps/rejected": -120.10198211669922, "loss": 1.0344, "rewards/accuracies": 0.0, "rewards/chosen": 1.9790840148925781, "rewards/margins": -1.0249009132385254, "rewards/rejected": 3.0039849281311035, "step": 9650 }, { "epoch": 1.57, "learning_rate": 3.318335240487634e-07, "logits/chosen": -0.19900032877922058, "logits/rejected": -0.175087571144104, "logps/chosen": -58.1455192565918, "logps/rejected": -54.92552185058594, "loss": 0.5031, "rewards/accuracies": 1.0, "rewards/chosen": 0.9948284029960632, "rewards/margins": 0.03940010070800781, "rewards/rejected": 0.9554283022880554, "step": 9651 }, { "epoch": 1.57, "learning_rate": 3.317097608946682e-07, "logits/chosen": -0.8451178669929504, "logits/rejected": -0.78590327501297, "logps/chosen": -130.57180786132812, "logps/rejected": -71.97765350341797, "loss": 1.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9977432489395142, "rewards/margins": 0.10303270816802979, "rewards/rejected": 1.8947105407714844, "step": 9652 }, { "epoch": 1.57, "learning_rate": 3.3158600936783607e-07, "logits/chosen": -0.8552995920181274, "logits/rejected": -0.8757376074790955, "logps/chosen": -97.0543441772461, "logps/rejected": -106.57185363769531, "loss": 0.6697, "rewards/accuracies": 0.0, "rewards/chosen": 1.099768042564392, "rewards/margins": -0.18878257274627686, "rewards/rejected": 1.288550615310669, "step": 9653 }, { "epoch": 1.57, "learning_rate": 3.314622694768172e-07, "logits/chosen": -0.8580488562583923, "logits/rejected": -0.849816620349884, "logps/chosen": -91.0511474609375, "logps/rejected": -64.3969497680664, "loss": 0.7585, "rewards/accuracies": 1.0, "rewards/chosen": 1.4140456914901733, "rewards/margins": 0.3191138505935669, "rewards/rejected": 1.0949318408966064, "step": 9654 }, { "epoch": 1.57, "learning_rate": 3.313385412301608e-07, "logits/chosen": -0.8230343461036682, "logits/rejected": -0.8230343461036682, "logps/chosen": -64.952392578125, "logps/rejected": -64.952392578125, "loss": 0.4754, "rewards/accuracies": 0.0, "rewards/chosen": 2.14898681640625, "rewards/margins": 0.0, "rewards/rejected": 2.14898681640625, "step": 9655 }, { "epoch": 1.57, "learning_rate": 3.3121482463641533e-07, "logits/chosen": -0.6400685906410217, "logits/rejected": -0.6527466773986816, "logps/chosen": -105.17939758300781, "logps/rejected": -74.30289459228516, "loss": 0.3966, "rewards/accuracies": 1.0, "rewards/chosen": 1.4198402166366577, "rewards/margins": 0.8067008852958679, "rewards/rejected": 0.6131393313407898, "step": 9656 }, { "epoch": 1.57, "learning_rate": 3.3109111970412834e-07, "logits/chosen": -0.9578427672386169, "logits/rejected": -0.9016194343566895, "logps/chosen": -79.97169494628906, "logps/rejected": -84.65930938720703, "loss": 1.6705, "rewards/accuracies": 1.0, "rewards/chosen": 2.2233810424804688, "rewards/margins": 0.4003700017929077, "rewards/rejected": 1.823011040687561, "step": 9657 }, { "epoch": 1.57, "learning_rate": 3.309674264418468e-07, "logits/chosen": -0.5177013874053955, "logits/rejected": -0.5205331444740295, "logps/chosen": -12.535384178161621, "logps/rejected": -2.180084705352783, "loss": 0.5582, "rewards/accuracies": 0.0, "rewards/chosen": -0.0999814048409462, "rewards/margins": -0.4423416554927826, "rewards/rejected": 0.342360258102417, "step": 9658 }, { "epoch": 1.57, "learning_rate": 3.308437448581167e-07, "logits/chosen": -1.1841331720352173, "logits/rejected": -1.1667420864105225, "logps/chosen": -161.7973175048828, "logps/rejected": -89.27522277832031, "loss": 0.1726, "rewards/accuracies": 1.0, "rewards/chosen": 4.380298137664795, "rewards/margins": 1.2784030437469482, "rewards/rejected": 3.1018950939178467, "step": 9659 }, { "epoch": 1.57, "learning_rate": 3.307200749614832e-07, "logits/chosen": -1.007117509841919, "logits/rejected": -0.9216920137405396, "logps/chosen": -139.75668334960938, "logps/rejected": -84.16243743896484, "loss": 0.0539, "rewards/accuracies": 1.0, "rewards/chosen": 4.631381511688232, "rewards/margins": 3.47194766998291, "rewards/rejected": 1.1594337224960327, "step": 9660 }, { "epoch": 1.57, "learning_rate": 3.305964167604908e-07, "logits/chosen": -0.869694709777832, "logits/rejected": -0.8070516586303711, "logps/chosen": -67.28137969970703, "logps/rejected": -79.40705871582031, "loss": 2.9816, "rewards/accuracies": 1.0, "rewards/chosen": 0.8676231503486633, "rewards/margins": 0.39166030287742615, "rewards/rejected": 0.4759628474712372, "step": 9661 }, { "epoch": 1.57, "learning_rate": 3.3047277026368317e-07, "logits/chosen": -0.8472811579704285, "logits/rejected": -0.955456018447876, "logps/chosen": -104.74324798583984, "logps/rejected": -114.7569808959961, "loss": 2.381, "rewards/accuracies": 0.0, "rewards/chosen": 3.7247276306152344, "rewards/margins": -3.1531996726989746, "rewards/rejected": 6.877927303314209, "step": 9662 }, { "epoch": 1.57, "learning_rate": 3.303491354796029e-07, "logits/chosen": -1.1690069437026978, "logits/rejected": -1.0015712976455688, "logps/chosen": -119.3097915649414, "logps/rejected": -93.65737915039062, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 5.3903021812438965, "rewards/margins": 2.010263681411743, "rewards/rejected": 3.3800384998321533, "step": 9663 }, { "epoch": 1.57, "learning_rate": 3.302255124167922e-07, "logits/chosen": -0.5880376100540161, "logits/rejected": -0.5283832550048828, "logps/chosen": -42.926998138427734, "logps/rejected": -24.940208435058594, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 0.5862876772880554, "rewards/margins": 0.14292505383491516, "rewards/rejected": 0.44336262345314026, "step": 9664 }, { "epoch": 1.57, "learning_rate": 3.301019010837921e-07, "logits/chosen": -0.658491313457489, "logits/rejected": -0.6019468903541565, "logps/chosen": -44.920257568359375, "logps/rejected": -62.42735290527344, "loss": 0.4782, "rewards/accuracies": 1.0, "rewards/chosen": 3.67167067527771, "rewards/margins": 0.761199951171875, "rewards/rejected": 2.910470724105835, "step": 9665 }, { "epoch": 1.57, "learning_rate": 3.2997830148914313e-07, "logits/chosen": -0.785592257976532, "logits/rejected": -0.69675213098526, "logps/chosen": -109.12642669677734, "logps/rejected": -48.910831451416016, "loss": 0.1545, "rewards/accuracies": 1.0, "rewards/chosen": 3.5629220008850098, "rewards/margins": 1.8974300622940063, "rewards/rejected": 1.6654919385910034, "step": 9666 }, { "epoch": 1.57, "learning_rate": 3.2985471364138473e-07, "logits/chosen": -0.5187723636627197, "logits/rejected": -0.5187723636627197, "logps/chosen": -1.001577377319336, "logps/rejected": -1.001577377319336, "loss": 0.4726, "rewards/accuracies": 0.0, "rewards/chosen": 0.29224392771720886, "rewards/margins": 0.0, "rewards/rejected": 0.29224392771720886, "step": 9667 }, { "epoch": 1.57, "learning_rate": 3.297311375490557e-07, "logits/chosen": -0.7862832546234131, "logits/rejected": -0.6927005052566528, "logps/chosen": -66.1011962890625, "logps/rejected": -50.87855529785156, "loss": 1.7973, "rewards/accuracies": 0.0, "rewards/chosen": 2.300463914871216, "rewards/margins": -0.36736369132995605, "rewards/rejected": 2.667827606201172, "step": 9668 }, { "epoch": 1.57, "learning_rate": 3.29607573220694e-07, "logits/chosen": -0.47687557339668274, "logits/rejected": -0.47687557339668274, "logps/chosen": -0.8820303082466125, "logps/rejected": -0.8820303082466125, "loss": 0.659, "rewards/accuracies": 0.0, "rewards/chosen": 0.26089251041412354, "rewards/margins": 0.0, "rewards/rejected": 0.26089251041412354, "step": 9669 }, { "epoch": 1.57, "learning_rate": 3.294840206648366e-07, "logits/chosen": -1.0571863651275635, "logits/rejected": -0.976784884929657, "logps/chosen": -146.76904296875, "logps/rejected": -74.36048126220703, "loss": 0.1592, "rewards/accuracies": 1.0, "rewards/chosen": 3.6716065406799316, "rewards/margins": 2.023831844329834, "rewards/rejected": 1.647774577140808, "step": 9670 }, { "epoch": 1.57, "learning_rate": 3.2936047989002005e-07, "logits/chosen": -0.8112409114837646, "logits/rejected": -0.8723919987678528, "logps/chosen": -93.97440338134766, "logps/rejected": -160.14039611816406, "loss": 1.6215, "rewards/accuracies": 0.0, "rewards/chosen": 1.041490912437439, "rewards/margins": -2.294311046600342, "rewards/rejected": 3.335801839828491, "step": 9671 }, { "epoch": 1.57, "learning_rate": 3.292369509047797e-07, "logits/chosen": -0.6218075156211853, "logits/rejected": -0.5785751938819885, "logps/chosen": -273.2868347167969, "logps/rejected": -101.22196197509766, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 5.9043426513671875, "rewards/margins": 1.3950462341308594, "rewards/rejected": 4.509296417236328, "step": 9672 }, { "epoch": 1.57, "learning_rate": 3.291134337176503e-07, "logits/chosen": -0.4539477229118347, "logits/rejected": -0.4539477229118347, "logps/chosen": -43.72978973388672, "logps/rejected": -43.72978973388672, "loss": 0.4201, "rewards/accuracies": 0.0, "rewards/chosen": 1.1966094970703125, "rewards/margins": 0.0, "rewards/rejected": 1.1966094970703125, "step": 9673 }, { "epoch": 1.57, "learning_rate": 3.2898992833716563e-07, "logits/chosen": -0.7558201551437378, "logits/rejected": -0.7558201551437378, "logps/chosen": -98.57279205322266, "logps/rejected": -98.57279205322266, "loss": 0.4723, "rewards/accuracies": 0.0, "rewards/chosen": 2.1435210704803467, "rewards/margins": 0.0, "rewards/rejected": 2.1435210704803467, "step": 9674 }, { "epoch": 1.57, "learning_rate": 3.288664347718587e-07, "logits/chosen": -0.9014262557029724, "logits/rejected": -0.8537228107452393, "logps/chosen": -73.96678161621094, "logps/rejected": -48.51082229614258, "loss": 0.4946, "rewards/accuracies": 0.0, "rewards/chosen": 1.0560302734375, "rewards/margins": -0.5129421949386597, "rewards/rejected": 1.5689724683761597, "step": 9675 }, { "epoch": 1.57, "learning_rate": 3.28742953030262e-07, "logits/chosen": -0.7535288333892822, "logits/rejected": -0.7298394441604614, "logps/chosen": -69.356201171875, "logps/rejected": -88.13874053955078, "loss": 0.5782, "rewards/accuracies": 0.0, "rewards/chosen": 2.2326819896698, "rewards/margins": -0.4119751453399658, "rewards/rejected": 2.6446571350097656, "step": 9676 }, { "epoch": 1.57, "learning_rate": 3.2861948312090677e-07, "logits/chosen": -0.7662824392318726, "logits/rejected": -0.7153250575065613, "logps/chosen": -86.7996826171875, "logps/rejected": -74.90166473388672, "loss": 0.2644, "rewards/accuracies": 1.0, "rewards/chosen": 3.844744920730591, "rewards/margins": 0.8028252124786377, "rewards/rejected": 3.041919708251953, "step": 9677 }, { "epoch": 1.57, "learning_rate": 3.2849602505232365e-07, "logits/chosen": -0.6997358798980713, "logits/rejected": -0.6577571630477905, "logps/chosen": -130.0277099609375, "logps/rejected": -112.30310821533203, "loss": 1.1945, "rewards/accuracies": 0.0, "rewards/chosen": 3.398632764816284, "rewards/margins": -1.5068094730377197, "rewards/rejected": 4.905442237854004, "step": 9678 }, { "epoch": 1.57, "learning_rate": 3.2837257883304235e-07, "logits/chosen": -0.7591532468795776, "logits/rejected": -0.7875945568084717, "logps/chosen": -55.01970672607422, "logps/rejected": -78.31205749511719, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 1.3660904169082642, "rewards/margins": -0.7347313165664673, "rewards/rejected": 2.1008217334747314, "step": 9679 }, { "epoch": 1.57, "learning_rate": 3.282491444715921e-07, "logits/chosen": -0.7240793704986572, "logits/rejected": -0.5605326294898987, "logps/chosen": -117.82780456542969, "logps/rejected": -18.977083206176758, "loss": 1.358, "rewards/accuracies": 1.0, "rewards/chosen": 1.5457366704940796, "rewards/margins": 0.8478171825408936, "rewards/rejected": 0.697919487953186, "step": 9680 }, { "epoch": 1.57, "learning_rate": 3.2812572197650077e-07, "logits/chosen": -0.7209490537643433, "logits/rejected": -0.6154413819313049, "logps/chosen": -52.658504486083984, "logps/rejected": -62.23921203613281, "loss": 0.6481, "rewards/accuracies": 0.0, "rewards/chosen": 2.01763653755188, "rewards/margins": -0.06629061698913574, "rewards/rejected": 2.0839271545410156, "step": 9681 }, { "epoch": 1.57, "learning_rate": 3.280023113562956e-07, "logits/chosen": -0.5093582272529602, "logits/rejected": -0.3232237994670868, "logps/chosen": -67.28199768066406, "logps/rejected": -19.060768127441406, "loss": 0.8451, "rewards/accuracies": 1.0, "rewards/chosen": 2.1354522705078125, "rewards/margins": 1.562761664390564, "rewards/rejected": 0.5726906061172485, "step": 9682 }, { "epoch": 1.57, "learning_rate": 3.278789126195034e-07, "logits/chosen": -0.5582056045532227, "logits/rejected": -0.5356014370918274, "logps/chosen": -77.41490173339844, "logps/rejected": -98.03941345214844, "loss": 0.5091, "rewards/accuracies": 1.0, "rewards/chosen": 1.0009491443634033, "rewards/margins": 0.5432411432266235, "rewards/rejected": 0.4577079713344574, "step": 9683 }, { "epoch": 1.57, "learning_rate": 3.2775552577464973e-07, "logits/chosen": -0.6820409893989563, "logits/rejected": -0.7393937706947327, "logps/chosen": -155.8603057861328, "logps/rejected": -112.78264617919922, "loss": 0.045, "rewards/accuracies": 1.0, "rewards/chosen": 2.721086263656616, "rewards/margins": 2.3871278762817383, "rewards/rejected": 0.3339584469795227, "step": 9684 }, { "epoch": 1.57, "learning_rate": 3.276321508302593e-07, "logits/chosen": -0.4535447657108307, "logits/rejected": -0.4598637521266937, "logps/chosen": -61.39814758300781, "logps/rejected": -72.30018615722656, "loss": 0.4681, "rewards/accuracies": 1.0, "rewards/chosen": 3.8996779918670654, "rewards/margins": 1.0336883068084717, "rewards/rejected": 2.8659896850585938, "step": 9685 }, { "epoch": 1.57, "learning_rate": 3.2750878779485635e-07, "logits/chosen": -0.7316385507583618, "logits/rejected": -0.7324025630950928, "logps/chosen": -94.10299682617188, "logps/rejected": -58.65849304199219, "loss": 1.8408, "rewards/accuracies": 0.0, "rewards/chosen": 1.2756118774414062, "rewards/margins": -0.33735811710357666, "rewards/rejected": 1.612969994544983, "step": 9686 }, { "epoch": 1.57, "learning_rate": 3.2738543667696407e-07, "logits/chosen": -0.5955264568328857, "logits/rejected": -0.5553448796272278, "logps/chosen": -34.311683654785156, "logps/rejected": -7.928928375244141, "loss": 0.3712, "rewards/accuracies": 1.0, "rewards/chosen": 2.7859272956848145, "rewards/margins": 1.8065874576568604, "rewards/rejected": 0.9793397784233093, "step": 9687 }, { "epoch": 1.57, "learning_rate": 3.2726209748510473e-07, "logits/chosen": -0.8426308035850525, "logits/rejected": -0.8993826508522034, "logps/chosen": -116.15543365478516, "logps/rejected": -175.2216796875, "loss": 2.8966, "rewards/accuracies": 0.0, "rewards/chosen": 0.6805702447891235, "rewards/margins": -4.517212867736816, "rewards/rejected": 5.19778299331665, "step": 9688 }, { "epoch": 1.57, "learning_rate": 3.271387702278001e-07, "logits/chosen": -0.4881962239742279, "logits/rejected": -0.2574423849582672, "logps/chosen": -97.25416564941406, "logps/rejected": -22.16375160217285, "loss": 1.949, "rewards/accuracies": 1.0, "rewards/chosen": 3.4963700771331787, "rewards/margins": 3.2426416873931885, "rewards/rejected": 0.2537284791469574, "step": 9689 }, { "epoch": 1.57, "learning_rate": 3.270154549135707e-07, "logits/chosen": -0.9051379561424255, "logits/rejected": -0.9321800470352173, "logps/chosen": -74.36407470703125, "logps/rejected": -92.47477722167969, "loss": 1.1238, "rewards/accuracies": 0.0, "rewards/chosen": 1.6120223999023438, "rewards/margins": -1.9051659107208252, "rewards/rejected": 3.517188310623169, "step": 9690 }, { "epoch": 1.57, "learning_rate": 3.268921515509367e-07, "logits/chosen": -0.791691780090332, "logits/rejected": -0.6825550198554993, "logps/chosen": -44.17943572998047, "logps/rejected": -37.93286895751953, "loss": 0.1201, "rewards/accuracies": 1.0, "rewards/chosen": 1.7776756286621094, "rewards/margins": 1.3213310241699219, "rewards/rejected": 0.4563446044921875, "step": 9691 }, { "epoch": 1.57, "learning_rate": 3.2676886014841696e-07, "logits/chosen": -0.35085493326187134, "logits/rejected": -0.281122088432312, "logps/chosen": -43.71662139892578, "logps/rejected": -39.859764099121094, "loss": 0.9275, "rewards/accuracies": 1.0, "rewards/chosen": 2.417046308517456, "rewards/margins": 0.9859442710876465, "rewards/rejected": 1.4311020374298096, "step": 9692 }, { "epoch": 1.57, "learning_rate": 3.2664558071453e-07, "logits/chosen": -0.9230132102966309, "logits/rejected": -0.7369177341461182, "logps/chosen": -93.33296203613281, "logps/rejected": -76.2383041381836, "loss": 0.3873, "rewards/accuracies": 1.0, "rewards/chosen": 1.8047081232070923, "rewards/margins": 0.40621113777160645, "rewards/rejected": 1.3984969854354858, "step": 9693 }, { "epoch": 1.57, "learning_rate": 3.2652231325779296e-07, "logits/chosen": -0.8651280999183655, "logits/rejected": -0.7981342673301697, "logps/chosen": -57.46519088745117, "logps/rejected": -72.34564971923828, "loss": 1.3829, "rewards/accuracies": 0.0, "rewards/chosen": 1.4274685382843018, "rewards/margins": -0.5708796977996826, "rewards/rejected": 1.9983482360839844, "step": 9694 }, { "epoch": 1.57, "learning_rate": 3.2639905778672264e-07, "logits/chosen": -0.9183019399642944, "logits/rejected": -0.9167134761810303, "logps/chosen": -149.56146240234375, "logps/rejected": -90.41893768310547, "loss": 1.8089, "rewards/accuracies": 1.0, "rewards/chosen": 7.256335735321045, "rewards/margins": 1.4668693542480469, "rewards/rejected": 5.789466381072998, "step": 9695 }, { "epoch": 1.57, "learning_rate": 3.2627581430983474e-07, "logits/chosen": -1.0579273700714111, "logits/rejected": -1.02315354347229, "logps/chosen": -59.78416061401367, "logps/rejected": -53.86585998535156, "loss": 0.5785, "rewards/accuracies": 0.0, "rewards/chosen": 0.6513378024101257, "rewards/margins": -0.0834423303604126, "rewards/rejected": 0.7347801327705383, "step": 9696 }, { "epoch": 1.57, "learning_rate": 3.261525828356444e-07, "logits/chosen": -1.1184321641921997, "logits/rejected": -1.12720787525177, "logps/chosen": -220.82412719726562, "logps/rejected": -84.6800308227539, "loss": 0.0926, "rewards/accuracies": 1.0, "rewards/chosen": 4.08648681640625, "rewards/margins": 1.5994207859039307, "rewards/rejected": 2.4870660305023193, "step": 9697 }, { "epoch": 1.57, "learning_rate": 3.2602936337266557e-07, "logits/chosen": -0.8477022051811218, "logits/rejected": -0.8233898282051086, "logps/chosen": -65.87973022460938, "logps/rejected": -38.96907424926758, "loss": 0.7784, "rewards/accuracies": 0.0, "rewards/chosen": 1.0193214416503906, "rewards/margins": -0.276302695274353, "rewards/rejected": 1.2956241369247437, "step": 9698 }, { "epoch": 1.57, "learning_rate": 3.259061559294116e-07, "logits/chosen": -0.49655860662460327, "logits/rejected": -0.4971029460430145, "logps/chosen": -3.28098201751709, "logps/rejected": -6.584285736083984, "loss": 0.8101, "rewards/accuracies": 0.0, "rewards/chosen": 0.33120742440223694, "rewards/margins": -0.16116061806678772, "rewards/rejected": 0.49236804246902466, "step": 9699 }, { "epoch": 1.57, "learning_rate": 3.25782960514395e-07, "logits/chosen": -0.5762789845466614, "logits/rejected": -0.6158632040023804, "logps/chosen": -126.75151062011719, "logps/rejected": -58.71125030517578, "loss": 0.6731, "rewards/accuracies": 0.0, "rewards/chosen": 0.4927108883857727, "rewards/margins": -1.0043067932128906, "rewards/rejected": 1.497017741203308, "step": 9700 }, { "epoch": 1.57, "learning_rate": 3.2565977713612736e-07, "logits/chosen": -0.7509979605674744, "logits/rejected": -0.693885862827301, "logps/chosen": -91.14147186279297, "logps/rejected": -104.66158294677734, "loss": 0.1106, "rewards/accuracies": 1.0, "rewards/chosen": 2.954491376876831, "rewards/margins": 1.5222395658493042, "rewards/rejected": 1.4322518110275269, "step": 9701 }, { "epoch": 1.57, "learning_rate": 3.2553660580311954e-07, "logits/chosen": -0.8639048337936401, "logits/rejected": -0.6206904053688049, "logps/chosen": -138.3470916748047, "logps/rejected": -85.56914520263672, "loss": 0.2077, "rewards/accuracies": 1.0, "rewards/chosen": 4.269999980926514, "rewards/margins": 0.832374095916748, "rewards/rejected": 3.4376258850097656, "step": 9702 }, { "epoch": 1.57, "learning_rate": 3.2541344652388147e-07, "logits/chosen": -0.7000515460968018, "logits/rejected": -0.5409128069877625, "logps/chosen": -105.75538635253906, "logps/rejected": -81.68238830566406, "loss": 0.5433, "rewards/accuracies": 1.0, "rewards/chosen": 1.7848633527755737, "rewards/margins": 0.7295219898223877, "rewards/rejected": 1.055341362953186, "step": 9703 }, { "epoch": 1.58, "learning_rate": 3.252902993069222e-07, "logits/chosen": -0.8391900658607483, "logits/rejected": -0.7702010273933411, "logps/chosen": -89.28431701660156, "logps/rejected": -36.859275817871094, "loss": 0.4209, "rewards/accuracies": 0.0, "rewards/chosen": 2.3058013916015625, "rewards/margins": -0.04637789726257324, "rewards/rejected": 2.3521792888641357, "step": 9704 }, { "epoch": 1.58, "learning_rate": 3.251671641607502e-07, "logits/chosen": -0.6938190460205078, "logits/rejected": -0.5420179963111877, "logps/chosen": -48.6251106262207, "logps/rejected": -22.160327911376953, "loss": 0.2543, "rewards/accuracies": 1.0, "rewards/chosen": 3.595479965209961, "rewards/margins": 3.2725539207458496, "rewards/rejected": 0.3229261338710785, "step": 9705 }, { "epoch": 1.58, "learning_rate": 3.2504404109387285e-07, "logits/chosen": -0.7627888917922974, "logits/rejected": -0.6062060594558716, "logps/chosen": -90.57633972167969, "logps/rejected": -76.8790512084961, "loss": 0.3122, "rewards/accuracies": 1.0, "rewards/chosen": 2.2025558948516846, "rewards/margins": 0.3478431701660156, "rewards/rejected": 1.854712724685669, "step": 9706 }, { "epoch": 1.58, "learning_rate": 3.249209301147968e-07, "logits/chosen": -0.7670196890830994, "logits/rejected": -0.6623235940933228, "logps/chosen": -111.70086669921875, "logps/rejected": -27.59433364868164, "loss": 0.5871, "rewards/accuracies": 1.0, "rewards/chosen": 4.993780612945557, "rewards/margins": 3.7229206562042236, "rewards/rejected": 1.270859956741333, "step": 9707 }, { "epoch": 1.58, "learning_rate": 3.247978312320279e-07, "logits/chosen": -0.5136023759841919, "logits/rejected": -0.5414610505104065, "logps/chosen": -62.97391891479492, "logps/rejected": -80.73927307128906, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.5166820287704468, "rewards/margins": 1.3124386072158813, "rewards/rejected": 0.204243466258049, "step": 9708 }, { "epoch": 1.58, "learning_rate": 3.246747444540711e-07, "logits/chosen": -0.5856903791427612, "logits/rejected": -0.6787092089653015, "logps/chosen": -49.536155700683594, "logps/rejected": -138.2372589111328, "loss": 2.3717, "rewards/accuracies": 0.0, "rewards/chosen": 2.2353341579437256, "rewards/margins": -2.9113805294036865, "rewards/rejected": 5.146714687347412, "step": 9709 }, { "epoch": 1.58, "learning_rate": 3.2455166978943047e-07, "logits/chosen": -0.6328137516975403, "logits/rejected": -0.5531972050666809, "logps/chosen": -115.57865905761719, "logps/rejected": -61.698062896728516, "loss": 0.7538, "rewards/accuracies": 1.0, "rewards/chosen": 1.3565902709960938, "rewards/margins": 0.8177433013916016, "rewards/rejected": 0.5388469696044922, "step": 9710 }, { "epoch": 1.58, "learning_rate": 3.244286072466094e-07, "logits/chosen": -0.8413053154945374, "logits/rejected": -0.5357582569122314, "logps/chosen": -148.8350830078125, "logps/rejected": -88.00931549072266, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 5.992955207824707, "rewards/margins": 3.6199541091918945, "rewards/rejected": 2.3730010986328125, "step": 9711 }, { "epoch": 1.58, "learning_rate": 3.243055568341102e-07, "logits/chosen": -0.9886279702186584, "logits/rejected": -0.709372878074646, "logps/chosen": -102.26795959472656, "logps/rejected": -66.19248962402344, "loss": 0.6192, "rewards/accuracies": 0.0, "rewards/chosen": 1.1211235523223877, "rewards/margins": -0.8185203075408936, "rewards/rejected": 1.9396438598632812, "step": 9712 }, { "epoch": 1.58, "learning_rate": 3.2418251856043465e-07, "logits/chosen": -0.7396926879882812, "logits/rejected": -0.6629427075386047, "logps/chosen": -83.1550064086914, "logps/rejected": -4.2319016456604, "loss": 2.0754, "rewards/accuracies": 1.0, "rewards/chosen": 0.6743659973144531, "rewards/margins": 0.23224329948425293, "rewards/rejected": 0.4421226978302002, "step": 9713 }, { "epoch": 1.58, "learning_rate": 3.240594924340835e-07, "logits/chosen": -0.6907506585121155, "logits/rejected": -0.6698651909828186, "logps/chosen": -80.3262939453125, "logps/rejected": -133.58509826660156, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 0.7363899350166321, "rewards/margins": 0.09953534603118896, "rewards/rejected": 0.6368545889854431, "step": 9714 }, { "epoch": 1.58, "learning_rate": 3.2393647846355663e-07, "logits/chosen": -0.7193387746810913, "logits/rejected": -0.6054108738899231, "logps/chosen": -161.74185180664062, "logps/rejected": -52.23614501953125, "loss": 0.6005, "rewards/accuracies": 1.0, "rewards/chosen": 0.8787811398506165, "rewards/margins": 0.8884628415107727, "rewards/rejected": -0.00968170166015625, "step": 9715 }, { "epoch": 1.58, "learning_rate": 3.2381347665735315e-07, "logits/chosen": -0.6551977396011353, "logits/rejected": -0.6165666580200195, "logps/chosen": -98.44541931152344, "logps/rejected": -94.81063079833984, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 5.095685005187988, "rewards/margins": 1.6152002811431885, "rewards/rejected": 3.4804847240448, "step": 9716 }, { "epoch": 1.58, "learning_rate": 3.236904870239714e-07, "logits/chosen": -0.7974658012390137, "logits/rejected": -0.863235592842102, "logps/chosen": -93.01126098632812, "logps/rejected": -51.13724899291992, "loss": 0.9239, "rewards/accuracies": 0.0, "rewards/chosen": 2.222735643386841, "rewards/margins": -0.7472667694091797, "rewards/rejected": 2.9700024127960205, "step": 9717 }, { "epoch": 1.58, "learning_rate": 3.2356750957190865e-07, "logits/chosen": -0.300619512796402, "logits/rejected": -0.1907190978527069, "logps/chosen": -73.71245574951172, "logps/rejected": -31.70078468322754, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 4.567501068115234, "rewards/margins": 3.4812941551208496, "rewards/rejected": 1.0862070322036743, "step": 9718 }, { "epoch": 1.58, "learning_rate": 3.234445443096616e-07, "logits/chosen": -0.6670917272567749, "logits/rejected": -0.4677496552467346, "logps/chosen": -56.470088958740234, "logps/rejected": -19.26011848449707, "loss": 0.1346, "rewards/accuracies": 1.0, "rewards/chosen": 2.6756763458251953, "rewards/margins": 2.3874335289001465, "rewards/rejected": 0.28824272751808167, "step": 9719 }, { "epoch": 1.58, "learning_rate": 3.23321591245726e-07, "logits/chosen": -0.9182109236717224, "logits/rejected": -0.8897709250450134, "logps/chosen": -96.82454681396484, "logps/rejected": -89.15745544433594, "loss": 1.4735, "rewards/accuracies": 0.0, "rewards/chosen": 1.532813310623169, "rewards/margins": -2.4264931678771973, "rewards/rejected": 3.959306478500366, "step": 9720 }, { "epoch": 1.58, "learning_rate": 3.231986503885966e-07, "logits/chosen": -0.7345280647277832, "logits/rejected": -0.5912007689476013, "logps/chosen": -61.18425750732422, "logps/rejected": -34.13920593261719, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 2.830808401107788, "rewards/margins": 2.612569570541382, "rewards/rejected": 0.21823883056640625, "step": 9721 }, { "epoch": 1.58, "learning_rate": 3.230757217467677e-07, "logits/chosen": -0.8706016540527344, "logits/rejected": -0.8231694102287292, "logps/chosen": -96.15058898925781, "logps/rejected": -75.32269287109375, "loss": 0.7191, "rewards/accuracies": 0.0, "rewards/chosen": 0.41987916827201843, "rewards/margins": -0.9676246643066406, "rewards/rejected": 1.3875038623809814, "step": 9722 }, { "epoch": 1.58, "learning_rate": 3.2295280532873224e-07, "logits/chosen": -0.6164606809616089, "logits/rejected": -0.5998362302780151, "logps/chosen": -96.58987426757812, "logps/rejected": -124.3822021484375, "loss": 0.4719, "rewards/accuracies": 1.0, "rewards/chosen": 1.1396392583847046, "rewards/margins": 0.5691909193992615, "rewards/rejected": 0.5704483389854431, "step": 9723 }, { "epoch": 1.58, "learning_rate": 3.228299011429827e-07, "logits/chosen": -0.5816837549209595, "logits/rejected": -0.5033082365989685, "logps/chosen": -181.3188934326172, "logps/rejected": -48.75718688964844, "loss": 1.2831, "rewards/accuracies": 1.0, "rewards/chosen": 6.524069309234619, "rewards/margins": 4.721730709075928, "rewards/rejected": 1.8023384809494019, "step": 9724 }, { "epoch": 1.58, "learning_rate": 3.2270700919801064e-07, "logits/chosen": -0.5293936729431152, "logits/rejected": -0.4718450605869293, "logps/chosen": -46.45415115356445, "logps/rejected": -60.47832489013672, "loss": 1.401, "rewards/accuracies": 0.0, "rewards/chosen": 1.4476845264434814, "rewards/margins": -0.3081817626953125, "rewards/rejected": 1.755866289138794, "step": 9725 }, { "epoch": 1.58, "learning_rate": 3.2258412950230665e-07, "logits/chosen": -0.4591003358364105, "logits/rejected": -0.4637010395526886, "logps/chosen": -1.8168878555297852, "logps/rejected": -2.028505802154541, "loss": 1.1916, "rewards/accuracies": 1.0, "rewards/chosen": 0.2610825002193451, "rewards/margins": 0.034026190638542175, "rewards/rejected": 0.22705630958080292, "step": 9726 }, { "epoch": 1.58, "learning_rate": 3.2246126206436065e-07, "logits/chosen": -0.525437593460083, "logits/rejected": -0.821168065071106, "logps/chosen": -52.76505661010742, "logps/rejected": -33.232112884521484, "loss": 0.5962, "rewards/accuracies": 1.0, "rewards/chosen": 2.516737699508667, "rewards/margins": 0.7601450681686401, "rewards/rejected": 1.7565926313400269, "step": 9727 }, { "epoch": 1.58, "learning_rate": 3.2233840689266145e-07, "logits/chosen": -0.6700636744499207, "logits/rejected": -0.6700636744499207, "logps/chosen": -53.59532928466797, "logps/rejected": -53.59532928466797, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.5269149541854858, "rewards/margins": 0.0, "rewards/rejected": 1.5269149541854858, "step": 9728 }, { "epoch": 1.58, "learning_rate": 3.222155639956974e-07, "logits/chosen": -0.700552761554718, "logits/rejected": -0.6489512324333191, "logps/chosen": -52.32881546020508, "logps/rejected": -53.13670349121094, "loss": 0.2642, "rewards/accuracies": 1.0, "rewards/chosen": 1.8355076313018799, "rewards/margins": 0.38492774963378906, "rewards/rejected": 1.4505798816680908, "step": 9729 }, { "epoch": 1.58, "learning_rate": 3.2209273338195554e-07, "logits/chosen": -0.8494392037391663, "logits/rejected": -0.8302269577980042, "logps/chosen": -61.965660095214844, "logps/rejected": -58.19863510131836, "loss": 2.4935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9294800162315369, "rewards/margins": -1.9994723796844482, "rewards/rejected": 2.92895245552063, "step": 9730 }, { "epoch": 1.58, "learning_rate": 3.2196991505992237e-07, "logits/chosen": -0.8124523162841797, "logits/rejected": -0.7073633670806885, "logps/chosen": -59.56242752075195, "logps/rejected": -44.162940979003906, "loss": 0.5082, "rewards/accuracies": 1.0, "rewards/chosen": 2.233488082885742, "rewards/margins": 0.206770658493042, "rewards/rejected": 2.0267174243927, "step": 9731 }, { "epoch": 1.58, "learning_rate": 3.2184710903808364e-07, "logits/chosen": -1.46260666847229, "logits/rejected": -1.3837543725967407, "logps/chosen": -101.22917938232422, "logps/rejected": -165.98521423339844, "loss": 1.4654, "rewards/accuracies": 0.0, "rewards/chosen": 1.8859550952911377, "rewards/margins": -2.0340538024902344, "rewards/rejected": 3.920008897781372, "step": 9732 }, { "epoch": 1.58, "learning_rate": 3.2172431532492404e-07, "logits/chosen": -0.8230146765708923, "logits/rejected": -0.743315577507019, "logps/chosen": -53.30744552612305, "logps/rejected": -24.70502471923828, "loss": 0.0781, "rewards/accuracies": 1.0, "rewards/chosen": 2.444917678833008, "rewards/margins": 2.1584315299987793, "rewards/rejected": 0.28648605942726135, "step": 9733 }, { "epoch": 1.58, "learning_rate": 3.216015339289273e-07, "logits/chosen": -0.49307212233543396, "logits/rejected": -0.39332088828086853, "logps/chosen": -38.326011657714844, "logps/rejected": -44.1575927734375, "loss": 0.4927, "rewards/accuracies": 1.0, "rewards/chosen": 0.8649612665176392, "rewards/margins": 0.1478210687637329, "rewards/rejected": 0.7171401977539062, "step": 9734 }, { "epoch": 1.58, "learning_rate": 3.2147876485857664e-07, "logits/chosen": -0.9727597236633301, "logits/rejected": -0.9051656723022461, "logps/chosen": -67.65859985351562, "logps/rejected": -48.06552505493164, "loss": 0.8981, "rewards/accuracies": 0.0, "rewards/chosen": 1.685339331626892, "rewards/margins": -0.0755925178527832, "rewards/rejected": 1.7609318494796753, "step": 9735 }, { "epoch": 1.58, "learning_rate": 3.213560081223541e-07, "logits/chosen": -0.6669891476631165, "logits/rejected": -0.6669891476631165, "logps/chosen": -62.331478118896484, "logps/rejected": -62.331478118896484, "loss": 1.3957, "rewards/accuracies": 0.0, "rewards/chosen": 1.3752155303955078, "rewards/margins": 0.0, "rewards/rejected": 1.3752155303955078, "step": 9736 }, { "epoch": 1.58, "learning_rate": 3.21233263728741e-07, "logits/chosen": -0.7219399213790894, "logits/rejected": -0.7521650791168213, "logps/chosen": -65.32933807373047, "logps/rejected": -139.37783813476562, "loss": 1.4005, "rewards/accuracies": 0.0, "rewards/chosen": 2.3511407375335693, "rewards/margins": -2.7343146800994873, "rewards/rejected": 5.085455417633057, "step": 9737 }, { "epoch": 1.58, "learning_rate": 3.2111053168621793e-07, "logits/chosen": -0.5756209492683411, "logits/rejected": -0.5181149840354919, "logps/chosen": -78.24935150146484, "logps/rejected": -30.67003059387207, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 2.67266845703125, "rewards/margins": 0.7584329843521118, "rewards/rejected": 1.9142354726791382, "step": 9738 }, { "epoch": 1.58, "learning_rate": 3.2098781200326443e-07, "logits/chosen": -1.164871096611023, "logits/rejected": -1.1448554992675781, "logps/chosen": -131.73703002929688, "logps/rejected": -115.35295104980469, "loss": 0.878, "rewards/accuracies": 0.0, "rewards/chosen": 3.9279160499572754, "rewards/margins": -1.5300354957580566, "rewards/rejected": 5.457951545715332, "step": 9739 }, { "epoch": 1.58, "learning_rate": 3.2086510468835923e-07, "logits/chosen": -1.0271713733673096, "logits/rejected": -1.0095994472503662, "logps/chosen": -138.86834716796875, "logps/rejected": -148.7076416015625, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": 6.500817775726318, "rewards/margins": 0.5430374145507812, "rewards/rejected": 5.957780361175537, "step": 9740 }, { "epoch": 1.58, "learning_rate": 3.2074240974998044e-07, "logits/chosen": -0.7587002515792847, "logits/rejected": -0.82329922914505, "logps/chosen": -69.84513854980469, "logps/rejected": -138.2013397216797, "loss": 0.6249, "rewards/accuracies": 0.0, "rewards/chosen": 1.227026343345642, "rewards/margins": -0.9020417928695679, "rewards/rejected": 2.12906813621521, "step": 9741 }, { "epoch": 1.58, "learning_rate": 3.206197271966049e-07, "logits/chosen": -0.3587510585784912, "logits/rejected": -0.33294054865837097, "logps/chosen": -71.21470642089844, "logps/rejected": -62.09223556518555, "loss": 0.9392, "rewards/accuracies": 0.0, "rewards/chosen": 1.790017008781433, "rewards/margins": -1.697133183479309, "rewards/rejected": 3.487150192260742, "step": 9742 }, { "epoch": 1.58, "learning_rate": 3.2049705703670894e-07, "logits/chosen": -0.8135350346565247, "logits/rejected": -0.8135350346565247, "logps/chosen": -61.17874526977539, "logps/rejected": -61.17874526977539, "loss": 0.5643, "rewards/accuracies": 0.0, "rewards/chosen": 1.7182247638702393, "rewards/margins": 0.0, "rewards/rejected": 1.7182247638702393, "step": 9743 }, { "epoch": 1.58, "learning_rate": 3.203743992787679e-07, "logits/chosen": -0.7815648913383484, "logits/rejected": -0.7417522668838501, "logps/chosen": -112.4627914428711, "logps/rejected": -112.3319091796875, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 4.164582252502441, "rewards/margins": 0.861156702041626, "rewards/rejected": 3.3034255504608154, "step": 9744 }, { "epoch": 1.58, "learning_rate": 3.202517539312561e-07, "logits/chosen": -0.7485367059707642, "logits/rejected": -0.8276051878929138, "logps/chosen": -86.23139953613281, "logps/rejected": -166.11074829101562, "loss": 1.2362, "rewards/accuracies": 0.0, "rewards/chosen": 3.58148193359375, "rewards/margins": -2.3779892921447754, "rewards/rejected": 5.959471225738525, "step": 9745 }, { "epoch": 1.58, "learning_rate": 3.201291210026474e-07, "logits/chosen": -0.6266894340515137, "logits/rejected": -0.5348852872848511, "logps/chosen": -82.20915222167969, "logps/rejected": -44.24318313598633, "loss": 0.2093, "rewards/accuracies": 1.0, "rewards/chosen": 2.1002166271209717, "rewards/margins": 0.7043490409851074, "rewards/rejected": 1.3958675861358643, "step": 9746 }, { "epoch": 1.58, "learning_rate": 3.200065005014144e-07, "logits/chosen": -1.0454607009887695, "logits/rejected": -0.9941015839576721, "logps/chosen": -52.34022903442383, "logps/rejected": -20.9248104095459, "loss": 0.1743, "rewards/accuracies": 1.0, "rewards/chosen": 1.6568691730499268, "rewards/margins": 1.4434328079223633, "rewards/rejected": 0.2134363204240799, "step": 9747 }, { "epoch": 1.58, "learning_rate": 3.1988389243602923e-07, "logits/chosen": -0.619133710861206, "logits/rejected": -0.5272101163864136, "logps/chosen": -57.54546356201172, "logps/rejected": -26.666833877563477, "loss": 0.1337, "rewards/accuracies": 1.0, "rewards/chosen": 2.2268683910369873, "rewards/margins": 1.532520055770874, "rewards/rejected": 0.6943483352661133, "step": 9748 }, { "epoch": 1.58, "learning_rate": 3.197612968149628e-07, "logits/chosen": -0.6922525763511658, "logits/rejected": -0.7083901166915894, "logps/chosen": -46.56466293334961, "logps/rejected": -25.175081253051758, "loss": 0.3969, "rewards/accuracies": 1.0, "rewards/chosen": 0.20270462334156036, "rewards/margins": 0.3380889892578125, "rewards/rejected": -0.13538436591625214, "step": 9749 }, { "epoch": 1.58, "learning_rate": 3.1963871364668526e-07, "logits/chosen": -0.6560319662094116, "logits/rejected": -0.5907313227653503, "logps/chosen": -118.61588287353516, "logps/rejected": -130.70379638671875, "loss": 1.2295, "rewards/accuracies": 1.0, "rewards/chosen": 1.237404704093933, "rewards/margins": 0.11125719547271729, "rewards/rejected": 1.1261475086212158, "step": 9750 }, { "epoch": 1.58, "learning_rate": 3.19516142939666e-07, "logits/chosen": -0.5415949821472168, "logits/rejected": -0.44207099080085754, "logps/chosen": -38.778404235839844, "logps/rejected": -46.10479736328125, "loss": 0.1627, "rewards/accuracies": 1.0, "rewards/chosen": 2.50394606590271, "rewards/margins": 1.570814609527588, "rewards/rejected": 0.9331313967704773, "step": 9751 }, { "epoch": 1.58, "learning_rate": 3.1939358470237357e-07, "logits/chosen": -0.7853085994720459, "logits/rejected": -0.7724220156669617, "logps/chosen": -70.77810668945312, "logps/rejected": -74.50591278076172, "loss": 0.7629, "rewards/accuracies": 1.0, "rewards/chosen": 1.4288192987442017, "rewards/margins": 0.2821815013885498, "rewards/rejected": 1.1466377973556519, "step": 9752 }, { "epoch": 1.58, "learning_rate": 3.192710389432755e-07, "logits/chosen": -0.5375455021858215, "logits/rejected": -0.5625413060188293, "logps/chosen": -15.649333953857422, "logps/rejected": -44.32141876220703, "loss": 0.4437, "rewards/accuracies": 0.0, "rewards/chosen": 0.5500594973564148, "rewards/margins": -0.16633379459381104, "rewards/rejected": 0.7163932919502258, "step": 9753 }, { "epoch": 1.58, "learning_rate": 3.1914850567083863e-07, "logits/chosen": -0.6809868812561035, "logits/rejected": -0.5862497687339783, "logps/chosen": -125.15969848632812, "logps/rejected": -123.97674560546875, "loss": 0.5985, "rewards/accuracies": 1.0, "rewards/chosen": 3.78721022605896, "rewards/margins": 0.7466325759887695, "rewards/rejected": 3.0405776500701904, "step": 9754 }, { "epoch": 1.58, "learning_rate": 3.190259848935287e-07, "logits/chosen": -0.9626794457435608, "logits/rejected": -0.8105449676513672, "logps/chosen": -158.67898559570312, "logps/rejected": -25.87853240966797, "loss": 0.2601, "rewards/accuracies": 1.0, "rewards/chosen": 1.0381287336349487, "rewards/margins": 0.4853796362876892, "rewards/rejected": 0.5527490973472595, "step": 9755 }, { "epoch": 1.58, "learning_rate": 3.1890347661981084e-07, "logits/chosen": -0.5755003690719604, "logits/rejected": -0.46894371509552, "logps/chosen": -92.71212768554688, "logps/rejected": -36.76502990722656, "loss": 0.1792, "rewards/accuracies": 1.0, "rewards/chosen": 3.151526689529419, "rewards/margins": 0.8631243705749512, "rewards/rejected": 2.2884023189544678, "step": 9756 }, { "epoch": 1.58, "learning_rate": 3.187809808581492e-07, "logits/chosen": -0.5181906223297119, "logits/rejected": -0.5181906223297119, "logps/chosen": -48.06023025512695, "logps/rejected": -48.06023025512695, "loss": 0.4094, "rewards/accuracies": 0.0, "rewards/chosen": 0.27841684222221375, "rewards/margins": 0.0, "rewards/rejected": 0.27841684222221375, "step": 9757 }, { "epoch": 1.58, "learning_rate": 3.1865849761700704e-07, "logits/chosen": -0.6802069544792175, "logits/rejected": -0.6515623331069946, "logps/chosen": -88.55581665039062, "logps/rejected": -64.09124755859375, "loss": 0.4271, "rewards/accuracies": 0.0, "rewards/chosen": 1.7672828435897827, "rewards/margins": -0.12575912475585938, "rewards/rejected": 1.893041968345642, "step": 9758 }, { "epoch": 1.58, "learning_rate": 3.185360269048469e-07, "logits/chosen": -0.7689977884292603, "logits/rejected": -0.24969373643398285, "logps/chosen": -75.30323028564453, "logps/rejected": -112.46232604980469, "loss": 0.2799, "rewards/accuracies": 1.0, "rewards/chosen": 3.130120038986206, "rewards/margins": 0.47374629974365234, "rewards/rejected": 2.6563737392425537, "step": 9759 }, { "epoch": 1.58, "learning_rate": 3.184135687301302e-07, "logits/chosen": -0.8101013898849487, "logits/rejected": -0.7758162617683411, "logps/chosen": -92.95008850097656, "logps/rejected": -75.00086975097656, "loss": 0.9534, "rewards/accuracies": 0.0, "rewards/chosen": 1.5171600580215454, "rewards/margins": -1.665913462638855, "rewards/rejected": 3.1830735206604004, "step": 9760 }, { "epoch": 1.58, "learning_rate": 3.1829112310131776e-07, "logits/chosen": -1.0524095296859741, "logits/rejected": -0.8311035633087158, "logps/chosen": -86.59046936035156, "logps/rejected": -66.15302276611328, "loss": 0.1354, "rewards/accuracies": 1.0, "rewards/chosen": 3.5751450061798096, "rewards/margins": 1.855420708656311, "rewards/rejected": 1.7197242975234985, "step": 9761 }, { "epoch": 1.58, "learning_rate": 3.181686900268694e-07, "logits/chosen": -0.6390451788902283, "logits/rejected": -0.7094669342041016, "logps/chosen": -66.1640625, "logps/rejected": -67.67386627197266, "loss": 0.5337, "rewards/accuracies": 0.0, "rewards/chosen": 2.327000379562378, "rewards/margins": -0.24068617820739746, "rewards/rejected": 2.5676865577697754, "step": 9762 }, { "epoch": 1.58, "learning_rate": 3.1804626951524393e-07, "logits/chosen": -1.0359934568405151, "logits/rejected": -1.087450623512268, "logps/chosen": -96.13581848144531, "logps/rejected": -168.53671264648438, "loss": 0.4872, "rewards/accuracies": 0.0, "rewards/chosen": 4.513978481292725, "rewards/margins": -0.2676835060119629, "rewards/rejected": 4.7816619873046875, "step": 9763 }, { "epoch": 1.58, "learning_rate": 3.179238615748997e-07, "logits/chosen": -0.4542067348957062, "logits/rejected": -0.5113524794578552, "logps/chosen": -80.86199951171875, "logps/rejected": -71.46163177490234, "loss": 0.7683, "rewards/accuracies": 1.0, "rewards/chosen": 2.2075576782226562, "rewards/margins": 0.5442253351211548, "rewards/rejected": 1.6633323431015015, "step": 9764 }, { "epoch": 1.58, "learning_rate": 3.178014662142937e-07, "logits/chosen": -0.6384074687957764, "logits/rejected": -0.6571460366249084, "logps/chosen": -54.862335205078125, "logps/rejected": -139.16685485839844, "loss": 0.8022, "rewards/accuracies": 1.0, "rewards/chosen": 0.948925793170929, "rewards/margins": 0.6167343258857727, "rewards/rejected": 0.33219146728515625, "step": 9765 }, { "epoch": 1.59, "learning_rate": 3.1767908344188256e-07, "logits/chosen": -0.25160157680511475, "logits/rejected": -0.25160157680511475, "logps/chosen": -97.58091735839844, "logps/rejected": -97.58091735839844, "loss": 0.8219, "rewards/accuracies": 0.0, "rewards/chosen": 0.812390148639679, "rewards/margins": 0.0, "rewards/rejected": 0.812390148639679, "step": 9766 }, { "epoch": 1.59, "learning_rate": 3.175567132661214e-07, "logits/chosen": -0.7645086050033569, "logits/rejected": -0.7444217801094055, "logps/chosen": -48.96869659423828, "logps/rejected": -7.428982257843018, "loss": 0.3599, "rewards/accuracies": 1.0, "rewards/chosen": 1.2363990545272827, "rewards/margins": 0.4425894618034363, "rewards/rejected": 0.7938095927238464, "step": 9767 }, { "epoch": 1.59, "learning_rate": 3.1743435569546516e-07, "logits/chosen": -0.4663418233394623, "logits/rejected": -0.41221699118614197, "logps/chosen": -73.9273452758789, "logps/rejected": -59.10152816772461, "loss": 0.7099, "rewards/accuracies": 1.0, "rewards/chosen": 2.802499532699585, "rewards/margins": 0.24129772186279297, "rewards/rejected": 2.561201810836792, "step": 9768 }, { "epoch": 1.59, "learning_rate": 3.1731201073836753e-07, "logits/chosen": -0.8286372423171997, "logits/rejected": -0.8139637112617493, "logps/chosen": -83.88288116455078, "logps/rejected": -51.72641372680664, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 1.1154030561447144, "rewards/margins": 0.546698808670044, "rewards/rejected": 0.5687042474746704, "step": 9769 }, { "epoch": 1.59, "learning_rate": 3.171896784032814e-07, "logits/chosen": -0.69819176197052, "logits/rejected": -0.69819176197052, "logps/chosen": -19.717979431152344, "logps/rejected": -19.717979431152344, "loss": 0.4749, "rewards/accuracies": 0.0, "rewards/chosen": 0.20322494208812714, "rewards/margins": 0.0, "rewards/rejected": 0.20322494208812714, "step": 9770 }, { "epoch": 1.59, "learning_rate": 3.1706735869865863e-07, "logits/chosen": -0.9002651572227478, "logits/rejected": -0.7778894305229187, "logps/chosen": -76.9167251586914, "logps/rejected": -74.55500030517578, "loss": 0.2045, "rewards/accuracies": 1.0, "rewards/chosen": 2.3916847705841064, "rewards/margins": 1.5811172723770142, "rewards/rejected": 0.8105674982070923, "step": 9771 }, { "epoch": 1.59, "learning_rate": 3.169450516329505e-07, "logits/chosen": -1.3269208669662476, "logits/rejected": -1.2054953575134277, "logps/chosen": -114.13128662109375, "logps/rejected": -178.877685546875, "loss": 1.6334, "rewards/accuracies": 0.0, "rewards/chosen": 4.424899578094482, "rewards/margins": -3.1887969970703125, "rewards/rejected": 7.613696575164795, "step": 9772 }, { "epoch": 1.59, "learning_rate": 3.168227572146072e-07, "logits/chosen": -0.68902587890625, "logits/rejected": -0.6135662198066711, "logps/chosen": -118.03172302246094, "logps/rejected": -137.37924194335938, "loss": 0.3034, "rewards/accuracies": 1.0, "rewards/chosen": 5.484483242034912, "rewards/margins": 0.9544448852539062, "rewards/rejected": 4.530038356781006, "step": 9773 }, { "epoch": 1.59, "learning_rate": 3.167004754520781e-07, "logits/chosen": -1.0676831007003784, "logits/rejected": -0.9959303140640259, "logps/chosen": -98.52975463867188, "logps/rejected": -56.80021667480469, "loss": 0.3504, "rewards/accuracies": 1.0, "rewards/chosen": 2.1653525829315186, "rewards/margins": 0.44017553329467773, "rewards/rejected": 1.7251770496368408, "step": 9774 }, { "epoch": 1.59, "learning_rate": 3.1657820635381185e-07, "logits/chosen": -0.8275589942932129, "logits/rejected": -0.8597176671028137, "logps/chosen": -44.48960494995117, "logps/rejected": -55.72834014892578, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 2.768333911895752, "rewards/margins": 0.3529980182647705, "rewards/rejected": 2.4153358936309814, "step": 9775 }, { "epoch": 1.59, "learning_rate": 3.164559499282559e-07, "logits/chosen": -0.41441473364830017, "logits/rejected": -0.22905859351158142, "logps/chosen": -113.94820404052734, "logps/rejected": -50.94786834716797, "loss": 1.8985, "rewards/accuracies": 1.0, "rewards/chosen": 0.9739662408828735, "rewards/margins": 0.19527667760849, "rewards/rejected": 0.7786895632743835, "step": 9776 }, { "epoch": 1.59, "learning_rate": 3.1633370618385715e-07, "logits/chosen": -0.9577281475067139, "logits/rejected": -0.8585973978042603, "logps/chosen": -61.10329055786133, "logps/rejected": -20.063257217407227, "loss": 2.0767, "rewards/accuracies": 1.0, "rewards/chosen": 1.4611965417861938, "rewards/margins": 1.081512451171875, "rewards/rejected": 0.37968406081199646, "step": 9777 }, { "epoch": 1.59, "learning_rate": 3.162114751290613e-07, "logits/chosen": -0.651195764541626, "logits/rejected": -0.5657652616500854, "logps/chosen": -92.0594253540039, "logps/rejected": -95.03138732910156, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": 5.540565013885498, "rewards/margins": 2.3880975246429443, "rewards/rejected": 3.1524674892425537, "step": 9778 }, { "epoch": 1.59, "learning_rate": 3.160892567723137e-07, "logits/chosen": -1.0942842960357666, "logits/rejected": -1.012316107749939, "logps/chosen": -169.2132110595703, "logps/rejected": -144.8966064453125, "loss": 1.2603, "rewards/accuracies": 0.0, "rewards/chosen": 5.249542236328125, "rewards/margins": -2.4093918800354004, "rewards/rejected": 7.658934116363525, "step": 9779 }, { "epoch": 1.59, "learning_rate": 3.159670511220581e-07, "logits/chosen": -0.6707459688186646, "logits/rejected": -0.6294209957122803, "logps/chosen": -127.10235595703125, "logps/rejected": -57.08951950073242, "loss": 0.6282, "rewards/accuracies": 1.0, "rewards/chosen": 2.4654908180236816, "rewards/margins": 0.0490567684173584, "rewards/rejected": 2.4164340496063232, "step": 9780 }, { "epoch": 1.59, "learning_rate": 3.1584485818673804e-07, "logits/chosen": -0.5719334483146667, "logits/rejected": -0.6112717986106873, "logps/chosen": -65.11653137207031, "logps/rejected": -71.21635437011719, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 0.77227783203125, "rewards/margins": -0.08182376623153687, "rewards/rejected": 0.8541015982627869, "step": 9781 }, { "epoch": 1.59, "learning_rate": 3.157226779747958e-07, "logits/chosen": -0.6637523770332336, "logits/rejected": -0.6696046590805054, "logps/chosen": -71.34239196777344, "logps/rejected": -92.57991790771484, "loss": 1.2956, "rewards/accuracies": 0.0, "rewards/chosen": 0.46118852496147156, "rewards/margins": -1.1165282726287842, "rewards/rejected": 1.5777168273925781, "step": 9782 }, { "epoch": 1.59, "learning_rate": 3.1560051049467284e-07, "logits/chosen": -0.7775436043739319, "logits/rejected": -0.7775436043739319, "logps/chosen": -74.81766510009766, "logps/rejected": -74.81766510009766, "loss": 0.3892, "rewards/accuracies": 0.0, "rewards/chosen": 3.3683135509490967, "rewards/margins": 0.0, "rewards/rejected": 3.3683135509490967, "step": 9783 }, { "epoch": 1.59, "learning_rate": 3.154783557548097e-07, "logits/chosen": -0.7876154184341431, "logits/rejected": -0.630798876285553, "logps/chosen": -103.01738739013672, "logps/rejected": -37.454833984375, "loss": 0.9882, "rewards/accuracies": 1.0, "rewards/chosen": 3.0477166175842285, "rewards/margins": 2.5373191833496094, "rewards/rejected": 0.5103973746299744, "step": 9784 }, { "epoch": 1.59, "learning_rate": 3.1535621376364643e-07, "logits/chosen": -0.8056237101554871, "logits/rejected": -0.7919047474861145, "logps/chosen": -232.48031616210938, "logps/rejected": -197.20858764648438, "loss": 0.5466, "rewards/accuracies": 0.0, "rewards/chosen": 4.901956081390381, "rewards/margins": -0.5662598609924316, "rewards/rejected": 5.4682159423828125, "step": 9785 }, { "epoch": 1.59, "learning_rate": 3.1523408452962153e-07, "logits/chosen": -0.6027818918228149, "logits/rejected": -0.7041000127792358, "logps/chosen": -164.7509002685547, "logps/rejected": -52.77946472167969, "loss": 1.3264, "rewards/accuracies": 1.0, "rewards/chosen": 3.5135498046875, "rewards/margins": 1.553277611732483, "rewards/rejected": 1.960272192955017, "step": 9786 }, { "epoch": 1.59, "learning_rate": 3.1511196806117326e-07, "logits/chosen": -1.1307919025421143, "logits/rejected": -1.0119479894638062, "logps/chosen": -107.343994140625, "logps/rejected": -109.93970489501953, "loss": 0.5207, "rewards/accuracies": 0.0, "rewards/chosen": 4.505774021148682, "rewards/margins": -0.5423440933227539, "rewards/rejected": 5.0481181144714355, "step": 9787 }, { "epoch": 1.59, "learning_rate": 3.1498986436673835e-07, "logits/chosen": -0.6585656404495239, "logits/rejected": -0.6590673327445984, "logps/chosen": -109.04141235351562, "logps/rejected": -115.1755142211914, "loss": 0.26, "rewards/accuracies": 1.0, "rewards/chosen": 5.074756145477295, "rewards/margins": 0.9926490783691406, "rewards/rejected": 4.082107067108154, "step": 9788 }, { "epoch": 1.59, "learning_rate": 3.1486777345475334e-07, "logits/chosen": -1.0846234560012817, "logits/rejected": -1.0656567811965942, "logps/chosen": -52.735931396484375, "logps/rejected": -76.60598754882812, "loss": 0.7279, "rewards/accuracies": 0.0, "rewards/chosen": 1.6334975957870483, "rewards/margins": -0.5048555135726929, "rewards/rejected": 2.138353109359741, "step": 9789 }, { "epoch": 1.59, "learning_rate": 3.1474569533365346e-07, "logits/chosen": -0.642539381980896, "logits/rejected": -0.6885954141616821, "logps/chosen": -96.90650939941406, "logps/rejected": -103.30128479003906, "loss": 0.5382, "rewards/accuracies": 0.0, "rewards/chosen": 1.5550949573516846, "rewards/margins": -0.5259902477264404, "rewards/rejected": 2.081085205078125, "step": 9790 }, { "epoch": 1.59, "learning_rate": 3.146236300118731e-07, "logits/chosen": -0.7814814448356628, "logits/rejected": -0.8242652416229248, "logps/chosen": -75.78973388671875, "logps/rejected": -131.54898071289062, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 0.9507889151573181, "rewards/margins": 0.15355533361434937, "rewards/rejected": 0.7972335815429688, "step": 9791 }, { "epoch": 1.59, "learning_rate": 3.145015774978459e-07, "logits/chosen": -0.9286938309669495, "logits/rejected": -0.9699479937553406, "logps/chosen": -37.142173767089844, "logps/rejected": -74.73994445800781, "loss": 1.9152, "rewards/accuracies": 1.0, "rewards/chosen": 1.8635727167129517, "rewards/margins": 0.05744016170501709, "rewards/rejected": 1.8061325550079346, "step": 9792 }, { "epoch": 1.59, "learning_rate": 3.1437953780000446e-07, "logits/chosen": -0.6816701292991638, "logits/rejected": -0.6907428503036499, "logps/chosen": -103.60298919677734, "logps/rejected": -90.3102798461914, "loss": 0.9147, "rewards/accuracies": 0.0, "rewards/chosen": 1.5007965564727783, "rewards/margins": -0.16680753231048584, "rewards/rejected": 1.6676040887832642, "step": 9793 }, { "epoch": 1.59, "learning_rate": 3.142575109267806e-07, "logits/chosen": -0.49190929532051086, "logits/rejected": -0.44442084431648254, "logps/chosen": -40.73939514160156, "logps/rejected": -50.59417724609375, "loss": 1.08, "rewards/accuracies": 0.0, "rewards/chosen": 1.296258568763733, "rewards/margins": -1.149511694908142, "rewards/rejected": 2.445770263671875, "step": 9794 }, { "epoch": 1.59, "learning_rate": 3.141354968866052e-07, "logits/chosen": -0.4737865626811981, "logits/rejected": -0.47637733817100525, "logps/chosen": -3.2570602893829346, "logps/rejected": -4.992164134979248, "loss": 0.8268, "rewards/accuracies": 0.0, "rewards/chosen": 0.15607383847236633, "rewards/margins": -0.10757201910018921, "rewards/rejected": 0.26364585757255554, "step": 9795 }, { "epoch": 1.59, "learning_rate": 3.1401349568790835e-07, "logits/chosen": -0.9458284974098206, "logits/rejected": -0.993270993232727, "logps/chosen": -51.68965530395508, "logps/rejected": -74.31834411621094, "loss": 2.1923, "rewards/accuracies": 1.0, "rewards/chosen": 2.4313061237335205, "rewards/margins": 0.7440983057022095, "rewards/rejected": 1.687207818031311, "step": 9796 }, { "epoch": 1.59, "learning_rate": 3.1389150733911914e-07, "logits/chosen": -0.748933732509613, "logits/rejected": -0.748933732509613, "logps/chosen": -71.39994812011719, "logps/rejected": -71.39994812011719, "loss": 0.5885, "rewards/accuracies": 0.0, "rewards/chosen": 1.9275718927383423, "rewards/margins": 0.0, "rewards/rejected": 1.9275718927383423, "step": 9797 }, { "epoch": 1.59, "learning_rate": 3.1376953184866574e-07, "logits/chosen": -0.8722997903823853, "logits/rejected": -0.8846688866615295, "logps/chosen": -118.0519027709961, "logps/rejected": -128.15963745117188, "loss": 0.5537, "rewards/accuracies": 1.0, "rewards/chosen": 4.811470985412598, "rewards/margins": 0.7152152061462402, "rewards/rejected": 4.096255779266357, "step": 9798 }, { "epoch": 1.59, "learning_rate": 3.136475692249756e-07, "logits/chosen": -0.9743053913116455, "logits/rejected": -0.9664642810821533, "logps/chosen": -69.30160522460938, "logps/rejected": -80.29159545898438, "loss": 0.4861, "rewards/accuracies": 0.0, "rewards/chosen": 1.7389557361602783, "rewards/margins": -0.4504554271697998, "rewards/rejected": 2.189411163330078, "step": 9799 }, { "epoch": 1.59, "learning_rate": 3.135256194764751e-07, "logits/chosen": -0.6322519183158875, "logits/rejected": -0.5744673013687134, "logps/chosen": -90.32675170898438, "logps/rejected": -46.632686614990234, "loss": 0.5685, "rewards/accuracies": 0.0, "rewards/chosen": 0.6558570861816406, "rewards/margins": -0.4226433038711548, "rewards/rejected": 1.0785003900527954, "step": 9800 }, { "epoch": 1.59, "learning_rate": 3.1340368261158987e-07, "logits/chosen": -0.8283998966217041, "logits/rejected": -0.7272474765777588, "logps/chosen": -124.04949188232422, "logps/rejected": -103.29193115234375, "loss": 0.4886, "rewards/accuracies": 0.0, "rewards/chosen": 0.9725562930107117, "rewards/margins": -0.49068528413772583, "rewards/rejected": 1.4632415771484375, "step": 9801 }, { "epoch": 1.59, "learning_rate": 3.132817586387446e-07, "logits/chosen": -0.8084462881088257, "logits/rejected": -0.690138578414917, "logps/chosen": -84.32184600830078, "logps/rejected": -52.426876068115234, "loss": 1.1582, "rewards/accuracies": 0.0, "rewards/chosen": 2.3759026527404785, "rewards/margins": -0.008597850799560547, "rewards/rejected": 2.384500503540039, "step": 9802 }, { "epoch": 1.59, "learning_rate": 3.1315984756636303e-07, "logits/chosen": -0.9660781621932983, "logits/rejected": -0.9959752559661865, "logps/chosen": -201.09474182128906, "logps/rejected": -95.0440673828125, "loss": 0.5385, "rewards/accuracies": 0.0, "rewards/chosen": 4.34175443649292, "rewards/margins": -0.32750988006591797, "rewards/rejected": 4.669264316558838, "step": 9803 }, { "epoch": 1.59, "learning_rate": 3.130379494028682e-07, "logits/chosen": -0.4771069288253784, "logits/rejected": -0.4338260293006897, "logps/chosen": -127.40771484375, "logps/rejected": -81.28091430664062, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 1.32391357421875, "rewards/margins": -0.6223205327987671, "rewards/rejected": 1.946234107017517, "step": 9804 }, { "epoch": 1.59, "learning_rate": 3.129160641566819e-07, "logits/chosen": -0.764427661895752, "logits/rejected": -0.7209615111351013, "logps/chosen": -42.127647399902344, "logps/rejected": -106.71381378173828, "loss": 1.0662, "rewards/accuracies": 1.0, "rewards/chosen": 2.74409556388855, "rewards/margins": 0.15022730827331543, "rewards/rejected": 2.5938682556152344, "step": 9805 }, { "epoch": 1.59, "learning_rate": 3.127941918362254e-07, "logits/chosen": -0.5213772058486938, "logits/rejected": -0.5350053310394287, "logps/chosen": -53.66094207763672, "logps/rejected": -103.03926086425781, "loss": 0.4312, "rewards/accuracies": 1.0, "rewards/chosen": 1.5611565113067627, "rewards/margins": 0.8274017572402954, "rewards/rejected": 0.7337547540664673, "step": 9806 }, { "epoch": 1.59, "learning_rate": 3.1267233244991886e-07, "logits/chosen": -0.7704957127571106, "logits/rejected": -0.7295640110969543, "logps/chosen": -177.40853881835938, "logps/rejected": -103.65481567382812, "loss": 0.1475, "rewards/accuracies": 1.0, "rewards/chosen": 3.338940382003784, "rewards/margins": 1.6098631620407104, "rewards/rejected": 1.7290772199630737, "step": 9807 }, { "epoch": 1.59, "learning_rate": 3.1255048600618174e-07, "logits/chosen": -0.782509446144104, "logits/rejected": -0.759705126285553, "logps/chosen": -62.956398010253906, "logps/rejected": -50.83744812011719, "loss": 0.7156, "rewards/accuracies": 0.0, "rewards/chosen": 1.4894660711288452, "rewards/margins": -0.024437785148620605, "rewards/rejected": 1.5139038562774658, "step": 9808 }, { "epoch": 1.59, "learning_rate": 3.1242865251343223e-07, "logits/chosen": -0.350272536277771, "logits/rejected": -0.3743933439254761, "logps/chosen": -29.75297737121582, "logps/rejected": -27.550329208374023, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 0.5466375350952148, "rewards/margins": 0.11374035477638245, "rewards/rejected": 0.4328971803188324, "step": 9809 }, { "epoch": 1.59, "learning_rate": 3.123068319800881e-07, "logits/chosen": -0.6093990802764893, "logits/rejected": -0.6091609001159668, "logps/chosen": -58.945045471191406, "logps/rejected": -52.05496597290039, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": 2.2224481105804443, "rewards/margins": 0.5035138130187988, "rewards/rejected": 1.7189342975616455, "step": 9810 }, { "epoch": 1.59, "learning_rate": 3.121850244145659e-07, "logits/chosen": -1.071800947189331, "logits/rejected": -1.0431673526763916, "logps/chosen": -84.33866882324219, "logps/rejected": -128.2759552001953, "loss": 0.5355, "rewards/accuracies": 1.0, "rewards/chosen": 2.063204288482666, "rewards/margins": 0.8116127252578735, "rewards/rejected": 1.2515915632247925, "step": 9811 }, { "epoch": 1.59, "learning_rate": 3.1206322982528137e-07, "logits/chosen": -0.35128074884414673, "logits/rejected": -0.32565590739250183, "logps/chosen": -48.42709732055664, "logps/rejected": -55.3354606628418, "loss": 0.894, "rewards/accuracies": 0.0, "rewards/chosen": 1.7241696119308472, "rewards/margins": -0.46399223804473877, "rewards/rejected": 2.188161849975586, "step": 9812 }, { "epoch": 1.59, "learning_rate": 3.1194144822064937e-07, "logits/chosen": -0.8453338146209717, "logits/rejected": -0.8658128976821899, "logps/chosen": -55.589080810546875, "logps/rejected": -39.55830383300781, "loss": 0.397, "rewards/accuracies": 0.0, "rewards/chosen": 1.3628120422363281, "rewards/margins": -0.13270115852355957, "rewards/rejected": 1.4955132007598877, "step": 9813 }, { "epoch": 1.59, "learning_rate": 3.1181967960908397e-07, "logits/chosen": -1.0741771459579468, "logits/rejected": -0.9624392986297607, "logps/chosen": -129.4481658935547, "logps/rejected": -161.38038635253906, "loss": 1.3993, "rewards/accuracies": 0.0, "rewards/chosen": 4.772019863128662, "rewards/margins": -2.65523099899292, "rewards/rejected": 7.427250862121582, "step": 9814 }, { "epoch": 1.59, "learning_rate": 3.1169792399899807e-07, "logits/chosen": -0.611834704875946, "logits/rejected": -0.6042875647544861, "logps/chosen": -77.27960205078125, "logps/rejected": -29.715347290039062, "loss": 0.5212, "rewards/accuracies": 0.0, "rewards/chosen": 0.32074663043022156, "rewards/margins": -0.4469093382358551, "rewards/rejected": 0.7676559686660767, "step": 9815 }, { "epoch": 1.59, "learning_rate": 3.115761813988038e-07, "logits/chosen": -0.36249426007270813, "logits/rejected": -0.36249426007270813, "logps/chosen": -1.1831703186035156, "logps/rejected": -1.1831703186035156, "loss": 1.1967, "rewards/accuracies": 0.0, "rewards/chosen": 0.15853416919708252, "rewards/margins": 0.0, "rewards/rejected": 0.15853416919708252, "step": 9816 }, { "epoch": 1.59, "learning_rate": 3.1145445181691267e-07, "logits/chosen": -0.9171136617660522, "logits/rejected": -0.7930718064308167, "logps/chosen": -78.3806381225586, "logps/rejected": -54.216793060302734, "loss": 0.0706, "rewards/accuracies": 1.0, "rewards/chosen": 2.348403215408325, "rewards/margins": 1.9224086999893188, "rewards/rejected": 0.42599448561668396, "step": 9817 }, { "epoch": 1.59, "learning_rate": 3.1133273526173476e-07, "logits/chosen": -0.6354447603225708, "logits/rejected": -0.6101253032684326, "logps/chosen": -126.8817367553711, "logps/rejected": -43.6192626953125, "loss": 0.1454, "rewards/accuracies": 1.0, "rewards/chosen": 4.012143611907959, "rewards/margins": 1.2781927585601807, "rewards/rejected": 2.7339508533477783, "step": 9818 }, { "epoch": 1.59, "learning_rate": 3.1121103174167976e-07, "logits/chosen": -0.3533037304878235, "logits/rejected": -0.339756041765213, "logps/chosen": -3.8842244148254395, "logps/rejected": -8.875689506530762, "loss": 0.2685, "rewards/accuracies": 1.0, "rewards/chosen": 0.3160174787044525, "rewards/margins": 0.40265601873397827, "rewards/rejected": -0.08663854748010635, "step": 9819 }, { "epoch": 1.59, "learning_rate": 3.1108934126515605e-07, "logits/chosen": -0.6230887770652771, "logits/rejected": -0.3442739248275757, "logps/chosen": -177.99044799804688, "logps/rejected": -77.568359375, "loss": 0.0629, "rewards/accuracies": 1.0, "rewards/chosen": 4.563070774078369, "rewards/margins": 3.1704018115997314, "rewards/rejected": 1.3926689624786377, "step": 9820 }, { "epoch": 1.59, "learning_rate": 3.109676638405714e-07, "logits/chosen": -1.0595135688781738, "logits/rejected": -1.0177191495895386, "logps/chosen": -89.56348419189453, "logps/rejected": -69.49148559570312, "loss": 0.9868, "rewards/accuracies": 0.0, "rewards/chosen": 1.677751898765564, "rewards/margins": -0.9548805952072144, "rewards/rejected": 2.6326324939727783, "step": 9821 }, { "epoch": 1.59, "learning_rate": 3.108459994763325e-07, "logits/chosen": -0.6964438557624817, "logits/rejected": -0.5517460703849792, "logps/chosen": -112.69113159179688, "logps/rejected": -73.5942611694336, "loss": 0.7809, "rewards/accuracies": 0.0, "rewards/chosen": 2.2816359996795654, "rewards/margins": -0.9679086208343506, "rewards/rejected": 3.249544620513916, "step": 9822 }, { "epoch": 1.59, "learning_rate": 3.107243481808452e-07, "logits/chosen": -0.8681797981262207, "logits/rejected": -0.848153293132782, "logps/chosen": -84.80632781982422, "logps/rejected": -75.57554626464844, "loss": 0.176, "rewards/accuracies": 1.0, "rewards/chosen": 3.83036732673645, "rewards/margins": 2.9848480224609375, "rewards/rejected": 0.8455192446708679, "step": 9823 }, { "epoch": 1.59, "learning_rate": 3.106027099625146e-07, "logits/chosen": -0.7639782428741455, "logits/rejected": -0.7444572448730469, "logps/chosen": -59.6797981262207, "logps/rejected": -75.57455444335938, "loss": 0.9266, "rewards/accuracies": 0.0, "rewards/chosen": 1.4817081689834595, "rewards/margins": -1.1205700635910034, "rewards/rejected": 2.602278232574463, "step": 9824 }, { "epoch": 1.59, "learning_rate": 3.1048108482974455e-07, "logits/chosen": -0.8712247014045715, "logits/rejected": -0.760471522808075, "logps/chosen": -63.820499420166016, "logps/rejected": -11.900025367736816, "loss": 0.2226, "rewards/accuracies": 1.0, "rewards/chosen": 2.2379696369171143, "rewards/margins": 1.846756100654602, "rewards/rejected": 0.3912135064601898, "step": 9825 }, { "epoch": 1.59, "learning_rate": 3.1035947279093843e-07, "logits/chosen": -0.841111958026886, "logits/rejected": -0.8328585624694824, "logps/chosen": -68.05631256103516, "logps/rejected": -78.3122329711914, "loss": 1.2544, "rewards/accuracies": 0.0, "rewards/chosen": 1.827552080154419, "rewards/margins": -0.11556315422058105, "rewards/rejected": 1.943115234375, "step": 9826 }, { "epoch": 1.6, "learning_rate": 3.102378738544983e-07, "logits/chosen": -0.8075910210609436, "logits/rejected": -0.7894760966300964, "logps/chosen": -159.02403259277344, "logps/rejected": -118.129150390625, "loss": 0.1927, "rewards/accuracies": 1.0, "rewards/chosen": 4.609890937805176, "rewards/margins": 1.533696174621582, "rewards/rejected": 3.0761947631835938, "step": 9827 }, { "epoch": 1.6, "learning_rate": 3.101162880288255e-07, "logits/chosen": -0.33084210753440857, "logits/rejected": -0.37883174419403076, "logps/chosen": -37.18361282348633, "logps/rejected": -90.78907775878906, "loss": 0.2922, "rewards/accuracies": 1.0, "rewards/chosen": 1.0354198217391968, "rewards/margins": 0.391567587852478, "rewards/rejected": 0.6438522338867188, "step": 9828 }, { "epoch": 1.6, "learning_rate": 3.099947153223205e-07, "logits/chosen": -0.7587721347808838, "logits/rejected": -0.6921864151954651, "logps/chosen": -111.94197082519531, "logps/rejected": -71.24180603027344, "loss": 0.1117, "rewards/accuracies": 1.0, "rewards/chosen": 4.804978847503662, "rewards/margins": 1.9084455966949463, "rewards/rejected": 2.896533250808716, "step": 9829 }, { "epoch": 1.6, "learning_rate": 3.0987315574338287e-07, "logits/chosen": -0.6654689311981201, "logits/rejected": -0.6746844053268433, "logps/chosen": -65.69647216796875, "logps/rejected": -45.49928283691406, "loss": 0.8357, "rewards/accuracies": 0.0, "rewards/chosen": 0.6681984066963196, "rewards/margins": -1.3845252990722656, "rewards/rejected": 2.0527236461639404, "step": 9830 }, { "epoch": 1.6, "learning_rate": 3.0975160930041113e-07, "logits/chosen": -0.6268854737281799, "logits/rejected": -0.5996697545051575, "logps/chosen": -64.75172424316406, "logps/rejected": -32.31965637207031, "loss": 0.7319, "rewards/accuracies": 0.0, "rewards/chosen": 1.1727036237716675, "rewards/margins": -0.04918015003204346, "rewards/rejected": 1.221883773803711, "step": 9831 }, { "epoch": 1.6, "learning_rate": 3.096300760018031e-07, "logits/chosen": -0.8662323355674744, "logits/rejected": -0.7760909795761108, "logps/chosen": -104.01614379882812, "logps/rejected": -19.976449966430664, "loss": 1.12, "rewards/accuracies": 1.0, "rewards/chosen": 0.7256912589073181, "rewards/margins": 0.5478391647338867, "rewards/rejected": 0.177852064371109, "step": 9832 }, { "epoch": 1.6, "learning_rate": 3.0950855585595546e-07, "logits/chosen": -0.5653033256530762, "logits/rejected": -0.6640205979347229, "logps/chosen": -83.07705688476562, "logps/rejected": -81.38983917236328, "loss": 2.0657, "rewards/accuracies": 0.0, "rewards/chosen": 1.8850144147872925, "rewards/margins": -2.8006062507629395, "rewards/rejected": 4.6856207847595215, "step": 9833 }, { "epoch": 1.6, "learning_rate": 3.093870488712642e-07, "logits/chosen": -0.5778207182884216, "logits/rejected": -0.6569164395332336, "logps/chosen": -88.31812286376953, "logps/rejected": -93.16397094726562, "loss": 2.2352, "rewards/accuracies": 0.0, "rewards/chosen": 1.7440299987792969, "rewards/margins": -0.5816514492034912, "rewards/rejected": 2.325681447982788, "step": 9834 }, { "epoch": 1.6, "learning_rate": 3.092655550561243e-07, "logits/chosen": -0.9579440951347351, "logits/rejected": -1.0489585399627686, "logps/chosen": -72.87908935546875, "logps/rejected": -106.91097259521484, "loss": 1.7449, "rewards/accuracies": 0.0, "rewards/chosen": 3.297982931137085, "rewards/margins": -3.317742109298706, "rewards/rejected": 6.615725040435791, "step": 9835 }, { "epoch": 1.6, "learning_rate": 3.0914407441892977e-07, "logits/chosen": -0.7116214036941528, "logits/rejected": -0.6727105975151062, "logps/chosen": -132.94406127929688, "logps/rejected": -61.04421615600586, "loss": 2.4568, "rewards/accuracies": 0.0, "rewards/chosen": 3.814605712890625, "rewards/margins": -0.012024402618408203, "rewards/rejected": 3.826630115509033, "step": 9836 }, { "epoch": 1.6, "learning_rate": 3.0902260696807377e-07, "logits/chosen": -0.6275418400764465, "logits/rejected": -0.5407175421714783, "logps/chosen": -56.833221435546875, "logps/rejected": -56.58897399902344, "loss": 1.8649, "rewards/accuracies": 1.0, "rewards/chosen": 1.973017930984497, "rewards/margins": 1.5516788959503174, "rewards/rejected": 0.4213390350341797, "step": 9837 }, { "epoch": 1.6, "learning_rate": 3.089011527119486e-07, "logits/chosen": -0.6111361384391785, "logits/rejected": -0.6111361384391785, "logps/chosen": -61.99564743041992, "logps/rejected": -61.99564743041992, "loss": 0.7278, "rewards/accuracies": 0.0, "rewards/chosen": 2.5328052043914795, "rewards/margins": 0.0, "rewards/rejected": 2.5328052043914795, "step": 9838 }, { "epoch": 1.6, "learning_rate": 3.087797116589456e-07, "logits/chosen": -0.4542738199234009, "logits/rejected": -0.19083969295024872, "logps/chosen": -71.65117645263672, "logps/rejected": -52.006553649902344, "loss": 0.2373, "rewards/accuracies": 1.0, "rewards/chosen": 1.8369499444961548, "rewards/margins": 1.9294995069503784, "rewards/rejected": -0.09254951775074005, "step": 9839 }, { "epoch": 1.6, "learning_rate": 3.086582838174551e-07, "logits/chosen": -0.6851570010185242, "logits/rejected": -0.6241297721862793, "logps/chosen": -109.96102905273438, "logps/rejected": -112.7222671508789, "loss": 0.4503, "rewards/accuracies": 1.0, "rewards/chosen": 2.6201157569885254, "rewards/margins": 0.49060606956481934, "rewards/rejected": 2.129509687423706, "step": 9840 }, { "epoch": 1.6, "learning_rate": 3.085368691958668e-07, "logits/chosen": -0.8868876099586487, "logits/rejected": -0.8666558265686035, "logps/chosen": -73.71145629882812, "logps/rejected": -101.01992797851562, "loss": 0.7404, "rewards/accuracies": 1.0, "rewards/chosen": 3.1118125915527344, "rewards/margins": 0.5857527256011963, "rewards/rejected": 2.526059865951538, "step": 9841 }, { "epoch": 1.6, "learning_rate": 3.0841546780256914e-07, "logits/chosen": -0.6066539883613586, "logits/rejected": -0.49430403113365173, "logps/chosen": -33.4097900390625, "logps/rejected": -15.618873596191406, "loss": 0.7183, "rewards/accuracies": 1.0, "rewards/chosen": 1.522385835647583, "rewards/margins": 0.6150867938995361, "rewards/rejected": 0.9072990417480469, "step": 9842 }, { "epoch": 1.6, "learning_rate": 3.082940796459499e-07, "logits/chosen": -0.7828246355056763, "logits/rejected": -0.6959588527679443, "logps/chosen": -55.348323822021484, "logps/rejected": -91.48792266845703, "loss": 1.4657, "rewards/accuracies": 0.0, "rewards/chosen": 2.041869878768921, "rewards/margins": -0.7121775150299072, "rewards/rejected": 2.754047393798828, "step": 9843 }, { "epoch": 1.6, "learning_rate": 3.0817270473439583e-07, "logits/chosen": -0.44593581557273865, "logits/rejected": -0.42696091532707214, "logps/chosen": -19.918180465698242, "logps/rejected": -3.233711004257202, "loss": 0.6305, "rewards/accuracies": 0.0, "rewards/chosen": 0.1145872101187706, "rewards/margins": -0.1440686583518982, "rewards/rejected": 0.2586558759212494, "step": 9844 }, { "epoch": 1.6, "learning_rate": 3.0805134307629274e-07, "logits/chosen": -0.3745267987251282, "logits/rejected": -0.3330373466014862, "logps/chosen": -66.55464172363281, "logps/rejected": -78.03990936279297, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 1.0432723760604858, "rewards/margins": 0.9376556277275085, "rewards/rejected": 0.10561676323413849, "step": 9845 }, { "epoch": 1.6, "learning_rate": 3.0792999468002564e-07, "logits/chosen": -0.4475511312484741, "logits/rejected": -0.3252379298210144, "logps/chosen": -59.826969146728516, "logps/rejected": -16.391508102416992, "loss": 0.4539, "rewards/accuracies": 1.0, "rewards/chosen": 1.7050670385360718, "rewards/margins": 1.4641813039779663, "rewards/rejected": 0.24088573455810547, "step": 9846 }, { "epoch": 1.6, "learning_rate": 3.078086595539785e-07, "logits/chosen": -0.8346292972564697, "logits/rejected": -0.732801616191864, "logps/chosen": -61.329612731933594, "logps/rejected": -18.33761215209961, "loss": 0.0879, "rewards/accuracies": 1.0, "rewards/chosen": 2.194265127182007, "rewards/margins": 1.7094517946243286, "rewards/rejected": 0.48481330275535583, "step": 9847 }, { "epoch": 1.6, "learning_rate": 3.0768733770653443e-07, "logits/chosen": -0.7021794319152832, "logits/rejected": -0.6745676398277283, "logps/chosen": -44.7171745300293, "logps/rejected": -84.77284240722656, "loss": 0.9122, "rewards/accuracies": 0.0, "rewards/chosen": 2.215911626815796, "rewards/margins": -1.2383975982666016, "rewards/rejected": 3.4543092250823975, "step": 9848 }, { "epoch": 1.6, "learning_rate": 3.0756602914607564e-07, "logits/chosen": -0.8060759902000427, "logits/rejected": -0.5455234050750732, "logps/chosen": -85.51155853271484, "logps/rejected": -19.664958953857422, "loss": 1.0397, "rewards/accuracies": 1.0, "rewards/chosen": 5.71776819229126, "rewards/margins": 5.039053440093994, "rewards/rejected": 0.6787145733833313, "step": 9849 }, { "epoch": 1.6, "learning_rate": 3.074447338809835e-07, "logits/chosen": -0.8587360382080078, "logits/rejected": -0.8600072264671326, "logps/chosen": -171.33660888671875, "logps/rejected": -49.45248031616211, "loss": 1.3963, "rewards/accuracies": 1.0, "rewards/chosen": 4.889105319976807, "rewards/margins": 1.9190022945404053, "rewards/rejected": 2.9701030254364014, "step": 9850 }, { "epoch": 1.6, "learning_rate": 3.0732345191963823e-07, "logits/chosen": -0.5874383449554443, "logits/rejected": -0.4618794620037079, "logps/chosen": -45.81840133666992, "logps/rejected": -7.17061710357666, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": 2.159330368041992, "rewards/margins": 1.216951847076416, "rewards/rejected": 0.9423785209655762, "step": 9851 }, { "epoch": 1.6, "learning_rate": 3.0720218327041927e-07, "logits/chosen": -1.3715972900390625, "logits/rejected": -1.345745325088501, "logps/chosen": -67.30885314941406, "logps/rejected": -36.308109283447266, "loss": 0.4302, "rewards/accuracies": 1.0, "rewards/chosen": 2.0204010009765625, "rewards/margins": 1.7035919427871704, "rewards/rejected": 0.3168090879917145, "step": 9852 }, { "epoch": 1.6, "learning_rate": 3.070809279417052e-07, "logits/chosen": -0.6989374756813049, "logits/rejected": -0.6989374756813049, "logps/chosen": -36.206512451171875, "logps/rejected": -36.206512451171875, "loss": 0.7341, "rewards/accuracies": 0.0, "rewards/chosen": 1.7619072198867798, "rewards/margins": 0.0, "rewards/rejected": 1.7619072198867798, "step": 9853 }, { "epoch": 1.6, "learning_rate": 3.069596859418736e-07, "logits/chosen": -0.7291626334190369, "logits/rejected": -0.678138256072998, "logps/chosen": -66.32477569580078, "logps/rejected": -56.69677734375, "loss": 1.1548, "rewards/accuracies": 0.0, "rewards/chosen": 1.2287758588790894, "rewards/margins": -0.6103324890136719, "rewards/rejected": 1.8391083478927612, "step": 9854 }, { "epoch": 1.6, "learning_rate": 3.0683845727930115e-07, "logits/chosen": -1.0314323902130127, "logits/rejected": -0.9634645581245422, "logps/chosen": -95.90911865234375, "logps/rejected": -52.47002410888672, "loss": 0.1017, "rewards/accuracies": 1.0, "rewards/chosen": 3.9735991954803467, "rewards/margins": 1.8110694885253906, "rewards/rejected": 2.162529706954956, "step": 9855 }, { "epoch": 1.6, "learning_rate": 3.067172419623636e-07, "logits/chosen": -0.806842029094696, "logits/rejected": -0.8055052161216736, "logps/chosen": -68.61215209960938, "logps/rejected": -106.14727783203125, "loss": 0.6065, "rewards/accuracies": 0.0, "rewards/chosen": 1.7084823846817017, "rewards/margins": -0.8470994234085083, "rewards/rejected": 2.55558180809021, "step": 9856 }, { "epoch": 1.6, "learning_rate": 3.065960399994357e-07, "logits/chosen": -0.3166447877883911, "logits/rejected": -0.3166447877883911, "logps/chosen": -29.788909912109375, "logps/rejected": -29.788909912109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.9322575330734253, "rewards/margins": 0.0, "rewards/rejected": 1.9322575330734253, "step": 9857 }, { "epoch": 1.6, "learning_rate": 3.064748513988914e-07, "logits/chosen": -0.44131338596343994, "logits/rejected": -0.4474228024482727, "logps/chosen": -5.464924335479736, "logps/rejected": -2.989004611968994, "loss": 0.3651, "rewards/accuracies": 1.0, "rewards/chosen": 0.4823133051395416, "rewards/margins": 0.14056792855262756, "rewards/rejected": 0.34174537658691406, "step": 9858 }, { "epoch": 1.6, "learning_rate": 3.063536761691038e-07, "logits/chosen": -0.5097739100456238, "logits/rejected": -0.5009030103683472, "logps/chosen": -76.77840423583984, "logps/rejected": -129.7978973388672, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 1.8689804077148438, "rewards/margins": 0.7647857666015625, "rewards/rejected": 1.1041946411132812, "step": 9859 }, { "epoch": 1.6, "learning_rate": 3.0623251431844487e-07, "logits/chosen": -0.6907743215560913, "logits/rejected": -0.6658774614334106, "logps/chosen": -129.59796142578125, "logps/rejected": -37.00690460205078, "loss": 0.2502, "rewards/accuracies": 1.0, "rewards/chosen": 0.931262195110321, "rewards/margins": 0.7414283752441406, "rewards/rejected": 0.1898338347673416, "step": 9860 }, { "epoch": 1.6, "learning_rate": 3.0611136585528576e-07, "logits/chosen": -1.0894137620925903, "logits/rejected": -1.0892224311828613, "logps/chosen": -139.110595703125, "logps/rejected": -52.67494201660156, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 3.824392795562744, "rewards/margins": 0.9287223815917969, "rewards/rejected": 2.8956704139709473, "step": 9861 }, { "epoch": 1.6, "learning_rate": 3.059902307879967e-07, "logits/chosen": -0.3633570373058319, "logits/rejected": -0.49352508783340454, "logps/chosen": -75.34125518798828, "logps/rejected": -157.53793334960938, "loss": 1.5614, "rewards/accuracies": 0.0, "rewards/chosen": 1.4609062671661377, "rewards/margins": -2.9974114894866943, "rewards/rejected": 4.458317756652832, "step": 9862 }, { "epoch": 1.6, "learning_rate": 3.058691091249469e-07, "logits/chosen": -0.6914677023887634, "logits/rejected": -0.5181968212127686, "logps/chosen": -71.7132568359375, "logps/rejected": -36.480491638183594, "loss": 0.215, "rewards/accuracies": 1.0, "rewards/chosen": 0.6253463625907898, "rewards/margins": 0.6419227719306946, "rewards/rejected": -0.016576385125517845, "step": 9863 }, { "epoch": 1.6, "learning_rate": 3.057480008745048e-07, "logits/chosen": -0.6818872094154358, "logits/rejected": -0.5776188969612122, "logps/chosen": -41.41188049316406, "logps/rejected": -29.982643127441406, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 1.2331764698028564, "rewards/margins": 0.45179903507232666, "rewards/rejected": 0.7813774347305298, "step": 9864 }, { "epoch": 1.6, "learning_rate": 3.056269060450378e-07, "logits/chosen": -0.7131146192550659, "logits/rejected": -0.8054355978965759, "logps/chosen": -71.2347412109375, "logps/rejected": -133.2960205078125, "loss": 3.2364, "rewards/accuracies": 0.0, "rewards/chosen": 2.041429281234741, "rewards/margins": -2.682203531265259, "rewards/rejected": 4.7236328125, "step": 9865 }, { "epoch": 1.6, "learning_rate": 3.0550582464491244e-07, "logits/chosen": -0.8007116913795471, "logits/rejected": -0.7124674320220947, "logps/chosen": -196.65850830078125, "logps/rejected": -61.988426208496094, "loss": 0.0481, "rewards/accuracies": 1.0, "rewards/chosen": 5.08278226852417, "rewards/margins": 2.3484225273132324, "rewards/rejected": 2.7343597412109375, "step": 9866 }, { "epoch": 1.6, "learning_rate": 3.0538475668249425e-07, "logits/chosen": -0.6872118711471558, "logits/rejected": -0.7759221196174622, "logps/chosen": -90.464111328125, "logps/rejected": -107.33397674560547, "loss": 1.4578, "rewards/accuracies": 0.0, "rewards/chosen": 1.3181747198104858, "rewards/margins": -2.6296792030334473, "rewards/rejected": 3.9478538036346436, "step": 9867 }, { "epoch": 1.6, "learning_rate": 3.0526370216614795e-07, "logits/chosen": -0.9602193832397461, "logits/rejected": -0.9239477515220642, "logps/chosen": -94.81919860839844, "logps/rejected": -77.93250274658203, "loss": 1.2731, "rewards/accuracies": 0.0, "rewards/chosen": 1.2111084461212158, "rewards/margins": -0.3392035961151123, "rewards/rejected": 1.5503120422363281, "step": 9868 }, { "epoch": 1.6, "learning_rate": 3.0514266110423715e-07, "logits/chosen": -0.4140523374080658, "logits/rejected": -0.3757815659046173, "logps/chosen": -138.11912536621094, "logps/rejected": -59.024925231933594, "loss": 0.247, "rewards/accuracies": 1.0, "rewards/chosen": 1.6803970336914062, "rewards/margins": 0.49276649951934814, "rewards/rejected": 1.187630534172058, "step": 9869 }, { "epoch": 1.6, "learning_rate": 3.0502163350512477e-07, "logits/chosen": -0.7201984524726868, "logits/rejected": -0.7233955264091492, "logps/chosen": -56.14469909667969, "logps/rejected": -88.07074737548828, "loss": 0.7458, "rewards/accuracies": 1.0, "rewards/chosen": 2.023301839828491, "rewards/margins": 0.9114793539047241, "rewards/rejected": 1.111822485923767, "step": 9870 }, { "epoch": 1.6, "learning_rate": 3.049006193771726e-07, "logits/chosen": -0.7787157893180847, "logits/rejected": -0.7048239707946777, "logps/chosen": -106.84439086914062, "logps/rejected": -45.5515022277832, "loss": 0.7574, "rewards/accuracies": 1.0, "rewards/chosen": 3.887493848800659, "rewards/margins": 2.642613410949707, "rewards/rejected": 1.2448803186416626, "step": 9871 }, { "epoch": 1.6, "learning_rate": 3.0477961872874156e-07, "logits/chosen": -1.065091609954834, "logits/rejected": -1.008726716041565, "logps/chosen": -64.24785614013672, "logps/rejected": -89.71221923828125, "loss": 0.3716, "rewards/accuracies": 1.0, "rewards/chosen": 1.3901199102401733, "rewards/margins": 0.3375556468963623, "rewards/rejected": 1.052564263343811, "step": 9872 }, { "epoch": 1.6, "learning_rate": 3.046586315681917e-07, "logits/chosen": -0.5650554299354553, "logits/rejected": -0.5990949273109436, "logps/chosen": -40.44987487792969, "logps/rejected": -45.250274658203125, "loss": 0.6722, "rewards/accuracies": 0.0, "rewards/chosen": 2.3473098278045654, "rewards/margins": -0.07238245010375977, "rewards/rejected": 2.419692277908325, "step": 9873 }, { "epoch": 1.6, "learning_rate": 3.0453765790388206e-07, "logits/chosen": -0.5043092370033264, "logits/rejected": -0.5670949220657349, "logps/chosen": -64.33641815185547, "logps/rejected": -76.7140121459961, "loss": 2.1392, "rewards/accuracies": 0.0, "rewards/chosen": 1.5698661804199219, "rewards/margins": -3.5990052223205566, "rewards/rejected": 5.1688714027404785, "step": 9874 }, { "epoch": 1.6, "learning_rate": 3.044166977441708e-07, "logits/chosen": -0.888364315032959, "logits/rejected": -0.7758670449256897, "logps/chosen": -75.8617935180664, "logps/rejected": -19.75817108154297, "loss": 0.88, "rewards/accuracies": 1.0, "rewards/chosen": 2.6095802783966064, "rewards/margins": 2.266895294189453, "rewards/rejected": 0.34268495440483093, "step": 9875 }, { "epoch": 1.6, "learning_rate": 3.04295751097415e-07, "logits/chosen": -0.8506045341491699, "logits/rejected": -0.7380548119544983, "logps/chosen": -56.31085968017578, "logps/rejected": -100.35480499267578, "loss": 2.6689, "rewards/accuracies": 1.0, "rewards/chosen": 3.8250832557678223, "rewards/margins": 1.5450599193572998, "rewards/rejected": 2.2800233364105225, "step": 9876 }, { "epoch": 1.6, "learning_rate": 3.0417481797197116e-07, "logits/chosen": -0.8769862651824951, "logits/rejected": -0.8600117564201355, "logps/chosen": -52.415306091308594, "logps/rejected": -71.11012268066406, "loss": 0.2349, "rewards/accuracies": 1.0, "rewards/chosen": 2.4107613563537598, "rewards/margins": 0.6423256397247314, "rewards/rejected": 1.7684357166290283, "step": 9877 }, { "epoch": 1.6, "learning_rate": 3.040538983761944e-07, "logits/chosen": -0.7857506275177002, "logits/rejected": -0.6713311076164246, "logps/chosen": -103.4896011352539, "logps/rejected": -137.92559814453125, "loss": 2.0905, "rewards/accuracies": 0.0, "rewards/chosen": 3.1528375148773193, "rewards/margins": -2.1294867992401123, "rewards/rejected": 5.282324314117432, "step": 9878 }, { "epoch": 1.6, "learning_rate": 3.0393299231843923e-07, "logits/chosen": -0.8854429125785828, "logits/rejected": -0.8801283836364746, "logps/chosen": -108.37821960449219, "logps/rejected": -93.82778930664062, "loss": 1.9112, "rewards/accuracies": 0.0, "rewards/chosen": 1.178796410560608, "rewards/margins": -3.79681396484375, "rewards/rejected": 4.975610256195068, "step": 9879 }, { "epoch": 1.6, "learning_rate": 3.038120998070591e-07, "logits/chosen": -0.7154515385627747, "logits/rejected": -0.6716963052749634, "logps/chosen": -129.8834991455078, "logps/rejected": -95.93998718261719, "loss": 0.9974, "rewards/accuracies": 0.0, "rewards/chosen": -0.212443545460701, "rewards/margins": -1.8186218738555908, "rewards/rejected": 1.6061782836914062, "step": 9880 }, { "epoch": 1.6, "learning_rate": 3.036912208504065e-07, "logits/chosen": -0.7823755741119385, "logits/rejected": -0.7672073841094971, "logps/chosen": -93.11178588867188, "logps/rejected": -83.69512939453125, "loss": 0.6817, "rewards/accuracies": 0.0, "rewards/chosen": 1.5136383771896362, "rewards/margins": -0.7392104864120483, "rewards/rejected": 2.2528488636016846, "step": 9881 }, { "epoch": 1.6, "learning_rate": 3.0357035545683306e-07, "logits/chosen": -0.4091721475124359, "logits/rejected": -0.4091721475124359, "logps/chosen": -63.619842529296875, "logps/rejected": -63.619842529296875, "loss": 1.5951, "rewards/accuracies": 0.0, "rewards/chosen": 0.9691482782363892, "rewards/margins": 0.0, "rewards/rejected": 0.9691482782363892, "step": 9882 }, { "epoch": 1.6, "learning_rate": 3.034495036346894e-07, "logits/chosen": -0.16887637972831726, "logits/rejected": -0.1605224758386612, "logps/chosen": -1.6395210027694702, "logps/rejected": -8.049467086791992, "loss": 0.3973, "rewards/accuracies": 0.0, "rewards/chosen": 0.31613802909851074, "rewards/margins": -0.08048388361930847, "rewards/rejected": 0.3966219127178192, "step": 9883 }, { "epoch": 1.6, "learning_rate": 3.0332866539232526e-07, "logits/chosen": -0.8482654094696045, "logits/rejected": -0.7709159851074219, "logps/chosen": -172.08096313476562, "logps/rejected": -47.99885940551758, "loss": 0.1798, "rewards/accuracies": 1.0, "rewards/chosen": 5.448050022125244, "rewards/margins": 3.0691487789154053, "rewards/rejected": 2.378901243209839, "step": 9884 }, { "epoch": 1.6, "learning_rate": 3.0320784073808945e-07, "logits/chosen": -0.9877065420150757, "logits/rejected": -0.8820340633392334, "logps/chosen": -96.42589569091797, "logps/rejected": -21.535375595092773, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": 5.8722453117370605, "rewards/margins": 5.472794055938721, "rewards/rejected": 0.39945146441459656, "step": 9885 }, { "epoch": 1.6, "learning_rate": 3.030870296803298e-07, "logits/chosen": -0.7713117003440857, "logits/rejected": -0.814481794834137, "logps/chosen": -217.7503662109375, "logps/rejected": -179.9859161376953, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 5.108676433563232, "rewards/margins": 3.449241876602173, "rewards/rejected": 1.6594345569610596, "step": 9886 }, { "epoch": 1.6, "learning_rate": 3.0296623222739315e-07, "logits/chosen": -0.9110997915267944, "logits/rejected": -0.8123533129692078, "logps/chosen": -114.94027709960938, "logps/rejected": -90.6138916015625, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 6.082228183746338, "rewards/margins": 0.7815155982971191, "rewards/rejected": 5.300712585449219, "step": 9887 }, { "epoch": 1.6, "learning_rate": 3.028454483876255e-07, "logits/chosen": -0.7630314230918884, "logits/rejected": -0.7884917259216309, "logps/chosen": -94.08866882324219, "logps/rejected": -80.9317626953125, "loss": 0.769, "rewards/accuracies": 0.0, "rewards/chosen": 1.7198837995529175, "rewards/margins": -0.31134116649627686, "rewards/rejected": 2.0312249660491943, "step": 9888 }, { "epoch": 1.61, "learning_rate": 3.027246781693719e-07, "logits/chosen": -0.7183974981307983, "logits/rejected": -0.7253674268722534, "logps/chosen": -112.02174377441406, "logps/rejected": -82.61090087890625, "loss": 1.4151, "rewards/accuracies": 0.0, "rewards/chosen": 0.2281791716814041, "rewards/margins": -2.6195082664489746, "rewards/rejected": 2.8476874828338623, "step": 9889 }, { "epoch": 1.61, "learning_rate": 3.0260392158097636e-07, "logits/chosen": -0.9630552530288696, "logits/rejected": -0.9971378445625305, "logps/chosen": -77.14082336425781, "logps/rejected": -74.95427703857422, "loss": 1.4633, "rewards/accuracies": 0.0, "rewards/chosen": 1.985634684562683, "rewards/margins": -0.45793449878692627, "rewards/rejected": 2.4435691833496094, "step": 9890 }, { "epoch": 1.61, "learning_rate": 3.024831786307821e-07, "logits/chosen": -0.7693695425987244, "logits/rejected": -0.7693695425987244, "logps/chosen": -26.29080581665039, "logps/rejected": -26.29080581665039, "loss": 0.4354, "rewards/accuracies": 0.0, "rewards/chosen": 1.1963661909103394, "rewards/margins": 0.0, "rewards/rejected": 1.1963661909103394, "step": 9891 }, { "epoch": 1.61, "learning_rate": 3.023624493271313e-07, "logits/chosen": -0.6358500123023987, "logits/rejected": -0.5886071920394897, "logps/chosen": -228.2996826171875, "logps/rejected": -29.510456085205078, "loss": 0.2928, "rewards/accuracies": 1.0, "rewards/chosen": 4.546475410461426, "rewards/margins": 3.537034034729004, "rewards/rejected": 1.0094413757324219, "step": 9892 }, { "epoch": 1.61, "learning_rate": 3.0224173367836515e-07, "logits/chosen": -0.8472071290016174, "logits/rejected": -0.9193685054779053, "logps/chosen": -165.61026000976562, "logps/rejected": -216.10464477539062, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 5.602027893066406, "rewards/margins": -0.9449782371520996, "rewards/rejected": 6.547006130218506, "step": 9893 }, { "epoch": 1.61, "learning_rate": 3.021210316928241e-07, "logits/chosen": -0.9573767781257629, "logits/rejected": -0.9384980201721191, "logps/chosen": -54.50839614868164, "logps/rejected": -46.86589050292969, "loss": 0.9457, "rewards/accuracies": 0.0, "rewards/chosen": 1.5951794385910034, "rewards/margins": -0.43629252910614014, "rewards/rejected": 2.0314719676971436, "step": 9894 }, { "epoch": 1.61, "learning_rate": 3.020003433788473e-07, "logits/chosen": -0.5388076901435852, "logits/rejected": -0.6030086278915405, "logps/chosen": -56.28948211669922, "logps/rejected": -102.92704772949219, "loss": 0.1647, "rewards/accuracies": 1.0, "rewards/chosen": 1.130427598953247, "rewards/margins": 1.110456943511963, "rewards/rejected": 0.01997070387005806, "step": 9895 }, { "epoch": 1.61, "learning_rate": 3.018796687447733e-07, "logits/chosen": -1.127361536026001, "logits/rejected": -1.1619936227798462, "logps/chosen": -200.27357482910156, "logps/rejected": -183.42950439453125, "loss": 1.3379, "rewards/accuracies": 0.0, "rewards/chosen": 4.955499172210693, "rewards/margins": -2.385040283203125, "rewards/rejected": 7.340539455413818, "step": 9896 }, { "epoch": 1.61, "learning_rate": 3.0175900779893955e-07, "logits/chosen": -0.6085605621337891, "logits/rejected": -0.5570538640022278, "logps/chosen": -46.315155029296875, "logps/rejected": -41.22442626953125, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 1.9155876636505127, "rewards/margins": 0.02472841739654541, "rewards/rejected": 1.8908592462539673, "step": 9897 }, { "epoch": 1.61, "learning_rate": 3.0163836054968264e-07, "logits/chosen": -0.564928412437439, "logits/rejected": -0.564928412437439, "logps/chosen": -52.29265213012695, "logps/rejected": -52.29265213012695, "loss": 0.6509, "rewards/accuracies": 0.0, "rewards/chosen": 0.45635986328125, "rewards/margins": 0.0, "rewards/rejected": 0.45635986328125, "step": 9898 }, { "epoch": 1.61, "learning_rate": 3.01517727005338e-07, "logits/chosen": -0.9973524808883667, "logits/rejected": -1.023980736732483, "logps/chosen": -101.99400329589844, "logps/rejected": -117.27322387695312, "loss": 2.758, "rewards/accuracies": 0.0, "rewards/chosen": 0.7497268915176392, "rewards/margins": -2.224539279937744, "rewards/rejected": 2.9742660522460938, "step": 9899 }, { "epoch": 1.61, "learning_rate": 3.0139710717424037e-07, "logits/chosen": -0.7487260103225708, "logits/rejected": -0.8950117826461792, "logps/chosen": -66.02215576171875, "logps/rejected": -114.0584487915039, "loss": 1.6481, "rewards/accuracies": 0.0, "rewards/chosen": 2.065605878829956, "rewards/margins": -2.7535598278045654, "rewards/rejected": 4.8191657066345215, "step": 9900 }, { "epoch": 1.61, "learning_rate": 3.012765010647235e-07, "logits/chosen": -0.5003185272216797, "logits/rejected": -0.5066291689872742, "logps/chosen": -3.0339813232421875, "logps/rejected": -2.093897819519043, "loss": 0.4983, "rewards/accuracies": 0.0, "rewards/chosen": 0.13257785141468048, "rewards/margins": -0.15527920424938202, "rewards/rejected": 0.2878570556640625, "step": 9901 }, { "epoch": 1.61, "learning_rate": 3.0115590868512007e-07, "logits/chosen": -0.8120998740196228, "logits/rejected": -0.7948075532913208, "logps/chosen": -67.39262390136719, "logps/rejected": -78.33160400390625, "loss": 0.2625, "rewards/accuracies": 1.0, "rewards/chosen": 1.9345321655273438, "rewards/margins": 0.37215423583984375, "rewards/rejected": 1.5623779296875, "step": 9902 }, { "epoch": 1.61, "learning_rate": 3.010353300437618e-07, "logits/chosen": -0.9743339419364929, "logits/rejected": -0.9596342444419861, "logps/chosen": -76.3189468383789, "logps/rejected": -99.22412872314453, "loss": 2.7592, "rewards/accuracies": 0.0, "rewards/chosen": 0.7005974054336548, "rewards/margins": -1.6740723848342896, "rewards/rejected": 2.3746697902679443, "step": 9903 }, { "epoch": 1.61, "learning_rate": 3.0091476514897974e-07, "logits/chosen": -0.4620089828968048, "logits/rejected": -0.39323338866233826, "logps/chosen": -36.831451416015625, "logps/rejected": -46.967315673828125, "loss": 0.2834, "rewards/accuracies": 1.0, "rewards/chosen": 1.885809302330017, "rewards/margins": 0.7639586925506592, "rewards/rejected": 1.121850609779358, "step": 9904 }, { "epoch": 1.61, "learning_rate": 3.0079421400910356e-07, "logits/chosen": -0.5780776739120483, "logits/rejected": -0.518943190574646, "logps/chosen": -66.4604263305664, "logps/rejected": -90.12691497802734, "loss": 0.2581, "rewards/accuracies": 1.0, "rewards/chosen": 1.3371437788009644, "rewards/margins": 0.4668564200401306, "rewards/rejected": 0.8702873587608337, "step": 9905 }, { "epoch": 1.61, "learning_rate": 3.0067367663246224e-07, "logits/chosen": -0.5081509351730347, "logits/rejected": -0.5241789817810059, "logps/chosen": -5.2955522537231445, "logps/rejected": -1.8622174263000488, "loss": 0.5522, "rewards/accuracies": 0.0, "rewards/chosen": 0.38870278000831604, "rewards/margins": -0.22931143641471863, "rewards/rejected": 0.6180142164230347, "step": 9906 }, { "epoch": 1.61, "learning_rate": 3.0055315302738386e-07, "logits/chosen": -0.9314165711402893, "logits/rejected": -0.9535596966743469, "logps/chosen": -69.52351379394531, "logps/rejected": -118.02711486816406, "loss": 0.3673, "rewards/accuracies": 0.0, "rewards/chosen": 1.122449517250061, "rewards/margins": -0.07541882991790771, "rewards/rejected": 1.1978683471679688, "step": 9907 }, { "epoch": 1.61, "learning_rate": 3.004326432021954e-07, "logits/chosen": -0.4985761344432831, "logits/rejected": -0.5587892532348633, "logps/chosen": -63.293975830078125, "logps/rejected": -70.22193908691406, "loss": 0.6501, "rewards/accuracies": 0.0, "rewards/chosen": 1.166235327720642, "rewards/margins": -0.6135727167129517, "rewards/rejected": 1.7798080444335938, "step": 9908 }, { "epoch": 1.61, "learning_rate": 3.00312147165223e-07, "logits/chosen": -0.629132866859436, "logits/rejected": -0.6392688751220703, "logps/chosen": -26.33688735961914, "logps/rejected": -25.701202392578125, "loss": 0.6301, "rewards/accuracies": 0.0, "rewards/chosen": 0.3577585220336914, "rewards/margins": -0.17841434478759766, "rewards/rejected": 0.5361728668212891, "step": 9909 }, { "epoch": 1.61, "learning_rate": 3.0019166492479183e-07, "logits/chosen": -0.3916090428829193, "logits/rejected": -0.3897196054458618, "logps/chosen": -47.75514602661133, "logps/rejected": -19.071794509887695, "loss": 0.4761, "rewards/accuracies": 0.0, "rewards/chosen": 0.28149834275245667, "rewards/margins": -0.04872322082519531, "rewards/rejected": 0.330221563577652, "step": 9910 }, { "epoch": 1.61, "learning_rate": 3.0007119648922596e-07, "logits/chosen": -0.4953712522983551, "logits/rejected": -0.4953712522983551, "logps/chosen": -11.933576583862305, "logps/rejected": -11.933576583862305, "loss": 0.5008, "rewards/accuracies": 0.0, "rewards/chosen": 0.47614631056785583, "rewards/margins": 0.0, "rewards/rejected": 0.47614631056785583, "step": 9911 }, { "epoch": 1.61, "learning_rate": 2.9995074186684867e-07, "logits/chosen": -0.7522302865982056, "logits/rejected": -0.6055619716644287, "logps/chosen": -91.69972229003906, "logps/rejected": -30.439699172973633, "loss": 0.0959, "rewards/accuracies": 1.0, "rewards/chosen": 2.6993255615234375, "rewards/margins": 2.1629319190979004, "rewards/rejected": 0.5363935828208923, "step": 9912 }, { "epoch": 1.61, "learning_rate": 2.9983030106598216e-07, "logits/chosen": -0.6445494294166565, "logits/rejected": -0.5571755170822144, "logps/chosen": -77.72332763671875, "logps/rejected": -31.621915817260742, "loss": 0.8092, "rewards/accuracies": 0.0, "rewards/chosen": 0.8093323111534119, "rewards/margins": -0.30279868841171265, "rewards/rejected": 1.1121309995651245, "step": 9913 }, { "epoch": 1.61, "learning_rate": 2.997098740949478e-07, "logits/chosen": -0.4191531538963318, "logits/rejected": -0.419883668422699, "logps/chosen": -3.1010091304779053, "logps/rejected": -1.3075460195541382, "loss": 0.6652, "rewards/accuracies": 0.0, "rewards/chosen": 0.30230996012687683, "rewards/margins": -0.008295655250549316, "rewards/rejected": 0.31060561537742615, "step": 9914 }, { "epoch": 1.61, "learning_rate": 2.9958946096206605e-07, "logits/chosen": -0.5672311186790466, "logits/rejected": -0.4053362309932709, "logps/chosen": -58.34503173828125, "logps/rejected": -53.128196716308594, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 2.1428780555725098, "rewards/margins": 0.5252617597579956, "rewards/rejected": 1.6176162958145142, "step": 9915 }, { "epoch": 1.61, "learning_rate": 2.9946906167565627e-07, "logits/chosen": -0.6906325221061707, "logits/rejected": -0.6218380928039551, "logps/chosen": -70.83169555664062, "logps/rejected": -82.99044799804688, "loss": 1.8739, "rewards/accuracies": 0.0, "rewards/chosen": 1.9077484607696533, "rewards/margins": -0.9935469627380371, "rewards/rejected": 2.9012954235076904, "step": 9916 }, { "epoch": 1.61, "learning_rate": 2.9934867624403687e-07, "logits/chosen": -1.1111693382263184, "logits/rejected": -1.0831291675567627, "logps/chosen": -141.10382080078125, "logps/rejected": -107.90252685546875, "loss": 1.3168, "rewards/accuracies": 0.0, "rewards/chosen": 1.1939254999160767, "rewards/margins": -1.065388560295105, "rewards/rejected": 2.2593140602111816, "step": 9917 }, { "epoch": 1.61, "learning_rate": 2.992283046755254e-07, "logits/chosen": -0.6688422560691833, "logits/rejected": -0.6418817043304443, "logps/chosen": -2.5422401428222656, "logps/rejected": -38.89695358276367, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.4702194631099701, "rewards/margins": 0.3483665883541107, "rewards/rejected": 0.12185287475585938, "step": 9918 }, { "epoch": 1.61, "learning_rate": 2.991079469784383e-07, "logits/chosen": -0.5992823839187622, "logits/rejected": -0.5521842241287231, "logps/chosen": -39.92877197265625, "logps/rejected": -63.8490104675293, "loss": 1.1349, "rewards/accuracies": 0.0, "rewards/chosen": 1.6874916553497314, "rewards/margins": -1.48429536819458, "rewards/rejected": 3.1717870235443115, "step": 9919 }, { "epoch": 1.61, "learning_rate": 2.9898760316109115e-07, "logits/chosen": -0.4742453992366791, "logits/rejected": -0.45232731103897095, "logps/chosen": -104.8253402709961, "logps/rejected": -91.75965118408203, "loss": 0.6116, "rewards/accuracies": 0.0, "rewards/chosen": 2.6525979042053223, "rewards/margins": -0.7660796642303467, "rewards/rejected": 3.418677568435669, "step": 9920 }, { "epoch": 1.61, "learning_rate": 2.988672732317987e-07, "logits/chosen": -1.0136181116104126, "logits/rejected": -0.7158359885215759, "logps/chosen": -129.40274047851562, "logps/rejected": -49.089393615722656, "loss": 0.0261, "rewards/accuracies": 1.0, "rewards/chosen": 5.374022006988525, "rewards/margins": 3.811429500579834, "rewards/rejected": 1.5625923871994019, "step": 9921 }, { "epoch": 1.61, "learning_rate": 2.987469571988746e-07, "logits/chosen": -0.548223078250885, "logits/rejected": -0.6173897385597229, "logps/chosen": -80.95862579345703, "logps/rejected": -83.50578308105469, "loss": 0.533, "rewards/accuracies": 1.0, "rewards/chosen": 3.6010048389434814, "rewards/margins": 0.40784525871276855, "rewards/rejected": 3.193159580230713, "step": 9922 }, { "epoch": 1.61, "learning_rate": 2.9862665507063144e-07, "logits/chosen": -0.6136545538902283, "logits/rejected": -0.7141169309616089, "logps/chosen": -68.69621276855469, "logps/rejected": -92.90824890136719, "loss": 1.1829, "rewards/accuracies": 0.0, "rewards/chosen": 1.5369369983673096, "rewards/margins": -1.7072029113769531, "rewards/rejected": 3.2441399097442627, "step": 9923 }, { "epoch": 1.61, "learning_rate": 2.98506366855381e-07, "logits/chosen": -0.7298529148101807, "logits/rejected": -0.7979228496551514, "logps/chosen": -171.649658203125, "logps/rejected": -90.36304473876953, "loss": 0.5449, "rewards/accuracies": 0.0, "rewards/chosen": 3.585714817047119, "rewards/margins": -0.4778280258178711, "rewards/rejected": 4.06354284286499, "step": 9924 }, { "epoch": 1.61, "learning_rate": 2.98386092561434e-07, "logits/chosen": -0.9615793228149414, "logits/rejected": -1.0373482704162598, "logps/chosen": -46.63636779785156, "logps/rejected": -145.9690704345703, "loss": 1.1665, "rewards/accuracies": 0.0, "rewards/chosen": 2.4173874855041504, "rewards/margins": -2.2205748558044434, "rewards/rejected": 4.637962341308594, "step": 9925 }, { "epoch": 1.61, "learning_rate": 2.9826583219710034e-07, "logits/chosen": -0.7358825206756592, "logits/rejected": -0.6006701588630676, "logps/chosen": -61.19140625, "logps/rejected": -28.19713020324707, "loss": 0.1578, "rewards/accuracies": 1.0, "rewards/chosen": 2.024285078048706, "rewards/margins": 1.8282595872879028, "rewards/rejected": 0.19602547585964203, "step": 9926 }, { "epoch": 1.61, "learning_rate": 2.981455857706889e-07, "logits/chosen": -1.2494295835494995, "logits/rejected": -1.1203022003173828, "logps/chosen": -120.31710052490234, "logps/rejected": -115.35980224609375, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.3111534118652344, "rewards/margins": 0.8167617321014404, "rewards/rejected": 0.49439164996147156, "step": 9927 }, { "epoch": 1.61, "learning_rate": 2.980253532905075e-07, "logits/chosen": -0.9486972689628601, "logits/rejected": -0.900031328201294, "logps/chosen": -96.42150115966797, "logps/rejected": -39.93881607055664, "loss": 0.467, "rewards/accuracies": 1.0, "rewards/chosen": 2.1915688514709473, "rewards/margins": 1.91813063621521, "rewards/rejected": 0.2734382748603821, "step": 9928 }, { "epoch": 1.61, "learning_rate": 2.97905134764863e-07, "logits/chosen": -0.4437371492385864, "logits/rejected": -0.4263800084590912, "logps/chosen": -92.07221984863281, "logps/rejected": -72.40581512451172, "loss": 0.8018, "rewards/accuracies": 0.0, "rewards/chosen": 1.0351532697677612, "rewards/margins": -0.9624701738357544, "rewards/rejected": 1.9976234436035156, "step": 9929 }, { "epoch": 1.61, "learning_rate": 2.977849302020615e-07, "logits/chosen": -0.8538281321525574, "logits/rejected": -0.8808432817459106, "logps/chosen": -46.83110046386719, "logps/rejected": -70.3205337524414, "loss": 0.5593, "rewards/accuracies": 1.0, "rewards/chosen": 0.8679966330528259, "rewards/margins": 0.11653637886047363, "rewards/rejected": 0.7514602541923523, "step": 9930 }, { "epoch": 1.61, "learning_rate": 2.9766473961040806e-07, "logits/chosen": -0.5947545766830444, "logits/rejected": -0.5038662552833557, "logps/chosen": -144.16799926757812, "logps/rejected": -66.8369140625, "loss": 1.0466, "rewards/accuracies": 1.0, "rewards/chosen": 1.7183258533477783, "rewards/margins": 0.2772705554962158, "rewards/rejected": 1.4410552978515625, "step": 9931 }, { "epoch": 1.61, "learning_rate": 2.9754456299820645e-07, "logits/chosen": -0.6047607660293579, "logits/rejected": -0.6465182304382324, "logps/chosen": -52.24431610107422, "logps/rejected": -58.630332946777344, "loss": 0.4163, "rewards/accuracies": 0.0, "rewards/chosen": 2.291114091873169, "rewards/margins": -0.07866978645324707, "rewards/rejected": 2.369783878326416, "step": 9932 }, { "epoch": 1.61, "learning_rate": 2.9742440037375996e-07, "logits/chosen": -0.5117824077606201, "logits/rejected": -0.1511092483997345, "logps/chosen": -31.384326934814453, "logps/rejected": -107.09268188476562, "loss": 3.7201, "rewards/accuracies": 0.0, "rewards/chosen": 1.0888680219650269, "rewards/margins": -3.4089651107788086, "rewards/rejected": 4.497833251953125, "step": 9933 }, { "epoch": 1.61, "learning_rate": 2.9730425174537053e-07, "logits/chosen": -0.7327065467834473, "logits/rejected": -0.6758279204368591, "logps/chosen": -95.26412200927734, "logps/rejected": -66.54347229003906, "loss": 0.4681, "rewards/accuracies": 0.0, "rewards/chosen": 0.8506096005439758, "rewards/margins": -0.27355116605758667, "rewards/rejected": 1.1241607666015625, "step": 9934 }, { "epoch": 1.61, "learning_rate": 2.971841171213395e-07, "logits/chosen": -1.1495658159255981, "logits/rejected": -1.2535382509231567, "logps/chosen": -221.1344451904297, "logps/rejected": -94.9052963256836, "loss": 1.5193, "rewards/accuracies": 0.0, "rewards/chosen": 3.000056505203247, "rewards/margins": -2.7507379055023193, "rewards/rejected": 5.750794410705566, "step": 9935 }, { "epoch": 1.61, "learning_rate": 2.970639965099668e-07, "logits/chosen": -0.5078706741333008, "logits/rejected": -0.5045903921127319, "logps/chosen": -0.4203449487686157, "logps/rejected": -38.072547912597656, "loss": 0.4042, "rewards/accuracies": 1.0, "rewards/chosen": 0.11751353740692139, "rewards/margins": 0.13011042773723602, "rewards/rejected": -0.01259689312428236, "step": 9936 }, { "epoch": 1.61, "learning_rate": 2.9694388991955187e-07, "logits/chosen": -0.7946106195449829, "logits/rejected": -0.7831824421882629, "logps/chosen": -58.8026237487793, "logps/rejected": -65.05718994140625, "loss": 2.1045, "rewards/accuracies": 1.0, "rewards/chosen": 2.189164400100708, "rewards/margins": 1.4869983196258545, "rewards/rejected": 0.7021660208702087, "step": 9937 }, { "epoch": 1.61, "learning_rate": 2.9682379735839276e-07, "logits/chosen": -0.8695231676101685, "logits/rejected": -0.8066205382347107, "logps/chosen": -116.85530090332031, "logps/rejected": -73.88265991210938, "loss": 0.4137, "rewards/accuracies": 1.0, "rewards/chosen": 1.639668345451355, "rewards/margins": 0.23430025577545166, "rewards/rejected": 1.4053680896759033, "step": 9938 }, { "epoch": 1.61, "learning_rate": 2.9670371883478675e-07, "logits/chosen": -0.9345468282699585, "logits/rejected": -0.8282563090324402, "logps/chosen": -70.80243682861328, "logps/rejected": -65.09269714355469, "loss": 0.3533, "rewards/accuracies": 1.0, "rewards/chosen": 1.236132025718689, "rewards/margins": 0.014873504638671875, "rewards/rejected": 1.221258521080017, "step": 9939 }, { "epoch": 1.61, "learning_rate": 2.9658365435703015e-07, "logits/chosen": -0.43615487217903137, "logits/rejected": -0.43615487217903137, "logps/chosen": -49.33966064453125, "logps/rejected": -49.33966064453125, "loss": 1.2486, "rewards/accuracies": 0.0, "rewards/chosen": 2.0703561305999756, "rewards/margins": 0.0, "rewards/rejected": 2.0703561305999756, "step": 9940 }, { "epoch": 1.61, "learning_rate": 2.9646360393341833e-07, "logits/chosen": -0.5038847327232361, "logits/rejected": -0.44078490138053894, "logps/chosen": -92.5168228149414, "logps/rejected": -105.02391815185547, "loss": 0.4231, "rewards/accuracies": 0.0, "rewards/chosen": 1.3964722156524658, "rewards/margins": -0.042847394943237305, "rewards/rejected": 1.4393196105957031, "step": 9941 }, { "epoch": 1.61, "learning_rate": 2.963435675722456e-07, "logits/chosen": -1.0784318447113037, "logits/rejected": -0.8565474152565002, "logps/chosen": -150.58602905273438, "logps/rejected": -19.182003021240234, "loss": 0.2142, "rewards/accuracies": 1.0, "rewards/chosen": 4.359283447265625, "rewards/margins": 3.8519113063812256, "rewards/rejected": 0.5073720812797546, "step": 9942 }, { "epoch": 1.61, "learning_rate": 2.962235452818053e-07, "logits/chosen": -0.9027661085128784, "logits/rejected": -0.9107083082199097, "logps/chosen": -60.25446319580078, "logps/rejected": -70.258544921875, "loss": 0.8723, "rewards/accuracies": 1.0, "rewards/chosen": 2.1445884704589844, "rewards/margins": 0.635057806968689, "rewards/rejected": 1.5095306634902954, "step": 9943 }, { "epoch": 1.61, "learning_rate": 2.9610353707038995e-07, "logits/chosen": -0.7981021404266357, "logits/rejected": -0.8015643954277039, "logps/chosen": -87.09037780761719, "logps/rejected": -60.30845260620117, "loss": 3.2201, "rewards/accuracies": 1.0, "rewards/chosen": 2.1552910804748535, "rewards/margins": 0.19937634468078613, "rewards/rejected": 1.9559147357940674, "step": 9944 }, { "epoch": 1.61, "learning_rate": 2.959835429462908e-07, "logits/chosen": -0.6564915180206299, "logits/rejected": -0.6026391983032227, "logps/chosen": -112.64883422851562, "logps/rejected": -127.26463317871094, "loss": 0.7597, "rewards/accuracies": 0.0, "rewards/chosen": 0.8651909232139587, "rewards/margins": -1.2384757995605469, "rewards/rejected": 2.1036667823791504, "step": 9945 }, { "epoch": 1.61, "learning_rate": 2.958635629177985e-07, "logits/chosen": -0.8089931011199951, "logits/rejected": -0.7173269987106323, "logps/chosen": -63.616477966308594, "logps/rejected": -88.10614013671875, "loss": 1.5261, "rewards/accuracies": 0.0, "rewards/chosen": 2.9736428260803223, "rewards/margins": -0.9637579917907715, "rewards/rejected": 3.9374008178710938, "step": 9946 }, { "epoch": 1.61, "learning_rate": 2.957435969932024e-07, "logits/chosen": -0.899627685546875, "logits/rejected": -0.7046555876731873, "logps/chosen": -143.21218872070312, "logps/rejected": -34.384681701660156, "loss": 0.4599, "rewards/accuracies": 1.0, "rewards/chosen": 5.01953125, "rewards/margins": 2.565471649169922, "rewards/rejected": 2.454059600830078, "step": 9947 }, { "epoch": 1.61, "learning_rate": 2.95623645180791e-07, "logits/chosen": -0.5068486332893372, "logits/rejected": -0.49043893814086914, "logps/chosen": -87.93867492675781, "logps/rejected": -90.80661010742188, "loss": 0.855, "rewards/accuracies": 1.0, "rewards/chosen": 1.792352318763733, "rewards/margins": 1.7753486633300781, "rewards/rejected": 0.017003631219267845, "step": 9948 }, { "epoch": 1.61, "learning_rate": 2.95503707488852e-07, "logits/chosen": -0.6332573890686035, "logits/rejected": -0.6086918711662292, "logps/chosen": -60.30373764038086, "logps/rejected": -48.2735595703125, "loss": 0.5966, "rewards/accuracies": 1.0, "rewards/chosen": 0.3676128387451172, "rewards/margins": 0.36306723952293396, "rewards/rejected": 0.0045455931685864925, "step": 9949 }, { "epoch": 1.61, "learning_rate": 2.9538378392567165e-07, "logits/chosen": -0.4384545683860779, "logits/rejected": -0.4726756513118744, "logps/chosen": -87.13134002685547, "logps/rejected": -40.881046295166016, "loss": 0.907, "rewards/accuracies": 0.0, "rewards/chosen": 0.3404373228549957, "rewards/margins": -1.629698634147644, "rewards/rejected": 1.9701359272003174, "step": 9950 }, { "epoch": 1.62, "learning_rate": 2.952638744995359e-07, "logits/chosen": -1.1014699935913086, "logits/rejected": -0.9808314442634583, "logps/chosen": -112.72259521484375, "logps/rejected": -186.917724609375, "loss": 0.9349, "rewards/accuracies": 0.0, "rewards/chosen": 4.551034450531006, "rewards/margins": -0.9555621147155762, "rewards/rejected": 5.506596565246582, "step": 9951 }, { "epoch": 1.62, "learning_rate": 2.951439792187292e-07, "logits/chosen": -0.641541600227356, "logits/rejected": -0.6570839881896973, "logps/chosen": -24.276348114013672, "logps/rejected": -55.16804122924805, "loss": 0.811, "rewards/accuracies": 1.0, "rewards/chosen": 1.0207668542861938, "rewards/margins": 0.20986521244049072, "rewards/rejected": 0.8109016418457031, "step": 9952 }, { "epoch": 1.62, "learning_rate": 2.9502409809153514e-07, "logits/chosen": -0.2694377303123474, "logits/rejected": -0.24996735155582428, "logps/chosen": -82.22190856933594, "logps/rejected": -49.819305419921875, "loss": 0.4794, "rewards/accuracies": 0.0, "rewards/chosen": 1.6555031538009644, "rewards/margins": -0.35707175731658936, "rewards/rejected": 2.0125749111175537, "step": 9953 }, { "epoch": 1.62, "learning_rate": 2.949042311262364e-07, "logits/chosen": -1.0823301076889038, "logits/rejected": -1.053964376449585, "logps/chosen": -105.39012145996094, "logps/rejected": -29.34374237060547, "loss": 0.4666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0921738147735596, "rewards/margins": 0.9511669874191284, "rewards/rejected": 0.14100685715675354, "step": 9954 }, { "epoch": 1.62, "learning_rate": 2.9478437833111465e-07, "logits/chosen": -0.6904621124267578, "logits/rejected": -0.6904621124267578, "logps/chosen": -40.437461853027344, "logps/rejected": -40.437461853027344, "loss": 0.6742, "rewards/accuracies": 0.0, "rewards/chosen": 0.37485846877098083, "rewards/margins": 0.0, "rewards/rejected": 0.37485846877098083, "step": 9955 }, { "epoch": 1.62, "learning_rate": 2.946645397144506e-07, "logits/chosen": -0.7505951523780823, "logits/rejected": -0.7845181226730347, "logps/chosen": -82.29924011230469, "logps/rejected": -77.68913269042969, "loss": 0.7277, "rewards/accuracies": 0.0, "rewards/chosen": 2.4341728687286377, "rewards/margins": -1.1506264209747314, "rewards/rejected": 3.584799289703369, "step": 9956 }, { "epoch": 1.62, "learning_rate": 2.9454471528452397e-07, "logits/chosen": -0.5050016641616821, "logits/rejected": -0.5050016641616821, "logps/chosen": -54.472782135009766, "logps/rejected": -54.472782135009766, "loss": 0.3697, "rewards/accuracies": 0.0, "rewards/chosen": 0.428720086812973, "rewards/margins": 0.0, "rewards/rejected": 0.428720086812973, "step": 9957 }, { "epoch": 1.62, "learning_rate": 2.944249050496135e-07, "logits/chosen": -0.7088217735290527, "logits/rejected": -0.7302188277244568, "logps/chosen": -85.85160064697266, "logps/rejected": -126.67799377441406, "loss": 0.8067, "rewards/accuracies": 0.0, "rewards/chosen": 1.2714828252792358, "rewards/margins": -1.3860138654708862, "rewards/rejected": 2.657496690750122, "step": 9958 }, { "epoch": 1.62, "learning_rate": 2.9430510901799684e-07, "logits/chosen": -1.0869293212890625, "logits/rejected": -0.9397972822189331, "logps/chosen": -91.721435546875, "logps/rejected": -58.062782287597656, "loss": 1.9337, "rewards/accuracies": 1.0, "rewards/chosen": 1.5602585077285767, "rewards/margins": 0.867523193359375, "rewards/rejected": 0.6927353143692017, "step": 9959 }, { "epoch": 1.62, "learning_rate": 2.9418532719795086e-07, "logits/chosen": -0.918831467628479, "logits/rejected": -0.918831467628479, "logps/chosen": -98.24820709228516, "logps/rejected": -98.24820709228516, "loss": 0.6556, "rewards/accuracies": 0.0, "rewards/chosen": 4.155062198638916, "rewards/margins": 0.0, "rewards/rejected": 4.155062198638916, "step": 9960 }, { "epoch": 1.62, "learning_rate": 2.940655595977514e-07, "logits/chosen": -0.8137046098709106, "logits/rejected": -0.7983935475349426, "logps/chosen": -28.90645408630371, "logps/rejected": -5.4214091300964355, "loss": 0.418, "rewards/accuracies": 0.0, "rewards/chosen": 0.05757427215576172, "rewards/margins": -0.24788394570350647, "rewards/rejected": 0.3054582178592682, "step": 9961 }, { "epoch": 1.62, "learning_rate": 2.939458062256731e-07, "logits/chosen": -0.7926743626594543, "logits/rejected": -0.750539243221283, "logps/chosen": -73.00923919677734, "logps/rejected": -73.0848617553711, "loss": 0.3965, "rewards/accuracies": 1.0, "rewards/chosen": 2.557410478591919, "rewards/margins": 1.8631393909454346, "rewards/rejected": 0.6942710876464844, "step": 9962 }, { "epoch": 1.62, "learning_rate": 2.9382606708999e-07, "logits/chosen": -0.3454681634902954, "logits/rejected": -0.3715866208076477, "logps/chosen": -26.867656707763672, "logps/rejected": -59.94783020019531, "loss": 0.4501, "rewards/accuracies": 0.0, "rewards/chosen": 0.3608047664165497, "rewards/margins": -0.23487547039985657, "rewards/rejected": 0.5956802368164062, "step": 9963 }, { "epoch": 1.62, "learning_rate": 2.937063421989748e-07, "logits/chosen": -0.5729513764381409, "logits/rejected": -0.5603786110877991, "logps/chosen": -28.21110725402832, "logps/rejected": -28.07935905456543, "loss": 0.3189, "rewards/accuracies": 1.0, "rewards/chosen": 0.42771169543266296, "rewards/margins": 0.21670076251029968, "rewards/rejected": 0.21101093292236328, "step": 9964 }, { "epoch": 1.62, "learning_rate": 2.9358663156089937e-07, "logits/chosen": -0.706163763999939, "logits/rejected": -0.7058324813842773, "logps/chosen": -45.22081756591797, "logps/rejected": -83.50428771972656, "loss": 0.3492, "rewards/accuracies": 0.0, "rewards/chosen": 1.3676468133926392, "rewards/margins": -0.00478518009185791, "rewards/rejected": 1.372431993484497, "step": 9965 }, { "epoch": 1.62, "learning_rate": 2.934669351840345e-07, "logits/chosen": -0.6885402202606201, "logits/rejected": -0.638824999332428, "logps/chosen": -69.59931945800781, "logps/rejected": -59.605224609375, "loss": 0.4944, "rewards/accuracies": 1.0, "rewards/chosen": 2.3448548316955566, "rewards/margins": 0.15153813362121582, "rewards/rejected": 2.193316698074341, "step": 9966 }, { "epoch": 1.62, "learning_rate": 2.933472530766503e-07, "logits/chosen": -0.7962843775749207, "logits/rejected": -0.8422454595565796, "logps/chosen": -94.10856628417969, "logps/rejected": -93.96041107177734, "loss": 0.3804, "rewards/accuracies": 1.0, "rewards/chosen": 1.9886597394943237, "rewards/margins": 0.14308476448059082, "rewards/rejected": 1.845574975013733, "step": 9967 }, { "epoch": 1.62, "learning_rate": 2.932275852470155e-07, "logits/chosen": -0.7020771503448486, "logits/rejected": -0.656914472579956, "logps/chosen": -60.0794677734375, "logps/rejected": -65.87055969238281, "loss": 1.004, "rewards/accuracies": 0.0, "rewards/chosen": 1.2117455005645752, "rewards/margins": -0.45201945304870605, "rewards/rejected": 1.6637649536132812, "step": 9968 }, { "epoch": 1.62, "learning_rate": 2.931079317033981e-07, "logits/chosen": -0.4243006408214569, "logits/rejected": -0.4290693402290344, "logps/chosen": -124.22322082519531, "logps/rejected": -80.53804016113281, "loss": 0.9009, "rewards/accuracies": 0.0, "rewards/chosen": 0.3368240296840668, "rewards/margins": -1.3608901500701904, "rewards/rejected": 1.6977142095565796, "step": 9969 }, { "epoch": 1.62, "learning_rate": 2.9298829245406496e-07, "logits/chosen": -0.5755124688148499, "logits/rejected": -0.6413072943687439, "logps/chosen": -214.63877868652344, "logps/rejected": -45.4732666015625, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 4.6350417137146, "rewards/margins": 2.791576385498047, "rewards/rejected": 1.8434654474258423, "step": 9970 }, { "epoch": 1.62, "learning_rate": 2.9286866750728203e-07, "logits/chosen": -0.39440736174583435, "logits/rejected": -0.30735495686531067, "logps/chosen": -72.8462142944336, "logps/rejected": -24.440824508666992, "loss": 0.3307, "rewards/accuracies": 1.0, "rewards/chosen": 1.449541449546814, "rewards/margins": 0.23240256309509277, "rewards/rejected": 1.2171388864517212, "step": 9971 }, { "epoch": 1.62, "learning_rate": 2.9274905687131435e-07, "logits/chosen": -0.7249322533607483, "logits/rejected": -0.7859255075454712, "logps/chosen": -107.77195739746094, "logps/rejected": -99.68428039550781, "loss": 1.1325, "rewards/accuracies": 0.0, "rewards/chosen": 2.9356675148010254, "rewards/margins": -0.6674118041992188, "rewards/rejected": 3.603079319000244, "step": 9972 }, { "epoch": 1.62, "learning_rate": 2.9262946055442573e-07, "logits/chosen": -0.6166583299636841, "logits/rejected": -0.6067435145378113, "logps/chosen": -118.13996124267578, "logps/rejected": -115.60926818847656, "loss": 0.5128, "rewards/accuracies": 1.0, "rewards/chosen": 1.2623223066329956, "rewards/margins": 0.930518388748169, "rewards/rejected": 0.3318038880825043, "step": 9973 }, { "epoch": 1.62, "learning_rate": 2.925098785648793e-07, "logits/chosen": -0.45772016048431396, "logits/rejected": -0.45772016048431396, "logps/chosen": -36.33710479736328, "logps/rejected": -36.33710479736328, "loss": 0.6437, "rewards/accuracies": 0.0, "rewards/chosen": 0.2155807465314865, "rewards/margins": 0.0, "rewards/rejected": 0.2155807465314865, "step": 9974 }, { "epoch": 1.62, "learning_rate": 2.923903109109369e-07, "logits/chosen": -1.0822631120681763, "logits/rejected": -0.9687454700469971, "logps/chosen": -63.1489143371582, "logps/rejected": -19.952228546142578, "loss": 0.0927, "rewards/accuracies": 1.0, "rewards/chosen": 3.178840398788452, "rewards/margins": 2.7971911430358887, "rewards/rejected": 0.3816492259502411, "step": 9975 }, { "epoch": 1.62, "learning_rate": 2.922707576008596e-07, "logits/chosen": -0.7638726830482483, "logits/rejected": -0.6793204545974731, "logps/chosen": -211.21534729003906, "logps/rejected": -74.88357543945312, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": 4.951594829559326, "rewards/margins": 2.06893253326416, "rewards/rejected": 2.882662296295166, "step": 9976 }, { "epoch": 1.62, "learning_rate": 2.921512186429075e-07, "logits/chosen": -0.9432148933410645, "logits/rejected": -0.9157277941703796, "logps/chosen": -49.29536437988281, "logps/rejected": -53.59461975097656, "loss": 0.3319, "rewards/accuracies": 1.0, "rewards/chosen": 2.375479221343994, "rewards/margins": 0.8479248285293579, "rewards/rejected": 1.5275543928146362, "step": 9977 }, { "epoch": 1.62, "learning_rate": 2.9203169404533935e-07, "logits/chosen": -0.7414752244949341, "logits/rejected": -0.7148600220680237, "logps/chosen": -78.81224060058594, "logps/rejected": -73.13211822509766, "loss": 1.4424, "rewards/accuracies": 0.0, "rewards/chosen": 2.074636936187744, "rewards/margins": -0.6911354064941406, "rewards/rejected": 2.7657723426818848, "step": 9978 }, { "epoch": 1.62, "learning_rate": 2.9191218381641334e-07, "logits/chosen": -0.44512325525283813, "logits/rejected": -0.3719944357872009, "logps/chosen": -84.62232208251953, "logps/rejected": -86.7678451538086, "loss": 0.8765, "rewards/accuracies": 0.0, "rewards/chosen": 3.4078354835510254, "rewards/margins": -0.1560957431793213, "rewards/rejected": 3.5639312267303467, "step": 9979 }, { "epoch": 1.62, "learning_rate": 2.917926879643866e-07, "logits/chosen": -0.7407900094985962, "logits/rejected": -0.7052760720252991, "logps/chosen": -101.96316528320312, "logps/rejected": -97.07887268066406, "loss": 0.8218, "rewards/accuracies": 0.0, "rewards/chosen": 1.4283676147460938, "rewards/margins": -0.16534733772277832, "rewards/rejected": 1.593714952468872, "step": 9980 }, { "epoch": 1.62, "learning_rate": 2.916732064975149e-07, "logits/chosen": -0.992152214050293, "logits/rejected": -0.9137605428695679, "logps/chosen": -103.25653839111328, "logps/rejected": -115.18444061279297, "loss": 0.302, "rewards/accuracies": 1.0, "rewards/chosen": 5.906060218811035, "rewards/margins": 1.3655519485473633, "rewards/rejected": 4.540508270263672, "step": 9981 }, { "epoch": 1.62, "learning_rate": 2.915537394240537e-07, "logits/chosen": -0.32911011576652527, "logits/rejected": -0.32911011576652527, "logps/chosen": -45.117156982421875, "logps/rejected": -45.117156982421875, "loss": 1.1597, "rewards/accuracies": 0.0, "rewards/chosen": 1.5758861303329468, "rewards/margins": 0.0, "rewards/rejected": 1.5758861303329468, "step": 9982 }, { "epoch": 1.62, "learning_rate": 2.9143428675225646e-07, "logits/chosen": -0.7586216330528259, "logits/rejected": -0.6581083536148071, "logps/chosen": -83.40997314453125, "logps/rejected": -69.8708267211914, "loss": 0.8901, "rewards/accuracies": 1.0, "rewards/chosen": 1.3402382135391235, "rewards/margins": 0.19189763069152832, "rewards/rejected": 1.1483405828475952, "step": 9983 }, { "epoch": 1.62, "learning_rate": 2.913148484903768e-07, "logits/chosen": -0.7203102707862854, "logits/rejected": -0.5799739956855774, "logps/chosen": -78.25732421875, "logps/rejected": -50.98064041137695, "loss": 0.1875, "rewards/accuracies": 1.0, "rewards/chosen": 1.4308608770370483, "rewards/margins": 1.1336193084716797, "rewards/rejected": 0.29724159836769104, "step": 9984 }, { "epoch": 1.62, "learning_rate": 2.9119542464666623e-07, "logits/chosen": -0.8233386874198914, "logits/rejected": -0.8233386874198914, "logps/chosen": -62.903072357177734, "logps/rejected": -62.903072357177734, "loss": 0.4276, "rewards/accuracies": 0.0, "rewards/chosen": 2.6224217414855957, "rewards/margins": 0.0, "rewards/rejected": 2.6224217414855957, "step": 9985 }, { "epoch": 1.62, "learning_rate": 2.9107601522937634e-07, "logits/chosen": -0.2322014421224594, "logits/rejected": -0.2212573140859604, "logps/chosen": -9.291231155395508, "logps/rejected": -3.985643148422241, "loss": 1.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.042006492614746094, "rewards/margins": -0.27159997820854187, "rewards/rejected": 0.31360647082328796, "step": 9986 }, { "epoch": 1.62, "learning_rate": 2.9095662024675677e-07, "logits/chosen": -0.9672127962112427, "logits/rejected": -0.47230347990989685, "logps/chosen": -79.62225341796875, "logps/rejected": -103.58435821533203, "loss": 1.8158, "rewards/accuracies": 0.0, "rewards/chosen": 1.950873613357544, "rewards/margins": -0.47179269790649414, "rewards/rejected": 2.422666311264038, "step": 9987 }, { "epoch": 1.62, "learning_rate": 2.908372397070569e-07, "logits/chosen": -0.7263053059577942, "logits/rejected": -0.7104167342185974, "logps/chosen": -77.41404724121094, "logps/rejected": -52.41620635986328, "loss": 0.5873, "rewards/accuracies": 0.0, "rewards/chosen": 0.993054211139679, "rewards/margins": -0.3603004813194275, "rewards/rejected": 1.3533546924591064, "step": 9988 }, { "epoch": 1.62, "learning_rate": 2.907178736185245e-07, "logits/chosen": -1.0105684995651245, "logits/rejected": -0.9419330954551697, "logps/chosen": -136.09912109375, "logps/rejected": -40.893585205078125, "loss": 1.2435, "rewards/accuracies": 1.0, "rewards/chosen": 1.443745493888855, "rewards/margins": 1.1810123920440674, "rewards/rejected": 0.2627330720424652, "step": 9989 }, { "epoch": 1.62, "learning_rate": 2.905985219894069e-07, "logits/chosen": -0.5369734764099121, "logits/rejected": -0.5148881673812866, "logps/chosen": -67.23983764648438, "logps/rejected": -92.74003601074219, "loss": 2.5419, "rewards/accuracies": 0.0, "rewards/chosen": 1.2924515008926392, "rewards/margins": -0.8236182928085327, "rewards/rejected": 2.116069793701172, "step": 9990 }, { "epoch": 1.62, "learning_rate": 2.9047918482795e-07, "logits/chosen": -0.4234538972377777, "logits/rejected": -0.5005404353141785, "logps/chosen": -84.05986022949219, "logps/rejected": -96.17341613769531, "loss": 0.9877, "rewards/accuracies": 0.0, "rewards/chosen": 0.8890426754951477, "rewards/margins": -0.9735779166221619, "rewards/rejected": 1.8626205921173096, "step": 9991 }, { "epoch": 1.62, "learning_rate": 2.9035986214239904e-07, "logits/chosen": -0.8843003511428833, "logits/rejected": -0.8249422907829285, "logps/chosen": -93.07514953613281, "logps/rejected": -75.19918823242188, "loss": 0.3756, "rewards/accuracies": 1.0, "rewards/chosen": 1.6166603565216064, "rewards/margins": 1.6314232349395752, "rewards/rejected": -0.01476287841796875, "step": 9992 }, { "epoch": 1.62, "learning_rate": 2.902405539409978e-07, "logits/chosen": -0.4814402759075165, "logits/rejected": -0.5023956894874573, "logps/chosen": -76.09144592285156, "logps/rejected": -117.70455932617188, "loss": 0.6474, "rewards/accuracies": 0.0, "rewards/chosen": 2.4380135536193848, "rewards/margins": -0.9047691822052002, "rewards/rejected": 3.342782735824585, "step": 9993 }, { "epoch": 1.62, "learning_rate": 2.901212602319897e-07, "logits/chosen": -0.6053061485290527, "logits/rejected": -0.563178539276123, "logps/chosen": -125.67713928222656, "logps/rejected": -95.5069351196289, "loss": 0.0746, "rewards/accuracies": 1.0, "rewards/chosen": 5.2661943435668945, "rewards/margins": 1.8846628665924072, "rewards/rejected": 3.3815314769744873, "step": 9994 }, { "epoch": 1.62, "learning_rate": 2.900019810236164e-07, "logits/chosen": -0.4674301743507385, "logits/rejected": -0.3811551034450531, "logps/chosen": -39.56988525390625, "logps/rejected": -43.87248992919922, "loss": 0.3667, "rewards/accuracies": 0.0, "rewards/chosen": 1.9540061950683594, "rewards/margins": -0.0563817024230957, "rewards/rejected": 2.010387897491455, "step": 9995 }, { "epoch": 1.62, "learning_rate": 2.8988271632411944e-07, "logits/chosen": -0.47755342721939087, "logits/rejected": -0.45909789204597473, "logps/chosen": -99.50955200195312, "logps/rejected": -55.66951370239258, "loss": 0.4352, "rewards/accuracies": 0.0, "rewards/chosen": 1.4004547595977783, "rewards/margins": -0.27700233459472656, "rewards/rejected": 1.6774570941925049, "step": 9996 }, { "epoch": 1.62, "learning_rate": 2.8976346614173836e-07, "logits/chosen": -0.7380250096321106, "logits/rejected": -0.8071447014808655, "logps/chosen": -160.303466796875, "logps/rejected": -133.9830780029297, "loss": 1.8917, "rewards/accuracies": 0.0, "rewards/chosen": 4.49722146987915, "rewards/margins": -1.0705933570861816, "rewards/rejected": 5.567814826965332, "step": 9997 }, { "epoch": 1.62, "learning_rate": 2.8964423048471273e-07, "logits/chosen": -1.025905966758728, "logits/rejected": -1.0215387344360352, "logps/chosen": -84.4432373046875, "logps/rejected": -85.58045196533203, "loss": 0.4298, "rewards/accuracies": 0.0, "rewards/chosen": 1.5812034606933594, "rewards/margins": -0.0019752979278564453, "rewards/rejected": 1.5831787586212158, "step": 9998 }, { "epoch": 1.62, "learning_rate": 2.895250093612802e-07, "logits/chosen": -0.44703203439712524, "logits/rejected": -0.4039720892906189, "logps/chosen": -57.212860107421875, "logps/rejected": -87.72264099121094, "loss": 0.8434, "rewards/accuracies": 0.0, "rewards/chosen": 1.7622085809707642, "rewards/margins": -1.2775169610977173, "rewards/rejected": 3.0397255420684814, "step": 9999 }, { "epoch": 1.62, "learning_rate": 2.8940580277967817e-07, "logits/chosen": -0.7546448111534119, "logits/rejected": -0.7054921388626099, "logps/chosen": -181.15185546875, "logps/rejected": -162.8988494873047, "loss": 0.5147, "rewards/accuracies": 0.0, "rewards/chosen": 5.19097900390625, "rewards/margins": -0.10746908187866211, "rewards/rejected": 5.298448085784912, "step": 10000 }, { "epoch": 1.62, "learning_rate": 2.8928661074814254e-07, "logits/chosen": -0.6676098108291626, "logits/rejected": -0.5468201041221619, "logps/chosen": -77.25163269042969, "logps/rejected": -37.50093460083008, "loss": 0.4478, "rewards/accuracies": 1.0, "rewards/chosen": 3.1075165271759033, "rewards/margins": 1.4943954944610596, "rewards/rejected": 1.6131210327148438, "step": 10001 }, { "epoch": 1.62, "learning_rate": 2.89167433274908e-07, "logits/chosen": -0.7112386226654053, "logits/rejected": -0.6076526045799255, "logps/chosen": -221.10113525390625, "logps/rejected": -8.344923973083496, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": 3.609326124191284, "rewards/margins": 3.0628702640533447, "rewards/rejected": 0.5464558601379395, "step": 10002 }, { "epoch": 1.62, "learning_rate": 2.8904827036820923e-07, "logits/chosen": -0.8507820963859558, "logits/rejected": -0.8327451944351196, "logps/chosen": -84.8474349975586, "logps/rejected": -101.25395202636719, "loss": 0.525, "rewards/accuracies": 1.0, "rewards/chosen": 1.537824273109436, "rewards/margins": 0.5618309378623962, "rewards/rejected": 0.9759933352470398, "step": 10003 }, { "epoch": 1.62, "learning_rate": 2.889291220362787e-07, "logits/chosen": -0.49260732531547546, "logits/rejected": -0.5021470189094543, "logps/chosen": -5.172008991241455, "logps/rejected": -1.6111364364624023, "loss": 0.7755, "rewards/accuracies": 0.0, "rewards/chosen": 0.09022612869739532, "rewards/margins": -0.22648586332798004, "rewards/rejected": 0.31671199202537537, "step": 10004 }, { "epoch": 1.62, "learning_rate": 2.888099882873488e-07, "logits/chosen": -0.6084535717964172, "logits/rejected": -0.5762109756469727, "logps/chosen": -85.57059478759766, "logps/rejected": -107.76008605957031, "loss": 0.6215, "rewards/accuracies": 1.0, "rewards/chosen": 1.2336517572402954, "rewards/margins": 0.13330388069152832, "rewards/rejected": 1.100347876548767, "step": 10005 }, { "epoch": 1.62, "learning_rate": 2.8869086912965036e-07, "logits/chosen": -0.4635460376739502, "logits/rejected": -0.42685145139694214, "logps/chosen": -41.21316909790039, "logps/rejected": -68.66152954101562, "loss": 0.5498, "rewards/accuracies": 0.0, "rewards/chosen": 1.8068164587020874, "rewards/margins": -0.3630717992782593, "rewards/rejected": 2.1698882579803467, "step": 10006 }, { "epoch": 1.62, "learning_rate": 2.885717645714135e-07, "logits/chosen": -0.5556877255439758, "logits/rejected": -0.4145503640174866, "logps/chosen": -44.29887390136719, "logps/rejected": -81.28424072265625, "loss": 0.4346, "rewards/accuracies": 0.0, "rewards/chosen": 1.74388587474823, "rewards/margins": -0.2660568952560425, "rewards/rejected": 2.0099427700042725, "step": 10007 }, { "epoch": 1.62, "learning_rate": 2.8845267462086703e-07, "logits/chosen": -0.436087965965271, "logits/rejected": -0.35535362362861633, "logps/chosen": -80.56962585449219, "logps/rejected": -57.049461364746094, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 1.7431213855743408, "rewards/margins": 1.0538437366485596, "rewards/rejected": 0.6892776489257812, "step": 10008 }, { "epoch": 1.62, "learning_rate": 2.8833359928623924e-07, "logits/chosen": -0.8709969520568848, "logits/rejected": -0.8747846484184265, "logps/chosen": -98.67222595214844, "logps/rejected": -78.68714141845703, "loss": 0.94, "rewards/accuracies": 0.0, "rewards/chosen": 0.4055130183696747, "rewards/margins": -0.8104636669158936, "rewards/rejected": 1.2159767150878906, "step": 10009 }, { "epoch": 1.62, "learning_rate": 2.8821453857575673e-07, "logits/chosen": -0.5758209228515625, "logits/rejected": -0.5512916445732117, "logps/chosen": -71.85224914550781, "logps/rejected": -84.37340545654297, "loss": 0.6418, "rewards/accuracies": 1.0, "rewards/chosen": 1.2907241582870483, "rewards/margins": 0.6067573428153992, "rewards/rejected": 0.6839668154716492, "step": 10010 }, { "epoch": 1.62, "learning_rate": 2.880954924976459e-07, "logits/chosen": -0.989640474319458, "logits/rejected": -0.9704760313034058, "logps/chosen": -61.948646545410156, "logps/rejected": -92.7463607788086, "loss": 0.6078, "rewards/accuracies": 0.0, "rewards/chosen": 1.9594329595565796, "rewards/margins": -0.8187416791915894, "rewards/rejected": 2.778174638748169, "step": 10011 }, { "epoch": 1.63, "learning_rate": 2.879764610601312e-07, "logits/chosen": -0.48723840713500977, "logits/rejected": -0.48723840713500977, "logps/chosen": -4.2545576095581055, "logps/rejected": -4.2545576095581055, "loss": 0.728, "rewards/accuracies": 0.0, "rewards/chosen": 0.82856684923172, "rewards/margins": 0.0, "rewards/rejected": 0.82856684923172, "step": 10012 }, { "epoch": 1.63, "learning_rate": 2.878574442714371e-07, "logits/chosen": -0.46462374925613403, "logits/rejected": -0.43598827719688416, "logps/chosen": -53.98020553588867, "logps/rejected": -19.953845977783203, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 0.9673812985420227, "rewards/margins": 0.6021801233291626, "rewards/rejected": 0.3652012050151825, "step": 10013 }, { "epoch": 1.63, "learning_rate": 2.8773844213978613e-07, "logits/chosen": -1.1763949394226074, "logits/rejected": -1.1906228065490723, "logps/chosen": -88.5168228149414, "logps/rejected": -88.90362548828125, "loss": 2.1894, "rewards/accuracies": 0.0, "rewards/chosen": 3.0232949256896973, "rewards/margins": -2.820781707763672, "rewards/rejected": 5.844076633453369, "step": 10014 }, { "epoch": 1.63, "learning_rate": 2.876194546734005e-07, "logits/chosen": -0.8626359105110168, "logits/rejected": -0.8189743161201477, "logps/chosen": -93.21672058105469, "logps/rejected": -65.61941528320312, "loss": 0.6414, "rewards/accuracies": 0.0, "rewards/chosen": 1.81719970703125, "rewards/margins": -0.6454384326934814, "rewards/rejected": 2.4626381397247314, "step": 10015 }, { "epoch": 1.63, "learning_rate": 2.8750048188050086e-07, "logits/chosen": -0.38792529702186584, "logits/rejected": -0.5399326086044312, "logps/chosen": -63.930992126464844, "logps/rejected": -130.57107543945312, "loss": 1.6111, "rewards/accuracies": 0.0, "rewards/chosen": 2.159749746322632, "rewards/margins": -2.9339025020599365, "rewards/rejected": 5.093652248382568, "step": 10016 }, { "epoch": 1.63, "learning_rate": 2.8738152376930734e-07, "logits/chosen": -0.36248859763145447, "logits/rejected": -0.2645666003227234, "logps/chosen": -60.36049270629883, "logps/rejected": -75.21661376953125, "loss": 1.4956, "rewards/accuracies": 1.0, "rewards/chosen": 1.487892508506775, "rewards/margins": 1.4809986352920532, "rewards/rejected": 0.006893921177834272, "step": 10017 }, { "epoch": 1.63, "learning_rate": 2.872625803480386e-07, "logits/chosen": -0.6548223495483398, "logits/rejected": -0.6764443516731262, "logps/chosen": -162.550048828125, "logps/rejected": -103.49966430664062, "loss": 1.4286, "rewards/accuracies": 0.0, "rewards/chosen": 6.428341865539551, "rewards/margins": -0.41640615463256836, "rewards/rejected": 6.844748020172119, "step": 10018 }, { "epoch": 1.63, "learning_rate": 2.871436516249128e-07, "logits/chosen": -0.8394910097122192, "logits/rejected": -1.1687426567077637, "logps/chosen": -65.64027404785156, "logps/rejected": -34.02833938598633, "loss": 0.0892, "rewards/accuracies": 1.0, "rewards/chosen": 2.1398932933807373, "rewards/margins": 1.811677098274231, "rewards/rejected": 0.32821616530418396, "step": 10019 }, { "epoch": 1.63, "learning_rate": 2.870247376081464e-07, "logits/chosen": -0.5514079332351685, "logits/rejected": -0.6543821692466736, "logps/chosen": -189.93702697753906, "logps/rejected": -111.7522964477539, "loss": 0.6017, "rewards/accuracies": 0.0, "rewards/chosen": 3.5687973499298096, "rewards/margins": -0.8397619724273682, "rewards/rejected": 4.408559322357178, "step": 10020 }, { "epoch": 1.63, "learning_rate": 2.8690583830595567e-07, "logits/chosen": -0.8917009234428406, "logits/rejected": -0.8559615015983582, "logps/chosen": -93.80340576171875, "logps/rejected": -95.37940979003906, "loss": 0.7963, "rewards/accuracies": 0.0, "rewards/chosen": 1.0477981567382812, "rewards/margins": -0.03374183177947998, "rewards/rejected": 1.0815399885177612, "step": 10021 }, { "epoch": 1.63, "learning_rate": 2.8678695372655495e-07, "logits/chosen": -0.762466311454773, "logits/rejected": -0.7453525066375732, "logps/chosen": -51.208534240722656, "logps/rejected": -83.62898254394531, "loss": 2.0021, "rewards/accuracies": 0.0, "rewards/chosen": 1.7246071100234985, "rewards/margins": -1.5135794878005981, "rewards/rejected": 3.2381865978240967, "step": 10022 }, { "epoch": 1.63, "learning_rate": 2.8666808387815833e-07, "logits/chosen": -0.6477649807929993, "logits/rejected": -0.6255849599838257, "logps/chosen": -55.8704719543457, "logps/rejected": -93.58612823486328, "loss": 0.502, "rewards/accuracies": 1.0, "rewards/chosen": 1.6739376783370972, "rewards/margins": 1.3256733417510986, "rewards/rejected": 0.34826430678367615, "step": 10023 }, { "epoch": 1.63, "learning_rate": 2.865492287689787e-07, "logits/chosen": -0.6339569091796875, "logits/rejected": -0.6786782741546631, "logps/chosen": -81.7426528930664, "logps/rejected": -92.81561279296875, "loss": 2.0624, "rewards/accuracies": 0.0, "rewards/chosen": 1.5499076843261719, "rewards/margins": -2.943317413330078, "rewards/rejected": 4.49322509765625, "step": 10024 }, { "epoch": 1.63, "learning_rate": 2.864303884072275e-07, "logits/chosen": -0.45122992992401123, "logits/rejected": -0.3722221851348877, "logps/chosen": -63.81535339355469, "logps/rejected": -64.1638412475586, "loss": 0.3544, "rewards/accuracies": 1.0, "rewards/chosen": 2.5035400390625, "rewards/margins": 0.4801292419433594, "rewards/rejected": 2.0234107971191406, "step": 10025 }, { "epoch": 1.63, "learning_rate": 2.863115628011158e-07, "logits/chosen": -0.6562327742576599, "logits/rejected": -0.5301302671432495, "logps/chosen": -115.37931823730469, "logps/rejected": -56.060401916503906, "loss": 0.5601, "rewards/accuracies": 0.0, "rewards/chosen": 1.5696274042129517, "rewards/margins": -0.36029279232025146, "rewards/rejected": 1.9299201965332031, "step": 10026 }, { "epoch": 1.63, "learning_rate": 2.8619275195885307e-07, "logits/chosen": -0.7334392070770264, "logits/rejected": -0.692955493927002, "logps/chosen": -42.104736328125, "logps/rejected": -30.514507293701172, "loss": 0.9462, "rewards/accuracies": 1.0, "rewards/chosen": 1.4730370044708252, "rewards/margins": 0.24923217296600342, "rewards/rejected": 1.2238048315048218, "step": 10027 }, { "epoch": 1.63, "learning_rate": 2.860739558886482e-07, "logits/chosen": -0.9374143481254578, "logits/rejected": -0.9389035105705261, "logps/chosen": -70.58915710449219, "logps/rejected": -77.54052734375, "loss": 1.6462, "rewards/accuracies": 0.0, "rewards/chosen": 2.567803144454956, "rewards/margins": -0.2578103542327881, "rewards/rejected": 2.825613498687744, "step": 10028 }, { "epoch": 1.63, "learning_rate": 2.8595517459870863e-07, "logits/chosen": -0.4756408929824829, "logits/rejected": -0.4756408929824829, "logps/chosen": -54.79787826538086, "logps/rejected": -54.79787826538086, "loss": 0.8474, "rewards/accuracies": 0.0, "rewards/chosen": 2.314798355102539, "rewards/margins": 0.0, "rewards/rejected": 2.314798355102539, "step": 10029 }, { "epoch": 1.63, "learning_rate": 2.858364080972414e-07, "logits/chosen": -0.8023340702056885, "logits/rejected": -0.6103033423423767, "logps/chosen": -168.6833038330078, "logps/rejected": -168.19290161132812, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 5.965965270996094, "rewards/margins": 1.7434401512145996, "rewards/rejected": 4.222525119781494, "step": 10030 }, { "epoch": 1.63, "learning_rate": 2.857176563924518e-07, "logits/chosen": -0.8964879512786865, "logits/rejected": -0.8701475262641907, "logps/chosen": -73.89997863769531, "logps/rejected": -93.89079284667969, "loss": 0.806, "rewards/accuracies": 1.0, "rewards/chosen": 2.019091844558716, "rewards/margins": 0.6460151672363281, "rewards/rejected": 1.3730766773223877, "step": 10031 }, { "epoch": 1.63, "learning_rate": 2.855989194925447e-07, "logits/chosen": -0.8966856598854065, "logits/rejected": -0.8129490613937378, "logps/chosen": -163.7115936279297, "logps/rejected": -36.17084884643555, "loss": 0.7657, "rewards/accuracies": 1.0, "rewards/chosen": 4.762925624847412, "rewards/margins": 3.5382204055786133, "rewards/rejected": 1.2247051000595093, "step": 10032 }, { "epoch": 1.63, "learning_rate": 2.854801974057234e-07, "logits/chosen": -0.10078948736190796, "logits/rejected": -0.10662949085235596, "logps/chosen": -2.5352871417999268, "logps/rejected": -9.10289478302002, "loss": 0.6964, "rewards/accuracies": 1.0, "rewards/chosen": 0.26088225841522217, "rewards/margins": 0.273037314414978, "rewards/rejected": -0.01215505599975586, "step": 10033 }, { "epoch": 1.63, "learning_rate": 2.8536149014019085e-07, "logits/chosen": -0.5697203278541565, "logits/rejected": -0.5544693470001221, "logps/chosen": -77.8647689819336, "logps/rejected": -89.64207458496094, "loss": 0.7559, "rewards/accuracies": 0.0, "rewards/chosen": 2.283348798751831, "rewards/margins": -0.583197832107544, "rewards/rejected": 2.866546630859375, "step": 10034 }, { "epoch": 1.63, "learning_rate": 2.852427977041483e-07, "logits/chosen": -0.4880387485027313, "logits/rejected": -0.5049534440040588, "logps/chosen": -65.61532592773438, "logps/rejected": -83.44679260253906, "loss": 0.3505, "rewards/accuracies": 1.0, "rewards/chosen": 1.6079682111740112, "rewards/margins": 0.9799789786338806, "rewards/rejected": 0.6279892325401306, "step": 10035 }, { "epoch": 1.63, "learning_rate": 2.8512412010579644e-07, "logits/chosen": -0.6031450033187866, "logits/rejected": -0.4155689775943756, "logps/chosen": -86.20848083496094, "logps/rejected": -24.490478515625, "loss": 0.5043, "rewards/accuracies": 1.0, "rewards/chosen": 2.893951416015625, "rewards/margins": 2.5918033123016357, "rewards/rejected": 0.30214807391166687, "step": 10036 }, { "epoch": 1.63, "learning_rate": 2.850054573533345e-07, "logits/chosen": -0.5645730495452881, "logits/rejected": -0.5343279838562012, "logps/chosen": -89.45207214355469, "logps/rejected": -62.10196304321289, "loss": 0.2552, "rewards/accuracies": 1.0, "rewards/chosen": 2.1137619018554688, "rewards/margins": 0.41842687129974365, "rewards/rejected": 1.695335030555725, "step": 10037 }, { "epoch": 1.63, "learning_rate": 2.848868094549615e-07, "logits/chosen": -0.31871941685676575, "logits/rejected": -0.21816527843475342, "logps/chosen": -38.445518493652344, "logps/rejected": -67.11334991455078, "loss": 0.3422, "rewards/accuracies": 1.0, "rewards/chosen": 1.6175235509872437, "rewards/margins": 0.6121867895126343, "rewards/rejected": 1.0053367614746094, "step": 10038 }, { "epoch": 1.63, "learning_rate": 2.847681764188742e-07, "logits/chosen": -0.8276309370994568, "logits/rejected": -0.8338069319725037, "logps/chosen": -69.39938354492188, "logps/rejected": -88.6065673828125, "loss": 1.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.5474258661270142, "rewards/margins": 1.3817291259765625, "rewards/rejected": 0.16569672524929047, "step": 10039 }, { "epoch": 1.63, "learning_rate": 2.8464955825326966e-07, "logits/chosen": -0.6910938024520874, "logits/rejected": -0.8582078814506531, "logps/chosen": -107.6514892578125, "logps/rejected": -167.7620849609375, "loss": 2.5414, "rewards/accuracies": 0.0, "rewards/chosen": 2.6744401454925537, "rewards/margins": -4.606462478637695, "rewards/rejected": 7.28090238571167, "step": 10040 }, { "epoch": 1.63, "learning_rate": 2.845309549663427e-07, "logits/chosen": -0.9158151149749756, "logits/rejected": -0.8195784687995911, "logps/chosen": -242.44097900390625, "logps/rejected": -54.7945556640625, "loss": 0.6983, "rewards/accuracies": 1.0, "rewards/chosen": 3.0496888160705566, "rewards/margins": 1.5778900384902954, "rewards/rejected": 1.4717987775802612, "step": 10041 }, { "epoch": 1.63, "learning_rate": 2.8441236656628827e-07, "logits/chosen": -0.582987368106842, "logits/rejected": -0.4445038437843323, "logps/chosen": -61.91356658935547, "logps/rejected": -24.07175064086914, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 1.7618995904922485, "rewards/margins": 1.4415696859359741, "rewards/rejected": 0.320329874753952, "step": 10042 }, { "epoch": 1.63, "learning_rate": 2.842937930612991e-07, "logits/chosen": -0.5417729020118713, "logits/rejected": -0.5413557291030884, "logps/chosen": -2.1030192375183105, "logps/rejected": -20.510379791259766, "loss": 0.6482, "rewards/accuracies": 1.0, "rewards/chosen": 0.26704201102256775, "rewards/margins": 0.14742779731750488, "rewards/rejected": 0.11961422115564346, "step": 10043 }, { "epoch": 1.63, "learning_rate": 2.84175234459568e-07, "logits/chosen": -0.906704306602478, "logits/rejected": -1.0001145601272583, "logps/chosen": -132.40744018554688, "logps/rejected": -195.53875732421875, "loss": 1.062, "rewards/accuracies": 0.0, "rewards/chosen": 0.45462799072265625, "rewards/margins": -1.253193736076355, "rewards/rejected": 1.7078217267990112, "step": 10044 }, { "epoch": 1.63, "learning_rate": 2.8405669076928594e-07, "logits/chosen": -0.8038773536682129, "logits/rejected": -0.8545989394187927, "logps/chosen": -286.82135009765625, "logps/rejected": -137.5677490234375, "loss": 1.9005, "rewards/accuracies": 0.0, "rewards/chosen": 3.2494142055511475, "rewards/margins": -2.395848035812378, "rewards/rejected": 5.645262241363525, "step": 10045 }, { "epoch": 1.63, "learning_rate": 2.839381619986434e-07, "logits/chosen": -0.6853578686714172, "logits/rejected": -0.6853578686714172, "logps/chosen": -5.800072193145752, "logps/rejected": -5.800072193145752, "loss": 0.6136, "rewards/accuracies": 0.0, "rewards/chosen": 0.23271393775939941, "rewards/margins": 0.0, "rewards/rejected": 0.23271393775939941, "step": 10046 }, { "epoch": 1.63, "learning_rate": 2.8381964815582934e-07, "logits/chosen": -0.948836088180542, "logits/rejected": -0.9369066953659058, "logps/chosen": -61.624176025390625, "logps/rejected": -48.50092315673828, "loss": 0.4509, "rewards/accuracies": 0.0, "rewards/chosen": 2.096566915512085, "rewards/margins": -0.23209214210510254, "rewards/rejected": 2.3286590576171875, "step": 10047 }, { "epoch": 1.63, "learning_rate": 2.837011492490322e-07, "logits/chosen": -0.8631110787391663, "logits/rejected": -0.9574987292289734, "logps/chosen": -234.05711364746094, "logps/rejected": -81.80073547363281, "loss": 0.0691, "rewards/accuracies": 1.0, "rewards/chosen": 4.573692321777344, "rewards/margins": 2.6813950538635254, "rewards/rejected": 1.892297387123108, "step": 10048 }, { "epoch": 1.63, "learning_rate": 2.8358266528643884e-07, "logits/chosen": -0.6134449243545532, "logits/rejected": -0.6498551964759827, "logps/chosen": -52.3017578125, "logps/rejected": -88.43939208984375, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 2.1603362560272217, "rewards/margins": 0.6752761602401733, "rewards/rejected": 1.4850600957870483, "step": 10049 }, { "epoch": 1.63, "learning_rate": 2.8346419627623576e-07, "logits/chosen": -0.7381386756896973, "logits/rejected": -0.7359922528266907, "logps/chosen": -72.12371826171875, "logps/rejected": -93.21311950683594, "loss": 0.7779, "rewards/accuracies": 0.0, "rewards/chosen": 2.5603065490722656, "rewards/margins": -0.1525254249572754, "rewards/rejected": 2.712831974029541, "step": 10050 }, { "epoch": 1.63, "learning_rate": 2.8334574222660764e-07, "logits/chosen": -0.8160915970802307, "logits/rejected": -0.6502057909965515, "logps/chosen": -115.97805786132812, "logps/rejected": -70.18449401855469, "loss": 0.2062, "rewards/accuracies": 1.0, "rewards/chosen": 4.727611064910889, "rewards/margins": 2.630368947982788, "rewards/rejected": 2.0972421169281006, "step": 10051 }, { "epoch": 1.63, "learning_rate": 2.832273031457389e-07, "logits/chosen": -0.47911012172698975, "logits/rejected": -0.4874955713748932, "logps/chosen": -66.91194915771484, "logps/rejected": -96.18533325195312, "loss": 0.5688, "rewards/accuracies": 1.0, "rewards/chosen": 0.6559608578681946, "rewards/margins": 0.1173660159111023, "rewards/rejected": 0.5385948419570923, "step": 10052 }, { "epoch": 1.63, "learning_rate": 2.831088790418122e-07, "logits/chosen": -0.35266897082328796, "logits/rejected": -0.4013555347919464, "logps/chosen": -28.89727210998535, "logps/rejected": -56.94738006591797, "loss": 0.7183, "rewards/accuracies": 1.0, "rewards/chosen": 1.4824209213256836, "rewards/margins": 0.2796834707260132, "rewards/rejected": 1.2027374505996704, "step": 10053 }, { "epoch": 1.63, "learning_rate": 2.829904699230099e-07, "logits/chosen": -0.5509078502655029, "logits/rejected": -0.5389479398727417, "logps/chosen": -23.397857666015625, "logps/rejected": -5.024087429046631, "loss": 1.022, "rewards/accuracies": 0.0, "rewards/chosen": 0.016275405883789062, "rewards/margins": -0.18830929696559906, "rewards/rejected": 0.20458470284938812, "step": 10054 }, { "epoch": 1.63, "learning_rate": 2.828720757975126e-07, "logits/chosen": -0.5487302541732788, "logits/rejected": -0.5086220502853394, "logps/chosen": -78.26826477050781, "logps/rejected": -56.3858642578125, "loss": 1.6729, "rewards/accuracies": 0.0, "rewards/chosen": 0.3232834041118622, "rewards/margins": -1.6439483165740967, "rewards/rejected": 1.9672317504882812, "step": 10055 }, { "epoch": 1.63, "learning_rate": 2.827536966735006e-07, "logits/chosen": -0.7308129668235779, "logits/rejected": -0.6096221208572388, "logps/chosen": -91.71243286132812, "logps/rejected": -112.957275390625, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 3.8387222290039062, "rewards/margins": 2.041372776031494, "rewards/rejected": 1.7973495721817017, "step": 10056 }, { "epoch": 1.63, "learning_rate": 2.826353325591523e-07, "logits/chosen": -0.8874689936637878, "logits/rejected": -0.7832546234130859, "logps/chosen": -120.39309692382812, "logps/rejected": -64.9447021484375, "loss": 0.9182, "rewards/accuracies": 1.0, "rewards/chosen": 5.428341865539551, "rewards/margins": 3.312317132949829, "rewards/rejected": 2.1160247325897217, "step": 10057 }, { "epoch": 1.63, "learning_rate": 2.82516983462646e-07, "logits/chosen": -0.6796014308929443, "logits/rejected": -0.6820530295372009, "logps/chosen": -40.25459289550781, "logps/rejected": -11.59737777709961, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 1.244334101676941, "rewards/margins": 0.3365609049797058, "rewards/rejected": 0.9077731966972351, "step": 10058 }, { "epoch": 1.63, "learning_rate": 2.8239864939215805e-07, "logits/chosen": -0.7416492104530334, "logits/rejected": -0.694399893283844, "logps/chosen": -109.76250457763672, "logps/rejected": -65.69902038574219, "loss": 0.5917, "rewards/accuracies": 0.0, "rewards/chosen": 0.5478691458702087, "rewards/margins": -0.7538276314735413, "rewards/rejected": 1.30169677734375, "step": 10059 }, { "epoch": 1.63, "learning_rate": 2.822803303558646e-07, "logits/chosen": -0.6616299748420715, "logits/rejected": -0.6750956773757935, "logps/chosen": -120.65074157714844, "logps/rejected": -55.28472900390625, "loss": 2.2453, "rewards/accuracies": 1.0, "rewards/chosen": 2.4454712867736816, "rewards/margins": 0.4641815423965454, "rewards/rejected": 1.9812897443771362, "step": 10060 }, { "epoch": 1.63, "learning_rate": 2.821620263619403e-07, "logits/chosen": -0.41177788376808167, "logits/rejected": -0.47144198417663574, "logps/chosen": -69.64899444580078, "logps/rejected": -84.57965087890625, "loss": 0.2155, "rewards/accuracies": 1.0, "rewards/chosen": 2.4104485511779785, "rewards/margins": 0.7137161493301392, "rewards/rejected": 1.6967324018478394, "step": 10061 }, { "epoch": 1.63, "learning_rate": 2.820437374185587e-07, "logits/chosen": -0.21725207567214966, "logits/rejected": -0.32044512033462524, "logps/chosen": -58.62028884887695, "logps/rejected": -75.12255859375, "loss": 0.6294, "rewards/accuracies": 0.0, "rewards/chosen": 1.2003697156906128, "rewards/margins": -0.7904658317565918, "rewards/rejected": 1.9908355474472046, "step": 10062 }, { "epoch": 1.63, "learning_rate": 2.8192546353389264e-07, "logits/chosen": -0.7453104853630066, "logits/rejected": -0.6022632122039795, "logps/chosen": -99.25444793701172, "logps/rejected": -92.97805786132812, "loss": 0.2615, "rewards/accuracies": 1.0, "rewards/chosen": 4.381237030029297, "rewards/margins": 1.7697196006774902, "rewards/rejected": 2.6115174293518066, "step": 10063 }, { "epoch": 1.63, "learning_rate": 2.818072047161134e-07, "logits/chosen": -0.2920653522014618, "logits/rejected": -0.2920653522014618, "logps/chosen": -48.2219352722168, "logps/rejected": -48.2219352722168, "loss": 0.3522, "rewards/accuracies": 0.0, "rewards/chosen": 1.0765732526779175, "rewards/margins": 0.0, "rewards/rejected": 1.0765732526779175, "step": 10064 }, { "epoch": 1.63, "learning_rate": 2.81688960973392e-07, "logits/chosen": -0.9267829656600952, "logits/rejected": -0.9513512849807739, "logps/chosen": -184.81741333007812, "logps/rejected": -98.23204040527344, "loss": 0.0846, "rewards/accuracies": 1.0, "rewards/chosen": 3.437229871749878, "rewards/margins": 1.8553420305252075, "rewards/rejected": 1.5818878412246704, "step": 10065 }, { "epoch": 1.63, "learning_rate": 2.815707323138975e-07, "logits/chosen": -0.4151308834552765, "logits/rejected": -0.3212558627128601, "logps/chosen": -134.22740173339844, "logps/rejected": -43.70695495605469, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 3.1117324829101562, "rewards/margins": 1.5179862976074219, "rewards/rejected": 1.5937461853027344, "step": 10066 }, { "epoch": 1.63, "learning_rate": 2.8145251874579885e-07, "logits/chosen": -1.0411328077316284, "logits/rejected": -1.0085854530334473, "logps/chosen": -59.41913986206055, "logps/rejected": -36.0899772644043, "loss": 0.4181, "rewards/accuracies": 1.0, "rewards/chosen": 0.9353305697441101, "rewards/margins": 0.7739936709403992, "rewards/rejected": 0.16133689880371094, "step": 10067 }, { "epoch": 1.63, "learning_rate": 2.81334320277263e-07, "logits/chosen": -0.7709378004074097, "logits/rejected": -0.6862181425094604, "logps/chosen": -102.83059692382812, "logps/rejected": -63.01064682006836, "loss": 0.4682, "rewards/accuracies": 0.0, "rewards/chosen": 0.4715118408203125, "rewards/margins": -0.37407875061035156, "rewards/rejected": 0.8455905914306641, "step": 10068 }, { "epoch": 1.63, "learning_rate": 2.8121613691645676e-07, "logits/chosen": -0.8134069442749023, "logits/rejected": -0.7760568857192993, "logps/chosen": -109.32705688476562, "logps/rejected": -11.057682991027832, "loss": 0.5205, "rewards/accuracies": 1.0, "rewards/chosen": 1.4188461303710938, "rewards/margins": 0.6023917198181152, "rewards/rejected": 0.8164544105529785, "step": 10069 }, { "epoch": 1.63, "learning_rate": 2.810979686715451e-07, "logits/chosen": -0.5588577389717102, "logits/rejected": -0.5752744674682617, "logps/chosen": -105.05950164794922, "logps/rejected": -60.81819152832031, "loss": 0.4906, "rewards/accuracies": 0.0, "rewards/chosen": 0.8125274777412415, "rewards/margins": -0.5085663199424744, "rewards/rejected": 1.3210937976837158, "step": 10070 }, { "epoch": 1.63, "learning_rate": 2.8097981555069284e-07, "logits/chosen": -0.6024788618087769, "logits/rejected": -0.47209107875823975, "logps/chosen": -61.86212921142578, "logps/rejected": -42.749969482421875, "loss": 1.3675, "rewards/accuracies": 0.0, "rewards/chosen": 1.4087104797363281, "rewards/margins": -1.4831504821777344, "rewards/rejected": 2.8918609619140625, "step": 10071 }, { "epoch": 1.63, "learning_rate": 2.808616775620626e-07, "logits/chosen": -0.5462688207626343, "logits/rejected": -0.5474065542221069, "logps/chosen": -18.53103256225586, "logps/rejected": -3.649003505706787, "loss": 2.1289, "rewards/accuracies": 1.0, "rewards/chosen": 1.1270828247070312, "rewards/margins": 0.8329179286956787, "rewards/rejected": 0.29416486620903015, "step": 10072 }, { "epoch": 1.63, "learning_rate": 2.807435547138172e-07, "logits/chosen": -0.34531962871551514, "logits/rejected": -0.3727893829345703, "logps/chosen": -68.94384765625, "logps/rejected": -41.68880844116211, "loss": 0.2275, "rewards/accuracies": 1.0, "rewards/chosen": 1.4439315795898438, "rewards/margins": 0.6194003820419312, "rewards/rejected": 0.8245311975479126, "step": 10073 }, { "epoch": 1.64, "learning_rate": 2.806254470141174e-07, "logits/chosen": -0.2848392426967621, "logits/rejected": -0.2848392426967621, "logps/chosen": -20.420207977294922, "logps/rejected": -20.420207977294922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.19481296837329865, "rewards/margins": 0.0, "rewards/rejected": 0.19481296837329865, "step": 10074 }, { "epoch": 1.64, "learning_rate": 2.805073544711236e-07, "logits/chosen": -1.1418449878692627, "logits/rejected": -1.1145715713500977, "logps/chosen": -115.39677429199219, "logps/rejected": -119.33412170410156, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 2.0356414318084717, "rewards/margins": 0.9784407615661621, "rewards/rejected": 1.0572006702423096, "step": 10075 }, { "epoch": 1.64, "learning_rate": 2.8038927709299454e-07, "logits/chosen": -0.31473642587661743, "logits/rejected": -0.30098995566368103, "logps/chosen": -3.6544408798217773, "logps/rejected": -34.03746032714844, "loss": 0.4685, "rewards/accuracies": 1.0, "rewards/chosen": 0.35446712374687195, "rewards/margins": 0.4998086094856262, "rewards/rejected": -0.14534150063991547, "step": 10076 }, { "epoch": 1.64, "learning_rate": 2.8027121488788865e-07, "logits/chosen": -0.7665729522705078, "logits/rejected": -0.515513002872467, "logps/chosen": -140.76531982421875, "logps/rejected": -48.67417907714844, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 5.189853191375732, "rewards/margins": 4.5667853355407715, "rewards/rejected": 0.6230678558349609, "step": 10077 }, { "epoch": 1.64, "learning_rate": 2.8015316786396257e-07, "logits/chosen": -0.2672126293182373, "logits/rejected": -0.2534796893596649, "logps/chosen": -28.87294578552246, "logps/rejected": -1.7604025602340698, "loss": 0.4668, "rewards/accuracies": 0.0, "rewards/chosen": 0.0478641502559185, "rewards/margins": -0.31891921162605286, "rewards/rejected": 0.36678335070610046, "step": 10078 }, { "epoch": 1.64, "learning_rate": 2.8003513602937256e-07, "logits/chosen": -0.5145337581634521, "logits/rejected": -0.5130541920661926, "logps/chosen": -2.010871171951294, "logps/rejected": -3.0520107746124268, "loss": 0.3815, "rewards/accuracies": 0.0, "rewards/chosen": 0.34889718890190125, "rewards/margins": -0.10880687832832336, "rewards/rejected": 0.4577040672302246, "step": 10079 }, { "epoch": 1.64, "learning_rate": 2.7991711939227315e-07, "logits/chosen": -0.880114734172821, "logits/rejected": -0.880114734172821, "logps/chosen": -45.715248107910156, "logps/rejected": -45.715248107910156, "loss": 0.4843, "rewards/accuracies": 0.0, "rewards/chosen": 1.7904132604599, "rewards/margins": 0.0, "rewards/rejected": 1.7904132604599, "step": 10080 }, { "epoch": 1.64, "learning_rate": 2.7979911796081846e-07, "logits/chosen": -0.7341402769088745, "logits/rejected": -0.7777371406555176, "logps/chosen": -93.9149169921875, "logps/rejected": -110.24978637695312, "loss": 2.526, "rewards/accuracies": 0.0, "rewards/chosen": 0.781507134437561, "rewards/margins": -1.9475089311599731, "rewards/rejected": 2.729016065597534, "step": 10081 }, { "epoch": 1.64, "learning_rate": 2.79681131743161e-07, "logits/chosen": -0.9607948660850525, "logits/rejected": -0.9140536785125732, "logps/chosen": -108.94597625732422, "logps/rejected": -61.55500793457031, "loss": 0.5812, "rewards/accuracies": 0.0, "rewards/chosen": 0.8482994437217712, "rewards/margins": -0.5733703970909119, "rewards/rejected": 1.421669840812683, "step": 10082 }, { "epoch": 1.64, "learning_rate": 2.795631607474529e-07, "logits/chosen": -0.7453484535217285, "logits/rejected": -0.7657610774040222, "logps/chosen": -141.0128173828125, "logps/rejected": -110.79432678222656, "loss": 1.1591, "rewards/accuracies": 0.0, "rewards/chosen": 0.0026794434525072575, "rewards/margins": -0.43530428409576416, "rewards/rejected": 0.4379837214946747, "step": 10083 }, { "epoch": 1.64, "learning_rate": 2.794452049818444e-07, "logits/chosen": -0.6771910786628723, "logits/rejected": -0.6361830234527588, "logps/chosen": -86.73431396484375, "logps/rejected": -111.80127716064453, "loss": 0.7436, "rewards/accuracies": 1.0, "rewards/chosen": 1.4926162958145142, "rewards/margins": 1.5159568786621094, "rewards/rejected": -0.023340607061982155, "step": 10084 }, { "epoch": 1.64, "learning_rate": 2.7932726445448546e-07, "logits/chosen": -0.8505943417549133, "logits/rejected": -0.8426414728164673, "logps/chosen": -104.50645446777344, "logps/rejected": -110.53279113769531, "loss": 2.216, "rewards/accuracies": 0.0, "rewards/chosen": 3.633885145187378, "rewards/margins": -3.4041202068328857, "rewards/rejected": 7.038005352020264, "step": 10085 }, { "epoch": 1.64, "learning_rate": 2.792093391735244e-07, "logits/chosen": -0.7716634273529053, "logits/rejected": -0.7259931564331055, "logps/chosen": -39.68655776977539, "logps/rejected": -67.43463897705078, "loss": 2.4722, "rewards/accuracies": 0.0, "rewards/chosen": 2.222768783569336, "rewards/margins": -0.44374513626098633, "rewards/rejected": 2.6665139198303223, "step": 10086 }, { "epoch": 1.64, "learning_rate": 2.7909142914710897e-07, "logits/chosen": -0.5424940586090088, "logits/rejected": -0.4467063546180725, "logps/chosen": -53.56855773925781, "logps/rejected": -54.143272399902344, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 1.0843902826309204, "rewards/margins": 1.0302772521972656, "rewards/rejected": 0.054113008081912994, "step": 10087 }, { "epoch": 1.64, "learning_rate": 2.789735343833854e-07, "logits/chosen": -0.7528747320175171, "logits/rejected": -0.5812718272209167, "logps/chosen": -109.61978149414062, "logps/rejected": -57.94886016845703, "loss": 0.1031, "rewards/accuracies": 1.0, "rewards/chosen": 5.587301731109619, "rewards/margins": 4.233895778656006, "rewards/rejected": 1.3534058332443237, "step": 10088 }, { "epoch": 1.64, "learning_rate": 2.7885565489049946e-07, "logits/chosen": -0.5552841424942017, "logits/rejected": -0.5673182606697083, "logps/chosen": -61.87720489501953, "logps/rejected": -243.3690643310547, "loss": 2.4461, "rewards/accuracies": 0.0, "rewards/chosen": 1.7070754766464233, "rewards/margins": -4.880551338195801, "rewards/rejected": 6.587626934051514, "step": 10089 }, { "epoch": 1.64, "learning_rate": 2.78737790676595e-07, "logits/chosen": -0.5320347547531128, "logits/rejected": -0.4506435990333557, "logps/chosen": -135.3143310546875, "logps/rejected": -48.38520050048828, "loss": 0.2712, "rewards/accuracies": 1.0, "rewards/chosen": 3.08062744140625, "rewards/margins": 1.6295878887176514, "rewards/rejected": 1.4510395526885986, "step": 10090 }, { "epoch": 1.64, "learning_rate": 2.7861994174981584e-07, "logits/chosen": -0.1024768203496933, "logits/rejected": -0.1024768203496933, "logps/chosen": -1.564927101135254, "logps/rejected": -1.564927101135254, "loss": 0.9968, "rewards/accuracies": 0.0, "rewards/chosen": 0.18085013329982758, "rewards/margins": 0.0, "rewards/rejected": 0.18085013329982758, "step": 10091 }, { "epoch": 1.64, "learning_rate": 2.7850210811830375e-07, "logits/chosen": -0.7900515198707581, "logits/rejected": -0.6804938316345215, "logps/chosen": -92.0704574584961, "logps/rejected": -154.75352478027344, "loss": 0.3377, "rewards/accuracies": 1.0, "rewards/chosen": 3.924614667892456, "rewards/margins": 0.36459875106811523, "rewards/rejected": 3.560015916824341, "step": 10092 }, { "epoch": 1.64, "learning_rate": 2.783842897902004e-07, "logits/chosen": -0.8462619781494141, "logits/rejected": -0.789519190788269, "logps/chosen": -145.98793029785156, "logps/rejected": -149.64004516601562, "loss": 0.2769, "rewards/accuracies": 1.0, "rewards/chosen": 4.774667263031006, "rewards/margins": 0.34926414489746094, "rewards/rejected": 4.425403118133545, "step": 10093 }, { "epoch": 1.64, "learning_rate": 2.782664867736455e-07, "logits/chosen": -0.5987486243247986, "logits/rejected": -0.5987486243247986, "logps/chosen": -78.87599182128906, "logps/rejected": -78.87599182128906, "loss": 0.713, "rewards/accuracies": 0.0, "rewards/chosen": 2.141925096511841, "rewards/margins": 0.0, "rewards/rejected": 2.141925096511841, "step": 10094 }, { "epoch": 1.64, "learning_rate": 2.781486990767783e-07, "logits/chosen": -0.753909707069397, "logits/rejected": -0.753909707069397, "logps/chosen": -54.48716735839844, "logps/rejected": -54.48716735839844, "loss": 0.406, "rewards/accuracies": 0.0, "rewards/chosen": 1.1704437732696533, "rewards/margins": 0.0, "rewards/rejected": 1.1704437732696533, "step": 10095 }, { "epoch": 1.64, "learning_rate": 2.78030926707737e-07, "logits/chosen": -0.6732694506645203, "logits/rejected": -0.6449331045150757, "logps/chosen": -81.73324584960938, "logps/rejected": -95.338134765625, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": 3.6832115650177, "rewards/margins": 0.17096471786499023, "rewards/rejected": 3.51224684715271, "step": 10096 }, { "epoch": 1.64, "learning_rate": 2.7791316967465813e-07, "logits/chosen": -0.9260489344596863, "logits/rejected": -0.8772786855697632, "logps/chosen": -112.24150848388672, "logps/rejected": -78.07307434082031, "loss": 0.9162, "rewards/accuracies": 0.0, "rewards/chosen": 0.8483161926269531, "rewards/margins": -1.4495506286621094, "rewards/rejected": 2.2978668212890625, "step": 10097 }, { "epoch": 1.64, "learning_rate": 2.77795427985678e-07, "logits/chosen": -1.0868738889694214, "logits/rejected": -1.136876106262207, "logps/chosen": -85.13125610351562, "logps/rejected": -204.64312744140625, "loss": 1.2768, "rewards/accuracies": 0.0, "rewards/chosen": 5.246638774871826, "rewards/margins": -2.4693708419799805, "rewards/rejected": 7.716009616851807, "step": 10098 }, { "epoch": 1.64, "learning_rate": 2.7767770164893117e-07, "logits/chosen": -0.5565590262413025, "logits/rejected": -0.4611060321331024, "logps/chosen": -46.24236297607422, "logps/rejected": -102.78074645996094, "loss": 1.2728, "rewards/accuracies": 1.0, "rewards/chosen": 2.263603925704956, "rewards/margins": 0.11324834823608398, "rewards/rejected": 2.150355577468872, "step": 10099 }, { "epoch": 1.64, "learning_rate": 2.775599906725517e-07, "logits/chosen": -0.9112427830696106, "logits/rejected": -0.9112427830696106, "logps/chosen": -55.84915542602539, "logps/rejected": -55.84915542602539, "loss": 0.5733, "rewards/accuracies": 0.0, "rewards/chosen": 0.11820144951343536, "rewards/margins": 0.0, "rewards/rejected": 0.11820144951343536, "step": 10100 }, { "epoch": 1.64, "learning_rate": 2.774422950646719e-07, "logits/chosen": -0.784633457660675, "logits/rejected": -0.7766156196594238, "logps/chosen": -94.27528381347656, "logps/rejected": -72.84703826904297, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 0.8733276724815369, "rewards/margins": 0.07619786262512207, "rewards/rejected": 0.7971298098564148, "step": 10101 }, { "epoch": 1.64, "learning_rate": 2.773246148334239e-07, "logits/chosen": -0.6074498891830444, "logits/rejected": -0.6048699021339417, "logps/chosen": -93.91752624511719, "logps/rejected": -76.81008911132812, "loss": 0.9142, "rewards/accuracies": 1.0, "rewards/chosen": 1.8117812871932983, "rewards/margins": 0.6997847557067871, "rewards/rejected": 1.1119965314865112, "step": 10102 }, { "epoch": 1.64, "learning_rate": 2.7720694998693783e-07, "logits/chosen": -1.0098631381988525, "logits/rejected": -0.8796539306640625, "logps/chosen": -64.77704620361328, "logps/rejected": -14.136568069458008, "loss": 0.2526, "rewards/accuracies": 1.0, "rewards/chosen": 1.3283737897872925, "rewards/margins": 0.5542194843292236, "rewards/rejected": 0.7741543054580688, "step": 10103 }, { "epoch": 1.64, "learning_rate": 2.7708930053334356e-07, "logits/chosen": -0.9486338496208191, "logits/rejected": -0.9208593368530273, "logps/chosen": -41.941810607910156, "logps/rejected": -20.380815505981445, "loss": 1.2382, "rewards/accuracies": 1.0, "rewards/chosen": 1.2000519037246704, "rewards/margins": 0.8593729138374329, "rewards/rejected": 0.34067898988723755, "step": 10104 }, { "epoch": 1.64, "learning_rate": 2.769716664807693e-07, "logits/chosen": -0.6963905692100525, "logits/rejected": -0.7215514779090881, "logps/chosen": -130.05889892578125, "logps/rejected": -140.75006103515625, "loss": 0.4996, "rewards/accuracies": 0.0, "rewards/chosen": 3.8812928199768066, "rewards/margins": -0.4622344970703125, "rewards/rejected": 4.343527317047119, "step": 10105 }, { "epoch": 1.64, "learning_rate": 2.7685404783734274e-07, "logits/chosen": -0.8978268504142761, "logits/rejected": -0.8629526495933533, "logps/chosen": -174.63632202148438, "logps/rejected": -100.87118530273438, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 5.104290962219238, "rewards/margins": 2.9344332218170166, "rewards/rejected": 2.1698577404022217, "step": 10106 }, { "epoch": 1.64, "learning_rate": 2.7673644461119006e-07, "logits/chosen": -0.7891187071800232, "logits/rejected": -0.7960129976272583, "logps/chosen": -71.29795837402344, "logps/rejected": -73.94917297363281, "loss": 1.0848, "rewards/accuracies": 0.0, "rewards/chosen": 2.143458604812622, "rewards/margins": -0.44681406021118164, "rewards/rejected": 2.5902726650238037, "step": 10107 }, { "epoch": 1.64, "learning_rate": 2.766188568104365e-07, "logits/chosen": -0.5345566868782043, "logits/rejected": -0.49380698800086975, "logps/chosen": -105.79324340820312, "logps/rejected": -74.73362731933594, "loss": 1.2567, "rewards/accuracies": 1.0, "rewards/chosen": 4.977578639984131, "rewards/margins": 1.8149869441986084, "rewards/rejected": 3.1625916957855225, "step": 10108 }, { "epoch": 1.64, "learning_rate": 2.765012844432061e-07, "logits/chosen": -0.3990059494972229, "logits/rejected": -0.5804259181022644, "logps/chosen": -62.588233947753906, "logps/rejected": -139.98265075683594, "loss": 0.9865, "rewards/accuracies": 0.0, "rewards/chosen": 2.473679304122925, "rewards/margins": -1.6436350345611572, "rewards/rejected": 4.117314338684082, "step": 10109 }, { "epoch": 1.64, "learning_rate": 2.763837275176223e-07, "logits/chosen": -0.6669230461120605, "logits/rejected": -0.6669230461120605, "logps/chosen": -1.7746331691741943, "logps/rejected": -1.7746331691741943, "loss": 0.8015, "rewards/accuracies": 0.0, "rewards/chosen": 0.16893823444843292, "rewards/margins": 0.0, "rewards/rejected": 0.16893823444843292, "step": 10110 }, { "epoch": 1.64, "learning_rate": 2.7626618604180696e-07, "logits/chosen": -0.647828996181488, "logits/rejected": -0.647828996181488, "logps/chosen": -33.73550033569336, "logps/rejected": -33.73550033569336, "loss": 0.943, "rewards/accuracies": 0.0, "rewards/chosen": 1.5544308423995972, "rewards/margins": 0.0, "rewards/rejected": 1.5544308423995972, "step": 10111 }, { "epoch": 1.64, "learning_rate": 2.7614866002388136e-07, "logits/chosen": -1.0649524927139282, "logits/rejected": -1.0888917446136475, "logps/chosen": -128.5317840576172, "logps/rejected": -129.36199951171875, "loss": 0.7756, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997726440429688, "rewards/margins": -0.9158951044082642, "rewards/rejected": 1.815667748451233, "step": 10112 }, { "epoch": 1.64, "learning_rate": 2.7603114947196507e-07, "logits/chosen": -0.6474413871765137, "logits/rejected": -0.5428026914596558, "logps/chosen": -71.38978576660156, "logps/rejected": -35.917396545410156, "loss": 0.5238, "rewards/accuracies": 1.0, "rewards/chosen": 1.896510362625122, "rewards/margins": 1.773356318473816, "rewards/rejected": 0.12315406650304794, "step": 10113 }, { "epoch": 1.64, "learning_rate": 2.759136543941772e-07, "logits/chosen": -1.1589869260787964, "logits/rejected": -1.2508467435836792, "logps/chosen": -209.47024536132812, "logps/rejected": -224.1840057373047, "loss": 1.0127, "rewards/accuracies": 0.0, "rewards/chosen": 5.715750217437744, "rewards/margins": -1.259751796722412, "rewards/rejected": 6.975502014160156, "step": 10114 }, { "epoch": 1.64, "learning_rate": 2.757961747986355e-07, "logits/chosen": -0.8203468918800354, "logits/rejected": -0.7853786945343018, "logps/chosen": -196.44625854492188, "logps/rejected": -88.0582275390625, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 4.040280342102051, "rewards/margins": 0.36947035789489746, "rewards/rejected": 3.6708099842071533, "step": 10115 }, { "epoch": 1.64, "learning_rate": 2.756787106934565e-07, "logits/chosen": -1.024283766746521, "logits/rejected": -0.9717908501625061, "logps/chosen": -54.93470764160156, "logps/rejected": -19.97130012512207, "loss": 0.108, "rewards/accuracies": 1.0, "rewards/chosen": 2.4730324745178223, "rewards/margins": 2.0127105712890625, "rewards/rejected": 0.46032199263572693, "step": 10116 }, { "epoch": 1.64, "learning_rate": 2.7556126208675633e-07, "logits/chosen": -0.7786701917648315, "logits/rejected": -0.69947749376297, "logps/chosen": -124.62818145751953, "logps/rejected": -65.22212219238281, "loss": 0.2343, "rewards/accuracies": 1.0, "rewards/chosen": 2.6232261657714844, "rewards/margins": 0.5726470947265625, "rewards/rejected": 2.050579071044922, "step": 10117 }, { "epoch": 1.64, "learning_rate": 2.7544382898664907e-07, "logits/chosen": -0.8326452374458313, "logits/rejected": -0.8370383977890015, "logps/chosen": -13.421528816223145, "logps/rejected": -1.3863301277160645, "loss": 0.4621, "rewards/accuracies": 0.0, "rewards/chosen": -0.1992061585187912, "rewards/margins": -0.37857121229171753, "rewards/rejected": 0.17936503887176514, "step": 10118 }, { "epoch": 1.64, "learning_rate": 2.753264114012487e-07, "logits/chosen": -1.002213954925537, "logits/rejected": -1.0352360010147095, "logps/chosen": -67.91551208496094, "logps/rejected": -22.987728118896484, "loss": 0.2568, "rewards/accuracies": 1.0, "rewards/chosen": 2.084047794342041, "rewards/margins": 1.7865653038024902, "rewards/rejected": 0.2974824905395508, "step": 10119 }, { "epoch": 1.64, "learning_rate": 2.752090093386672e-07, "logits/chosen": -0.4859880805015564, "logits/rejected": -0.4449279010295868, "logps/chosen": -51.765525817871094, "logps/rejected": -17.91965675354004, "loss": 0.1825, "rewards/accuracies": 1.0, "rewards/chosen": 1.8139854669570923, "rewards/margins": 1.508305549621582, "rewards/rejected": 0.30567988753318787, "step": 10120 }, { "epoch": 1.64, "learning_rate": 2.750916228070163e-07, "logits/chosen": -0.4524051547050476, "logits/rejected": -0.5075127482414246, "logps/chosen": -58.789424896240234, "logps/rejected": -60.97086715698242, "loss": 1.3748, "rewards/accuracies": 0.0, "rewards/chosen": 1.2094157934188843, "rewards/margins": -1.2326301336288452, "rewards/rejected": 2.4420459270477295, "step": 10121 }, { "epoch": 1.64, "learning_rate": 2.7497425181440603e-07, "logits/chosen": -0.8139853477478027, "logits/rejected": -0.5322235822677612, "logps/chosen": -156.52874755859375, "logps/rejected": -45.51343536376953, "loss": 0.3483, "rewards/accuracies": 1.0, "rewards/chosen": 5.340785503387451, "rewards/margins": 3.0524981021881104, "rewards/rejected": 2.288287401199341, "step": 10122 }, { "epoch": 1.64, "learning_rate": 2.7485689636894593e-07, "logits/chosen": -0.8713518977165222, "logits/rejected": -0.868281900882721, "logps/chosen": -150.0001983642578, "logps/rejected": -125.27278137207031, "loss": 0.2635, "rewards/accuracies": 1.0, "rewards/chosen": 4.629228115081787, "rewards/margins": 0.5159358978271484, "rewards/rejected": 4.113292217254639, "step": 10123 }, { "epoch": 1.64, "learning_rate": 2.747395564787437e-07, "logits/chosen": -0.8871857523918152, "logits/rejected": -0.8561105132102966, "logps/chosen": -84.56268310546875, "logps/rejected": -45.07993698120117, "loss": 0.366, "rewards/accuracies": 1.0, "rewards/chosen": 1.0346252918243408, "rewards/margins": 0.42114681005477905, "rewards/rejected": 0.6134784817695618, "step": 10124 }, { "epoch": 1.64, "learning_rate": 2.74622232151907e-07, "logits/chosen": -1.0750561952590942, "logits/rejected": -0.9487744569778442, "logps/chosen": -286.5059814453125, "logps/rejected": -33.702938079833984, "loss": 0.9274, "rewards/accuracies": 1.0, "rewards/chosen": 2.5356690883636475, "rewards/margins": 2.4005963802337646, "rewards/rejected": 0.1350727081298828, "step": 10125 }, { "epoch": 1.64, "learning_rate": 2.745049233965412e-07, "logits/chosen": -0.5811947584152222, "logits/rejected": -0.5193881988525391, "logps/chosen": -37.74585723876953, "logps/rejected": -30.70049285888672, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 2.7807929515838623, "rewards/margins": 1.4807963371276855, "rewards/rejected": 1.2999966144561768, "step": 10126 }, { "epoch": 1.64, "learning_rate": 2.7438763022075186e-07, "logits/chosen": -0.23369814455509186, "logits/rejected": -0.275613009929657, "logps/chosen": -85.61456298828125, "logps/rejected": -59.98493957519531, "loss": 0.4593, "rewards/accuracies": 0.0, "rewards/chosen": 2.222320556640625, "rewards/margins": -0.40074920654296875, "rewards/rejected": 2.6230697631835938, "step": 10127 }, { "epoch": 1.64, "learning_rate": 2.742703526326422e-07, "logits/chosen": -0.32717105746269226, "logits/rejected": -0.4178050458431244, "logps/chosen": -73.01707458496094, "logps/rejected": -136.72618103027344, "loss": 0.4585, "rewards/accuracies": 0.0, "rewards/chosen": 1.4780982732772827, "rewards/margins": -0.3192405700683594, "rewards/rejected": 1.797338843345642, "step": 10128 }, { "epoch": 1.64, "learning_rate": 2.7415309064031555e-07, "logits/chosen": -0.6443029642105103, "logits/rejected": -0.7211819291114807, "logps/chosen": -61.94049072265625, "logps/rejected": -116.4136734008789, "loss": 1.9593, "rewards/accuracies": 0.0, "rewards/chosen": 0.39563751220703125, "rewards/margins": -0.5207328796386719, "rewards/rejected": 0.9163703918457031, "step": 10129 }, { "epoch": 1.64, "learning_rate": 2.740358442518732e-07, "logits/chosen": -0.9119197130203247, "logits/rejected": -0.7979543209075928, "logps/chosen": -140.6605224609375, "logps/rejected": -84.3647689819336, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 7.9331512451171875, "rewards/margins": 5.294178009033203, "rewards/rejected": 2.6389732360839844, "step": 10130 }, { "epoch": 1.64, "learning_rate": 2.73918613475416e-07, "logits/chosen": -0.7568002939224243, "logits/rejected": -0.8094290494918823, "logps/chosen": -101.54894256591797, "logps/rejected": -111.67315673828125, "loss": 0.3748, "rewards/accuracies": 0.0, "rewards/chosen": 2.7053306102752686, "rewards/margins": -0.06726932525634766, "rewards/rejected": 2.772599935531616, "step": 10131 }, { "epoch": 1.64, "learning_rate": 2.738013983190433e-07, "logits/chosen": -0.8010818362236023, "logits/rejected": -0.9072080254554749, "logps/chosen": -292.91668701171875, "logps/rejected": -100.35513305664062, "loss": 2.5113, "rewards/accuracies": 0.0, "rewards/chosen": 0.9892303347587585, "rewards/margins": -3.9923036098480225, "rewards/rejected": 4.981534004211426, "step": 10132 }, { "epoch": 1.64, "learning_rate": 2.736841987908538e-07, "logits/chosen": -0.5593623518943787, "logits/rejected": -0.6083534359931946, "logps/chosen": -105.23927307128906, "logps/rejected": -89.43389129638672, "loss": 0.4637, "rewards/accuracies": 0.0, "rewards/chosen": 1.5142875909805298, "rewards/margins": -0.11247479915618896, "rewards/rejected": 1.6267623901367188, "step": 10133 }, { "epoch": 1.64, "learning_rate": 2.7356701489894463e-07, "logits/chosen": -0.9758291840553284, "logits/rejected": -0.9595231413841248, "logps/chosen": -60.37498092651367, "logps/rejected": -70.8609848022461, "loss": 0.2661, "rewards/accuracies": 1.0, "rewards/chosen": 0.6659297943115234, "rewards/margins": 0.8509517908096313, "rewards/rejected": -0.18502198159694672, "step": 10134 }, { "epoch": 1.65, "learning_rate": 2.7344984665141236e-07, "logits/chosen": -0.4646076560020447, "logits/rejected": -0.24589505791664124, "logps/chosen": -63.3812255859375, "logps/rejected": -67.5634536743164, "loss": 0.2255, "rewards/accuracies": 1.0, "rewards/chosen": 4.738076686859131, "rewards/margins": 2.3884847164154053, "rewards/rejected": 2.3495919704437256, "step": 10135 }, { "epoch": 1.65, "learning_rate": 2.7333269405635184e-07, "logits/chosen": -0.7474161386489868, "logits/rejected": -0.8001914024353027, "logps/chosen": -186.19786071777344, "logps/rejected": -93.31077575683594, "loss": 0.167, "rewards/accuracies": 1.0, "rewards/chosen": 5.228660583496094, "rewards/margins": 1.1535062789916992, "rewards/rejected": 4.0751543045043945, "step": 10136 }, { "epoch": 1.65, "learning_rate": 2.7321555712185764e-07, "logits/chosen": -0.6861923933029175, "logits/rejected": -0.32671695947647095, "logps/chosen": -157.79135131835938, "logps/rejected": -54.76195526123047, "loss": 0.0233, "rewards/accuracies": 1.0, "rewards/chosen": 5.161709785461426, "rewards/margins": 3.9531869888305664, "rewards/rejected": 1.2085227966308594, "step": 10137 }, { "epoch": 1.65, "learning_rate": 2.730984358560223e-07, "logits/chosen": -0.678798496723175, "logits/rejected": -0.6721671223640442, "logps/chosen": -3.0800914764404297, "logps/rejected": -3.818115234375, "loss": 0.9469, "rewards/accuracies": 0.0, "rewards/chosen": 0.2835972309112549, "rewards/margins": -0.1351887285709381, "rewards/rejected": 0.418785959482193, "step": 10138 }, { "epoch": 1.65, "learning_rate": 2.7298133026693836e-07, "logits/chosen": -0.9297281503677368, "logits/rejected": -0.9297281503677368, "logps/chosen": -87.02828216552734, "logps/rejected": -87.02828216552734, "loss": 0.3653, "rewards/accuracies": 0.0, "rewards/chosen": 1.7699142694473267, "rewards/margins": 0.0, "rewards/rejected": 1.7699142694473267, "step": 10139 }, { "epoch": 1.65, "learning_rate": 2.728642403626962e-07, "logits/chosen": -0.5391898155212402, "logits/rejected": -0.5406084656715393, "logps/chosen": -14.239838600158691, "logps/rejected": -17.70288848876953, "loss": 0.8102, "rewards/accuracies": 1.0, "rewards/chosen": 0.5502114295959473, "rewards/margins": 0.027369022369384766, "rewards/rejected": 0.5228424072265625, "step": 10140 }, { "epoch": 1.65, "learning_rate": 2.727471661513861e-07, "logits/chosen": -0.8302410244941711, "logits/rejected": -0.837571918964386, "logps/chosen": -42.30466079711914, "logps/rejected": -28.76033592224121, "loss": 0.5518, "rewards/accuracies": 0.0, "rewards/chosen": 0.1986290067434311, "rewards/margins": -0.5197267532348633, "rewards/rejected": 0.7183557748794556, "step": 10141 }, { "epoch": 1.65, "learning_rate": 2.726301076410963e-07, "logits/chosen": -0.960439920425415, "logits/rejected": -0.9189531207084656, "logps/chosen": -104.79373931884766, "logps/rejected": -90.34523010253906, "loss": 0.8466, "rewards/accuracies": 0.0, "rewards/chosen": 0.9454193115234375, "rewards/margins": -0.8252273797988892, "rewards/rejected": 1.7706466913223267, "step": 10142 }, { "epoch": 1.65, "learning_rate": 2.725130648399149e-07, "logits/chosen": -0.9446470141410828, "logits/rejected": -0.8923746347427368, "logps/chosen": -56.82105255126953, "logps/rejected": -90.26619720458984, "loss": 1.1551, "rewards/accuracies": 0.0, "rewards/chosen": 0.9483291506767273, "rewards/margins": -2.171765089035034, "rewards/rejected": 3.1200942993164062, "step": 10143 }, { "epoch": 1.65, "learning_rate": 2.7239603775592794e-07, "logits/chosen": -0.40411487221717834, "logits/rejected": -0.45639586448669434, "logps/chosen": -27.42976951599121, "logps/rejected": -75.37159729003906, "loss": 0.8317, "rewards/accuracies": 0.0, "rewards/chosen": 0.2042352706193924, "rewards/margins": -1.441937804222107, "rewards/rejected": 1.646173119544983, "step": 10144 }, { "epoch": 1.65, "learning_rate": 2.7227902639722146e-07, "logits/chosen": -0.5529569387435913, "logits/rejected": -0.41918954253196716, "logps/chosen": -103.55818939208984, "logps/rejected": -115.81538391113281, "loss": 0.4686, "rewards/accuracies": 1.0, "rewards/chosen": 4.3385748863220215, "rewards/margins": 2.5604209899902344, "rewards/rejected": 1.7781540155410767, "step": 10145 }, { "epoch": 1.65, "learning_rate": 2.721620307718793e-07, "logits/chosen": -0.6063422560691833, "logits/rejected": -0.5168023705482483, "logps/chosen": -78.9915771484375, "logps/rejected": -50.40596008300781, "loss": 0.7937, "rewards/accuracies": 0.0, "rewards/chosen": 1.3767731189727783, "rewards/margins": -1.301882266998291, "rewards/rejected": 2.6786553859710693, "step": 10146 }, { "epoch": 1.65, "learning_rate": 2.7204505088798513e-07, "logits/chosen": -1.2156143188476562, "logits/rejected": -1.2207828760147095, "logps/chosen": -79.08076477050781, "logps/rejected": -80.35659790039062, "loss": 1.6732, "rewards/accuracies": 0.0, "rewards/chosen": 4.1959733963012695, "rewards/margins": -1.9707379341125488, "rewards/rejected": 6.166711330413818, "step": 10147 }, { "epoch": 1.65, "learning_rate": 2.719280867536209e-07, "logits/chosen": -0.5912520289421082, "logits/rejected": -0.2538447678089142, "logps/chosen": -88.72978973388672, "logps/rejected": -24.362712860107422, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 4.511185646057129, "rewards/margins": 4.096062660217285, "rewards/rejected": 0.41512298583984375, "step": 10148 }, { "epoch": 1.65, "learning_rate": 2.71811138376868e-07, "logits/chosen": -0.6555066704750061, "logits/rejected": -0.6532639265060425, "logps/chosen": -18.894676208496094, "logps/rejected": -21.30097198486328, "loss": 1.0206, "rewards/accuracies": 1.0, "rewards/chosen": 0.3241147994995117, "rewards/margins": 0.2150518298149109, "rewards/rejected": 0.10906296223402023, "step": 10149 }, { "epoch": 1.65, "learning_rate": 2.7169420576580606e-07, "logits/chosen": -0.7157053351402283, "logits/rejected": -0.7896471619606018, "logps/chosen": -35.214141845703125, "logps/rejected": -102.38922119140625, "loss": 0.9913, "rewards/accuracies": 0.0, "rewards/chosen": 1.5241283178329468, "rewards/margins": -1.6579183340072632, "rewards/rejected": 3.18204665184021, "step": 10150 }, { "epoch": 1.65, "learning_rate": 2.715772889285143e-07, "logits/chosen": -0.22104643285274506, "logits/rejected": -0.22104643285274506, "logps/chosen": -61.82306671142578, "logps/rejected": -61.82306671142578, "loss": 0.6404, "rewards/accuracies": 0.0, "rewards/chosen": 0.7070282101631165, "rewards/margins": 0.0, "rewards/rejected": 0.7070282101631165, "step": 10151 }, { "epoch": 1.65, "learning_rate": 2.714603878730707e-07, "logits/chosen": -0.8577569723129272, "logits/rejected": -0.7745679020881653, "logps/chosen": -115.85456848144531, "logps/rejected": -114.46937561035156, "loss": 0.1445, "rewards/accuracies": 1.0, "rewards/chosen": 4.016478061676025, "rewards/margins": 1.2015197277069092, "rewards/rejected": 2.814958333969116, "step": 10152 }, { "epoch": 1.65, "learning_rate": 2.713435026075517e-07, "logits/chosen": -0.38560327887535095, "logits/rejected": -0.5272747278213501, "logps/chosen": -49.71348190307617, "logps/rejected": -121.00423431396484, "loss": 1.2371, "rewards/accuracies": 0.0, "rewards/chosen": 2.082193374633789, "rewards/margins": -2.2474493980407715, "rewards/rejected": 4.3296427726745605, "step": 10153 }, { "epoch": 1.65, "learning_rate": 2.7122663314003317e-07, "logits/chosen": -0.27754807472229004, "logits/rejected": -0.27754807472229004, "logps/chosen": -4.190112113952637, "logps/rejected": -4.190112113952637, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.047607421875, "rewards/margins": 0.0, "rewards/rejected": 0.047607421875, "step": 10154 }, { "epoch": 1.65, "learning_rate": 2.711097794785895e-07, "logits/chosen": -1.0229326486587524, "logits/rejected": -1.1418014764785767, "logps/chosen": -300.665283203125, "logps/rejected": -95.38018798828125, "loss": 0.0704, "rewards/accuracies": 1.0, "rewards/chosen": 6.640472412109375, "rewards/margins": 2.069357395172119, "rewards/rejected": 4.571115016937256, "step": 10155 }, { "epoch": 1.65, "learning_rate": 2.709929416312945e-07, "logits/chosen": -0.8703039884567261, "logits/rejected": -0.8027427196502686, "logps/chosen": -44.67990493774414, "logps/rejected": -70.71559143066406, "loss": 1.8887, "rewards/accuracies": 0.0, "rewards/chosen": 2.03306245803833, "rewards/margins": -1.063629388809204, "rewards/rejected": 3.096691846847534, "step": 10156 }, { "epoch": 1.65, "learning_rate": 2.7087611960622016e-07, "logits/chosen": -0.7948709726333618, "logits/rejected": -0.7745448350906372, "logps/chosen": -71.42041015625, "logps/rejected": -86.88041687011719, "loss": 1.2726, "rewards/accuracies": 0.0, "rewards/chosen": 1.1613320112228394, "rewards/margins": -0.038608551025390625, "rewards/rejected": 1.19994056224823, "step": 10157 }, { "epoch": 1.65, "learning_rate": 2.707593134114381e-07, "logits/chosen": -0.8069672584533691, "logits/rejected": -0.7515546083450317, "logps/chosen": -63.39028549194336, "logps/rejected": -15.887704849243164, "loss": 0.1917, "rewards/accuracies": 1.0, "rewards/chosen": 2.068504810333252, "rewards/margins": 1.083308458328247, "rewards/rejected": 0.9851962924003601, "step": 10158 }, { "epoch": 1.65, "learning_rate": 2.706425230550182e-07, "logits/chosen": -0.6605104207992554, "logits/rejected": -0.6729257106781006, "logps/chosen": -75.75006103515625, "logps/rejected": -73.06379699707031, "loss": 0.5582, "rewards/accuracies": 1.0, "rewards/chosen": 3.8099937438964844, "rewards/margins": 1.5867125988006592, "rewards/rejected": 2.223281145095825, "step": 10159 }, { "epoch": 1.65, "learning_rate": 2.7052574854503e-07, "logits/chosen": -0.7701497077941895, "logits/rejected": -0.7006215453147888, "logps/chosen": -59.78981399536133, "logps/rejected": -32.94895553588867, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": 2.9377949237823486, "rewards/margins": 1.4921932220458984, "rewards/rejected": 1.4456017017364502, "step": 10160 }, { "epoch": 1.65, "learning_rate": 2.70408989889541e-07, "logits/chosen": -0.7324351072311401, "logits/rejected": -0.5929225087165833, "logps/chosen": -117.7539291381836, "logps/rejected": -47.00249099731445, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": 5.733240604400635, "rewards/margins": 4.268394947052002, "rewards/rejected": 1.4648456573486328, "step": 10161 }, { "epoch": 1.65, "learning_rate": 2.702922470966187e-07, "logits/chosen": -1.0400623083114624, "logits/rejected": -0.9259775280952454, "logps/chosen": -96.2339096069336, "logps/rejected": -26.495689392089844, "loss": 0.0397, "rewards/accuracies": 1.0, "rewards/chosen": 5.0244622230529785, "rewards/margins": 5.012388706207275, "rewards/rejected": 0.012073516845703125, "step": 10162 }, { "epoch": 1.65, "learning_rate": 2.7017552017432833e-07, "logits/chosen": -0.5197526216506958, "logits/rejected": -0.48332199454307556, "logps/chosen": -163.62049865722656, "logps/rejected": -47.6697998046875, "loss": 0.9719, "rewards/accuracies": 0.0, "rewards/chosen": 0.44631195068359375, "rewards/margins": -1.7280900478363037, "rewards/rejected": 2.1744019985198975, "step": 10163 }, { "epoch": 1.65, "learning_rate": 2.700588091307351e-07, "logits/chosen": -0.5372700095176697, "logits/rejected": -0.5230785012245178, "logps/chosen": -112.36072540283203, "logps/rejected": -131.46234130859375, "loss": 1.1444, "rewards/accuracies": 0.0, "rewards/chosen": 1.4020683765411377, "rewards/margins": -0.3033698797225952, "rewards/rejected": 1.705438256263733, "step": 10164 }, { "epoch": 1.65, "learning_rate": 2.6994211397390227e-07, "logits/chosen": -0.422037273645401, "logits/rejected": -0.422037273645401, "logps/chosen": -54.130496978759766, "logps/rejected": -54.130496978759766, "loss": 1.0521, "rewards/accuracies": 0.0, "rewards/chosen": 1.454813838005066, "rewards/margins": 0.0, "rewards/rejected": 1.454813838005066, "step": 10165 }, { "epoch": 1.65, "learning_rate": 2.698254347118927e-07, "logits/chosen": -0.6516960263252258, "logits/rejected": -0.46682479977607727, "logps/chosen": -67.76480102539062, "logps/rejected": -17.342103958129883, "loss": 0.0703, "rewards/accuracies": 1.0, "rewards/chosen": 2.166249990463257, "rewards/margins": 2.0200908184051514, "rewards/rejected": 0.14615917205810547, "step": 10166 }, { "epoch": 1.65, "learning_rate": 2.697087713527675e-07, "logits/chosen": -0.09908273071050644, "logits/rejected": -0.06896702945232391, "logps/chosen": -25.55452537536621, "logps/rejected": -1.8830153942108154, "loss": 0.9949, "rewards/accuracies": 0.0, "rewards/chosen": 0.07268410176038742, "rewards/margins": -0.18012848496437073, "rewards/rejected": 0.25281259417533875, "step": 10167 }, { "epoch": 1.65, "learning_rate": 2.695921239045873e-07, "logits/chosen": -0.8589690327644348, "logits/rejected": -0.7542620301246643, "logps/chosen": -67.63929748535156, "logps/rejected": -41.378543853759766, "loss": 0.2637, "rewards/accuracies": 1.0, "rewards/chosen": 1.9535293579101562, "rewards/margins": 0.5231448411941528, "rewards/rejected": 1.4303845167160034, "step": 10168 }, { "epoch": 1.65, "learning_rate": 2.6947549237541104e-07, "logits/chosen": -0.8816568851470947, "logits/rejected": -0.7444692850112915, "logps/chosen": -177.83595275878906, "logps/rejected": -74.99665832519531, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": 4.926680088043213, "rewards/margins": 2.2717902660369873, "rewards/rejected": 2.6548898220062256, "step": 10169 }, { "epoch": 1.65, "learning_rate": 2.693588767732972e-07, "logits/chosen": -0.6542054414749146, "logits/rejected": -0.6771960258483887, "logps/chosen": -87.87578582763672, "logps/rejected": -134.88360595703125, "loss": 1.085, "rewards/accuracies": 0.0, "rewards/chosen": 4.4056220054626465, "rewards/margins": -1.8973808288574219, "rewards/rejected": 6.303002834320068, "step": 10170 }, { "epoch": 1.65, "learning_rate": 2.692422771063024e-07, "logits/chosen": -1.00946044921875, "logits/rejected": -0.9082357287406921, "logps/chosen": -78.86480712890625, "logps/rejected": -209.407470703125, "loss": 2.4843, "rewards/accuracies": 0.0, "rewards/chosen": 2.1978530883789062, "rewards/margins": -4.872990608215332, "rewards/rejected": 7.070843696594238, "step": 10171 }, { "epoch": 1.65, "learning_rate": 2.6912569338248315e-07, "logits/chosen": -0.8026159405708313, "logits/rejected": -0.7889193892478943, "logps/chosen": -60.55654525756836, "logps/rejected": -65.01264953613281, "loss": 0.4976, "rewards/accuracies": 0.0, "rewards/chosen": 1.64032781124115, "rewards/margins": -0.48812997341156006, "rewards/rejected": 2.12845778465271, "step": 10172 }, { "epoch": 1.65, "learning_rate": 2.690091256098936e-07, "logits/chosen": -0.9945127367973328, "logits/rejected": -0.9795619249343872, "logps/chosen": -144.03216552734375, "logps/rejected": -139.84707641601562, "loss": 0.2367, "rewards/accuracies": 1.0, "rewards/chosen": 7.503045558929443, "rewards/margins": 1.8869781494140625, "rewards/rejected": 5.616067409515381, "step": 10173 }, { "epoch": 1.65, "learning_rate": 2.6889257379658804e-07, "logits/chosen": -0.9109145402908325, "logits/rejected": -0.8668203949928284, "logps/chosen": -185.59344482421875, "logps/rejected": -76.94637298583984, "loss": 0.324, "rewards/accuracies": 1.0, "rewards/chosen": 3.667820692062378, "rewards/margins": 0.10328197479248047, "rewards/rejected": 3.5645387172698975, "step": 10174 }, { "epoch": 1.65, "learning_rate": 2.6877603795061864e-07, "logits/chosen": -0.41752779483795166, "logits/rejected": -0.43771669268608093, "logps/chosen": -42.55889129638672, "logps/rejected": -110.35569763183594, "loss": 0.6943, "rewards/accuracies": 1.0, "rewards/chosen": 1.3855503797531128, "rewards/margins": 0.4258045554161072, "rewards/rejected": 0.9597458243370056, "step": 10175 }, { "epoch": 1.65, "learning_rate": 2.686595180800374e-07, "logits/chosen": -0.4964492619037628, "logits/rejected": -0.4964492619037628, "logps/chosen": -82.79888153076172, "logps/rejected": -82.79888153076172, "loss": 0.3651, "rewards/accuracies": 0.0, "rewards/chosen": 3.1738052368164062, "rewards/margins": 0.0, "rewards/rejected": 3.1738052368164062, "step": 10176 }, { "epoch": 1.65, "learning_rate": 2.6854301419289425e-07, "logits/chosen": -0.24196434020996094, "logits/rejected": -0.2419639676809311, "logps/chosen": -5.494526386260986, "logps/rejected": -3.635573148727417, "loss": 0.7305, "rewards/accuracies": 0.0, "rewards/chosen": 0.039023589342832565, "rewards/margins": -0.14416925609111786, "rewards/rejected": 0.18319284915924072, "step": 10177 }, { "epoch": 1.65, "learning_rate": 2.6842652629723903e-07, "logits/chosen": -0.3336709141731262, "logits/rejected": -0.35965225100517273, "logps/chosen": -19.729660034179688, "logps/rejected": -36.917266845703125, "loss": 0.4036, "rewards/accuracies": 0.0, "rewards/chosen": 0.23274727165699005, "rewards/margins": -0.1603652983903885, "rewards/rejected": 0.39311257004737854, "step": 10178 }, { "epoch": 1.65, "learning_rate": 2.683100544011194e-07, "logits/chosen": -0.5601359605789185, "logits/rejected": -0.4351831376552582, "logps/chosen": -25.365253448486328, "logps/rejected": -38.420902252197266, "loss": 1.0875, "rewards/accuracies": 1.0, "rewards/chosen": 1.1204899549484253, "rewards/margins": 0.9517834186553955, "rewards/rejected": 0.1687065213918686, "step": 10179 }, { "epoch": 1.65, "learning_rate": 2.68193598512583e-07, "logits/chosen": -0.5061179995536804, "logits/rejected": -0.4193693995475769, "logps/chosen": -44.892005920410156, "logps/rejected": -64.59941864013672, "loss": 0.417, "rewards/accuracies": 0.0, "rewards/chosen": 2.169537305831909, "rewards/margins": -0.17592859268188477, "rewards/rejected": 2.345465898513794, "step": 10180 }, { "epoch": 1.65, "learning_rate": 2.6807715863967533e-07, "logits/chosen": -0.9056602120399475, "logits/rejected": -1.1861097812652588, "logps/chosen": -175.425537109375, "logps/rejected": -35.65110778808594, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 2.416247606277466, "rewards/margins": 2.203993558883667, "rewards/rejected": 0.21225395798683167, "step": 10181 }, { "epoch": 1.65, "learning_rate": 2.679607347904417e-07, "logits/chosen": -0.45660698413848877, "logits/rejected": -0.48117774724960327, "logps/chosen": -90.26895141601562, "logps/rejected": -46.8792724609375, "loss": 2.0263, "rewards/accuracies": 0.0, "rewards/chosen": 0.8084297180175781, "rewards/margins": -0.7895389795303345, "rewards/rejected": 1.5979686975479126, "step": 10182 }, { "epoch": 1.65, "learning_rate": 2.6784432697292557e-07, "logits/chosen": -0.71041339635849, "logits/rejected": -0.5529623627662659, "logps/chosen": -138.85621643066406, "logps/rejected": -85.92242431640625, "loss": 1.027, "rewards/accuracies": 1.0, "rewards/chosen": 3.6402924060821533, "rewards/margins": 0.13889384269714355, "rewards/rejected": 3.5013985633850098, "step": 10183 }, { "epoch": 1.65, "learning_rate": 2.6772793519517e-07, "logits/chosen": -0.9825393557548523, "logits/rejected": -1.1277544498443604, "logps/chosen": -140.24168395996094, "logps/rejected": -99.87977600097656, "loss": 2.2743, "rewards/accuracies": 0.0, "rewards/chosen": 4.392849922180176, "rewards/margins": -4.528175354003906, "rewards/rejected": 8.921025276184082, "step": 10184 }, { "epoch": 1.65, "learning_rate": 2.6761155946521605e-07, "logits/chosen": -0.33024919033050537, "logits/rejected": -0.3258554935455322, "logps/chosen": -5.407783031463623, "logps/rejected": -0.7694510221481323, "loss": 0.4058, "rewards/accuracies": 0.0, "rewards/chosen": -0.033643532544374466, "rewards/margins": -0.17930994927883148, "rewards/rejected": 0.14566642045974731, "step": 10185 }, { "epoch": 1.65, "learning_rate": 2.674951997911045e-07, "logits/chosen": -0.2568809688091278, "logits/rejected": -0.3044736385345459, "logps/chosen": -66.26029968261719, "logps/rejected": -80.00448608398438, "loss": 0.9258, "rewards/accuracies": 0.0, "rewards/chosen": 1.4910942316055298, "rewards/margins": -1.6443687677383423, "rewards/rejected": 3.135462999343872, "step": 10186 }, { "epoch": 1.65, "learning_rate": 2.6737885618087487e-07, "logits/chosen": -0.8632381558418274, "logits/rejected": -0.8234317302703857, "logps/chosen": -202.75897216796875, "logps/rejected": -279.3769836425781, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 2.096670627593994, "rewards/margins": 0.43438732624053955, "rewards/rejected": 1.6622833013534546, "step": 10187 }, { "epoch": 1.65, "learning_rate": 2.67262528642565e-07, "logits/chosen": -0.8741428256034851, "logits/rejected": -0.9019361734390259, "logps/chosen": -83.5517807006836, "logps/rejected": -136.61929321289062, "loss": 2.8449, "rewards/accuracies": 0.0, "rewards/chosen": 1.9600266218185425, "rewards/margins": -5.124488830566406, "rewards/rejected": 7.084515571594238, "step": 10188 }, { "epoch": 1.65, "learning_rate": 2.6714621718421235e-07, "logits/chosen": -0.8416425585746765, "logits/rejected": -0.859172523021698, "logps/chosen": -74.30179595947266, "logps/rejected": -98.46112823486328, "loss": 0.8673, "rewards/accuracies": 1.0, "rewards/chosen": 2.3176629543304443, "rewards/margins": 1.2041391134262085, "rewards/rejected": 1.1135238409042358, "step": 10189 }, { "epoch": 1.65, "learning_rate": 2.670299218138527e-07, "logits/chosen": -0.7355677485466003, "logits/rejected": -0.7488787770271301, "logps/chosen": -4.282478332519531, "logps/rejected": -12.711974143981934, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 0.33135485649108887, "rewards/margins": 0.07537558674812317, "rewards/rejected": 0.2559792697429657, "step": 10190 }, { "epoch": 1.65, "learning_rate": 2.669136425395212e-07, "logits/chosen": -0.6039106845855713, "logits/rejected": -0.6065571308135986, "logps/chosen": -7.76993465423584, "logps/rejected": -3.1865909099578857, "loss": 0.8755, "rewards/accuracies": 0.0, "rewards/chosen": 0.35920363664627075, "rewards/margins": -0.1733066439628601, "rewards/rejected": 0.5325102806091309, "step": 10191 }, { "epoch": 1.65, "learning_rate": 2.667973793692514e-07, "logits/chosen": -0.7106623649597168, "logits/rejected": -0.6613603830337524, "logps/chosen": -87.09346771240234, "logps/rejected": -41.83799743652344, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 1.6793968677520752, "rewards/margins": 0.02297818660736084, "rewards/rejected": 1.6564186811447144, "step": 10192 }, { "epoch": 1.65, "learning_rate": 2.6668113231107627e-07, "logits/chosen": -1.0128588676452637, "logits/rejected": -0.9504086375236511, "logps/chosen": -109.09217834472656, "logps/rejected": -109.54627990722656, "loss": 0.9729, "rewards/accuracies": 0.0, "rewards/chosen": 0.43890380859375, "rewards/margins": -1.7831039428710938, "rewards/rejected": 2.2220077514648438, "step": 10193 }, { "epoch": 1.65, "learning_rate": 2.66564901373027e-07, "logits/chosen": -0.6044124960899353, "logits/rejected": -0.6184656620025635, "logps/chosen": -46.45096206665039, "logps/rejected": -68.2727279663086, "loss": 0.4696, "rewards/accuracies": 1.0, "rewards/chosen": 1.9251164197921753, "rewards/margins": 0.2103710174560547, "rewards/rejected": 1.7147454023361206, "step": 10194 }, { "epoch": 1.65, "learning_rate": 2.664486865631344e-07, "logits/chosen": -0.2590022385120392, "logits/rejected": -0.26526904106140137, "logps/chosen": -10.17292594909668, "logps/rejected": -7.437428951263428, "loss": 0.8046, "rewards/accuracies": 0.0, "rewards/chosen": -0.06354475021362305, "rewards/margins": -0.10900631546974182, "rewards/rejected": 0.045461561530828476, "step": 10195 }, { "epoch": 1.65, "learning_rate": 2.6633248788942753e-07, "logits/chosen": -0.7456978559494019, "logits/rejected": -0.6479195356369019, "logps/chosen": -63.33902359008789, "logps/rejected": -76.0997085571289, "loss": 0.757, "rewards/accuracies": 1.0, "rewards/chosen": 2.2136332988739014, "rewards/margins": 0.3204134702682495, "rewards/rejected": 1.8932198286056519, "step": 10196 }, { "epoch": 1.66, "learning_rate": 2.66216305359935e-07, "logits/chosen": -0.5246107578277588, "logits/rejected": -0.5149020552635193, "logps/chosen": -93.23210144042969, "logps/rejected": -71.42210388183594, "loss": 0.5006, "rewards/accuracies": 0.0, "rewards/chosen": 1.2834014892578125, "rewards/margins": -0.5113632678985596, "rewards/rejected": 1.794764757156372, "step": 10197 }, { "epoch": 1.66, "learning_rate": 2.661001389826835e-07, "logits/chosen": -0.4405042827129364, "logits/rejected": -0.4409511089324951, "logps/chosen": -124.96540832519531, "logps/rejected": -212.1856689453125, "loss": 0.2262, "rewards/accuracies": 1.0, "rewards/chosen": 0.9226409792900085, "rewards/margins": 0.6109054088592529, "rewards/rejected": 0.3117355406284332, "step": 10198 }, { "epoch": 1.66, "learning_rate": 2.6598398876569927e-07, "logits/chosen": -0.7427788376808167, "logits/rejected": -0.7805575132369995, "logps/chosen": -97.50399780273438, "logps/rejected": -59.45713806152344, "loss": 0.1408, "rewards/accuracies": 1.0, "rewards/chosen": 3.3299272060394287, "rewards/margins": 1.2235679626464844, "rewards/rejected": 2.1063592433929443, "step": 10199 }, { "epoch": 1.66, "learning_rate": 2.6586785471700707e-07, "logits/chosen": -0.38637664914131165, "logits/rejected": -0.390610933303833, "logps/chosen": -28.586626052856445, "logps/rejected": -21.517845153808594, "loss": 1.1277, "rewards/accuracies": 0.0, "rewards/chosen": -0.42662620544433594, "rewards/margins": -0.4929584562778473, "rewards/rejected": 0.06633224338293076, "step": 10200 }, { "epoch": 1.66, "learning_rate": 2.657517368446309e-07, "logits/chosen": -0.7043724656105042, "logits/rejected": -0.6671952605247498, "logps/chosen": -55.564273834228516, "logps/rejected": -74.59825134277344, "loss": 0.673, "rewards/accuracies": 0.0, "rewards/chosen": 0.9282436370849609, "rewards/margins": -0.08340346813201904, "rewards/rejected": 1.01164710521698, "step": 10201 }, { "epoch": 1.66, "learning_rate": 2.6563563515659303e-07, "logits/chosen": -0.726513147354126, "logits/rejected": -0.6453307271003723, "logps/chosen": -56.99275207519531, "logps/rejected": -56.43011474609375, "loss": 0.2746, "rewards/accuracies": 1.0, "rewards/chosen": 2.7419304847717285, "rewards/margins": 1.1405388116836548, "rewards/rejected": 1.6013916730880737, "step": 10202 }, { "epoch": 1.66, "learning_rate": 2.6551954966091537e-07, "logits/chosen": -0.958617091178894, "logits/rejected": -1.0157732963562012, "logps/chosen": -110.47914123535156, "logps/rejected": -99.01178741455078, "loss": 0.5984, "rewards/accuracies": 0.0, "rewards/chosen": 0.8863189816474915, "rewards/margins": -0.5967712998390198, "rewards/rejected": 1.4830902814865112, "step": 10203 }, { "epoch": 1.66, "learning_rate": 2.65403480365618e-07, "logits/chosen": -0.6688043475151062, "logits/rejected": -0.6695462465286255, "logps/chosen": -10.218193054199219, "logps/rejected": -2.697446346282959, "loss": 0.4847, "rewards/accuracies": 0.0, "rewards/chosen": 0.1723606139421463, "rewards/margins": -0.34995365142822266, "rewards/rejected": 0.5223142504692078, "step": 10204 }, { "epoch": 1.66, "learning_rate": 2.6528742727872053e-07, "logits/chosen": -0.4567720890045166, "logits/rejected": -0.4625665247440338, "logps/chosen": -9.487238883972168, "logps/rejected": -1.6815106868743896, "loss": 2.8522, "rewards/accuracies": 0.0, "rewards/chosen": 0.0807652473449707, "rewards/margins": -0.18400737643241882, "rewards/rejected": 0.2647726237773895, "step": 10205 }, { "epoch": 1.66, "learning_rate": 2.6517139040824076e-07, "logits/chosen": -0.8357824683189392, "logits/rejected": -0.8134499788284302, "logps/chosen": -72.71476745605469, "logps/rejected": -177.8365936279297, "loss": 0.4537, "rewards/accuracies": 1.0, "rewards/chosen": 1.1636489629745483, "rewards/margins": 0.07590329647064209, "rewards/rejected": 1.0877456665039062, "step": 10206 }, { "epoch": 1.66, "learning_rate": 2.6505536976219625e-07, "logits/chosen": -1.023582935333252, "logits/rejected": -0.9845877289772034, "logps/chosen": -266.5425109863281, "logps/rejected": -82.22378540039062, "loss": 0.7895, "rewards/accuracies": 0.0, "rewards/chosen": 1.0516480207443237, "rewards/margins": -1.2713736295700073, "rewards/rejected": 2.323021650314331, "step": 10207 }, { "epoch": 1.66, "learning_rate": 2.649393653486023e-07, "logits/chosen": -0.7810379862785339, "logits/rejected": -0.8298137187957764, "logps/chosen": -150.13226318359375, "logps/rejected": -116.50186157226562, "loss": 0.102, "rewards/accuracies": 1.0, "rewards/chosen": 3.4238617420196533, "rewards/margins": 2.200793504714966, "rewards/rejected": 1.2230682373046875, "step": 10208 }, { "epoch": 1.66, "learning_rate": 2.6482337717547424e-07, "logits/chosen": -0.6363374590873718, "logits/rejected": -0.5350568890571594, "logps/chosen": -335.30865478515625, "logps/rejected": -42.89208984375, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": 5.993768215179443, "rewards/margins": 3.924060344696045, "rewards/rejected": 2.0697078704833984, "step": 10209 }, { "epoch": 1.66, "learning_rate": 2.647074052508254e-07, "logits/chosen": -0.8839719891548157, "logits/rejected": -0.8543018102645874, "logps/chosen": -110.93446350097656, "logps/rejected": -154.14901733398438, "loss": 0.9913, "rewards/accuracies": 0.0, "rewards/chosen": 0.991162121295929, "rewards/margins": -0.14656370878219604, "rewards/rejected": 1.137725830078125, "step": 10210 }, { "epoch": 1.66, "learning_rate": 2.6459144958266865e-07, "logits/chosen": -0.7704341411590576, "logits/rejected": -0.7704341411590576, "logps/chosen": -75.07862091064453, "logps/rejected": -75.07862091064453, "loss": 0.7437, "rewards/accuracies": 0.0, "rewards/chosen": 2.7527573108673096, "rewards/margins": 0.0, "rewards/rejected": 2.7527573108673096, "step": 10211 }, { "epoch": 1.66, "learning_rate": 2.6447551017901513e-07, "logits/chosen": -0.7704340219497681, "logits/rejected": -0.7155478596687317, "logps/chosen": -73.15028381347656, "logps/rejected": -81.21894836425781, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 1.1678314208984375, "rewards/margins": 0.24209362268447876, "rewards/rejected": 0.9257377982139587, "step": 10212 }, { "epoch": 1.66, "learning_rate": 2.643595870478755e-07, "logits/chosen": -0.7971278429031372, "logits/rejected": -0.6970310211181641, "logps/chosen": -53.52285385131836, "logps/rejected": -65.64837646484375, "loss": 2.1524, "rewards/accuracies": 0.0, "rewards/chosen": 1.82770574092865, "rewards/margins": -1.1682568788528442, "rewards/rejected": 2.995962619781494, "step": 10213 }, { "epoch": 1.66, "learning_rate": 2.642436801972587e-07, "logits/chosen": -1.0695326328277588, "logits/rejected": -1.1812132596969604, "logps/chosen": -188.4736328125, "logps/rejected": -75.30078125, "loss": 0.6901, "rewards/accuracies": 0.0, "rewards/chosen": 3.435436964035034, "rewards/margins": -1.0292212963104248, "rewards/rejected": 4.464658260345459, "step": 10214 }, { "epoch": 1.66, "learning_rate": 2.641277896351728e-07, "logits/chosen": -0.7991268038749695, "logits/rejected": -0.810228705406189, "logps/chosen": -69.83802795410156, "logps/rejected": -61.046966552734375, "loss": 0.739, "rewards/accuracies": 1.0, "rewards/chosen": 1.66417396068573, "rewards/margins": 0.2730591297149658, "rewards/rejected": 1.3911148309707642, "step": 10215 }, { "epoch": 1.66, "learning_rate": 2.6401191536962485e-07, "logits/chosen": -0.31654855608940125, "logits/rejected": -0.31654855608940125, "logps/chosen": -23.152267456054688, "logps/rejected": -23.152267456054688, "loss": 0.6067, "rewards/accuracies": 0.0, "rewards/chosen": 1.325626015663147, "rewards/margins": 0.0, "rewards/rejected": 1.325626015663147, "step": 10216 }, { "epoch": 1.66, "learning_rate": 2.638960574086204e-07, "logits/chosen": -0.5570371150970459, "logits/rejected": -0.5603278875350952, "logps/chosen": -74.9068603515625, "logps/rejected": -67.47786712646484, "loss": 1.7166, "rewards/accuracies": 0.0, "rewards/chosen": 1.3490616083145142, "rewards/margins": -0.621392011642456, "rewards/rejected": 1.9704536199569702, "step": 10217 }, { "epoch": 1.66, "learning_rate": 2.637802157601646e-07, "logits/chosen": -0.7299370765686035, "logits/rejected": -0.7390912175178528, "logps/chosen": -85.4635238647461, "logps/rejected": -72.48957824707031, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 2.424481153488159, "rewards/margins": 0.49445033073425293, "rewards/rejected": 1.9300308227539062, "step": 10218 }, { "epoch": 1.66, "learning_rate": 2.6366439043226056e-07, "logits/chosen": -0.8952597379684448, "logits/rejected": -0.8445628881454468, "logps/chosen": -143.1901397705078, "logps/rejected": -93.01586151123047, "loss": 0.6469, "rewards/accuracies": 0.0, "rewards/chosen": 1.5222519636154175, "rewards/margins": -0.7565056085586548, "rewards/rejected": 2.2787575721740723, "step": 10219 }, { "epoch": 1.66, "learning_rate": 2.6354858143291117e-07, "logits/chosen": -0.4396156966686249, "logits/rejected": -0.4799242317676544, "logps/chosen": -54.765960693359375, "logps/rejected": -76.1467514038086, "loss": 1.1797, "rewards/accuracies": 1.0, "rewards/chosen": 1.3826416730880737, "rewards/margins": 0.7940728068351746, "rewards/rejected": 0.5885688662528992, "step": 10220 }, { "epoch": 1.66, "learning_rate": 2.6343278877011714e-07, "logits/chosen": -0.543548583984375, "logits/rejected": -0.4179314076900482, "logps/chosen": -45.08740997314453, "logps/rejected": -52.22681427001953, "loss": 1.9023, "rewards/accuracies": 1.0, "rewards/chosen": 1.6096954345703125, "rewards/margins": 0.8686752319335938, "rewards/rejected": 0.7410202026367188, "step": 10221 }, { "epoch": 1.66, "learning_rate": 2.6331701245187934e-07, "logits/chosen": -0.9740518927574158, "logits/rejected": -0.853754997253418, "logps/chosen": -87.18051147460938, "logps/rejected": -53.195987701416016, "loss": 0.5541, "rewards/accuracies": 1.0, "rewards/chosen": 3.986081838607788, "rewards/margins": 2.075509548187256, "rewards/rejected": 1.9105724096298218, "step": 10222 }, { "epoch": 1.66, "learning_rate": 2.6320125248619613e-07, "logits/chosen": -0.40691572427749634, "logits/rejected": -0.4023870825767517, "logps/chosen": -51.386993408203125, "logps/rejected": -49.69719696044922, "loss": 0.6546, "rewards/accuracies": 0.0, "rewards/chosen": 1.6010299921035767, "rewards/margins": -0.5043357610702515, "rewards/rejected": 2.105365753173828, "step": 10223 }, { "epoch": 1.66, "learning_rate": 2.63085508881066e-07, "logits/chosen": -0.6484122276306152, "logits/rejected": -0.6474138498306274, "logps/chosen": -94.32103729248047, "logps/rejected": -49.44486999511719, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.36210939288139343, "rewards/margins": -1.097373127937317, "rewards/rejected": 1.4594825506210327, "step": 10224 }, { "epoch": 1.66, "learning_rate": 2.629697816444854e-07, "logits/chosen": -0.9485388994216919, "logits/rejected": -0.8548435568809509, "logps/chosen": -68.1161880493164, "logps/rejected": -85.99751281738281, "loss": 0.2412, "rewards/accuracies": 1.0, "rewards/chosen": 2.9235236644744873, "rewards/margins": 0.5687148571014404, "rewards/rejected": 2.354808807373047, "step": 10225 }, { "epoch": 1.66, "learning_rate": 2.628540707844501e-07, "logits/chosen": -0.893460750579834, "logits/rejected": -0.8248693346977234, "logps/chosen": -66.01535034179688, "logps/rejected": -82.7825927734375, "loss": 0.5104, "rewards/accuracies": 1.0, "rewards/chosen": 2.8815231323242188, "rewards/margins": 0.24041748046875, "rewards/rejected": 2.6411056518554688, "step": 10226 }, { "epoch": 1.66, "learning_rate": 2.6273837630895455e-07, "logits/chosen": -0.5691624283790588, "logits/rejected": -0.5714490413665771, "logps/chosen": -4.1726179122924805, "logps/rejected": -9.95915699005127, "loss": 0.4083, "rewards/accuracies": 1.0, "rewards/chosen": 0.2744240462779999, "rewards/margins": 0.23371827602386475, "rewards/rejected": 0.04070577770471573, "step": 10227 }, { "epoch": 1.66, "learning_rate": 2.6262269822599236e-07, "logits/chosen": -0.9142068028450012, "logits/rejected": -0.9811438918113708, "logps/chosen": -73.58433532714844, "logps/rejected": -147.27784729003906, "loss": 1.536, "rewards/accuracies": 0.0, "rewards/chosen": 1.5047928094863892, "rewards/margins": -2.8599700927734375, "rewards/rejected": 4.364762783050537, "step": 10228 }, { "epoch": 1.66, "learning_rate": 2.625070365435554e-07, "logits/chosen": -0.8433011174201965, "logits/rejected": -0.7632610201835632, "logps/chosen": -42.08190155029297, "logps/rejected": -38.4240837097168, "loss": 0.2429, "rewards/accuracies": 1.0, "rewards/chosen": 1.9011276960372925, "rewards/margins": 1.0311203002929688, "rewards/rejected": 0.870007336139679, "step": 10229 }, { "epoch": 1.66, "learning_rate": 2.623913912696354e-07, "logits/chosen": -0.7775477170944214, "logits/rejected": -0.6893659830093384, "logps/chosen": -88.7837142944336, "logps/rejected": -62.72145080566406, "loss": 0.4867, "rewards/accuracies": 0.0, "rewards/chosen": 0.8488708734512329, "rewards/margins": -0.3938026428222656, "rewards/rejected": 1.2426735162734985, "step": 10230 }, { "epoch": 1.66, "learning_rate": 2.622757624122216e-07, "logits/chosen": -0.805253267288208, "logits/rejected": -0.6488720178604126, "logps/chosen": -111.88778686523438, "logps/rejected": -52.64921569824219, "loss": 0.0204, "rewards/accuracies": 1.0, "rewards/chosen": 5.058923244476318, "rewards/margins": 3.47324800491333, "rewards/rejected": 1.5856751203536987, "step": 10231 }, { "epoch": 1.66, "learning_rate": 2.6216014997930357e-07, "logits/chosen": -0.8937689065933228, "logits/rejected": -0.8937689065933228, "logps/chosen": -153.63150024414062, "logps/rejected": -153.63150024414062, "loss": 0.7017, "rewards/accuracies": 0.0, "rewards/chosen": 3.2374024391174316, "rewards/margins": 0.0, "rewards/rejected": 3.2374024391174316, "step": 10232 }, { "epoch": 1.66, "learning_rate": 2.6204455397886847e-07, "logits/chosen": -1.0287874937057495, "logits/rejected": -0.9871547818183899, "logps/chosen": -77.2638168334961, "logps/rejected": -101.36628723144531, "loss": 0.8507, "rewards/accuracies": 0.0, "rewards/chosen": 0.9340553283691406, "rewards/margins": -1.3830163478851318, "rewards/rejected": 2.3170716762542725, "step": 10233 }, { "epoch": 1.66, "learning_rate": 2.619289744189033e-07, "logits/chosen": -0.5504552721977234, "logits/rejected": -0.5504552721977234, "logps/chosen": -97.88325500488281, "logps/rejected": -97.88325500488281, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 1.8327759504318237, "rewards/margins": 0.0, "rewards/rejected": 1.8327759504318237, "step": 10234 }, { "epoch": 1.66, "learning_rate": 2.618134113073932e-07, "logits/chosen": -0.6357059478759766, "logits/rejected": -0.6472097635269165, "logps/chosen": -33.32441711425781, "logps/rejected": -35.19655990600586, "loss": 0.5173, "rewards/accuracies": 0.0, "rewards/chosen": 2.3486077785491943, "rewards/margins": -0.21950292587280273, "rewards/rejected": 2.568110704421997, "step": 10235 }, { "epoch": 1.66, "learning_rate": 2.616978646523228e-07, "logits/chosen": -0.8715402483940125, "logits/rejected": -0.9586293697357178, "logps/chosen": -195.64413452148438, "logps/rejected": -68.27619934082031, "loss": 0.1668, "rewards/accuracies": 1.0, "rewards/chosen": 4.08831787109375, "rewards/margins": 1.6487936973571777, "rewards/rejected": 2.4395241737365723, "step": 10236 }, { "epoch": 1.66, "learning_rate": 2.61582334461675e-07, "logits/chosen": -0.5645550489425659, "logits/rejected": -0.533812403678894, "logps/chosen": -62.982200622558594, "logps/rejected": -103.12886047363281, "loss": 1.3801, "rewards/accuracies": 0.0, "rewards/chosen": 1.1199439764022827, "rewards/margins": -0.7707161903381348, "rewards/rejected": 1.8906601667404175, "step": 10237 }, { "epoch": 1.66, "learning_rate": 2.6146682074343206e-07, "logits/chosen": -0.4939057230949402, "logits/rejected": -0.3939894437789917, "logps/chosen": -47.982391357421875, "logps/rejected": -80.5451431274414, "loss": 0.2055, "rewards/accuracies": 1.0, "rewards/chosen": 2.3795242309570312, "rewards/margins": 0.9689048528671265, "rewards/rejected": 1.4106193780899048, "step": 10238 }, { "epoch": 1.66, "learning_rate": 2.6135132350557467e-07, "logits/chosen": -0.5860005021095276, "logits/rejected": -0.5860005021095276, "logps/chosen": -26.236637115478516, "logps/rejected": -26.236637115478516, "loss": 0.3492, "rewards/accuracies": 0.0, "rewards/chosen": 1.279404878616333, "rewards/margins": 0.0, "rewards/rejected": 1.279404878616333, "step": 10239 }, { "epoch": 1.66, "learning_rate": 2.612358427560828e-07, "logits/chosen": -0.6055164933204651, "logits/rejected": -0.6685433983802795, "logps/chosen": -72.1881332397461, "logps/rejected": -48.845497131347656, "loss": 0.9315, "rewards/accuracies": 0.0, "rewards/chosen": 1.4595940113067627, "rewards/margins": -1.0813775062561035, "rewards/rejected": 2.540971517562866, "step": 10240 }, { "epoch": 1.66, "learning_rate": 2.611203785029349e-07, "logits/chosen": -0.4327477514743805, "logits/rejected": -0.3316425085067749, "logps/chosen": -88.60205078125, "logps/rejected": -18.730941772460938, "loss": 0.7451, "rewards/accuracies": 1.0, "rewards/chosen": 0.5191581845283508, "rewards/margins": 0.19981977343559265, "rewards/rejected": 0.3193384110927582, "step": 10241 }, { "epoch": 1.66, "learning_rate": 2.6100493075410847e-07, "logits/chosen": -0.914563000202179, "logits/rejected": -0.4872511923313141, "logps/chosen": -75.22994995117188, "logps/rejected": -215.58322143554688, "loss": 0.7129, "rewards/accuracies": 1.0, "rewards/chosen": 2.7879836559295654, "rewards/margins": 1.339401125907898, "rewards/rejected": 1.4485825300216675, "step": 10242 }, { "epoch": 1.66, "learning_rate": 2.6088949951758016e-07, "logits/chosen": -0.934974730014801, "logits/rejected": -0.9942561388015747, "logps/chosen": -145.85252380371094, "logps/rejected": -118.16667175292969, "loss": 1.1852, "rewards/accuracies": 0.0, "rewards/chosen": 4.841920375823975, "rewards/margins": -2.236337661743164, "rewards/rejected": 7.078258037567139, "step": 10243 }, { "epoch": 1.66, "learning_rate": 2.6077408480132474e-07, "logits/chosen": -0.6418991088867188, "logits/rejected": -0.6362393498420715, "logps/chosen": -24.33266830444336, "logps/rejected": -34.71723175048828, "loss": 0.9858, "rewards/accuracies": 0.0, "rewards/chosen": 0.2050636261701584, "rewards/margins": -0.07361070811748505, "rewards/rejected": 0.27867433428764343, "step": 10244 }, { "epoch": 1.66, "learning_rate": 2.606586866133167e-07, "logits/chosen": -0.6954663395881653, "logits/rejected": -0.5442718863487244, "logps/chosen": -121.91131591796875, "logps/rejected": -41.99415588378906, "loss": 0.3338, "rewards/accuracies": 1.0, "rewards/chosen": 1.8262299299240112, "rewards/margins": 0.4202018976211548, "rewards/rejected": 1.4060280323028564, "step": 10245 }, { "epoch": 1.66, "learning_rate": 2.6054330496152856e-07, "logits/chosen": -0.49845194816589355, "logits/rejected": -0.5621037483215332, "logps/chosen": -135.25521850585938, "logps/rejected": -115.51670837402344, "loss": 1.672, "rewards/accuracies": 0.0, "rewards/chosen": 5.455299377441406, "rewards/margins": -1.952378749847412, "rewards/rejected": 7.407678127288818, "step": 10246 }, { "epoch": 1.66, "learning_rate": 2.604279398539324e-07, "logits/chosen": -0.7576419115066528, "logits/rejected": -0.7461486458778381, "logps/chosen": -74.01132202148438, "logps/rejected": -82.24885559082031, "loss": 1.0236, "rewards/accuracies": 0.0, "rewards/chosen": 1.2994736433029175, "rewards/margins": -0.6227874755859375, "rewards/rejected": 1.922261118888855, "step": 10247 }, { "epoch": 1.66, "learning_rate": 2.6031259129849865e-07, "logits/chosen": -0.8474605679512024, "logits/rejected": -0.8442406058311462, "logps/chosen": -80.56280517578125, "logps/rejected": -82.07185363769531, "loss": 1.0835, "rewards/accuracies": 0.0, "rewards/chosen": 0.8801490664482117, "rewards/margins": -0.38511890172958374, "rewards/rejected": 1.2652679681777954, "step": 10248 }, { "epoch": 1.66, "learning_rate": 2.60197259303197e-07, "logits/chosen": -0.8797311782836914, "logits/rejected": -0.8716239333152771, "logps/chosen": -47.1947021484375, "logps/rejected": -64.18921661376953, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": 2.722991943359375, "rewards/margins": 0.021775007247924805, "rewards/rejected": 2.70121693611145, "step": 10249 }, { "epoch": 1.66, "learning_rate": 2.600819438759956e-07, "logits/chosen": -0.7988908886909485, "logits/rejected": -0.744138777256012, "logps/chosen": -58.78421401977539, "logps/rejected": -17.974544525146484, "loss": 0.099, "rewards/accuracies": 1.0, "rewards/chosen": 1.8312695026397705, "rewards/margins": 1.5230337381362915, "rewards/rejected": 0.3082357347011566, "step": 10250 }, { "epoch": 1.66, "learning_rate": 2.5996664502486187e-07, "logits/chosen": -0.3284781277179718, "logits/rejected": -0.31811001896858215, "logps/chosen": -4.5054521560668945, "logps/rejected": -19.809040069580078, "loss": 0.5466, "rewards/accuracies": 1.0, "rewards/chosen": 0.5134841203689575, "rewards/margins": 0.20655623078346252, "rewards/rejected": 0.306927889585495, "step": 10251 }, { "epoch": 1.66, "learning_rate": 2.5985136275776166e-07, "logits/chosen": -0.5336397290229797, "logits/rejected": -0.48290789127349854, "logps/chosen": -62.24495315551758, "logps/rejected": -64.12063598632812, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 2.7361340522766113, "rewards/margins": 0.9492154121398926, "rewards/rejected": 1.7869186401367188, "step": 10252 }, { "epoch": 1.66, "learning_rate": 2.597360970826601e-07, "logits/chosen": -1.0269187688827515, "logits/rejected": -0.846179187297821, "logps/chosen": -162.71824645996094, "logps/rejected": -131.95407104492188, "loss": 0.4392, "rewards/accuracies": 1.0, "rewards/chosen": 5.959602355957031, "rewards/margins": 4.6996355056762695, "rewards/rejected": 1.2599670886993408, "step": 10253 }, { "epoch": 1.66, "learning_rate": 2.596208480075206e-07, "logits/chosen": -0.5930889844894409, "logits/rejected": -0.5930889844894409, "logps/chosen": -67.84410095214844, "logps/rejected": -67.84410095214844, "loss": 0.4854, "rewards/accuracies": 0.0, "rewards/chosen": 0.8111114501953125, "rewards/margins": 0.0, "rewards/rejected": 0.8111114501953125, "step": 10254 }, { "epoch": 1.66, "learning_rate": 2.595056155403063e-07, "logits/chosen": -0.8389250040054321, "logits/rejected": -0.7214400768280029, "logps/chosen": -108.97464752197266, "logps/rejected": -28.173574447631836, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 0.4133186340332031, "rewards/margins": 0.4691869616508484, "rewards/rejected": -0.05586833879351616, "step": 10255 }, { "epoch": 1.66, "learning_rate": 2.5939039968897815e-07, "logits/chosen": -0.27792853116989136, "logits/rejected": -0.27792853116989136, "logps/chosen": -74.1357650756836, "logps/rejected": -74.1357650756836, "loss": 0.5334, "rewards/accuracies": 0.0, "rewards/chosen": 0.7378196716308594, "rewards/margins": 0.0, "rewards/rejected": 0.7378196716308594, "step": 10256 }, { "epoch": 1.66, "learning_rate": 2.592752004614969e-07, "logits/chosen": -1.0898768901824951, "logits/rejected": -1.0625134706497192, "logps/chosen": -36.93811798095703, "logps/rejected": -107.9954605102539, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 2.7575440406799316, "rewards/margins": 1.2983086109161377, "rewards/rejected": 1.459235429763794, "step": 10257 }, { "epoch": 1.66, "learning_rate": 2.591600178658213e-07, "logits/chosen": -0.44599971175193787, "logits/rejected": -0.44599971175193787, "logps/chosen": -14.90097427368164, "logps/rejected": -14.90097427368164, "loss": 1.1051, "rewards/accuracies": 0.0, "rewards/chosen": 0.902845025062561, "rewards/margins": 0.0, "rewards/rejected": 0.902845025062561, "step": 10258 }, { "epoch": 1.67, "learning_rate": 2.590448519099099e-07, "logits/chosen": -0.5735769867897034, "logits/rejected": -0.5457203388214111, "logps/chosen": -57.05284118652344, "logps/rejected": -78.42738342285156, "loss": 1.1873, "rewards/accuracies": 1.0, "rewards/chosen": 1.827684760093689, "rewards/margins": 0.016191840171813965, "rewards/rejected": 1.811492919921875, "step": 10259 }, { "epoch": 1.67, "learning_rate": 2.58929702601719e-07, "logits/chosen": -0.900936484336853, "logits/rejected": -1.005421757698059, "logps/chosen": -108.60599517822266, "logps/rejected": -94.96346282958984, "loss": 1.7343, "rewards/accuracies": 0.0, "rewards/chosen": 1.0629494190216064, "rewards/margins": -3.357109308242798, "rewards/rejected": 4.420058727264404, "step": 10260 }, { "epoch": 1.67, "learning_rate": 2.5881456994920484e-07, "logits/chosen": -0.7087976336479187, "logits/rejected": -0.7194294333457947, "logps/chosen": -61.6556396484375, "logps/rejected": -91.07135009765625, "loss": 0.6408, "rewards/accuracies": 1.0, "rewards/chosen": 2.2413177490234375, "rewards/margins": 0.09816193580627441, "rewards/rejected": 2.143155813217163, "step": 10261 }, { "epoch": 1.67, "learning_rate": 2.586994539603217e-07, "logits/chosen": -0.2595311105251312, "logits/rejected": -0.28554466366767883, "logps/chosen": -30.473190307617188, "logps/rejected": -65.23970031738281, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 1.5799381732940674, "rewards/margins": -0.46265077590942383, "rewards/rejected": 2.042588949203491, "step": 10262 }, { "epoch": 1.67, "learning_rate": 2.5858435464302315e-07, "logits/chosen": -0.7571995854377747, "logits/rejected": -0.6001016497612, "logps/chosen": -99.52581787109375, "logps/rejected": -9.882735252380371, "loss": 0.1684, "rewards/accuracies": 1.0, "rewards/chosen": 6.062613010406494, "rewards/margins": 5.312355995178223, "rewards/rejected": 0.7502568364143372, "step": 10263 }, { "epoch": 1.67, "learning_rate": 2.5846927200526126e-07, "logits/chosen": -0.3908330798149109, "logits/rejected": -0.4152490198612213, "logps/chosen": -53.40827941894531, "logps/rejected": -89.2340087890625, "loss": 0.8556, "rewards/accuracies": 1.0, "rewards/chosen": 1.4540451765060425, "rewards/margins": 1.225659966468811, "rewards/rejected": 0.22838516533374786, "step": 10264 }, { "epoch": 1.67, "learning_rate": 2.5835420605498746e-07, "logits/chosen": -0.8401390314102173, "logits/rejected": -0.7527723908424377, "logps/chosen": -89.08058166503906, "logps/rejected": -99.9581298828125, "loss": 3.1231, "rewards/accuracies": 0.0, "rewards/chosen": 0.9140838980674744, "rewards/margins": -5.168328762054443, "rewards/rejected": 6.0824127197265625, "step": 10265 }, { "epoch": 1.67, "learning_rate": 2.5823915680015134e-07, "logits/chosen": -0.8760949373245239, "logits/rejected": -0.9762905836105347, "logps/chosen": -295.03887939453125, "logps/rejected": -99.38131713867188, "loss": 0.1426, "rewards/accuracies": 1.0, "rewards/chosen": 4.045581340789795, "rewards/margins": 1.5793490409851074, "rewards/rejected": 2.4662322998046875, "step": 10266 }, { "epoch": 1.67, "learning_rate": 2.581241242487021e-07, "logits/chosen": -0.9061471819877625, "logits/rejected": -0.8483335971832275, "logps/chosen": -112.06785583496094, "logps/rejected": -271.46990966796875, "loss": 0.9362, "rewards/accuracies": 0.0, "rewards/chosen": 2.7581100463867188, "rewards/margins": -1.5683550834655762, "rewards/rejected": 4.326465129852295, "step": 10267 }, { "epoch": 1.67, "learning_rate": 2.5800910840858716e-07, "logits/chosen": -0.9220409989356995, "logits/rejected": -0.7286848425865173, "logps/chosen": -101.88640594482422, "logps/rejected": -127.36512756347656, "loss": 0.1811, "rewards/accuracies": 1.0, "rewards/chosen": 4.310029029846191, "rewards/margins": 0.9521265029907227, "rewards/rejected": 3.3579025268554688, "step": 10268 }, { "epoch": 1.67, "learning_rate": 2.5789410928775315e-07, "logits/chosen": -0.7862778306007385, "logits/rejected": -0.7483320236206055, "logps/chosen": -63.86031723022461, "logps/rejected": -57.84899139404297, "loss": 0.8714, "rewards/accuracies": 0.0, "rewards/chosen": 2.3049418926239014, "rewards/margins": -0.2783024311065674, "rewards/rejected": 2.5832443237304688, "step": 10269 }, { "epoch": 1.67, "learning_rate": 2.577791268941452e-07, "logits/chosen": -0.693374752998352, "logits/rejected": -0.7094277143478394, "logps/chosen": -91.54653930664062, "logps/rejected": -58.664825439453125, "loss": 0.6109, "rewards/accuracies": 0.0, "rewards/chosen": 1.322113037109375, "rewards/margins": -0.4542732238769531, "rewards/rejected": 1.7763862609863281, "step": 10270 }, { "epoch": 1.67, "learning_rate": 2.576641612357079e-07, "logits/chosen": -0.7097880244255066, "logits/rejected": -0.7097880244255066, "logps/chosen": -55.567466735839844, "logps/rejected": -55.567466735839844, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 2.206716299057007, "rewards/margins": 0.0, "rewards/rejected": 2.206716299057007, "step": 10271 }, { "epoch": 1.67, "learning_rate": 2.5754921232038383e-07, "logits/chosen": -0.40685757994651794, "logits/rejected": -0.4518459439277649, "logps/chosen": -38.10493850708008, "logps/rejected": -69.8281478881836, "loss": 0.5804, "rewards/accuracies": 1.0, "rewards/chosen": 2.0491256713867188, "rewards/margins": 0.5169898271560669, "rewards/rejected": 1.5321358442306519, "step": 10272 }, { "epoch": 1.67, "learning_rate": 2.574342801561153e-07, "logits/chosen": -0.7256985902786255, "logits/rejected": -0.5926128029823303, "logps/chosen": -114.74441528320312, "logps/rejected": -74.00269317626953, "loss": 0.4389, "rewards/accuracies": 1.0, "rewards/chosen": 4.856088161468506, "rewards/margins": 2.346423864364624, "rewards/rejected": 2.509664297103882, "step": 10273 }, { "epoch": 1.67, "learning_rate": 2.573193647508426e-07, "logits/chosen": -0.7389131188392639, "logits/rejected": -0.7546294927597046, "logps/chosen": -63.73820495605469, "logps/rejected": -158.8035430908203, "loss": 1.1188, "rewards/accuracies": 1.0, "rewards/chosen": 1.0958222150802612, "rewards/margins": 0.20523077249526978, "rewards/rejected": 0.8905914425849915, "step": 10274 }, { "epoch": 1.67, "learning_rate": 2.572044661125058e-07, "logits/chosen": -0.6453919410705566, "logits/rejected": -0.5839279294013977, "logps/chosen": -56.280982971191406, "logps/rejected": -83.0735092163086, "loss": 2.9509, "rewards/accuracies": 1.0, "rewards/chosen": 2.7262299060821533, "rewards/margins": 0.37165307998657227, "rewards/rejected": 2.354576826095581, "step": 10275 }, { "epoch": 1.67, "learning_rate": 2.5708958424904283e-07, "logits/chosen": -0.8581269979476929, "logits/rejected": -0.8229002356529236, "logps/chosen": -64.55729675292969, "logps/rejected": -31.749412536621094, "loss": 0.1063, "rewards/accuracies": 1.0, "rewards/chosen": 3.017604112625122, "rewards/margins": 2.6062352657318115, "rewards/rejected": 0.4113689363002777, "step": 10276 }, { "epoch": 1.67, "learning_rate": 2.569747191683913e-07, "logits/chosen": -0.9179515242576599, "logits/rejected": -0.8930215239524841, "logps/chosen": -110.57390594482422, "logps/rejected": -99.28605651855469, "loss": 0.8277, "rewards/accuracies": 0.0, "rewards/chosen": 2.186211347579956, "rewards/margins": -1.193039894104004, "rewards/rejected": 3.37925124168396, "step": 10277 }, { "epoch": 1.67, "learning_rate": 2.568598708784869e-07, "logits/chosen": -0.6989117860794067, "logits/rejected": -0.6570730209350586, "logps/chosen": -35.16398620605469, "logps/rejected": -76.59557342529297, "loss": 1.4664, "rewards/accuracies": 1.0, "rewards/chosen": 2.1128957271575928, "rewards/margins": 0.826622486114502, "rewards/rejected": 1.2862732410430908, "step": 10278 }, { "epoch": 1.67, "learning_rate": 2.5674503938726486e-07, "logits/chosen": -0.5602716207504272, "logits/rejected": -0.6333548426628113, "logps/chosen": -90.96234130859375, "logps/rejected": -143.09625244140625, "loss": 1.8624, "rewards/accuracies": 0.0, "rewards/chosen": 1.5687233209609985, "rewards/margins": -3.0063729286193848, "rewards/rejected": 4.575096130371094, "step": 10279 }, { "epoch": 1.67, "learning_rate": 2.5663022470265915e-07, "logits/chosen": -0.5467813014984131, "logits/rejected": -0.5124152302742004, "logps/chosen": -61.890480041503906, "logps/rejected": -70.9046630859375, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.2084206342697144, "rewards/margins": 0.006946563720703125, "rewards/rejected": 1.2014740705490112, "step": 10280 }, { "epoch": 1.67, "learning_rate": 2.565154268326019e-07, "logits/chosen": -0.9287398457527161, "logits/rejected": -0.8754309415817261, "logps/chosen": -99.19941711425781, "logps/rejected": -88.60488891601562, "loss": 0.5883, "rewards/accuracies": 1.0, "rewards/chosen": 1.8519623279571533, "rewards/margins": 0.4059104919433594, "rewards/rejected": 1.446051836013794, "step": 10281 }, { "epoch": 1.67, "learning_rate": 2.5640064578502495e-07, "logits/chosen": -0.7341958284378052, "logits/rejected": -0.7341958284378052, "logps/chosen": -123.23229217529297, "logps/rejected": -123.23229217529297, "loss": 1.0575, "rewards/accuracies": 0.0, "rewards/chosen": 2.3039498329162598, "rewards/margins": 0.0, "rewards/rejected": 2.3039498329162598, "step": 10282 }, { "epoch": 1.67, "learning_rate": 2.562858815678583e-07, "logits/chosen": -0.7837460041046143, "logits/rejected": -0.7994586825370789, "logps/chosen": -74.90919494628906, "logps/rejected": -49.42143249511719, "loss": 0.4795, "rewards/accuracies": 0.0, "rewards/chosen": 1.668036699295044, "rewards/margins": -0.38178014755249023, "rewards/rejected": 2.049816846847534, "step": 10283 }, { "epoch": 1.67, "learning_rate": 2.5617113418903137e-07, "logits/chosen": -0.8128105998039246, "logits/rejected": -0.7955902218818665, "logps/chosen": -74.53370666503906, "logps/rejected": -136.12081909179688, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 1.1592133045196533, "rewards/margins": 0.9109436273574829, "rewards/rejected": 0.24826966226100922, "step": 10284 }, { "epoch": 1.67, "learning_rate": 2.560564036564716e-07, "logits/chosen": -0.2241644561290741, "logits/rejected": -0.10290223360061646, "logps/chosen": -34.87233352661133, "logps/rejected": -5.30352258682251, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 1.1978648900985718, "rewards/margins": 0.6532019972801208, "rewards/rejected": 0.5446628928184509, "step": 10285 }, { "epoch": 1.67, "learning_rate": 2.559416899781065e-07, "logits/chosen": -0.9440475702285767, "logits/rejected": -0.7040731906890869, "logps/chosen": -180.43199157714844, "logps/rejected": -104.95924377441406, "loss": 0.1132, "rewards/accuracies": 1.0, "rewards/chosen": 4.471744060516357, "rewards/margins": 1.7915332317352295, "rewards/rejected": 2.680210828781128, "step": 10286 }, { "epoch": 1.67, "learning_rate": 2.5582699316186104e-07, "logits/chosen": -0.6949462890625, "logits/rejected": -0.5327593684196472, "logps/chosen": -77.86347961425781, "logps/rejected": -58.16588592529297, "loss": 0.1239, "rewards/accuracies": 1.0, "rewards/chosen": 3.1024482250213623, "rewards/margins": 1.2842589616775513, "rewards/rejected": 1.818189263343811, "step": 10287 }, { "epoch": 1.67, "learning_rate": 2.557123132156601e-07, "logits/chosen": -0.5948992371559143, "logits/rejected": -0.748887836933136, "logps/chosen": -98.95821380615234, "logps/rejected": -100.20630645751953, "loss": 2.128, "rewards/accuracies": 0.0, "rewards/chosen": 1.4784812927246094, "rewards/margins": -3.41802978515625, "rewards/rejected": 4.896511077880859, "step": 10288 }, { "epoch": 1.67, "learning_rate": 2.5559765014742676e-07, "logits/chosen": -1.0571906566619873, "logits/rejected": -0.7719091773033142, "logps/chosen": -115.91793823242188, "logps/rejected": -151.90997314453125, "loss": 0.6267, "rewards/accuracies": 1.0, "rewards/chosen": 4.8962249755859375, "rewards/margins": 5.167977809906006, "rewards/rejected": -0.2717529237270355, "step": 10289 }, { "epoch": 1.67, "learning_rate": 2.554830039650834e-07, "logits/chosen": -0.5952132344245911, "logits/rejected": -0.39219626784324646, "logps/chosen": -114.81275939941406, "logps/rejected": -80.01217651367188, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 5.3995208740234375, "rewards/margins": 3.3750686645507812, "rewards/rejected": 2.0244522094726562, "step": 10290 }, { "epoch": 1.67, "learning_rate": 2.553683746765506e-07, "logits/chosen": -0.22731290757656097, "logits/rejected": -0.22645992040634155, "logps/chosen": -1.8928488492965698, "logps/rejected": -2.869070053100586, "loss": 0.7162, "rewards/accuracies": 0.0, "rewards/chosen": 0.24291057884693146, "rewards/margins": -0.15779246389865875, "rewards/rejected": 0.4007030427455902, "step": 10291 }, { "epoch": 1.67, "learning_rate": 2.552537622897486e-07, "logits/chosen": -1.2707585096359253, "logits/rejected": -1.0835239887237549, "logps/chosen": -108.79869079589844, "logps/rejected": -95.51899719238281, "loss": 0.3676, "rewards/accuracies": 1.0, "rewards/chosen": 6.004266262054443, "rewards/margins": 0.3774137496948242, "rewards/rejected": 5.626852512359619, "step": 10292 }, { "epoch": 1.67, "learning_rate": 2.5513916681259564e-07, "logits/chosen": -0.6541051864624023, "logits/rejected": -0.5659661293029785, "logps/chosen": -100.05638122558594, "logps/rejected": -65.49577331542969, "loss": 0.3924, "rewards/accuracies": 1.0, "rewards/chosen": 1.9143425226211548, "rewards/margins": 0.3758659362792969, "rewards/rejected": 1.538476586341858, "step": 10293 }, { "epoch": 1.67, "learning_rate": 2.550245882530095e-07, "logits/chosen": -0.7807069420814514, "logits/rejected": -0.6970920562744141, "logps/chosen": -178.20379638671875, "logps/rejected": -194.60556030273438, "loss": 1.4944, "rewards/accuracies": 0.0, "rewards/chosen": 6.8104705810546875, "rewards/margins": -2.9312868118286133, "rewards/rejected": 9.7417573928833, "step": 10294 }, { "epoch": 1.67, "learning_rate": 2.549100266189062e-07, "logits/chosen": -0.8600746989250183, "logits/rejected": -1.1413671970367432, "logps/chosen": -113.20701599121094, "logps/rejected": -36.11109924316406, "loss": 0.4081, "rewards/accuracies": 1.0, "rewards/chosen": 0.7689293026924133, "rewards/margins": 0.5532501339912415, "rewards/rejected": 0.21567916870117188, "step": 10295 }, { "epoch": 1.67, "learning_rate": 2.547954819182012e-07, "logits/chosen": -0.7464025616645813, "logits/rejected": -0.7403903007507324, "logps/chosen": -52.99289321899414, "logps/rejected": -100.51641845703125, "loss": 0.6439, "rewards/accuracies": 1.0, "rewards/chosen": 2.7453320026397705, "rewards/margins": 0.16271090507507324, "rewards/rejected": 2.5826210975646973, "step": 10296 }, { "epoch": 1.67, "learning_rate": 2.5468095415880795e-07, "logits/chosen": -1.1726866960525513, "logits/rejected": -1.131108283996582, "logps/chosen": -128.21678161621094, "logps/rejected": -113.55316162109375, "loss": 1.0517, "rewards/accuracies": 0.0, "rewards/chosen": 3.6085832118988037, "rewards/margins": -1.8273236751556396, "rewards/rejected": 5.435906887054443, "step": 10297 }, { "epoch": 1.67, "learning_rate": 2.5456644334863987e-07, "logits/chosen": -0.6353945136070251, "logits/rejected": -0.5979876518249512, "logps/chosen": -48.842315673828125, "logps/rejected": -24.35297203063965, "loss": 0.61, "rewards/accuracies": 1.0, "rewards/chosen": 2.4415223598480225, "rewards/margins": 0.9956698417663574, "rewards/rejected": 1.445852518081665, "step": 10298 }, { "epoch": 1.67, "learning_rate": 2.5445194949560796e-07, "logits/chosen": -0.41867607831954956, "logits/rejected": -0.40202030539512634, "logps/chosen": -80.3411865234375, "logps/rejected": -45.06011962890625, "loss": 0.5235, "rewards/accuracies": 0.0, "rewards/chosen": 0.5833236575126648, "rewards/margins": -0.39738696813583374, "rewards/rejected": 0.9807106256484985, "step": 10299 }, { "epoch": 1.67, "learning_rate": 2.543374726076232e-07, "logits/chosen": -0.7609419226646423, "logits/rejected": -0.7495867609977722, "logps/chosen": -279.33062744140625, "logps/rejected": -154.7089080810547, "loss": 0.4054, "rewards/accuracies": 1.0, "rewards/chosen": 3.525604248046875, "rewards/margins": 3.459179639816284, "rewards/rejected": 0.06642456352710724, "step": 10300 }, { "epoch": 1.67, "learning_rate": 2.5422301269259425e-07, "logits/chosen": -0.33863896131515503, "logits/rejected": -0.35936814546585083, "logps/chosen": -14.820889472961426, "logps/rejected": -2.7192652225494385, "loss": 0.5668, "rewards/accuracies": 0.0, "rewards/chosen": -0.2561068534851074, "rewards/margins": -0.36040613055229187, "rewards/rejected": 0.10429928451776505, "step": 10301 }, { "epoch": 1.67, "learning_rate": 2.541085697584299e-07, "logits/chosen": -1.043327808380127, "logits/rejected": -1.0293437242507935, "logps/chosen": -80.86381530761719, "logps/rejected": -69.21009063720703, "loss": 0.6278, "rewards/accuracies": 0.0, "rewards/chosen": 1.6016746759414673, "rewards/margins": -0.9083939790725708, "rewards/rejected": 2.510068655014038, "step": 10302 }, { "epoch": 1.67, "learning_rate": 2.5399414381303655e-07, "logits/chosen": -1.0811365842819214, "logits/rejected": -1.0214158296585083, "logps/chosen": -92.61387634277344, "logps/rejected": -27.65406608581543, "loss": 0.1211, "rewards/accuracies": 1.0, "rewards/chosen": 1.6002991199493408, "rewards/margins": 1.5057159662246704, "rewards/rejected": 0.09458313137292862, "step": 10303 }, { "epoch": 1.67, "learning_rate": 2.5387973486432024e-07, "logits/chosen": -0.7625731825828552, "logits/rejected": -0.7630940079689026, "logps/chosen": -56.64775466918945, "logps/rejected": -94.3902816772461, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 1.4873775243759155, "rewards/margins": 0.9698550701141357, "rewards/rejected": 0.5175224542617798, "step": 10304 }, { "epoch": 1.67, "learning_rate": 2.537653429201853e-07, "logits/chosen": -0.5032694935798645, "logits/rejected": -0.43273594975471497, "logps/chosen": -50.899375915527344, "logps/rejected": -79.56736755371094, "loss": 0.2332, "rewards/accuracies": 1.0, "rewards/chosen": 1.9561805725097656, "rewards/margins": 0.6006728410720825, "rewards/rejected": 1.355507731437683, "step": 10305 }, { "epoch": 1.67, "learning_rate": 2.536509679885355e-07, "logits/chosen": -1.3055081367492676, "logits/rejected": -1.219905972480774, "logps/chosen": -141.50692749023438, "logps/rejected": -20.143075942993164, "loss": 0.0787, "rewards/accuracies": 1.0, "rewards/chosen": 6.001565456390381, "rewards/margins": 5.6383280754089355, "rewards/rejected": 0.363237589597702, "step": 10306 }, { "epoch": 1.67, "learning_rate": 2.5353661007727256e-07, "logits/chosen": -0.8141466975212097, "logits/rejected": -0.6081036329269409, "logps/chosen": -105.86058807373047, "logps/rejected": -107.37732696533203, "loss": 0.185, "rewards/accuracies": 1.0, "rewards/chosen": 3.548625946044922, "rewards/margins": 2.322369337081909, "rewards/rejected": 1.2262566089630127, "step": 10307 }, { "epoch": 1.67, "learning_rate": 2.5342226919429806e-07, "logits/chosen": -0.34916186332702637, "logits/rejected": -0.29752475023269653, "logps/chosen": -103.71023559570312, "logps/rejected": -71.37362670898438, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": 1.845710039138794, "rewards/margins": 0.4971656799316406, "rewards/rejected": 1.3485443592071533, "step": 10308 }, { "epoch": 1.67, "learning_rate": 2.5330794534751135e-07, "logits/chosen": -0.5578539967536926, "logits/rejected": -0.5253534913063049, "logps/chosen": -108.23194122314453, "logps/rejected": -61.0265007019043, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 1.1782585382461548, "rewards/margins": -0.8625484704971313, "rewards/rejected": 2.040807008743286, "step": 10309 }, { "epoch": 1.67, "learning_rate": 2.5319363854481167e-07, "logits/chosen": -0.8494890332221985, "logits/rejected": -0.8810230493545532, "logps/chosen": -107.58226776123047, "logps/rejected": -166.28553771972656, "loss": 1.3288, "rewards/accuracies": 0.0, "rewards/chosen": 2.2992165088653564, "rewards/margins": -1.2543342113494873, "rewards/rejected": 3.5535507202148438, "step": 10310 }, { "epoch": 1.67, "learning_rate": 2.5307934879409604e-07, "logits/chosen": -0.6695848703384399, "logits/rejected": -0.6311625838279724, "logps/chosen": -65.02664947509766, "logps/rejected": -81.57726287841797, "loss": 0.5573, "rewards/accuracies": 0.0, "rewards/chosen": 1.6327011585235596, "rewards/margins": -0.7108237743377686, "rewards/rejected": 2.343524932861328, "step": 10311 }, { "epoch": 1.67, "learning_rate": 2.5296507610326126e-07, "logits/chosen": -0.3929065763950348, "logits/rejected": -0.4216083288192749, "logps/chosen": -21.26153564453125, "logps/rejected": -25.104734420776367, "loss": 0.7302, "rewards/accuracies": 0.0, "rewards/chosen": 0.25263863801956177, "rewards/margins": -0.8778658509254456, "rewards/rejected": 1.1305044889450073, "step": 10312 }, { "epoch": 1.67, "learning_rate": 2.52850820480202e-07, "logits/chosen": -0.10493779927492142, "logits/rejected": -0.10493779927492142, "logps/chosen": -10.619553565979004, "logps/rejected": -10.619553565979004, "loss": 0.8262, "rewards/accuracies": 0.0, "rewards/chosen": 0.4742526113986969, "rewards/margins": 0.0, "rewards/rejected": 0.4742526113986969, "step": 10313 }, { "epoch": 1.67, "learning_rate": 2.527365819328125e-07, "logits/chosen": -0.8474990129470825, "logits/rejected": -0.7843786478042603, "logps/chosen": -87.0130615234375, "logps/rejected": -177.248779296875, "loss": 0.3469, "rewards/accuracies": 1.0, "rewards/chosen": 2.1122703552246094, "rewards/margins": 0.8521537780761719, "rewards/rejected": 1.2601165771484375, "step": 10314 }, { "epoch": 1.67, "learning_rate": 2.5262236046898576e-07, "logits/chosen": -0.5674116611480713, "logits/rejected": -0.6044583320617676, "logps/chosen": -103.64419555664062, "logps/rejected": -126.18572998046875, "loss": 1.1075, "rewards/accuracies": 0.0, "rewards/chosen": 2.8502533435821533, "rewards/margins": -1.5150468349456787, "rewards/rejected": 4.365300178527832, "step": 10315 }, { "epoch": 1.67, "learning_rate": 2.52508156096613e-07, "logits/chosen": -0.7988157868385315, "logits/rejected": -0.8009111285209656, "logps/chosen": -2.5876364707946777, "logps/rejected": -1.2599470615386963, "loss": 0.6688, "rewards/accuracies": 0.0, "rewards/chosen": 0.16786670684814453, "rewards/margins": -0.15508043766021729, "rewards/rejected": 0.3229471445083618, "step": 10316 }, { "epoch": 1.67, "learning_rate": 2.523939688235851e-07, "logits/chosen": -0.40711137652397156, "logits/rejected": -0.40711137652397156, "logps/chosen": -105.49015045166016, "logps/rejected": -105.49015045166016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.10411911457777023, "rewards/margins": 0.0, "rewards/rejected": 0.10411911457777023, "step": 10317 }, { "epoch": 1.67, "learning_rate": 2.5227979865779085e-07, "logits/chosen": -0.4315726161003113, "logits/rejected": -0.40074819326400757, "logps/chosen": -68.22530364990234, "logps/rejected": -101.38677978515625, "loss": 0.433, "rewards/accuracies": 1.0, "rewards/chosen": 2.501305341720581, "rewards/margins": 0.18794775009155273, "rewards/rejected": 2.3133575916290283, "step": 10318 }, { "epoch": 1.67, "learning_rate": 2.521656456071188e-07, "logits/chosen": -0.5717145204544067, "logits/rejected": -0.5717145204544067, "logps/chosen": -74.34394836425781, "logps/rejected": -74.34394836425781, "loss": 0.4299, "rewards/accuracies": 0.0, "rewards/chosen": 2.028477430343628, "rewards/margins": 0.0, "rewards/rejected": 2.028477430343628, "step": 10319 }, { "epoch": 1.68, "learning_rate": 2.5205150967945537e-07, "logits/chosen": -0.5277493596076965, "logits/rejected": -0.4133220911026001, "logps/chosen": -57.72777557373047, "logps/rejected": -44.588958740234375, "loss": 0.9214, "rewards/accuracies": 1.0, "rewards/chosen": 2.7682061195373535, "rewards/margins": 0.8606125116348267, "rewards/rejected": 1.9075936079025269, "step": 10320 }, { "epoch": 1.68, "learning_rate": 2.519373908826869e-07, "logits/chosen": -1.0231852531433105, "logits/rejected": -0.984596848487854, "logps/chosen": -102.38813781738281, "logps/rejected": -105.10295104980469, "loss": 0.3304, "rewards/accuracies": 1.0, "rewards/chosen": 1.9725449085235596, "rewards/margins": 0.4676560163497925, "rewards/rejected": 1.504888892173767, "step": 10321 }, { "epoch": 1.68, "learning_rate": 2.518232892246972e-07, "logits/chosen": -0.5704402923583984, "logits/rejected": -0.5465449094772339, "logps/chosen": -43.961177825927734, "logps/rejected": -72.2977294921875, "loss": 1.5579, "rewards/accuracies": 0.0, "rewards/chosen": 1.0093090534210205, "rewards/margins": -2.44105863571167, "rewards/rejected": 3.4503676891326904, "step": 10322 }, { "epoch": 1.68, "learning_rate": 2.517092047133701e-07, "logits/chosen": -0.7745253443717957, "logits/rejected": -0.7781264781951904, "logps/chosen": -102.2772216796875, "logps/rejected": -122.23858642578125, "loss": 1.0748, "rewards/accuracies": 0.0, "rewards/chosen": 1.0661407709121704, "rewards/margins": -0.2555725574493408, "rewards/rejected": 1.3217133283615112, "step": 10323 }, { "epoch": 1.68, "learning_rate": 2.515951373565873e-07, "logits/chosen": -0.5648022890090942, "logits/rejected": -0.5532820224761963, "logps/chosen": -65.24462127685547, "logps/rejected": -19.31624984741211, "loss": 0.3828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8363701105117798, "rewards/margins": 0.5222009420394897, "rewards/rejected": 0.31416913866996765, "step": 10324 }, { "epoch": 1.68, "learning_rate": 2.514810871622304e-07, "logits/chosen": -0.5193166732788086, "logits/rejected": -0.5683264136314392, "logps/chosen": -94.59339141845703, "logps/rejected": -127.96312713623047, "loss": 0.6803, "rewards/accuracies": 0.0, "rewards/chosen": 3.289206027984619, "rewards/margins": -1.0562448501586914, "rewards/rejected": 4.3454508781433105, "step": 10325 }, { "epoch": 1.68, "learning_rate": 2.513670541381787e-07, "logits/chosen": -0.2795465588569641, "logits/rejected": -0.2809433043003082, "logps/chosen": -2.340691566467285, "logps/rejected": -4.297891139984131, "loss": 0.5312, "rewards/accuracies": 1.0, "rewards/chosen": 0.18465128540992737, "rewards/margins": 0.2586434483528137, "rewards/rejected": -0.07399215549230576, "step": 10326 }, { "epoch": 1.68, "learning_rate": 2.5125303829231116e-07, "logits/chosen": -0.8896573185920715, "logits/rejected": -0.8789676427841187, "logps/chosen": -93.63197326660156, "logps/rejected": -136.41928100585938, "loss": 2.921, "rewards/accuracies": 0.0, "rewards/chosen": 1.1527801752090454, "rewards/margins": -5.688028335571289, "rewards/rejected": 6.840808391571045, "step": 10327 }, { "epoch": 1.68, "learning_rate": 2.511390396325047e-07, "logits/chosen": -0.4731253981590271, "logits/rejected": -0.4776521325111389, "logps/chosen": -56.47239685058594, "logps/rejected": -63.422035217285156, "loss": 0.8379, "rewards/accuracies": 0.0, "rewards/chosen": 1.7505645751953125, "rewards/margins": -0.13631665706634521, "rewards/rejected": 1.8868812322616577, "step": 10328 }, { "epoch": 1.68, "learning_rate": 2.5102505816663617e-07, "logits/chosen": -0.8474816083908081, "logits/rejected": -0.7156928777694702, "logps/chosen": -52.156288146972656, "logps/rejected": -59.020423889160156, "loss": 1.5041, "rewards/accuracies": 0.0, "rewards/chosen": 1.531562089920044, "rewards/margins": -0.6699569225311279, "rewards/rejected": 2.201519012451172, "step": 10329 }, { "epoch": 1.68, "learning_rate": 2.5091109390258005e-07, "logits/chosen": -0.4483754336833954, "logits/rejected": -0.3533398509025574, "logps/chosen": -80.91630554199219, "logps/rejected": -36.09512710571289, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": 2.185150146484375, "rewards/margins": 0.732537031173706, "rewards/rejected": 1.452613115310669, "step": 10330 }, { "epoch": 1.68, "learning_rate": 2.507971468482106e-07, "logits/chosen": -0.7597783803939819, "logits/rejected": -0.5120465159416199, "logps/chosen": -136.23690795898438, "logps/rejected": -26.546354293823242, "loss": 0.1248, "rewards/accuracies": 1.0, "rewards/chosen": 4.854733467102051, "rewards/margins": 4.676008701324463, "rewards/rejected": 0.17872467637062073, "step": 10331 }, { "epoch": 1.68, "learning_rate": 2.506832170114002e-07, "logits/chosen": -0.7008572816848755, "logits/rejected": -0.6837139129638672, "logps/chosen": -133.1538848876953, "logps/rejected": -66.18338012695312, "loss": 0.4964, "rewards/accuracies": 0.0, "rewards/chosen": 4.438195705413818, "rewards/margins": -0.5057282447814941, "rewards/rejected": 4.9439239501953125, "step": 10332 }, { "epoch": 1.68, "learning_rate": 2.5056930440002043e-07, "logits/chosen": -0.5187509655952454, "logits/rejected": -0.4401215612888336, "logps/chosen": -60.445098876953125, "logps/rejected": -21.194740295410156, "loss": 0.3839, "rewards/accuracies": 0.0, "rewards/chosen": 1.518652319908142, "rewards/margins": -0.07804453372955322, "rewards/rejected": 1.5966968536376953, "step": 10333 }, { "epoch": 1.68, "learning_rate": 2.504554090219418e-07, "logits/chosen": -0.33039391040802, "logits/rejected": -0.3025254011154175, "logps/chosen": -82.25262451171875, "logps/rejected": -63.004241943359375, "loss": 1.0252, "rewards/accuracies": 0.0, "rewards/chosen": 0.7387496829032898, "rewards/margins": -0.6723572611808777, "rewards/rejected": 1.4111069440841675, "step": 10334 }, { "epoch": 1.68, "learning_rate": 2.5034153088503296e-07, "logits/chosen": -0.7942051291465759, "logits/rejected": -0.749036967754364, "logps/chosen": -82.42295837402344, "logps/rejected": -32.673797607421875, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": 1.0367339849472046, "rewards/margins": 0.033721923828125, "rewards/rejected": 1.0030120611190796, "step": 10335 }, { "epoch": 1.68, "learning_rate": 2.502276699971623e-07, "logits/chosen": -0.5800819993019104, "logits/rejected": -0.460014671087265, "logps/chosen": -149.7666015625, "logps/rejected": -64.21965026855469, "loss": 0.069, "rewards/accuracies": 1.0, "rewards/chosen": 4.620709419250488, "rewards/margins": 2.042855978012085, "rewards/rejected": 2.5778534412384033, "step": 10336 }, { "epoch": 1.68, "learning_rate": 2.50113826366196e-07, "logits/chosen": -0.8177247643470764, "logits/rejected": -0.765145480632782, "logps/chosen": -31.50372314453125, "logps/rejected": -72.84831237792969, "loss": 0.5459, "rewards/accuracies": 0.0, "rewards/chosen": 1.3362702131271362, "rewards/margins": -0.04789876937866211, "rewards/rejected": 1.3841689825057983, "step": 10337 }, { "epoch": 1.68, "learning_rate": 2.500000000000001e-07, "logits/chosen": -0.42461270093917847, "logits/rejected": -0.3728611469268799, "logps/chosen": -137.7461395263672, "logps/rejected": -55.35404968261719, "loss": 0.5726, "rewards/accuracies": 0.0, "rewards/chosen": 1.4886932373046875, "rewards/margins": -0.7292373180389404, "rewards/rejected": 2.217930555343628, "step": 10338 }, { "epoch": 1.68, "learning_rate": 2.4988619090643846e-07, "logits/chosen": -0.9584502577781677, "logits/rejected": -0.8298215866088867, "logps/chosen": -85.81478881835938, "logps/rejected": -81.30715942382812, "loss": 1.2137, "rewards/accuracies": 1.0, "rewards/chosen": 2.559063673019409, "rewards/margins": 0.6964766979217529, "rewards/rejected": 1.8625869750976562, "step": 10339 }, { "epoch": 1.68, "learning_rate": 2.497723990933746e-07, "logits/chosen": -0.9624207019805908, "logits/rejected": -0.9624207019805908, "logps/chosen": -65.83909606933594, "logps/rejected": -65.83909606933594, "loss": 0.3899, "rewards/accuracies": 0.0, "rewards/chosen": 1.4960052967071533, "rewards/margins": 0.0, "rewards/rejected": 1.4960052967071533, "step": 10340 }, { "epoch": 1.68, "learning_rate": 2.4965862456867014e-07, "logits/chosen": -0.8304547071456909, "logits/rejected": -0.8046250939369202, "logps/chosen": -175.31732177734375, "logps/rejected": -203.90963745117188, "loss": 0.4921, "rewards/accuracies": 0.0, "rewards/chosen": 5.736441135406494, "rewards/margins": -0.5055418014526367, "rewards/rejected": 6.241982936859131, "step": 10341 }, { "epoch": 1.68, "learning_rate": 2.4954486734018614e-07, "logits/chosen": -0.7063593864440918, "logits/rejected": -0.4645882248878479, "logps/chosen": -107.89598083496094, "logps/rejected": -64.76289367675781, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 4.2431535720825195, "rewards/margins": 1.9343187808990479, "rewards/rejected": 2.3088347911834717, "step": 10342 }, { "epoch": 1.68, "learning_rate": 2.4943112741578183e-07, "logits/chosen": -0.7076191902160645, "logits/rejected": -0.5582454204559326, "logps/chosen": -113.17704772949219, "logps/rejected": -18.974966049194336, "loss": 2.4505, "rewards/accuracies": 1.0, "rewards/chosen": 1.1163063049316406, "rewards/margins": 0.8560651540756226, "rewards/rejected": 0.2602411210536957, "step": 10343 }, { "epoch": 1.68, "learning_rate": 2.4931740480331585e-07, "logits/chosen": -0.9240341782569885, "logits/rejected": -0.7435742616653442, "logps/chosen": -103.94581604003906, "logps/rejected": -110.0685806274414, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 6.150964260101318, "rewards/margins": 2.8247716426849365, "rewards/rejected": 3.326192617416382, "step": 10344 }, { "epoch": 1.68, "learning_rate": 2.492036995106451e-07, "logits/chosen": -0.9329162836074829, "logits/rejected": -0.8971329927444458, "logps/chosen": -51.80403518676758, "logps/rejected": -146.64324951171875, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": 1.5061177015304565, "rewards/margins": -0.6862269639968872, "rewards/rejected": 2.1923446655273438, "step": 10345 }, { "epoch": 1.68, "learning_rate": 2.4909001154562575e-07, "logits/chosen": -0.6961399912834167, "logits/rejected": -0.6389605402946472, "logps/chosen": -72.98998260498047, "logps/rejected": -77.81819152832031, "loss": 0.3725, "rewards/accuracies": 0.0, "rewards/chosen": 2.1132233142852783, "rewards/margins": -0.0876915454864502, "rewards/rejected": 2.2009148597717285, "step": 10346 }, { "epoch": 1.68, "learning_rate": 2.489763409161123e-07, "logits/chosen": -0.66008061170578, "logits/rejected": -0.6181079149246216, "logps/chosen": -86.96034240722656, "logps/rejected": -58.08555221557617, "loss": 0.5469, "rewards/accuracies": 1.0, "rewards/chosen": 1.88420569896698, "rewards/margins": 0.7628269195556641, "rewards/rejected": 1.121378779411316, "step": 10347 }, { "epoch": 1.68, "learning_rate": 2.488626876299587e-07, "logits/chosen": -0.4049840569496155, "logits/rejected": -0.4063637852668762, "logps/chosen": -85.79336547851562, "logps/rejected": -115.954345703125, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 2.54331374168396, "rewards/margins": 0.30870532989501953, "rewards/rejected": 2.2346084117889404, "step": 10348 }, { "epoch": 1.68, "learning_rate": 2.487490516950169e-07, "logits/chosen": -0.8825168609619141, "logits/rejected": -0.8609030842781067, "logps/chosen": -69.82765197753906, "logps/rejected": -53.203826904296875, "loss": 1.0109, "rewards/accuracies": 0.0, "rewards/chosen": 1.1210647821426392, "rewards/margins": -0.9665566682815552, "rewards/rejected": 2.0876214504241943, "step": 10349 }, { "epoch": 1.68, "learning_rate": 2.486354331191385e-07, "logits/chosen": -0.40720388293266296, "logits/rejected": -0.4134908616542816, "logps/chosen": -75.16322326660156, "logps/rejected": -72.72219848632812, "loss": 1.5013, "rewards/accuracies": 0.0, "rewards/chosen": 1.3856338262557983, "rewards/margins": -1.314501166343689, "rewards/rejected": 2.7001349925994873, "step": 10350 }, { "epoch": 1.68, "learning_rate": 2.48521831910173e-07, "logits/chosen": -0.6442658305168152, "logits/rejected": -0.45257413387298584, "logps/chosen": -154.90274047851562, "logps/rejected": -62.46661376953125, "loss": 0.1024, "rewards/accuracies": 1.0, "rewards/chosen": 4.798821926116943, "rewards/margins": 3.460916757583618, "rewards/rejected": 1.3379051685333252, "step": 10351 }, { "epoch": 1.68, "learning_rate": 2.4840824807596963e-07, "logits/chosen": -0.37840747833251953, "logits/rejected": -0.37840747833251953, "logps/chosen": -27.690645217895508, "logps/rejected": -27.690645217895508, "loss": 0.8735, "rewards/accuracies": 0.0, "rewards/chosen": 1.9814318418502808, "rewards/margins": 0.0, "rewards/rejected": 1.9814318418502808, "step": 10352 }, { "epoch": 1.68, "learning_rate": 2.482946816243755e-07, "logits/chosen": -0.5469508767127991, "logits/rejected": -0.5155247449874878, "logps/chosen": -69.9281234741211, "logps/rejected": -53.78473663330078, "loss": 0.2179, "rewards/accuracies": 1.0, "rewards/chosen": 2.087082624435425, "rewards/margins": 0.6826475858688354, "rewards/rejected": 1.4044350385665894, "step": 10353 }, { "epoch": 1.68, "learning_rate": 2.4818113256323744e-07, "logits/chosen": -0.581850528717041, "logits/rejected": -0.6345347166061401, "logps/chosen": -110.79403686523438, "logps/rejected": -113.51136016845703, "loss": 0.905, "rewards/accuracies": 0.0, "rewards/chosen": 1.453229546546936, "rewards/margins": -1.128137230873108, "rewards/rejected": 2.581366777420044, "step": 10354 }, { "epoch": 1.68, "learning_rate": 2.480676009004002e-07, "logits/chosen": -0.7170258164405823, "logits/rejected": -0.7170258164405823, "logps/chosen": -48.34710693359375, "logps/rejected": -48.34710693359375, "loss": 1.1996, "rewards/accuracies": 0.0, "rewards/chosen": 4.201043605804443, "rewards/margins": 0.0, "rewards/rejected": 4.201043605804443, "step": 10355 }, { "epoch": 1.68, "learning_rate": 2.479540866437081e-07, "logits/chosen": -0.34657013416290283, "logits/rejected": -0.3882637619972229, "logps/chosen": -36.79588317871094, "logps/rejected": -47.73412322998047, "loss": 0.7612, "rewards/accuracies": 0.0, "rewards/chosen": 1.339058756828308, "rewards/margins": -1.2046278715133667, "rewards/rejected": 2.543686628341675, "step": 10356 }, { "epoch": 1.68, "learning_rate": 2.4784058980100353e-07, "logits/chosen": -0.7583851218223572, "logits/rejected": -0.7389156818389893, "logps/chosen": -23.82404136657715, "logps/rejected": -77.3184814453125, "loss": 0.5821, "rewards/accuracies": 1.0, "rewards/chosen": 1.4114534854888916, "rewards/margins": 0.7749682068824768, "rewards/rejected": 0.6364852786064148, "step": 10357 }, { "epoch": 1.68, "learning_rate": 2.4772711038012844e-07, "logits/chosen": -0.5543404817581177, "logits/rejected": -0.5142128467559814, "logps/chosen": -98.09536743164062, "logps/rejected": -69.75462341308594, "loss": 1.7893, "rewards/accuracies": 0.0, "rewards/chosen": 0.161906436085701, "rewards/margins": -1.8849151134490967, "rewards/rejected": 2.0468215942382812, "step": 10358 }, { "epoch": 1.68, "learning_rate": 2.4761364838892283e-07, "logits/chosen": -1.4375865459442139, "logits/rejected": -1.5046718120574951, "logps/chosen": -211.59823608398438, "logps/rejected": -107.125732421875, "loss": 0.0144, "rewards/accuracies": 1.0, "rewards/chosen": 6.750619411468506, "rewards/margins": 3.6843016147613525, "rewards/rejected": 3.0663177967071533, "step": 10359 }, { "epoch": 1.68, "learning_rate": 2.4750020383522616e-07, "logits/chosen": -0.4239209294319153, "logits/rejected": -0.4432513415813446, "logps/chosen": -57.938621520996094, "logps/rejected": -115.99418640136719, "loss": 2.2521, "rewards/accuracies": 0.0, "rewards/chosen": 3.0066184997558594, "rewards/margins": -1.0643839836120605, "rewards/rejected": 4.07100248336792, "step": 10360 }, { "epoch": 1.68, "learning_rate": 2.4738677672687613e-07, "logits/chosen": -0.9121976494789124, "logits/rejected": -0.8796688318252563, "logps/chosen": -67.70227813720703, "logps/rejected": -11.136518478393555, "loss": 0.1761, "rewards/accuracies": 1.0, "rewards/chosen": 2.9200141429901123, "rewards/margins": 1.9448316097259521, "rewards/rejected": 0.9751825332641602, "step": 10361 }, { "epoch": 1.68, "learning_rate": 2.472733670717097e-07, "logits/chosen": -0.4394870102405548, "logits/rejected": -0.46921178698539734, "logps/chosen": -89.71070098876953, "logps/rejected": -57.68638610839844, "loss": 0.6233, "rewards/accuracies": 0.0, "rewards/chosen": 1.0622352361679077, "rewards/margins": -0.049485087394714355, "rewards/rejected": 1.111720323562622, "step": 10362 }, { "epoch": 1.68, "learning_rate": 2.4715997487756214e-07, "logits/chosen": -0.744566798210144, "logits/rejected": -0.6241852045059204, "logps/chosen": -66.6376953125, "logps/rejected": -57.6407356262207, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 2.0769059658050537, "rewards/margins": 0.3196103572845459, "rewards/rejected": 1.7572956085205078, "step": 10363 }, { "epoch": 1.68, "learning_rate": 2.470466001522681e-07, "logits/chosen": -0.9722837209701538, "logits/rejected": -0.9240303635597229, "logps/chosen": -117.87393188476562, "logps/rejected": -86.10091400146484, "loss": 0.4336, "rewards/accuracies": 0.0, "rewards/chosen": 3.4753053188323975, "rewards/margins": -0.30129313468933105, "rewards/rejected": 3.7765984535217285, "step": 10364 }, { "epoch": 1.68, "learning_rate": 2.469332429036603e-07, "logits/chosen": -0.6498833298683167, "logits/rejected": -0.6567415595054626, "logps/chosen": -84.75216674804688, "logps/rejected": -73.2377700805664, "loss": 2.3526, "rewards/accuracies": 0.0, "rewards/chosen": 1.0029258728027344, "rewards/margins": -0.5004158020019531, "rewards/rejected": 1.5033416748046875, "step": 10365 }, { "epoch": 1.68, "learning_rate": 2.4681990313957107e-07, "logits/chosen": -0.9186802506446838, "logits/rejected": -0.911296546459198, "logps/chosen": -60.44968032836914, "logps/rejected": -61.61433410644531, "loss": 0.5359, "rewards/accuracies": 0.0, "rewards/chosen": 2.2437503337860107, "rewards/margins": -0.5128324031829834, "rewards/rejected": 2.756582736968994, "step": 10366 }, { "epoch": 1.68, "learning_rate": 2.4670658086783073e-07, "logits/chosen": -1.134202480316162, "logits/rejected": -1.1364141702651978, "logps/chosen": -148.01205444335938, "logps/rejected": -180.4178466796875, "loss": 0.7523, "rewards/accuracies": 0.0, "rewards/chosen": 5.272520542144775, "rewards/margins": -1.2137956619262695, "rewards/rejected": 6.486316204071045, "step": 10367 }, { "epoch": 1.68, "learning_rate": 2.4659327609626917e-07, "logits/chosen": -0.782308042049408, "logits/rejected": -0.759748637676239, "logps/chosen": -69.39649963378906, "logps/rejected": -98.49893188476562, "loss": 1.0903, "rewards/accuracies": 0.0, "rewards/chosen": 1.1952675580978394, "rewards/margins": -2.0447821617126465, "rewards/rejected": 3.2400498390197754, "step": 10368 }, { "epoch": 1.68, "learning_rate": 2.4647998883271426e-07, "logits/chosen": -0.8467827439308167, "logits/rejected": -0.7233166694641113, "logps/chosen": -72.69866180419922, "logps/rejected": -26.551799774169922, "loss": 0.1119, "rewards/accuracies": 1.0, "rewards/chosen": 1.896279215812683, "rewards/margins": 1.5013747215270996, "rewards/rejected": 0.3949045240879059, "step": 10369 }, { "epoch": 1.68, "learning_rate": 2.4636671908499336e-07, "logits/chosen": -0.43477651476860046, "logits/rejected": -0.4509895145893097, "logps/chosen": -74.40230560302734, "logps/rejected": -51.175926208496094, "loss": 1.1092, "rewards/accuracies": 0.0, "rewards/chosen": 0.9864479303359985, "rewards/margins": -0.9302413463592529, "rewards/rejected": 1.9166892766952515, "step": 10370 }, { "epoch": 1.68, "learning_rate": 2.462534668609324e-07, "logits/chosen": -0.766305148601532, "logits/rejected": -0.6623542308807373, "logps/chosen": -44.2227783203125, "logps/rejected": -46.817962646484375, "loss": 0.452, "rewards/accuracies": 1.0, "rewards/chosen": 1.5591747760772705, "rewards/margins": 0.32388579845428467, "rewards/rejected": 1.2352889776229858, "step": 10371 }, { "epoch": 1.68, "learning_rate": 2.4614023216835574e-07, "logits/chosen": -0.8534546494483948, "logits/rejected": -0.7059932351112366, "logps/chosen": -63.674652099609375, "logps/rejected": -57.51895523071289, "loss": 0.1092, "rewards/accuracies": 1.0, "rewards/chosen": 4.08906888961792, "rewards/margins": 1.7454733848571777, "rewards/rejected": 2.343595504760742, "step": 10372 }, { "epoch": 1.68, "learning_rate": 2.4602701501508715e-07, "logits/chosen": -0.9107005000114441, "logits/rejected": -0.8698269724845886, "logps/chosen": -135.36181640625, "logps/rejected": -82.23170471191406, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 3.7324860095977783, "rewards/margins": 1.365386962890625, "rewards/rejected": 2.3670990467071533, "step": 10373 }, { "epoch": 1.68, "learning_rate": 2.4591381540894854e-07, "logits/chosen": -0.9754243493080139, "logits/rejected": -0.986922025680542, "logps/chosen": -55.73661804199219, "logps/rejected": -78.61248779296875, "loss": 1.0174, "rewards/accuracies": 0.0, "rewards/chosen": 0.5460163354873657, "rewards/margins": -0.20958596467971802, "rewards/rejected": 0.7556023001670837, "step": 10374 }, { "epoch": 1.68, "learning_rate": 2.458006333577613e-07, "logits/chosen": -0.7530913949012756, "logits/rejected": -0.7527212500572205, "logps/chosen": -32.63322448730469, "logps/rejected": -46.2147102355957, "loss": 0.8636, "rewards/accuracies": 1.0, "rewards/chosen": 0.5474826693534851, "rewards/margins": 0.05360183119773865, "rewards/rejected": 0.49388083815574646, "step": 10375 }, { "epoch": 1.68, "learning_rate": 2.456874688693449e-07, "logits/chosen": -0.668147087097168, "logits/rejected": -0.7130763530731201, "logps/chosen": -20.052101135253906, "logps/rejected": -42.35214614868164, "loss": 0.8594, "rewards/accuracies": 0.0, "rewards/chosen": 0.8731910586357117, "rewards/margins": -1.0597114562988281, "rewards/rejected": 1.9329025745391846, "step": 10376 }, { "epoch": 1.68, "learning_rate": 2.4557432195151817e-07, "logits/chosen": -0.7625574469566345, "logits/rejected": -0.765465497970581, "logps/chosen": -168.5460662841797, "logps/rejected": -123.75666809082031, "loss": 0.15, "rewards/accuracies": 1.0, "rewards/chosen": 3.3764846324920654, "rewards/margins": 1.2712371349334717, "rewards/rejected": 2.1052474975585938, "step": 10377 }, { "epoch": 1.68, "learning_rate": 2.4546119261209826e-07, "logits/chosen": -0.5126506090164185, "logits/rejected": -0.46678707003593445, "logps/chosen": -75.7847900390625, "logps/rejected": -83.6581802368164, "loss": 0.5292, "rewards/accuracies": 1.0, "rewards/chosen": 1.061865210533142, "rewards/margins": 0.039137959480285645, "rewards/rejected": 1.0227272510528564, "step": 10378 }, { "epoch": 1.68, "learning_rate": 2.453480808589016e-07, "logits/chosen": -0.7815719246864319, "logits/rejected": -0.7958645820617676, "logps/chosen": -44.25028610229492, "logps/rejected": -49.891944885253906, "loss": 0.3289, "rewards/accuracies": 1.0, "rewards/chosen": 2.0216197967529297, "rewards/margins": 0.44409143924713135, "rewards/rejected": 1.5775283575057983, "step": 10379 }, { "epoch": 1.68, "learning_rate": 2.4523498669974293e-07, "logits/chosen": -0.026532474905252457, "logits/rejected": -0.026532474905252457, "logps/chosen": -42.94417953491211, "logps/rejected": -42.94417953491211, "loss": 1.6101, "rewards/accuracies": 0.0, "rewards/chosen": 0.6264244318008423, "rewards/margins": 0.0, "rewards/rejected": 0.6264244318008423, "step": 10380 }, { "epoch": 1.68, "learning_rate": 2.4512191014243617e-07, "logits/chosen": -0.8520798087120056, "logits/rejected": -0.8429690003395081, "logps/chosen": -51.916282653808594, "logps/rejected": -27.25380516052246, "loss": 1.8564, "rewards/accuracies": 0.0, "rewards/chosen": 1.1557769775390625, "rewards/margins": -0.7446020841598511, "rewards/rejected": 1.9003790616989136, "step": 10381 }, { "epoch": 1.69, "learning_rate": 2.450088511947936e-07, "logits/chosen": -0.4513263404369354, "logits/rejected": -0.4266573488712311, "logps/chosen": -70.599609375, "logps/rejected": -60.76801300048828, "loss": 0.5186, "rewards/accuracies": 1.0, "rewards/chosen": 0.6117141842842102, "rewards/margins": 0.34722787141799927, "rewards/rejected": 0.26448631286621094, "step": 10382 }, { "epoch": 1.69, "learning_rate": 2.4489580986462685e-07, "logits/chosen": -0.20475678145885468, "logits/rejected": -0.19590647518634796, "logps/chosen": -4.684701442718506, "logps/rejected": -31.469846725463867, "loss": 0.657, "rewards/accuracies": 1.0, "rewards/chosen": 0.20635519921779633, "rewards/margins": 0.16814032196998596, "rewards/rejected": 0.038214873522520065, "step": 10383 }, { "epoch": 1.69, "learning_rate": 2.447827861597456e-07, "logits/chosen": -0.6276462078094482, "logits/rejected": -0.6407634019851685, "logps/chosen": -182.27879333496094, "logps/rejected": -50.251075744628906, "loss": 0.809, "rewards/accuracies": 0.0, "rewards/chosen": 0.5816925168037415, "rewards/margins": -1.2767784595489502, "rewards/rejected": 1.8584709167480469, "step": 10384 }, { "epoch": 1.69, "learning_rate": 2.4466978008795905e-07, "logits/chosen": -0.40810301899909973, "logits/rejected": -0.30825650691986084, "logps/chosen": -76.58676147460938, "logps/rejected": -84.3047103881836, "loss": 0.6642, "rewards/accuracies": 0.0, "rewards/chosen": 1.8773040771484375, "rewards/margins": -0.5841577053070068, "rewards/rejected": 2.4614617824554443, "step": 10385 }, { "epoch": 1.69, "learning_rate": 2.445567916570747e-07, "logits/chosen": -0.7080987691879272, "logits/rejected": -0.722842276096344, "logps/chosen": -117.1568832397461, "logps/rejected": -204.1880645751953, "loss": 1.1076, "rewards/accuracies": 0.0, "rewards/chosen": 3.9795539379119873, "rewards/margins": -0.717083215713501, "rewards/rejected": 4.696637153625488, "step": 10386 }, { "epoch": 1.69, "learning_rate": 2.4444382087489914e-07, "logits/chosen": -0.9286807775497437, "logits/rejected": -0.9503555297851562, "logps/chosen": -73.77096557617188, "logps/rejected": -109.95799255371094, "loss": 1.8336, "rewards/accuracies": 0.0, "rewards/chosen": 2.484854221343994, "rewards/margins": -1.5393142700195312, "rewards/rejected": 4.024168491363525, "step": 10387 }, { "epoch": 1.69, "learning_rate": 2.4433086774923726e-07, "logits/chosen": -0.5681363344192505, "logits/rejected": -0.5476164221763611, "logps/chosen": -53.79551696777344, "logps/rejected": -65.56468200683594, "loss": 0.7558, "rewards/accuracies": 0.0, "rewards/chosen": 2.8228745460510254, "rewards/margins": -0.43152618408203125, "rewards/rejected": 3.2544007301330566, "step": 10388 }, { "epoch": 1.69, "learning_rate": 2.442179322878935e-07, "logits/chosen": -1.0993517637252808, "logits/rejected": -1.1088745594024658, "logps/chosen": -124.79644775390625, "logps/rejected": -87.20657348632812, "loss": 0.5484, "rewards/accuracies": 0.0, "rewards/chosen": 1.8475311994552612, "rewards/margins": -0.5295332670211792, "rewards/rejected": 2.3770644664764404, "step": 10389 }, { "epoch": 1.69, "learning_rate": 2.4410501449867016e-07, "logits/chosen": -1.1193039417266846, "logits/rejected": -0.8924912810325623, "logps/chosen": -100.90654754638672, "logps/rejected": -20.437734603881836, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 4.77068567276001, "rewards/margins": 4.418224811553955, "rewards/rejected": 0.35246068239212036, "step": 10390 }, { "epoch": 1.69, "learning_rate": 2.4399211438936924e-07, "logits/chosen": -1.0099878311157227, "logits/rejected": -0.9437036514282227, "logps/chosen": -149.4908447265625, "logps/rejected": -80.16666412353516, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 5.338403224945068, "rewards/margins": 2.0673415660858154, "rewards/rejected": 3.271061658859253, "step": 10391 }, { "epoch": 1.69, "learning_rate": 2.4387923196779057e-07, "logits/chosen": -0.9724401235580444, "logits/rejected": -0.7666569948196411, "logps/chosen": -129.08302307128906, "logps/rejected": -48.6787223815918, "loss": 0.358, "rewards/accuracies": 1.0, "rewards/chosen": 6.18756103515625, "rewards/margins": 4.346694469451904, "rewards/rejected": 1.8408664464950562, "step": 10392 }, { "epoch": 1.69, "learning_rate": 2.4376636724173385e-07, "logits/chosen": -0.6673433780670166, "logits/rejected": -0.6215798854827881, "logps/chosen": -68.57659149169922, "logps/rejected": -57.87409973144531, "loss": 0.538, "rewards/accuracies": 1.0, "rewards/chosen": 2.111323595046997, "rewards/margins": 0.3075805902481079, "rewards/rejected": 1.8037430047988892, "step": 10393 }, { "epoch": 1.69, "learning_rate": 2.4365352021899634e-07, "logits/chosen": -0.8633062839508057, "logits/rejected": -0.733623743057251, "logps/chosen": -80.36419677734375, "logps/rejected": -22.67745590209961, "loss": 2.4707, "rewards/accuracies": 1.0, "rewards/chosen": 2.5410592555999756, "rewards/margins": 1.6914613246917725, "rewards/rejected": 0.8495979309082031, "step": 10394 }, { "epoch": 1.69, "learning_rate": 2.435406909073753e-07, "logits/chosen": -0.7550045847892761, "logits/rejected": -0.8518438935279846, "logps/chosen": -71.51675415039062, "logps/rejected": -94.54853820800781, "loss": 0.9784, "rewards/accuracies": 0.0, "rewards/chosen": 0.9624695181846619, "rewards/margins": -1.7988173961639404, "rewards/rejected": 2.761286973953247, "step": 10395 }, { "epoch": 1.69, "learning_rate": 2.434278793146656e-07, "logits/chosen": -0.6645457744598389, "logits/rejected": -0.6677459478378296, "logps/chosen": -53.08864212036133, "logps/rejected": -148.876953125, "loss": 0.9818, "rewards/accuracies": 0.0, "rewards/chosen": 1.622538447380066, "rewards/margins": -0.019978642463684082, "rewards/rejected": 1.64251708984375, "step": 10396 }, { "epoch": 1.69, "learning_rate": 2.4331508544866197e-07, "logits/chosen": -0.36610668897628784, "logits/rejected": -0.23762205243110657, "logps/chosen": -98.6943359375, "logps/rejected": -58.75777053833008, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 3.303851366043091, "rewards/margins": 0.5143694877624512, "rewards/rejected": 2.7894818782806396, "step": 10397 }, { "epoch": 1.69, "learning_rate": 2.4320230931715695e-07, "logits/chosen": -0.6496427059173584, "logits/rejected": -0.6074203252792358, "logps/chosen": -87.44174194335938, "logps/rejected": -98.3187255859375, "loss": 0.91, "rewards/accuracies": 1.0, "rewards/chosen": 2.0327377319335938, "rewards/margins": 0.26910173892974854, "rewards/rejected": 1.7636359930038452, "step": 10398 }, { "epoch": 1.69, "learning_rate": 2.430895509279427e-07, "logits/chosen": -0.28635939955711365, "logits/rejected": -0.28661441802978516, "logps/chosen": -1.5814762115478516, "logps/rejected": -3.1782608032226562, "loss": 2.2758, "rewards/accuracies": 0.0, "rewards/chosen": 0.45524922013282776, "rewards/margins": -0.1318422257900238, "rewards/rejected": 0.5870914459228516, "step": 10399 }, { "epoch": 1.69, "learning_rate": 2.429768102888094e-07, "logits/chosen": -0.6479470133781433, "logits/rejected": -0.580759584903717, "logps/chosen": -77.21949005126953, "logps/rejected": -20.76220703125, "loss": 0.3631, "rewards/accuracies": 0.0, "rewards/chosen": 0.15809325873851776, "rewards/margins": -0.0050720274448394775, "rewards/rejected": 0.16316528618335724, "step": 10400 }, { "epoch": 1.69, "learning_rate": 2.428640874075467e-07, "logits/chosen": -0.27114802598953247, "logits/rejected": -0.26534414291381836, "logps/chosen": -19.7271785736084, "logps/rejected": -21.599735260009766, "loss": 1.587, "rewards/accuracies": 0.0, "rewards/chosen": -0.0742681547999382, "rewards/margins": -0.13859234750270844, "rewards/rejected": 0.06432419270277023, "step": 10401 }, { "epoch": 1.69, "learning_rate": 2.4275138229194236e-07, "logits/chosen": -0.495057612657547, "logits/rejected": -0.3909130096435547, "logps/chosen": -69.90306091308594, "logps/rejected": -14.082369804382324, "loss": 0.4539, "rewards/accuracies": 1.0, "rewards/chosen": 2.5424866676330566, "rewards/margins": 1.4293349981307983, "rewards/rejected": 1.1131516695022583, "step": 10402 }, { "epoch": 1.69, "learning_rate": 2.426386949497836e-07, "logits/chosen": -0.6793553233146667, "logits/rejected": -0.6211554408073425, "logps/chosen": -79.28376770019531, "logps/rejected": -49.16753005981445, "loss": 1.3512, "rewards/accuracies": 0.0, "rewards/chosen": 1.0992218255996704, "rewards/margins": -0.0479358434677124, "rewards/rejected": 1.1471576690673828, "step": 10403 }, { "epoch": 1.69, "learning_rate": 2.425260253888556e-07, "logits/chosen": -1.0460996627807617, "logits/rejected": -1.0543498992919922, "logps/chosen": -68.15975952148438, "logps/rejected": -68.9796142578125, "loss": 2.2569, "rewards/accuracies": 0.0, "rewards/chosen": 3.320845127105713, "rewards/margins": -0.01934361457824707, "rewards/rejected": 3.34018874168396, "step": 10404 }, { "epoch": 1.69, "learning_rate": 2.4241337361694303e-07, "logits/chosen": -0.7875582575798035, "logits/rejected": -0.4127407670021057, "logps/chosen": -83.79014587402344, "logps/rejected": -60.49528121948242, "loss": 0.504, "rewards/accuracies": 0.0, "rewards/chosen": 1.5935882329940796, "rewards/margins": -0.5319417715072632, "rewards/rejected": 2.1255300045013428, "step": 10405 }, { "epoch": 1.69, "learning_rate": 2.4230073964182924e-07, "logits/chosen": -0.7811603546142578, "logits/rejected": -0.7331652641296387, "logps/chosen": -46.445045471191406, "logps/rejected": -14.462492942810059, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": 2.065783739089966, "rewards/margins": 1.0365064144134521, "rewards/rejected": 1.0292773246765137, "step": 10406 }, { "epoch": 1.69, "learning_rate": 2.4218812347129574e-07, "logits/chosen": -0.2712004780769348, "logits/rejected": -0.27460777759552, "logps/chosen": -1.8256324529647827, "logps/rejected": -2.6662943363189697, "loss": 1.9608, "rewards/accuracies": 0.0, "rewards/chosen": 0.2667715549468994, "rewards/margins": -0.025322377681732178, "rewards/rejected": 0.2920939326286316, "step": 10407 }, { "epoch": 1.69, "learning_rate": 2.4207552511312366e-07, "logits/chosen": -0.5999172925949097, "logits/rejected": -0.5831429958343506, "logps/chosen": -90.77216339111328, "logps/rejected": -85.36488342285156, "loss": 1.5051, "rewards/accuracies": 0.0, "rewards/chosen": 1.2425018548965454, "rewards/margins": -0.7695862054824829, "rewards/rejected": 2.0120880603790283, "step": 10408 }, { "epoch": 1.69, "learning_rate": 2.4196294457509214e-07, "logits/chosen": -0.7994712591171265, "logits/rejected": -0.7415727972984314, "logps/chosen": -230.48822021484375, "logps/rejected": -78.82347106933594, "loss": 0.4696, "rewards/accuracies": 0.0, "rewards/chosen": 1.8824280500411987, "rewards/margins": -0.1493927240371704, "rewards/rejected": 2.031820774078369, "step": 10409 }, { "epoch": 1.69, "learning_rate": 2.4185038186497974e-07, "logits/chosen": -0.6443266272544861, "logits/rejected": -0.649320125579834, "logps/chosen": -86.69950866699219, "logps/rejected": -118.04669952392578, "loss": 1.1159, "rewards/accuracies": 0.0, "rewards/chosen": 1.3387451171875, "rewards/margins": -1.5699822902679443, "rewards/rejected": 2.9087274074554443, "step": 10410 }, { "epoch": 1.69, "learning_rate": 2.4173783699056313e-07, "logits/chosen": -0.7616047859191895, "logits/rejected": -0.8409610986709595, "logps/chosen": -117.45323944091797, "logps/rejected": -143.21603393554688, "loss": 1.2872, "rewards/accuracies": 0.0, "rewards/chosen": 2.0492637157440186, "rewards/margins": -1.343839406967163, "rewards/rejected": 3.3931031227111816, "step": 10411 }, { "epoch": 1.69, "learning_rate": 2.416253099596185e-07, "logits/chosen": -0.6447286605834961, "logits/rejected": -0.6464598178863525, "logps/chosen": -5.958900451660156, "logps/rejected": -21.694822311401367, "loss": 0.3998, "rewards/accuracies": 0.0, "rewards/chosen": 0.21360884606838226, "rewards/margins": -0.19155950844287872, "rewards/rejected": 0.405168354511261, "step": 10412 }, { "epoch": 1.69, "learning_rate": 2.4151280077992e-07, "logits/chosen": -0.6467691659927368, "logits/rejected": -0.7076041102409363, "logps/chosen": -226.6120147705078, "logps/rejected": -150.4824676513672, "loss": 1.3979, "rewards/accuracies": 0.0, "rewards/chosen": 4.682191371917725, "rewards/margins": -2.7262911796569824, "rewards/rejected": 7.408482551574707, "step": 10413 }, { "epoch": 1.69, "learning_rate": 2.4140030945924135e-07, "logits/chosen": -0.9563678503036499, "logits/rejected": -0.9504860639572144, "logps/chosen": -87.48146057128906, "logps/rejected": -78.19622802734375, "loss": 0.5831, "rewards/accuracies": 0.0, "rewards/chosen": 1.6495682001113892, "rewards/margins": -0.6458901166915894, "rewards/rejected": 2.2954583168029785, "step": 10414 }, { "epoch": 1.69, "learning_rate": 2.4128783600535415e-07, "logits/chosen": -0.5183140635490417, "logits/rejected": -0.48721879720687866, "logps/chosen": -46.29652786254883, "logps/rejected": -94.98461151123047, "loss": 0.7078, "rewards/accuracies": 1.0, "rewards/chosen": 1.8437680006027222, "rewards/margins": 0.04115569591522217, "rewards/rejected": 1.8026123046875, "step": 10415 }, { "epoch": 1.69, "learning_rate": 2.4117538042602974e-07, "logits/chosen": -0.6720666289329529, "logits/rejected": -0.5232740044593811, "logps/chosen": -96.44915008544922, "logps/rejected": -27.342288970947266, "loss": 0.2007, "rewards/accuracies": 1.0, "rewards/chosen": 0.7983765006065369, "rewards/margins": 0.7865768671035767, "rewards/rejected": 0.011799621395766735, "step": 10416 }, { "epoch": 1.69, "learning_rate": 2.410629427290374e-07, "logits/chosen": -0.6879502534866333, "logits/rejected": -0.6561118364334106, "logps/chosen": -60.48149108886719, "logps/rejected": -56.49024200439453, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 2.487866163253784, "rewards/margins": -0.799447774887085, "rewards/rejected": 3.287313938140869, "step": 10417 }, { "epoch": 1.69, "learning_rate": 2.4095052292214575e-07, "logits/chosen": -0.7282270789146423, "logits/rejected": -0.7777202725410461, "logps/chosen": -80.52325439453125, "logps/rejected": -91.85176086425781, "loss": 1.3856, "rewards/accuracies": 0.0, "rewards/chosen": 1.6471771001815796, "rewards/margins": -1.8273323774337769, "rewards/rejected": 3.4745094776153564, "step": 10418 }, { "epoch": 1.69, "learning_rate": 2.4083812101312164e-07, "logits/chosen": -0.4378989040851593, "logits/rejected": -0.4366200566291809, "logps/chosen": -36.45216751098633, "logps/rejected": -25.224685668945312, "loss": 0.4503, "rewards/accuracies": 0.0, "rewards/chosen": 0.15546265244483948, "rewards/margins": -0.30496636033058167, "rewards/rejected": 0.46042901277542114, "step": 10419 }, { "epoch": 1.69, "learning_rate": 2.4072573700973136e-07, "logits/chosen": -0.9795250296592712, "logits/rejected": -1.1314470767974854, "logps/chosen": -131.87759399414062, "logps/rejected": -117.61537170410156, "loss": 0.0237, "rewards/accuracies": 1.0, "rewards/chosen": 7.963523864746094, "rewards/margins": 3.8619885444641113, "rewards/rejected": 4.101535320281982, "step": 10420 }, { "epoch": 1.69, "learning_rate": 2.4061337091973916e-07, "logits/chosen": -0.8567731976509094, "logits/rejected": -0.730499267578125, "logps/chosen": -171.239013671875, "logps/rejected": -72.49600219726562, "loss": 0.2821, "rewards/accuracies": 1.0, "rewards/chosen": 6.8493804931640625, "rewards/margins": 4.037717342376709, "rewards/rejected": 2.8116631507873535, "step": 10421 }, { "epoch": 1.69, "learning_rate": 2.4050102275090897e-07, "logits/chosen": -0.6940304636955261, "logits/rejected": -0.707173228263855, "logps/chosen": -97.95028686523438, "logps/rejected": -49.15470886230469, "loss": 0.2551, "rewards/accuracies": 1.0, "rewards/chosen": 2.363410234451294, "rewards/margins": 0.6877189874649048, "rewards/rejected": 1.6756912469863892, "step": 10422 }, { "epoch": 1.69, "learning_rate": 2.403886925110024e-07, "logits/chosen": -0.6812625527381897, "logits/rejected": -0.6970213055610657, "logps/chosen": -60.069725036621094, "logps/rejected": -79.24142456054688, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 1.1763817071914673, "rewards/margins": 0.1699378490447998, "rewards/rejected": 1.0064438581466675, "step": 10423 }, { "epoch": 1.69, "learning_rate": 2.4027638020778097e-07, "logits/chosen": -0.6809211373329163, "logits/rejected": -0.7022461891174316, "logps/chosen": -51.79778289794922, "logps/rejected": -92.28189849853516, "loss": 0.8099, "rewards/accuracies": 0.0, "rewards/chosen": 0.5255512595176697, "rewards/margins": -0.1791675090789795, "rewards/rejected": 0.7047187685966492, "step": 10424 }, { "epoch": 1.69, "learning_rate": 2.4016408584900394e-07, "logits/chosen": -0.5519852638244629, "logits/rejected": -0.5608732104301453, "logps/chosen": -129.8398895263672, "logps/rejected": -58.10202407836914, "loss": 1.1227, "rewards/accuracies": 1.0, "rewards/chosen": 4.169040203094482, "rewards/margins": 2.21781063079834, "rewards/rejected": 1.951229453086853, "step": 10425 }, { "epoch": 1.69, "learning_rate": 2.400518094424301e-07, "logits/chosen": -0.8718676567077637, "logits/rejected": -0.9507270455360413, "logps/chosen": -148.6605224609375, "logps/rejected": -92.0097885131836, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 4.988107204437256, "rewards/margins": -0.7600107192993164, "rewards/rejected": 5.748117923736572, "step": 10426 }, { "epoch": 1.69, "learning_rate": 2.399395509958167e-07, "logits/chosen": -0.8329352140426636, "logits/rejected": -0.684614360332489, "logps/chosen": -167.6534423828125, "logps/rejected": -47.05363082885742, "loss": 1.3173, "rewards/accuracies": 1.0, "rewards/chosen": 6.2948760986328125, "rewards/margins": 4.834231376647949, "rewards/rejected": 1.4606449604034424, "step": 10427 }, { "epoch": 1.69, "learning_rate": 2.3982731051691936e-07, "logits/chosen": -0.5031400918960571, "logits/rejected": -0.1754743754863739, "logps/chosen": -125.57853698730469, "logps/rejected": -43.35520553588867, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": 3.9930131435394287, "rewards/margins": 3.3751652240753174, "rewards/rejected": 0.6178478598594666, "step": 10428 }, { "epoch": 1.69, "learning_rate": 2.397150880134933e-07, "logits/chosen": -0.6414139866828918, "logits/rejected": -0.4940798282623291, "logps/chosen": -49.79304504394531, "logps/rejected": -51.670013427734375, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 2.2443313598632812, "rewards/margins": 0.11267542839050293, "rewards/rejected": 2.1316559314727783, "step": 10429 }, { "epoch": 1.69, "learning_rate": 2.396028834932916e-07, "logits/chosen": -0.6796345114707947, "logits/rejected": -0.6005632877349854, "logps/chosen": -44.901771545410156, "logps/rejected": -57.89158248901367, "loss": 0.6577, "rewards/accuracies": 0.0, "rewards/chosen": 1.7547543048858643, "rewards/margins": -0.9342103004455566, "rewards/rejected": 2.688964605331421, "step": 10430 }, { "epoch": 1.69, "learning_rate": 2.394906969640669e-07, "logits/chosen": -0.3744696080684662, "logits/rejected": -0.3744696080684662, "logps/chosen": -73.89079284667969, "logps/rejected": -73.89079284667969, "loss": 0.471, "rewards/accuracies": 0.0, "rewards/chosen": 0.8894394040107727, "rewards/margins": 0.0, "rewards/rejected": 0.8894394040107727, "step": 10431 }, { "epoch": 1.69, "learning_rate": 2.3937852843357e-07, "logits/chosen": -0.681887149810791, "logits/rejected": -0.7270652055740356, "logps/chosen": -113.19485473632812, "logps/rejected": -81.97229766845703, "loss": 1.104, "rewards/accuracies": 0.0, "rewards/chosen": 1.368493676185608, "rewards/margins": -0.8462494611740112, "rewards/rejected": 2.214743137359619, "step": 10432 }, { "epoch": 1.69, "learning_rate": 2.3926637790955085e-07, "logits/chosen": -0.7999841570854187, "logits/rejected": -0.8900037407875061, "logps/chosen": -86.52789306640625, "logps/rejected": -62.08876419067383, "loss": 0.6819, "rewards/accuracies": 0.0, "rewards/chosen": 2.1856706142425537, "rewards/margins": -0.03591179847717285, "rewards/rejected": 2.2215824127197266, "step": 10433 }, { "epoch": 1.69, "learning_rate": 2.3915424539975774e-07, "logits/chosen": -0.7824490666389465, "logits/rejected": -0.7710379958152771, "logps/chosen": -133.68081665039062, "logps/rejected": -112.05175018310547, "loss": 0.3432, "rewards/accuracies": 1.0, "rewards/chosen": 4.112094402313232, "rewards/margins": 2.701777935028076, "rewards/rejected": 1.4103164672851562, "step": 10434 }, { "epoch": 1.69, "learning_rate": 2.390421309119384e-07, "logits/chosen": -0.7510733008384705, "logits/rejected": -0.6124290227890015, "logps/chosen": -114.96426391601562, "logps/rejected": -19.401161193847656, "loss": 0.1123, "rewards/accuracies": 1.0, "rewards/chosen": 3.354989767074585, "rewards/margins": 3.036102294921875, "rewards/rejected": 0.31888753175735474, "step": 10435 }, { "epoch": 1.69, "learning_rate": 2.389300344538383e-07, "logits/chosen": -0.7200652360916138, "logits/rejected": -0.6038782596588135, "logps/chosen": -42.39646530151367, "logps/rejected": -68.58375549316406, "loss": 0.2812, "rewards/accuracies": 1.0, "rewards/chosen": 1.842087984085083, "rewards/margins": 0.9645542502403259, "rewards/rejected": 0.8775337338447571, "step": 10436 }, { "epoch": 1.69, "learning_rate": 2.3881795603320275e-07, "logits/chosen": -0.8831526637077332, "logits/rejected": -0.8452187776565552, "logps/chosen": -106.18511962890625, "logps/rejected": -94.61097717285156, "loss": 0.8648, "rewards/accuracies": 0.0, "rewards/chosen": 0.839251697063446, "rewards/margins": -0.5782715678215027, "rewards/rejected": 1.4175232648849487, "step": 10437 }, { "epoch": 1.69, "learning_rate": 2.38705895657775e-07, "logits/chosen": -0.7278887629508972, "logits/rejected": -0.764523983001709, "logps/chosen": -90.12962341308594, "logps/rejected": -82.81753540039062, "loss": 0.9326, "rewards/accuracies": 0.0, "rewards/chosen": 0.6947159171104431, "rewards/margins": -1.5467314720153809, "rewards/rejected": 2.2414474487304688, "step": 10438 }, { "epoch": 1.69, "learning_rate": 2.3859385333529764e-07, "logits/chosen": -0.7185295224189758, "logits/rejected": -0.7185295224189758, "logps/chosen": -77.78631591796875, "logps/rejected": -77.78631591796875, "loss": 0.3943, "rewards/accuracies": 0.0, "rewards/chosen": 2.8713090419769287, "rewards/margins": 0.0, "rewards/rejected": 2.8713090419769287, "step": 10439 }, { "epoch": 1.69, "learning_rate": 2.384818290735113e-07, "logits/chosen": -1.0023188591003418, "logits/rejected": -0.9740238785743713, "logps/chosen": -100.02902221679688, "logps/rejected": -33.10955810546875, "loss": 0.6364, "rewards/accuracies": 1.0, "rewards/chosen": 5.0008544921875, "rewards/margins": 3.57155704498291, "rewards/rejected": 1.4292973279953003, "step": 10440 }, { "epoch": 1.69, "learning_rate": 2.3836982288015633e-07, "logits/chosen": -0.6558494567871094, "logits/rejected": -0.6299520134925842, "logps/chosen": -49.690101623535156, "logps/rejected": -84.67117309570312, "loss": 1.196, "rewards/accuracies": 0.0, "rewards/chosen": 1.750836968421936, "rewards/margins": -1.983101725578308, "rewards/rejected": 3.733938694000244, "step": 10441 }, { "epoch": 1.69, "learning_rate": 2.3825783476297085e-07, "logits/chosen": -0.14293427765369415, "logits/rejected": -0.1489907056093216, "logps/chosen": -3.913022041320801, "logps/rejected": -1.3770276308059692, "loss": 1.119, "rewards/accuracies": 0.0, "rewards/chosen": 0.060013916343450546, "rewards/margins": -0.15581469237804413, "rewards/rejected": 0.21582861244678497, "step": 10442 }, { "epoch": 1.7, "learning_rate": 2.381458647296925e-07, "logits/chosen": -0.6138578057289124, "logits/rejected": -0.6730008721351624, "logps/chosen": -118.40924072265625, "logps/rejected": -180.86306762695312, "loss": 1.5414, "rewards/accuracies": 0.0, "rewards/chosen": 4.763360500335693, "rewards/margins": -3.0255188941955566, "rewards/rejected": 7.78887939453125, "step": 10443 }, { "epoch": 1.7, "learning_rate": 2.3803391278805706e-07, "logits/chosen": -0.357787549495697, "logits/rejected": -0.2881750464439392, "logps/chosen": -43.866355895996094, "logps/rejected": -13.844085693359375, "loss": 0.551, "rewards/accuracies": 1.0, "rewards/chosen": 1.0790481567382812, "rewards/margins": 0.19733655452728271, "rewards/rejected": 0.8817116022109985, "step": 10444 }, { "epoch": 1.7, "learning_rate": 2.379219789457997e-07, "logits/chosen": -0.7274147272109985, "logits/rejected": -0.681122899055481, "logps/chosen": -75.40266418457031, "logps/rejected": -48.48245620727539, "loss": 0.9042, "rewards/accuracies": 0.0, "rewards/chosen": 0.9668907523155212, "rewards/margins": -0.8327533602714539, "rewards/rejected": 1.799644112586975, "step": 10445 }, { "epoch": 1.7, "learning_rate": 2.3781006321065355e-07, "logits/chosen": -0.5266746282577515, "logits/rejected": -0.5273740291595459, "logps/chosen": -4.837088584899902, "logps/rejected": -2.786774158477783, "loss": 0.71, "rewards/accuracies": 0.0, "rewards/chosen": 0.27931591868400574, "rewards/margins": -0.06618335843086243, "rewards/rejected": 0.34549927711486816, "step": 10446 }, { "epoch": 1.7, "learning_rate": 2.376981655903514e-07, "logits/chosen": -0.7179773449897766, "logits/rejected": -0.7166277766227722, "logps/chosen": -33.39309310913086, "logps/rejected": -42.978599548339844, "loss": 0.8099, "rewards/accuracies": 1.0, "rewards/chosen": 1.8592350482940674, "rewards/margins": 0.25498318672180176, "rewards/rejected": 1.6042518615722656, "step": 10447 }, { "epoch": 1.7, "learning_rate": 2.3758628609262387e-07, "logits/chosen": -0.7834450006484985, "logits/rejected": -0.6059974431991577, "logps/chosen": -65.10578918457031, "logps/rejected": -22.4163818359375, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 1.7616928815841675, "rewards/margins": 1.0631179809570312, "rewards/rejected": 0.6985748410224915, "step": 10448 }, { "epoch": 1.7, "learning_rate": 2.3747442472520118e-07, "logits/chosen": -0.7013539671897888, "logits/rejected": -0.6441230773925781, "logps/chosen": -137.73171997070312, "logps/rejected": -64.41722106933594, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": 4.069998264312744, "rewards/margins": 2.2982840538024902, "rewards/rejected": 1.7717140913009644, "step": 10449 }, { "epoch": 1.7, "learning_rate": 2.373625814958115e-07, "logits/chosen": -0.8357442021369934, "logits/rejected": -0.8308331966400146, "logps/chosen": -152.19720458984375, "logps/rejected": -98.13630676269531, "loss": 0.3587, "rewards/accuracies": 1.0, "rewards/chosen": 2.1476852893829346, "rewards/margins": 0.02621912956237793, "rewards/rejected": 2.1214661598205566, "step": 10450 }, { "epoch": 1.7, "learning_rate": 2.3725075641218256e-07, "logits/chosen": -0.3284035623073578, "logits/rejected": -0.17647691071033478, "logps/chosen": -62.633941650390625, "logps/rejected": -20.796907424926758, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 1.9763764142990112, "rewards/margins": 1.423682689666748, "rewards/rejected": 0.552693784236908, "step": 10451 }, { "epoch": 1.7, "learning_rate": 2.3713894948204e-07, "logits/chosen": -0.6534731984138489, "logits/rejected": -0.6665695905685425, "logps/chosen": -64.39875793457031, "logps/rejected": -61.15849304199219, "loss": 0.5968, "rewards/accuracies": 0.0, "rewards/chosen": 1.9996185302734375, "rewards/margins": -0.4063377380371094, "rewards/rejected": 2.405956268310547, "step": 10452 }, { "epoch": 1.7, "learning_rate": 2.37027160713109e-07, "logits/chosen": -0.6924356818199158, "logits/rejected": -0.6949896812438965, "logps/chosen": -119.93789672851562, "logps/rejected": -49.975582122802734, "loss": 0.4213, "rewards/accuracies": 1.0, "rewards/chosen": 1.5205353498458862, "rewards/margins": 0.03517425060272217, "rewards/rejected": 1.485361099243164, "step": 10453 }, { "epoch": 1.7, "learning_rate": 2.3691539011311274e-07, "logits/chosen": -0.9600151777267456, "logits/rejected": -0.9420385360717773, "logps/chosen": -126.81419372558594, "logps/rejected": -83.83000183105469, "loss": 0.6316, "rewards/accuracies": 0.0, "rewards/chosen": 2.165257215499878, "rewards/margins": -0.930363655090332, "rewards/rejected": 3.09562087059021, "step": 10454 }, { "epoch": 1.7, "learning_rate": 2.3680363768977386e-07, "logits/chosen": -0.3679194152355194, "logits/rejected": -0.3448096215724945, "logps/chosen": -23.153757095336914, "logps/rejected": -19.2154541015625, "loss": 1.4064, "rewards/accuracies": 1.0, "rewards/chosen": -0.023673249408602715, "rewards/margins": 0.1468944549560547, "rewards/rejected": -0.17056770622730255, "step": 10455 }, { "epoch": 1.7, "learning_rate": 2.3669190345081307e-07, "logits/chosen": -0.4363512396812439, "logits/rejected": -0.37838757038116455, "logps/chosen": -30.976045608520508, "logps/rejected": -11.712936401367188, "loss": 0.371, "rewards/accuracies": 1.0, "rewards/chosen": 1.6842920780181885, "rewards/margins": 0.48016297817230225, "rewards/rejected": 1.2041290998458862, "step": 10456 }, { "epoch": 1.7, "learning_rate": 2.3658018740395047e-07, "logits/chosen": -0.8139804005622864, "logits/rejected": -0.7390710711479187, "logps/chosen": -140.24156188964844, "logps/rejected": -107.69970703125, "loss": 0.8825, "rewards/accuracies": 1.0, "rewards/chosen": 3.9758026599884033, "rewards/margins": 1.3517119884490967, "rewards/rejected": 2.6240906715393066, "step": 10457 }, { "epoch": 1.7, "learning_rate": 2.3646848955690423e-07, "logits/chosen": -0.7451192736625671, "logits/rejected": -0.618301272392273, "logps/chosen": -153.3309783935547, "logps/rejected": -90.27913665771484, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": 4.9420061111450195, "rewards/margins": 3.1616463661193848, "rewards/rejected": 1.7803596258163452, "step": 10458 }, { "epoch": 1.7, "learning_rate": 2.3635680991739198e-07, "logits/chosen": -0.6862545013427734, "logits/rejected": -0.680464506149292, "logps/chosen": -55.30244827270508, "logps/rejected": -50.39600372314453, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 2.541813373565674, "rewards/margins": 0.2403874397277832, "rewards/rejected": 2.3014259338378906, "step": 10459 }, { "epoch": 1.7, "learning_rate": 2.3624514849312944e-07, "logits/chosen": -0.3395061194896698, "logits/rejected": -0.3395061194896698, "logps/chosen": -40.871952056884766, "logps/rejected": -40.871952056884766, "loss": 0.7121, "rewards/accuracies": 0.0, "rewards/chosen": 0.316061794757843, "rewards/margins": 0.0, "rewards/rejected": 0.316061794757843, "step": 10460 }, { "epoch": 1.7, "learning_rate": 2.361335052918314e-07, "logits/chosen": -0.8220264315605164, "logits/rejected": -0.7208000421524048, "logps/chosen": -98.74034118652344, "logps/rejected": -63.650123596191406, "loss": 0.7436, "rewards/accuracies": 1.0, "rewards/chosen": 1.4979660511016846, "rewards/margins": 0.46163487434387207, "rewards/rejected": 1.0363311767578125, "step": 10461 }, { "epoch": 1.7, "learning_rate": 2.3602188032121163e-07, "logits/chosen": -0.461063027381897, "logits/rejected": -0.3811168968677521, "logps/chosen": -139.91058349609375, "logps/rejected": -42.352081298828125, "loss": 0.1901, "rewards/accuracies": 1.0, "rewards/chosen": 1.850195288658142, "rewards/margins": 0.8360671997070312, "rewards/rejected": 1.0141280889511108, "step": 10462 }, { "epoch": 1.7, "learning_rate": 2.359102735889819e-07, "logits/chosen": -0.23667532205581665, "logits/rejected": -0.18797795474529266, "logps/chosen": -41.23814392089844, "logps/rejected": -1.520384430885315, "loss": 0.476, "rewards/accuracies": 1.0, "rewards/chosen": 0.4429088532924652, "rewards/margins": 0.059398770332336426, "rewards/rejected": 0.3835100829601288, "step": 10463 }, { "epoch": 1.7, "learning_rate": 2.3579868510285368e-07, "logits/chosen": -0.9074257612228394, "logits/rejected": -0.8954623341560364, "logps/chosen": -91.28324890136719, "logps/rejected": -79.24324798583984, "loss": 0.9048, "rewards/accuracies": 0.0, "rewards/chosen": 0.9652649164199829, "rewards/margins": -0.5118995904922485, "rewards/rejected": 1.4771645069122314, "step": 10464 }, { "epoch": 1.7, "learning_rate": 2.356871148705362e-07, "logits/chosen": -0.5494306683540344, "logits/rejected": -0.3861250877380371, "logps/chosen": -87.8957290649414, "logps/rejected": -11.994873046875, "loss": 0.2184, "rewards/accuracies": 1.0, "rewards/chosen": 0.8599052429199219, "rewards/margins": 0.6093285083770752, "rewards/rejected": 0.2505767047405243, "step": 10465 }, { "epoch": 1.7, "learning_rate": 2.3557556289973835e-07, "logits/chosen": -0.9803423285484314, "logits/rejected": -0.8883962631225586, "logps/chosen": -108.95536804199219, "logps/rejected": -140.7320098876953, "loss": 1.8481, "rewards/accuracies": 0.0, "rewards/chosen": 4.950141906738281, "rewards/margins": -0.6858458518981934, "rewards/rejected": 5.635987758636475, "step": 10466 }, { "epoch": 1.7, "learning_rate": 2.3546402919816687e-07, "logits/chosen": -0.692639172077179, "logits/rejected": -0.6511498093605042, "logps/chosen": -70.471923828125, "logps/rejected": -61.542869567871094, "loss": 0.9488, "rewards/accuracies": 0.0, "rewards/chosen": 1.8865692615509033, "rewards/margins": -0.007413506507873535, "rewards/rejected": 1.8939827680587769, "step": 10467 }, { "epoch": 1.7, "learning_rate": 2.353525137735281e-07, "logits/chosen": -0.5057948231697083, "logits/rejected": -0.6657784581184387, "logps/chosen": -37.64561080932617, "logps/rejected": -34.428985595703125, "loss": 1.9047, "rewards/accuracies": 0.0, "rewards/chosen": 1.194252371788025, "rewards/margins": -1.2858647108078003, "rewards/rejected": 2.480117082595825, "step": 10468 }, { "epoch": 1.7, "learning_rate": 2.3524101663352637e-07, "logits/chosen": -0.4546928405761719, "logits/rejected": -0.4546928405761719, "logps/chosen": -30.018112182617188, "logps/rejected": -30.018112182617188, "loss": 1.8977, "rewards/accuracies": 0.0, "rewards/chosen": 1.496158242225647, "rewards/margins": 0.0, "rewards/rejected": 1.496158242225647, "step": 10469 }, { "epoch": 1.7, "learning_rate": 2.3512953778586537e-07, "logits/chosen": -0.6197242736816406, "logits/rejected": -0.6903895735740662, "logps/chosen": -68.85237121582031, "logps/rejected": -143.06700134277344, "loss": 1.8975, "rewards/accuracies": 0.0, "rewards/chosen": 2.445416212081909, "rewards/margins": -1.6564104557037354, "rewards/rejected": 4.1018266677856445, "step": 10470 }, { "epoch": 1.7, "learning_rate": 2.350180772382469e-07, "logits/chosen": -0.2910621464252472, "logits/rejected": -0.24082303047180176, "logps/chosen": -90.28329467773438, "logps/rejected": -54.96183776855469, "loss": 0.0972, "rewards/accuracies": 1.0, "rewards/chosen": 4.267909526824951, "rewards/margins": 2.360475540161133, "rewards/rejected": 1.907434105873108, "step": 10471 }, { "epoch": 1.7, "learning_rate": 2.3490663499837221e-07, "logits/chosen": -0.9393596649169922, "logits/rejected": -1.028726577758789, "logps/chosen": -250.3001708984375, "logps/rejected": -138.12208557128906, "loss": 1.4095, "rewards/accuracies": 0.0, "rewards/chosen": 6.182794094085693, "rewards/margins": -2.755537509918213, "rewards/rejected": 8.938331604003906, "step": 10472 }, { "epoch": 1.7, "learning_rate": 2.347952110739405e-07, "logits/chosen": -0.46234840154647827, "logits/rejected": -0.47636255621910095, "logps/chosen": -74.15886688232422, "logps/rejected": -69.40293884277344, "loss": 1.8057, "rewards/accuracies": 0.0, "rewards/chosen": 0.9113609194755554, "rewards/margins": -0.6523544192314148, "rewards/rejected": 1.5637153387069702, "step": 10473 }, { "epoch": 1.7, "learning_rate": 2.346838054726505e-07, "logits/chosen": -0.6163160800933838, "logits/rejected": -0.5515183806419373, "logps/chosen": -108.28848266601562, "logps/rejected": -72.82708740234375, "loss": 0.1845, "rewards/accuracies": 1.0, "rewards/chosen": 3.5504989624023438, "rewards/margins": 0.8542799949645996, "rewards/rejected": 2.696218967437744, "step": 10474 }, { "epoch": 1.7, "learning_rate": 2.345724182021989e-07, "logits/chosen": -0.8716767430305481, "logits/rejected": -0.6859058737754822, "logps/chosen": -78.5165023803711, "logps/rejected": -35.70011520385742, "loss": 0.0665, "rewards/accuracies": 1.0, "rewards/chosen": 2.6103196144104004, "rewards/margins": 2.1207706928253174, "rewards/rejected": 0.4895488917827606, "step": 10475 }, { "epoch": 1.7, "learning_rate": 2.3446104927028193e-07, "logits/chosen": -0.913382887840271, "logits/rejected": -0.7089687585830688, "logps/chosen": -81.39315795898438, "logps/rejected": -31.563446044921875, "loss": 0.445, "rewards/accuracies": 1.0, "rewards/chosen": 1.706018090248108, "rewards/margins": 1.1529537439346313, "rewards/rejected": 0.5530643463134766, "step": 10476 }, { "epoch": 1.7, "learning_rate": 2.343496986845937e-07, "logits/chosen": -0.3414306938648224, "logits/rejected": -0.3414306938648224, "logps/chosen": -34.6694221496582, "logps/rejected": -34.6694221496582, "loss": 0.359, "rewards/accuracies": 0.0, "rewards/chosen": 0.2823368012905121, "rewards/margins": 0.0, "rewards/rejected": 0.2823368012905121, "step": 10477 }, { "epoch": 1.7, "learning_rate": 2.3423836645282785e-07, "logits/chosen": -0.6734222173690796, "logits/rejected": -0.5927813053131104, "logps/chosen": -142.73123168945312, "logps/rejected": -55.61371994018555, "loss": 0.0677, "rewards/accuracies": 1.0, "rewards/chosen": 4.942044258117676, "rewards/margins": 2.951775074005127, "rewards/rejected": 1.9902690649032593, "step": 10478 }, { "epoch": 1.7, "learning_rate": 2.3412705258267602e-07, "logits/chosen": -0.7795178294181824, "logits/rejected": -0.8366421461105347, "logps/chosen": -124.17822265625, "logps/rejected": -101.7774658203125, "loss": 0.5753, "rewards/accuracies": 0.0, "rewards/chosen": 1.8284271955490112, "rewards/margins": -0.5995062589645386, "rewards/rejected": 2.42793345451355, "step": 10479 }, { "epoch": 1.7, "learning_rate": 2.3401575708182935e-07, "logits/chosen": -1.1867574453353882, "logits/rejected": -1.3492472171783447, "logps/chosen": -108.6944580078125, "logps/rejected": -35.12626647949219, "loss": 0.8469, "rewards/accuracies": 1.0, "rewards/chosen": 1.340246558189392, "rewards/margins": 0.9585452675819397, "rewards/rejected": 0.3817012906074524, "step": 10480 }, { "epoch": 1.7, "learning_rate": 2.3390447995797692e-07, "logits/chosen": -0.8443264365196228, "logits/rejected": -0.7557494044303894, "logps/chosen": -102.57948303222656, "logps/rejected": -53.900360107421875, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": 6.890098571777344, "rewards/margins": 3.378854990005493, "rewards/rejected": 3.5112435817718506, "step": 10481 }, { "epoch": 1.7, "learning_rate": 2.337932212188073e-07, "logits/chosen": -0.8527410626411438, "logits/rejected": -0.7180366516113281, "logps/chosen": -104.09269714355469, "logps/rejected": -107.05001831054688, "loss": 0.283, "rewards/accuracies": 1.0, "rewards/chosen": 2.6759262084960938, "rewards/margins": 0.5578811168670654, "rewards/rejected": 2.1180450916290283, "step": 10482 }, { "epoch": 1.7, "learning_rate": 2.3368198087200702e-07, "logits/chosen": -0.7023950219154358, "logits/rejected": -0.6421074867248535, "logps/chosen": -86.09950256347656, "logps/rejected": -54.431331634521484, "loss": 0.6649, "rewards/accuracies": 0.0, "rewards/chosen": 1.786889672279358, "rewards/margins": -0.9293438196182251, "rewards/rejected": 2.716233491897583, "step": 10483 }, { "epoch": 1.7, "learning_rate": 2.3357075892526212e-07, "logits/chosen": -1.033129334449768, "logits/rejected": -1.0648139715194702, "logps/chosen": -95.22532653808594, "logps/rejected": -55.78876495361328, "loss": 0.4297, "rewards/accuracies": 0.0, "rewards/chosen": 1.754756212234497, "rewards/margins": -0.23135292530059814, "rewards/rejected": 1.9861091375350952, "step": 10484 }, { "epoch": 1.7, "learning_rate": 2.334595553862566e-07, "logits/chosen": -0.9136088490486145, "logits/rejected": -0.8816163539886475, "logps/chosen": -79.51421356201172, "logps/rejected": -44.6711311340332, "loss": 0.2884, "rewards/accuracies": 1.0, "rewards/chosen": 2.535607099533081, "rewards/margins": 0.5516040325164795, "rewards/rejected": 1.9840030670166016, "step": 10485 }, { "epoch": 1.7, "learning_rate": 2.3334837026267402e-07, "logits/chosen": -0.5859048366546631, "logits/rejected": -0.5788770914077759, "logps/chosen": -141.12310791015625, "logps/rejected": -119.0055923461914, "loss": 1.0177, "rewards/accuracies": 1.0, "rewards/chosen": 4.030941963195801, "rewards/margins": 1.531043291091919, "rewards/rejected": 2.499898672103882, "step": 10486 }, { "epoch": 1.7, "learning_rate": 2.3323720356219573e-07, "logits/chosen": -0.9909605383872986, "logits/rejected": -0.903800368309021, "logps/chosen": -103.76042938232422, "logps/rejected": -149.35047912597656, "loss": 0.322, "rewards/accuracies": 1.0, "rewards/chosen": 4.483212947845459, "rewards/margins": 0.18925094604492188, "rewards/rejected": 4.293962001800537, "step": 10487 }, { "epoch": 1.7, "learning_rate": 2.3312605529250274e-07, "logits/chosen": -0.5919700264930725, "logits/rejected": -0.6328839063644409, "logps/chosen": -91.16767120361328, "logps/rejected": -119.58816528320312, "loss": 2.4906, "rewards/accuracies": 0.0, "rewards/chosen": 1.4956810474395752, "rewards/margins": -0.9373893737792969, "rewards/rejected": 2.433070421218872, "step": 10488 }, { "epoch": 1.7, "learning_rate": 2.33014925461274e-07, "logits/chosen": -0.9634265303611755, "logits/rejected": -0.9634265303611755, "logps/chosen": -110.40516662597656, "logps/rejected": -110.40516662597656, "loss": 0.41, "rewards/accuracies": 0.0, "rewards/chosen": 0.8722847104072571, "rewards/margins": 0.0, "rewards/rejected": 0.8722847104072571, "step": 10489 }, { "epoch": 1.7, "learning_rate": 2.329038140761878e-07, "logits/chosen": -0.5087342262268066, "logits/rejected": -0.5392724275588989, "logps/chosen": -4.593704700469971, "logps/rejected": -21.24197006225586, "loss": 0.8502, "rewards/accuracies": 0.0, "rewards/chosen": 0.4458925724029541, "rewards/margins": -0.08913666009902954, "rewards/rejected": 0.5350292325019836, "step": 10490 }, { "epoch": 1.7, "learning_rate": 2.327927211449206e-07, "logits/chosen": -0.8178126811981201, "logits/rejected": -0.8050978779792786, "logps/chosen": -49.563751220703125, "logps/rejected": -114.67210388183594, "loss": 0.4434, "rewards/accuracies": 0.0, "rewards/chosen": 1.0912407636642456, "rewards/margins": -0.33015668392181396, "rewards/rejected": 1.4213974475860596, "step": 10491 }, { "epoch": 1.7, "learning_rate": 2.3268164667514824e-07, "logits/chosen": -0.991645336151123, "logits/rejected": -0.9159663319587708, "logps/chosen": -42.13623809814453, "logps/rejected": -22.99296760559082, "loss": 0.3097, "rewards/accuracies": 1.0, "rewards/chosen": 1.5459407567977905, "rewards/margins": 1.215816855430603, "rewards/rejected": 0.3301239013671875, "step": 10492 }, { "epoch": 1.7, "learning_rate": 2.3257059067454447e-07, "logits/chosen": -0.7904247641563416, "logits/rejected": -0.7964598536491394, "logps/chosen": -102.45553588867188, "logps/rejected": -52.940589904785156, "loss": 0.6031, "rewards/accuracies": 0.0, "rewards/chosen": 1.4631332159042358, "rewards/margins": -0.3030815124511719, "rewards/rejected": 1.7662147283554077, "step": 10493 }, { "epoch": 1.7, "learning_rate": 2.3245955315078264e-07, "logits/chosen": -0.7958889007568359, "logits/rejected": -0.7037749290466309, "logps/chosen": -86.31749725341797, "logps/rejected": -6.338417053222656, "loss": 0.1625, "rewards/accuracies": 1.0, "rewards/chosen": 2.3163888454437256, "rewards/margins": 1.3839972019195557, "rewards/rejected": 0.9323916435241699, "step": 10494 }, { "epoch": 1.7, "learning_rate": 2.3234853411153406e-07, "logits/chosen": -0.7164519429206848, "logits/rejected": -0.6609954833984375, "logps/chosen": -43.363014221191406, "logps/rejected": -49.3740119934082, "loss": 0.413, "rewards/accuracies": 1.0, "rewards/chosen": 2.0699081420898438, "rewards/margins": 0.004576444625854492, "rewards/rejected": 2.0653316974639893, "step": 10495 }, { "epoch": 1.7, "learning_rate": 2.3223753356446918e-07, "logits/chosen": -0.974188506603241, "logits/rejected": -0.9485383629798889, "logps/chosen": -74.75247192382812, "logps/rejected": -137.08111572265625, "loss": 0.7556, "rewards/accuracies": 0.0, "rewards/chosen": 4.826940059661865, "rewards/margins": -0.9227428436279297, "rewards/rejected": 5.749682903289795, "step": 10496 }, { "epoch": 1.7, "learning_rate": 2.3212655151725736e-07, "logits/chosen": -0.6863893866539001, "logits/rejected": -0.6992942690849304, "logps/chosen": -85.61668395996094, "logps/rejected": -120.16622924804688, "loss": 1.273, "rewards/accuracies": 0.0, "rewards/chosen": 1.657313585281372, "rewards/margins": -2.2859740257263184, "rewards/rejected": 3.9432876110076904, "step": 10497 }, { "epoch": 1.7, "learning_rate": 2.3201558797756598e-07, "logits/chosen": -0.9612865447998047, "logits/rejected": -0.9439949989318848, "logps/chosen": -121.81340026855469, "logps/rejected": -116.01919555664062, "loss": 0.8149, "rewards/accuracies": 0.0, "rewards/chosen": 4.072264194488525, "rewards/margins": -1.3655290603637695, "rewards/rejected": 5.437793254852295, "step": 10498 }, { "epoch": 1.7, "learning_rate": 2.3190464295306195e-07, "logits/chosen": -1.0047428607940674, "logits/rejected": -0.9459739327430725, "logps/chosen": -126.01194763183594, "logps/rejected": -55.658966064453125, "loss": 0.6519, "rewards/accuracies": 0.0, "rewards/chosen": 1.4771744012832642, "rewards/margins": -0.7546349763870239, "rewards/rejected": 2.231809377670288, "step": 10499 }, { "epoch": 1.7, "learning_rate": 2.3179371645141016e-07, "logits/chosen": -0.10148011893033981, "logits/rejected": -0.10223197937011719, "logps/chosen": -14.795694351196289, "logps/rejected": -5.029299736022949, "loss": 1.2446, "rewards/accuracies": 0.0, "rewards/chosen": -0.2966442108154297, "rewards/margins": -0.42806631326675415, "rewards/rejected": 0.13142208755016327, "step": 10500 }, { "epoch": 1.7, "learning_rate": 2.31682808480275e-07, "logits/chosen": -0.7353296875953674, "logits/rejected": -0.7430892586708069, "logps/chosen": -33.169776916503906, "logps/rejected": -42.26783752441406, "loss": 0.2951, "rewards/accuracies": 1.0, "rewards/chosen": 1.9055789709091187, "rewards/margins": 0.3631945848464966, "rewards/rejected": 1.542384386062622, "step": 10501 }, { "epoch": 1.7, "learning_rate": 2.315719190473187e-07, "logits/chosen": -0.7132552266120911, "logits/rejected": -0.7293692827224731, "logps/chosen": -74.17708587646484, "logps/rejected": -80.07437133789062, "loss": 0.7257, "rewards/accuracies": 0.0, "rewards/chosen": 0.34814682602882385, "rewards/margins": -1.1368690729141235, "rewards/rejected": 1.485015869140625, "step": 10502 }, { "epoch": 1.7, "learning_rate": 2.3146104816020313e-07, "logits/chosen": -0.790867030620575, "logits/rejected": -0.7827863097190857, "logps/chosen": -32.632568359375, "logps/rejected": -73.05204772949219, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": 2.216313123703003, "rewards/margins": 0.03822159767150879, "rewards/rejected": 2.178091526031494, "step": 10503 }, { "epoch": 1.7, "learning_rate": 2.31350195826588e-07, "logits/chosen": -0.8294233679771423, "logits/rejected": -0.865196943283081, "logps/chosen": -68.58074188232422, "logps/rejected": -143.55166625976562, "loss": 2.2806, "rewards/accuracies": 0.0, "rewards/chosen": 1.1228020191192627, "rewards/margins": -4.519845962524414, "rewards/rejected": 5.642648220062256, "step": 10504 }, { "epoch": 1.71, "learning_rate": 2.312393620541325e-07, "logits/chosen": -0.789069414138794, "logits/rejected": -0.7412468194961548, "logps/chosen": -87.52973937988281, "logps/rejected": -139.8944091796875, "loss": 3.0307, "rewards/accuracies": 0.0, "rewards/chosen": 2.2709100246429443, "rewards/margins": -2.2105109691619873, "rewards/rejected": 4.481420993804932, "step": 10505 }, { "epoch": 1.71, "learning_rate": 2.3112854685049392e-07, "logits/chosen": -0.7332239747047424, "logits/rejected": -0.6459075808525085, "logps/chosen": -65.21853637695312, "logps/rejected": -51.705013275146484, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 3.308819532394409, "rewards/margins": 0.6318645477294922, "rewards/rejected": 2.676954984664917, "step": 10506 }, { "epoch": 1.71, "learning_rate": 2.3101775022332882e-07, "logits/chosen": -0.7268016338348389, "logits/rejected": -0.7414383888244629, "logps/chosen": -180.31744384765625, "logps/rejected": -65.954833984375, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 3.622692823410034, "rewards/margins": 1.8035590648651123, "rewards/rejected": 1.8191337585449219, "step": 10507 }, { "epoch": 1.71, "learning_rate": 2.3090697218029186e-07, "logits/chosen": -0.7083839774131775, "logits/rejected": -0.6632592678070068, "logps/chosen": -70.20062255859375, "logps/rejected": -28.101531982421875, "loss": 0.4174, "rewards/accuracies": 0.0, "rewards/chosen": 1.3455742597579956, "rewards/margins": -0.14546775817871094, "rewards/rejected": 1.4910420179367065, "step": 10508 }, { "epoch": 1.71, "learning_rate": 2.3079621272903715e-07, "logits/chosen": -0.7807332873344421, "logits/rejected": -0.7807332873344421, "logps/chosen": -31.31598663330078, "logps/rejected": -31.31598663330078, "loss": 0.7727, "rewards/accuracies": 0.0, "rewards/chosen": 1.6936492919921875, "rewards/margins": 0.0, "rewards/rejected": 1.6936492919921875, "step": 10509 }, { "epoch": 1.71, "learning_rate": 2.3068547187721672e-07, "logits/chosen": -0.9763838648796082, "logits/rejected": -0.995836079120636, "logps/chosen": -100.28875732421875, "logps/rejected": -100.05937957763672, "loss": 1.5458, "rewards/accuracies": 0.0, "rewards/chosen": 1.059162974357605, "rewards/margins": -1.743857502937317, "rewards/rejected": 2.803020477294922, "step": 10510 }, { "epoch": 1.71, "learning_rate": 2.3057474963248202e-07, "logits/chosen": -0.7760961055755615, "logits/rejected": -0.7760961055755615, "logps/chosen": -95.26858520507812, "logps/rejected": -95.26858520507812, "loss": 0.4348, "rewards/accuracies": 0.0, "rewards/chosen": 2.1920456886291504, "rewards/margins": 0.0, "rewards/rejected": 2.1920456886291504, "step": 10511 }, { "epoch": 1.71, "learning_rate": 2.304640460024827e-07, "logits/chosen": -0.6201667785644531, "logits/rejected": -0.7270089983940125, "logps/chosen": -134.61956787109375, "logps/rejected": -171.3487091064453, "loss": 3.0824, "rewards/accuracies": 0.0, "rewards/chosen": 1.4397553205490112, "rewards/margins": -5.90609884262085, "rewards/rejected": 7.34585428237915, "step": 10512 }, { "epoch": 1.71, "learning_rate": 2.3035336099486757e-07, "logits/chosen": -0.4719974994659424, "logits/rejected": -0.4719974994659424, "logps/chosen": -54.1631965637207, "logps/rejected": -54.1631965637207, "loss": 2.3066, "rewards/accuracies": 0.0, "rewards/chosen": 2.301476001739502, "rewards/margins": 0.0, "rewards/rejected": 2.301476001739502, "step": 10513 }, { "epoch": 1.71, "learning_rate": 2.3024269461728358e-07, "logits/chosen": -0.725781261920929, "logits/rejected": -0.7133350372314453, "logps/chosen": -63.00531768798828, "logps/rejected": -76.24819946289062, "loss": 1.1959, "rewards/accuracies": 0.0, "rewards/chosen": 1.798072099685669, "rewards/margins": -0.27557897567749023, "rewards/rejected": 2.073651075363159, "step": 10514 }, { "epoch": 1.71, "learning_rate": 2.3013204687737713e-07, "logits/chosen": -0.5186397433280945, "logits/rejected": -0.46501386165618896, "logps/chosen": -35.74485778808594, "logps/rejected": -29.14898681640625, "loss": 0.4305, "rewards/accuracies": 1.0, "rewards/chosen": 2.271449327468872, "rewards/margins": 1.1745223999023438, "rewards/rejected": 1.0969269275665283, "step": 10515 }, { "epoch": 1.71, "learning_rate": 2.3002141778279254e-07, "logits/chosen": -0.7680222392082214, "logits/rejected": -0.6227183938026428, "logps/chosen": -71.92424011230469, "logps/rejected": -63.55378723144531, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 2.800022840499878, "rewards/margins": 3.441606044769287, "rewards/rejected": -0.641583263874054, "step": 10516 }, { "epoch": 1.71, "learning_rate": 2.2991080734117364e-07, "logits/chosen": -0.8759034872055054, "logits/rejected": -0.8243134617805481, "logps/chosen": -84.96281433105469, "logps/rejected": -130.3882293701172, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": 6.207562446594238, "rewards/margins": 1.164799690246582, "rewards/rejected": 5.042762756347656, "step": 10517 }, { "epoch": 1.71, "learning_rate": 2.298002155601621e-07, "logits/chosen": -0.7873924970626831, "logits/rejected": -0.8656922578811646, "logps/chosen": -72.50344848632812, "logps/rejected": -171.88168334960938, "loss": 2.1208, "rewards/accuracies": 0.0, "rewards/chosen": 2.274878740310669, "rewards/margins": -4.075689315795898, "rewards/rejected": 6.350567817687988, "step": 10518 }, { "epoch": 1.71, "learning_rate": 2.296896424473992e-07, "logits/chosen": -0.964130699634552, "logits/rejected": -1.0416501760482788, "logps/chosen": -132.28070068359375, "logps/rejected": -113.37437438964844, "loss": 1.1382, "rewards/accuracies": 0.0, "rewards/chosen": 3.4315521717071533, "rewards/margins": -2.126208543777466, "rewards/rejected": 5.557760715484619, "step": 10519 }, { "epoch": 1.71, "learning_rate": 2.2957908801052417e-07, "logits/chosen": -0.8430516719818115, "logits/rejected": -0.784554123878479, "logps/chosen": -46.942203521728516, "logps/rejected": -47.995635986328125, "loss": 0.6111, "rewards/accuracies": 1.0, "rewards/chosen": 2.053736448287964, "rewards/margins": 0.18450427055358887, "rewards/rejected": 1.869232177734375, "step": 10520 }, { "epoch": 1.71, "learning_rate": 2.2946855225717555e-07, "logits/chosen": -0.626115620136261, "logits/rejected": -0.6218101978302002, "logps/chosen": -6.939461708068848, "logps/rejected": -10.239339828491211, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 0.16428767144680023, "rewards/margins": 0.05117636173963547, "rewards/rejected": 0.11311130970716476, "step": 10521 }, { "epoch": 1.71, "learning_rate": 2.2935803519498998e-07, "logits/chosen": -0.5224507451057434, "logits/rejected": -0.5225634574890137, "logps/chosen": -2.982074737548828, "logps/rejected": -1.6807928085327148, "loss": 0.4086, "rewards/accuracies": 0.0, "rewards/chosen": 0.2943440079689026, "rewards/margins": -0.16471457481384277, "rewards/rejected": 0.45905858278274536, "step": 10522 }, { "epoch": 1.71, "learning_rate": 2.2924753683160358e-07, "logits/chosen": -0.5528344511985779, "logits/rejected": -0.582513689994812, "logps/chosen": -56.1297607421875, "logps/rejected": -92.5845947265625, "loss": 1.9066, "rewards/accuracies": 0.0, "rewards/chosen": 2.046689748764038, "rewards/margins": -0.03470754623413086, "rewards/rejected": 2.081397294998169, "step": 10523 }, { "epoch": 1.71, "learning_rate": 2.2913705717465027e-07, "logits/chosen": -0.9967252016067505, "logits/rejected": -1.004138469696045, "logps/chosen": -65.99830627441406, "logps/rejected": -77.05622863769531, "loss": 0.3892, "rewards/accuracies": 1.0, "rewards/chosen": 2.0674850940704346, "rewards/margins": 0.24065864086151123, "rewards/rejected": 1.8268264532089233, "step": 10524 }, { "epoch": 1.71, "learning_rate": 2.2902659623176361e-07, "logits/chosen": -0.4378167986869812, "logits/rejected": -0.41098615527153015, "logps/chosen": -45.89174270629883, "logps/rejected": -37.128562927246094, "loss": 0.5743, "rewards/accuracies": 0.0, "rewards/chosen": 1.058802843093872, "rewards/margins": -0.07193529605865479, "rewards/rejected": 1.1307381391525269, "step": 10525 }, { "epoch": 1.71, "learning_rate": 2.2891615401057495e-07, "logits/chosen": -0.613128125667572, "logits/rejected": -0.5100592374801636, "logps/chosen": -65.1694107055664, "logps/rejected": -36.89051818847656, "loss": 0.5342, "rewards/accuracies": 0.0, "rewards/chosen": 1.6047683954238892, "rewards/margins": -0.5816513299942017, "rewards/rejected": 2.186419725418091, "step": 10526 }, { "epoch": 1.71, "learning_rate": 2.288057305187152e-07, "logits/chosen": -0.5585347414016724, "logits/rejected": -0.4969395399093628, "logps/chosen": -58.6314582824707, "logps/rejected": -77.34375, "loss": 0.4779, "rewards/accuracies": 1.0, "rewards/chosen": 1.5934590101242065, "rewards/margins": 0.33211255073547363, "rewards/rejected": 1.261346459388733, "step": 10527 }, { "epoch": 1.71, "learning_rate": 2.286953257638133e-07, "logits/chosen": -0.9168662428855896, "logits/rejected": -0.8593922853469849, "logps/chosen": -72.67054748535156, "logps/rejected": -123.43023681640625, "loss": 0.9669, "rewards/accuracies": 0.0, "rewards/chosen": 2.1551620960235596, "rewards/margins": -1.7384400367736816, "rewards/rejected": 3.893602132797241, "step": 10528 }, { "epoch": 1.71, "learning_rate": 2.285849397534974e-07, "logits/chosen": -1.2911057472229004, "logits/rejected": -1.2584047317504883, "logps/chosen": -111.5689926147461, "logps/rejected": -40.337913513183594, "loss": 0.4068, "rewards/accuracies": 0.0, "rewards/chosen": -0.04636077955365181, "rewards/margins": -0.16060905158519745, "rewards/rejected": 0.11424827575683594, "step": 10529 }, { "epoch": 1.71, "learning_rate": 2.2847457249539386e-07, "logits/chosen": -0.6835564970970154, "logits/rejected": -0.773168683052063, "logps/chosen": -75.05152893066406, "logps/rejected": -115.68682861328125, "loss": 2.381, "rewards/accuracies": 0.0, "rewards/chosen": 1.9160979986190796, "rewards/margins": -1.9558929204940796, "rewards/rejected": 3.871990919113159, "step": 10530 }, { "epoch": 1.71, "learning_rate": 2.2836422399712836e-07, "logits/chosen": -0.4225902855396271, "logits/rejected": -0.4173729419708252, "logps/chosen": -97.61288452148438, "logps/rejected": -90.24803161621094, "loss": 0.7053, "rewards/accuracies": 0.0, "rewards/chosen": 1.2146164178848267, "rewards/margins": -0.30043482780456543, "rewards/rejected": 1.515051245689392, "step": 10531 }, { "epoch": 1.71, "learning_rate": 2.2825389426632446e-07, "logits/chosen": -0.7013663649559021, "logits/rejected": -0.6767133474349976, "logps/chosen": -47.070919036865234, "logps/rejected": -85.34611511230469, "loss": 1.7804, "rewards/accuracies": 1.0, "rewards/chosen": 2.8234119415283203, "rewards/margins": 0.8435389995574951, "rewards/rejected": 1.9798729419708252, "step": 10532 }, { "epoch": 1.71, "learning_rate": 2.281435833106053e-07, "logits/chosen": -0.9189339876174927, "logits/rejected": -1.051862359046936, "logps/chosen": -147.2962646484375, "logps/rejected": -135.12521362304688, "loss": 2.1146, "rewards/accuracies": 0.0, "rewards/chosen": 4.490447998046875, "rewards/margins": -3.7504730224609375, "rewards/rejected": 8.240921020507812, "step": 10533 }, { "epoch": 1.71, "learning_rate": 2.2803329113759252e-07, "logits/chosen": -0.6736294627189636, "logits/rejected": -0.6118040680885315, "logps/chosen": -80.82711791992188, "logps/rejected": -59.166194915771484, "loss": 0.2744, "rewards/accuracies": 1.0, "rewards/chosen": 1.7706947326660156, "rewards/margins": 1.1200908422470093, "rewards/rejected": 0.6506038904190063, "step": 10534 }, { "epoch": 1.71, "learning_rate": 2.2792301775490557e-07, "logits/chosen": -0.6593913435935974, "logits/rejected": -0.5313265919685364, "logps/chosen": -149.05638122558594, "logps/rejected": -88.47366333007812, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 4.431272983551025, "rewards/margins": 2.876445770263672, "rewards/rejected": 1.554827094078064, "step": 10535 }, { "epoch": 1.71, "learning_rate": 2.2781276317016385e-07, "logits/chosen": -0.5469911694526672, "logits/rejected": -0.591185450553894, "logps/chosen": -122.58433532714844, "logps/rejected": -62.259159088134766, "loss": 0.5815, "rewards/accuracies": 0.0, "rewards/chosen": 0.7062363028526306, "rewards/margins": -0.6445552706718445, "rewards/rejected": 1.350791573524475, "step": 10536 }, { "epoch": 1.71, "learning_rate": 2.277025273909846e-07, "logits/chosen": -0.6012707352638245, "logits/rejected": -0.6746386289596558, "logps/chosen": -65.46823120117188, "logps/rejected": -108.94241333007812, "loss": 2.8722, "rewards/accuracies": 0.0, "rewards/chosen": 0.9602821469306946, "rewards/margins": -5.017014026641846, "rewards/rejected": 5.977296352386475, "step": 10537 }, { "epoch": 1.71, "learning_rate": 2.2759231042498433e-07, "logits/chosen": -0.8243581652641296, "logits/rejected": -0.631848931312561, "logps/chosen": -69.53672790527344, "logps/rejected": -51.56223678588867, "loss": 1.7522, "rewards/accuracies": 1.0, "rewards/chosen": 4.9978156089782715, "rewards/margins": 2.9253199100494385, "rewards/rejected": 2.072495698928833, "step": 10538 }, { "epoch": 1.71, "learning_rate": 2.2748211227977772e-07, "logits/chosen": -0.726728618144989, "logits/rejected": -0.6761081218719482, "logps/chosen": -101.56938934326172, "logps/rejected": -112.44686889648438, "loss": 0.5205, "rewards/accuracies": 1.0, "rewards/chosen": 6.0571818351745605, "rewards/margins": 3.5744211673736572, "rewards/rejected": 2.4827606678009033, "step": 10539 }, { "epoch": 1.71, "learning_rate": 2.2737193296297873e-07, "logits/chosen": -0.31048837304115295, "logits/rejected": -0.37879887223243713, "logps/chosen": -72.5810775756836, "logps/rejected": -65.82124328613281, "loss": 0.4376, "rewards/accuracies": 0.0, "rewards/chosen": 1.3698265552520752, "rewards/margins": -0.30663836002349854, "rewards/rejected": 1.6764649152755737, "step": 10540 }, { "epoch": 1.71, "learning_rate": 2.272617724821994e-07, "logits/chosen": -0.602636456489563, "logits/rejected": -0.6179074048995972, "logps/chosen": -70.04586791992188, "logps/rejected": -175.53659057617188, "loss": 0.6293, "rewards/accuracies": 1.0, "rewards/chosen": 1.2037369012832642, "rewards/margins": 0.2740478515625, "rewards/rejected": 0.9296890497207642, "step": 10541 }, { "epoch": 1.71, "learning_rate": 2.2715163084505108e-07, "logits/chosen": -1.0347806215286255, "logits/rejected": -0.9947217702865601, "logps/chosen": -52.644378662109375, "logps/rejected": -73.47904968261719, "loss": 0.3071, "rewards/accuracies": 1.0, "rewards/chosen": 2.463165283203125, "rewards/margins": 0.36980581283569336, "rewards/rejected": 2.0933594703674316, "step": 10542 }, { "epoch": 1.71, "learning_rate": 2.2704150805914314e-07, "logits/chosen": -0.7122069597244263, "logits/rejected": -0.657387375831604, "logps/chosen": -84.79519653320312, "logps/rejected": -50.48054504394531, "loss": 0.6445, "rewards/accuracies": 0.0, "rewards/chosen": 1.25640869140625, "rewards/margins": -0.47169196605682373, "rewards/rejected": 1.7281006574630737, "step": 10543 }, { "epoch": 1.71, "learning_rate": 2.2693140413208446e-07, "logits/chosen": -0.7877843379974365, "logits/rejected": -0.8489769697189331, "logps/chosen": -149.56643676757812, "logps/rejected": -144.95948791503906, "loss": 1.4886, "rewards/accuracies": 0.0, "rewards/chosen": 5.018437385559082, "rewards/margins": -2.794071674346924, "rewards/rejected": 7.812509059906006, "step": 10544 }, { "epoch": 1.71, "learning_rate": 2.2682131907148173e-07, "logits/chosen": -0.45291298627853394, "logits/rejected": -0.4568379521369934, "logps/chosen": -133.89508056640625, "logps/rejected": -77.39604187011719, "loss": 0.8848, "rewards/accuracies": 0.0, "rewards/chosen": 2.6632354259490967, "rewards/margins": -0.19974613189697266, "rewards/rejected": 2.8629815578460693, "step": 10545 }, { "epoch": 1.71, "learning_rate": 2.267112528849412e-07, "logits/chosen": -0.5257787108421326, "logits/rejected": -0.4858156144618988, "logps/chosen": -59.971946716308594, "logps/rejected": -45.81678771972656, "loss": 1.0474, "rewards/accuracies": 0.0, "rewards/chosen": 1.200548529624939, "rewards/margins": -0.37990880012512207, "rewards/rejected": 1.580457329750061, "step": 10546 }, { "epoch": 1.71, "learning_rate": 2.2660120558006707e-07, "logits/chosen": -0.522578239440918, "logits/rejected": -0.4334023892879486, "logps/chosen": -91.79812622070312, "logps/rejected": -76.51107025146484, "loss": 0.5999, "rewards/accuracies": 0.0, "rewards/chosen": 0.5013427734375, "rewards/margins": -0.8389061689376831, "rewards/rejected": 1.340248942375183, "step": 10547 }, { "epoch": 1.71, "learning_rate": 2.2649117716446287e-07, "logits/chosen": -1.2484865188598633, "logits/rejected": -1.0943166017532349, "logps/chosen": -167.55450439453125, "logps/rejected": -75.05641174316406, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 6.7343902587890625, "rewards/margins": 4.865118503570557, "rewards/rejected": 1.8692718744277954, "step": 10548 }, { "epoch": 1.71, "learning_rate": 2.2638116764573019e-07, "logits/chosen": -0.40068888664245605, "logits/rejected": -0.40068888664245605, "logps/chosen": -1.0911222696304321, "logps/rejected": -1.0911222696304321, "loss": 1.6719, "rewards/accuracies": 0.0, "rewards/chosen": 0.1605949103832245, "rewards/margins": 0.0, "rewards/rejected": 0.1605949103832245, "step": 10549 }, { "epoch": 1.71, "learning_rate": 2.2627117703146998e-07, "logits/chosen": -0.949411928653717, "logits/rejected": -0.4581899642944336, "logps/chosen": -277.580322265625, "logps/rejected": -62.568626403808594, "loss": 0.2571, "rewards/accuracies": 1.0, "rewards/chosen": 5.638769626617432, "rewards/margins": 2.577009677886963, "rewards/rejected": 3.0617599487304688, "step": 10550 }, { "epoch": 1.71, "learning_rate": 2.2616120532928124e-07, "logits/chosen": -0.5580525398254395, "logits/rejected": -0.5580525398254395, "logps/chosen": -40.23314666748047, "logps/rejected": -40.23314666748047, "loss": 0.3801, "rewards/accuracies": 0.0, "rewards/chosen": 1.1309112310409546, "rewards/margins": 0.0, "rewards/rejected": 1.1309112310409546, "step": 10551 }, { "epoch": 1.71, "learning_rate": 2.2605125254676205e-07, "logits/chosen": -0.6264223456382751, "logits/rejected": -0.5046300292015076, "logps/chosen": -50.61565399169922, "logps/rejected": -28.99833869934082, "loss": 0.3284, "rewards/accuracies": 1.0, "rewards/chosen": 1.9950119256973267, "rewards/margins": 1.8553142547607422, "rewards/rejected": 0.13969765603542328, "step": 10552 }, { "epoch": 1.71, "learning_rate": 2.2594131869150945e-07, "logits/chosen": -0.8745822906494141, "logits/rejected": -0.9072723388671875, "logps/chosen": -59.914161682128906, "logps/rejected": -66.40443420410156, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 2.5808265209198, "rewards/margins": 1.290274739265442, "rewards/rejected": 1.290551781654358, "step": 10553 }, { "epoch": 1.71, "learning_rate": 2.2583140377111837e-07, "logits/chosen": -0.4624888002872467, "logits/rejected": -0.4624888002872467, "logps/chosen": -19.88190460205078, "logps/rejected": -19.88190460205078, "loss": 0.3689, "rewards/accuracies": 0.0, "rewards/chosen": 0.3912239074707031, "rewards/margins": 0.0, "rewards/rejected": 0.3912239074707031, "step": 10554 }, { "epoch": 1.71, "learning_rate": 2.2572150779318322e-07, "logits/chosen": -0.7185034155845642, "logits/rejected": -0.6303238272666931, "logps/chosen": -61.691402435302734, "logps/rejected": -100.43873596191406, "loss": 1.4945, "rewards/accuracies": 1.0, "rewards/chosen": 3.1244053840637207, "rewards/margins": 0.0411076545715332, "rewards/rejected": 3.0832977294921875, "step": 10555 }, { "epoch": 1.71, "learning_rate": 2.256116307652965e-07, "logits/chosen": -1.0014711618423462, "logits/rejected": -0.9722974896430969, "logps/chosen": -45.59984588623047, "logps/rejected": -86.5966796875, "loss": 1.4677, "rewards/accuracies": 0.0, "rewards/chosen": 1.7476959228515625, "rewards/margins": -0.5578200817108154, "rewards/rejected": 2.305516004562378, "step": 10556 }, { "epoch": 1.71, "learning_rate": 2.2550177269504993e-07, "logits/chosen": -0.8065794110298157, "logits/rejected": -0.8065794110298157, "logps/chosen": -64.74501037597656, "logps/rejected": -64.74501037597656, "loss": 0.3806, "rewards/accuracies": 0.0, "rewards/chosen": 3.3875038623809814, "rewards/margins": 0.0, "rewards/rejected": 3.3875038623809814, "step": 10557 }, { "epoch": 1.71, "learning_rate": 2.253919335900334e-07, "logits/chosen": -0.9978958368301392, "logits/rejected": -0.8868575096130371, "logps/chosen": -122.10484313964844, "logps/rejected": -33.28520965576172, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 0.9876602292060852, "rewards/margins": 0.9162392020225525, "rewards/rejected": 0.0714210495352745, "step": 10558 }, { "epoch": 1.71, "learning_rate": 2.252821134578361e-07, "logits/chosen": -1.1020961999893188, "logits/rejected": -1.0890352725982666, "logps/chosen": -128.73968505859375, "logps/rejected": -40.92725372314453, "loss": 1.1735, "rewards/accuracies": 1.0, "rewards/chosen": 1.6593201160430908, "rewards/margins": 1.423186182975769, "rewards/rejected": 0.23613396286964417, "step": 10559 }, { "epoch": 1.71, "learning_rate": 2.2517231230604512e-07, "logits/chosen": -0.958968997001648, "logits/rejected": -0.8511739373207092, "logps/chosen": -109.35379028320312, "logps/rejected": -32.298500061035156, "loss": 1.1981, "rewards/accuracies": 1.0, "rewards/chosen": 2.3971526622772217, "rewards/margins": 2.1206836700439453, "rewards/rejected": 0.27646905183792114, "step": 10560 }, { "epoch": 1.71, "learning_rate": 2.2506253014224714e-07, "logits/chosen": -0.7145266532897949, "logits/rejected": -0.680823564529419, "logps/chosen": -49.7429313659668, "logps/rejected": -51.374542236328125, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 2.9871394634246826, "rewards/margins": 0.03892326354980469, "rewards/rejected": 2.948216199874878, "step": 10561 }, { "epoch": 1.71, "learning_rate": 2.249527669740266e-07, "logits/chosen": -0.7198546528816223, "logits/rejected": -0.7138835787773132, "logps/chosen": -119.1952133178711, "logps/rejected": -176.424072265625, "loss": 0.5405, "rewards/accuracies": 0.0, "rewards/chosen": 5.001184940338135, "rewards/margins": -0.658571720123291, "rewards/rejected": 5.659756660461426, "step": 10562 }, { "epoch": 1.71, "learning_rate": 2.248430228089676e-07, "logits/chosen": -0.5145577788352966, "logits/rejected": -0.5567982792854309, "logps/chosen": -34.38868713378906, "logps/rejected": -112.40200805664062, "loss": 1.5541, "rewards/accuracies": 0.0, "rewards/chosen": 1.72264564037323, "rewards/margins": -1.4893370866775513, "rewards/rejected": 3.2119827270507812, "step": 10563 }, { "epoch": 1.71, "learning_rate": 2.2473329765465194e-07, "logits/chosen": -0.9289027452468872, "logits/rejected": -0.9699369668960571, "logps/chosen": -228.7118377685547, "logps/rejected": -159.40357971191406, "loss": 1.1062, "rewards/accuracies": 0.0, "rewards/chosen": 3.9505813121795654, "rewards/margins": -2.0923049449920654, "rewards/rejected": 6.042886257171631, "step": 10564 }, { "epoch": 1.71, "learning_rate": 2.24623591518661e-07, "logits/chosen": -0.5496849417686462, "logits/rejected": -0.5808751583099365, "logps/chosen": -169.07635498046875, "logps/rejected": -99.2273941040039, "loss": 1.2817, "rewards/accuracies": 0.0, "rewards/chosen": 3.113961935043335, "rewards/margins": -2.4655587673187256, "rewards/rejected": 5.5795207023620605, "step": 10565 }, { "epoch": 1.71, "learning_rate": 2.2451390440857404e-07, "logits/chosen": -0.3633911609649658, "logits/rejected": -0.3633911609649658, "logps/chosen": -35.06167984008789, "logps/rejected": -35.06167984008789, "loss": 1.4669, "rewards/accuracies": 0.0, "rewards/chosen": 0.40731239318847656, "rewards/margins": 0.0, "rewards/rejected": 0.40731239318847656, "step": 10566 }, { "epoch": 1.72, "learning_rate": 2.2440423633196985e-07, "logits/chosen": -1.0186794996261597, "logits/rejected": -1.0272431373596191, "logps/chosen": -64.70218658447266, "logps/rejected": -66.50782012939453, "loss": 1.2462, "rewards/accuracies": 0.0, "rewards/chosen": 1.568396806716919, "rewards/margins": -0.12643730640411377, "rewards/rejected": 1.6948341131210327, "step": 10567 }, { "epoch": 1.72, "learning_rate": 2.2429458729642502e-07, "logits/chosen": -0.7667421102523804, "logits/rejected": -0.8286134600639343, "logps/chosen": -75.07234954833984, "logps/rejected": -68.11552429199219, "loss": 0.7801, "rewards/accuracies": 0.0, "rewards/chosen": 1.0822120904922485, "rewards/margins": -0.3128105401992798, "rewards/rejected": 1.3950226306915283, "step": 10568 }, { "epoch": 1.72, "learning_rate": 2.2418495730951565e-07, "logits/chosen": -0.3849145174026489, "logits/rejected": -0.3827085793018341, "logps/chosen": -13.90773868560791, "logps/rejected": -4.5012431144714355, "loss": 0.4696, "rewards/accuracies": 0.0, "rewards/chosen": -0.1691625565290451, "rewards/margins": -0.3381667137145996, "rewards/rejected": 0.1690041571855545, "step": 10569 }, { "epoch": 1.72, "learning_rate": 2.240753463788157e-07, "logits/chosen": -0.58404940366745, "logits/rejected": -0.49784934520721436, "logps/chosen": -34.89767074584961, "logps/rejected": -48.629600524902344, "loss": 1.5849, "rewards/accuracies": 0.0, "rewards/chosen": 0.992117702960968, "rewards/margins": -1.1611485481262207, "rewards/rejected": 2.153266191482544, "step": 10570 }, { "epoch": 1.72, "learning_rate": 2.2396575451189874e-07, "logits/chosen": -0.302845299243927, "logits/rejected": -0.302845299243927, "logps/chosen": -57.07083511352539, "logps/rejected": -57.07083511352539, "loss": 0.9035, "rewards/accuracies": 0.0, "rewards/chosen": 0.2045009583234787, "rewards/margins": 0.0, "rewards/rejected": 0.2045009583234787, "step": 10571 }, { "epoch": 1.72, "learning_rate": 2.238561817163361e-07, "logits/chosen": -0.9476828575134277, "logits/rejected": -0.9051182866096497, "logps/chosen": -43.22227478027344, "logps/rejected": -64.51701354980469, "loss": 0.5833, "rewards/accuracies": 0.0, "rewards/chosen": 1.05766761302948, "rewards/margins": -0.48080289363861084, "rewards/rejected": 1.5384705066680908, "step": 10572 }, { "epoch": 1.72, "learning_rate": 2.2374662799969856e-07, "logits/chosen": -0.9788544774055481, "logits/rejected": -0.883581280708313, "logps/chosen": -146.84046936035156, "logps/rejected": -124.46437072753906, "loss": 0.1503, "rewards/accuracies": 1.0, "rewards/chosen": 5.802250862121582, "rewards/margins": 1.3769593238830566, "rewards/rejected": 4.425291538238525, "step": 10573 }, { "epoch": 1.72, "learning_rate": 2.236370933695549e-07, "logits/chosen": -0.7905710935592651, "logits/rejected": -0.7597282528877258, "logps/chosen": -96.55430603027344, "logps/rejected": -114.31199645996094, "loss": 1.9689, "rewards/accuracies": 0.0, "rewards/chosen": 3.784449815750122, "rewards/margins": -3.910003900527954, "rewards/rejected": 7.694453716278076, "step": 10574 }, { "epoch": 1.72, "learning_rate": 2.2352757783347332e-07, "logits/chosen": -0.7373634576797485, "logits/rejected": -0.6047751307487488, "logps/chosen": -83.4337158203125, "logps/rejected": -48.61225509643555, "loss": 0.3282, "rewards/accuracies": 1.0, "rewards/chosen": 2.8065927028656006, "rewards/margins": 1.8285489082336426, "rewards/rejected": 0.9780437350273132, "step": 10575 }, { "epoch": 1.72, "learning_rate": 2.234180813990199e-07, "logits/chosen": -0.8259850740432739, "logits/rejected": -0.858432948589325, "logps/chosen": -69.35147857666016, "logps/rejected": -66.24906158447266, "loss": 0.8206, "rewards/accuracies": 0.0, "rewards/chosen": 0.6467262506484985, "rewards/margins": -0.6158668994903564, "rewards/rejected": 1.262593150138855, "step": 10576 }, { "epoch": 1.72, "learning_rate": 2.2330860407376024e-07, "logits/chosen": -0.7878768444061279, "logits/rejected": -0.7766463160514832, "logps/chosen": -84.20008850097656, "logps/rejected": -109.45339965820312, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 1.0868133306503296, "rewards/margins": 0.3551291823387146, "rewards/rejected": 0.731684148311615, "step": 10577 }, { "epoch": 1.72, "learning_rate": 2.2319914586525774e-07, "logits/chosen": -0.9265938401222229, "logits/rejected": -0.8800988793373108, "logps/chosen": -108.64031982421875, "logps/rejected": -134.359130859375, "loss": 2.3881, "rewards/accuracies": 0.0, "rewards/chosen": 0.2605171203613281, "rewards/margins": -4.563838958740234, "rewards/rejected": 4.8243560791015625, "step": 10578 }, { "epoch": 1.72, "learning_rate": 2.2308970678107535e-07, "logits/chosen": -0.20182454586029053, "logits/rejected": -0.26012516021728516, "logps/chosen": -3.1168384552001953, "logps/rejected": -64.8035659790039, "loss": 0.9396, "rewards/accuracies": 0.0, "rewards/chosen": 0.3733553886413574, "rewards/margins": -0.4808308482170105, "rewards/rejected": 0.8541862368583679, "step": 10579 }, { "epoch": 1.72, "learning_rate": 2.229802868287739e-07, "logits/chosen": -0.2268567979335785, "logits/rejected": -0.2577965259552002, "logps/chosen": -60.526885986328125, "logps/rejected": -80.37893676757812, "loss": 0.6654, "rewards/accuracies": 0.0, "rewards/chosen": 2.6376075744628906, "rewards/margins": -0.4392220973968506, "rewards/rejected": 3.076829671859741, "step": 10580 }, { "epoch": 1.72, "learning_rate": 2.2287088601591357e-07, "logits/chosen": -0.46710798144340515, "logits/rejected": -0.35922855138778687, "logps/chosen": -52.547523498535156, "logps/rejected": -53.94834899902344, "loss": 0.6771, "rewards/accuracies": 0.0, "rewards/chosen": 1.3121566772460938, "rewards/margins": -0.569050669670105, "rewards/rejected": 1.8812073469161987, "step": 10581 }, { "epoch": 1.72, "learning_rate": 2.227615043500527e-07, "logits/chosen": -1.0780750513076782, "logits/rejected": -0.9315521121025085, "logps/chosen": -160.18597412109375, "logps/rejected": -57.974525451660156, "loss": 0.3393, "rewards/accuracies": 1.0, "rewards/chosen": 5.503004550933838, "rewards/margins": 5.3196306228637695, "rewards/rejected": 0.18337403237819672, "step": 10582 }, { "epoch": 1.72, "learning_rate": 2.2265214183874876e-07, "logits/chosen": -0.7669126987457275, "logits/rejected": -0.729629397392273, "logps/chosen": -26.469676971435547, "logps/rejected": -117.69841003417969, "loss": 1.2606, "rewards/accuracies": 0.0, "rewards/chosen": 2.79852294921875, "rewards/margins": -0.3600327968597412, "rewards/rejected": 3.158555746078491, "step": 10583 }, { "epoch": 1.72, "learning_rate": 2.2254279848955732e-07, "logits/chosen": -0.7327899932861328, "logits/rejected": -0.7327899932861328, "logps/chosen": -55.6326904296875, "logps/rejected": -55.6326904296875, "loss": 0.6045, "rewards/accuracies": 0.0, "rewards/chosen": 1.7017990350723267, "rewards/margins": 0.0, "rewards/rejected": 1.7017990350723267, "step": 10584 }, { "epoch": 1.72, "learning_rate": 2.224334743100334e-07, "logits/chosen": -0.6503114700317383, "logits/rejected": -0.6503114700317383, "logps/chosen": -71.77781677246094, "logps/rejected": -71.77781677246094, "loss": 1.4962, "rewards/accuracies": 0.0, "rewards/chosen": 1.6496849060058594, "rewards/margins": 0.0, "rewards/rejected": 1.6496849060058594, "step": 10585 }, { "epoch": 1.72, "learning_rate": 2.2232416930772985e-07, "logits/chosen": -0.4992668628692627, "logits/rejected": -0.5388221144676208, "logps/chosen": -62.8101806640625, "logps/rejected": -75.15718078613281, "loss": 0.5353, "rewards/accuracies": 1.0, "rewards/chosen": 1.7370041608810425, "rewards/margins": 0.7795884013175964, "rewards/rejected": 0.957415759563446, "step": 10586 }, { "epoch": 1.72, "learning_rate": 2.2221488349019902e-07, "logits/chosen": -0.8629854321479797, "logits/rejected": -0.837485134601593, "logps/chosen": -74.8643569946289, "logps/rejected": -56.88838195800781, "loss": 0.5087, "rewards/accuracies": 0.0, "rewards/chosen": 1.9134758710861206, "rewards/margins": -0.4218863248825073, "rewards/rejected": 2.335362195968628, "step": 10587 }, { "epoch": 1.72, "learning_rate": 2.2210561686499108e-07, "logits/chosen": -0.896745502948761, "logits/rejected": -0.9638130068778992, "logps/chosen": -74.41436767578125, "logps/rejected": -85.31346130371094, "loss": 1.1822, "rewards/accuracies": 0.0, "rewards/chosen": 2.7723541259765625, "rewards/margins": -2.1763381958007812, "rewards/rejected": 4.948692321777344, "step": 10588 }, { "epoch": 1.72, "learning_rate": 2.2199636943965561e-07, "logits/chosen": -1.0166869163513184, "logits/rejected": -1.0382583141326904, "logps/chosen": -101.23881530761719, "logps/rejected": -126.63835144042969, "loss": 2.9049, "rewards/accuracies": 0.0, "rewards/chosen": 0.4611053466796875, "rewards/margins": -5.747262477874756, "rewards/rejected": 6.208367824554443, "step": 10589 }, { "epoch": 1.72, "learning_rate": 2.2188714122174061e-07, "logits/chosen": -0.8310025334358215, "logits/rejected": -0.8310025334358215, "logps/chosen": -5.233895301818848, "logps/rejected": -5.233895301818848, "loss": 1.3134, "rewards/accuracies": 0.0, "rewards/chosen": 0.5357141494750977, "rewards/margins": 0.0, "rewards/rejected": 0.5357141494750977, "step": 10590 }, { "epoch": 1.72, "learning_rate": 2.2177793221879253e-07, "logits/chosen": -0.3784055709838867, "logits/rejected": -0.4511620104312897, "logps/chosen": -181.069580078125, "logps/rejected": -91.10226440429688, "loss": 1.8716, "rewards/accuracies": 1.0, "rewards/chosen": 4.205529689788818, "rewards/margins": 1.6256475448608398, "rewards/rejected": 2.5798821449279785, "step": 10591 }, { "epoch": 1.72, "learning_rate": 2.2166874243835693e-07, "logits/chosen": -0.7285657525062561, "logits/rejected": -0.6206356287002563, "logps/chosen": -71.54263305664062, "logps/rejected": -89.65476989746094, "loss": 1.6236, "rewards/accuracies": 0.0, "rewards/chosen": 1.3540161848068237, "rewards/margins": -1.1292906999588013, "rewards/rejected": 2.483306884765625, "step": 10592 }, { "epoch": 1.72, "learning_rate": 2.2155957188797742e-07, "logits/chosen": -0.6836166977882385, "logits/rejected": -0.6993446350097656, "logps/chosen": -54.75541687011719, "logps/rejected": -45.85132598876953, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.9603469967842102, "rewards/margins": -1.0200347900390625, "rewards/rejected": 1.9803818464279175, "step": 10593 }, { "epoch": 1.72, "learning_rate": 2.2145042057519707e-07, "logits/chosen": -0.9939413666725159, "logits/rejected": -0.8645162582397461, "logps/chosen": -137.281982421875, "logps/rejected": -65.9640884399414, "loss": 0.2895, "rewards/accuracies": 1.0, "rewards/chosen": 4.9686126708984375, "rewards/margins": 2.716876983642578, "rewards/rejected": 2.2517356872558594, "step": 10594 }, { "epoch": 1.72, "learning_rate": 2.2134128850755684e-07, "logits/chosen": -0.6120555400848389, "logits/rejected": -0.5746932029724121, "logps/chosen": -52.10078430175781, "logps/rejected": -10.764192581176758, "loss": 0.9738, "rewards/accuracies": 0.0, "rewards/chosen": 0.11155777424573898, "rewards/margins": -0.07645044475793839, "rewards/rejected": 0.18800821900367737, "step": 10595 }, { "epoch": 1.72, "learning_rate": 2.2123217569259706e-07, "logits/chosen": -0.7192742228507996, "logits/rejected": -0.6652970314025879, "logps/chosen": -211.69677734375, "logps/rejected": -76.99005126953125, "loss": 1.5319, "rewards/accuracies": 1.0, "rewards/chosen": 3.7181777954101562, "rewards/margins": 0.4305381774902344, "rewards/rejected": 3.287639617919922, "step": 10596 }, { "epoch": 1.72, "learning_rate": 2.2112308213785596e-07, "logits/chosen": -0.55622398853302, "logits/rejected": -0.45744588971138, "logps/chosen": -38.77073287963867, "logps/rejected": -6.871994495391846, "loss": 0.7238, "rewards/accuracies": 1.0, "rewards/chosen": 2.1850993633270264, "rewards/margins": 1.3754003047943115, "rewards/rejected": 0.8096990585327148, "step": 10597 }, { "epoch": 1.72, "learning_rate": 2.2101400785087138e-07, "logits/chosen": -0.6666476130485535, "logits/rejected": -0.6944435834884644, "logps/chosen": -87.29287719726562, "logps/rejected": -102.50399017333984, "loss": 2.3153, "rewards/accuracies": 1.0, "rewards/chosen": 1.6797103881835938, "rewards/margins": 0.5028244256973267, "rewards/rejected": 1.176885962486267, "step": 10598 }, { "epoch": 1.72, "learning_rate": 2.2090495283917888e-07, "logits/chosen": -0.7808352708816528, "logits/rejected": -0.7013189196586609, "logps/chosen": -38.69601821899414, "logps/rejected": -28.6448974609375, "loss": 0.2024, "rewards/accuracies": 1.0, "rewards/chosen": 1.5150543451309204, "rewards/margins": 1.0070075988769531, "rewards/rejected": 0.5080467462539673, "step": 10599 }, { "epoch": 1.72, "learning_rate": 2.2079591711031347e-07, "logits/chosen": -0.44605228304862976, "logits/rejected": -0.22344674170017242, "logps/chosen": -90.83251953125, "logps/rejected": -12.55565357208252, "loss": 0.2852, "rewards/accuracies": 1.0, "rewards/chosen": 2.1192803382873535, "rewards/margins": 1.5092573165893555, "rewards/rejected": 0.610023021697998, "step": 10600 }, { "epoch": 1.72, "learning_rate": 2.2068690067180817e-07, "logits/chosen": -0.5016118288040161, "logits/rejected": -0.5108677744865417, "logps/chosen": -48.369319915771484, "logps/rejected": -64.43959045410156, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 0.46187591552734375, "rewards/margins": 0.4305976927280426, "rewards/rejected": 0.031278230249881744, "step": 10601 }, { "epoch": 1.72, "learning_rate": 2.2057790353119532e-07, "logits/chosen": -0.44718125462532043, "logits/rejected": -0.4529190957546234, "logps/chosen": -2.7786872386932373, "logps/rejected": -1.318324327468872, "loss": 1.0287, "rewards/accuracies": 0.0, "rewards/chosen": 0.1716274470090866, "rewards/margins": -0.04357202351093292, "rewards/rejected": 0.21519947052001953, "step": 10602 }, { "epoch": 1.72, "learning_rate": 2.2046892569600528e-07, "logits/chosen": -1.143925666809082, "logits/rejected": -1.0109295845031738, "logps/chosen": -114.17037200927734, "logps/rejected": -65.77754974365234, "loss": 1.3888, "rewards/accuracies": 1.0, "rewards/chosen": 5.951450347900391, "rewards/margins": 4.323836326599121, "rewards/rejected": 1.62761390209198, "step": 10603 }, { "epoch": 1.72, "learning_rate": 2.2035996717376765e-07, "logits/chosen": -0.4550430178642273, "logits/rejected": -0.1731249988079071, "logps/chosen": -60.634925842285156, "logps/rejected": -47.29325866699219, "loss": 0.1422, "rewards/accuracies": 1.0, "rewards/chosen": 2.055194854736328, "rewards/margins": 1.3065738677978516, "rewards/rejected": 0.7486209869384766, "step": 10604 }, { "epoch": 1.72, "learning_rate": 2.2025102797201017e-07, "logits/chosen": -0.7117379307746887, "logits/rejected": -0.688208281993866, "logps/chosen": -86.20836639404297, "logps/rejected": -106.23648071289062, "loss": 1.2577, "rewards/accuracies": 0.0, "rewards/chosen": 2.677136182785034, "rewards/margins": -2.10337233543396, "rewards/rejected": 4.780508518218994, "step": 10605 }, { "epoch": 1.72, "learning_rate": 2.2014210809825977e-07, "logits/chosen": -0.11740751564502716, "logits/rejected": -0.11245691776275635, "logps/chosen": -5.539918899536133, "logps/rejected": -2.871070623397827, "loss": 0.6442, "rewards/accuracies": 0.0, "rewards/chosen": 0.14039312303066254, "rewards/margins": -0.09295269846916199, "rewards/rejected": 0.23334582149982452, "step": 10606 }, { "epoch": 1.72, "learning_rate": 2.2003320756004151e-07, "logits/chosen": -0.8601163029670715, "logits/rejected": -0.8212130069732666, "logps/chosen": -151.72073364257812, "logps/rejected": -75.92819213867188, "loss": 0.6795, "rewards/accuracies": 0.0, "rewards/chosen": 1.967987060546875, "rewards/margins": -0.6656539440155029, "rewards/rejected": 2.633641004562378, "step": 10607 }, { "epoch": 1.72, "learning_rate": 2.1992432636487974e-07, "logits/chosen": -1.261582612991333, "logits/rejected": -1.1215461492538452, "logps/chosen": -83.97186279296875, "logps/rejected": -53.687355041503906, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": 4.207916259765625, "rewards/margins": 2.356187343597412, "rewards/rejected": 1.8517287969589233, "step": 10608 }, { "epoch": 1.72, "learning_rate": 2.1981546452029677e-07, "logits/chosen": -0.6847289204597473, "logits/rejected": -0.68001389503479, "logps/chosen": -96.09075927734375, "logps/rejected": -48.56010055541992, "loss": 0.7488, "rewards/accuracies": 0.0, "rewards/chosen": 2.162071943283081, "rewards/margins": -0.16105246543884277, "rewards/rejected": 2.323124408721924, "step": 10609 }, { "epoch": 1.72, "learning_rate": 2.197066220338142e-07, "logits/chosen": -0.7067692279815674, "logits/rejected": -0.67380291223526, "logps/chosen": -51.85078430175781, "logps/rejected": -55.29620361328125, "loss": 0.8475, "rewards/accuracies": 0.0, "rewards/chosen": 0.8736175894737244, "rewards/margins": -0.9299995303153992, "rewards/rejected": 1.8036171197891235, "step": 10610 }, { "epoch": 1.72, "learning_rate": 2.1959779891295166e-07, "logits/chosen": -0.745824933052063, "logits/rejected": -0.8049370646476746, "logps/chosen": -78.83393096923828, "logps/rejected": -169.8348846435547, "loss": 0.7099, "rewards/accuracies": 0.0, "rewards/chosen": 0.2547752559185028, "rewards/margins": -0.945330023765564, "rewards/rejected": 1.2001053094863892, "step": 10611 }, { "epoch": 1.72, "learning_rate": 2.1948899516522828e-07, "logits/chosen": -0.8212882876396179, "logits/rejected": -0.8212882876396179, "logps/chosen": -66.82086181640625, "logps/rejected": -66.82086181640625, "loss": 0.3739, "rewards/accuracies": 0.0, "rewards/chosen": 1.8156143426895142, "rewards/margins": 0.0, "rewards/rejected": 1.8156143426895142, "step": 10612 }, { "epoch": 1.72, "learning_rate": 2.1938021079816094e-07, "logits/chosen": -0.6961458325386047, "logits/rejected": -0.5459895133972168, "logps/chosen": -94.7904281616211, "logps/rejected": -52.770103454589844, "loss": 0.2254, "rewards/accuracies": 1.0, "rewards/chosen": 0.7666366696357727, "rewards/margins": 0.5933143496513367, "rewards/rejected": 0.17332230508327484, "step": 10613 }, { "epoch": 1.72, "learning_rate": 2.1927144581926594e-07, "logits/chosen": -0.8116838335990906, "logits/rejected": -0.5021358132362366, "logps/chosen": -99.89381408691406, "logps/rejected": -102.86293029785156, "loss": 0.3778, "rewards/accuracies": 1.0, "rewards/chosen": 3.979719638824463, "rewards/margins": 0.10323190689086914, "rewards/rejected": 3.8764877319335938, "step": 10614 }, { "epoch": 1.72, "learning_rate": 2.1916270023605753e-07, "logits/chosen": -0.8316991925239563, "logits/rejected": -0.8299710154533386, "logps/chosen": -88.67436218261719, "logps/rejected": -97.19974517822266, "loss": 0.9144, "rewards/accuracies": 0.0, "rewards/chosen": 1.5997711420059204, "rewards/margins": -0.7357498407363892, "rewards/rejected": 2.3355209827423096, "step": 10615 }, { "epoch": 1.72, "learning_rate": 2.1905397405604948e-07, "logits/chosen": -0.8545404076576233, "logits/rejected": -0.8953177332878113, "logps/chosen": -207.0337371826172, "logps/rejected": -153.60934448242188, "loss": 1.88, "rewards/accuracies": 0.0, "rewards/chosen": 2.114550828933716, "rewards/margins": -2.2775299549102783, "rewards/rejected": 4.392080783843994, "step": 10616 }, { "epoch": 1.72, "learning_rate": 2.1894526728675329e-07, "logits/chosen": -0.4365856349468231, "logits/rejected": -0.4383794963359833, "logps/chosen": -1.0563408136367798, "logps/rejected": -4.515227794647217, "loss": 1.0391, "rewards/accuracies": 0.0, "rewards/chosen": 0.2034250795841217, "rewards/margins": -0.07254290580749512, "rewards/rejected": 0.2759679853916168, "step": 10617 }, { "epoch": 1.72, "learning_rate": 2.188365799356799e-07, "logits/chosen": -0.8403195738792419, "logits/rejected": -0.8508516550064087, "logps/chosen": -119.5394058227539, "logps/rejected": -86.77813720703125, "loss": 0.5218, "rewards/accuracies": 0.0, "rewards/chosen": 0.3919990658760071, "rewards/margins": -0.37351149320602417, "rewards/rejected": 0.7655105590820312, "step": 10618 }, { "epoch": 1.72, "learning_rate": 2.187279120103383e-07, "logits/chosen": -0.8885118365287781, "logits/rejected": -0.8580990433692932, "logps/chosen": -78.92744445800781, "logps/rejected": -118.4986572265625, "loss": 1.5553, "rewards/accuracies": 1.0, "rewards/chosen": 2.540642499923706, "rewards/margins": 1.4232596158981323, "rewards/rejected": 1.1173828840255737, "step": 10619 }, { "epoch": 1.72, "learning_rate": 2.1861926351823674e-07, "logits/chosen": -0.4394513964653015, "logits/rejected": -0.41004371643066406, "logps/chosen": -39.31553649902344, "logps/rejected": -5.3464202880859375, "loss": 0.3978, "rewards/accuracies": 1.0, "rewards/chosen": 0.7165191769599915, "rewards/margins": 0.15586340427398682, "rewards/rejected": 0.5606557726860046, "step": 10620 }, { "epoch": 1.72, "learning_rate": 2.185106344668814e-07, "logits/chosen": -0.5566880106925964, "logits/rejected": -0.6046397686004639, "logps/chosen": -30.492107391357422, "logps/rejected": -127.94732666015625, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": 1.379998803138733, "rewards/margins": 0.734570324420929, "rewards/rejected": 0.645428478717804, "step": 10621 }, { "epoch": 1.72, "learning_rate": 2.1840202486377795e-07, "logits/chosen": -1.0339998006820679, "logits/rejected": -0.7678559422492981, "logps/chosen": -188.8436737060547, "logps/rejected": -86.1155014038086, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 5.121768474578857, "rewards/margins": 2.0442934036254883, "rewards/rejected": 3.077475070953369, "step": 10622 }, { "epoch": 1.72, "learning_rate": 2.182934347164299e-07, "logits/chosen": -0.18440581858158112, "logits/rejected": -0.18401455879211426, "logps/chosen": -2.788149833679199, "logps/rejected": -1.5873605012893677, "loss": 0.3426, "rewards/accuracies": 1.0, "rewards/chosen": 0.3060208261013031, "rewards/margins": 0.03273329138755798, "rewards/rejected": 0.2732875347137451, "step": 10623 }, { "epoch": 1.72, "learning_rate": 2.1818486403233998e-07, "logits/chosen": -0.45414915680885315, "logits/rejected": -0.3854137659072876, "logps/chosen": -113.56035614013672, "logps/rejected": -60.383697509765625, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 6.11792516708374, "rewards/margins": 4.887839317321777, "rewards/rejected": 1.2300857305526733, "step": 10624 }, { "epoch": 1.72, "learning_rate": 2.1807631281900962e-07, "logits/chosen": -0.2077651023864746, "logits/rejected": -0.2077651023864746, "logps/chosen": -72.2968978881836, "logps/rejected": -72.2968978881836, "loss": 0.347, "rewards/accuracies": 0.0, "rewards/chosen": 0.6083770990371704, "rewards/margins": 0.0, "rewards/rejected": 0.6083770990371704, "step": 10625 }, { "epoch": 1.72, "learning_rate": 2.179677810839382e-07, "logits/chosen": -1.1533950567245483, "logits/rejected": -1.1048264503479004, "logps/chosen": -125.05033874511719, "logps/rejected": -78.2741928100586, "loss": 0.87, "rewards/accuracies": 0.0, "rewards/chosen": 1.904669165611267, "rewards/margins": -1.4548896551132202, "rewards/rejected": 3.3595588207244873, "step": 10626 }, { "epoch": 1.72, "learning_rate": 2.1785926883462475e-07, "logits/chosen": -0.5464212894439697, "logits/rejected": -0.46763575077056885, "logps/chosen": -98.21273040771484, "logps/rejected": -107.29542541503906, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 1.5063194036483765, "rewards/margins": 0.583489179611206, "rewards/rejected": 0.9228302240371704, "step": 10627 }, { "epoch": 1.73, "learning_rate": 2.1775077607856597e-07, "logits/chosen": -0.4677387475967407, "logits/rejected": -0.46855610609054565, "logps/chosen": -53.06133270263672, "logps/rejected": -80.26673889160156, "loss": 0.4758, "rewards/accuracies": 0.0, "rewards/chosen": 1.868811845779419, "rewards/margins": -0.09474563598632812, "rewards/rejected": 1.963557481765747, "step": 10628 }, { "epoch": 1.73, "learning_rate": 2.1764230282325802e-07, "logits/chosen": -0.7661174535751343, "logits/rejected": -0.8475719094276428, "logps/chosen": -78.640380859375, "logps/rejected": -93.68072509765625, "loss": 1.3748, "rewards/accuracies": 0.0, "rewards/chosen": 2.5562942028045654, "rewards/margins": -2.625267267227173, "rewards/rejected": 5.181561470031738, "step": 10629 }, { "epoch": 1.73, "learning_rate": 2.1753384907619514e-07, "logits/chosen": -1.0700427293777466, "logits/rejected": -0.8887303471565247, "logps/chosen": -144.43270874023438, "logps/rejected": -150.61776733398438, "loss": 0.1452, "rewards/accuracies": 1.0, "rewards/chosen": 3.3682053089141846, "rewards/margins": 2.216387987136841, "rewards/rejected": 1.1518173217773438, "step": 10630 }, { "epoch": 1.73, "learning_rate": 2.1742541484487076e-07, "logits/chosen": -0.3854857385158539, "logits/rejected": -0.3854857385158539, "logps/chosen": -55.714202880859375, "logps/rejected": -55.714202880859375, "loss": 0.4412, "rewards/accuracies": 0.0, "rewards/chosen": 1.6378799676895142, "rewards/margins": 0.0, "rewards/rejected": 1.6378799676895142, "step": 10631 }, { "epoch": 1.73, "learning_rate": 2.1731700013677622e-07, "logits/chosen": -1.0511958599090576, "logits/rejected": -0.9669785499572754, "logps/chosen": -61.59848403930664, "logps/rejected": -53.16179656982422, "loss": 0.2125, "rewards/accuracies": 1.0, "rewards/chosen": 2.40032696723938, "rewards/margins": 0.6523418426513672, "rewards/rejected": 1.7479851245880127, "step": 10632 }, { "epoch": 1.73, "learning_rate": 2.1720860495940242e-07, "logits/chosen": -1.1717921495437622, "logits/rejected": -1.2759654521942139, "logps/chosen": -206.6326141357422, "logps/rejected": -84.2144775390625, "loss": 0.2371, "rewards/accuracies": 1.0, "rewards/chosen": 5.357936382293701, "rewards/margins": 0.5678300857543945, "rewards/rejected": 4.790106296539307, "step": 10633 }, { "epoch": 1.73, "learning_rate": 2.1710022932023803e-07, "logits/chosen": -0.20568503439426422, "logits/rejected": -0.16747406125068665, "logps/chosen": -43.17916488647461, "logps/rejected": -25.719133377075195, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.7275524139404297, "rewards/margins": 0.05961662530899048, "rewards/rejected": 0.6679357886314392, "step": 10634 }, { "epoch": 1.73, "learning_rate": 2.1699187322677116e-07, "logits/chosen": -0.6687389612197876, "logits/rejected": -0.6682386994361877, "logps/chosen": -3.755793809890747, "logps/rejected": -5.659482955932617, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 0.27999886870384216, "rewards/margins": 0.10325603187084198, "rewards/rejected": 0.17674283683300018, "step": 10635 }, { "epoch": 1.73, "learning_rate": 2.1688353668648784e-07, "logits/chosen": -1.1050173044204712, "logits/rejected": -0.5265400409698486, "logps/chosen": -191.75486755371094, "logps/rejected": -106.00814819335938, "loss": 0.9684, "rewards/accuracies": 1.0, "rewards/chosen": 4.928947448730469, "rewards/margins": 2.6770613193511963, "rewards/rejected": 2.2518861293792725, "step": 10636 }, { "epoch": 1.73, "learning_rate": 2.167752197068734e-07, "logits/chosen": -0.6435900330543518, "logits/rejected": -0.5125865340232849, "logps/chosen": -52.820762634277344, "logps/rejected": -11.410432815551758, "loss": 0.2289, "rewards/accuracies": 1.0, "rewards/chosen": 1.01567542552948, "rewards/margins": 0.579303503036499, "rewards/rejected": 0.43637189269065857, "step": 10637 }, { "epoch": 1.73, "learning_rate": 2.1666692229541124e-07, "logits/chosen": -0.6603740453720093, "logits/rejected": -0.6727995276451111, "logps/chosen": -49.056488037109375, "logps/rejected": -131.44166564941406, "loss": 1.4633, "rewards/accuracies": 0.0, "rewards/chosen": 2.867818593978882, "rewards/margins": -2.830472707748413, "rewards/rejected": 5.698291301727295, "step": 10638 }, { "epoch": 1.73, "learning_rate": 2.16558644459584e-07, "logits/chosen": -0.7768938541412354, "logits/rejected": -0.6753857135772705, "logps/chosen": -85.860595703125, "logps/rejected": -127.9429931640625, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 3.66294264793396, "rewards/margins": 0.5205812454223633, "rewards/rejected": 3.1423614025115967, "step": 10639 }, { "epoch": 1.73, "learning_rate": 2.164503862068723e-07, "logits/chosen": -0.7316197752952576, "logits/rejected": -0.7336022853851318, "logps/chosen": -65.67049407958984, "logps/rejected": -12.457603454589844, "loss": 0.712, "rewards/accuracies": 0.0, "rewards/chosen": 0.4401542842388153, "rewards/margins": -0.4457997977733612, "rewards/rejected": 0.8859540820121765, "step": 10640 }, { "epoch": 1.73, "learning_rate": 2.1634214754475627e-07, "logits/chosen": -0.6839828491210938, "logits/rejected": -0.5979293584823608, "logps/chosen": -68.35702514648438, "logps/rejected": -25.517642974853516, "loss": 0.3392, "rewards/accuracies": 1.0, "rewards/chosen": 1.316096544265747, "rewards/margins": 0.3255554437637329, "rewards/rejected": 0.9905411005020142, "step": 10641 }, { "epoch": 1.73, "learning_rate": 2.1623392848071354e-07, "logits/chosen": -0.6811546087265015, "logits/rejected": -0.46967241168022156, "logps/chosen": -68.48187255859375, "logps/rejected": -43.700103759765625, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.863549828529358, "rewards/margins": 1.2084743976593018, "rewards/rejected": 0.6550754904747009, "step": 10642 }, { "epoch": 1.73, "learning_rate": 2.1612572902222126e-07, "logits/chosen": -0.17149439454078674, "logits/rejected": -0.20941564440727234, "logps/chosen": -105.66128540039062, "logps/rejected": -82.42108154296875, "loss": 1.0893, "rewards/accuracies": 0.0, "rewards/chosen": 1.1020195484161377, "rewards/margins": -1.5442008972167969, "rewards/rejected": 2.6462204456329346, "step": 10643 }, { "epoch": 1.73, "learning_rate": 2.160175491767553e-07, "logits/chosen": -1.0105894804000854, "logits/rejected": -0.978975772857666, "logps/chosen": -101.24822235107422, "logps/rejected": -50.64595031738281, "loss": 0.8013, "rewards/accuracies": 0.0, "rewards/chosen": 1.2500275373458862, "rewards/margins": -1.2534865140914917, "rewards/rejected": 2.503514051437378, "step": 10644 }, { "epoch": 1.73, "learning_rate": 2.1590938895178944e-07, "logits/chosen": -0.5688344240188599, "logits/rejected": -0.5226550102233887, "logps/chosen": -86.12049865722656, "logps/rejected": -48.43589782714844, "loss": 0.6014, "rewards/accuracies": 0.0, "rewards/chosen": 1.8179572820663452, "rewards/margins": -0.8163567781448364, "rewards/rejected": 2.6343140602111816, "step": 10645 }, { "epoch": 1.73, "learning_rate": 2.1580124835479684e-07, "logits/chosen": -0.9215200543403625, "logits/rejected": -0.9010307192802429, "logps/chosen": -179.7123260498047, "logps/rejected": -88.81303405761719, "loss": 0.4965, "rewards/accuracies": 1.0, "rewards/chosen": 2.1929032802581787, "rewards/margins": 0.5484391450881958, "rewards/rejected": 1.644464135169983, "step": 10646 }, { "epoch": 1.73, "learning_rate": 2.1569312739324874e-07, "logits/chosen": -0.523449718952179, "logits/rejected": -0.44055402278900146, "logps/chosen": -41.00080108642578, "logps/rejected": -27.235273361206055, "loss": 0.2525, "rewards/accuracies": 1.0, "rewards/chosen": 1.7694625854492188, "rewards/margins": 0.5932890176773071, "rewards/rejected": 1.1761735677719116, "step": 10647 }, { "epoch": 1.73, "learning_rate": 2.155850260746156e-07, "logits/chosen": -0.9297318458557129, "logits/rejected": -0.9272330403327942, "logps/chosen": -52.84722900390625, "logps/rejected": -125.79214477539062, "loss": 2.6238, "rewards/accuracies": 0.0, "rewards/chosen": 2.255030870437622, "rewards/margins": -1.389312744140625, "rewards/rejected": 3.644343614578247, "step": 10648 }, { "epoch": 1.73, "learning_rate": 2.1547694440636578e-07, "logits/chosen": -0.7933179140090942, "logits/rejected": -0.6005948185920715, "logps/chosen": -179.84814453125, "logps/rejected": -222.92044067382812, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 4.582135200500488, "rewards/margins": 0.13191843032836914, "rewards/rejected": 4.450216770172119, "step": 10649 }, { "epoch": 1.73, "learning_rate": 2.1536888239596712e-07, "logits/chosen": -0.5868041515350342, "logits/rejected": -0.5573402047157288, "logps/chosen": -79.40933227539062, "logps/rejected": -41.183197021484375, "loss": 0.5668, "rewards/accuracies": 0.0, "rewards/chosen": 1.4996017217636108, "rewards/margins": -0.7328613996505737, "rewards/rejected": 2.2324631214141846, "step": 10650 }, { "epoch": 1.73, "learning_rate": 2.152608400508853e-07, "logits/chosen": -0.6261947154998779, "logits/rejected": -0.6318910717964172, "logps/chosen": -29.640155792236328, "logps/rejected": -26.498571395874023, "loss": 0.4725, "rewards/accuracies": 1.0, "rewards/chosen": 0.6073455810546875, "rewards/margins": 0.047794878482818604, "rewards/rejected": 0.5595507025718689, "step": 10651 }, { "epoch": 1.73, "learning_rate": 2.1515281737858536e-07, "logits/chosen": -0.44917628169059753, "logits/rejected": -0.45587754249572754, "logps/chosen": -53.734474182128906, "logps/rejected": -77.05919647216797, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": 2.1651580333709717, "rewards/margins": 0.2479614019393921, "rewards/rejected": 1.9171966314315796, "step": 10652 }, { "epoch": 1.73, "learning_rate": 2.1504481438653032e-07, "logits/chosen": -0.4476175308227539, "logits/rejected": -0.4476175308227539, "logps/chosen": -68.80889892578125, "logps/rejected": -68.80889892578125, "loss": 0.363, "rewards/accuracies": 0.0, "rewards/chosen": 1.4214057922363281, "rewards/margins": 0.0, "rewards/rejected": 1.4214057922363281, "step": 10653 }, { "epoch": 1.73, "learning_rate": 2.1493683108218253e-07, "logits/chosen": -0.19972509145736694, "logits/rejected": -0.19972509145736694, "logps/chosen": -75.42430877685547, "logps/rejected": -75.42430877685547, "loss": 1.6412, "rewards/accuracies": 0.0, "rewards/chosen": 1.3144210577011108, "rewards/margins": 0.0, "rewards/rejected": 1.3144210577011108, "step": 10654 }, { "epoch": 1.73, "learning_rate": 2.1482886747300222e-07, "logits/chosen": -1.01028573513031, "logits/rejected": -0.9829155206680298, "logps/chosen": -128.61268615722656, "logps/rejected": -105.59870910644531, "loss": 0.5852, "rewards/accuracies": 0.0, "rewards/chosen": 1.390142798423767, "rewards/margins": -0.30323028564453125, "rewards/rejected": 1.6933730840682983, "step": 10655 }, { "epoch": 1.73, "learning_rate": 2.1472092356644904e-07, "logits/chosen": -0.5672390460968018, "logits/rejected": -0.6363673806190491, "logps/chosen": -72.94170379638672, "logps/rejected": -54.3704833984375, "loss": 2.2234, "rewards/accuracies": 0.0, "rewards/chosen": 1.369027018547058, "rewards/margins": -0.9321800470352173, "rewards/rejected": 2.3012070655822754, "step": 10656 }, { "epoch": 1.73, "learning_rate": 2.146129993699805e-07, "logits/chosen": -0.682924211025238, "logits/rejected": -0.6200524568557739, "logps/chosen": -72.90995788574219, "logps/rejected": -39.927818298339844, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 1.8602348566055298, "rewards/margins": 0.2674376964569092, "rewards/rejected": 1.5927971601486206, "step": 10657 }, { "epoch": 1.73, "learning_rate": 2.1450509489105356e-07, "logits/chosen": -0.7647293210029602, "logits/rejected": -0.7179739475250244, "logps/chosen": -116.9716796875, "logps/rejected": -108.14041137695312, "loss": 0.1652, "rewards/accuracies": 1.0, "rewards/chosen": 2.135389804840088, "rewards/margins": 1.0021576881408691, "rewards/rejected": 1.1332321166992188, "step": 10658 }, { "epoch": 1.73, "learning_rate": 2.1439721013712302e-07, "logits/chosen": -0.6400550007820129, "logits/rejected": -0.6162842512130737, "logps/chosen": -48.294673919677734, "logps/rejected": -86.07292175292969, "loss": 1.6647, "rewards/accuracies": 1.0, "rewards/chosen": 1.419277548789978, "rewards/margins": 0.21782875061035156, "rewards/rejected": 1.2014487981796265, "step": 10659 }, { "epoch": 1.73, "learning_rate": 2.14289345115643e-07, "logits/chosen": -0.5456767082214355, "logits/rejected": -0.5268372893333435, "logps/chosen": -126.67463684082031, "logps/rejected": -54.12106704711914, "loss": 0.2888, "rewards/accuracies": 1.0, "rewards/chosen": 3.631718397140503, "rewards/margins": 2.135025978088379, "rewards/rejected": 1.4966922998428345, "step": 10660 }, { "epoch": 1.73, "learning_rate": 2.1418149983406568e-07, "logits/chosen": -0.13215595483779907, "logits/rejected": -0.1268845498561859, "logps/chosen": -1.4582996368408203, "logps/rejected": -4.093070983886719, "loss": 0.3711, "rewards/accuracies": 0.0, "rewards/chosen": 0.2830835282802582, "rewards/margins": -0.019550472497940063, "rewards/rejected": 0.30263400077819824, "step": 10661 }, { "epoch": 1.73, "learning_rate": 2.140736742998424e-07, "logits/chosen": -0.8051957488059998, "logits/rejected": -0.813146710395813, "logps/chosen": -79.03263092041016, "logps/rejected": -106.34446716308594, "loss": 0.5942, "rewards/accuracies": 1.0, "rewards/chosen": 1.6060189008712769, "rewards/margins": 1.5948768854141235, "rewards/rejected": 0.011141967959702015, "step": 10662 }, { "epoch": 1.73, "learning_rate": 2.1396586852042265e-07, "logits/chosen": -1.1957755088806152, "logits/rejected": -1.2404650449752808, "logps/chosen": -85.859130859375, "logps/rejected": -79.03787231445312, "loss": 1.0618, "rewards/accuracies": 0.0, "rewards/chosen": 2.1755454540252686, "rewards/margins": -0.018056631088256836, "rewards/rejected": 2.1936020851135254, "step": 10663 }, { "epoch": 1.73, "learning_rate": 2.1385808250325503e-07, "logits/chosen": -0.9313474297523499, "logits/rejected": -0.965129017829895, "logps/chosen": -166.47348022460938, "logps/rejected": -28.262414932250977, "loss": 0.1582, "rewards/accuracies": 1.0, "rewards/chosen": 1.0485886335372925, "rewards/margins": 1.0327297449111938, "rewards/rejected": 0.015858842059969902, "step": 10664 }, { "epoch": 1.73, "learning_rate": 2.1375031625578627e-07, "logits/chosen": -0.7576042413711548, "logits/rejected": -0.7825510501861572, "logps/chosen": -105.80799102783203, "logps/rejected": -112.39751434326172, "loss": 0.8251, "rewards/accuracies": 1.0, "rewards/chosen": 2.650524139404297, "rewards/margins": 0.3077712059020996, "rewards/rejected": 2.3427529335021973, "step": 10665 }, { "epoch": 1.73, "learning_rate": 2.136425697854623e-07, "logits/chosen": -0.17781895399093628, "logits/rejected": -0.17781895399093628, "logps/chosen": -1.9383498430252075, "logps/rejected": -1.9383498430252075, "loss": 1.4934, "rewards/accuracies": 0.0, "rewards/chosen": 0.30371856689453125, "rewards/margins": 0.0, "rewards/rejected": 0.30371856689453125, "step": 10666 }, { "epoch": 1.73, "learning_rate": 2.135348430997271e-07, "logits/chosen": -0.45910465717315674, "logits/rejected": -0.46327677369117737, "logps/chosen": -9.527220726013184, "logps/rejected": -12.713960647583008, "loss": 0.8123, "rewards/accuracies": 1.0, "rewards/chosen": 0.13322582840919495, "rewards/margins": 0.07290315628051758, "rewards/rejected": 0.06032266840338707, "step": 10667 }, { "epoch": 1.73, "learning_rate": 2.1342713620602376e-07, "logits/chosen": -0.3185875713825226, "logits/rejected": -0.3947276771068573, "logps/chosen": -58.94633102416992, "logps/rejected": -34.066349029541016, "loss": 0.976, "rewards/accuracies": 0.0, "rewards/chosen": 0.693942666053772, "rewards/margins": -1.2285034656524658, "rewards/rejected": 1.9224461317062378, "step": 10668 }, { "epoch": 1.73, "learning_rate": 2.1331944911179363e-07, "logits/chosen": -0.3423682153224945, "logits/rejected": -0.3363839387893677, "logps/chosen": -2.0968399047851562, "logps/rejected": -9.12679672241211, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 0.3161483705043793, "rewards/margins": 0.16120557487010956, "rewards/rejected": 0.15494279563426971, "step": 10669 }, { "epoch": 1.73, "learning_rate": 2.1321178182447709e-07, "logits/chosen": -0.6373480558395386, "logits/rejected": -0.5353876948356628, "logps/chosen": -83.06783294677734, "logps/rejected": -13.965039253234863, "loss": 1.0135, "rewards/accuracies": 1.0, "rewards/chosen": 2.832202196121216, "rewards/margins": 2.6418161392211914, "rewards/rejected": 0.190386101603508, "step": 10670 }, { "epoch": 1.73, "learning_rate": 2.1310413435151265e-07, "logits/chosen": -0.612259030342102, "logits/rejected": -0.5685679316520691, "logps/chosen": -54.272483825683594, "logps/rejected": -77.11602020263672, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.9861549139022827, "rewards/margins": 0.3324241638183594, "rewards/rejected": 1.6537307500839233, "step": 10671 }, { "epoch": 1.73, "learning_rate": 2.1299650670033808e-07, "logits/chosen": -1.469730257987976, "logits/rejected": -1.342939853668213, "logps/chosen": -52.054176330566406, "logps/rejected": -185.33163452148438, "loss": 2.3315, "rewards/accuracies": 0.0, "rewards/chosen": 2.1477677822113037, "rewards/margins": -4.287376403808594, "rewards/rejected": 6.435143947601318, "step": 10672 }, { "epoch": 1.73, "learning_rate": 2.1288889887838907e-07, "logits/chosen": -0.9485200643539429, "logits/rejected": -0.8991143703460693, "logps/chosen": -56.97382354736328, "logps/rejected": -76.29432678222656, "loss": 0.5252, "rewards/accuracies": 0.0, "rewards/chosen": 1.4920631647109985, "rewards/margins": -0.5496102571487427, "rewards/rejected": 2.041673421859741, "step": 10673 }, { "epoch": 1.73, "learning_rate": 2.1278131089310065e-07, "logits/chosen": -0.5149670243263245, "logits/rejected": -0.5149670243263245, "logps/chosen": -22.484573364257812, "logps/rejected": -22.484573364257812, "loss": 0.495, "rewards/accuracies": 0.0, "rewards/chosen": 0.12775783240795135, "rewards/margins": 0.0, "rewards/rejected": 0.12775783240795135, "step": 10674 }, { "epoch": 1.73, "learning_rate": 2.1267374275190575e-07, "logits/chosen": -0.9473255276679993, "logits/rejected": -0.7825820446014404, "logps/chosen": -67.7732925415039, "logps/rejected": -94.53871154785156, "loss": 0.9829, "rewards/accuracies": 0.0, "rewards/chosen": 0.70675128698349, "rewards/margins": -1.506626844406128, "rewards/rejected": 2.2133781909942627, "step": 10675 }, { "epoch": 1.73, "learning_rate": 2.1256619446223672e-07, "logits/chosen": -0.9523319602012634, "logits/rejected": -0.8834137320518494, "logps/chosen": -109.46271514892578, "logps/rejected": -81.36711883544922, "loss": 1.9749, "rewards/accuracies": 1.0, "rewards/chosen": 2.3369927406311035, "rewards/margins": 0.052398681640625, "rewards/rejected": 2.2845940589904785, "step": 10676 }, { "epoch": 1.73, "learning_rate": 2.124586660315238e-07, "logits/chosen": -0.5327081084251404, "logits/rejected": -0.5327081084251404, "logps/chosen": -144.70118713378906, "logps/rejected": -144.70118713378906, "loss": 0.3772, "rewards/accuracies": 0.0, "rewards/chosen": 0.7099319696426392, "rewards/margins": 0.0, "rewards/rejected": 0.7099319696426392, "step": 10677 }, { "epoch": 1.73, "learning_rate": 2.1235115746719646e-07, "logits/chosen": -0.8300836682319641, "logits/rejected": -0.8040400743484497, "logps/chosen": -86.545166015625, "logps/rejected": -125.5516586303711, "loss": 0.3338, "rewards/accuracies": 1.0, "rewards/chosen": 0.8626953363418579, "rewards/margins": 0.5374717712402344, "rewards/rejected": 0.32522353529930115, "step": 10678 }, { "epoch": 1.73, "learning_rate": 2.1224366877668214e-07, "logits/chosen": -0.4834037125110626, "logits/rejected": -0.2929839789867401, "logps/chosen": -64.78260040283203, "logps/rejected": -12.660558700561523, "loss": 0.1709, "rewards/accuracies": 1.0, "rewards/chosen": 1.963079810142517, "rewards/margins": 1.2882035970687866, "rewards/rejected": 0.6748762130737305, "step": 10679 }, { "epoch": 1.73, "learning_rate": 2.1213619996740763e-07, "logits/chosen": -0.4458629786968231, "logits/rejected": -0.4458629786968231, "logps/chosen": -18.485027313232422, "logps/rejected": -18.485027313232422, "loss": 0.394, "rewards/accuracies": 0.0, "rewards/chosen": 1.1869757175445557, "rewards/margins": 0.0, "rewards/rejected": 1.1869757175445557, "step": 10680 }, { "epoch": 1.73, "learning_rate": 2.12028751046798e-07, "logits/chosen": -0.7686965465545654, "logits/rejected": -0.6282927393913269, "logps/chosen": -188.3748779296875, "logps/rejected": -120.96829223632812, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": 6.27400541305542, "rewards/margins": 1.6580462455749512, "rewards/rejected": 4.615959167480469, "step": 10681 }, { "epoch": 1.73, "learning_rate": 2.1192132202227674e-07, "logits/chosen": -0.5045550465583801, "logits/rejected": -0.4435516595840454, "logps/chosen": -99.61904907226562, "logps/rejected": -60.10469436645508, "loss": 0.6416, "rewards/accuracies": 0.0, "rewards/chosen": 1.1724647283554077, "rewards/margins": -0.758965015411377, "rewards/rejected": 1.9314297437667847, "step": 10682 }, { "epoch": 1.73, "learning_rate": 2.118139129012665e-07, "logits/chosen": -1.0216577053070068, "logits/rejected": -1.0023895502090454, "logps/chosen": -42.88361358642578, "logps/rejected": -67.78211975097656, "loss": 0.9431, "rewards/accuracies": 1.0, "rewards/chosen": 1.9278564453125, "rewards/margins": 1.6304244995117188, "rewards/rejected": 0.29743194580078125, "step": 10683 }, { "epoch": 1.73, "learning_rate": 2.117065236911878e-07, "logits/chosen": -0.992111325263977, "logits/rejected": -1.0254195928573608, "logps/chosen": -225.857177734375, "logps/rejected": -183.40945434570312, "loss": 2.0549, "rewards/accuracies": 0.0, "rewards/chosen": 4.6356964111328125, "rewards/margins": -2.9063754081726074, "rewards/rejected": 7.54207181930542, "step": 10684 }, { "epoch": 1.73, "learning_rate": 2.1159915439946068e-07, "logits/chosen": -0.516841471195221, "logits/rejected": -0.5711820721626282, "logps/chosen": -123.29444122314453, "logps/rejected": -130.72262573242188, "loss": 1.1178, "rewards/accuracies": 1.0, "rewards/chosen": 1.4732002019882202, "rewards/margins": 0.48427045345306396, "rewards/rejected": 0.9889297485351562, "step": 10685 }, { "epoch": 1.73, "learning_rate": 2.1149180503350289e-07, "logits/chosen": -0.8187417984008789, "logits/rejected": -0.7681625485420227, "logps/chosen": -158.84844970703125, "logps/rejected": -51.61442565917969, "loss": 1.6, "rewards/accuracies": 1.0, "rewards/chosen": 3.9943604469299316, "rewards/margins": 0.24152684211730957, "rewards/rejected": 3.752833604812622, "step": 10686 }, { "epoch": 1.73, "learning_rate": 2.1138447560073169e-07, "logits/chosen": -1.0524765253067017, "logits/rejected": -1.0851527452468872, "logps/chosen": -120.55380249023438, "logps/rejected": -158.06759643554688, "loss": 1.2752, "rewards/accuracies": 0.0, "rewards/chosen": 5.607705593109131, "rewards/margins": -2.3133697509765625, "rewards/rejected": 7.921075344085693, "step": 10687 }, { "epoch": 1.73, "learning_rate": 2.1127716610856216e-07, "logits/chosen": -0.7620619535446167, "logits/rejected": -0.7620619535446167, "logps/chosen": -50.88593292236328, "logps/rejected": -50.88593292236328, "loss": 0.4339, "rewards/accuracies": 0.0, "rewards/chosen": 0.06910743564367294, "rewards/margins": 0.0, "rewards/rejected": 0.06910743564367294, "step": 10688 }, { "epoch": 1.73, "learning_rate": 2.111698765644087e-07, "logits/chosen": -0.9388554096221924, "logits/rejected": -0.9264184832572937, "logps/chosen": -15.166025161743164, "logps/rejected": -21.648649215698242, "loss": 0.7444, "rewards/accuracies": 0.0, "rewards/chosen": 0.428471177816391, "rewards/margins": -0.926041841506958, "rewards/rejected": 1.3545130491256714, "step": 10689 }, { "epoch": 1.74, "learning_rate": 2.110626069756836e-07, "logits/chosen": -0.6806671619415283, "logits/rejected": -0.6473087668418884, "logps/chosen": -58.91254425048828, "logps/rejected": -85.53787231445312, "loss": 0.5843, "rewards/accuracies": 1.0, "rewards/chosen": 1.7209465503692627, "rewards/margins": 0.7252998948097229, "rewards/rejected": 0.9956466555595398, "step": 10690 }, { "epoch": 1.74, "learning_rate": 2.109553573497987e-07, "logits/chosen": -0.7852209210395813, "logits/rejected": -0.824722945690155, "logps/chosen": -71.89942932128906, "logps/rejected": -99.59843444824219, "loss": 1.6715, "rewards/accuracies": 0.0, "rewards/chosen": 1.7030266523361206, "rewards/margins": -1.6597892045974731, "rewards/rejected": 3.3628158569335938, "step": 10691 }, { "epoch": 1.74, "learning_rate": 2.1084812769416337e-07, "logits/chosen": -1.015655279159546, "logits/rejected": -0.9551886320114136, "logps/chosen": -87.9542007446289, "logps/rejected": -14.430685043334961, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/chosen": 1.811498999595642, "rewards/margins": 1.5382795333862305, "rewards/rejected": 0.273219496011734, "step": 10692 }, { "epoch": 1.74, "learning_rate": 2.1074091801618666e-07, "logits/chosen": -1.0236232280731201, "logits/rejected": -1.0176730155944824, "logps/chosen": -62.021873474121094, "logps/rejected": -57.26373291015625, "loss": 0.5257, "rewards/accuracies": 1.0, "rewards/chosen": 2.369696855545044, "rewards/margins": 0.2848076820373535, "rewards/rejected": 2.0848891735076904, "step": 10693 }, { "epoch": 1.74, "learning_rate": 2.106337283232753e-07, "logits/chosen": -0.5013774633407593, "logits/rejected": -0.43665868043899536, "logps/chosen": -54.6772346496582, "logps/rejected": -69.73953247070312, "loss": 0.2616, "rewards/accuracies": 1.0, "rewards/chosen": 1.5237034559249878, "rewards/margins": 0.5992146134376526, "rewards/rejected": 0.9244888424873352, "step": 10694 }, { "epoch": 1.74, "learning_rate": 2.1052655862283548e-07, "logits/chosen": -0.2544041574001312, "logits/rejected": -0.2544041574001312, "logps/chosen": -0.4057551920413971, "logps/rejected": -0.4057551920413971, "loss": 0.7387, "rewards/accuracies": 0.0, "rewards/chosen": 0.08477999269962311, "rewards/margins": 0.0, "rewards/rejected": 0.08477999269962311, "step": 10695 }, { "epoch": 1.74, "learning_rate": 2.1041940892227128e-07, "logits/chosen": -0.5351162552833557, "logits/rejected": -0.43106237053871155, "logps/chosen": -74.68062591552734, "logps/rejected": -89.9725570678711, "loss": 0.6615, "rewards/accuracies": 0.0, "rewards/chosen": 1.8918968439102173, "rewards/margins": -0.28658831119537354, "rewards/rejected": 2.178485155105591, "step": 10696 }, { "epoch": 1.74, "learning_rate": 2.1031227922898608e-07, "logits/chosen": -0.7854613661766052, "logits/rejected": -0.7686941623687744, "logps/chosen": -66.65973663330078, "logps/rejected": -51.260772705078125, "loss": 1.4001, "rewards/accuracies": 0.0, "rewards/chosen": 1.0067657232284546, "rewards/margins": -0.49425506591796875, "rewards/rejected": 1.5010207891464233, "step": 10697 }, { "epoch": 1.74, "learning_rate": 2.1020516955038115e-07, "logits/chosen": -0.6368311047554016, "logits/rejected": -0.6220183372497559, "logps/chosen": -77.11848449707031, "logps/rejected": -83.48921203613281, "loss": 0.4143, "rewards/accuracies": 0.0, "rewards/chosen": 2.671170949935913, "rewards/margins": -0.19542217254638672, "rewards/rejected": 2.8665931224823, "step": 10698 }, { "epoch": 1.74, "learning_rate": 2.100980798938571e-07, "logits/chosen": -0.850138783454895, "logits/rejected": -0.7913610339164734, "logps/chosen": -202.52159118652344, "logps/rejected": -80.427490234375, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 5.156773567199707, "rewards/margins": 2.37123966217041, "rewards/rejected": 2.785533905029297, "step": 10699 }, { "epoch": 1.74, "learning_rate": 2.099910102668125e-07, "logits/chosen": -0.8112426400184631, "logits/rejected": -0.58610600233078, "logps/chosen": -73.4173812866211, "logps/rejected": -23.956491470336914, "loss": 0.1349, "rewards/accuracies": 1.0, "rewards/chosen": 2.5457656383514404, "rewards/margins": 2.5353574752807617, "rewards/rejected": 0.010408210568130016, "step": 10700 }, { "epoch": 1.74, "learning_rate": 2.0988396067664517e-07, "logits/chosen": -0.5347210168838501, "logits/rejected": -0.5903837084770203, "logps/chosen": -70.0885238647461, "logps/rejected": -65.09698486328125, "loss": 0.5532, "rewards/accuracies": 0.0, "rewards/chosen": 1.763323187828064, "rewards/margins": -0.3163560628890991, "rewards/rejected": 2.079679250717163, "step": 10701 }, { "epoch": 1.74, "learning_rate": 2.0977693113075084e-07, "logits/chosen": -0.7547943592071533, "logits/rejected": -0.6917476058006287, "logps/chosen": -44.07630157470703, "logps/rejected": -65.76653289794922, "loss": 0.2148, "rewards/accuracies": 1.0, "rewards/chosen": 2.5333709716796875, "rewards/margins": 1.2752723693847656, "rewards/rejected": 1.2580986022949219, "step": 10702 }, { "epoch": 1.74, "learning_rate": 2.0966992163652464e-07, "logits/chosen": -0.8766194581985474, "logits/rejected": -0.6592333316802979, "logps/chosen": -124.72975158691406, "logps/rejected": -120.24465942382812, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 3.8637406826019287, "rewards/margins": 1.3499863147735596, "rewards/rejected": 2.513754367828369, "step": 10703 }, { "epoch": 1.74, "learning_rate": 2.0956293220135956e-07, "logits/chosen": -1.1322368383407593, "logits/rejected": -0.9731934070587158, "logps/chosen": -129.5110626220703, "logps/rejected": -20.68655014038086, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 4.701953411102295, "rewards/margins": 4.233032703399658, "rewards/rejected": 0.4689205288887024, "step": 10704 }, { "epoch": 1.74, "learning_rate": 2.0945596283264793e-07, "logits/chosen": -0.8581221103668213, "logits/rejected": -0.772553563117981, "logps/chosen": -60.39225387573242, "logps/rejected": -26.725128173828125, "loss": 0.0379, "rewards/accuracies": 1.0, "rewards/chosen": 3.8404057025909424, "rewards/margins": 2.7013163566589355, "rewards/rejected": 1.1390892267227173, "step": 10705 }, { "epoch": 1.74, "learning_rate": 2.093490135377799e-07, "logits/chosen": -0.6488468050956726, "logits/rejected": -0.6488468050956726, "logps/chosen": -83.29605102539062, "logps/rejected": -83.29605102539062, "loss": 1.3074, "rewards/accuracies": 0.0, "rewards/chosen": 2.2288925647735596, "rewards/margins": 0.0, "rewards/rejected": 2.2288925647735596, "step": 10706 }, { "epoch": 1.74, "learning_rate": 2.0924208432414509e-07, "logits/chosen": -0.944780707359314, "logits/rejected": -0.7636886239051819, "logps/chosen": -181.90811157226562, "logps/rejected": -193.73788452148438, "loss": 0.4409, "rewards/accuracies": 1.0, "rewards/chosen": 5.7601213455200195, "rewards/margins": 0.22707080841064453, "rewards/rejected": 5.533050537109375, "step": 10707 }, { "epoch": 1.74, "learning_rate": 2.0913517519913088e-07, "logits/chosen": -0.6779645085334778, "logits/rejected": -0.6934293508529663, "logps/chosen": -99.96023559570312, "logps/rejected": -86.73002624511719, "loss": 0.7113, "rewards/accuracies": 0.0, "rewards/chosen": 0.18072509765625, "rewards/margins": -1.1094818115234375, "rewards/rejected": 1.2902069091796875, "step": 10708 }, { "epoch": 1.74, "learning_rate": 2.0902828617012402e-07, "logits/chosen": -0.5216848254203796, "logits/rejected": -0.5200778245925903, "logps/chosen": -2.882169723510742, "logps/rejected": -1.5989906787872314, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.35105806589126587, "rewards/margins": 0.09635642170906067, "rewards/rejected": 0.2547016441822052, "step": 10709 }, { "epoch": 1.74, "learning_rate": 2.0892141724450924e-07, "logits/chosen": -0.8767688274383545, "logits/rejected": -0.8137720227241516, "logps/chosen": -106.7500991821289, "logps/rejected": -17.70425033569336, "loss": 1.5872, "rewards/accuracies": 1.0, "rewards/chosen": 1.5275230407714844, "rewards/margins": 1.3225514888763428, "rewards/rejected": 0.20497150719165802, "step": 10710 }, { "epoch": 1.74, "learning_rate": 2.088145684296705e-07, "logits/chosen": -1.0717840194702148, "logits/rejected": -1.0624969005584717, "logps/chosen": -54.5744743347168, "logps/rejected": -74.87738037109375, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 2.3777058124542236, "rewards/margins": 1.5904858112335205, "rewards/rejected": 0.7872200012207031, "step": 10711 }, { "epoch": 1.74, "learning_rate": 2.0870773973298967e-07, "logits/chosen": -0.7018701434135437, "logits/rejected": -0.5571639537811279, "logps/chosen": -122.08020782470703, "logps/rejected": -47.51312255859375, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 5.161811351776123, "rewards/margins": 3.048583507537842, "rewards/rejected": 2.1132278442382812, "step": 10712 }, { "epoch": 1.74, "learning_rate": 2.0860093116184795e-07, "logits/chosen": -0.575450599193573, "logits/rejected": -0.5079718232154846, "logps/chosen": -75.40667724609375, "logps/rejected": -108.31684875488281, "loss": 1.7463, "rewards/accuracies": 0.0, "rewards/chosen": 1.955603837966919, "rewards/margins": -0.24323654174804688, "rewards/rejected": 2.198840379714966, "step": 10713 }, { "epoch": 1.74, "learning_rate": 2.0849414272362448e-07, "logits/chosen": -0.837926983833313, "logits/rejected": -0.8477198481559753, "logps/chosen": -72.20950317382812, "logps/rejected": -54.59774398803711, "loss": 1.6378, "rewards/accuracies": 0.0, "rewards/chosen": 1.1059112548828125, "rewards/margins": -0.7043155431747437, "rewards/rejected": 1.8102267980575562, "step": 10714 }, { "epoch": 1.74, "learning_rate": 2.0838737442569748e-07, "logits/chosen": -0.3556111752986908, "logits/rejected": -0.3556111752986908, "logps/chosen": -35.54965591430664, "logps/rejected": -35.54965591430664, "loss": 1.2152, "rewards/accuracies": 0.0, "rewards/chosen": 1.0395077466964722, "rewards/margins": 0.0, "rewards/rejected": 1.0395077466964722, "step": 10715 }, { "epoch": 1.74, "learning_rate": 2.0828062627544384e-07, "logits/chosen": -1.1029678583145142, "logits/rejected": -0.9754905104637146, "logps/chosen": -95.13883209228516, "logps/rejected": -185.53787231445312, "loss": 0.4106, "rewards/accuracies": 0.0, "rewards/chosen": 4.524385929107666, "rewards/margins": -0.14239883422851562, "rewards/rejected": 4.666784763336182, "step": 10716 }, { "epoch": 1.74, "learning_rate": 2.0817389828023846e-07, "logits/chosen": -0.8739255666732788, "logits/rejected": -0.8609779477119446, "logps/chosen": -76.50010681152344, "logps/rejected": -65.48827362060547, "loss": 0.541, "rewards/accuracies": 0.0, "rewards/chosen": 1.470906138420105, "rewards/margins": -0.2141578197479248, "rewards/rejected": 1.6850639581680298, "step": 10717 }, { "epoch": 1.74, "learning_rate": 2.0806719044745564e-07, "logits/chosen": -0.24224014580249786, "logits/rejected": -0.2473430335521698, "logps/chosen": -1.8168272972106934, "logps/rejected": -1.8686474561691284, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": 0.2139153778553009, "rewards/margins": 0.009357377886772156, "rewards/rejected": 0.20455799996852875, "step": 10718 }, { "epoch": 1.74, "learning_rate": 2.0796050278446748e-07, "logits/chosen": -0.5385925769805908, "logits/rejected": -0.5512136220932007, "logps/chosen": -67.06732940673828, "logps/rejected": -38.692359924316406, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.9198028445243835, "rewards/margins": -0.8001362681388855, "rewards/rejected": 1.719939112663269, "step": 10719 }, { "epoch": 1.74, "learning_rate": 2.078538352986454e-07, "logits/chosen": -0.48935896158218384, "logits/rejected": -0.4810333549976349, "logps/chosen": -59.290931701660156, "logps/rejected": -105.30596160888672, "loss": 0.6151, "rewards/accuracies": 1.0, "rewards/chosen": 1.474951982498169, "rewards/margins": 0.8322166800498962, "rewards/rejected": 0.6427353024482727, "step": 10720 }, { "epoch": 1.74, "learning_rate": 2.0774718799735885e-07, "logits/chosen": -0.7424355149269104, "logits/rejected": -0.7033799290657043, "logps/chosen": -89.77224731445312, "logps/rejected": -59.82014465332031, "loss": 0.6419, "rewards/accuracies": 0.0, "rewards/chosen": 1.7755721807479858, "rewards/margins": -0.3413194417953491, "rewards/rejected": 2.116891622543335, "step": 10721 }, { "epoch": 1.74, "learning_rate": 2.0764056088797645e-07, "logits/chosen": -0.8426046371459961, "logits/rejected": -0.8039524555206299, "logps/chosen": -117.06971740722656, "logps/rejected": -72.46910095214844, "loss": 0.0395, "rewards/accuracies": 1.0, "rewards/chosen": 6.128495693206787, "rewards/margins": 2.783022880554199, "rewards/rejected": 3.345472812652588, "step": 10722 }, { "epoch": 1.74, "learning_rate": 2.0753395397786478e-07, "logits/chosen": -0.8363837599754333, "logits/rejected": -0.8005467653274536, "logps/chosen": -74.86389923095703, "logps/rejected": -58.14582061767578, "loss": 0.0276, "rewards/accuracies": 1.0, "rewards/chosen": 4.3186821937561035, "rewards/margins": 2.942002773284912, "rewards/rejected": 1.3766793012619019, "step": 10723 }, { "epoch": 1.74, "learning_rate": 2.0742736727438976e-07, "logits/chosen": -0.6331992745399475, "logits/rejected": -0.5240927338600159, "logps/chosen": -104.79702758789062, "logps/rejected": -103.97562408447266, "loss": 0.4036, "rewards/accuracies": 0.0, "rewards/chosen": 1.0318481922149658, "rewards/margins": -0.16352307796478271, "rewards/rejected": 1.1953712701797485, "step": 10724 }, { "epoch": 1.74, "learning_rate": 2.073208007849151e-07, "logits/chosen": -1.1743950843811035, "logits/rejected": -1.1237446069717407, "logps/chosen": -70.42659759521484, "logps/rejected": -77.65592956542969, "loss": 0.094, "rewards/accuracies": 1.0, "rewards/chosen": 3.1491858959198, "rewards/margins": 1.5961127281188965, "rewards/rejected": 1.5530731678009033, "step": 10725 }, { "epoch": 1.74, "learning_rate": 2.0721425451680396e-07, "logits/chosen": -1.0859884023666382, "logits/rejected": -0.9664700031280518, "logps/chosen": -73.26011657714844, "logps/rejected": -41.42681884765625, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 0.6840751767158508, "rewards/margins": -0.12205201387405396, "rewards/rejected": 0.8061271905899048, "step": 10726 }, { "epoch": 1.74, "learning_rate": 2.0710772847741725e-07, "logits/chosen": -0.8578936457633972, "logits/rejected": -0.7476765513420105, "logps/chosen": -148.61465454101562, "logps/rejected": -89.93710327148438, "loss": 0.1849, "rewards/accuracies": 1.0, "rewards/chosen": 4.902171611785889, "rewards/margins": 0.8455018997192383, "rewards/rejected": 4.05666971206665, "step": 10727 }, { "epoch": 1.74, "learning_rate": 2.0700122267411535e-07, "logits/chosen": -0.5320590138435364, "logits/rejected": -0.5676267743110657, "logps/chosen": -58.43338394165039, "logps/rejected": -78.50465393066406, "loss": 0.5561, "rewards/accuracies": 0.0, "rewards/chosen": 1.3372395038604736, "rewards/margins": -0.03382217884063721, "rewards/rejected": 1.3710616827011108, "step": 10728 }, { "epoch": 1.74, "learning_rate": 2.0689473711425638e-07, "logits/chosen": -1.059761643409729, "logits/rejected": -1.1989812850952148, "logps/chosen": -147.26171875, "logps/rejected": -250.67019653320312, "loss": 2.7456, "rewards/accuracies": 0.0, "rewards/chosen": 2.511199951171875, "rewards/margins": -3.3497161865234375, "rewards/rejected": 5.8609161376953125, "step": 10729 }, { "epoch": 1.74, "learning_rate": 2.0678827180519787e-07, "logits/chosen": -0.3786739706993103, "logits/rejected": -0.41996055841445923, "logps/chosen": -7.96618127822876, "logps/rejected": -30.27886199951172, "loss": 0.5274, "rewards/accuracies": 0.0, "rewards/chosen": 0.2484159916639328, "rewards/margins": -0.43245065212249756, "rewards/rejected": 0.6808666586875916, "step": 10730 }, { "epoch": 1.74, "learning_rate": 2.0668182675429523e-07, "logits/chosen": -0.8918797969818115, "logits/rejected": -0.8827037811279297, "logps/chosen": -121.17213439941406, "logps/rejected": -83.68583679199219, "loss": 1.8862, "rewards/accuracies": 0.0, "rewards/chosen": 1.6993149518966675, "rewards/margins": -1.8152915239334106, "rewards/rejected": 3.514606475830078, "step": 10731 }, { "epoch": 1.74, "learning_rate": 2.065754019689031e-07, "logits/chosen": -0.27416735887527466, "logits/rejected": -0.2582564353942871, "logps/chosen": -42.4656982421875, "logps/rejected": -85.4866943359375, "loss": 0.3671, "rewards/accuracies": 1.0, "rewards/chosen": 1.5518616437911987, "rewards/margins": 0.8249207139015198, "rewards/rejected": 0.726940929889679, "step": 10732 }, { "epoch": 1.74, "learning_rate": 2.0646899745637414e-07, "logits/chosen": -0.85922771692276, "logits/rejected": -0.8242648243904114, "logps/chosen": -42.73302459716797, "logps/rejected": -48.20716094970703, "loss": 0.2299, "rewards/accuracies": 1.0, "rewards/chosen": 2.2516143321990967, "rewards/margins": 0.6999961137771606, "rewards/rejected": 1.551618218421936, "step": 10733 }, { "epoch": 1.74, "learning_rate": 2.0636261322406018e-07, "logits/chosen": -0.7062804698944092, "logits/rejected": -0.6839005947113037, "logps/chosen": -80.00016784667969, "logps/rejected": -121.00175476074219, "loss": 0.3283, "rewards/accuracies": 1.0, "rewards/chosen": 5.278225898742676, "rewards/margins": 0.10129880905151367, "rewards/rejected": 5.176927089691162, "step": 10734 }, { "epoch": 1.74, "learning_rate": 2.0625624927931107e-07, "logits/chosen": -0.7860225439071655, "logits/rejected": -0.7717747092247009, "logps/chosen": -39.74799346923828, "logps/rejected": -19.95983123779297, "loss": 1.4897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0595035552978516, "rewards/margins": 0.7689085006713867, "rewards/rejected": 0.29059505462646484, "step": 10735 }, { "epoch": 1.74, "learning_rate": 2.0614990562947587e-07, "logits/chosen": -0.9172038435935974, "logits/rejected": -0.8854485154151917, "logps/chosen": -135.8232421875, "logps/rejected": -94.47012329101562, "loss": 0.2104, "rewards/accuracies": 1.0, "rewards/chosen": 4.287623882293701, "rewards/margins": 0.7782626152038574, "rewards/rejected": 3.5093612670898438, "step": 10736 }, { "epoch": 1.74, "learning_rate": 2.060435822819016e-07, "logits/chosen": -0.40098056197166443, "logits/rejected": -0.34032219648361206, "logps/chosen": -54.046791076660156, "logps/rejected": -84.14179992675781, "loss": 0.288, "rewards/accuracies": 1.0, "rewards/chosen": 1.443795084953308, "rewards/margins": 0.7972527146339417, "rewards/rejected": 0.6465423703193665, "step": 10737 }, { "epoch": 1.74, "learning_rate": 2.059372792439345e-07, "logits/chosen": -1.0159889459609985, "logits/rejected": -0.7900192737579346, "logps/chosen": -190.16278076171875, "logps/rejected": -97.67428588867188, "loss": 0.3009, "rewards/accuracies": 1.0, "rewards/chosen": 4.259713649749756, "rewards/margins": 0.21159934997558594, "rewards/rejected": 4.04811429977417, "step": 10738 }, { "epoch": 1.74, "learning_rate": 2.058309965229188e-07, "logits/chosen": -1.2263511419296265, "logits/rejected": -1.1924268007278442, "logps/chosen": -34.157493591308594, "logps/rejected": -20.962343215942383, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": 2.243273973464966, "rewards/margins": 1.8355807065963745, "rewards/rejected": 0.4076932966709137, "step": 10739 }, { "epoch": 1.74, "learning_rate": 2.0572473412619794e-07, "logits/chosen": -0.4796759784221649, "logits/rejected": -0.4715431034564972, "logps/chosen": -81.25935363769531, "logps/rejected": -93.80130004882812, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 2.5234551429748535, "rewards/margins": 0.9569412469863892, "rewards/rejected": 1.5665138959884644, "step": 10740 }, { "epoch": 1.74, "learning_rate": 2.0561849206111337e-07, "logits/chosen": -0.9392452239990234, "logits/rejected": -0.8604116439819336, "logps/chosen": -220.26254272460938, "logps/rejected": -27.281869888305664, "loss": 0.1411, "rewards/accuracies": 1.0, "rewards/chosen": 5.212805271148682, "rewards/margins": 4.758502960205078, "rewards/rejected": 0.4543024003505707, "step": 10741 }, { "epoch": 1.74, "learning_rate": 2.0551227033500568e-07, "logits/chosen": -1.1275168657302856, "logits/rejected": -1.1751444339752197, "logps/chosen": -202.0977783203125, "logps/rejected": -80.14517211914062, "loss": 0.2862, "rewards/accuracies": 1.0, "rewards/chosen": 4.278863430023193, "rewards/margins": 1.0813672542572021, "rewards/rejected": 3.197496175765991, "step": 10742 }, { "epoch": 1.74, "learning_rate": 2.0540606895521344e-07, "logits/chosen": -0.18642264604568481, "logits/rejected": -0.20380660891532898, "logps/chosen": -11.444538116455078, "logps/rejected": -56.664146423339844, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": -0.16555319726467133, "rewards/margins": 0.10286416113376617, "rewards/rejected": -0.2684173583984375, "step": 10743 }, { "epoch": 1.74, "learning_rate": 2.0529988792907455e-07, "logits/chosen": -0.48580724000930786, "logits/rejected": -0.46275264024734497, "logps/chosen": -49.22974395751953, "logps/rejected": -19.381214141845703, "loss": 1.7315, "rewards/accuracies": 1.0, "rewards/chosen": 0.3606143891811371, "rewards/margins": 0.0625995397567749, "rewards/rejected": 0.2980148494243622, "step": 10744 }, { "epoch": 1.74, "learning_rate": 2.0519372726392477e-07, "logits/chosen": -1.2053413391113281, "logits/rejected": -1.0092271566390991, "logps/chosen": -146.69418334960938, "logps/rejected": -71.92156982421875, "loss": 0.086, "rewards/accuracies": 1.0, "rewards/chosen": 5.086181640625, "rewards/margins": 1.8726866245269775, "rewards/rejected": 3.2134950160980225, "step": 10745 }, { "epoch": 1.74, "learning_rate": 2.050875869670991e-07, "logits/chosen": -0.8257432579994202, "logits/rejected": -0.7018716335296631, "logps/chosen": -198.79061889648438, "logps/rejected": -27.573434829711914, "loss": 0.9225, "rewards/accuracies": 1.0, "rewards/chosen": 4.369476318359375, "rewards/margins": 4.176670551300049, "rewards/rejected": 0.19280587136745453, "step": 10746 }, { "epoch": 1.74, "learning_rate": 2.0498146704593055e-07, "logits/chosen": -0.23349423706531525, "logits/rejected": -0.30588653683662415, "logps/chosen": -86.34175109863281, "logps/rejected": -54.82392883300781, "loss": 0.7877, "rewards/accuracies": 0.0, "rewards/chosen": 1.23509681224823, "rewards/margins": -1.1708937883377075, "rewards/rejected": 2.4059906005859375, "step": 10747 }, { "epoch": 1.74, "learning_rate": 2.048753675077513e-07, "logits/chosen": -0.9868435859680176, "logits/rejected": -1.0066070556640625, "logps/chosen": -131.326416015625, "logps/rejected": -65.32138061523438, "loss": 0.435, "rewards/accuracies": 0.0, "rewards/chosen": 4.41133451461792, "rewards/margins": -0.29405641555786133, "rewards/rejected": 4.705390930175781, "step": 10748 }, { "epoch": 1.74, "learning_rate": 2.0476928835989166e-07, "logits/chosen": -0.46607327461242676, "logits/rejected": -0.46353068947792053, "logps/chosen": -40.762962341308594, "logps/rejected": -57.7535400390625, "loss": 0.9666, "rewards/accuracies": 0.0, "rewards/chosen": 1.2882969379425049, "rewards/margins": -0.8984761238098145, "rewards/rejected": 2.1867730617523193, "step": 10749 }, { "epoch": 1.74, "learning_rate": 2.0466322960968047e-07, "logits/chosen": -0.6176028251647949, "logits/rejected": -0.6037272214889526, "logps/chosen": -54.91847229003906, "logps/rejected": -80.00802612304688, "loss": 0.9405, "rewards/accuracies": 1.0, "rewards/chosen": 2.202763319015503, "rewards/margins": 0.19220256805419922, "rewards/rejected": 2.0105607509613037, "step": 10750 }, { "epoch": 1.75, "learning_rate": 2.045571912644458e-07, "logits/chosen": -0.951202392578125, "logits/rejected": -0.8061058521270752, "logps/chosen": -164.55007934570312, "logps/rejected": -138.39581298828125, "loss": 1.9875, "rewards/accuracies": 1.0, "rewards/chosen": 5.608010768890381, "rewards/margins": 0.132415771484375, "rewards/rejected": 5.475594997406006, "step": 10751 }, { "epoch": 1.75, "learning_rate": 2.0445117333151358e-07, "logits/chosen": -1.0354595184326172, "logits/rejected": -1.0185301303863525, "logps/chosen": -60.475189208984375, "logps/rejected": -60.172332763671875, "loss": 0.7634, "rewards/accuracies": 0.0, "rewards/chosen": 0.9654617309570312, "rewards/margins": -1.2760612964630127, "rewards/rejected": 2.241523027420044, "step": 10752 }, { "epoch": 1.75, "learning_rate": 2.043451758182089e-07, "logits/chosen": -0.3097080886363983, "logits/rejected": -0.20371510088443756, "logps/chosen": -59.24428939819336, "logps/rejected": -65.76365661621094, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 2.484314441680908, "rewards/margins": 1.0371700525283813, "rewards/rejected": 1.4471443891525269, "step": 10753 }, { "epoch": 1.75, "learning_rate": 2.0423919873185498e-07, "logits/chosen": -0.5600987672805786, "logits/rejected": -0.5058439373970032, "logps/chosen": -53.07270050048828, "logps/rejected": -100.7930908203125, "loss": 0.6177, "rewards/accuracies": 1.0, "rewards/chosen": 1.1988579034805298, "rewards/margins": 1.3835670948028564, "rewards/rejected": -0.18470917642116547, "step": 10754 }, { "epoch": 1.75, "learning_rate": 2.04133242079774e-07, "logits/chosen": -0.6708142161369324, "logits/rejected": -0.628365159034729, "logps/chosen": -94.41441345214844, "logps/rejected": -145.05612182617188, "loss": 1.7388, "rewards/accuracies": 0.0, "rewards/chosen": 1.4977775812149048, "rewards/margins": -2.720719337463379, "rewards/rejected": 4.218496799468994, "step": 10755 }, { "epoch": 1.75, "learning_rate": 2.040273058692863e-07, "logits/chosen": -0.8021575212478638, "logits/rejected": -0.6312777996063232, "logps/chosen": -71.19308471679688, "logps/rejected": -10.822263717651367, "loss": 0.8139, "rewards/accuracies": 1.0, "rewards/chosen": 2.256906270980835, "rewards/margins": 1.8341162204742432, "rewards/rejected": 0.4227900505065918, "step": 10756 }, { "epoch": 1.75, "learning_rate": 2.0392139010771146e-07, "logits/chosen": -0.9058411121368408, "logits/rejected": -0.8314200639724731, "logps/chosen": -60.624420166015625, "logps/rejected": -87.58332824707031, "loss": 1.3453, "rewards/accuracies": 1.0, "rewards/chosen": 3.1007964611053467, "rewards/margins": 1.030576229095459, "rewards/rejected": 2.0702202320098877, "step": 10757 }, { "epoch": 1.75, "learning_rate": 2.038154948023668e-07, "logits/chosen": -0.9578402638435364, "logits/rejected": -0.840569019317627, "logps/chosen": -85.31169891357422, "logps/rejected": -33.60838317871094, "loss": 0.2925, "rewards/accuracies": 1.0, "rewards/chosen": 1.4423240423202515, "rewards/margins": 0.2703547477722168, "rewards/rejected": 1.1719692945480347, "step": 10758 }, { "epoch": 1.75, "learning_rate": 2.0370961996056917e-07, "logits/chosen": -0.6836600303649902, "logits/rejected": -0.7431470155715942, "logps/chosen": -68.65675354003906, "logps/rejected": -87.93476104736328, "loss": 0.4697, "rewards/accuracies": 1.0, "rewards/chosen": 2.1879754066467285, "rewards/margins": 0.4000931978225708, "rewards/rejected": 1.7878822088241577, "step": 10759 }, { "epoch": 1.75, "learning_rate": 2.036037655896331e-07, "logits/chosen": -0.6296863555908203, "logits/rejected": -0.5392645597457886, "logps/chosen": -30.191650390625, "logps/rejected": -12.377105712890625, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 1.2913856506347656, "rewards/margins": 0.3277549743652344, "rewards/rejected": 0.9636306762695312, "step": 10760 }, { "epoch": 1.75, "learning_rate": 2.0349793169687246e-07, "logits/chosen": -0.9445214867591858, "logits/rejected": -0.5567293167114258, "logps/chosen": -7.774082183837891, "logps/rejected": -190.9876708984375, "loss": 2.6015, "rewards/accuracies": 0.0, "rewards/chosen": 0.1277731955051422, "rewards/margins": -3.190192461013794, "rewards/rejected": 3.3179657459259033, "step": 10761 }, { "epoch": 1.75, "learning_rate": 2.03392118289599e-07, "logits/chosen": -0.5762351751327515, "logits/rejected": -0.5762351751327515, "logps/chosen": -50.30097198486328, "logps/rejected": -50.30097198486328, "loss": 0.3865, "rewards/accuracies": 0.0, "rewards/chosen": 2.8145179748535156, "rewards/margins": 0.0, "rewards/rejected": 2.8145179748535156, "step": 10762 }, { "epoch": 1.75, "learning_rate": 2.0328632537512387e-07, "logits/chosen": -0.9428321719169617, "logits/rejected": -0.8460928201675415, "logps/chosen": -76.39060974121094, "logps/rejected": -130.34429931640625, "loss": 1.8483, "rewards/accuracies": 0.0, "rewards/chosen": 2.9850165843963623, "rewards/margins": -3.61222767829895, "rewards/rejected": 6.5972442626953125, "step": 10763 }, { "epoch": 1.75, "learning_rate": 2.0318055296075587e-07, "logits/chosen": -0.7776039838790894, "logits/rejected": -0.4348706007003784, "logps/chosen": -144.94027709960938, "logps/rejected": -93.93632507324219, "loss": 1.2942, "rewards/accuracies": 1.0, "rewards/chosen": 3.5368728637695312, "rewards/margins": 0.3011748790740967, "rewards/rejected": 3.2356979846954346, "step": 10764 }, { "epoch": 1.75, "learning_rate": 2.0307480105380338e-07, "logits/chosen": -1.1082850694656372, "logits/rejected": -0.9884747266769409, "logps/chosen": -142.83155822753906, "logps/rejected": -75.47866821289062, "loss": 0.3877, "rewards/accuracies": 1.0, "rewards/chosen": 3.790823459625244, "rewards/margins": 2.1280555725097656, "rewards/rejected": 1.662767767906189, "step": 10765 }, { "epoch": 1.75, "learning_rate": 2.029690696615724e-07, "logits/chosen": -0.5300865173339844, "logits/rejected": -0.4987807273864746, "logps/chosen": -73.92233276367188, "logps/rejected": -75.2780990600586, "loss": 0.3504, "rewards/accuracies": 1.0, "rewards/chosen": 1.5715621709823608, "rewards/margins": 0.10299301147460938, "rewards/rejected": 1.4685691595077515, "step": 10766 }, { "epoch": 1.75, "learning_rate": 2.0286335879136833e-07, "logits/chosen": -0.6774681210517883, "logits/rejected": -0.5856204032897949, "logps/chosen": -71.92929077148438, "logps/rejected": -80.99807739257812, "loss": 0.925, "rewards/accuracies": 1.0, "rewards/chosen": 2.317145586013794, "rewards/margins": 0.3195427656173706, "rewards/rejected": 1.9976028203964233, "step": 10767 }, { "epoch": 1.75, "learning_rate": 2.0275766845049445e-07, "logits/chosen": -0.6983503103256226, "logits/rejected": -0.6920821070671082, "logps/chosen": -97.05535888671875, "logps/rejected": -132.50582885742188, "loss": 0.5542, "rewards/accuracies": 0.0, "rewards/chosen": 0.8396331667900085, "rewards/margins": -0.6969925761222839, "rewards/rejected": 1.5366257429122925, "step": 10768 }, { "epoch": 1.75, "learning_rate": 2.0265199864625337e-07, "logits/chosen": -0.6568626165390015, "logits/rejected": -0.6996760368347168, "logps/chosen": -64.699951171875, "logps/rejected": -81.46989440917969, "loss": 0.4914, "rewards/accuracies": 1.0, "rewards/chosen": 2.9358794689178467, "rewards/margins": 0.23444366455078125, "rewards/rejected": 2.7014358043670654, "step": 10769 }, { "epoch": 1.75, "learning_rate": 2.025463493859455e-07, "logits/chosen": -0.731835126876831, "logits/rejected": -0.8095952272415161, "logps/chosen": -198.1370849609375, "logps/rejected": -107.2070541381836, "loss": 1.9114, "rewards/accuracies": 0.0, "rewards/chosen": 1.4565337896347046, "rewards/margins": -3.125311851501465, "rewards/rejected": 4.581845760345459, "step": 10770 }, { "epoch": 1.75, "learning_rate": 2.024407206768703e-07, "logits/chosen": -0.8247441649436951, "logits/rejected": -0.754224419593811, "logps/chosen": -122.28208923339844, "logps/rejected": -67.15264892578125, "loss": 1.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.7482712268829346, "rewards/margins": 0.01313173770904541, "rewards/rejected": 1.7351394891738892, "step": 10771 }, { "epoch": 1.75, "learning_rate": 2.0233511252632595e-07, "logits/chosen": -0.8621523380279541, "logits/rejected": -0.8163528442382812, "logps/chosen": -90.62700653076172, "logps/rejected": -61.70399475097656, "loss": 0.8204, "rewards/accuracies": 0.0, "rewards/chosen": 1.5908790826797485, "rewards/margins": -0.9964486360549927, "rewards/rejected": 2.587327718734741, "step": 10772 }, { "epoch": 1.75, "learning_rate": 2.0222952494160862e-07, "logits/chosen": -0.6256519556045532, "logits/rejected": -0.6212571263313293, "logps/chosen": -83.39007568359375, "logps/rejected": -58.26795959472656, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 2.222646474838257, "rewards/margins": 1.0387704372406006, "rewards/rejected": 1.1838760375976562, "step": 10773 }, { "epoch": 1.75, "learning_rate": 2.021239579300138e-07, "logits/chosen": -0.6706148982048035, "logits/rejected": -0.6168915629386902, "logps/chosen": -57.082916259765625, "logps/rejected": -27.697582244873047, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 2.2047622203826904, "rewards/margins": 0.9859973192214966, "rewards/rejected": 1.2187649011611938, "step": 10774 }, { "epoch": 1.75, "learning_rate": 2.020184114988347e-07, "logits/chosen": -0.7297842502593994, "logits/rejected": -0.6847984790802002, "logps/chosen": -88.92388916015625, "logps/rejected": -34.35662841796875, "loss": 0.6685, "rewards/accuracies": 0.0, "rewards/chosen": 0.9234825372695923, "rewards/margins": -0.7487941980361938, "rewards/rejected": 1.6722767353057861, "step": 10775 }, { "epoch": 1.75, "learning_rate": 2.0191288565536408e-07, "logits/chosen": -1.031601071357727, "logits/rejected": -1.12589693069458, "logps/chosen": -196.65762329101562, "logps/rejected": -126.69404602050781, "loss": 0.6928, "rewards/accuracies": 0.0, "rewards/chosen": 5.484729290008545, "rewards/margins": -1.042144775390625, "rewards/rejected": 6.52687406539917, "step": 10776 }, { "epoch": 1.75, "learning_rate": 2.0180738040689232e-07, "logits/chosen": -0.9469200968742371, "logits/rejected": -0.6860086917877197, "logps/chosen": -266.59027099609375, "logps/rejected": -123.99885559082031, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 3.9004058837890625, "rewards/margins": -0.3459625244140625, "rewards/rejected": 4.246368408203125, "step": 10777 }, { "epoch": 1.75, "learning_rate": 2.0170189576070928e-07, "logits/chosen": -0.7318376898765564, "logits/rejected": -0.5850952863693237, "logps/chosen": -53.26477813720703, "logps/rejected": -19.385549545288086, "loss": 0.2921, "rewards/accuracies": 1.0, "rewards/chosen": 1.2381744384765625, "rewards/margins": 0.9501937627792358, "rewards/rejected": 0.2879806458950043, "step": 10778 }, { "epoch": 1.75, "learning_rate": 2.015964317241025e-07, "logits/chosen": -1.0300383567810059, "logits/rejected": -0.9175193905830383, "logps/chosen": -182.79379272460938, "logps/rejected": -67.58833312988281, "loss": 0.7924, "rewards/accuracies": 0.0, "rewards/chosen": 0.5477340817451477, "rewards/margins": -1.0861892700195312, "rewards/rejected": 1.6339234113693237, "step": 10779 }, { "epoch": 1.75, "learning_rate": 2.0149098830435895e-07, "logits/chosen": -0.7493352293968201, "logits/rejected": -0.7242205739021301, "logps/chosen": -27.232826232910156, "logps/rejected": -68.23899841308594, "loss": 0.6228, "rewards/accuracies": 0.0, "rewards/chosen": 1.3450138568878174, "rewards/margins": -0.746110200881958, "rewards/rejected": 2.0911240577697754, "step": 10780 }, { "epoch": 1.75, "learning_rate": 2.013855655087634e-07, "logits/chosen": -0.852478563785553, "logits/rejected": -0.7657701373100281, "logps/chosen": -42.7954216003418, "logps/rejected": -154.3173828125, "loss": 1.5619, "rewards/accuracies": 1.0, "rewards/chosen": 1.786387324333191, "rewards/margins": 0.7624554634094238, "rewards/rejected": 1.023931860923767, "step": 10781 }, { "epoch": 1.75, "learning_rate": 2.0128016334459997e-07, "logits/chosen": -0.6126457452774048, "logits/rejected": -0.5977505445480347, "logps/chosen": -3.9389891624450684, "logps/rejected": -44.32860565185547, "loss": 0.9179, "rewards/accuracies": 1.0, "rewards/chosen": 0.5263923406600952, "rewards/margins": 0.3863300085067749, "rewards/rejected": 0.1400623321533203, "step": 10782 }, { "epoch": 1.75, "learning_rate": 2.0117478181915053e-07, "logits/chosen": -0.8240094780921936, "logits/rejected": -0.8370700478553772, "logps/chosen": -84.94413757324219, "logps/rejected": -70.89984130859375, "loss": 1.6725, "rewards/accuracies": 0.0, "rewards/chosen": 1.1270493268966675, "rewards/margins": -2.167880058288574, "rewards/rejected": 3.2949295043945312, "step": 10783 }, { "epoch": 1.75, "learning_rate": 2.0106942093969637e-07, "logits/chosen": -0.5275450944900513, "logits/rejected": -0.5760036706924438, "logps/chosen": -74.53211975097656, "logps/rejected": -90.89743041992188, "loss": 1.8074, "rewards/accuracies": 0.0, "rewards/chosen": 1.3412468433380127, "rewards/margins": -1.5666587352752686, "rewards/rejected": 2.9079055786132812, "step": 10784 }, { "epoch": 1.75, "learning_rate": 2.009640807135165e-07, "logits/chosen": -0.6608006954193115, "logits/rejected": -0.6980764269828796, "logps/chosen": -52.4311408996582, "logps/rejected": -59.28412628173828, "loss": 0.8239, "rewards/accuracies": 0.0, "rewards/chosen": 1.4465099573135376, "rewards/margins": -1.1342731714248657, "rewards/rejected": 2.5807831287384033, "step": 10785 }, { "epoch": 1.75, "learning_rate": 2.0085876114788935e-07, "logits/chosen": -1.068626880645752, "logits/rejected": -1.1150659322738647, "logps/chosen": -189.46096801757812, "logps/rejected": -25.601158142089844, "loss": 0.0604, "rewards/accuracies": 1.0, "rewards/chosen": 4.263156414031982, "rewards/margins": 4.302058696746826, "rewards/rejected": -0.03890209272503853, "step": 10786 }, { "epoch": 1.75, "learning_rate": 2.007534622500911e-07, "logits/chosen": -0.6132063865661621, "logits/rejected": -0.6132063865661621, "logps/chosen": -26.571674346923828, "logps/rejected": -26.571674346923828, "loss": 1.0138, "rewards/accuracies": 0.0, "rewards/chosen": 1.7227963209152222, "rewards/margins": 0.0, "rewards/rejected": 1.7227963209152222, "step": 10787 }, { "epoch": 1.75, "learning_rate": 2.006481840273973e-07, "logits/chosen": -0.7101580500602722, "logits/rejected": -0.6701302528381348, "logps/chosen": -49.359107971191406, "logps/rejected": -35.99618148803711, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": 1.9842636585235596, "rewards/margins": 0.4399524927139282, "rewards/rejected": 1.5443111658096313, "step": 10788 }, { "epoch": 1.75, "learning_rate": 2.0054292648708131e-07, "logits/chosen": -0.45916780829429626, "logits/rejected": -0.4989279806613922, "logps/chosen": -31.122024536132812, "logps/rejected": -90.63224792480469, "loss": 1.4879, "rewards/accuracies": 0.0, "rewards/chosen": 0.6606811881065369, "rewards/margins": -0.12829816341400146, "rewards/rejected": 0.7889793515205383, "step": 10789 }, { "epoch": 1.75, "learning_rate": 2.0043768963641582e-07, "logits/chosen": -0.14797553420066833, "logits/rejected": -0.04886406660079956, "logps/chosen": -51.114906311035156, "logps/rejected": -3.062042236328125, "loss": 0.8241, "rewards/accuracies": 0.0, "rewards/chosen": 0.20342408120632172, "rewards/margins": -0.057664379477500916, "rewards/rejected": 0.26108846068382263, "step": 10790 }, { "epoch": 1.75, "learning_rate": 2.003324734826713e-07, "logits/chosen": -0.6791378259658813, "logits/rejected": -0.7207925915718079, "logps/chosen": -81.10720825195312, "logps/rejected": -133.68023681640625, "loss": 1.6468, "rewards/accuracies": 0.0, "rewards/chosen": 2.3454010486602783, "rewards/margins": -2.9013917446136475, "rewards/rejected": 5.246792793273926, "step": 10791 }, { "epoch": 1.75, "learning_rate": 2.0022727803311757e-07, "logits/chosen": -0.7340337038040161, "logits/rejected": -0.6167930364608765, "logps/chosen": -39.896263122558594, "logps/rejected": -72.80736541748047, "loss": 1.1147, "rewards/accuracies": 1.0, "rewards/chosen": 2.668715715408325, "rewards/margins": 0.05868840217590332, "rewards/rejected": 2.610027313232422, "step": 10792 }, { "epoch": 1.75, "learning_rate": 2.001221032950222e-07, "logits/chosen": -0.9758320450782776, "logits/rejected": -0.8727685213088989, "logps/chosen": -104.21542358398438, "logps/rejected": -66.8951416015625, "loss": 0.9573, "rewards/accuracies": 0.0, "rewards/chosen": 0.5825851559638977, "rewards/margins": -0.8722373843193054, "rewards/rejected": 1.4548225402832031, "step": 10793 }, { "epoch": 1.75, "learning_rate": 2.0001694927565227e-07, "logits/chosen": -0.5771075487136841, "logits/rejected": -0.602759599685669, "logps/chosen": -46.16033172607422, "logps/rejected": -80.9476089477539, "loss": 1.299, "rewards/accuracies": 0.0, "rewards/chosen": 0.5826076865196228, "rewards/margins": -0.2437419891357422, "rewards/rejected": 0.826349675655365, "step": 10794 }, { "epoch": 1.75, "learning_rate": 1.9991181598227247e-07, "logits/chosen": -0.7457588911056519, "logits/rejected": -0.6956600546836853, "logps/chosen": -37.26469802856445, "logps/rejected": -12.361855506896973, "loss": 0.9056, "rewards/accuracies": 0.0, "rewards/chosen": 0.3870304226875305, "rewards/margins": -0.748279869556427, "rewards/rejected": 1.1353102922439575, "step": 10795 }, { "epoch": 1.75, "learning_rate": 1.9980670342214695e-07, "logits/chosen": -0.47287023067474365, "logits/rejected": -0.44868162274360657, "logps/chosen": -85.71040344238281, "logps/rejected": -66.02743530273438, "loss": 0.5471, "rewards/accuracies": 0.0, "rewards/chosen": 2.170307159423828, "rewards/margins": -0.6414268016815186, "rewards/rejected": 2.8117339611053467, "step": 10796 }, { "epoch": 1.75, "learning_rate": 1.9970161160253756e-07, "logits/chosen": -0.8383339047431946, "logits/rejected": -0.7399907112121582, "logps/chosen": -89.36811828613281, "logps/rejected": -59.25484085083008, "loss": 1.1193, "rewards/accuracies": 0.0, "rewards/chosen": 0.6728668212890625, "rewards/margins": -0.9715816974639893, "rewards/rejected": 1.6444485187530518, "step": 10797 }, { "epoch": 1.75, "learning_rate": 1.995965405307055e-07, "logits/chosen": -0.6182031631469727, "logits/rejected": -0.6988764405250549, "logps/chosen": -89.24418640136719, "logps/rejected": -201.160888671875, "loss": 1.636, "rewards/accuracies": 0.0, "rewards/chosen": 2.461437940597534, "rewards/margins": -3.228503465652466, "rewards/rejected": 5.68994140625, "step": 10798 }, { "epoch": 1.75, "learning_rate": 1.9949149021390994e-07, "logits/chosen": -1.0040556192398071, "logits/rejected": -1.075298547744751, "logps/chosen": -178.96360778808594, "logps/rejected": -63.51609802246094, "loss": 0.0517, "rewards/accuracies": 1.0, "rewards/chosen": 3.252241611480713, "rewards/margins": 2.2446417808532715, "rewards/rejected": 1.0075997114181519, "step": 10799 }, { "epoch": 1.75, "learning_rate": 1.9938646065940912e-07, "logits/chosen": -0.7324373722076416, "logits/rejected": -0.699079155921936, "logps/chosen": -115.2973403930664, "logps/rejected": -90.4852066040039, "loss": 0.3972, "rewards/accuracies": 1.0, "rewards/chosen": 5.864818572998047, "rewards/margins": 2.557356119155884, "rewards/rejected": 3.307462453842163, "step": 10800 }, { "epoch": 1.75, "learning_rate": 1.992814518744592e-07, "logits/chosen": -0.8585118055343628, "logits/rejected": -0.8695387840270996, "logps/chosen": -111.85212707519531, "logps/rejected": -57.82864761352539, "loss": 0.5731, "rewards/accuracies": 0.0, "rewards/chosen": 0.7470169067382812, "rewards/margins": -0.6330165863037109, "rewards/rejected": 1.3800334930419922, "step": 10801 }, { "epoch": 1.75, "learning_rate": 1.9917646386631575e-07, "logits/chosen": -0.48546579480171204, "logits/rejected": -0.5380499362945557, "logps/chosen": -65.64593505859375, "logps/rejected": -44.592628479003906, "loss": 2.0555, "rewards/accuracies": 0.0, "rewards/chosen": 1.085261583328247, "rewards/margins": -0.8435112237930298, "rewards/rejected": 1.9287728071212769, "step": 10802 }, { "epoch": 1.75, "learning_rate": 1.9907149664223205e-07, "logits/chosen": -0.9796473979949951, "logits/rejected": -0.8979489207267761, "logps/chosen": -78.39604187011719, "logps/rejected": -22.656051635742188, "loss": 0.2234, "rewards/accuracies": 1.0, "rewards/chosen": 1.5620125532150269, "rewards/margins": 0.7288326621055603, "rewards/rejected": 0.8331798911094666, "step": 10803 }, { "epoch": 1.75, "learning_rate": 1.9896655020946073e-07, "logits/chosen": -0.5672740936279297, "logits/rejected": -0.6343508958816528, "logps/chosen": -100.45335388183594, "logps/rejected": -45.971309661865234, "loss": 0.6625, "rewards/accuracies": 0.0, "rewards/chosen": 1.6669906377792358, "rewards/margins": -0.9193042516708374, "rewards/rejected": 2.5862948894500732, "step": 10804 }, { "epoch": 1.75, "learning_rate": 1.988616245752522e-07, "logits/chosen": -0.9832066893577576, "logits/rejected": -0.9964426755905151, "logps/chosen": -161.68154907226562, "logps/rejected": -125.08683776855469, "loss": 0.1627, "rewards/accuracies": 1.0, "rewards/chosen": 5.287205696105957, "rewards/margins": 1.2435317039489746, "rewards/rejected": 4.043673992156982, "step": 10805 }, { "epoch": 1.75, "learning_rate": 1.9875671974685597e-07, "logits/chosen": -0.5840548872947693, "logits/rejected": -0.5840548872947693, "logps/chosen": -75.99877166748047, "logps/rejected": -75.99877166748047, "loss": 1.2425, "rewards/accuracies": 0.0, "rewards/chosen": 2.793565511703491, "rewards/margins": 0.0, "rewards/rejected": 2.793565511703491, "step": 10806 }, { "epoch": 1.75, "learning_rate": 1.9865183573152021e-07, "logits/chosen": -0.7292464971542358, "logits/rejected": -0.7006170153617859, "logps/chosen": -162.59945678710938, "logps/rejected": -78.2896499633789, "loss": 0.6001, "rewards/accuracies": 0.0, "rewards/chosen": 0.9311721920967102, "rewards/margins": -0.11676865816116333, "rewards/rejected": 1.0479408502578735, "step": 10807 }, { "epoch": 1.75, "learning_rate": 1.9854697253649104e-07, "logits/chosen": -0.6781920194625854, "logits/rejected": -0.6486232876777649, "logps/chosen": -121.41490936279297, "logps/rejected": -96.25816345214844, "loss": 0.8662, "rewards/accuracies": 0.0, "rewards/chosen": 0.22465820610523224, "rewards/margins": -1.1767470836639404, "rewards/rejected": 1.4014053344726562, "step": 10808 }, { "epoch": 1.75, "learning_rate": 1.984421301690139e-07, "logits/chosen": -0.7296221852302551, "logits/rejected": -0.7853918671607971, "logps/chosen": -39.420372009277344, "logps/rejected": -142.2412872314453, "loss": 3.3898, "rewards/accuracies": 0.0, "rewards/chosen": 1.5515655279159546, "rewards/margins": -3.6524124145507812, "rewards/rejected": 5.203978061676025, "step": 10809 }, { "epoch": 1.75, "learning_rate": 1.9833730863633196e-07, "logits/chosen": -0.4044223427772522, "logits/rejected": -0.4360414445400238, "logps/chosen": -81.87225341796875, "logps/rejected": -124.41409301757812, "loss": 0.6051, "rewards/accuracies": 0.0, "rewards/chosen": 1.3687270879745483, "rewards/margins": -0.8202515840530396, "rewards/rejected": 2.188978672027588, "step": 10810 }, { "epoch": 1.75, "learning_rate": 1.9823250794568792e-07, "logits/chosen": -0.8688339591026306, "logits/rejected": -0.7243927717208862, "logps/chosen": -129.73257446289062, "logps/rejected": -76.6014633178711, "loss": 0.1672, "rewards/accuracies": 1.0, "rewards/chosen": 7.1223602294921875, "rewards/margins": 4.76817512512207, "rewards/rejected": 2.354184865951538, "step": 10811 }, { "epoch": 1.75, "learning_rate": 1.9812772810432193e-07, "logits/chosen": -0.9088097810745239, "logits/rejected": -0.7568972110748291, "logps/chosen": -118.623779296875, "logps/rejected": -44.38795852661133, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": 5.498227119445801, "rewards/margins": 4.227910041809082, "rewards/rejected": 1.2703170776367188, "step": 10812 }, { "epoch": 1.76, "learning_rate": 1.9802296911947386e-07, "logits/chosen": -0.4534142315387726, "logits/rejected": -0.35496267676353455, "logps/chosen": -41.73908996582031, "logps/rejected": -33.85873794555664, "loss": 0.4098, "rewards/accuracies": 1.0, "rewards/chosen": 1.2096554040908813, "rewards/margins": 0.1587066650390625, "rewards/rejected": 1.0509487390518188, "step": 10813 }, { "epoch": 1.76, "learning_rate": 1.9791823099838106e-07, "logits/chosen": -0.7195221781730652, "logits/rejected": -0.7119538187980652, "logps/chosen": -131.65834045410156, "logps/rejected": -159.61700439453125, "loss": 1.0584, "rewards/accuracies": 0.0, "rewards/chosen": 5.653087139129639, "rewards/margins": -1.95341157913208, "rewards/rejected": 7.606498718261719, "step": 10814 }, { "epoch": 1.76, "learning_rate": 1.9781351374828037e-07, "logits/chosen": -0.5590139627456665, "logits/rejected": -0.43710142374038696, "logps/chosen": -67.38490295410156, "logps/rejected": -58.095237731933594, "loss": 0.2265, "rewards/accuracies": 1.0, "rewards/chosen": 2.169058322906494, "rewards/margins": 2.189324378967285, "rewards/rejected": -0.020265961065888405, "step": 10815 }, { "epoch": 1.76, "learning_rate": 1.9770881737640637e-07, "logits/chosen": -0.7828915119171143, "logits/rejected": -0.7417554259300232, "logps/chosen": -120.7336196899414, "logps/rejected": -47.392127990722656, "loss": 0.5457, "rewards/accuracies": 0.0, "rewards/chosen": 2.1735846996307373, "rewards/margins": -0.44013524055480957, "rewards/rejected": 2.613719940185547, "step": 10816 }, { "epoch": 1.76, "learning_rate": 1.9760414188999296e-07, "logits/chosen": -0.4009600579738617, "logits/rejected": -0.2048695832490921, "logps/chosen": -97.13955688476562, "logps/rejected": -60.720314025878906, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": 2.6703224182128906, "rewards/margins": 2.4539847373962402, "rewards/rejected": 0.21633759140968323, "step": 10817 }, { "epoch": 1.76, "learning_rate": 1.9749948729627186e-07, "logits/chosen": -0.5821056365966797, "logits/rejected": -0.4181825518608093, "logps/chosen": -53.960716247558594, "logps/rejected": -25.491615295410156, "loss": 0.2024, "rewards/accuracies": 1.0, "rewards/chosen": 1.5070946216583252, "rewards/margins": 1.450582504272461, "rewards/rejected": 0.05651206895709038, "step": 10818 }, { "epoch": 1.76, "learning_rate": 1.973948536024741e-07, "logits/chosen": -0.6388022303581238, "logits/rejected": -0.5227475166320801, "logps/chosen": -118.83305358886719, "logps/rejected": -67.47451782226562, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 1.1680634021759033, "rewards/margins": 0.4013855457305908, "rewards/rejected": 0.7666778564453125, "step": 10819 }, { "epoch": 1.76, "learning_rate": 1.972902408158285e-07, "logits/chosen": -0.7748082876205444, "logits/rejected": -0.7288756966590881, "logps/chosen": -53.96656799316406, "logps/rejected": -58.30207824707031, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 1.6191139221191406, "rewards/margins": 0.2373443841934204, "rewards/rejected": 1.3817695379257202, "step": 10820 }, { "epoch": 1.76, "learning_rate": 1.971856489435632e-07, "logits/chosen": -0.9320145845413208, "logits/rejected": -0.8766303062438965, "logps/chosen": -88.45436096191406, "logps/rejected": -76.05436706542969, "loss": 0.3611, "rewards/accuracies": 1.0, "rewards/chosen": 3.226191759109497, "rewards/margins": 0.007768154144287109, "rewards/rejected": 3.21842360496521, "step": 10821 }, { "epoch": 1.76, "learning_rate": 1.9708107799290407e-07, "logits/chosen": -0.5464785099029541, "logits/rejected": -0.5482890605926514, "logps/chosen": -103.3890380859375, "logps/rejected": -78.24139404296875, "loss": 1.0788, "rewards/accuracies": 1.0, "rewards/chosen": 1.5567550659179688, "rewards/margins": 0.07419586181640625, "rewards/rejected": 1.4825592041015625, "step": 10822 }, { "epoch": 1.76, "learning_rate": 1.9697652797107646e-07, "logits/chosen": -0.5083655714988708, "logits/rejected": -0.5751082897186279, "logps/chosen": -56.86296081542969, "logps/rejected": -59.46135711669922, "loss": 1.3952, "rewards/accuracies": 0.0, "rewards/chosen": 0.7501190304756165, "rewards/margins": -1.7042579650878906, "rewards/rejected": 2.4543769359588623, "step": 10823 }, { "epoch": 1.76, "learning_rate": 1.9687199888530326e-07, "logits/chosen": -0.843187689781189, "logits/rejected": -0.7797987461090088, "logps/chosen": -219.32896423339844, "logps/rejected": -74.24766540527344, "loss": 0.3634, "rewards/accuracies": 1.0, "rewards/chosen": 3.0209336280822754, "rewards/margins": 1.1934677362442017, "rewards/rejected": 1.8274658918380737, "step": 10824 }, { "epoch": 1.76, "learning_rate": 1.9676749074280695e-07, "logits/chosen": -0.46020984649658203, "logits/rejected": -0.46020984649658203, "logps/chosen": -60.24943542480469, "logps/rejected": -60.24943542480469, "loss": 0.4643, "rewards/accuracies": 0.0, "rewards/chosen": 0.008905410766601562, "rewards/margins": 0.0, "rewards/rejected": 0.008905410766601562, "step": 10825 }, { "epoch": 1.76, "learning_rate": 1.966630035508076e-07, "logits/chosen": -0.562131941318512, "logits/rejected": -0.6054272055625916, "logps/chosen": -213.68331909179688, "logps/rejected": -81.12892150878906, "loss": 0.2538, "rewards/accuracies": 1.0, "rewards/chosen": 3.2902374267578125, "rewards/margins": 0.46083521842956543, "rewards/rejected": 2.829402208328247, "step": 10826 }, { "epoch": 1.76, "learning_rate": 1.9655853731652472e-07, "logits/chosen": -1.1061923503875732, "logits/rejected": -1.0931432247161865, "logps/chosen": -94.88409423828125, "logps/rejected": -95.8781967163086, "loss": 1.0028, "rewards/accuracies": 0.0, "rewards/chosen": 4.937225341796875, "rewards/margins": -1.178574562072754, "rewards/rejected": 6.115799903869629, "step": 10827 }, { "epoch": 1.76, "learning_rate": 1.9645409204717556e-07, "logits/chosen": -1.0110622644424438, "logits/rejected": -0.9655768275260925, "logps/chosen": -42.46249008178711, "logps/rejected": -9.626508712768555, "loss": 0.5062, "rewards/accuracies": 1.0, "rewards/chosen": 2.316990375518799, "rewards/margins": 1.8822842836380005, "rewards/rejected": 0.4347061216831207, "step": 10828 }, { "epoch": 1.76, "learning_rate": 1.9634966774997657e-07, "logits/chosen": -0.39483049511909485, "logits/rejected": -0.47164520621299744, "logps/chosen": -60.57930374145508, "logps/rejected": -43.6251220703125, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 1.0553646087646484, "rewards/margins": -0.7756626605987549, "rewards/rejected": 1.8310272693634033, "step": 10829 }, { "epoch": 1.76, "learning_rate": 1.9624526443214223e-07, "logits/chosen": -0.4566495716571808, "logits/rejected": -0.4438682496547699, "logps/chosen": -18.435970306396484, "logps/rejected": -6.5350775718688965, "loss": 0.5345, "rewards/accuracies": 1.0, "rewards/chosen": 0.2711622416973114, "rewards/margins": 0.039103955030441284, "rewards/rejected": 0.23205828666687012, "step": 10830 }, { "epoch": 1.76, "learning_rate": 1.961408821008862e-07, "logits/chosen": -0.7148653864860535, "logits/rejected": -0.7148653864860535, "logps/chosen": -81.3561782836914, "logps/rejected": -81.3561782836914, "loss": 0.5487, "rewards/accuracies": 0.0, "rewards/chosen": 2.3501763343811035, "rewards/margins": 0.0, "rewards/rejected": 2.3501763343811035, "step": 10831 }, { "epoch": 1.76, "learning_rate": 1.9603652076341981e-07, "logits/chosen": -0.5074198246002197, "logits/rejected": -0.4911394417285919, "logps/chosen": -57.57908630371094, "logps/rejected": -75.83929443359375, "loss": 0.9123, "rewards/accuracies": 0.0, "rewards/chosen": 1.862343668937683, "rewards/margins": -0.9906805753707886, "rewards/rejected": 2.8530242443084717, "step": 10832 }, { "epoch": 1.76, "learning_rate": 1.9593218042695392e-07, "logits/chosen": -0.6483793258666992, "logits/rejected": -0.6918314695358276, "logps/chosen": -48.074954986572266, "logps/rejected": -90.12603759765625, "loss": 0.9294, "rewards/accuracies": 0.0, "rewards/chosen": 0.6613090634346008, "rewards/margins": -0.15618973970413208, "rewards/rejected": 0.8174988031387329, "step": 10833 }, { "epoch": 1.76, "learning_rate": 1.958278610986971e-07, "logits/chosen": -0.47491222620010376, "logits/rejected": -0.5130590796470642, "logps/chosen": -29.875282287597656, "logps/rejected": -50.142845153808594, "loss": 0.7918, "rewards/accuracies": 0.0, "rewards/chosen": 1.9403259754180908, "rewards/margins": -0.32883214950561523, "rewards/rejected": 2.269158124923706, "step": 10834 }, { "epoch": 1.76, "learning_rate": 1.9572356278585712e-07, "logits/chosen": -0.43966904282569885, "logits/rejected": -0.4749351739883423, "logps/chosen": -67.83000183105469, "logps/rejected": -88.16012573242188, "loss": 0.6847, "rewards/accuracies": 0.0, "rewards/chosen": 2.285069227218628, "rewards/margins": -0.3650391101837158, "rewards/rejected": 2.6501083374023438, "step": 10835 }, { "epoch": 1.76, "learning_rate": 1.9561928549563966e-07, "logits/chosen": -0.6524422764778137, "logits/rejected": -0.6610248684883118, "logps/chosen": -35.99488067626953, "logps/rejected": -44.991111755371094, "loss": 0.5062, "rewards/accuracies": 0.0, "rewards/chosen": 1.223078966140747, "rewards/margins": -0.4968787431716919, "rewards/rejected": 1.719957709312439, "step": 10836 }, { "epoch": 1.76, "learning_rate": 1.955150292352497e-07, "logits/chosen": -0.9921424388885498, "logits/rejected": -0.9233531355857849, "logps/chosen": -91.91679382324219, "logps/rejected": -99.1209716796875, "loss": 0.1759, "rewards/accuracies": 1.0, "rewards/chosen": 2.037989854812622, "rewards/margins": 1.1028335094451904, "rewards/rejected": 0.9351562857627869, "step": 10837 }, { "epoch": 1.76, "learning_rate": 1.9541079401188998e-07, "logits/chosen": -0.6347948908805847, "logits/rejected": -0.6746366024017334, "logps/chosen": -113.92374420166016, "logps/rejected": -75.29307556152344, "loss": 1.295, "rewards/accuracies": 0.0, "rewards/chosen": 0.8591484427452087, "rewards/margins": -2.466201066970825, "rewards/rejected": 3.3253495693206787, "step": 10838 }, { "epoch": 1.76, "learning_rate": 1.953065798327625e-07, "logits/chosen": -0.6613918542861938, "logits/rejected": -0.6167894005775452, "logps/chosen": -147.98300170898438, "logps/rejected": -122.44934844970703, "loss": 0.6348, "rewards/accuracies": 0.0, "rewards/chosen": 3.652453660964966, "rewards/margins": -0.8383948802947998, "rewards/rejected": 4.490848541259766, "step": 10839 }, { "epoch": 1.76, "learning_rate": 1.9520238670506716e-07, "logits/chosen": -0.37482714653015137, "logits/rejected": -0.3691725432872772, "logps/chosen": -2.309222459793091, "logps/rejected": -15.120265007019043, "loss": 0.9903, "rewards/accuracies": 1.0, "rewards/chosen": 0.28446242213249207, "rewards/margins": 0.04776589572429657, "rewards/rejected": 0.2366965264081955, "step": 10840 }, { "epoch": 1.76, "learning_rate": 1.95098214636003e-07, "logits/chosen": -0.683947741985321, "logits/rejected": -0.6407672166824341, "logps/chosen": -89.77894592285156, "logps/rejected": -57.15839385986328, "loss": 0.5086, "rewards/accuracies": 0.0, "rewards/chosen": 1.1689552068710327, "rewards/margins": -0.01634526252746582, "rewards/rejected": 1.1853004693984985, "step": 10841 }, { "epoch": 1.76, "learning_rate": 1.949940636327671e-07, "logits/chosen": -0.9183467626571655, "logits/rejected": -0.7943248152732849, "logps/chosen": -108.75566864013672, "logps/rejected": -56.33585739135742, "loss": 1.134, "rewards/accuracies": 1.0, "rewards/chosen": 4.152782440185547, "rewards/margins": 2.1452419757843018, "rewards/rejected": 2.007540464401245, "step": 10842 }, { "epoch": 1.76, "learning_rate": 1.948899337025554e-07, "logits/chosen": -1.0418767929077148, "logits/rejected": -1.0585849285125732, "logps/chosen": -49.74266815185547, "logps/rejected": -123.2357177734375, "loss": 2.0359, "rewards/accuracies": 0.0, "rewards/chosen": 4.243795871734619, "rewards/margins": -3.2067689895629883, "rewards/rejected": 7.450564861297607, "step": 10843 }, { "epoch": 1.76, "learning_rate": 1.9478582485256246e-07, "logits/chosen": -0.6820181608200073, "logits/rejected": -0.6617563962936401, "logps/chosen": -91.5979995727539, "logps/rejected": -79.6025390625, "loss": 0.5662, "rewards/accuracies": 0.0, "rewards/chosen": 1.2251328229904175, "rewards/margins": -0.6838164329528809, "rewards/rejected": 1.9089492559432983, "step": 10844 }, { "epoch": 1.76, "learning_rate": 1.9468173708998086e-07, "logits/chosen": -0.7953412532806396, "logits/rejected": -0.5662326216697693, "logps/chosen": -123.43360900878906, "logps/rejected": -78.18727111816406, "loss": 0.0123, "rewards/accuracies": 1.0, "rewards/chosen": 6.206492900848389, "rewards/margins": 3.8674256801605225, "rewards/rejected": 2.339067220687866, "step": 10845 }, { "epoch": 1.76, "learning_rate": 1.945776704220025e-07, "logits/chosen": -0.33713874220848083, "logits/rejected": -0.27253642678260803, "logps/chosen": -70.20183563232422, "logps/rejected": -71.42508697509766, "loss": 1.4403, "rewards/accuracies": 0.0, "rewards/chosen": 1.331285834312439, "rewards/margins": -0.8940919637680054, "rewards/rejected": 2.2253777980804443, "step": 10846 }, { "epoch": 1.76, "learning_rate": 1.9447362485581698e-07, "logits/chosen": -0.6022090315818787, "logits/rejected": -0.6022090315818787, "logps/chosen": -67.99409484863281, "logps/rejected": -67.99409484863281, "loss": 0.3613, "rewards/accuracies": 0.0, "rewards/chosen": 2.6535918712615967, "rewards/margins": 0.0, "rewards/rejected": 2.6535918712615967, "step": 10847 }, { "epoch": 1.76, "learning_rate": 1.9436960039861322e-07, "logits/chosen": -0.6670010685920715, "logits/rejected": -0.5889079570770264, "logps/chosen": -57.57390213012695, "logps/rejected": -90.74000549316406, "loss": 0.2927, "rewards/accuracies": 1.0, "rewards/chosen": 1.5064709186553955, "rewards/margins": 0.4494060277938843, "rewards/rejected": 1.0570648908615112, "step": 10848 }, { "epoch": 1.76, "learning_rate": 1.9426559705757807e-07, "logits/chosen": -0.8912518620491028, "logits/rejected": -0.8912518620491028, "logps/chosen": -88.11543273925781, "logps/rejected": -88.11543273925781, "loss": 0.3571, "rewards/accuracies": 0.0, "rewards/chosen": 2.866380453109741, "rewards/margins": 0.0, "rewards/rejected": 2.866380453109741, "step": 10849 }, { "epoch": 1.76, "learning_rate": 1.9416161483989735e-07, "logits/chosen": -0.6348316073417664, "logits/rejected": -0.5904650688171387, "logps/chosen": -40.163299560546875, "logps/rejected": -24.859540939331055, "loss": 0.1461, "rewards/accuracies": 1.0, "rewards/chosen": 1.7641667127609253, "rewards/margins": 1.1507500410079956, "rewards/rejected": 0.6134166717529297, "step": 10850 }, { "epoch": 1.76, "learning_rate": 1.9405765375275507e-07, "logits/chosen": -0.41282257437705994, "logits/rejected": -0.41232308745384216, "logps/chosen": -21.449214935302734, "logps/rejected": -17.35324478149414, "loss": 0.5577, "rewards/accuracies": 0.0, "rewards/chosen": 0.40479204058647156, "rewards/margins": -0.12563952803611755, "rewards/rejected": 0.5304315686225891, "step": 10851 }, { "epoch": 1.76, "learning_rate": 1.9395371380333426e-07, "logits/chosen": -0.7359335422515869, "logits/rejected": -0.7680449485778809, "logps/chosen": -64.50166320800781, "logps/rejected": -111.72157287597656, "loss": 0.4347, "rewards/accuracies": 1.0, "rewards/chosen": 1.584436058998108, "rewards/margins": 0.3734382390975952, "rewards/rejected": 1.2109978199005127, "step": 10852 }, { "epoch": 1.76, "learning_rate": 1.9384979499881577e-07, "logits/chosen": -0.9738306403160095, "logits/rejected": -0.9898396730422974, "logps/chosen": -101.32630157470703, "logps/rejected": -104.07715606689453, "loss": 0.5755, "rewards/accuracies": 0.0, "rewards/chosen": 1.129206895828247, "rewards/margins": -0.7205665111541748, "rewards/rejected": 1.8497734069824219, "step": 10853 }, { "epoch": 1.76, "learning_rate": 1.9374589734637998e-07, "logits/chosen": -0.9471383094787598, "logits/rejected": -0.9979842901229858, "logps/chosen": -179.64097595214844, "logps/rejected": -195.45164489746094, "loss": 3.4323, "rewards/accuracies": 0.0, "rewards/chosen": 4.620384216308594, "rewards/margins": -3.519754409790039, "rewards/rejected": 8.140138626098633, "step": 10854 }, { "epoch": 1.76, "learning_rate": 1.9364202085320454e-07, "logits/chosen": -0.9818929433822632, "logits/rejected": -0.9818929433822632, "logps/chosen": -101.92892456054688, "logps/rejected": -101.92892456054688, "loss": 2.7517, "rewards/accuracies": 0.0, "rewards/chosen": 1.8320029973983765, "rewards/margins": 0.0, "rewards/rejected": 1.8320029973983765, "step": 10855 }, { "epoch": 1.76, "learning_rate": 1.935381655264668e-07, "logits/chosen": -1.2285809516906738, "logits/rejected": -1.1679632663726807, "logps/chosen": -112.27171325683594, "logps/rejected": -55.958213806152344, "loss": 0.5516, "rewards/accuracies": 0.0, "rewards/chosen": 1.0504578351974487, "rewards/margins": -0.6635520458221436, "rewards/rejected": 1.7140098810195923, "step": 10856 }, { "epoch": 1.76, "learning_rate": 1.9343433137334191e-07, "logits/chosen": -0.533470869064331, "logits/rejected": -0.428137868642807, "logps/chosen": -87.28009033203125, "logps/rejected": -38.08307647705078, "loss": 0.3353, "rewards/accuracies": 1.0, "rewards/chosen": 2.5399155616760254, "rewards/margins": 1.0637882947921753, "rewards/rejected": 1.47612726688385, "step": 10857 }, { "epoch": 1.76, "learning_rate": 1.9333051840100416e-07, "logits/chosen": -0.6856552362442017, "logits/rejected": -0.7048766613006592, "logps/chosen": -73.08905792236328, "logps/rejected": -127.96653747558594, "loss": 1.1578, "rewards/accuracies": 0.0, "rewards/chosen": 2.020087480545044, "rewards/margins": -2.039588212966919, "rewards/rejected": 4.059675693511963, "step": 10858 }, { "epoch": 1.76, "learning_rate": 1.932267266166257e-07, "logits/chosen": -0.7445061206817627, "logits/rejected": -0.7445061206817627, "logps/chosen": -66.5743637084961, "logps/rejected": -66.5743637084961, "loss": 1.652, "rewards/accuracies": 0.0, "rewards/chosen": 2.1933631896972656, "rewards/margins": 0.0, "rewards/rejected": 2.1933631896972656, "step": 10859 }, { "epoch": 1.76, "learning_rate": 1.9312295602737783e-07, "logits/chosen": -0.6476022601127625, "logits/rejected": -0.6476022601127625, "logps/chosen": -56.64606857299805, "logps/rejected": -56.64606857299805, "loss": 1.4665, "rewards/accuracies": 0.0, "rewards/chosen": 1.4824596643447876, "rewards/margins": 0.0, "rewards/rejected": 1.4824596643447876, "step": 10860 }, { "epoch": 1.76, "learning_rate": 1.9301920664042985e-07, "logits/chosen": -0.8698731064796448, "logits/rejected": -0.7038404941558838, "logps/chosen": -122.46429443359375, "logps/rejected": -107.88916015625, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 5.654577732086182, "rewards/margins": 4.055966377258301, "rewards/rejected": 1.5986114740371704, "step": 10861 }, { "epoch": 1.76, "learning_rate": 1.9291547846295003e-07, "logits/chosen": -0.43810921907424927, "logits/rejected": -0.2912044823169708, "logps/chosen": -121.78730773925781, "logps/rejected": -16.189382553100586, "loss": 0.0996, "rewards/accuracies": 1.0, "rewards/chosen": 1.8287734985351562, "rewards/margins": 1.6082792282104492, "rewards/rejected": 0.22049427032470703, "step": 10862 }, { "epoch": 1.76, "learning_rate": 1.9281177150210515e-07, "logits/chosen": -0.5624955892562866, "logits/rejected": -0.5769274234771729, "logps/chosen": -68.26614379882812, "logps/rejected": -110.4759292602539, "loss": 1.2596, "rewards/accuracies": 0.0, "rewards/chosen": 0.7385429739952087, "rewards/margins": -0.1854407787322998, "rewards/rejected": 0.9239837527275085, "step": 10863 }, { "epoch": 1.76, "learning_rate": 1.9270808576506003e-07, "logits/chosen": -0.5133196115493774, "logits/rejected": -0.5133196115493774, "logps/chosen": -79.28746032714844, "logps/rejected": -79.28746032714844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 1.938543677330017, "rewards/margins": 0.0, "rewards/rejected": 1.938543677330017, "step": 10864 }, { "epoch": 1.76, "learning_rate": 1.926044212589788e-07, "logits/chosen": -0.8237634897232056, "logits/rejected": -0.48766154050827026, "logps/chosen": -87.03623962402344, "logps/rejected": -70.25261688232422, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 2.155139207839966, "rewards/margins": 1.5126144886016846, "rewards/rejected": 0.6425247192382812, "step": 10865 }, { "epoch": 1.76, "learning_rate": 1.9250077799102322e-07, "logits/chosen": -0.6867814064025879, "logits/rejected": -0.5175197124481201, "logps/chosen": -65.57186126708984, "logps/rejected": -108.59667205810547, "loss": 0.4893, "rewards/accuracies": 0.0, "rewards/chosen": 1.5812278985977173, "rewards/margins": -0.06854403018951416, "rewards/rejected": 1.6497719287872314, "step": 10866 }, { "epoch": 1.76, "learning_rate": 1.923971559683545e-07, "logits/chosen": -0.12648609280586243, "logits/rejected": -0.1390969455242157, "logps/chosen": -19.83675193786621, "logps/rejected": -56.272422790527344, "loss": 0.7955, "rewards/accuracies": 0.0, "rewards/chosen": 0.37971898913383484, "rewards/margins": -0.44736459851264954, "rewards/rejected": 0.8270835876464844, "step": 10867 }, { "epoch": 1.76, "learning_rate": 1.922935551981315e-07, "logits/chosen": -0.2611618638038635, "logits/rejected": -0.25434550642967224, "logps/chosen": -54.924903869628906, "logps/rejected": -46.24892044067383, "loss": 0.2988, "rewards/accuracies": 1.0, "rewards/chosen": 2.437828779220581, "rewards/margins": 0.20859861373901367, "rewards/rejected": 2.2292301654815674, "step": 10868 }, { "epoch": 1.76, "learning_rate": 1.9218997568751255e-07, "logits/chosen": -0.9303531050682068, "logits/rejected": -1.2308316230773926, "logps/chosen": -97.43516540527344, "logps/rejected": -33.704654693603516, "loss": 0.3282, "rewards/accuracies": 1.0, "rewards/chosen": 1.6570099592208862, "rewards/margins": 1.4212006330490112, "rewards/rejected": 0.235809326171875, "step": 10869 }, { "epoch": 1.76, "learning_rate": 1.920864174436535e-07, "logits/chosen": -0.6868239641189575, "logits/rejected": -0.6060621738433838, "logps/chosen": -66.60334014892578, "logps/rejected": -19.587617874145508, "loss": 0.9522, "rewards/accuracies": 1.0, "rewards/chosen": 1.9999687671661377, "rewards/margins": 1.6338295936584473, "rewards/rejected": 0.3661392331123352, "step": 10870 }, { "epoch": 1.76, "learning_rate": 1.9198288047370976e-07, "logits/chosen": -0.5564705729484558, "logits/rejected": -0.5590291023254395, "logps/chosen": -84.16397857666016, "logps/rejected": -126.4794921875, "loss": 0.954, "rewards/accuracies": 0.0, "rewards/chosen": 3.6013734340667725, "rewards/margins": -1.744105577468872, "rewards/rejected": 5.3454790115356445, "step": 10871 }, { "epoch": 1.76, "learning_rate": 1.9187936478483425e-07, "logits/chosen": -0.8371759653091431, "logits/rejected": -0.8339117765426636, "logps/chosen": -67.53700256347656, "logps/rejected": -59.504150390625, "loss": 0.5144, "rewards/accuracies": 1.0, "rewards/chosen": 1.1746292114257812, "rewards/margins": 0.41979217529296875, "rewards/rejected": 0.7548370361328125, "step": 10872 }, { "epoch": 1.76, "learning_rate": 1.9177587038417935e-07, "logits/chosen": -1.054051399230957, "logits/rejected": -1.121911883354187, "logps/chosen": -36.460838317871094, "logps/rejected": -149.328857421875, "loss": 2.1238, "rewards/accuracies": 0.0, "rewards/chosen": 1.7568634748458862, "rewards/margins": -4.143040180206299, "rewards/rejected": 5.899903774261475, "step": 10873 }, { "epoch": 1.76, "learning_rate": 1.9167239727889521e-07, "logits/chosen": -0.7406137585639954, "logits/rejected": -0.5631402134895325, "logps/chosen": -185.1116485595703, "logps/rejected": -17.530860900878906, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": 5.555333137512207, "rewards/margins": 4.5096282958984375, "rewards/rejected": 1.0457048416137695, "step": 10874 }, { "epoch": 1.77, "learning_rate": 1.9156894547613117e-07, "logits/chosen": -1.2781548500061035, "logits/rejected": -1.2097419500350952, "logps/chosen": -148.32643127441406, "logps/rejected": -164.21206665039062, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 6.311856269836426, "rewards/margins": 0.07639646530151367, "rewards/rejected": 6.235459804534912, "step": 10875 }, { "epoch": 1.77, "learning_rate": 1.9146551498303442e-07, "logits/chosen": -0.7926474213600159, "logits/rejected": -0.7797780632972717, "logps/chosen": -59.75111389160156, "logps/rejected": -99.39472961425781, "loss": 0.8385, "rewards/accuracies": 1.0, "rewards/chosen": 1.8276138305664062, "rewards/margins": 0.00751340389251709, "rewards/rejected": 1.8201004266738892, "step": 10876 }, { "epoch": 1.77, "learning_rate": 1.913621058067514e-07, "logits/chosen": -0.6491197347640991, "logits/rejected": -0.7947705388069153, "logps/chosen": -39.119850158691406, "logps/rejected": -143.25157165527344, "loss": 2.0836, "rewards/accuracies": 0.0, "rewards/chosen": 1.959619164466858, "rewards/margins": -3.559478759765625, "rewards/rejected": 5.519097805023193, "step": 10877 }, { "epoch": 1.77, "learning_rate": 1.9125871795442628e-07, "logits/chosen": -0.8528283834457397, "logits/rejected": -0.7245094180107117, "logps/chosen": -74.7595443725586, "logps/rejected": -73.31266784667969, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 1.5666542053222656, "rewards/margins": -0.7062187194824219, "rewards/rejected": 2.2728729248046875, "step": 10878 }, { "epoch": 1.77, "learning_rate": 1.911553514332026e-07, "logits/chosen": -0.5146892666816711, "logits/rejected": -0.48126983642578125, "logps/chosen": -73.24890899658203, "logps/rejected": -52.89031982421875, "loss": 0.1934, "rewards/accuracies": 1.0, "rewards/chosen": 1.5326370000839233, "rewards/margins": 0.8176547884941101, "rewards/rejected": 0.7149822115898132, "step": 10879 }, { "epoch": 1.77, "learning_rate": 1.9105200625022172e-07, "logits/chosen": -0.6001561880111694, "logits/rejected": -0.5998966693878174, "logps/chosen": -73.63081359863281, "logps/rejected": -131.81861877441406, "loss": 0.5668, "rewards/accuracies": 1.0, "rewards/chosen": 1.0771645307540894, "rewards/margins": 0.5914360284805298, "rewards/rejected": 0.4857284724712372, "step": 10880 }, { "epoch": 1.77, "learning_rate": 1.90948682412624e-07, "logits/chosen": -0.47860386967658997, "logits/rejected": -0.45382747054100037, "logps/chosen": -101.58208465576172, "logps/rejected": -83.5738525390625, "loss": 0.961, "rewards/accuracies": 0.0, "rewards/chosen": 0.8646950125694275, "rewards/margins": -1.2908422946929932, "rewards/rejected": 2.1555373668670654, "step": 10881 }, { "epoch": 1.77, "learning_rate": 1.908453799275479e-07, "logits/chosen": -0.7151501774787903, "logits/rejected": -0.7354695200920105, "logps/chosen": -90.03309631347656, "logps/rejected": -62.97190856933594, "loss": 0.989, "rewards/accuracies": 0.0, "rewards/chosen": 1.516058325767517, "rewards/margins": -1.8146346807479858, "rewards/rejected": 3.330693006515503, "step": 10882 }, { "epoch": 1.77, "learning_rate": 1.90742098802131e-07, "logits/chosen": -0.753668487071991, "logits/rejected": -0.6653139591217041, "logps/chosen": -123.69140625, "logps/rejected": -32.2375602722168, "loss": 0.1919, "rewards/accuracies": 1.0, "rewards/chosen": 1.2834396362304688, "rewards/margins": 1.1288303136825562, "rewards/rejected": 0.1546093076467514, "step": 10883 }, { "epoch": 1.77, "learning_rate": 1.9063883904350868e-07, "logits/chosen": -0.7872545123100281, "logits/rejected": -0.7737657427787781, "logps/chosen": -79.9311294555664, "logps/rejected": -72.71923828125, "loss": 1.2012, "rewards/accuracies": 0.0, "rewards/chosen": 1.6373909711837769, "rewards/margins": -1.5094634294509888, "rewards/rejected": 3.1468544006347656, "step": 10884 }, { "epoch": 1.77, "learning_rate": 1.9053560065881551e-07, "logits/chosen": -0.4012860357761383, "logits/rejected": -0.4414665997028351, "logps/chosen": -28.477327346801758, "logps/rejected": -72.83639526367188, "loss": 0.9139, "rewards/accuracies": 1.0, "rewards/chosen": 0.4478830397129059, "rewards/margins": 0.036971479654312134, "rewards/rejected": 0.41091156005859375, "step": 10885 }, { "epoch": 1.77, "learning_rate": 1.90432383655184e-07, "logits/chosen": -0.800154983997345, "logits/rejected": -0.7451760768890381, "logps/chosen": -44.30888366699219, "logps/rejected": -24.089588165283203, "loss": 0.1459, "rewards/accuracies": 1.0, "rewards/chosen": 2.206275224685669, "rewards/margins": 1.6765754222869873, "rewards/rejected": 0.5296997427940369, "step": 10886 }, { "epoch": 1.77, "learning_rate": 1.9032918803974586e-07, "logits/chosen": -1.0902997255325317, "logits/rejected": -0.9666783809661865, "logps/chosen": -177.14854431152344, "logps/rejected": -175.08688354492188, "loss": 0.5944, "rewards/accuracies": 0.0, "rewards/chosen": 6.331309795379639, "rewards/margins": -0.8162822723388672, "rewards/rejected": 7.147592067718506, "step": 10887 }, { "epoch": 1.77, "learning_rate": 1.9022601381963045e-07, "logits/chosen": -0.755700945854187, "logits/rejected": -0.7394866347312927, "logps/chosen": -123.6629867553711, "logps/rejected": -111.26576232910156, "loss": 1.1747, "rewards/accuracies": 1.0, "rewards/chosen": 4.75983190536499, "rewards/margins": 0.13191604614257812, "rewards/rejected": 4.627915859222412, "step": 10888 }, { "epoch": 1.77, "learning_rate": 1.9012286100196656e-07, "logits/chosen": -1.1701796054840088, "logits/rejected": -1.0699836015701294, "logps/chosen": -148.46380615234375, "logps/rejected": -109.99187469482422, "loss": 0.3969, "rewards/accuracies": 1.0, "rewards/chosen": 5.711041450500488, "rewards/margins": 1.375856876373291, "rewards/rejected": 4.335184574127197, "step": 10889 }, { "epoch": 1.77, "learning_rate": 1.9001972959388069e-07, "logits/chosen": -0.9198802709579468, "logits/rejected": -0.9855586290359497, "logps/chosen": -64.48472595214844, "logps/rejected": -78.43670654296875, "loss": 1.9842, "rewards/accuracies": 0.0, "rewards/chosen": 1.7128219604492188, "rewards/margins": -2.2249343395233154, "rewards/rejected": 3.937756299972534, "step": 10890 }, { "epoch": 1.77, "learning_rate": 1.8991661960249865e-07, "logits/chosen": -0.9117143154144287, "logits/rejected": -0.7851671576499939, "logps/chosen": -93.79988098144531, "logps/rejected": -147.51365661621094, "loss": 0.9178, "rewards/accuracies": 0.0, "rewards/chosen": 3.4821975231170654, "rewards/margins": -1.336024522781372, "rewards/rejected": 4.8182220458984375, "step": 10891 }, { "epoch": 1.77, "learning_rate": 1.8981353103494397e-07, "logits/chosen": -0.7016653418540955, "logits/rejected": -0.7165762782096863, "logps/chosen": -78.94268798828125, "logps/rejected": -54.64842987060547, "loss": 0.94, "rewards/accuracies": 0.0, "rewards/chosen": 2.00650954246521, "rewards/margins": -0.26696085929870605, "rewards/rejected": 2.273470401763916, "step": 10892 }, { "epoch": 1.77, "learning_rate": 1.8971046389833951e-07, "logits/chosen": -0.6355236768722534, "logits/rejected": -0.6431252956390381, "logps/chosen": -55.069889068603516, "logps/rejected": -95.62042236328125, "loss": 0.8929, "rewards/accuracies": 1.0, "rewards/chosen": 0.5140533447265625, "rewards/margins": 0.2485458254814148, "rewards/rejected": 0.2655075192451477, "step": 10893 }, { "epoch": 1.77, "learning_rate": 1.8960741819980575e-07, "logits/chosen": -0.9687482118606567, "logits/rejected": -0.8734164834022522, "logps/chosen": -77.48463439941406, "logps/rejected": -56.05942916870117, "loss": 0.0437, "rewards/accuracies": 1.0, "rewards/chosen": 3.817195177078247, "rewards/margins": 2.518305778503418, "rewards/rejected": 1.2988895177841187, "step": 10894 }, { "epoch": 1.77, "learning_rate": 1.8950439394646267e-07, "logits/chosen": -0.7836443781852722, "logits/rejected": -0.7709399461746216, "logps/chosen": -69.88589477539062, "logps/rejected": -11.134102821350098, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 0.8597999811172485, "rewards/margins": 0.14208215475082397, "rewards/rejected": 0.7177178263664246, "step": 10895 }, { "epoch": 1.77, "learning_rate": 1.8940139114542786e-07, "logits/chosen": -0.6981054544448853, "logits/rejected": -0.7087018489837646, "logps/chosen": -94.31370544433594, "logps/rejected": -67.66588592529297, "loss": 0.7264, "rewards/accuracies": 0.0, "rewards/chosen": 1.51324462890625, "rewards/margins": -0.7465674877166748, "rewards/rejected": 2.259812116622925, "step": 10896 }, { "epoch": 1.77, "learning_rate": 1.8929840980381794e-07, "logits/chosen": -0.9097456932067871, "logits/rejected": -0.7536413669586182, "logps/chosen": -130.40234375, "logps/rejected": -61.83523178100586, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 6.539947509765625, "rewards/margins": 5.096302509307861, "rewards/rejected": 1.4436451196670532, "step": 10897 }, { "epoch": 1.77, "learning_rate": 1.8919544992874826e-07, "logits/chosen": -0.5881543755531311, "logits/rejected": -0.6112148761749268, "logps/chosen": -73.90396118164062, "logps/rejected": -82.59603881835938, "loss": 0.5876, "rewards/accuracies": 1.0, "rewards/chosen": 2.730120897293091, "rewards/margins": 0.5766983032226562, "rewards/rejected": 2.1534225940704346, "step": 10898 }, { "epoch": 1.77, "learning_rate": 1.890925115273319e-07, "logits/chosen": -0.6496841311454773, "logits/rejected": -0.47958382964134216, "logps/chosen": -95.2107162475586, "logps/rejected": -77.76053619384766, "loss": 0.179, "rewards/accuracies": 1.0, "rewards/chosen": 4.758060455322266, "rewards/margins": 1.5157501697540283, "rewards/rejected": 3.2423102855682373, "step": 10899 }, { "epoch": 1.77, "learning_rate": 1.889895946066814e-07, "logits/chosen": -0.35502541065216064, "logits/rejected": -0.35502541065216064, "logps/chosen": -82.07568359375, "logps/rejected": -82.07568359375, "loss": 1.3833, "rewards/accuracies": 0.0, "rewards/chosen": 1.6421997547149658, "rewards/margins": 0.0, "rewards/rejected": 1.6421997547149658, "step": 10900 }, { "epoch": 1.77, "learning_rate": 1.8888669917390688e-07, "logits/chosen": -0.7494766116142273, "logits/rejected": -0.7392404675483704, "logps/chosen": -45.05403518676758, "logps/rejected": -54.10469055175781, "loss": 1.1317, "rewards/accuracies": 1.0, "rewards/chosen": 0.9572162628173828, "rewards/margins": 0.322201132774353, "rewards/rejected": 0.6350151300430298, "step": 10901 }, { "epoch": 1.77, "learning_rate": 1.8878382523611785e-07, "logits/chosen": -0.31584709882736206, "logits/rejected": -0.31584709882736206, "logps/chosen": -54.0937614440918, "logps/rejected": -54.0937614440918, "loss": 0.9935, "rewards/accuracies": 0.0, "rewards/chosen": 1.0964726209640503, "rewards/margins": 0.0, "rewards/rejected": 1.0964726209640503, "step": 10902 }, { "epoch": 1.77, "learning_rate": 1.8868097280042162e-07, "logits/chosen": -0.24569177627563477, "logits/rejected": -0.24524085223674774, "logps/chosen": -59.7280387878418, "logps/rejected": -75.80709838867188, "loss": 0.4047, "rewards/accuracies": 1.0, "rewards/chosen": 2.059786558151245, "rewards/margins": 1.0456576347351074, "rewards/rejected": 1.0141289234161377, "step": 10903 }, { "epoch": 1.77, "learning_rate": 1.8857814187392457e-07, "logits/chosen": -0.46556153893470764, "logits/rejected": -0.5171871185302734, "logps/chosen": -40.800209045410156, "logps/rejected": -113.40950775146484, "loss": 0.115, "rewards/accuracies": 1.0, "rewards/chosen": 1.2443122863769531, "rewards/margins": 1.3894485235214233, "rewards/rejected": -0.1451362669467926, "step": 10904 }, { "epoch": 1.77, "learning_rate": 1.8847533246373104e-07, "logits/chosen": -1.0169825553894043, "logits/rejected": -0.9327875375747681, "logps/chosen": -123.25764465332031, "logps/rejected": -72.99137878417969, "loss": 0.9699, "rewards/accuracies": 0.0, "rewards/chosen": 0.703356921672821, "rewards/margins": -0.015906572341918945, "rewards/rejected": 0.71926349401474, "step": 10905 }, { "epoch": 1.77, "learning_rate": 1.883725445769445e-07, "logits/chosen": -0.6657800674438477, "logits/rejected": -0.2892746329307556, "logps/chosen": -61.79909133911133, "logps/rejected": -57.46061706542969, "loss": 0.4625, "rewards/accuracies": 0.0, "rewards/chosen": 1.0770962238311768, "rewards/margins": -0.15873301029205322, "rewards/rejected": 1.23582923412323, "step": 10906 }, { "epoch": 1.77, "learning_rate": 1.8826977822066637e-07, "logits/chosen": -0.8886151313781738, "logits/rejected": -0.9034653902053833, "logps/chosen": -88.1440658569336, "logps/rejected": -114.64601135253906, "loss": 0.7511, "rewards/accuracies": 0.0, "rewards/chosen": 1.4083824157714844, "rewards/margins": -1.2125604152679443, "rewards/rejected": 2.6209428310394287, "step": 10907 }, { "epoch": 1.77, "learning_rate": 1.8816703340199708e-07, "logits/chosen": -1.2500656843185425, "logits/rejected": -1.2922165393829346, "logps/chosen": -84.5245361328125, "logps/rejected": -98.32333374023438, "loss": 0.7323, "rewards/accuracies": 0.0, "rewards/chosen": 0.40798187255859375, "rewards/margins": -1.110634684562683, "rewards/rejected": 1.5186165571212769, "step": 10908 }, { "epoch": 1.77, "learning_rate": 1.8806431012803503e-07, "logits/chosen": -0.478306382894516, "logits/rejected": -0.4798058867454529, "logps/chosen": -2.663813829421997, "logps/rejected": -1.6386786699295044, "loss": 0.8171, "rewards/accuracies": 0.0, "rewards/chosen": 0.40225765109062195, "rewards/margins": -0.11479148268699646, "rewards/rejected": 0.5170491337776184, "step": 10909 }, { "epoch": 1.77, "learning_rate": 1.8796160840587776e-07, "logits/chosen": -1.0121853351593018, "logits/rejected": -1.027130365371704, "logps/chosen": -137.4284210205078, "logps/rejected": -111.35975646972656, "loss": 0.3437, "rewards/accuracies": 1.0, "rewards/chosen": 5.212376594543457, "rewards/margins": 0.1521315574645996, "rewards/rejected": 5.060245037078857, "step": 10910 }, { "epoch": 1.77, "learning_rate": 1.8785892824262067e-07, "logits/chosen": -0.5785548686981201, "logits/rejected": -0.5988551378250122, "logps/chosen": -97.78343200683594, "logps/rejected": -103.22914123535156, "loss": 0.4111, "rewards/accuracies": 0.0, "rewards/chosen": 1.5029510259628296, "rewards/margins": -0.1081230640411377, "rewards/rejected": 1.6110740900039673, "step": 10911 }, { "epoch": 1.77, "learning_rate": 1.877562696453583e-07, "logits/chosen": -1.0813361406326294, "logits/rejected": -1.250272512435913, "logps/chosen": -235.47853088378906, "logps/rejected": -250.79812622070312, "loss": 0.4762, "rewards/accuracies": 0.0, "rewards/chosen": 6.502659797668457, "rewards/margins": -0.37743043899536133, "rewards/rejected": 6.880090236663818, "step": 10912 }, { "epoch": 1.77, "learning_rate": 1.87653632621183e-07, "logits/chosen": -1.0295686721801758, "logits/rejected": -1.1176797151565552, "logps/chosen": -222.12564086914062, "logps/rejected": -198.7562255859375, "loss": 2.5564, "rewards/accuracies": 0.0, "rewards/chosen": 3.354518175125122, "rewards/margins": -2.4567549228668213, "rewards/rejected": 5.811273097991943, "step": 10913 }, { "epoch": 1.77, "learning_rate": 1.8755101717718647e-07, "logits/chosen": -0.9112401604652405, "logits/rejected": -1.2055162191390991, "logps/chosen": -74.805419921875, "logps/rejected": -35.9122428894043, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 2.285540819168091, "rewards/margins": 2.0134055614471436, "rewards/rejected": 0.2721351683139801, "step": 10914 }, { "epoch": 1.77, "learning_rate": 1.8744842332045803e-07, "logits/chosen": -0.5076099038124084, "logits/rejected": -0.4921978712081909, "logps/chosen": -76.495849609375, "logps/rejected": -44.24932861328125, "loss": 1.0187, "rewards/accuracies": 0.0, "rewards/chosen": 0.8933822512626648, "rewards/margins": -1.4508812427520752, "rewards/rejected": 2.3442635536193848, "step": 10915 }, { "epoch": 1.77, "learning_rate": 1.8734585105808632e-07, "logits/chosen": -0.35426580905914307, "logits/rejected": -0.5225423574447632, "logps/chosen": -41.718719482421875, "logps/rejected": -255.61029052734375, "loss": 0.3107, "rewards/accuracies": 1.0, "rewards/chosen": 2.0637402534484863, "rewards/margins": 0.19485902786254883, "rewards/rejected": 1.8688812255859375, "step": 10916 }, { "epoch": 1.77, "learning_rate": 1.8724330039715774e-07, "logits/chosen": -0.9896901845932007, "logits/rejected": -1.0261890888214111, "logps/chosen": -79.47101593017578, "logps/rejected": -74.2811050415039, "loss": 1.4546, "rewards/accuracies": 0.0, "rewards/chosen": 0.4683952331542969, "rewards/margins": -1.2622756958007812, "rewards/rejected": 1.7306709289550781, "step": 10917 }, { "epoch": 1.77, "learning_rate": 1.8714077134475798e-07, "logits/chosen": -0.39795711636543274, "logits/rejected": -0.38630202412605286, "logps/chosen": -26.570402145385742, "logps/rejected": -2.5251080989837646, "loss": 0.3985, "rewards/accuracies": 0.0, "rewards/chosen": 0.3627292811870575, "rewards/margins": -0.07493066787719727, "rewards/rejected": 0.43765994906425476, "step": 10918 }, { "epoch": 1.77, "learning_rate": 1.8703826390797044e-07, "logits/chosen": -0.7265734672546387, "logits/rejected": -0.867176353931427, "logps/chosen": -55.19580078125, "logps/rejected": -117.97592163085938, "loss": 2.1957, "rewards/accuracies": 0.0, "rewards/chosen": 3.2754180431365967, "rewards/margins": -3.5391757488250732, "rewards/rejected": 6.81459379196167, "step": 10919 }, { "epoch": 1.77, "learning_rate": 1.869357780938778e-07, "logits/chosen": -0.830032467842102, "logits/rejected": -0.830032467842102, "logps/chosen": -44.66007995605469, "logps/rejected": -44.66007995605469, "loss": 0.4106, "rewards/accuracies": 0.0, "rewards/chosen": 2.9238479137420654, "rewards/margins": 0.0, "rewards/rejected": 2.9238479137420654, "step": 10920 }, { "epoch": 1.77, "learning_rate": 1.8683331390956043e-07, "logits/chosen": -0.5262947678565979, "logits/rejected": -0.5569862127304077, "logps/chosen": -50.75450134277344, "logps/rejected": -65.30260467529297, "loss": 0.4266, "rewards/accuracies": 0.0, "rewards/chosen": 0.6449142694473267, "rewards/margins": -0.22922593355178833, "rewards/rejected": 0.874140202999115, "step": 10921 }, { "epoch": 1.77, "learning_rate": 1.8673087136209802e-07, "logits/chosen": -0.6945030093193054, "logits/rejected": -0.7672205567359924, "logps/chosen": -86.39037322998047, "logps/rejected": -107.31692504882812, "loss": 0.7954, "rewards/accuracies": 1.0, "rewards/chosen": 2.720107316970825, "rewards/margins": 1.085466742515564, "rewards/rejected": 1.6346405744552612, "step": 10922 }, { "epoch": 1.77, "learning_rate": 1.8662845045856806e-07, "logits/chosen": -0.5635879039764404, "logits/rejected": -0.5689677596092224, "logps/chosen": -40.01552200317383, "logps/rejected": -4.1911420822143555, "loss": 2.2456, "rewards/accuracies": 0.0, "rewards/chosen": 0.11681061238050461, "rewards/margins": -0.24304214119911194, "rewards/rejected": 0.35985276103019714, "step": 10923 }, { "epoch": 1.77, "learning_rate": 1.8652605120604726e-07, "logits/chosen": -0.7350084185600281, "logits/rejected": -0.8600637316703796, "logps/chosen": -78.70050048828125, "logps/rejected": -125.04579162597656, "loss": 2.2331, "rewards/accuracies": 0.0, "rewards/chosen": 1.7130157947540283, "rewards/margins": -3.366999864578247, "rewards/rejected": 5.080015659332275, "step": 10924 }, { "epoch": 1.77, "learning_rate": 1.8642367361160994e-07, "logits/chosen": -0.6781880259513855, "logits/rejected": -0.5888975262641907, "logps/chosen": -57.52019500732422, "logps/rejected": -35.954586029052734, "loss": 1.4535, "rewards/accuracies": 1.0, "rewards/chosen": 2.025994062423706, "rewards/margins": 0.8574931621551514, "rewards/rejected": 1.1685009002685547, "step": 10925 }, { "epoch": 1.77, "learning_rate": 1.8632131768232988e-07, "logits/chosen": -0.9615589380264282, "logits/rejected": -0.7591084837913513, "logps/chosen": -222.1534881591797, "logps/rejected": -41.96568298339844, "loss": 0.2446, "rewards/accuracies": 1.0, "rewards/chosen": 1.9797104597091675, "rewards/margins": 1.7782906293869019, "rewards/rejected": 0.20141983032226562, "step": 10926 }, { "epoch": 1.77, "learning_rate": 1.862189834252786e-07, "logits/chosen": -0.5305880308151245, "logits/rejected": -0.4511130750179291, "logps/chosen": -51.830955505371094, "logps/rejected": -74.5965347290039, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": 2.2468788623809814, "rewards/margins": 0.4750854969024658, "rewards/rejected": 1.7717933654785156, "step": 10927 }, { "epoch": 1.77, "learning_rate": 1.8611667084752663e-07, "logits/chosen": -0.8718137145042419, "logits/rejected": -0.8126198649406433, "logps/chosen": -70.38033294677734, "logps/rejected": -24.04407501220703, "loss": 0.4348, "rewards/accuracies": 0.0, "rewards/chosen": 0.7105598449707031, "rewards/margins": -0.21557235717773438, "rewards/rejected": 0.9261322021484375, "step": 10928 }, { "epoch": 1.77, "learning_rate": 1.860143799561426e-07, "logits/chosen": -0.5924234390258789, "logits/rejected": -0.6465862989425659, "logps/chosen": -32.118980407714844, "logps/rejected": -71.36296844482422, "loss": 1.415, "rewards/accuracies": 0.0, "rewards/chosen": 1.4349602460861206, "rewards/margins": -1.7827178239822388, "rewards/rejected": 3.2176780700683594, "step": 10929 }, { "epoch": 1.77, "learning_rate": 1.8591211075819413e-07, "logits/chosen": -0.8344486355781555, "logits/rejected": -0.8246393203735352, "logps/chosen": -114.60342407226562, "logps/rejected": -256.45477294921875, "loss": 0.2893, "rewards/accuracies": 1.0, "rewards/chosen": 6.51992654800415, "rewards/margins": 0.2785630226135254, "rewards/rejected": 6.241363525390625, "step": 10930 }, { "epoch": 1.77, "learning_rate": 1.8580986326074665e-07, "logits/chosen": -0.3721272051334381, "logits/rejected": -0.3721272051334381, "logps/chosen": -40.27621078491211, "logps/rejected": -40.27621078491211, "loss": 0.7362, "rewards/accuracies": 0.0, "rewards/chosen": 0.29308509826660156, "rewards/margins": 0.0, "rewards/rejected": 0.29308509826660156, "step": 10931 }, { "epoch": 1.77, "learning_rate": 1.8570763747086492e-07, "logits/chosen": -0.4732849895954132, "logits/rejected": -0.4548250734806061, "logps/chosen": -43.740447998046875, "logps/rejected": -44.29603576660156, "loss": 0.49, "rewards/accuracies": 1.0, "rewards/chosen": 1.8367607593536377, "rewards/margins": 0.9045807123184204, "rewards/rejected": 0.9321800470352173, "step": 10932 }, { "epoch": 1.77, "learning_rate": 1.8560543339561146e-07, "logits/chosen": -0.9098483920097351, "logits/rejected": -1.0352935791015625, "logps/chosen": -91.26664733886719, "logps/rejected": -84.74755859375, "loss": 1.5858, "rewards/accuracies": 0.0, "rewards/chosen": 0.4936279356479645, "rewards/margins": -2.140786647796631, "rewards/rejected": 2.6344146728515625, "step": 10933 }, { "epoch": 1.77, "learning_rate": 1.8550325104204767e-07, "logits/chosen": -0.7501893639564514, "logits/rejected": -0.7103222012519836, "logps/chosen": -121.61991882324219, "logps/rejected": -118.4012451171875, "loss": 0.7702, "rewards/accuracies": 0.0, "rewards/chosen": 2.0877397060394287, "rewards/margins": -0.30443406105041504, "rewards/rejected": 2.3921737670898438, "step": 10934 }, { "epoch": 1.77, "learning_rate": 1.8540109041723362e-07, "logits/chosen": -0.539699137210846, "logits/rejected": -0.5700490474700928, "logps/chosen": -24.11254119873047, "logps/rejected": -61.32865905761719, "loss": 0.7043, "rewards/accuracies": 0.0, "rewards/chosen": 1.1510857343673706, "rewards/margins": -0.9299484491348267, "rewards/rejected": 2.0810341835021973, "step": 10935 }, { "epoch": 1.78, "learning_rate": 1.8529895152822733e-07, "logits/chosen": -0.6909454464912415, "logits/rejected": -0.6394988894462585, "logps/chosen": -75.28672790527344, "logps/rejected": -50.83515930175781, "loss": 1.2683, "rewards/accuracies": 0.0, "rewards/chosen": 1.2674652338027954, "rewards/margins": -1.4079681634902954, "rewards/rejected": 2.675433397293091, "step": 10936 }, { "epoch": 1.78, "learning_rate": 1.851968343820859e-07, "logits/chosen": -0.47956183552742004, "logits/rejected": -0.3630838394165039, "logps/chosen": -57.30397033691406, "logps/rejected": -16.452186584472656, "loss": 0.1013, "rewards/accuracies": 1.0, "rewards/chosen": 2.0663819313049316, "rewards/margins": 1.5566442012786865, "rewards/rejected": 0.5097377896308899, "step": 10937 }, { "epoch": 1.78, "learning_rate": 1.850947389858643e-07, "logits/chosen": -1.010404348373413, "logits/rejected": -0.9006872177124023, "logps/chosen": -32.614627838134766, "logps/rejected": -39.98844909667969, "loss": 1.504, "rewards/accuracies": 1.0, "rewards/chosen": 1.9110878705978394, "rewards/margins": 2.043916702270508, "rewards/rejected": -0.13282890617847443, "step": 10938 }, { "epoch": 1.78, "learning_rate": 1.8499266534661678e-07, "logits/chosen": -0.8690334558486938, "logits/rejected": -0.8690334558486938, "logps/chosen": -108.37007141113281, "logps/rejected": -108.37007141113281, "loss": 0.3584, "rewards/accuracies": 0.0, "rewards/chosen": 3.9671127796173096, "rewards/margins": 0.0, "rewards/rejected": 3.9671127796173096, "step": 10939 }, { "epoch": 1.78, "learning_rate": 1.8489061347139529e-07, "logits/chosen": -0.664965808391571, "logits/rejected": -0.5947452783584595, "logps/chosen": -148.23760986328125, "logps/rejected": -92.17127227783203, "loss": 1.9005, "rewards/accuracies": 0.0, "rewards/chosen": 4.227475166320801, "rewards/margins": -1.167992115020752, "rewards/rejected": 5.395467281341553, "step": 10940 }, { "epoch": 1.78, "learning_rate": 1.84788583367251e-07, "logits/chosen": -0.21735505759716034, "logits/rejected": -0.2375487983226776, "logps/chosen": -23.9033203125, "logps/rejected": -31.302045822143555, "loss": 0.8273, "rewards/accuracies": 0.0, "rewards/chosen": 0.16683541238307953, "rewards/margins": -0.31341421604156494, "rewards/rejected": 0.4802496135234833, "step": 10941 }, { "epoch": 1.78, "learning_rate": 1.8468657504123286e-07, "logits/chosen": -0.555243730545044, "logits/rejected": -0.49923908710479736, "logps/chosen": -69.02287292480469, "logps/rejected": -127.08916473388672, "loss": 0.4185, "rewards/accuracies": 1.0, "rewards/chosen": 1.275262475013733, "rewards/margins": 0.38763201236724854, "rewards/rejected": 0.8876304626464844, "step": 10942 }, { "epoch": 1.78, "learning_rate": 1.8458458850038905e-07, "logits/chosen": -0.3105854094028473, "logits/rejected": -0.31020209193229675, "logps/chosen": -3.3612098693847656, "logps/rejected": -1.250651240348816, "loss": 0.6626, "rewards/accuracies": 0.0, "rewards/chosen": 0.05462508276104927, "rewards/margins": -0.10953260958194733, "rewards/rejected": 0.1641576886177063, "step": 10943 }, { "epoch": 1.78, "learning_rate": 1.844826237517655e-07, "logits/chosen": -1.1581147909164429, "logits/rejected": -1.136618733406067, "logps/chosen": -110.51406860351562, "logps/rejected": -110.51872253417969, "loss": 0.8423, "rewards/accuracies": 0.0, "rewards/chosen": 1.2069320678710938, "rewards/margins": -1.370457410812378, "rewards/rejected": 2.5773894786834717, "step": 10944 }, { "epoch": 1.78, "learning_rate": 1.8438068080240738e-07, "logits/chosen": -0.4557627737522125, "logits/rejected": -0.34428325295448303, "logps/chosen": -52.76615524291992, "logps/rejected": -36.2260627746582, "loss": 0.9676, "rewards/accuracies": 1.0, "rewards/chosen": 2.1235616207122803, "rewards/margins": 0.8561300039291382, "rewards/rejected": 1.267431616783142, "step": 10945 }, { "epoch": 1.78, "learning_rate": 1.8427875965935758e-07, "logits/chosen": -0.7061136364936829, "logits/rejected": -0.6310017704963684, "logps/chosen": -54.545204162597656, "logps/rejected": -49.3549690246582, "loss": 0.7044, "rewards/accuracies": 1.0, "rewards/chosen": 0.8594116568565369, "rewards/margins": 0.7228172421455383, "rewards/rejected": 0.13659439980983734, "step": 10946 }, { "epoch": 1.78, "learning_rate": 1.8417686032965828e-07, "logits/chosen": -0.842759907245636, "logits/rejected": -0.9283156991004944, "logps/chosen": -175.38180541992188, "logps/rejected": -140.58778381347656, "loss": 2.2441, "rewards/accuracies": 0.0, "rewards/chosen": 3.0406434535980225, "rewards/margins": -3.8101484775543213, "rewards/rejected": 6.850791931152344, "step": 10947 }, { "epoch": 1.78, "learning_rate": 1.840749828203495e-07, "logits/chosen": -0.9584366679191589, "logits/rejected": -0.9526366591453552, "logps/chosen": -43.743125915527344, "logps/rejected": -72.08128356933594, "loss": 0.3669, "rewards/accuracies": 0.0, "rewards/chosen": 1.4780701398849487, "rewards/margins": -0.052811384201049805, "rewards/rejected": 1.5308815240859985, "step": 10948 }, { "epoch": 1.78, "learning_rate": 1.8397312713847018e-07, "logits/chosen": -1.043890357017517, "logits/rejected": -1.0489782094955444, "logps/chosen": -96.66352844238281, "logps/rejected": -175.6898651123047, "loss": 2.727, "rewards/accuracies": 0.0, "rewards/chosen": 2.412118673324585, "rewards/margins": -5.120115280151367, "rewards/rejected": 7.532234191894531, "step": 10949 }, { "epoch": 1.78, "learning_rate": 1.8387129329105738e-07, "logits/chosen": -1.0408865213394165, "logits/rejected": -1.0355685949325562, "logps/chosen": -58.53872299194336, "logps/rejected": -81.13265991210938, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": 2.055109739303589, "rewards/margins": 0.6742854118347168, "rewards/rejected": 1.380824327468872, "step": 10950 }, { "epoch": 1.78, "learning_rate": 1.8376948128514714e-07, "logits/chosen": -0.6937808394432068, "logits/rejected": -0.6980543732643127, "logps/chosen": -83.9862289428711, "logps/rejected": -70.37718963623047, "loss": 0.5467, "rewards/accuracies": 0.0, "rewards/chosen": 2.3603341579437256, "rewards/margins": -0.6587944030761719, "rewards/rejected": 3.0191285610198975, "step": 10951 }, { "epoch": 1.78, "learning_rate": 1.836676911277733e-07, "logits/chosen": -0.7775830626487732, "logits/rejected": -0.8039467930793762, "logps/chosen": -43.82603073120117, "logps/rejected": -81.579345703125, "loss": 1.1165, "rewards/accuracies": 0.0, "rewards/chosen": 0.6842209100723267, "rewards/margins": -0.6801360845565796, "rewards/rejected": 1.3643569946289062, "step": 10952 }, { "epoch": 1.78, "learning_rate": 1.8356592282596912e-07, "logits/chosen": -1.0525972843170166, "logits/rejected": -1.0439389944076538, "logps/chosen": -94.33358764648438, "logps/rejected": -92.89166259765625, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 2.9699859619140625, "rewards/margins": 0.1706557273864746, "rewards/rejected": 2.799330234527588, "step": 10953 }, { "epoch": 1.78, "learning_rate": 1.834641763867653e-07, "logits/chosen": -1.2533705234527588, "logits/rejected": -1.1147042512893677, "logps/chosen": -102.14306640625, "logps/rejected": -57.7417106628418, "loss": 0.4645, "rewards/accuracies": 0.0, "rewards/chosen": 1.452815294265747, "rewards/margins": -0.316158652305603, "rewards/rejected": 1.76897394657135, "step": 10954 }, { "epoch": 1.78, "learning_rate": 1.8336245181719202e-07, "logits/chosen": -0.8157570958137512, "logits/rejected": -0.5890145301818848, "logps/chosen": -153.806884765625, "logps/rejected": -192.53775024414062, "loss": 0.1675, "rewards/accuracies": 1.0, "rewards/chosen": 4.206897258758545, "rewards/margins": 1.3706178665161133, "rewards/rejected": 2.8362793922424316, "step": 10955 }, { "epoch": 1.78, "learning_rate": 1.8326074912427702e-07, "logits/chosen": -0.9109443426132202, "logits/rejected": -0.8659747838973999, "logps/chosen": -135.20864868164062, "logps/rejected": -164.8328857421875, "loss": 0.2297, "rewards/accuracies": 1.0, "rewards/chosen": 6.513291835784912, "rewards/margins": 0.5426254272460938, "rewards/rejected": 5.970666408538818, "step": 10956 }, { "epoch": 1.78, "learning_rate": 1.8315906831504751e-07, "logits/chosen": -0.8714759945869446, "logits/rejected": -0.9203869104385376, "logps/chosen": -72.18492126464844, "logps/rejected": -114.6729736328125, "loss": 0.4866, "rewards/accuracies": 0.0, "rewards/chosen": 1.9424362182617188, "rewards/margins": -0.40919947624206543, "rewards/rejected": 2.351635694503784, "step": 10957 }, { "epoch": 1.78, "learning_rate": 1.830574093965282e-07, "logits/chosen": -0.9410257339477539, "logits/rejected": -0.7918442487716675, "logps/chosen": -121.86122131347656, "logps/rejected": -21.158151626586914, "loss": 0.0777, "rewards/accuracies": 1.0, "rewards/chosen": 2.758439779281616, "rewards/margins": 2.396775484085083, "rewards/rejected": 0.36166438460350037, "step": 10958 }, { "epoch": 1.78, "learning_rate": 1.829557723757431e-07, "logits/chosen": -1.1310707330703735, "logits/rejected": -1.0677756071090698, "logps/chosen": -94.32994842529297, "logps/rejected": -190.72354125976562, "loss": 1.137, "rewards/accuracies": 0.0, "rewards/chosen": 1.482338786125183, "rewards/margins": -1.9840689897537231, "rewards/rejected": 3.4664077758789062, "step": 10959 }, { "epoch": 1.78, "learning_rate": 1.8285415725971403e-07, "logits/chosen": -0.8174588680267334, "logits/rejected": -0.7626755833625793, "logps/chosen": -66.05667114257812, "logps/rejected": -72.40019226074219, "loss": 0.8027, "rewards/accuracies": 0.0, "rewards/chosen": 1.6681228876113892, "rewards/margins": -1.1876343488693237, "rewards/rejected": 2.855757236480713, "step": 10960 }, { "epoch": 1.78, "learning_rate": 1.8275256405546208e-07, "logits/chosen": -0.685066282749176, "logits/rejected": -0.6415833830833435, "logps/chosen": -74.39604187011719, "logps/rejected": -60.941612243652344, "loss": 0.7849, "rewards/accuracies": 1.0, "rewards/chosen": 1.7320114374160767, "rewards/margins": 0.7166633605957031, "rewards/rejected": 1.0153480768203735, "step": 10961 }, { "epoch": 1.78, "learning_rate": 1.826509927700061e-07, "logits/chosen": -0.6444399952888489, "logits/rejected": -0.6190049648284912, "logps/chosen": -109.57524108886719, "logps/rejected": -74.79737854003906, "loss": 1.7345, "rewards/accuracies": 0.0, "rewards/chosen": 0.04831848293542862, "rewards/margins": -1.3168426752090454, "rewards/rejected": 1.3651611804962158, "step": 10962 }, { "epoch": 1.78, "learning_rate": 1.8254944341036356e-07, "logits/chosen": -0.39055031538009644, "logits/rejected": -0.40783557295799255, "logps/chosen": -58.95484161376953, "logps/rejected": -87.59430694580078, "loss": 0.8317, "rewards/accuracies": 0.0, "rewards/chosen": 0.33597490191459656, "rewards/margins": -0.47366026043891907, "rewards/rejected": 0.8096351623535156, "step": 10963 }, { "epoch": 1.78, "learning_rate": 1.8244791598355092e-07, "logits/chosen": -0.6951825022697449, "logits/rejected": -0.7710645794868469, "logps/chosen": -158.21214294433594, "logps/rejected": -116.58053588867188, "loss": 1.6205, "rewards/accuracies": 0.0, "rewards/chosen": 4.076048374176025, "rewards/margins": -3.164689540863037, "rewards/rejected": 7.2407379150390625, "step": 10964 }, { "epoch": 1.78, "learning_rate": 1.823464104965824e-07, "logits/chosen": -0.679344117641449, "logits/rejected": -0.6545221209526062, "logps/chosen": -36.28282928466797, "logps/rejected": -65.82530975341797, "loss": 0.741, "rewards/accuracies": 0.0, "rewards/chosen": 2.171245574951172, "rewards/margins": -0.6840934753417969, "rewards/rejected": 2.8553390502929688, "step": 10965 }, { "epoch": 1.78, "learning_rate": 1.8224492695647143e-07, "logits/chosen": -0.8465536832809448, "logits/rejected": -0.8465536832809448, "logps/chosen": -68.52447509765625, "logps/rejected": -68.52447509765625, "loss": 0.6528, "rewards/accuracies": 0.0, "rewards/chosen": 2.391270399093628, "rewards/margins": 0.0, "rewards/rejected": 2.391270399093628, "step": 10966 }, { "epoch": 1.78, "learning_rate": 1.8214346537022917e-07, "logits/chosen": -0.5039889812469482, "logits/rejected": -0.4799516201019287, "logps/chosen": -75.63768005371094, "logps/rejected": -46.563499450683594, "loss": 0.4343, "rewards/accuracies": 1.0, "rewards/chosen": 1.2819687128067017, "rewards/margins": 0.4087226986885071, "rewards/rejected": 0.8732460141181946, "step": 10967 }, { "epoch": 1.78, "learning_rate": 1.820420257448661e-07, "logits/chosen": -0.7280999422073364, "logits/rejected": -0.6850290894508362, "logps/chosen": -92.7647705078125, "logps/rejected": -54.603031158447266, "loss": 1.1837, "rewards/accuracies": 0.0, "rewards/chosen": 1.8913520574569702, "rewards/margins": -0.03745460510253906, "rewards/rejected": 1.9288066625595093, "step": 10968 }, { "epoch": 1.78, "learning_rate": 1.8194060808739025e-07, "logits/chosen": -0.8210909962654114, "logits/rejected": -0.7581952810287476, "logps/chosen": -76.81819152832031, "logps/rejected": -66.04971313476562, "loss": 0.6211, "rewards/accuracies": 0.0, "rewards/chosen": 1.5524040460586548, "rewards/margins": -0.675979733467102, "rewards/rejected": 2.228383779525757, "step": 10969 }, { "epoch": 1.78, "learning_rate": 1.8183921240480908e-07, "logits/chosen": -0.8448782563209534, "logits/rejected": -0.8814723491668701, "logps/chosen": -87.24285125732422, "logps/rejected": -131.1410369873047, "loss": 1.3418, "rewards/accuracies": 0.0, "rewards/chosen": 1.2463455200195312, "rewards/margins": -0.5525543689727783, "rewards/rejected": 1.7988998889923096, "step": 10970 }, { "epoch": 1.78, "learning_rate": 1.8173783870412772e-07, "logits/chosen": -0.6621714234352112, "logits/rejected": -0.6285746693611145, "logps/chosen": -54.7549934387207, "logps/rejected": -107.51964569091797, "loss": 1.3949, "rewards/accuracies": 0.0, "rewards/chosen": 1.921718955039978, "rewards/margins": -0.7249752283096313, "rewards/rejected": 2.6466941833496094, "step": 10971 }, { "epoch": 1.78, "learning_rate": 1.8163648699235046e-07, "logits/chosen": -0.7381097078323364, "logits/rejected": -0.7480397820472717, "logps/chosen": -52.9829216003418, "logps/rejected": -90.65353393554688, "loss": 1.0577, "rewards/accuracies": 0.0, "rewards/chosen": 2.4635181427001953, "rewards/margins": -1.813039779663086, "rewards/rejected": 4.276557922363281, "step": 10972 }, { "epoch": 1.78, "learning_rate": 1.815351572764794e-07, "logits/chosen": -0.8783726096153259, "logits/rejected": -0.9361255168914795, "logps/chosen": -85.97190856933594, "logps/rejected": -49.112953186035156, "loss": 0.7711, "rewards/accuracies": 0.0, "rewards/chosen": 1.5344527959823608, "rewards/margins": -1.2483972311019897, "rewards/rejected": 2.7828500270843506, "step": 10973 }, { "epoch": 1.78, "learning_rate": 1.8143384956351576e-07, "logits/chosen": -0.3523256182670593, "logits/rejected": -0.36878320574760437, "logps/chosen": -12.92489242553711, "logps/rejected": -4.310225486755371, "loss": 0.4874, "rewards/accuracies": 0.0, "rewards/chosen": -0.08475904911756516, "rewards/margins": -0.43833544850349426, "rewards/rejected": 0.3535763919353485, "step": 10974 }, { "epoch": 1.78, "learning_rate": 1.813325638604587e-07, "logits/chosen": -0.6913871169090271, "logits/rejected": -0.654202401638031, "logps/chosen": -46.04119873046875, "logps/rejected": -21.734439849853516, "loss": 0.3147, "rewards/accuracies": 1.0, "rewards/chosen": 0.5977546572685242, "rewards/margins": 0.4766567051410675, "rewards/rejected": 0.12109794467687607, "step": 10975 }, { "epoch": 1.78, "learning_rate": 1.8123130017430632e-07, "logits/chosen": -0.9180803298950195, "logits/rejected": -0.6052102446556091, "logps/chosen": -132.2279052734375, "logps/rejected": -60.13191604614258, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": 4.870677471160889, "rewards/margins": 4.519497394561768, "rewards/rejected": 0.3511802852153778, "step": 10976 }, { "epoch": 1.78, "learning_rate": 1.8113005851205477e-07, "logits/chosen": -0.6204915046691895, "logits/rejected": -0.6204915046691895, "logps/chosen": -81.42715454101562, "logps/rejected": -81.42715454101562, "loss": 0.5346, "rewards/accuracies": 0.0, "rewards/chosen": 1.7410682439804077, "rewards/margins": 0.0, "rewards/rejected": 1.7410682439804077, "step": 10977 }, { "epoch": 1.78, "learning_rate": 1.8102883888069914e-07, "logits/chosen": -0.8669675588607788, "logits/rejected": -1.0609056949615479, "logps/chosen": -148.64315795898438, "logps/rejected": -116.69905853271484, "loss": 0.6334, "rewards/accuracies": 1.0, "rewards/chosen": 4.731808662414551, "rewards/margins": 1.0643441677093506, "rewards/rejected": 3.6674644947052, "step": 10978 }, { "epoch": 1.78, "learning_rate": 1.8092764128723243e-07, "logits/chosen": -0.628059446811676, "logits/rejected": -0.4541483223438263, "logps/chosen": -150.9854736328125, "logps/rejected": -116.38581848144531, "loss": 0.6286, "rewards/accuracies": 1.0, "rewards/chosen": 4.5355224609375, "rewards/margins": 2.5893845558166504, "rewards/rejected": 1.9461380243301392, "step": 10979 }, { "epoch": 1.78, "learning_rate": 1.808264657386468e-07, "logits/chosen": -0.3947618901729584, "logits/rejected": -0.4472258687019348, "logps/chosen": -63.963706970214844, "logps/rejected": -41.62605667114258, "loss": 1.8753, "rewards/accuracies": 0.0, "rewards/chosen": 0.8496170043945312, "rewards/margins": -0.1386200189590454, "rewards/rejected": 0.9882370233535767, "step": 10980 }, { "epoch": 1.78, "learning_rate": 1.8072531224193216e-07, "logits/chosen": -0.3173843026161194, "logits/rejected": -0.44050145149230957, "logps/chosen": -85.94306182861328, "logps/rejected": -98.84170532226562, "loss": 2.0232, "rewards/accuracies": 0.0, "rewards/chosen": 0.6482864618301392, "rewards/margins": -2.4054059982299805, "rewards/rejected": 3.053692579269409, "step": 10981 }, { "epoch": 1.78, "learning_rate": 1.8062418080407758e-07, "logits/chosen": -0.5654017925262451, "logits/rejected": -0.5654017925262451, "logps/chosen": -60.18470764160156, "logps/rejected": -60.18470764160156, "loss": 0.4337, "rewards/accuracies": 0.0, "rewards/chosen": 2.322103261947632, "rewards/margins": 0.0, "rewards/rejected": 2.322103261947632, "step": 10982 }, { "epoch": 1.78, "learning_rate": 1.8052307143207008e-07, "logits/chosen": -0.350150465965271, "logits/rejected": -0.350150465965271, "logps/chosen": -0.9666997194290161, "logps/rejected": -0.9666997194290161, "loss": 0.3608, "rewards/accuracies": 0.0, "rewards/chosen": 0.46688205003738403, "rewards/margins": 0.0, "rewards/rejected": 0.46688205003738403, "step": 10983 }, { "epoch": 1.78, "learning_rate": 1.804219841328955e-07, "logits/chosen": -0.3711073696613312, "logits/rejected": -0.4008946418762207, "logps/chosen": -110.93429565429688, "logps/rejected": -84.12783813476562, "loss": 0.2342, "rewards/accuracies": 1.0, "rewards/chosen": 2.1407792568206787, "rewards/margins": 1.0501748323440552, "rewards/rejected": 1.0906044244766235, "step": 10984 }, { "epoch": 1.78, "learning_rate": 1.8032091891353778e-07, "logits/chosen": -0.7376312613487244, "logits/rejected": -0.5442023277282715, "logps/chosen": -58.55609130859375, "logps/rejected": -22.036113739013672, "loss": 1.1911, "rewards/accuracies": 1.0, "rewards/chosen": 1.7860993146896362, "rewards/margins": 1.574341893196106, "rewards/rejected": 0.21175746619701385, "step": 10985 }, { "epoch": 1.78, "learning_rate": 1.8021987578097992e-07, "logits/chosen": -0.905092716217041, "logits/rejected": -0.7629544138908386, "logps/chosen": -97.98371124267578, "logps/rejected": -96.85707092285156, "loss": 0.0565, "rewards/accuracies": 1.0, "rewards/chosen": 5.253513336181641, "rewards/margins": 3.2898430824279785, "rewards/rejected": 1.9636703729629517, "step": 10986 }, { "epoch": 1.78, "learning_rate": 1.8011885474220267e-07, "logits/chosen": -0.056075289845466614, "logits/rejected": -0.056075289845466614, "logps/chosen": -77.19052124023438, "logps/rejected": -77.19052124023438, "loss": 2.3532, "rewards/accuracies": 0.0, "rewards/chosen": 0.03570861741900444, "rewards/margins": 0.0, "rewards/rejected": 0.03570861741900444, "step": 10987 }, { "epoch": 1.78, "learning_rate": 1.8001785580418604e-07, "logits/chosen": -0.47739821672439575, "logits/rejected": -0.44852906465530396, "logps/chosen": -40.20515441894531, "logps/rejected": -19.581642150878906, "loss": 0.2795, "rewards/accuracies": 1.0, "rewards/chosen": 0.5947815179824829, "rewards/margins": 0.4147768020629883, "rewards/rejected": 0.18000470101833344, "step": 10988 }, { "epoch": 1.78, "learning_rate": 1.7991687897390767e-07, "logits/chosen": -0.734785258769989, "logits/rejected": -0.7604174613952637, "logps/chosen": -125.3871841430664, "logps/rejected": -115.35029602050781, "loss": 0.4637, "rewards/accuracies": 0.0, "rewards/chosen": 5.706218719482422, "rewards/margins": -0.352053165435791, "rewards/rejected": 6.058271884918213, "step": 10989 }, { "epoch": 1.78, "learning_rate": 1.798159242583443e-07, "logits/chosen": -0.3739128112792969, "logits/rejected": -0.35282430052757263, "logps/chosen": -77.87063598632812, "logps/rejected": -20.766277313232422, "loss": 0.3741, "rewards/accuracies": 0.0, "rewards/chosen": 0.3027351498603821, "rewards/margins": -0.0011190474033355713, "rewards/rejected": 0.30385419726371765, "step": 10990 }, { "epoch": 1.78, "learning_rate": 1.7971499166447113e-07, "logits/chosen": -0.4286929965019226, "logits/rejected": -0.5114750862121582, "logps/chosen": -39.98543167114258, "logps/rejected": -96.5535888671875, "loss": 0.6538, "rewards/accuracies": 0.0, "rewards/chosen": 1.5788929462432861, "rewards/margins": -0.35680198669433594, "rewards/rejected": 1.935694932937622, "step": 10991 }, { "epoch": 1.78, "learning_rate": 1.796140811992613e-07, "logits/chosen": -0.8345722556114197, "logits/rejected": -0.7852335572242737, "logps/chosen": -28.130887985229492, "logps/rejected": -102.17417907714844, "loss": 0.7168, "rewards/accuracies": 0.0, "rewards/chosen": 2.329345226287842, "rewards/margins": -0.16210007667541504, "rewards/rejected": 2.491445302963257, "step": 10992 }, { "epoch": 1.78, "learning_rate": 1.795131928696871e-07, "logits/chosen": 0.030649861320853233, "logits/rejected": 0.02767624333500862, "logps/chosen": -3.7352936267852783, "logps/rejected": -1.253027319908142, "loss": 0.7284, "rewards/accuracies": 0.0, "rewards/chosen": 0.4479019343852997, "rewards/margins": -0.13634273409843445, "rewards/rejected": 0.5842446684837341, "step": 10993 }, { "epoch": 1.78, "learning_rate": 1.794123266827186e-07, "logits/chosen": -0.5750899314880371, "logits/rejected": -0.5854392647743225, "logps/chosen": -121.40791320800781, "logps/rejected": -193.26100158691406, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 6.251521587371826, "rewards/margins": 0.07355833053588867, "rewards/rejected": 6.1779632568359375, "step": 10994 }, { "epoch": 1.78, "learning_rate": 1.7931148264532514e-07, "logits/chosen": -0.49058446288108826, "logits/rejected": -0.49247416853904724, "logps/chosen": -79.13799285888672, "logps/rejected": -30.81496810913086, "loss": 0.573, "rewards/accuracies": 1.0, "rewards/chosen": 2.3520500659942627, "rewards/margins": 0.9176162481307983, "rewards/rejected": 1.4344338178634644, "step": 10995 }, { "epoch": 1.78, "learning_rate": 1.792106607644736e-07, "logits/chosen": -0.581226646900177, "logits/rejected": -0.5583881735801697, "logps/chosen": -72.67446899414062, "logps/rejected": -89.85986328125, "loss": 0.2583, "rewards/accuracies": 1.0, "rewards/chosen": 1.9612442255020142, "rewards/margins": 0.70477294921875, "rewards/rejected": 1.2564712762832642, "step": 10996 }, { "epoch": 1.78, "learning_rate": 1.7910986104713028e-07, "logits/chosen": -1.19049870967865, "logits/rejected": -1.068837285041809, "logps/chosen": -140.17874145507812, "logps/rejected": -22.41639518737793, "loss": 0.0461, "rewards/accuracies": 1.0, "rewards/chosen": 4.628306865692139, "rewards/margins": 4.395845413208008, "rewards/rejected": 0.23246155679225922, "step": 10997 }, { "epoch": 1.79, "learning_rate": 1.790090835002591e-07, "logits/chosen": -0.4914869964122772, "logits/rejected": -0.4505678415298462, "logps/chosen": -90.62884521484375, "logps/rejected": -85.00755310058594, "loss": 1.2517, "rewards/accuracies": 0.0, "rewards/chosen": 0.03742370754480362, "rewards/margins": -1.2222121953964233, "rewards/rejected": 1.2596359252929688, "step": 10998 }, { "epoch": 1.79, "learning_rate": 1.7890832813082318e-07, "logits/chosen": -0.6974648237228394, "logits/rejected": -0.2830468714237213, "logps/chosen": -83.65745544433594, "logps/rejected": -102.16175842285156, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": 5.250199794769287, "rewards/margins": 1.0144529342651367, "rewards/rejected": 4.23574686050415, "step": 10999 }, { "epoch": 1.79, "learning_rate": 1.7880759494578341e-07, "logits/chosen": -0.6511287093162537, "logits/rejected": -0.642923891544342, "logps/chosen": -56.30055236816406, "logps/rejected": -20.697311401367188, "loss": 0.3441, "rewards/accuracies": 1.0, "rewards/chosen": 1.6116310358047485, "rewards/margins": 0.5564571619033813, "rewards/rejected": 1.0551738739013672, "step": 11000 }, { "epoch": 1.79, "learning_rate": 1.7870688395209983e-07, "logits/chosen": -0.49783816933631897, "logits/rejected": -0.4877561032772064, "logps/chosen": -23.21299934387207, "logps/rejected": -23.254634857177734, "loss": 0.9788, "rewards/accuracies": 0.0, "rewards/chosen": 0.1049184799194336, "rewards/margins": -0.25154438614845276, "rewards/rejected": 0.35646286606788635, "step": 11001 }, { "epoch": 1.79, "learning_rate": 1.7860619515673032e-07, "logits/chosen": -0.8264259099960327, "logits/rejected": -0.7331664562225342, "logps/chosen": -118.2617416381836, "logps/rejected": -121.99852752685547, "loss": 0.0551, "rewards/accuracies": 1.0, "rewards/chosen": 4.184433937072754, "rewards/margins": 2.539120674133301, "rewards/rejected": 1.6453132629394531, "step": 11002 }, { "epoch": 1.79, "learning_rate": 1.785055285666318e-07, "logits/chosen": -0.637885570526123, "logits/rejected": -0.6353196501731873, "logps/chosen": -53.648475646972656, "logps/rejected": -80.49197387695312, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 1.9689972400665283, "rewards/margins": 1.3122353553771973, "rewards/rejected": 0.6567619442939758, "step": 11003 }, { "epoch": 1.79, "learning_rate": 1.7840488418875914e-07, "logits/chosen": -0.6529398560523987, "logits/rejected": -0.6124076843261719, "logps/chosen": -16.629642486572266, "logps/rejected": -26.466999053955078, "loss": 0.572, "rewards/accuracies": 0.0, "rewards/chosen": 1.555405855178833, "rewards/margins": -0.061447858810424805, "rewards/rejected": 1.6168537139892578, "step": 11004 }, { "epoch": 1.79, "learning_rate": 1.7830426203006615e-07, "logits/chosen": -0.6977154016494751, "logits/rejected": -0.6459087133407593, "logps/chosen": -76.17111206054688, "logps/rejected": -84.33668518066406, "loss": 0.313, "rewards/accuracies": 1.0, "rewards/chosen": 2.1663100719451904, "rewards/margins": 0.26854240894317627, "rewards/rejected": 1.8977676630020142, "step": 11005 }, { "epoch": 1.79, "learning_rate": 1.782036620975046e-07, "logits/chosen": -0.6569836139678955, "logits/rejected": -0.5702270269393921, "logps/chosen": -86.40245819091797, "logps/rejected": -40.2026252746582, "loss": 0.697, "rewards/accuracies": 1.0, "rewards/chosen": 1.2337135076522827, "rewards/margins": 1.1305667161941528, "rewards/rejected": 0.1031467467546463, "step": 11006 }, { "epoch": 1.79, "learning_rate": 1.7810308439802529e-07, "logits/chosen": -1.0186086893081665, "logits/rejected": -0.9912916421890259, "logps/chosen": -161.7469024658203, "logps/rejected": -204.56817626953125, "loss": 2.4156, "rewards/accuracies": 0.0, "rewards/chosen": 3.192845106124878, "rewards/margins": -3.9193832874298096, "rewards/rejected": 7.1122283935546875, "step": 11007 }, { "epoch": 1.79, "learning_rate": 1.7800252893857682e-07, "logits/chosen": -0.16312342882156372, "logits/rejected": -0.8297048807144165, "logps/chosen": -15.386941909790039, "logps/rejected": -67.05455780029297, "loss": 0.9387, "rewards/accuracies": 0.0, "rewards/chosen": 0.6906988024711609, "rewards/margins": -1.5738847255706787, "rewards/rejected": 2.2645835876464844, "step": 11008 }, { "epoch": 1.79, "learning_rate": 1.7790199572610704e-07, "logits/chosen": -0.8720962405204773, "logits/rejected": -0.8125296235084534, "logps/chosen": -66.59425354003906, "logps/rejected": -83.97264099121094, "loss": 1.2684, "rewards/accuracies": 1.0, "rewards/chosen": 1.5903793573379517, "rewards/margins": 0.13055729866027832, "rewards/rejected": 1.4598220586776733, "step": 11009 }, { "epoch": 1.79, "learning_rate": 1.7780148476756146e-07, "logits/chosen": -1.013826847076416, "logits/rejected": -1.0155038833618164, "logps/chosen": -94.40530395507812, "logps/rejected": -76.21573638916016, "loss": 0.7689, "rewards/accuracies": 0.0, "rewards/chosen": 1.0114609003067017, "rewards/margins": -1.2690285444259644, "rewards/rejected": 2.280489444732666, "step": 11010 }, { "epoch": 1.79, "learning_rate": 1.7770099606988482e-07, "logits/chosen": -0.6029720306396484, "logits/rejected": -0.6203072667121887, "logps/chosen": -77.26641845703125, "logps/rejected": -69.52839660644531, "loss": 0.7055, "rewards/accuracies": 0.0, "rewards/chosen": 1.1703903675079346, "rewards/margins": -0.2861891984939575, "rewards/rejected": 1.456579566001892, "step": 11011 }, { "epoch": 1.79, "learning_rate": 1.7760052964001949e-07, "logits/chosen": -0.6934388875961304, "logits/rejected": -0.6706127524375916, "logps/chosen": -226.38583374023438, "logps/rejected": -127.3419189453125, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": 4.759472846984863, "rewards/margins": 3.841623067855835, "rewards/rejected": 0.9178497195243835, "step": 11012 }, { "epoch": 1.79, "learning_rate": 1.7750008548490718e-07, "logits/chosen": -0.7118868231773376, "logits/rejected": -0.6061245203018188, "logps/chosen": -62.51599884033203, "logps/rejected": -28.757183074951172, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 1.7222603559494019, "rewards/margins": 1.7051975727081299, "rewards/rejected": 0.017062759026885033, "step": 11013 }, { "epoch": 1.79, "learning_rate": 1.7739966361148728e-07, "logits/chosen": -0.5893094539642334, "logits/rejected": -0.5632880330085754, "logps/chosen": -34.14659118652344, "logps/rejected": -23.147544860839844, "loss": 0.7691, "rewards/accuracies": 0.0, "rewards/chosen": 1.624457597732544, "rewards/margins": -0.1900489330291748, "rewards/rejected": 1.8145065307617188, "step": 11014 }, { "epoch": 1.79, "learning_rate": 1.7729926402669838e-07, "logits/chosen": -1.1371694803237915, "logits/rejected": -1.0615241527557373, "logps/chosen": -149.91250610351562, "logps/rejected": -81.55886840820312, "loss": 0.1073, "rewards/accuracies": 1.0, "rewards/chosen": 5.693394660949707, "rewards/margins": 2.8851449489593506, "rewards/rejected": 2.8082497119903564, "step": 11015 }, { "epoch": 1.79, "learning_rate": 1.7719888673747668e-07, "logits/chosen": -0.36498504877090454, "logits/rejected": -0.40177804231643677, "logps/chosen": -50.75366973876953, "logps/rejected": -49.08997344970703, "loss": 1.0893, "rewards/accuracies": 0.0, "rewards/chosen": 1.7088546752929688, "rewards/margins": -0.8707230091094971, "rewards/rejected": 2.579577684402466, "step": 11016 }, { "epoch": 1.79, "learning_rate": 1.7709853175075768e-07, "logits/chosen": -0.39525100588798523, "logits/rejected": -0.39525100588798523, "logps/chosen": -1.6893572807312012, "logps/rejected": -1.6893572807312012, "loss": 0.5674, "rewards/accuracies": 0.0, "rewards/chosen": 0.27193203568458557, "rewards/margins": 0.0, "rewards/rejected": 0.27193203568458557, "step": 11017 }, { "epoch": 1.79, "learning_rate": 1.769981990734747e-07, "logits/chosen": -0.7344767451286316, "logits/rejected": -0.7344767451286316, "logps/chosen": -93.20026397705078, "logps/rejected": -93.20026397705078, "loss": 0.3484, "rewards/accuracies": 0.0, "rewards/chosen": 2.651186466217041, "rewards/margins": 0.0, "rewards/rejected": 2.651186466217041, "step": 11018 }, { "epoch": 1.79, "learning_rate": 1.7689788871256e-07, "logits/chosen": -0.11836361885070801, "logits/rejected": -0.14285792410373688, "logps/chosen": -68.08822631835938, "logps/rejected": -60.84685516357422, "loss": 0.7294, "rewards/accuracies": 0.0, "rewards/chosen": 0.7349769473075867, "rewards/margins": -1.0441772937774658, "rewards/rejected": 1.7791541814804077, "step": 11019 }, { "epoch": 1.79, "learning_rate": 1.7679760067494388e-07, "logits/chosen": -1.090636968612671, "logits/rejected": -1.0534456968307495, "logps/chosen": -74.03130340576172, "logps/rejected": -8.571045875549316, "loss": 1.8636, "rewards/accuracies": 1.0, "rewards/chosen": 1.5863579511642456, "rewards/margins": 1.2970012426376343, "rewards/rejected": 0.28935670852661133, "step": 11020 }, { "epoch": 1.79, "learning_rate": 1.7669733496755552e-07, "logits/chosen": -0.8445821404457092, "logits/rejected": -0.7787126302719116, "logps/chosen": -49.36223602294922, "logps/rejected": -75.12545013427734, "loss": 1.4896, "rewards/accuracies": 1.0, "rewards/chosen": 2.9672799110412598, "rewards/margins": 0.3876359462738037, "rewards/rejected": 2.579643964767456, "step": 11021 }, { "epoch": 1.79, "learning_rate": 1.76597091597322e-07, "logits/chosen": -0.8664221167564392, "logits/rejected": -0.7577369809150696, "logps/chosen": -82.4030990600586, "logps/rejected": -79.1185531616211, "loss": 0.1867, "rewards/accuracies": 1.0, "rewards/chosen": 4.057267189025879, "rewards/margins": 1.7885072231292725, "rewards/rejected": 2.2687599658966064, "step": 11022 }, { "epoch": 1.79, "learning_rate": 1.7649687057116956e-07, "logits/chosen": -0.5431740880012512, "logits/rejected": -0.5261936187744141, "logps/chosen": -30.686063766479492, "logps/rejected": -68.61659240722656, "loss": 0.7127, "rewards/accuracies": 1.0, "rewards/chosen": 1.6932398080825806, "rewards/margins": 0.388022780418396, "rewards/rejected": 1.3052170276641846, "step": 11023 }, { "epoch": 1.79, "learning_rate": 1.763966718960222e-07, "logits/chosen": -0.7427470088005066, "logits/rejected": -0.6780597567558289, "logps/chosen": -70.87733459472656, "logps/rejected": -24.914810180664062, "loss": 0.9513, "rewards/accuracies": 1.0, "rewards/chosen": 2.932438611984253, "rewards/margins": 1.6170276403427124, "rewards/rejected": 1.3154109716415405, "step": 11024 }, { "epoch": 1.79, "learning_rate": 1.7629649557880284e-07, "logits/chosen": -0.4918660819530487, "logits/rejected": -0.4403526782989502, "logps/chosen": -99.81414794921875, "logps/rejected": -54.52913284301758, "loss": 1.0808, "rewards/accuracies": 0.0, "rewards/chosen": 0.1199188232421875, "rewards/margins": -0.9754269123077393, "rewards/rejected": 1.0953457355499268, "step": 11025 }, { "epoch": 1.79, "learning_rate": 1.7619634162643287e-07, "logits/chosen": -0.9951204657554626, "logits/rejected": -0.9774460792541504, "logps/chosen": -64.54170227050781, "logps/rejected": -66.69062042236328, "loss": 1.2725, "rewards/accuracies": 0.0, "rewards/chosen": 2.17472243309021, "rewards/margins": -1.2450599670410156, "rewards/rejected": 3.4197824001312256, "step": 11026 }, { "epoch": 1.79, "learning_rate": 1.760962100458317e-07, "logits/chosen": -0.5910750031471252, "logits/rejected": -0.597100019454956, "logps/chosen": -42.020599365234375, "logps/rejected": -105.81936645507812, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": 1.2214069366455078, "rewards/margins": 0.40621525049209595, "rewards/rejected": 0.8151916861534119, "step": 11027 }, { "epoch": 1.79, "learning_rate": 1.759961008439178e-07, "logits/chosen": -1.233790636062622, "logits/rejected": -1.2729429006576538, "logps/chosen": -45.62920379638672, "logps/rejected": -89.364990234375, "loss": 0.7299, "rewards/accuracies": 0.0, "rewards/chosen": 2.0611908435821533, "rewards/margins": -0.3295912742614746, "rewards/rejected": 2.390782117843628, "step": 11028 }, { "epoch": 1.79, "learning_rate": 1.7589601402760734e-07, "logits/chosen": -0.5138157606124878, "logits/rejected": -0.5138157606124878, "logps/chosen": -62.834320068359375, "logps/rejected": -62.834320068359375, "loss": 0.683, "rewards/accuracies": 0.0, "rewards/chosen": 1.4576538801193237, "rewards/margins": 0.0, "rewards/rejected": 1.4576538801193237, "step": 11029 }, { "epoch": 1.79, "learning_rate": 1.7579594960381582e-07, "logits/chosen": -0.6656073927879333, "logits/rejected": -0.6528110504150391, "logps/chosen": -105.80961608886719, "logps/rejected": -58.305259704589844, "loss": 0.6071, "rewards/accuracies": 0.0, "rewards/chosen": 1.351782202720642, "rewards/margins": -0.8166786432266235, "rewards/rejected": 2.1684608459472656, "step": 11030 }, { "epoch": 1.79, "learning_rate": 1.7569590757945634e-07, "logits/chosen": -0.6241623759269714, "logits/rejected": -0.572486162185669, "logps/chosen": -64.85343933105469, "logps/rejected": -74.38798522949219, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": 2.3947060108184814, "rewards/margins": 0.2977592945098877, "rewards/rejected": 2.0969467163085938, "step": 11031 }, { "epoch": 1.79, "learning_rate": 1.7559588796144126e-07, "logits/chosen": -0.9401872158050537, "logits/rejected": -0.9068288803100586, "logps/chosen": -239.06939697265625, "logps/rejected": -72.98959350585938, "loss": 0.4821, "rewards/accuracies": 0.0, "rewards/chosen": 1.9246704578399658, "rewards/margins": -0.08119583129882812, "rewards/rejected": 2.005866289138794, "step": 11032 }, { "epoch": 1.79, "learning_rate": 1.7549589075668057e-07, "logits/chosen": -0.7107284069061279, "logits/rejected": -0.6603150963783264, "logps/chosen": -138.74826049804688, "logps/rejected": -125.46515655517578, "loss": 4.6492, "rewards/accuracies": 0.0, "rewards/chosen": 1.367346167564392, "rewards/margins": -4.215146541595459, "rewards/rejected": 5.582492828369141, "step": 11033 }, { "epoch": 1.79, "learning_rate": 1.753959159720836e-07, "logits/chosen": -0.5680192708969116, "logits/rejected": -0.5917657613754272, "logps/chosen": -12.194106101989746, "logps/rejected": -2.5708415508270264, "loss": 0.9202, "rewards/accuracies": 0.0, "rewards/chosen": -0.10614795982837677, "rewards/margins": -0.33571553230285645, "rewards/rejected": 0.22956755757331848, "step": 11034 }, { "epoch": 1.79, "learning_rate": 1.7529596361455717e-07, "logits/chosen": -0.5065650939941406, "logits/rejected": -0.4279825687408447, "logps/chosen": -63.34069061279297, "logps/rejected": -91.43202209472656, "loss": 0.5907, "rewards/accuracies": 0.0, "rewards/chosen": 1.4887688159942627, "rewards/margins": -0.2252722978591919, "rewards/rejected": 1.7140411138534546, "step": 11035 }, { "epoch": 1.79, "learning_rate": 1.7519603369100743e-07, "logits/chosen": -0.6301791071891785, "logits/rejected": -0.6261200904846191, "logps/chosen": -204.2550048828125, "logps/rejected": -146.07916259765625, "loss": 0.3498, "rewards/accuracies": 1.0, "rewards/chosen": 5.6075897216796875, "rewards/margins": 0.00402069091796875, "rewards/rejected": 5.603569030761719, "step": 11036 }, { "epoch": 1.79, "learning_rate": 1.750961262083383e-07, "logits/chosen": -0.7872645854949951, "logits/rejected": -0.7845374345779419, "logps/chosen": -163.32208251953125, "logps/rejected": -83.90943145751953, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 2.517953634262085, "rewards/margins": 1.8523484468460083, "rewards/rejected": 0.6656051874160767, "step": 11037 }, { "epoch": 1.79, "learning_rate": 1.7499624117345275e-07, "logits/chosen": -0.7773325443267822, "logits/rejected": -0.5333781242370605, "logps/chosen": -138.50717163085938, "logps/rejected": -32.548545837402344, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 7.236278057098389, "rewards/margins": 5.587465763092041, "rewards/rejected": 1.648812174797058, "step": 11038 }, { "epoch": 1.79, "learning_rate": 1.748963785932515e-07, "logits/chosen": -0.640476644039154, "logits/rejected": -0.7138217091560364, "logps/chosen": -51.406925201416016, "logps/rejected": -67.85588073730469, "loss": 0.8809, "rewards/accuracies": 0.0, "rewards/chosen": 1.7312793731689453, "rewards/margins": -0.7612683773040771, "rewards/rejected": 2.4925477504730225, "step": 11039 }, { "epoch": 1.79, "learning_rate": 1.7479653847463456e-07, "logits/chosen": -1.023600459098816, "logits/rejected": -0.9242682456970215, "logps/chosen": -153.1438751220703, "logps/rejected": -62.291690826416016, "loss": 0.9546, "rewards/accuracies": 1.0, "rewards/chosen": 4.867164611816406, "rewards/margins": 2.720756769180298, "rewards/rejected": 2.1464078426361084, "step": 11040 }, { "epoch": 1.79, "learning_rate": 1.746967208244995e-07, "logits/chosen": -0.21577325463294983, "logits/rejected": -0.21023103594779968, "logps/chosen": -36.93656921386719, "logps/rejected": -18.57122230529785, "loss": 0.9656, "rewards/accuracies": 1.0, "rewards/chosen": 0.4608684480190277, "rewards/margins": 0.569941520690918, "rewards/rejected": -0.10907306522130966, "step": 11041 }, { "epoch": 1.79, "learning_rate": 1.7459692564974315e-07, "logits/chosen": -0.3380959928035736, "logits/rejected": -0.33911848068237305, "logps/chosen": -3.867170810699463, "logps/rejected": -2.5835399627685547, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 0.34439706802368164, "rewards/margins": -0.14843103289604187, "rewards/rejected": 0.4928281009197235, "step": 11042 }, { "epoch": 1.79, "learning_rate": 1.7449715295726015e-07, "logits/chosen": -0.45956486463546753, "logits/rejected": -0.3623933792114258, "logps/chosen": -92.84859466552734, "logps/rejected": -69.60608673095703, "loss": 0.2687, "rewards/accuracies": 1.0, "rewards/chosen": 4.5006937980651855, "rewards/margins": 2.3901193141937256, "rewards/rejected": 2.11057448387146, "step": 11043 }, { "epoch": 1.79, "learning_rate": 1.7439740275394405e-07, "logits/chosen": -0.4932045042514801, "logits/rejected": -0.47382208704948425, "logps/chosen": -150.9325408935547, "logps/rejected": -125.92015075683594, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 5.2886857986450195, "rewards/margins": 0.8411955833435059, "rewards/rejected": 4.447490215301514, "step": 11044 }, { "epoch": 1.79, "learning_rate": 1.7429767504668637e-07, "logits/chosen": -0.6024499535560608, "logits/rejected": -0.6796782612800598, "logps/chosen": -95.49656677246094, "logps/rejected": -118.32025146484375, "loss": 2.2086, "rewards/accuracies": 0.0, "rewards/chosen": 2.2309632301330566, "rewards/margins": -3.0970749855041504, "rewards/rejected": 5.328038215637207, "step": 11045 }, { "epoch": 1.79, "learning_rate": 1.7419796984237766e-07, "logits/chosen": -1.0577058792114258, "logits/rejected": -0.8139392733573914, "logps/chosen": -123.84273529052734, "logps/rejected": -20.219722747802734, "loss": 0.0157, "rewards/accuracies": 1.0, "rewards/chosen": 4.939072608947754, "rewards/margins": 4.6430487632751465, "rewards/rejected": 0.2960239350795746, "step": 11046 }, { "epoch": 1.79, "learning_rate": 1.7409828714790636e-07, "logits/chosen": -0.5808932781219482, "logits/rejected": -0.6526042222976685, "logps/chosen": -75.2503662109375, "logps/rejected": -45.12623977661133, "loss": 0.9436, "rewards/accuracies": 0.0, "rewards/chosen": 1.5359611511230469, "rewards/margins": -0.770770788192749, "rewards/rejected": 2.306731939315796, "step": 11047 }, { "epoch": 1.79, "learning_rate": 1.7399862697015982e-07, "logits/chosen": -0.8637623190879822, "logits/rejected": -1.0720728635787964, "logps/chosen": -156.533203125, "logps/rejected": -92.05742645263672, "loss": 0.7852, "rewards/accuracies": 1.0, "rewards/chosen": 4.096866130828857, "rewards/margins": 0.037629127502441406, "rewards/rejected": 4.059237003326416, "step": 11048 }, { "epoch": 1.79, "learning_rate": 1.7389898931602336e-07, "logits/chosen": -0.8427116274833679, "logits/rejected": -0.7777827382087708, "logps/chosen": -69.73303985595703, "logps/rejected": -69.93141174316406, "loss": 0.4445, "rewards/accuracies": 0.0, "rewards/chosen": 1.7101433277130127, "rewards/margins": -0.013307929039001465, "rewards/rejected": 1.7234512567520142, "step": 11049 }, { "epoch": 1.79, "learning_rate": 1.7379937419238134e-07, "logits/chosen": -0.4181365966796875, "logits/rejected": -0.4347277879714966, "logps/chosen": -84.10969543457031, "logps/rejected": -100.56999206542969, "loss": 0.9954, "rewards/accuracies": 0.0, "rewards/chosen": 0.6505066156387329, "rewards/margins": -1.035627007484436, "rewards/rejected": 1.686133623123169, "step": 11050 }, { "epoch": 1.79, "learning_rate": 1.736997816061158e-07, "logits/chosen": -0.9561642408370972, "logits/rejected": -0.9082553386688232, "logps/chosen": -79.56572723388672, "logps/rejected": -75.70764923095703, "loss": 0.5404, "rewards/accuracies": 0.0, "rewards/chosen": 2.524752140045166, "rewards/margins": -0.6573638916015625, "rewards/rejected": 3.1821160316467285, "step": 11051 }, { "epoch": 1.79, "learning_rate": 1.7360021156410808e-07, "logits/chosen": -0.9758825898170471, "logits/rejected": -0.8156892657279968, "logps/chosen": -141.91258239746094, "logps/rejected": -61.322120666503906, "loss": 0.3359, "rewards/accuracies": 1.0, "rewards/chosen": 5.044835090637207, "rewards/margins": 2.4246773719787598, "rewards/rejected": 2.6201577186584473, "step": 11052 }, { "epoch": 1.79, "learning_rate": 1.7350066407323715e-07, "logits/chosen": -0.8831653594970703, "logits/rejected": -0.7980416417121887, "logps/chosen": -120.97343444824219, "logps/rejected": -77.82308959960938, "loss": 0.6218, "rewards/accuracies": 0.0, "rewards/chosen": 1.2742111682891846, "rewards/margins": -0.8802216053009033, "rewards/rejected": 2.154432773590088, "step": 11053 }, { "epoch": 1.79, "learning_rate": 1.7340113914038113e-07, "logits/chosen": -0.6031818985939026, "logits/rejected": -0.6031818985939026, "logps/chosen": -9.892061233520508, "logps/rejected": -9.892061233520508, "loss": 0.874, "rewards/accuracies": 0.0, "rewards/chosen": 0.8873712420463562, "rewards/margins": 0.0, "rewards/rejected": 0.8873712420463562, "step": 11054 }, { "epoch": 1.79, "learning_rate": 1.7330163677241588e-07, "logits/chosen": -0.7793711423873901, "logits/rejected": -0.5631921887397766, "logps/chosen": -120.9940185546875, "logps/rejected": -19.75409507751465, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 5.279611110687256, "rewards/margins": 5.141177654266357, "rewards/rejected": 0.13843365013599396, "step": 11055 }, { "epoch": 1.79, "learning_rate": 1.7320215697621648e-07, "logits/chosen": -0.24389687180519104, "logits/rejected": -0.3409841060638428, "logps/chosen": -52.007869720458984, "logps/rejected": -67.73613739013672, "loss": 1.0339, "rewards/accuracies": 0.0, "rewards/chosen": 0.7972130179405212, "rewards/margins": -0.9716903567314148, "rewards/rejected": 1.768903374671936, "step": 11056 }, { "epoch": 1.79, "learning_rate": 1.731026997586557e-07, "logits/chosen": -0.8308031558990479, "logits/rejected": -0.881022036075592, "logps/chosen": -100.73177337646484, "logps/rejected": -154.00628662109375, "loss": 0.8659, "rewards/accuracies": 0.0, "rewards/chosen": 6.2318339347839355, "rewards/margins": -0.7699241638183594, "rewards/rejected": 7.001758098602295, "step": 11057 }, { "epoch": 1.79, "learning_rate": 1.730032651266054e-07, "logits/chosen": -0.5523689389228821, "logits/rejected": -0.5399607419967651, "logps/chosen": -52.01148223876953, "logps/rejected": -82.05900573730469, "loss": 1.3117, "rewards/accuracies": 1.0, "rewards/chosen": 2.2423501014709473, "rewards/margins": 0.4864640235900879, "rewards/rejected": 1.7558860778808594, "step": 11058 }, { "epoch": 1.8, "learning_rate": 1.7290385308693522e-07, "logits/chosen": -0.9311310052871704, "logits/rejected": -0.8074525594711304, "logps/chosen": -140.96170043945312, "logps/rejected": -73.46231079101562, "loss": 0.1049, "rewards/accuracies": 1.0, "rewards/chosen": 5.261016845703125, "rewards/margins": 2.1814894676208496, "rewards/rejected": 3.0795273780822754, "step": 11059 }, { "epoch": 1.8, "learning_rate": 1.7280446364651375e-07, "logits/chosen": -0.5489341616630554, "logits/rejected": -0.5489341616630554, "logps/chosen": -29.847148895263672, "logps/rejected": -29.847148895263672, "loss": 1.2608, "rewards/accuracies": 0.0, "rewards/chosen": 1.1298809051513672, "rewards/margins": 0.0, "rewards/rejected": 1.1298809051513672, "step": 11060 }, { "epoch": 1.8, "learning_rate": 1.7270509681220808e-07, "logits/chosen": -0.5072834491729736, "logits/rejected": -0.49707305431365967, "logps/chosen": -46.83110809326172, "logps/rejected": -45.47369384765625, "loss": 0.3168, "rewards/accuracies": 1.0, "rewards/chosen": 1.4660770893096924, "rewards/margins": 0.13813936710357666, "rewards/rejected": 1.3279377222061157, "step": 11061 }, { "epoch": 1.8, "learning_rate": 1.7260575259088317e-07, "logits/chosen": -1.0693154335021973, "logits/rejected": -0.7750711441040039, "logps/chosen": -150.6304931640625, "logps/rejected": -92.96443939208984, "loss": 0.7463, "rewards/accuracies": 1.0, "rewards/chosen": 5.005261421203613, "rewards/margins": 1.5243356227874756, "rewards/rejected": 3.4809257984161377, "step": 11062 }, { "epoch": 1.8, "learning_rate": 1.7250643098940309e-07, "logits/chosen": -0.5216283202171326, "logits/rejected": -0.5216283202171326, "logps/chosen": -89.78246307373047, "logps/rejected": -89.78246307373047, "loss": 0.7294, "rewards/accuracies": 0.0, "rewards/chosen": 2.34907603263855, "rewards/margins": 0.0, "rewards/rejected": 2.34907603263855, "step": 11063 }, { "epoch": 1.8, "learning_rate": 1.724071320146297e-07, "logits/chosen": -0.28169694542884827, "logits/rejected": -0.2925623655319214, "logps/chosen": -109.35853576660156, "logps/rejected": -52.20692443847656, "loss": 0.5536, "rewards/accuracies": 0.0, "rewards/chosen": 1.5590142011642456, "rewards/margins": -0.6709641218185425, "rewards/rejected": 2.229978322982788, "step": 11064 }, { "epoch": 1.8, "learning_rate": 1.7230785567342392e-07, "logits/chosen": -0.9340104460716248, "logits/rejected": -0.874451756477356, "logps/chosen": -128.2142791748047, "logps/rejected": -117.36073303222656, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": 5.360234260559082, "rewards/margins": 3.9265947341918945, "rewards/rejected": 1.4336395263671875, "step": 11065 }, { "epoch": 1.8, "learning_rate": 1.7220860197264448e-07, "logits/chosen": -0.4950764775276184, "logits/rejected": -0.4950764775276184, "logps/chosen": -4.245034217834473, "logps/rejected": -4.245034217834473, "loss": 0.3474, "rewards/accuracies": 0.0, "rewards/chosen": 0.25440284609794617, "rewards/margins": 0.0, "rewards/rejected": 0.25440284609794617, "step": 11066 }, { "epoch": 1.8, "learning_rate": 1.721093709191493e-07, "logits/chosen": -0.9307170510292053, "logits/rejected": -0.8413705825805664, "logps/chosen": -94.59901428222656, "logps/rejected": -24.2608642578125, "loss": 0.1002, "rewards/accuracies": 1.0, "rewards/chosen": 2.084120988845825, "rewards/margins": 1.7226920127868652, "rewards/rejected": 0.36142903566360474, "step": 11067 }, { "epoch": 1.8, "learning_rate": 1.7201016251979394e-07, "logits/chosen": -1.108054757118225, "logits/rejected": -1.0588889122009277, "logps/chosen": -103.2982406616211, "logps/rejected": -25.938074111938477, "loss": 0.302, "rewards/accuracies": 1.0, "rewards/chosen": 2.0592172145843506, "rewards/margins": 2.0461137294769287, "rewards/rejected": 0.013103485107421875, "step": 11068 }, { "epoch": 1.8, "learning_rate": 1.7191097678143295e-07, "logits/chosen": -0.8099499344825745, "logits/rejected": -0.8099499344825745, "logps/chosen": -156.04608154296875, "logps/rejected": -156.04608154296875, "loss": 1.3867, "rewards/accuracies": 0.0, "rewards/chosen": 3.727813720703125, "rewards/margins": 0.0, "rewards/rejected": 3.727813720703125, "step": 11069 }, { "epoch": 1.8, "learning_rate": 1.718118137109189e-07, "logits/chosen": -0.43380415439605713, "logits/rejected": -1.0274572372436523, "logps/chosen": -69.21117401123047, "logps/rejected": -34.802032470703125, "loss": 0.0817, "rewards/accuracies": 1.0, "rewards/chosen": 2.0688469409942627, "rewards/margins": 1.8683029413223267, "rewards/rejected": 0.20054398477077484, "step": 11070 }, { "epoch": 1.8, "learning_rate": 1.717126733151033e-07, "logits/chosen": -0.8486126065254211, "logits/rejected": -0.8486126065254211, "logps/chosen": -53.77646255493164, "logps/rejected": -53.77646255493164, "loss": 0.9395, "rewards/accuracies": 0.0, "rewards/chosen": 1.1230617761611938, "rewards/margins": 0.0, "rewards/rejected": 1.1230617761611938, "step": 11071 }, { "epoch": 1.8, "learning_rate": 1.7161355560083556e-07, "logits/chosen": -0.9119635224342346, "logits/rejected": -0.8664514422416687, "logps/chosen": -102.34516906738281, "logps/rejected": -51.17333984375, "loss": 1.1927, "rewards/accuracies": 0.0, "rewards/chosen": -0.18002091348171234, "rewards/margins": -1.3157211542129517, "rewards/rejected": 1.1357002258300781, "step": 11072 }, { "epoch": 1.8, "learning_rate": 1.7151446057496404e-07, "logits/chosen": -0.5257312059402466, "logits/rejected": -0.5257312059402466, "logps/chosen": -29.98371696472168, "logps/rejected": -29.98371696472168, "loss": 0.3738, "rewards/accuracies": 0.0, "rewards/chosen": 1.440559983253479, "rewards/margins": 0.0, "rewards/rejected": 1.440559983253479, "step": 11073 }, { "epoch": 1.8, "learning_rate": 1.7141538824433505e-07, "logits/chosen": -0.7927528023719788, "logits/rejected": -0.73505038022995, "logps/chosen": -101.70854187011719, "logps/rejected": -85.57367706298828, "loss": 0.3745, "rewards/accuracies": 1.0, "rewards/chosen": 2.9224441051483154, "rewards/margins": 0.3492460250854492, "rewards/rejected": 2.573198080062866, "step": 11074 }, { "epoch": 1.8, "learning_rate": 1.713163386157937e-07, "logits/chosen": -0.7068760991096497, "logits/rejected": -0.6705816388130188, "logps/chosen": -170.64599609375, "logps/rejected": -123.33331298828125, "loss": 0.8356, "rewards/accuracies": 0.0, "rewards/chosen": 3.7000367641448975, "rewards/margins": -1.4506471157073975, "rewards/rejected": 5.150683879852295, "step": 11075 }, { "epoch": 1.8, "learning_rate": 1.712173116961832e-07, "logits/chosen": -0.5194751024246216, "logits/rejected": -0.4692031741142273, "logps/chosen": -98.75664520263672, "logps/rejected": -68.522216796875, "loss": 0.5992, "rewards/accuracies": 0.0, "rewards/chosen": 0.3654579222202301, "rewards/margins": -0.4462287724018097, "rewards/rejected": 0.8116866946220398, "step": 11076 }, { "epoch": 1.8, "learning_rate": 1.7111830749234567e-07, "logits/chosen": -0.938601553440094, "logits/rejected": -0.9893268346786499, "logps/chosen": -178.7148895263672, "logps/rejected": -148.384033203125, "loss": 1.4412, "rewards/accuracies": 1.0, "rewards/chosen": 5.257340908050537, "rewards/margins": 0.11390209197998047, "rewards/rejected": 5.143438816070557, "step": 11077 }, { "epoch": 1.8, "learning_rate": 1.71019326011121e-07, "logits/chosen": -0.9910066723823547, "logits/rejected": -1.006697654724121, "logps/chosen": -105.41681671142578, "logps/rejected": -218.50042724609375, "loss": 1.2311, "rewards/accuracies": 0.0, "rewards/chosen": 2.104464054107666, "rewards/margins": -1.4836599826812744, "rewards/rejected": 3.5881240367889404, "step": 11078 }, { "epoch": 1.8, "learning_rate": 1.709203672593482e-07, "logits/chosen": -0.8499178886413574, "logits/rejected": -0.767203152179718, "logps/chosen": -265.7198791503906, "logps/rejected": -65.23222351074219, "loss": 1.558, "rewards/accuracies": 0.0, "rewards/chosen": 0.19801636040210724, "rewards/margins": -1.8102142810821533, "rewards/rejected": 2.008230686187744, "step": 11079 }, { "epoch": 1.8, "learning_rate": 1.7082143124386411e-07, "logits/chosen": -1.1267547607421875, "logits/rejected": -1.1394612789154053, "logps/chosen": -94.37158203125, "logps/rejected": -72.5166015625, "loss": 0.633, "rewards/accuracies": 0.0, "rewards/chosen": 0.776519775390625, "rewards/margins": -0.010235607624053955, "rewards/rejected": 0.786755383014679, "step": 11080 }, { "epoch": 1.8, "learning_rate": 1.707225179715044e-07, "logits/chosen": -0.7706595659255981, "logits/rejected": -0.8150681853294373, "logps/chosen": -102.86109924316406, "logps/rejected": -89.53365325927734, "loss": 0.4416, "rewards/accuracies": 1.0, "rewards/chosen": 0.8733612298965454, "rewards/margins": 0.08578568696975708, "rewards/rejected": 0.7875755429267883, "step": 11081 }, { "epoch": 1.8, "learning_rate": 1.706236274491032e-07, "logits/chosen": -0.6433099508285522, "logits/rejected": -0.6279045939445496, "logps/chosen": -90.05671691894531, "logps/rejected": -145.65530395507812, "loss": 1.9752, "rewards/accuracies": 0.0, "rewards/chosen": 1.8555786609649658, "rewards/margins": -3.9052765369415283, "rewards/rejected": 5.760855197906494, "step": 11082 }, { "epoch": 1.8, "learning_rate": 1.7052475968349262e-07, "logits/chosen": -0.740631103515625, "logits/rejected": -0.7044835686683655, "logps/chosen": -155.87762451171875, "logps/rejected": -92.96926879882812, "loss": 1.0884, "rewards/accuracies": 0.0, "rewards/chosen": 1.3659149408340454, "rewards/margins": -2.011263370513916, "rewards/rejected": 3.377178192138672, "step": 11083 }, { "epoch": 1.8, "learning_rate": 1.7042591468150376e-07, "logits/chosen": -1.290201187133789, "logits/rejected": -1.248306155204773, "logps/chosen": -70.84173583984375, "logps/rejected": -28.392940521240234, "loss": 0.9975, "rewards/accuracies": 1.0, "rewards/chosen": 1.3164596557617188, "rewards/margins": 1.3691695928573608, "rewards/rejected": -0.05270996317267418, "step": 11084 }, { "epoch": 1.8, "learning_rate": 1.7032709244996556e-07, "logits/chosen": -0.42635080218315125, "logits/rejected": -0.463867723941803, "logps/chosen": -37.328514099121094, "logps/rejected": -47.263282775878906, "loss": 0.5945, "rewards/accuracies": 0.0, "rewards/chosen": 1.2376540899276733, "rewards/margins": -0.44133996963500977, "rewards/rejected": 1.678994059562683, "step": 11085 }, { "epoch": 1.8, "learning_rate": 1.7022829299570606e-07, "logits/chosen": -0.5279465317726135, "logits/rejected": -0.5336843132972717, "logps/chosen": -68.68010711669922, "logps/rejected": -44.349430084228516, "loss": 1.0395, "rewards/accuracies": 0.0, "rewards/chosen": 0.44447633624076843, "rewards/margins": -0.9775848388671875, "rewards/rejected": 1.4220612049102783, "step": 11086 }, { "epoch": 1.8, "learning_rate": 1.70129516325551e-07, "logits/chosen": -0.9569525718688965, "logits/rejected": -0.5955748558044434, "logps/chosen": -92.75724792480469, "logps/rejected": -116.65226745605469, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 4.4408769607543945, "rewards/margins": 0.8680450916290283, "rewards/rejected": 3.572831869125366, "step": 11087 }, { "epoch": 1.8, "learning_rate": 1.700307624463253e-07, "logits/chosen": -0.6119472980499268, "logits/rejected": -0.6046286225318909, "logps/chosen": -104.7895736694336, "logps/rejected": -72.42976379394531, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 2.096513509750366, "rewards/margins": 0.8876115083694458, "rewards/rejected": 1.2089020013809204, "step": 11088 }, { "epoch": 1.8, "learning_rate": 1.6993203136485152e-07, "logits/chosen": -0.6918913125991821, "logits/rejected": -0.6641973853111267, "logps/chosen": -46.76251983642578, "logps/rejected": -56.25001907348633, "loss": 0.9417, "rewards/accuracies": 0.0, "rewards/chosen": 1.5115944147109985, "rewards/margins": -0.7807682752609253, "rewards/rejected": 2.292362689971924, "step": 11089 }, { "epoch": 1.8, "learning_rate": 1.6983332308795144e-07, "logits/chosen": -0.9419132471084595, "logits/rejected": -0.7993802428245544, "logps/chosen": -167.58538818359375, "logps/rejected": -68.66465759277344, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 6.272009372711182, "rewards/margins": 4.20730447769165, "rewards/rejected": 2.0647048950195312, "step": 11090 }, { "epoch": 1.8, "learning_rate": 1.697346376224445e-07, "logits/chosen": -0.537540078163147, "logits/rejected": -0.5744267702102661, "logps/chosen": -35.292198181152344, "logps/rejected": -76.18927764892578, "loss": 0.5463, "rewards/accuracies": 0.0, "rewards/chosen": 1.892679214477539, "rewards/margins": -0.6588895320892334, "rewards/rejected": 2.5515687465667725, "step": 11091 }, { "epoch": 1.8, "learning_rate": 1.6963597497514926e-07, "logits/chosen": -0.3480750322341919, "logits/rejected": -0.46419578790664673, "logps/chosen": -77.26528930664062, "logps/rejected": -92.1717529296875, "loss": 1.9286, "rewards/accuracies": 0.0, "rewards/chosen": 1.2612762451171875, "rewards/margins": -3.4563674926757812, "rewards/rejected": 4.717643737792969, "step": 11092 }, { "epoch": 1.8, "learning_rate": 1.69537335152882e-07, "logits/chosen": -0.9070351123809814, "logits/rejected": -0.9189115762710571, "logps/chosen": -138.54745483398438, "logps/rejected": -240.85336303710938, "loss": 1.7893, "rewards/accuracies": 0.0, "rewards/chosen": 4.7071075439453125, "rewards/margins": -2.379138469696045, "rewards/rejected": 7.086246013641357, "step": 11093 }, { "epoch": 1.8, "learning_rate": 1.6943871816245824e-07, "logits/chosen": -0.012620602734386921, "logits/rejected": 0.00463726045563817, "logps/chosen": -4.28298807144165, "logps/rejected": -8.877362251281738, "loss": 0.3466, "rewards/accuracies": 1.0, "rewards/chosen": 0.1618439257144928, "rewards/margins": 0.09407783299684525, "rewards/rejected": 0.06776609271764755, "step": 11094 }, { "epoch": 1.8, "learning_rate": 1.6934012401069104e-07, "logits/chosen": -1.0045589208602905, "logits/rejected": -0.9770037531852722, "logps/chosen": -177.6881103515625, "logps/rejected": -97.93781280517578, "loss": 1.2304, "rewards/accuracies": 0.0, "rewards/chosen": 1.597102403640747, "rewards/margins": -0.35869061946868896, "rewards/rejected": 1.955793023109436, "step": 11095 }, { "epoch": 1.8, "learning_rate": 1.6924155270439272e-07, "logits/chosen": -0.6015282869338989, "logits/rejected": -0.6015282869338989, "logps/chosen": -92.65765380859375, "logps/rejected": -92.65765380859375, "loss": 0.3937, "rewards/accuracies": 0.0, "rewards/chosen": 1.4982246160507202, "rewards/margins": 0.0, "rewards/rejected": 1.4982246160507202, "step": 11096 }, { "epoch": 1.8, "learning_rate": 1.6914300425037331e-07, "logits/chosen": -0.6218661665916443, "logits/rejected": -0.5620181560516357, "logps/chosen": -87.09426879882812, "logps/rejected": -126.44552612304688, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 3.279711961746216, "rewards/margins": 0.5484251976013184, "rewards/rejected": 2.7312867641448975, "step": 11097 }, { "epoch": 1.8, "learning_rate": 1.6904447865544185e-07, "logits/chosen": -0.7376397848129272, "logits/rejected": -0.47080475091934204, "logps/chosen": -61.414306640625, "logps/rejected": -52.469303131103516, "loss": 1.2714, "rewards/accuracies": 1.0, "rewards/chosen": 3.468287706375122, "rewards/margins": 1.9154895544052124, "rewards/rejected": 1.5527981519699097, "step": 11098 }, { "epoch": 1.8, "learning_rate": 1.6894597592640525e-07, "logits/chosen": -0.7730110883712769, "logits/rejected": -0.7923021912574768, "logps/chosen": -42.585384368896484, "logps/rejected": -28.770593643188477, "loss": 0.6439, "rewards/accuracies": 0.0, "rewards/chosen": 0.4533924162387848, "rewards/margins": -0.9070971012115479, "rewards/rejected": 1.3604894876480103, "step": 11099 }, { "epoch": 1.8, "learning_rate": 1.6884749607006937e-07, "logits/chosen": -0.3958572745323181, "logits/rejected": -0.362650066614151, "logps/chosen": -41.125030517578125, "logps/rejected": -36.55128860473633, "loss": 0.1774, "rewards/accuracies": 1.0, "rewards/chosen": 1.9216278791427612, "rewards/margins": 0.9273926615715027, "rewards/rejected": 0.9942352175712585, "step": 11100 }, { "epoch": 1.8, "learning_rate": 1.6874903909323795e-07, "logits/chosen": -0.6904151439666748, "logits/rejected": -0.6375849843025208, "logps/chosen": -53.481910705566406, "logps/rejected": -34.485416412353516, "loss": 1.2105, "rewards/accuracies": 1.0, "rewards/chosen": 2.1429498195648193, "rewards/margins": 0.16584372520446777, "rewards/rejected": 1.9771060943603516, "step": 11101 }, { "epoch": 1.8, "learning_rate": 1.6865060500271383e-07, "logits/chosen": -1.0574370622634888, "logits/rejected": -1.130629539489746, "logps/chosen": -181.7352294921875, "logps/rejected": -114.68460083007812, "loss": 1.3533, "rewards/accuracies": 0.0, "rewards/chosen": 1.0714248418807983, "rewards/margins": -2.492018222808838, "rewards/rejected": 3.5634429454803467, "step": 11102 }, { "epoch": 1.8, "learning_rate": 1.6855219380529738e-07, "logits/chosen": -0.7122577428817749, "logits/rejected": -0.729315996170044, "logps/chosen": -85.19389343261719, "logps/rejected": -116.60072326660156, "loss": 1.0907, "rewards/accuracies": 0.0, "rewards/chosen": 3.976522207260132, "rewards/margins": -0.4370076656341553, "rewards/rejected": 4.413529872894287, "step": 11103 }, { "epoch": 1.8, "learning_rate": 1.6845380550778843e-07, "logits/chosen": -0.8252723813056946, "logits/rejected": -0.6465196013450623, "logps/chosen": -69.33480834960938, "logps/rejected": -44.801551818847656, "loss": 0.14, "rewards/accuracies": 1.0, "rewards/chosen": 2.6387252807617188, "rewards/margins": 1.9111378192901611, "rewards/rejected": 0.7275875210762024, "step": 11104 }, { "epoch": 1.8, "learning_rate": 1.6835544011698415e-07, "logits/chosen": -0.8379682302474976, "logits/rejected": -0.835250735282898, "logps/chosen": -104.53394317626953, "logps/rejected": -76.78935241699219, "loss": 0.3582, "rewards/accuracies": 0.0, "rewards/chosen": 1.3572494983673096, "rewards/margins": -0.025844573974609375, "rewards/rejected": 1.383094072341919, "step": 11105 }, { "epoch": 1.8, "learning_rate": 1.682570976396811e-07, "logits/chosen": -0.997612476348877, "logits/rejected": -0.943030059337616, "logps/chosen": -36.737674713134766, "logps/rejected": -49.58921813964844, "loss": 0.7891, "rewards/accuracies": 1.0, "rewards/chosen": 1.835791826248169, "rewards/margins": 0.3643592596054077, "rewards/rejected": 1.4714325666427612, "step": 11106 }, { "epoch": 1.8, "learning_rate": 1.681587780826735e-07, "logits/chosen": -0.805937647819519, "logits/rejected": -0.7404181957244873, "logps/chosen": -141.74856567382812, "logps/rejected": -146.7960205078125, "loss": 0.3725, "rewards/accuracies": 1.0, "rewards/chosen": 4.007501125335693, "rewards/margins": 0.19016098976135254, "rewards/rejected": 3.817340135574341, "step": 11107 }, { "epoch": 1.8, "learning_rate": 1.6806048145275452e-07, "logits/chosen": -0.607218325138092, "logits/rejected": -0.5808653235435486, "logps/chosen": -94.11824035644531, "logps/rejected": -44.694671630859375, "loss": 0.4357, "rewards/accuracies": 0.0, "rewards/chosen": 1.72259521484375, "rewards/margins": -0.2909972667694092, "rewards/rejected": 2.013592481613159, "step": 11108 }, { "epoch": 1.8, "learning_rate": 1.6796220775671532e-07, "logits/chosen": -0.3553523123264313, "logits/rejected": -0.3403538763523102, "logps/chosen": -3.1378445625305176, "logps/rejected": -2.701944351196289, "loss": 0.3628, "rewards/accuracies": 1.0, "rewards/chosen": 0.33190131187438965, "rewards/margins": 0.007448047399520874, "rewards/rejected": 0.3244532644748688, "step": 11109 }, { "epoch": 1.8, "learning_rate": 1.678639570013459e-07, "logits/chosen": -0.7214239239692688, "logits/rejected": -0.754314661026001, "logps/chosen": -70.06456756591797, "logps/rejected": -93.10143280029297, "loss": 1.2736, "rewards/accuracies": 0.0, "rewards/chosen": 1.2290900945663452, "rewards/margins": -2.4342546463012695, "rewards/rejected": 3.663344621658325, "step": 11110 }, { "epoch": 1.8, "learning_rate": 1.6776572919343423e-07, "logits/chosen": -0.896696150302887, "logits/rejected": -0.8699139356613159, "logps/chosen": -52.59707260131836, "logps/rejected": -61.06493377685547, "loss": 0.9195, "rewards/accuracies": 0.0, "rewards/chosen": 0.5719470977783203, "rewards/margins": -0.25888556241989136, "rewards/rejected": 0.8308326601982117, "step": 11111 }, { "epoch": 1.8, "learning_rate": 1.676675243397672e-07, "logits/chosen": -0.6363897323608398, "logits/rejected": -0.6116167306900024, "logps/chosen": -51.986209869384766, "logps/rejected": -69.82606506347656, "loss": 0.3768, "rewards/accuracies": 1.0, "rewards/chosen": 1.3354114294052124, "rewards/margins": 0.11083710193634033, "rewards/rejected": 1.224574327468872, "step": 11112 }, { "epoch": 1.8, "learning_rate": 1.675693424471295e-07, "logits/chosen": -0.5963456630706787, "logits/rejected": -0.5963456630706787, "logps/chosen": -60.29193115234375, "logps/rejected": -60.29193115234375, "loss": 0.3868, "rewards/accuracies": 0.0, "rewards/chosen": 2.1857049465179443, "rewards/margins": 0.0, "rewards/rejected": 2.1857049465179443, "step": 11113 }, { "epoch": 1.8, "learning_rate": 1.6747118352230493e-07, "logits/chosen": -0.802044153213501, "logits/rejected": -0.7796527743339539, "logps/chosen": -61.835968017578125, "logps/rejected": -47.32732391357422, "loss": 0.5654, "rewards/accuracies": 0.0, "rewards/chosen": 1.9536720514297485, "rewards/margins": -0.1329735517501831, "rewards/rejected": 2.0866456031799316, "step": 11114 }, { "epoch": 1.8, "learning_rate": 1.67373047572075e-07, "logits/chosen": -0.13997861742973328, "logits/rejected": -0.16312862932682037, "logps/chosen": -5.873417377471924, "logps/rejected": -34.533634185791016, "loss": 0.3928, "rewards/accuracies": 1.0, "rewards/chosen": 0.0804663673043251, "rewards/margins": 0.17764921486377716, "rewards/rejected": -0.09718284755945206, "step": 11115 }, { "epoch": 1.8, "learning_rate": 1.6727493460322012e-07, "logits/chosen": -0.6653191447257996, "logits/rejected": -0.6592968702316284, "logps/chosen": -3.220454216003418, "logps/rejected": -20.272050857543945, "loss": 0.8792, "rewards/accuracies": 0.0, "rewards/chosen": 0.38066336512565613, "rewards/margins": -0.03569349646568298, "rewards/rejected": 0.4163568615913391, "step": 11116 }, { "epoch": 1.8, "learning_rate": 1.671768446225192e-07, "logits/chosen": -0.5875605344772339, "logits/rejected": -0.5666775107383728, "logps/chosen": -41.435081481933594, "logps/rejected": -87.32881927490234, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 1.2197808027267456, "rewards/margins": 0.2824189066886902, "rewards/rejected": 0.9373618960380554, "step": 11117 }, { "epoch": 1.8, "learning_rate": 1.670787776367489e-07, "logits/chosen": -0.8562654256820679, "logits/rejected": -0.8355653285980225, "logps/chosen": -155.86306762695312, "logps/rejected": -170.798828125, "loss": 2.6727, "rewards/accuracies": 0.0, "rewards/chosen": 4.620025634765625, "rewards/margins": -3.948455810546875, "rewards/rejected": 8.5684814453125, "step": 11118 }, { "epoch": 1.8, "learning_rate": 1.6698073365268506e-07, "logits/chosen": -0.695882260799408, "logits/rejected": -0.5933144092559814, "logps/chosen": -51.064300537109375, "logps/rejected": -73.81396484375, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 3.2579667568206787, "rewards/margins": 1.0487451553344727, "rewards/rejected": 2.209221601486206, "step": 11119 }, { "epoch": 1.8, "learning_rate": 1.6688271267710134e-07, "logits/chosen": -0.7682735919952393, "logits/rejected": -0.850913405418396, "logps/chosen": -85.0645751953125, "logps/rejected": -80.1731948852539, "loss": 1.3679, "rewards/accuracies": 0.0, "rewards/chosen": 1.9049590826034546, "rewards/margins": -2.4437246322631836, "rewards/rejected": 4.348683834075928, "step": 11120 }, { "epoch": 1.81, "learning_rate": 1.667847147167704e-07, "logits/chosen": -0.5306499600410461, "logits/rejected": -0.5306499600410461, "logps/chosen": -0.7494174838066101, "logps/rejected": -0.7494174838066101, "loss": 0.3598, "rewards/accuracies": 0.0, "rewards/chosen": 0.14605019986629486, "rewards/margins": 0.0, "rewards/rejected": 0.14605019986629486, "step": 11121 }, { "epoch": 1.81, "learning_rate": 1.6668673977846253e-07, "logits/chosen": -0.9362311959266663, "logits/rejected": -0.9511054158210754, "logps/chosen": -82.50785827636719, "logps/rejected": -91.78950500488281, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": 2.1161575317382812, "rewards/margins": 0.6841002702713013, "rewards/rejected": 1.43205726146698, "step": 11122 }, { "epoch": 1.81, "learning_rate": 1.665887878689473e-07, "logits/chosen": -1.0401983261108398, "logits/rejected": -1.0412299633026123, "logps/chosen": -86.15898132324219, "logps/rejected": -83.57249450683594, "loss": 0.6068, "rewards/accuracies": 1.0, "rewards/chosen": 2.2060515880584717, "rewards/margins": 0.03419327735900879, "rewards/rejected": 2.171858310699463, "step": 11123 }, { "epoch": 1.81, "learning_rate": 1.6649085899499194e-07, "logits/chosen": -1.0872784852981567, "logits/rejected": -1.063806414604187, "logps/chosen": -148.5435791015625, "logps/rejected": -163.40065002441406, "loss": 0.6163, "rewards/accuracies": 1.0, "rewards/chosen": 5.539776802062988, "rewards/margins": 0.3747391700744629, "rewards/rejected": 5.165037631988525, "step": 11124 }, { "epoch": 1.81, "learning_rate": 1.6639295316336265e-07, "logits/chosen": -0.6148041486740112, "logits/rejected": -0.7107257843017578, "logps/chosen": -120.25113677978516, "logps/rejected": -116.98341369628906, "loss": 0.7353, "rewards/accuracies": 1.0, "rewards/chosen": 2.8278160095214844, "rewards/margins": 1.8244476318359375, "rewards/rejected": 1.0033683776855469, "step": 11125 }, { "epoch": 1.81, "learning_rate": 1.662950703808235e-07, "logits/chosen": -1.0476545095443726, "logits/rejected": -1.0872316360473633, "logps/chosen": -144.5225830078125, "logps/rejected": -50.229488372802734, "loss": 0.0562, "rewards/accuracies": 1.0, "rewards/chosen": 3.9568755626678467, "rewards/margins": 2.177074909210205, "rewards/rejected": 1.7798007726669312, "step": 11126 }, { "epoch": 1.81, "learning_rate": 1.6619721065413761e-07, "logits/chosen": -0.7677038908004761, "logits/rejected": -0.7302206158638, "logps/chosen": -100.68170166015625, "logps/rejected": -131.11459350585938, "loss": 0.4659, "rewards/accuracies": 0.0, "rewards/chosen": 3.758441209793091, "rewards/margins": -0.21333003044128418, "rewards/rejected": 3.971771240234375, "step": 11127 }, { "epoch": 1.81, "learning_rate": 1.6609937399006586e-07, "logits/chosen": -0.7024379372596741, "logits/rejected": -0.6156690716743469, "logps/chosen": -74.92420196533203, "logps/rejected": -89.91190338134766, "loss": 1.7002, "rewards/accuracies": 0.0, "rewards/chosen": 3.5936782360076904, "rewards/margins": -1.132972002029419, "rewards/rejected": 4.726650238037109, "step": 11128 }, { "epoch": 1.81, "learning_rate": 1.660015603953681e-07, "logits/chosen": -0.8776631951332092, "logits/rejected": -0.8362390398979187, "logps/chosen": -73.29185485839844, "logps/rejected": -135.3171844482422, "loss": 2.3492, "rewards/accuracies": 0.0, "rewards/chosen": 2.582906484603882, "rewards/margins": -4.416396141052246, "rewards/rejected": 6.999302864074707, "step": 11129 }, { "epoch": 1.81, "learning_rate": 1.6590376987680204e-07, "logits/chosen": -0.5781509876251221, "logits/rejected": -0.6031472682952881, "logps/chosen": -7.792045593261719, "logps/rejected": -27.04804229736328, "loss": 0.9217, "rewards/accuracies": 0.0, "rewards/chosen": 0.5828151106834412, "rewards/margins": -0.058197855949401855, "rewards/rejected": 0.641012966632843, "step": 11130 }, { "epoch": 1.81, "learning_rate": 1.658060024411244e-07, "logits/chosen": -0.6265820264816284, "logits/rejected": -0.6158905029296875, "logps/chosen": -2.507460117340088, "logps/rejected": -46.23395919799805, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 0.4620074927806854, "rewards/margins": 0.30680006742477417, "rewards/rejected": 0.15520744025707245, "step": 11131 }, { "epoch": 1.81, "learning_rate": 1.6570825809508964e-07, "logits/chosen": -0.7030742168426514, "logits/rejected": -0.6347287893295288, "logps/chosen": -53.042457580566406, "logps/rejected": -73.30701446533203, "loss": 0.6441, "rewards/accuracies": 1.0, "rewards/chosen": 2.134436845779419, "rewards/margins": 0.4549499750137329, "rewards/rejected": 1.679486870765686, "step": 11132 }, { "epoch": 1.81, "learning_rate": 1.6561053684545135e-07, "logits/chosen": -0.5796554684638977, "logits/rejected": -0.580560028553009, "logps/chosen": -6.140040874481201, "logps/rejected": -17.142131805419922, "loss": 0.8758, "rewards/accuracies": 0.0, "rewards/chosen": 0.3191368579864502, "rewards/margins": -0.02020031213760376, "rewards/rejected": 0.33933717012405396, "step": 11133 }, { "epoch": 1.81, "learning_rate": 1.655128386989607e-07, "logits/chosen": -0.9423208832740784, "logits/rejected": -0.916739284992218, "logps/chosen": -109.44587707519531, "logps/rejected": -64.85700225830078, "loss": 0.8993, "rewards/accuracies": 0.0, "rewards/chosen": 2.1511733531951904, "rewards/margins": -0.4515359401702881, "rewards/rejected": 2.6027092933654785, "step": 11134 }, { "epoch": 1.81, "learning_rate": 1.6541516366236818e-07, "logits/chosen": -1.1355623006820679, "logits/rejected": -1.0890202522277832, "logps/chosen": -269.6171875, "logps/rejected": -53.25968551635742, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": 2.664013624191284, "rewards/margins": 0.19466042518615723, "rewards/rejected": 2.469353199005127, "step": 11135 }, { "epoch": 1.81, "learning_rate": 1.653175117424218e-07, "logits/chosen": -0.907088577747345, "logits/rejected": -0.8197304010391235, "logps/chosen": -66.7817611694336, "logps/rejected": -101.67625427246094, "loss": 0.4648, "rewards/accuracies": 0.0, "rewards/chosen": 1.4648094177246094, "rewards/margins": -0.24799573421478271, "rewards/rejected": 1.712805151939392, "step": 11136 }, { "epoch": 1.81, "learning_rate": 1.6521988294586874e-07, "logits/chosen": -0.8014296889305115, "logits/rejected": -0.7200788855552673, "logps/chosen": -66.37567138671875, "logps/rejected": -64.98519134521484, "loss": 2.6745, "rewards/accuracies": 0.0, "rewards/chosen": 0.49081116914749146, "rewards/margins": -1.3138015270233154, "rewards/rejected": 1.8046127557754517, "step": 11137 }, { "epoch": 1.81, "learning_rate": 1.651222772794539e-07, "logits/chosen": -0.5329411029815674, "logits/rejected": -0.5065861940383911, "logps/chosen": -100.19918823242188, "logps/rejected": -88.20454406738281, "loss": 0.9055, "rewards/accuracies": 0.0, "rewards/chosen": 1.7837570905685425, "rewards/margins": -1.6291245222091675, "rewards/rejected": 3.41288161277771, "step": 11138 }, { "epoch": 1.81, "learning_rate": 1.650246947499212e-07, "logits/chosen": -0.731829047203064, "logits/rejected": -0.7716164588928223, "logps/chosen": -124.90020751953125, "logps/rejected": -91.3114013671875, "loss": 2.1697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9525421261787415, "rewards/margins": -4.315756320953369, "rewards/rejected": 5.268298625946045, "step": 11139 }, { "epoch": 1.81, "learning_rate": 1.6492713536401232e-07, "logits/chosen": -0.7914515137672424, "logits/rejected": -0.7940750122070312, "logps/chosen": -50.74846649169922, "logps/rejected": -71.49876403808594, "loss": 1.9437, "rewards/accuracies": 0.0, "rewards/chosen": 1.8141273260116577, "rewards/margins": -0.7850679159164429, "rewards/rejected": 2.5991952419281006, "step": 11140 }, { "epoch": 1.81, "learning_rate": 1.648295991284681e-07, "logits/chosen": -0.523479163646698, "logits/rejected": -0.5280421376228333, "logps/chosen": -75.47415161132812, "logps/rejected": -58.49127197265625, "loss": 0.3783, "rewards/accuracies": 0.0, "rewards/chosen": 1.2399933338165283, "rewards/margins": -0.05308687686920166, "rewards/rejected": 1.29308021068573, "step": 11141 }, { "epoch": 1.81, "learning_rate": 1.6473208605002703e-07, "logits/chosen": -0.731575071811676, "logits/rejected": -0.592663586139679, "logps/chosen": -67.2471923828125, "logps/rejected": -76.82007598876953, "loss": 0.2214, "rewards/accuracies": 1.0, "rewards/chosen": 2.3214035034179688, "rewards/margins": 0.8155266046524048, "rewards/rejected": 1.505876898765564, "step": 11142 }, { "epoch": 1.81, "learning_rate": 1.6463459613542663e-07, "logits/chosen": -0.7379482388496399, "logits/rejected": -0.6927189826965332, "logps/chosen": -57.6270866394043, "logps/rejected": -76.09136962890625, "loss": 0.216, "rewards/accuracies": 1.0, "rewards/chosen": 1.9650547504425049, "rewards/margins": 0.6356709003448486, "rewards/rejected": 1.3293838500976562, "step": 11143 }, { "epoch": 1.81, "learning_rate": 1.6453712939140218e-07, "logits/chosen": -0.7830467820167542, "logits/rejected": -0.7305477857589722, "logps/chosen": -98.20249938964844, "logps/rejected": -116.92411804199219, "loss": 0.4362, "rewards/accuracies": 1.0, "rewards/chosen": 3.9184768199920654, "rewards/margins": 1.6002731323242188, "rewards/rejected": 2.3182036876678467, "step": 11144 }, { "epoch": 1.81, "learning_rate": 1.644396858246881e-07, "logits/chosen": -0.184371218085289, "logits/rejected": -0.18420827388763428, "logps/chosen": -4.351582050323486, "logps/rejected": -9.529333114624023, "loss": 0.2834, "rewards/accuracies": 1.0, "rewards/chosen": 0.1644064038991928, "rewards/margins": 0.3112630844116211, "rewards/rejected": -0.14685669541358948, "step": 11145 }, { "epoch": 1.81, "learning_rate": 1.6434226544201645e-07, "logits/chosen": -0.6961163878440857, "logits/rejected": -0.6856403946876526, "logps/chosen": -89.12709045410156, "logps/rejected": -57.06555938720703, "loss": 0.8832, "rewards/accuracies": 1.0, "rewards/chosen": 2.407667636871338, "rewards/margins": 0.5202027559280396, "rewards/rejected": 1.8874648809432983, "step": 11146 }, { "epoch": 1.81, "learning_rate": 1.642448682501184e-07, "logits/chosen": -0.8065496683120728, "logits/rejected": -0.7661619186401367, "logps/chosen": -43.813236236572266, "logps/rejected": -48.42166519165039, "loss": 0.4598, "rewards/accuracies": 0.0, "rewards/chosen": 1.4173271656036377, "rewards/margins": -0.2074207067489624, "rewards/rejected": 1.6247478723526, "step": 11147 }, { "epoch": 1.81, "learning_rate": 1.6414749425572289e-07, "logits/chosen": -0.6244502663612366, "logits/rejected": -0.5157878994941711, "logps/chosen": -99.56981658935547, "logps/rejected": -58.322235107421875, "loss": 0.1011, "rewards/accuracies": 1.0, "rewards/chosen": 4.091538429260254, "rewards/margins": 1.7827012538909912, "rewards/rejected": 2.3088371753692627, "step": 11148 }, { "epoch": 1.81, "learning_rate": 1.640501434655578e-07, "logits/chosen": -0.5965264439582825, "logits/rejected": -0.5041725039482117, "logps/chosen": -75.6653060913086, "logps/rejected": -74.43595886230469, "loss": 0.7673, "rewards/accuracies": 0.0, "rewards/chosen": 1.3077834844589233, "rewards/margins": -0.4267106056213379, "rewards/rejected": 1.7344940900802612, "step": 11149 }, { "epoch": 1.81, "learning_rate": 1.6395281588634886e-07, "logits/chosen": -1.0929474830627441, "logits/rejected": -1.0484660863876343, "logps/chosen": -113.86442565917969, "logps/rejected": -59.48356246948242, "loss": 0.836, "rewards/accuracies": 1.0, "rewards/chosen": 4.053857326507568, "rewards/margins": 2.206145763397217, "rewards/rejected": 1.8477115631103516, "step": 11150 }, { "epoch": 1.81, "learning_rate": 1.6385551152482075e-07, "logits/chosen": -0.598710298538208, "logits/rejected": -0.533664345741272, "logps/chosen": -63.708702087402344, "logps/rejected": -53.84432601928711, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 1.2579841613769531, "rewards/margins": 0.5903163552284241, "rewards/rejected": 0.667667806148529, "step": 11151 }, { "epoch": 1.81, "learning_rate": 1.6375823038769605e-07, "logits/chosen": -0.6747551560401917, "logits/rejected": -0.7034966945648193, "logps/chosen": -190.31655883789062, "logps/rejected": -74.56372833251953, "loss": 0.1145, "rewards/accuracies": 1.0, "rewards/chosen": 3.393319845199585, "rewards/margins": 1.4288086891174316, "rewards/rejected": 1.9645111560821533, "step": 11152 }, { "epoch": 1.81, "learning_rate": 1.6366097248169602e-07, "logits/chosen": -0.5100065469741821, "logits/rejected": -0.5703209638595581, "logps/chosen": -44.76958084106445, "logps/rejected": -98.77208709716797, "loss": 0.5934, "rewards/accuracies": 1.0, "rewards/chosen": 0.5840194821357727, "rewards/margins": 0.4488426446914673, "rewards/rejected": 0.1351768523454666, "step": 11153 }, { "epoch": 1.81, "learning_rate": 1.6356373781354054e-07, "logits/chosen": -0.7119566202163696, "logits/rejected": -0.6828709244728088, "logps/chosen": -125.23216247558594, "logps/rejected": -100.01948547363281, "loss": 0.5004, "rewards/accuracies": 0.0, "rewards/chosen": 5.443261623382568, "rewards/margins": -0.5107192993164062, "rewards/rejected": 5.953980922698975, "step": 11154 }, { "epoch": 1.81, "learning_rate": 1.6346652638994717e-07, "logits/chosen": -0.6016074419021606, "logits/rejected": -0.5556061863899231, "logps/chosen": -73.32572174072266, "logps/rejected": -80.76531219482422, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.045117974281311, "rewards/margins": 0.09607774019241333, "rewards/rejected": 0.9490402340888977, "step": 11155 }, { "epoch": 1.81, "learning_rate": 1.6336933821763277e-07, "logits/chosen": -0.8642794489860535, "logits/rejected": -0.7208034992218018, "logps/chosen": -59.09535598754883, "logps/rejected": -79.62853240966797, "loss": 0.5791, "rewards/accuracies": 0.0, "rewards/chosen": 1.7531559467315674, "rewards/margins": -0.7755024433135986, "rewards/rejected": 2.528658390045166, "step": 11156 }, { "epoch": 1.81, "learning_rate": 1.6327217330331162e-07, "logits/chosen": -0.8491490483283997, "logits/rejected": -0.6910311579704285, "logps/chosen": -165.89508056640625, "logps/rejected": -30.891481399536133, "loss": 0.7361, "rewards/accuracies": 1.0, "rewards/chosen": 3.4021453857421875, "rewards/margins": 2.7167322635650635, "rewards/rejected": 0.6854131817817688, "step": 11157 }, { "epoch": 1.81, "learning_rate": 1.6317503165369734e-07, "logits/chosen": -0.8521378636360168, "logits/rejected": -0.8871644735336304, "logps/chosen": -57.30469512939453, "logps/rejected": -54.82696533203125, "loss": 2.0668, "rewards/accuracies": 0.0, "rewards/chosen": 1.672888159751892, "rewards/margins": -0.9323517084121704, "rewards/rejected": 2.6052398681640625, "step": 11158 }, { "epoch": 1.81, "learning_rate": 1.630779132755012e-07, "logits/chosen": -0.9425771236419678, "logits/rejected": -0.9475343823432922, "logps/chosen": -103.48983764648438, "logps/rejected": -46.82279968261719, "loss": 0.7122, "rewards/accuracies": 0.0, "rewards/chosen": 1.6775970458984375, "rewards/margins": -1.009190320968628, "rewards/rejected": 2.6867873668670654, "step": 11159 }, { "epoch": 1.81, "learning_rate": 1.6298081817543336e-07, "logits/chosen": -0.7619296908378601, "logits/rejected": -0.7028335332870483, "logps/chosen": -54.989158630371094, "logps/rejected": -109.75726318359375, "loss": 0.5307, "rewards/accuracies": 0.0, "rewards/chosen": 4.59229040145874, "rewards/margins": -0.042756080627441406, "rewards/rejected": 4.635046482086182, "step": 11160 }, { "epoch": 1.81, "learning_rate": 1.6288374636020192e-07, "logits/chosen": -1.0485419034957886, "logits/rejected": -1.0454392433166504, "logps/chosen": -207.1839599609375, "logps/rejected": -108.25428771972656, "loss": 0.1619, "rewards/accuracies": 1.0, "rewards/chosen": 1.2932541370391846, "rewards/margins": 0.9683471918106079, "rewards/rejected": 0.3249069154262543, "step": 11161 }, { "epoch": 1.81, "learning_rate": 1.6278669783651394e-07, "logits/chosen": -0.7029663920402527, "logits/rejected": -0.6963789463043213, "logps/chosen": -30.07732391357422, "logps/rejected": -31.218135833740234, "loss": 0.3463, "rewards/accuracies": 1.0, "rewards/chosen": 0.8512535095214844, "rewards/margins": 0.3211093544960022, "rewards/rejected": 0.5301441550254822, "step": 11162 }, { "epoch": 1.81, "learning_rate": 1.6268967261107425e-07, "logits/chosen": -0.6374018788337708, "logits/rejected": -0.5646361708641052, "logps/chosen": -75.98970031738281, "logps/rejected": -16.726181030273438, "loss": 2.1882, "rewards/accuracies": 1.0, "rewards/chosen": 0.42869797348976135, "rewards/margins": 0.07722681760787964, "rewards/rejected": 0.3514711558818817, "step": 11163 }, { "epoch": 1.81, "learning_rate": 1.6259267069058668e-07, "logits/chosen": -0.5836969614028931, "logits/rejected": -0.503052294254303, "logps/chosen": -61.853816986083984, "logps/rejected": -59.570919036865234, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.4783328771591187, "rewards/margins": 0.21032094955444336, "rewards/rejected": 1.2680119276046753, "step": 11164 }, { "epoch": 1.81, "learning_rate": 1.6249569208175284e-07, "logits/chosen": -1.0414206981658936, "logits/rejected": -1.0520600080490112, "logps/chosen": -170.780517578125, "logps/rejected": -94.92086791992188, "loss": 0.8914, "rewards/accuracies": 0.0, "rewards/chosen": 2.546591281890869, "rewards/margins": -1.5785980224609375, "rewards/rejected": 4.125189304351807, "step": 11165 }, { "epoch": 1.81, "learning_rate": 1.6239873679127337e-07, "logits/chosen": -0.6908972263336182, "logits/rejected": -0.7344815731048584, "logps/chosen": -88.33171844482422, "logps/rejected": -33.87303161621094, "loss": 0.9661, "rewards/accuracies": 0.0, "rewards/chosen": 0.8079864382743835, "rewards/margins": -0.22121542692184448, "rewards/rejected": 1.029201865196228, "step": 11166 }, { "epoch": 1.81, "learning_rate": 1.6230180482584653e-07, "logits/chosen": -1.0687302350997925, "logits/rejected": -0.9983807802200317, "logps/chosen": -122.62265014648438, "logps/rejected": -73.36013793945312, "loss": 0.0985, "rewards/accuracies": 1.0, "rewards/chosen": 5.891705513000488, "rewards/margins": 3.266406297683716, "rewards/rejected": 2.6252992153167725, "step": 11167 }, { "epoch": 1.81, "learning_rate": 1.6220489619216987e-07, "logits/chosen": -0.6114189624786377, "logits/rejected": -0.6252033710479736, "logps/chosen": -0.48858529329299927, "logps/rejected": -19.827312469482422, "loss": 0.2566, "rewards/accuracies": 1.0, "rewards/chosen": 0.21284499764442444, "rewards/margins": 0.494782030582428, "rewards/rejected": -0.28193703293800354, "step": 11168 }, { "epoch": 1.81, "learning_rate": 1.6210801089693848e-07, "logits/chosen": -0.7968663573265076, "logits/rejected": -0.670635998249054, "logps/chosen": -93.02127075195312, "logps/rejected": -94.70447540283203, "loss": 0.0519, "rewards/accuracies": 1.0, "rewards/chosen": 5.037970066070557, "rewards/margins": 2.9120736122131348, "rewards/rejected": 2.125896453857422, "step": 11169 }, { "epoch": 1.81, "learning_rate": 1.6201114894684653e-07, "logits/chosen": -0.8854649066925049, "logits/rejected": -0.8896617889404297, "logps/chosen": -65.28860473632812, "logps/rejected": -50.082740783691406, "loss": 0.4543, "rewards/accuracies": 0.0, "rewards/chosen": 0.7486953735351562, "rewards/margins": -0.2167099118232727, "rewards/rejected": 0.965405285358429, "step": 11170 }, { "epoch": 1.81, "learning_rate": 1.6191431034858594e-07, "logits/chosen": -0.6789159774780273, "logits/rejected": -0.5808941721916199, "logps/chosen": -129.73294067382812, "logps/rejected": -85.490966796875, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": 3.001535177230835, "rewards/margins": 1.137129306793213, "rewards/rejected": 1.864405870437622, "step": 11171 }, { "epoch": 1.81, "learning_rate": 1.6181749510884762e-07, "logits/chosen": -0.2953124940395355, "logits/rejected": -0.2927245795726776, "logps/chosen": -12.174440383911133, "logps/rejected": -5.310351848602295, "loss": 1.1696, "rewards/accuracies": 0.0, "rewards/chosen": 0.02056570164859295, "rewards/margins": -0.14627580344676971, "rewards/rejected": 0.1668415069580078, "step": 11172 }, { "epoch": 1.81, "learning_rate": 1.6172070323432037e-07, "logits/chosen": -0.5866546034812927, "logits/rejected": -0.6325269937515259, "logps/chosen": -159.32333374023438, "logps/rejected": -152.1041259765625, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": 3.2294113636016846, "rewards/margins": 1.904505968093872, "rewards/rejected": 1.3249053955078125, "step": 11173 }, { "epoch": 1.81, "learning_rate": 1.6162393473169183e-07, "logits/chosen": -0.693787693977356, "logits/rejected": -0.7261893153190613, "logps/chosen": -63.145423889160156, "logps/rejected": -64.93571472167969, "loss": 1.0753, "rewards/accuracies": 0.0, "rewards/chosen": 1.487141489982605, "rewards/margins": -0.37320709228515625, "rewards/rejected": 1.8603485822677612, "step": 11174 }, { "epoch": 1.81, "learning_rate": 1.615271896076477e-07, "logits/chosen": -0.9623838663101196, "logits/rejected": -0.6821520328521729, "logps/chosen": -134.3428955078125, "logps/rejected": -147.0981903076172, "loss": 0.5018, "rewards/accuracies": 0.0, "rewards/chosen": 5.639387607574463, "rewards/margins": -0.5309205055236816, "rewards/rejected": 6.1703081130981445, "step": 11175 }, { "epoch": 1.81, "learning_rate": 1.6143046786887193e-07, "logits/chosen": -0.624022901058197, "logits/rejected": -0.6018428802490234, "logps/chosen": -72.01197814941406, "logps/rejected": -129.82720947265625, "loss": 1.6973, "rewards/accuracies": 0.0, "rewards/chosen": 2.4188668727874756, "rewards/margins": -2.6320273876190186, "rewards/rejected": 5.050894260406494, "step": 11176 }, { "epoch": 1.81, "learning_rate": 1.613337695220474e-07, "logits/chosen": -0.5758817791938782, "logits/rejected": -0.556867241859436, "logps/chosen": -70.90441131591797, "logps/rejected": -69.95774841308594, "loss": 1.2749, "rewards/accuracies": 0.0, "rewards/chosen": 1.55462646484375, "rewards/margins": -1.3789856433868408, "rewards/rejected": 2.933612108230591, "step": 11177 }, { "epoch": 1.81, "learning_rate": 1.6123709457385476e-07, "logits/chosen": -0.914435625076294, "logits/rejected": -0.8864313364028931, "logps/chosen": -104.48827362060547, "logps/rejected": -68.6639633178711, "loss": 2.7326, "rewards/accuracies": 0.0, "rewards/chosen": 0.4569229185581207, "rewards/margins": -1.4518699645996094, "rewards/rejected": 1.9087928533554077, "step": 11178 }, { "epoch": 1.81, "learning_rate": 1.6114044303097363e-07, "logits/chosen": -1.017747163772583, "logits/rejected": -0.99396151304245, "logps/chosen": -31.967910766601562, "logps/rejected": -21.971643447875977, "loss": 0.4033, "rewards/accuracies": 1.0, "rewards/chosen": 1.7381919622421265, "rewards/margins": 1.485288381576538, "rewards/rejected": 0.252903550863266, "step": 11179 }, { "epoch": 1.81, "learning_rate": 1.6104381490008139e-07, "logits/chosen": -0.7985151410102844, "logits/rejected": -0.8303468227386475, "logps/chosen": -66.4582748413086, "logps/rejected": -104.57371520996094, "loss": 2.9966, "rewards/accuracies": 0.0, "rewards/chosen": 1.5097038745880127, "rewards/margins": -3.532663583755493, "rewards/rejected": 5.042367458343506, "step": 11180 }, { "epoch": 1.81, "learning_rate": 1.609472101878545e-07, "logits/chosen": -0.5823588371276855, "logits/rejected": -0.5718030333518982, "logps/chosen": -81.24312591552734, "logps/rejected": -68.5360107421875, "loss": 1.7143, "rewards/accuracies": 0.0, "rewards/chosen": 0.9830062985420227, "rewards/margins": -0.2928360104560852, "rewards/rejected": 1.275842308998108, "step": 11181 }, { "epoch": 1.81, "learning_rate": 1.6085062890096707e-07, "logits/chosen": -0.5252040028572083, "logits/rejected": -0.4599469006061554, "logps/chosen": -46.545902252197266, "logps/rejected": -53.78215408325195, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.8205875158309937, "rewards/margins": 0.8581954836845398, "rewards/rejected": 0.9623920321464539, "step": 11182 }, { "epoch": 1.82, "learning_rate": 1.6075407104609228e-07, "logits/chosen": -0.6491485238075256, "logits/rejected": -0.6179149746894836, "logps/chosen": -40.51676940917969, "logps/rejected": -62.07550811767578, "loss": 1.8977, "rewards/accuracies": 0.0, "rewards/chosen": 1.6993576288223267, "rewards/margins": -0.07332992553710938, "rewards/rejected": 1.772687554359436, "step": 11183 }, { "epoch": 1.82, "learning_rate": 1.6065753662990105e-07, "logits/chosen": -1.001205325126648, "logits/rejected": -0.9448294639587402, "logps/chosen": -114.29312133789062, "logps/rejected": -195.2559356689453, "loss": 0.8377, "rewards/accuracies": 0.0, "rewards/chosen": 3.8094558715820312, "rewards/margins": -0.6701111793518066, "rewards/rejected": 4.479567050933838, "step": 11184 }, { "epoch": 1.82, "learning_rate": 1.6056102565906333e-07, "logits/chosen": -0.9085009694099426, "logits/rejected": -0.9012826681137085, "logps/chosen": -82.32933807373047, "logps/rejected": -106.79988098144531, "loss": 0.0867, "rewards/accuracies": 1.0, "rewards/chosen": 2.2575204372406006, "rewards/margins": 1.6919457912445068, "rewards/rejected": 0.5655746459960938, "step": 11185 }, { "epoch": 1.82, "learning_rate": 1.6046453814024668e-07, "logits/chosen": -0.6896686553955078, "logits/rejected": -0.6137040257453918, "logps/chosen": -116.77206420898438, "logps/rejected": -25.974674224853516, "loss": 0.1357, "rewards/accuracies": 1.0, "rewards/chosen": 1.6579986810684204, "rewards/margins": 1.5473676919937134, "rewards/rejected": 0.11063098907470703, "step": 11186 }, { "epoch": 1.82, "learning_rate": 1.60368074080118e-07, "logits/chosen": -0.631865918636322, "logits/rejected": -0.7103087306022644, "logps/chosen": -73.44255065917969, "logps/rejected": -89.54319763183594, "loss": 2.2029, "rewards/accuracies": 0.0, "rewards/chosen": 1.6400794982910156, "rewards/margins": -3.122596263885498, "rewards/rejected": 4.762675762176514, "step": 11187 }, { "epoch": 1.82, "learning_rate": 1.6027163348534155e-07, "logits/chosen": -0.6629672646522522, "logits/rejected": -0.6955198049545288, "logps/chosen": -44.5210075378418, "logps/rejected": -99.80760192871094, "loss": 1.2137, "rewards/accuracies": 0.0, "rewards/chosen": 1.0171489715576172, "rewards/margins": -0.5485546588897705, "rewards/rejected": 1.5657036304473877, "step": 11188 }, { "epoch": 1.82, "learning_rate": 1.601752163625809e-07, "logits/chosen": -0.7108946442604065, "logits/rejected": -0.6450812816619873, "logps/chosen": -112.00398254394531, "logps/rejected": -47.02217102050781, "loss": 0.9289, "rewards/accuracies": 0.0, "rewards/chosen": 0.3229629695415497, "rewards/margins": -0.6170909404754639, "rewards/rejected": 0.9400539398193359, "step": 11189 }, { "epoch": 1.82, "learning_rate": 1.6007882271849716e-07, "logits/chosen": -0.615951418876648, "logits/rejected": -0.6232080459594727, "logps/chosen": -68.70819091796875, "logps/rejected": -91.38191223144531, "loss": 1.5735, "rewards/accuracies": 0.0, "rewards/chosen": 1.7456893920898438, "rewards/margins": -1.211916446685791, "rewards/rejected": 2.9576058387756348, "step": 11190 }, { "epoch": 1.82, "learning_rate": 1.5998245255975056e-07, "logits/chosen": -0.8186133503913879, "logits/rejected": -0.8443445563316345, "logps/chosen": -75.22933959960938, "logps/rejected": -88.94409942626953, "loss": 0.4091, "rewards/accuracies": 0.0, "rewards/chosen": 0.4746108949184418, "rewards/margins": -0.15428391098976135, "rewards/rejected": 0.6288948059082031, "step": 11191 }, { "epoch": 1.82, "learning_rate": 1.5988610589299906e-07, "logits/chosen": -0.8860777020454407, "logits/rejected": -0.8439087867736816, "logps/chosen": -109.50834655761719, "logps/rejected": -122.34173583984375, "loss": 1.2799, "rewards/accuracies": 0.0, "rewards/chosen": 4.553943157196045, "rewards/margins": -0.5596675872802734, "rewards/rejected": 5.113610744476318, "step": 11192 }, { "epoch": 1.82, "learning_rate": 1.5978978272489962e-07, "logits/chosen": -0.7918384671211243, "logits/rejected": -0.46790599822998047, "logps/chosen": -134.85433959960938, "logps/rejected": -44.957279205322266, "loss": 0.7356, "rewards/accuracies": 1.0, "rewards/chosen": 4.771157741546631, "rewards/margins": 2.9390206336975098, "rewards/rejected": 1.8321369886398315, "step": 11193 }, { "epoch": 1.82, "learning_rate": 1.596934830621069e-07, "logits/chosen": -0.6576471328735352, "logits/rejected": -0.6491113305091858, "logps/chosen": -49.08062744140625, "logps/rejected": -98.65333557128906, "loss": 0.1447, "rewards/accuracies": 1.0, "rewards/chosen": 2.3696579933166504, "rewards/margins": 1.1082336902618408, "rewards/rejected": 1.2614243030548096, "step": 11194 }, { "epoch": 1.82, "learning_rate": 1.595972069112747e-07, "logits/chosen": -0.8138986229896545, "logits/rejected": -0.8138986229896545, "logps/chosen": -55.24122619628906, "logps/rejected": -55.24122619628906, "loss": 1.1263, "rewards/accuracies": 0.0, "rewards/chosen": 1.013617753982544, "rewards/margins": 0.0, "rewards/rejected": 1.013617753982544, "step": 11195 }, { "epoch": 1.82, "learning_rate": 1.5950095427905435e-07, "logits/chosen": -0.6682145595550537, "logits/rejected": -0.6609456539154053, "logps/chosen": -59.626380920410156, "logps/rejected": -70.1916732788086, "loss": 0.413, "rewards/accuracies": 1.0, "rewards/chosen": 0.7260093688964844, "rewards/margins": 0.10470426082611084, "rewards/rejected": 0.6213051080703735, "step": 11196 }, { "epoch": 1.82, "learning_rate": 1.5940472517209647e-07, "logits/chosen": -0.7487509846687317, "logits/rejected": -1.1007945537567139, "logps/chosen": -95.7940444946289, "logps/rejected": -35.266841888427734, "loss": 0.1137, "rewards/accuracies": 1.0, "rewards/chosen": 1.8843399286270142, "rewards/margins": 1.52056086063385, "rewards/rejected": 0.36377906799316406, "step": 11197 }, { "epoch": 1.82, "learning_rate": 1.5930851959704916e-07, "logits/chosen": -1.1898293495178223, "logits/rejected": -1.2066948413848877, "logps/chosen": -119.55580139160156, "logps/rejected": -57.500431060791016, "loss": 0.9529, "rewards/accuracies": 0.0, "rewards/chosen": 1.312886118888855, "rewards/margins": -0.5919390916824341, "rewards/rejected": 1.904825210571289, "step": 11198 }, { "epoch": 1.82, "learning_rate": 1.5921233756055962e-07, "logits/chosen": -0.7141280174255371, "logits/rejected": -0.5757929086685181, "logps/chosen": -72.10478210449219, "logps/rejected": -62.274288177490234, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 3.2319939136505127, "rewards/margins": 1.1683108806610107, "rewards/rejected": 2.063683032989502, "step": 11199 }, { "epoch": 1.82, "learning_rate": 1.5911617906927284e-07, "logits/chosen": -0.24503281712532043, "logits/rejected": -0.2689114511013031, "logps/chosen": -15.299766540527344, "logps/rejected": -19.76010513305664, "loss": 1.2921, "rewards/accuracies": 0.0, "rewards/chosen": 0.173472598195076, "rewards/margins": -0.4298017621040344, "rewards/rejected": 0.6032743453979492, "step": 11200 }, { "epoch": 1.82, "learning_rate": 1.5902004412983276e-07, "logits/chosen": -0.621052622795105, "logits/rejected": -0.5055612325668335, "logps/chosen": -81.04310607910156, "logps/rejected": -97.47596740722656, "loss": 0.3589, "rewards/accuracies": 0.0, "rewards/chosen": 3.3385207653045654, "rewards/margins": -0.03996586799621582, "rewards/rejected": 3.3784866333007812, "step": 11201 }, { "epoch": 1.82, "learning_rate": 1.5892393274888117e-07, "logits/chosen": -0.8652605414390564, "logits/rejected": -0.8786470293998718, "logps/chosen": -93.22367858886719, "logps/rejected": -151.0176239013672, "loss": 1.3249, "rewards/accuracies": 0.0, "rewards/chosen": 2.2591354846954346, "rewards/margins": -1.8663713932037354, "rewards/rejected": 4.12550687789917, "step": 11202 }, { "epoch": 1.82, "learning_rate": 1.588278449330586e-07, "logits/chosen": -0.8699038624763489, "logits/rejected": -0.7591440081596375, "logps/chosen": -111.40009307861328, "logps/rejected": -73.13800048828125, "loss": 0.1364, "rewards/accuracies": 1.0, "rewards/chosen": 4.894478797912598, "rewards/margins": 3.2505249977111816, "rewards/rejected": 1.6439536809921265, "step": 11203 }, { "epoch": 1.82, "learning_rate": 1.5873178068900362e-07, "logits/chosen": -0.5105686783790588, "logits/rejected": -0.5118784308433533, "logps/chosen": -81.85796356201172, "logps/rejected": -38.37968063354492, "loss": 2.1977, "rewards/accuracies": 0.0, "rewards/chosen": 1.286891222000122, "rewards/margins": -0.46445345878601074, "rewards/rejected": 1.7513446807861328, "step": 11204 }, { "epoch": 1.82, "learning_rate": 1.5863574002335368e-07, "logits/chosen": -0.8535888195037842, "logits/rejected": -0.8482896089553833, "logps/chosen": -58.1728401184082, "logps/rejected": -75.97117614746094, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.9505214691162109, "rewards/margins": -1.0803768634796143, "rewards/rejected": 2.030898332595825, "step": 11205 }, { "epoch": 1.82, "learning_rate": 1.5853972294274387e-07, "logits/chosen": -0.6072774529457092, "logits/rejected": -0.608015775680542, "logps/chosen": -70.92694854736328, "logps/rejected": -43.166168212890625, "loss": 0.7934, "rewards/accuracies": 0.0, "rewards/chosen": 1.0649559497833252, "rewards/margins": -0.5230395793914795, "rewards/rejected": 1.5879955291748047, "step": 11206 }, { "epoch": 1.82, "learning_rate": 1.5844372945380829e-07, "logits/chosen": -0.7428228855133057, "logits/rejected": -0.6446526646614075, "logps/chosen": -75.22003173828125, "logps/rejected": -60.045135498046875, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 2.0748627185821533, "rewards/margins": 0.587908148765564, "rewards/rejected": 1.4869545698165894, "step": 11207 }, { "epoch": 1.82, "learning_rate": 1.5834775956317935e-07, "logits/chosen": -0.6372406482696533, "logits/rejected": -0.6140996217727661, "logps/chosen": -83.31326293945312, "logps/rejected": -94.35777282714844, "loss": 0.7034, "rewards/accuracies": 1.0, "rewards/chosen": 2.3337998390197754, "rewards/margins": 0.5276939868927002, "rewards/rejected": 1.8061058521270752, "step": 11208 }, { "epoch": 1.82, "learning_rate": 1.5825181327748727e-07, "logits/chosen": -0.8994734883308411, "logits/rejected": -0.8123015761375427, "logps/chosen": -48.1789665222168, "logps/rejected": -40.87791061401367, "loss": 0.4483, "rewards/accuracies": 1.0, "rewards/chosen": 3.2479207515716553, "rewards/margins": 1.32108473777771, "rewards/rejected": 1.9268360137939453, "step": 11209 }, { "epoch": 1.82, "learning_rate": 1.5815589060336148e-07, "logits/chosen": -1.6726752519607544, "logits/rejected": -1.5217550992965698, "logps/chosen": -84.71855926513672, "logps/rejected": -188.12774658203125, "loss": 2.0559, "rewards/accuracies": 0.0, "rewards/chosen": 2.6192924976348877, "rewards/margins": -2.904268503189087, "rewards/rejected": 5.523561000823975, "step": 11210 }, { "epoch": 1.82, "learning_rate": 1.580599915474289e-07, "logits/chosen": -0.7347248792648315, "logits/rejected": -0.6804466843605042, "logps/chosen": -47.11534118652344, "logps/rejected": -46.717567443847656, "loss": 0.66, "rewards/accuracies": 0.0, "rewards/chosen": 0.9913978576660156, "rewards/margins": -0.34901511669158936, "rewards/rejected": 1.340412974357605, "step": 11211 }, { "epoch": 1.82, "learning_rate": 1.5796411611631554e-07, "logits/chosen": -0.7652960419654846, "logits/rejected": -0.57468181848526, "logps/chosen": -212.31362915039062, "logps/rejected": -49.66778564453125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 5.935772895812988, "rewards/margins": 4.487876892089844, "rewards/rejected": 1.447895884513855, "step": 11212 }, { "epoch": 1.82, "learning_rate": 1.578682643166453e-07, "logits/chosen": -0.4357498288154602, "logits/rejected": -0.3738763928413391, "logps/chosen": -39.146488189697266, "logps/rejected": -48.68885040283203, "loss": 0.3925, "rewards/accuracies": 1.0, "rewards/chosen": 1.5872459411621094, "rewards/margins": 1.0602840185165405, "rewards/rejected": 0.5269619226455688, "step": 11213 }, { "epoch": 1.82, "learning_rate": 1.5777243615504082e-07, "logits/chosen": -0.8391073346138, "logits/rejected": -0.9490086436271667, "logps/chosen": -42.052589416503906, "logps/rejected": -105.44389343261719, "loss": 2.0377, "rewards/accuracies": 0.0, "rewards/chosen": 2.9106223583221436, "rewards/margins": -3.3950297832489014, "rewards/rejected": 6.305652141571045, "step": 11214 }, { "epoch": 1.82, "learning_rate": 1.5767663163812267e-07, "logits/chosen": -0.5188607573509216, "logits/rejected": -0.41550111770629883, "logps/chosen": -52.533546447753906, "logps/rejected": -68.61373901367188, "loss": 1.1875, "rewards/accuracies": 0.0, "rewards/chosen": 1.4302314519882202, "rewards/margins": -0.20949101448059082, "rewards/rejected": 1.639722466468811, "step": 11215 }, { "epoch": 1.82, "learning_rate": 1.5758085077251038e-07, "logits/chosen": -1.0330959558486938, "logits/rejected": -1.0189465284347534, "logps/chosen": -105.37462615966797, "logps/rejected": -68.31051635742188, "loss": 0.6376, "rewards/accuracies": 1.0, "rewards/chosen": 1.5633049011230469, "rewards/margins": 0.17812573909759521, "rewards/rejected": 1.3851791620254517, "step": 11216 }, { "epoch": 1.82, "learning_rate": 1.5748509356482108e-07, "logits/chosen": -0.6487346887588501, "logits/rejected": -0.6487346887588501, "logps/chosen": -173.4351806640625, "logps/rejected": -173.4351806640625, "loss": 0.3566, "rewards/accuracies": 0.0, "rewards/chosen": 5.855309963226318, "rewards/margins": 0.0, "rewards/rejected": 5.855309963226318, "step": 11217 }, { "epoch": 1.82, "learning_rate": 1.5738936002167114e-07, "logits/chosen": -0.47127285599708557, "logits/rejected": -0.4292284846305847, "logps/chosen": -67.14530944824219, "logps/rejected": -99.84466552734375, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": 3.4957566261291504, "rewards/margins": 2.103318691253662, "rewards/rejected": 1.3924378156661987, "step": 11218 }, { "epoch": 1.82, "learning_rate": 1.5729365014967438e-07, "logits/chosen": -0.5088186860084534, "logits/rejected": -0.43190503120422363, "logps/chosen": -105.47389221191406, "logps/rejected": -22.799983978271484, "loss": 0.1863, "rewards/accuracies": 1.0, "rewards/chosen": 1.9376251697540283, "rewards/margins": 1.8067569732666016, "rewards/rejected": 0.13086815178394318, "step": 11219 }, { "epoch": 1.82, "learning_rate": 1.5719796395544387e-07, "logits/chosen": -0.661129355430603, "logits/rejected": -0.703803539276123, "logps/chosen": -78.18746185302734, "logps/rejected": -73.20215606689453, "loss": 0.5817, "rewards/accuracies": 0.0, "rewards/chosen": 0.7784385681152344, "rewards/margins": -0.55096435546875, "rewards/rejected": 1.3294029235839844, "step": 11220 }, { "epoch": 1.82, "learning_rate": 1.571023014455903e-07, "logits/chosen": -0.6781727075576782, "logits/rejected": -0.7385222315788269, "logps/chosen": -178.59088134765625, "logps/rejected": -124.23844909667969, "loss": 0.8966, "rewards/accuracies": 1.0, "rewards/chosen": 3.394360303878784, "rewards/margins": 1.6347670555114746, "rewards/rejected": 1.7595932483673096, "step": 11221 }, { "epoch": 1.82, "learning_rate": 1.5700666262672324e-07, "logits/chosen": -0.8035205602645874, "logits/rejected": -0.763480544090271, "logps/chosen": -101.88261413574219, "logps/rejected": -50.765281677246094, "loss": 0.7441, "rewards/accuracies": 0.0, "rewards/chosen": 1.0666275024414062, "rewards/margins": -0.2245635986328125, "rewards/rejected": 1.2911911010742188, "step": 11222 }, { "epoch": 1.82, "learning_rate": 1.5691104750545027e-07, "logits/chosen": -0.4614933729171753, "logits/rejected": -0.40377140045166016, "logps/chosen": -56.776885986328125, "logps/rejected": -48.157020568847656, "loss": 0.6956, "rewards/accuracies": 1.0, "rewards/chosen": 1.8441460132598877, "rewards/margins": 0.5831276178359985, "rewards/rejected": 1.2610183954238892, "step": 11223 }, { "epoch": 1.82, "learning_rate": 1.5681545608837775e-07, "logits/chosen": -0.6530205607414246, "logits/rejected": -0.5984647870063782, "logps/chosen": -111.59518432617188, "logps/rejected": -66.20878601074219, "loss": 1.3496, "rewards/accuracies": 0.0, "rewards/chosen": 0.7735671997070312, "rewards/margins": -1.1523849964141846, "rewards/rejected": 1.9259521961212158, "step": 11224 }, { "epoch": 1.82, "learning_rate": 1.5671988838210975e-07, "logits/chosen": -0.8256187438964844, "logits/rejected": -0.8432759046554565, "logps/chosen": -84.66504669189453, "logps/rejected": -78.35997009277344, "loss": 0.7518, "rewards/accuracies": 0.0, "rewards/chosen": 1.6522018909454346, "rewards/margins": -0.9212760925292969, "rewards/rejected": 2.5734779834747314, "step": 11225 }, { "epoch": 1.82, "learning_rate": 1.5662434439324957e-07, "logits/chosen": -1.1410373449325562, "logits/rejected": -1.131407380104065, "logps/chosen": -73.17732238769531, "logps/rejected": -53.038352966308594, "loss": 0.1319, "rewards/accuracies": 1.0, "rewards/chosen": 1.616297960281372, "rewards/margins": 1.2750507593154907, "rewards/rejected": 0.34124717116355896, "step": 11226 }, { "epoch": 1.82, "learning_rate": 1.565288241283979e-07, "logits/chosen": -0.5324143767356873, "logits/rejected": -0.6248897314071655, "logps/chosen": -74.79364013671875, "logps/rejected": -93.42780303955078, "loss": 0.7528, "rewards/accuracies": 0.0, "rewards/chosen": 2.702096700668335, "rewards/margins": -0.9347662925720215, "rewards/rejected": 3.6368629932403564, "step": 11227 }, { "epoch": 1.82, "learning_rate": 1.5643332759415473e-07, "logits/chosen": -0.8993185758590698, "logits/rejected": -0.9188474416732788, "logps/chosen": -233.6044921875, "logps/rejected": -74.10063171386719, "loss": 0.2259, "rewards/accuracies": 1.0, "rewards/chosen": 4.313726902008057, "rewards/margins": 1.8267128467559814, "rewards/rejected": 2.487014055252075, "step": 11228 }, { "epoch": 1.82, "learning_rate": 1.563378547971176e-07, "logits/chosen": -0.5673830509185791, "logits/rejected": -0.5673830509185791, "logps/chosen": -96.43998718261719, "logps/rejected": -96.43998718261719, "loss": 0.8712, "rewards/accuracies": 0.0, "rewards/chosen": 1.0241622924804688, "rewards/margins": 0.0, "rewards/rejected": 1.0241622924804688, "step": 11229 }, { "epoch": 1.82, "learning_rate": 1.5624240574388304e-07, "logits/chosen": -0.9665385484695435, "logits/rejected": -0.7984267473220825, "logps/chosen": -173.71905517578125, "logps/rejected": -62.220420837402344, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 6.289649963378906, "rewards/margins": 5.682656288146973, "rewards/rejected": 0.6069938540458679, "step": 11230 }, { "epoch": 1.82, "learning_rate": 1.5614698044104546e-07, "logits/chosen": -0.8668524622917175, "logits/rejected": -0.7936044335365295, "logps/chosen": -128.1672821044922, "logps/rejected": -73.43862915039062, "loss": 1.7776, "rewards/accuracies": 1.0, "rewards/chosen": 4.619683742523193, "rewards/margins": 3.040231227874756, "rewards/rejected": 1.5794525146484375, "step": 11231 }, { "epoch": 1.82, "learning_rate": 1.5605157889519816e-07, "logits/chosen": -0.9392393231391907, "logits/rejected": -0.7369226813316345, "logps/chosen": -124.53610229492188, "logps/rejected": -117.57687377929688, "loss": 0.2979, "rewards/accuracies": 1.0, "rewards/chosen": 4.769317626953125, "rewards/margins": 2.507333278656006, "rewards/rejected": 2.261984348297119, "step": 11232 }, { "epoch": 1.82, "learning_rate": 1.559562011129321e-07, "logits/chosen": -0.8545331954956055, "logits/rejected": -0.6641265153884888, "logps/chosen": -60.37303161621094, "logps/rejected": -70.76760864257812, "loss": 0.247, "rewards/accuracies": 1.0, "rewards/chosen": 1.7723503112792969, "rewards/margins": 0.583000898361206, "rewards/rejected": 1.1893494129180908, "step": 11233 }, { "epoch": 1.82, "learning_rate": 1.5586084710083736e-07, "logits/chosen": -0.4881652295589447, "logits/rejected": -0.5035249590873718, "logps/chosen": -85.08975219726562, "logps/rejected": -43.77116012573242, "loss": 0.5052, "rewards/accuracies": 0.0, "rewards/chosen": 0.7532661557197571, "rewards/margins": -0.5288929343223572, "rewards/rejected": 1.2821590900421143, "step": 11234 }, { "epoch": 1.82, "learning_rate": 1.557655168655016e-07, "logits/chosen": -0.8064731359481812, "logits/rejected": -0.8254294991493225, "logps/chosen": -46.880645751953125, "logps/rejected": -39.100379943847656, "loss": 0.5848, "rewards/accuracies": 1.0, "rewards/chosen": 1.2843936681747437, "rewards/margins": 0.13629484176635742, "rewards/rejected": 1.1480988264083862, "step": 11235 }, { "epoch": 1.82, "learning_rate": 1.5567021041351165e-07, "logits/chosen": -0.4277040958404541, "logits/rejected": -0.3472493290901184, "logps/chosen": -75.4698257446289, "logps/rejected": -39.71424865722656, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": 1.2136772871017456, "rewards/margins": 0.42123496532440186, "rewards/rejected": 0.7924423217773438, "step": 11236 }, { "epoch": 1.82, "learning_rate": 1.5557492775145186e-07, "logits/chosen": -0.7077725529670715, "logits/rejected": -0.6418853402137756, "logps/chosen": -62.06902313232422, "logps/rejected": -17.980880737304688, "loss": 0.8044, "rewards/accuracies": 1.0, "rewards/chosen": 2.4784958362579346, "rewards/margins": 2.407081365585327, "rewards/rejected": 0.07141437381505966, "step": 11237 }, { "epoch": 1.82, "learning_rate": 1.554796688859058e-07, "logits/chosen": -0.5640609860420227, "logits/rejected": -0.5652827620506287, "logps/chosen": -99.34795379638672, "logps/rejected": -62.04701614379883, "loss": 1.7671, "rewards/accuracies": 0.0, "rewards/chosen": 1.774335503578186, "rewards/margins": -0.7816821336746216, "rewards/rejected": 2.5560176372528076, "step": 11238 }, { "epoch": 1.82, "learning_rate": 1.5538443382345456e-07, "logits/chosen": -0.552307665348053, "logits/rejected": -0.5029814839363098, "logps/chosen": -64.18647766113281, "logps/rejected": -60.53997802734375, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": 1.9330886602401733, "rewards/margins": 0.46478569507598877, "rewards/rejected": 1.4683029651641846, "step": 11239 }, { "epoch": 1.82, "learning_rate": 1.5528922257067833e-07, "logits/chosen": -0.9755431413650513, "logits/rejected": -0.6634846925735474, "logps/chosen": -65.59957885742188, "logps/rejected": -34.32986831665039, "loss": 0.6367, "rewards/accuracies": 1.0, "rewards/chosen": 3.6027016639709473, "rewards/margins": 2.4377923011779785, "rewards/rejected": 1.1649093627929688, "step": 11240 }, { "epoch": 1.82, "learning_rate": 1.5519403513415496e-07, "logits/chosen": -0.7222158312797546, "logits/rejected": -0.6386209726333618, "logps/chosen": -36.03348159790039, "logps/rejected": -59.311927795410156, "loss": 0.5778, "rewards/accuracies": 1.0, "rewards/chosen": 1.324309229850769, "rewards/margins": 0.33359116315841675, "rewards/rejected": 0.9907180666923523, "step": 11241 }, { "epoch": 1.82, "learning_rate": 1.5509887152046137e-07, "logits/chosen": -0.8892288208007812, "logits/rejected": -0.822725236415863, "logps/chosen": -82.15602111816406, "logps/rejected": -92.86366271972656, "loss": 0.3868, "rewards/accuracies": 0.0, "rewards/chosen": 2.2470009326934814, "rewards/margins": -0.09441447257995605, "rewards/rejected": 2.3414154052734375, "step": 11242 }, { "epoch": 1.82, "learning_rate": 1.5500373173617204e-07, "logits/chosen": -0.6345855593681335, "logits/rejected": -0.6345855593681335, "logps/chosen": -26.515331268310547, "logps/rejected": -26.515331268310547, "loss": 0.3513, "rewards/accuracies": 0.0, "rewards/chosen": 1.2239669561386108, "rewards/margins": 0.0, "rewards/rejected": 1.2239669561386108, "step": 11243 }, { "epoch": 1.83, "learning_rate": 1.5490861578786053e-07, "logits/chosen": -0.9040542840957642, "logits/rejected": -0.7586565017700195, "logps/chosen": -100.71965789794922, "logps/rejected": -14.77387523651123, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/chosen": 1.8665459156036377, "rewards/margins": 1.5258523225784302, "rewards/rejected": 0.34069356322288513, "step": 11244 }, { "epoch": 1.83, "learning_rate": 1.5481352368209855e-07, "logits/chosen": -0.23820176720619202, "logits/rejected": -0.23820176720619202, "logps/chosen": -77.11167907714844, "logps/rejected": -77.11167907714844, "loss": 0.3522, "rewards/accuracies": 0.0, "rewards/chosen": 1.4660552740097046, "rewards/margins": 0.0, "rewards/rejected": 1.4660552740097046, "step": 11245 }, { "epoch": 1.83, "learning_rate": 1.547184554254557e-07, "logits/chosen": -0.6668252348899841, "logits/rejected": -0.7105975151062012, "logps/chosen": -63.22016143798828, "logps/rejected": -66.06946563720703, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 1.6772254705429077, "rewards/margins": -0.2773773670196533, "rewards/rejected": 1.954602837562561, "step": 11246 }, { "epoch": 1.83, "learning_rate": 1.5462341102450067e-07, "logits/chosen": -0.8554064631462097, "logits/rejected": -0.6987821459770203, "logps/chosen": -119.8883285522461, "logps/rejected": -77.74992370605469, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": 4.918755531311035, "rewards/margins": 2.4062082767486572, "rewards/rejected": 2.512547254562378, "step": 11247 }, { "epoch": 1.83, "learning_rate": 1.545283904857998e-07, "logits/chosen": -0.6732361316680908, "logits/rejected": -0.6055368781089783, "logps/chosen": -44.252437591552734, "logps/rejected": -59.747013092041016, "loss": 0.7778, "rewards/accuracies": 0.0, "rewards/chosen": 1.002579927444458, "rewards/margins": -0.26598429679870605, "rewards/rejected": 1.268564224243164, "step": 11248 }, { "epoch": 1.83, "learning_rate": 1.5443339381591842e-07, "logits/chosen": -0.5510231852531433, "logits/rejected": -0.6071062684059143, "logps/chosen": -60.16188430786133, "logps/rejected": -81.67521667480469, "loss": 0.7722, "rewards/accuracies": 0.0, "rewards/chosen": 0.6988796591758728, "rewards/margins": -0.486532986164093, "rewards/rejected": 1.1854126453399658, "step": 11249 }, { "epoch": 1.83, "learning_rate": 1.543384210214196e-07, "logits/chosen": -0.6298527121543884, "logits/rejected": -0.6218114495277405, "logps/chosen": -115.13375854492188, "logps/rejected": -93.45152282714844, "loss": 0.48, "rewards/accuracies": 0.0, "rewards/chosen": 1.6009124517440796, "rewards/margins": -0.4484292268753052, "rewards/rejected": 2.0493416786193848, "step": 11250 }, { "epoch": 1.83, "learning_rate": 1.5424347210886534e-07, "logits/chosen": -0.7976697087287903, "logits/rejected": -0.8095963597297668, "logps/chosen": -69.82320404052734, "logps/rejected": -46.177574157714844, "loss": 0.5152, "rewards/accuracies": 1.0, "rewards/chosen": 2.4325568675994873, "rewards/margins": 0.10072541236877441, "rewards/rejected": 2.331831455230713, "step": 11251 }, { "epoch": 1.83, "learning_rate": 1.541485470848154e-07, "logits/chosen": -0.6974227428436279, "logits/rejected": -0.6622283458709717, "logps/chosen": -73.02926635742188, "logps/rejected": -121.51993560791016, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": 3.3022828102111816, "rewards/margins": 2.610715627670288, "rewards/rejected": 0.6915672421455383, "step": 11252 }, { "epoch": 1.83, "learning_rate": 1.540536459558286e-07, "logits/chosen": -0.7701296210289001, "logits/rejected": -0.725430428981781, "logps/chosen": -60.1950569152832, "logps/rejected": -80.77204132080078, "loss": 1.0199, "rewards/accuracies": 1.0, "rewards/chosen": 1.3825161457061768, "rewards/margins": 0.9626216888427734, "rewards/rejected": 0.41989442706108093, "step": 11253 }, { "epoch": 1.83, "learning_rate": 1.5395876872846132e-07, "logits/chosen": -0.9854601621627808, "logits/rejected": -0.9360879063606262, "logps/chosen": -45.65382385253906, "logps/rejected": -102.71282958984375, "loss": 1.3218, "rewards/accuracies": 1.0, "rewards/chosen": 1.679449439048767, "rewards/margins": 0.23970937728881836, "rewards/rejected": 1.4397400617599487, "step": 11254 }, { "epoch": 1.83, "learning_rate": 1.5386391540926897e-07, "logits/chosen": -0.27335119247436523, "logits/rejected": -0.2665751576423645, "logps/chosen": -18.855276107788086, "logps/rejected": -2.1392948627471924, "loss": 0.4443, "rewards/accuracies": 0.0, "rewards/chosen": 0.03967132791876793, "rewards/margins": -0.3401278257369995, "rewards/rejected": 0.37979915738105774, "step": 11255 }, { "epoch": 1.83, "learning_rate": 1.5376908600480477e-07, "logits/chosen": -0.8623810410499573, "logits/rejected": -0.8336163759231567, "logps/chosen": -72.16963195800781, "logps/rejected": -69.66362762451172, "loss": 0.8894, "rewards/accuracies": 0.0, "rewards/chosen": 0.8870956301689148, "rewards/margins": -1.5872673988342285, "rewards/rejected": 2.474363088607788, "step": 11256 }, { "epoch": 1.83, "learning_rate": 1.536742805216208e-07, "logits/chosen": -0.7263191342353821, "logits/rejected": -0.6420701146125793, "logps/chosen": -95.53575897216797, "logps/rejected": -78.3450698852539, "loss": 0.4222, "rewards/accuracies": 1.0, "rewards/chosen": 5.109900951385498, "rewards/margins": 1.6235339641571045, "rewards/rejected": 3.4863669872283936, "step": 11257 }, { "epoch": 1.83, "learning_rate": 1.5357949896626687e-07, "logits/chosen": -0.9120974540710449, "logits/rejected": -0.9073776006698608, "logps/chosen": -89.482177734375, "logps/rejected": -59.580780029296875, "loss": 0.2103, "rewards/accuracies": 1.0, "rewards/chosen": 3.3406739234924316, "rewards/margins": 0.8590302467346191, "rewards/rejected": 2.4816436767578125, "step": 11258 }, { "epoch": 1.83, "learning_rate": 1.5348474134529194e-07, "logits/chosen": -1.3563289642333984, "logits/rejected": -1.3011831045150757, "logps/chosen": -159.75909423828125, "logps/rejected": -38.132266998291016, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 1.3690826892852783, "rewards/margins": 1.1063389778137207, "rewards/rejected": 0.2627437710762024, "step": 11259 }, { "epoch": 1.83, "learning_rate": 1.5339000766524247e-07, "logits/chosen": -0.803932249546051, "logits/rejected": -0.7167274951934814, "logps/chosen": -187.72451782226562, "logps/rejected": -26.74095916748047, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": 4.546075344085693, "rewards/margins": 3.771941661834717, "rewards/rejected": 0.7741336822509766, "step": 11260 }, { "epoch": 1.83, "learning_rate": 1.5329529793266394e-07, "logits/chosen": -0.46512463688850403, "logits/rejected": -0.5315234065055847, "logps/chosen": -44.53938674926758, "logps/rejected": -63.89115524291992, "loss": 0.9468, "rewards/accuracies": 0.0, "rewards/chosen": 2.3694050312042236, "rewards/margins": -1.6837480068206787, "rewards/rejected": 4.053153038024902, "step": 11261 }, { "epoch": 1.83, "learning_rate": 1.5320061215409958e-07, "logits/chosen": -1.0358988046646118, "logits/rejected": -1.0495631694793701, "logps/chosen": -46.104400634765625, "logps/rejected": -70.912841796875, "loss": 0.3314, "rewards/accuracies": 1.0, "rewards/chosen": 0.4371933043003082, "rewards/margins": 0.09419402480125427, "rewards/rejected": 0.34299927949905396, "step": 11262 }, { "epoch": 1.83, "learning_rate": 1.531059503360917e-07, "logits/chosen": -0.8672202229499817, "logits/rejected": -0.9133815169334412, "logps/chosen": -225.7941131591797, "logps/rejected": -109.48451232910156, "loss": 1.244, "rewards/accuracies": 0.0, "rewards/chosen": 3.691462755203247, "rewards/margins": -2.29801344871521, "rewards/rejected": 5.989476203918457, "step": 11263 }, { "epoch": 1.83, "learning_rate": 1.5301131248518011e-07, "logits/chosen": -0.32635799050331116, "logits/rejected": -0.2921166718006134, "logps/chosen": -46.3881721496582, "logps/rejected": -65.70345306396484, "loss": 0.9551, "rewards/accuracies": 0.0, "rewards/chosen": 0.9727336764335632, "rewards/margins": -1.7279162406921387, "rewards/rejected": 2.7006499767303467, "step": 11264 }, { "epoch": 1.83, "learning_rate": 1.529166986079038e-07, "logits/chosen": -0.5026200413703918, "logits/rejected": -0.3910863995552063, "logps/chosen": -47.46025466918945, "logps/rejected": -78.12129211425781, "loss": 1.0218, "rewards/accuracies": 0.0, "rewards/chosen": 1.9793987274169922, "rewards/margins": -0.9015820026397705, "rewards/rejected": 2.8809807300567627, "step": 11265 }, { "epoch": 1.83, "learning_rate": 1.5282210871079925e-07, "logits/chosen": -1.1736598014831543, "logits/rejected": -1.1736598014831543, "logps/chosen": -138.06411743164062, "logps/rejected": -138.06411743164062, "loss": 0.3476, "rewards/accuracies": 0.0, "rewards/chosen": 5.513421535491943, "rewards/margins": 0.0, "rewards/rejected": 5.513421535491943, "step": 11266 }, { "epoch": 1.83, "learning_rate": 1.5272754280040216e-07, "logits/chosen": -0.5966761708259583, "logits/rejected": -0.5368408560752869, "logps/chosen": -29.293498992919922, "logps/rejected": -46.452308654785156, "loss": 0.766, "rewards/accuracies": 1.0, "rewards/chosen": 1.3241184949874878, "rewards/margins": 0.17861366271972656, "rewards/rejected": 1.1455048322677612, "step": 11267 }, { "epoch": 1.83, "learning_rate": 1.5263300088324576e-07, "logits/chosen": -0.597950279712677, "logits/rejected": -0.5783727169036865, "logps/chosen": -128.7691650390625, "logps/rejected": -100.87057495117188, "loss": 0.5914, "rewards/accuracies": 0.0, "rewards/chosen": -0.0817413330078125, "rewards/margins": -0.6185150146484375, "rewards/rejected": 0.536773681640625, "step": 11268 }, { "epoch": 1.83, "learning_rate": 1.5253848296586237e-07, "logits/chosen": -1.1179146766662598, "logits/rejected": -1.334848165512085, "logps/chosen": -95.78314208984375, "logps/rejected": -35.1292610168457, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 1.905513048171997, "rewards/margins": 1.464793086051941, "rewards/rejected": 0.44071999192237854, "step": 11269 }, { "epoch": 1.83, "learning_rate": 1.5244398905478196e-07, "logits/chosen": -0.5437746047973633, "logits/rejected": -0.5632500052452087, "logps/chosen": -61.78794860839844, "logps/rejected": -137.0537872314453, "loss": 0.4188, "rewards/accuracies": 1.0, "rewards/chosen": 1.7343734502792358, "rewards/margins": 0.43529653549194336, "rewards/rejected": 1.2990769147872925, "step": 11270 }, { "epoch": 1.83, "learning_rate": 1.5234951915653337e-07, "logits/chosen": -0.21803796291351318, "logits/rejected": -0.21803796291351318, "logps/chosen": -21.349706649780273, "logps/rejected": -21.349706649780273, "loss": 0.6377, "rewards/accuracies": 0.0, "rewards/chosen": 0.20262031257152557, "rewards/margins": 0.0, "rewards/rejected": 0.20262031257152557, "step": 11271 }, { "epoch": 1.83, "learning_rate": 1.522550732776434e-07, "logits/chosen": -0.7874784469604492, "logits/rejected": -0.8005173206329346, "logps/chosen": -72.01087951660156, "logps/rejected": -112.05950164794922, "loss": 0.4632, "rewards/accuracies": 0.0, "rewards/chosen": 1.901800513267517, "rewards/margins": -0.2803748846054077, "rewards/rejected": 2.182175397872925, "step": 11272 }, { "epoch": 1.83, "learning_rate": 1.5216065142463764e-07, "logits/chosen": -0.7481168508529663, "logits/rejected": -0.7479497194290161, "logps/chosen": -54.80752182006836, "logps/rejected": -67.49420166015625, "loss": 0.6484, "rewards/accuracies": 0.0, "rewards/chosen": 1.759154200553894, "rewards/margins": -0.4874011278152466, "rewards/rejected": 2.2465553283691406, "step": 11273 }, { "epoch": 1.83, "learning_rate": 1.5206625360403942e-07, "logits/chosen": -0.6115211248397827, "logits/rejected": -0.6214672923088074, "logps/chosen": -94.85507202148438, "logps/rejected": -88.97136688232422, "loss": 1.6618, "rewards/accuracies": 0.0, "rewards/chosen": 1.2386047840118408, "rewards/margins": -0.8924782276153564, "rewards/rejected": 2.1310830116271973, "step": 11274 }, { "epoch": 1.83, "learning_rate": 1.519718798223711e-07, "logits/chosen": -0.8005760312080383, "logits/rejected": -0.8259972929954529, "logps/chosen": -52.47118377685547, "logps/rejected": -72.22966003417969, "loss": 0.8103, "rewards/accuracies": 1.0, "rewards/chosen": 1.6384468078613281, "rewards/margins": 0.5573348999023438, "rewards/rejected": 1.0811119079589844, "step": 11275 }, { "epoch": 1.83, "learning_rate": 1.5187753008615256e-07, "logits/chosen": -0.632588803768158, "logits/rejected": -0.6805404424667358, "logps/chosen": -61.68836975097656, "logps/rejected": -84.61664581298828, "loss": 0.4685, "rewards/accuracies": 0.0, "rewards/chosen": 1.6166809797286987, "rewards/margins": -0.14608383178710938, "rewards/rejected": 1.762764811515808, "step": 11276 }, { "epoch": 1.83, "learning_rate": 1.5178320440190295e-07, "logits/chosen": -0.7408644556999207, "logits/rejected": -0.7577581405639648, "logps/chosen": -83.36495971679688, "logps/rejected": -91.26390838623047, "loss": 0.7123, "rewards/accuracies": 0.0, "rewards/chosen": 2.0529258251190186, "rewards/margins": -1.106135606765747, "rewards/rejected": 3.1590614318847656, "step": 11277 }, { "epoch": 1.83, "learning_rate": 1.5168890277613887e-07, "logits/chosen": -0.6374874114990234, "logits/rejected": -0.6374874114990234, "logps/chosen": -49.62110137939453, "logps/rejected": -49.62110137939453, "loss": 0.5461, "rewards/accuracies": 0.0, "rewards/chosen": 1.5867042541503906, "rewards/margins": 0.0, "rewards/rejected": 1.5867042541503906, "step": 11278 }, { "epoch": 1.83, "learning_rate": 1.5159462521537585e-07, "logits/chosen": -0.7374731302261353, "logits/rejected": -0.699661374092102, "logps/chosen": -62.45363235473633, "logps/rejected": -78.18517303466797, "loss": 0.3408, "rewards/accuracies": 1.0, "rewards/chosen": 2.344440221786499, "rewards/margins": 0.5139600038528442, "rewards/rejected": 1.8304802179336548, "step": 11279 }, { "epoch": 1.83, "learning_rate": 1.5150037172612778e-07, "logits/chosen": -0.9594579339027405, "logits/rejected": -0.8749896287918091, "logps/chosen": -105.55964660644531, "logps/rejected": -94.54258728027344, "loss": 0.9139, "rewards/accuracies": 0.0, "rewards/chosen": 2.3441452980041504, "rewards/margins": -0.8410849571228027, "rewards/rejected": 3.185230255126953, "step": 11280 }, { "epoch": 1.83, "learning_rate": 1.5140614231490645e-07, "logits/chosen": -0.18744298815727234, "logits/rejected": -0.23672211170196533, "logps/chosen": -62.152000427246094, "logps/rejected": -46.32759094238281, "loss": 0.5962, "rewards/accuracies": 0.0, "rewards/chosen": 1.4878135919570923, "rewards/margins": -0.5217605829238892, "rewards/rejected": 2.0095741748809814, "step": 11281 }, { "epoch": 1.83, "learning_rate": 1.513119369882223e-07, "logits/chosen": -1.0476078987121582, "logits/rejected": -0.961692214012146, "logps/chosen": -81.75321960449219, "logps/rejected": -32.018001556396484, "loss": 0.2376, "rewards/accuracies": 1.0, "rewards/chosen": 0.679534912109375, "rewards/margins": 0.623665988445282, "rewards/rejected": 0.05586891248822212, "step": 11282 }, { "epoch": 1.83, "learning_rate": 1.5121775575258378e-07, "logits/chosen": -0.9661564826965332, "logits/rejected": -1.0257158279418945, "logps/chosen": -80.47225952148438, "logps/rejected": -143.25393676757812, "loss": 3.1531, "rewards/accuracies": 0.0, "rewards/chosen": 1.4696396589279175, "rewards/margins": -4.2521772384643555, "rewards/rejected": 5.7218170166015625, "step": 11283 }, { "epoch": 1.83, "learning_rate": 1.511235986144983e-07, "logits/chosen": -0.8167980313301086, "logits/rejected": -0.6854987144470215, "logps/chosen": -98.81718444824219, "logps/rejected": -32.60352325439453, "loss": 0.0405, "rewards/accuracies": 1.0, "rewards/chosen": 2.7037079334259033, "rewards/margins": 2.5841290950775146, "rewards/rejected": 0.11957893520593643, "step": 11284 }, { "epoch": 1.83, "learning_rate": 1.5102946558047093e-07, "logits/chosen": -0.47921109199523926, "logits/rejected": -0.5176451206207275, "logps/chosen": -80.96278381347656, "logps/rejected": -120.76950073242188, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 1.2290420532226562, "rewards/margins": 0.44817352294921875, "rewards/rejected": 0.7808685302734375, "step": 11285 }, { "epoch": 1.83, "learning_rate": 1.5093535665700562e-07, "logits/chosen": -0.7315249443054199, "logits/rejected": -0.7361996173858643, "logps/chosen": -72.05953979492188, "logps/rejected": -38.755287170410156, "loss": 0.9224, "rewards/accuracies": 0.0, "rewards/chosen": 1.6079552173614502, "rewards/margins": -0.8603339195251465, "rewards/rejected": 2.4682891368865967, "step": 11286 }, { "epoch": 1.83, "learning_rate": 1.508412718506042e-07, "logits/chosen": -0.6764804720878601, "logits/rejected": -0.6322503685951233, "logps/chosen": -81.04350280761719, "logps/rejected": -51.436180114746094, "loss": 0.4461, "rewards/accuracies": 0.0, "rewards/chosen": 1.9696533679962158, "rewards/margins": -0.35161662101745605, "rewards/rejected": 2.321269989013672, "step": 11287 }, { "epoch": 1.83, "learning_rate": 1.5074721116776723e-07, "logits/chosen": -0.747295081615448, "logits/rejected": -0.7115634083747864, "logps/chosen": -95.16724395751953, "logps/rejected": -40.90509796142578, "loss": 0.2085, "rewards/accuracies": 1.0, "rewards/chosen": 0.8845421075820923, "rewards/margins": 0.6648937463760376, "rewards/rejected": 0.2196483612060547, "step": 11288 }, { "epoch": 1.83, "learning_rate": 1.506531746149931e-07, "logits/chosen": -0.16309089958667755, "logits/rejected": -0.15608441829681396, "logps/chosen": -11.116854667663574, "logps/rejected": -20.20233726501465, "loss": 0.3298, "rewards/accuracies": 1.0, "rewards/chosen": 0.26262590289115906, "rewards/margins": 0.10643158853054047, "rewards/rejected": 0.1561943143606186, "step": 11289 }, { "epoch": 1.83, "learning_rate": 1.5055916219877928e-07, "logits/chosen": -0.6265466809272766, "logits/rejected": -0.6087281107902527, "logps/chosen": -59.28794479370117, "logps/rejected": -46.53691101074219, "loss": 0.395, "rewards/accuracies": 0.0, "rewards/chosen": 1.125404715538025, "rewards/margins": -0.08906364440917969, "rewards/rejected": 1.2144683599472046, "step": 11290 }, { "epoch": 1.83, "learning_rate": 1.5046517392562075e-07, "logits/chosen": -0.6286274194717407, "logits/rejected": -0.5619086623191833, "logps/chosen": -80.2981948852539, "logps/rejected": -77.91336059570312, "loss": 3.4378, "rewards/accuracies": 0.0, "rewards/chosen": 1.9574912786483765, "rewards/margins": -1.1966553926467896, "rewards/rejected": 3.154146671295166, "step": 11291 }, { "epoch": 1.83, "learning_rate": 1.503712098020115e-07, "logits/chosen": -0.37429267168045044, "logits/rejected": -0.37429267168045044, "logps/chosen": -53.68251037597656, "logps/rejected": -53.68251037597656, "loss": 0.4208, "rewards/accuracies": 0.0, "rewards/chosen": 0.5603427886962891, "rewards/margins": 0.0, "rewards/rejected": 0.5603427886962891, "step": 11292 }, { "epoch": 1.83, "learning_rate": 1.5027726983444327e-07, "logits/chosen": -0.5913025736808777, "logits/rejected": -0.45606955885887146, "logps/chosen": -104.59890747070312, "logps/rejected": -86.68050384521484, "loss": 0.6507, "rewards/accuracies": 1.0, "rewards/chosen": 4.648868083953857, "rewards/margins": 1.1600642204284668, "rewards/rejected": 3.4888038635253906, "step": 11293 }, { "epoch": 1.83, "learning_rate": 1.501833540294068e-07, "logits/chosen": -0.9376446604728699, "logits/rejected": -0.9473844170570374, "logps/chosen": -136.4928436279297, "logps/rejected": -67.2547836303711, "loss": 0.8262, "rewards/accuracies": 0.0, "rewards/chosen": 1.105157494544983, "rewards/margins": -1.3787559270858765, "rewards/rejected": 2.4839134216308594, "step": 11294 }, { "epoch": 1.83, "learning_rate": 1.5008946239339037e-07, "logits/chosen": -0.5495465397834778, "logits/rejected": -0.5217148065567017, "logps/chosen": -93.99321746826172, "logps/rejected": -79.14603424072266, "loss": 0.5094, "rewards/accuracies": 0.0, "rewards/chosen": 2.2034051418304443, "rewards/margins": -0.18737411499023438, "rewards/rejected": 2.3907792568206787, "step": 11295 }, { "epoch": 1.83, "learning_rate": 1.4999559493288138e-07, "logits/chosen": -0.2937934696674347, "logits/rejected": -0.2937934696674347, "logps/chosen": -55.681392669677734, "logps/rejected": -55.681392669677734, "loss": 0.9979, "rewards/accuracies": 0.0, "rewards/chosen": 1.3693302869796753, "rewards/margins": 0.0, "rewards/rejected": 1.3693302869796753, "step": 11296 }, { "epoch": 1.83, "learning_rate": 1.499017516543648e-07, "logits/chosen": -0.46700426936149597, "logits/rejected": -0.4779030978679657, "logps/chosen": -8.2904634475708, "logps/rejected": -1.8308062553405762, "loss": 0.4503, "rewards/accuracies": 0.0, "rewards/chosen": 0.09639759361743927, "rewards/margins": -0.21969328820705414, "rewards/rejected": 0.3160908818244934, "step": 11297 }, { "epoch": 1.83, "learning_rate": 1.4980793256432472e-07, "logits/chosen": -0.663936972618103, "logits/rejected": -0.6702837944030762, "logps/chosen": -99.98170471191406, "logps/rejected": -58.25044631958008, "loss": 1.1889, "rewards/accuracies": 0.0, "rewards/chosen": -0.2748474180698395, "rewards/margins": -1.4148662090301514, "rewards/rejected": 1.1400188207626343, "step": 11298 }, { "epoch": 1.83, "learning_rate": 1.4971413766924286e-07, "logits/chosen": -0.23893584311008453, "logits/rejected": -0.2237991988658905, "logps/chosen": -3.2516000270843506, "logps/rejected": -17.127323150634766, "loss": 0.3198, "rewards/accuracies": 1.0, "rewards/chosen": 0.4212876260280609, "rewards/margins": 0.17078790068626404, "rewards/rejected": 0.2504997253417969, "step": 11299 }, { "epoch": 1.83, "learning_rate": 1.4962036697559954e-07, "logits/chosen": -0.6996456980705261, "logits/rejected": -0.6996456980705261, "logps/chosen": -62.210472106933594, "logps/rejected": -62.210472106933594, "loss": 0.5874, "rewards/accuracies": 0.0, "rewards/chosen": 1.3649498224258423, "rewards/margins": 0.0, "rewards/rejected": 1.3649498224258423, "step": 11300 }, { "epoch": 1.83, "learning_rate": 1.4952662048987373e-07, "logits/chosen": -0.5187528133392334, "logits/rejected": -0.5918766856193542, "logps/chosen": -118.50538635253906, "logps/rejected": -127.8597640991211, "loss": 3.4369, "rewards/accuracies": 0.0, "rewards/chosen": 1.1679061651229858, "rewards/margins": -3.236100196838379, "rewards/rejected": 4.404006481170654, "step": 11301 }, { "epoch": 1.83, "learning_rate": 1.494328982185421e-07, "logits/chosen": -0.8911835551261902, "logits/rejected": -0.6755994558334351, "logps/chosen": -140.4012451171875, "logps/rejected": -64.3380355834961, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 5.6164703369140625, "rewards/margins": 2.2180838584899902, "rewards/rejected": 3.3983864784240723, "step": 11302 }, { "epoch": 1.83, "learning_rate": 1.4933920016808026e-07, "logits/chosen": -0.43536850810050964, "logits/rejected": -0.5140048265457153, "logps/chosen": -66.32359313964844, "logps/rejected": -62.87058639526367, "loss": 0.6476, "rewards/accuracies": 0.0, "rewards/chosen": 1.5139312744140625, "rewards/margins": -0.5316829681396484, "rewards/rejected": 2.045614242553711, "step": 11303 }, { "epoch": 1.83, "learning_rate": 1.492455263449615e-07, "logits/chosen": -0.6836122870445251, "logits/rejected": -0.6950324773788452, "logps/chosen": -239.97799682617188, "logps/rejected": -241.35508728027344, "loss": 1.2337, "rewards/accuracies": 0.0, "rewards/chosen": 1.451014757156372, "rewards/margins": -1.159576416015625, "rewards/rejected": 2.610591173171997, "step": 11304 }, { "epoch": 1.83, "learning_rate": 1.4915187675565822e-07, "logits/chosen": -0.6740483045578003, "logits/rejected": -0.5691965818405151, "logps/chosen": -141.81089782714844, "logps/rejected": -71.18498229980469, "loss": 0.3226, "rewards/accuracies": 1.0, "rewards/chosen": 5.024241924285889, "rewards/margins": 3.680220127105713, "rewards/rejected": 1.3440216779708862, "step": 11305 }, { "epoch": 1.84, "learning_rate": 1.4905825140664037e-07, "logits/chosen": -0.6874371767044067, "logits/rejected": -0.7083859443664551, "logps/chosen": -63.4604606628418, "logps/rejected": -49.336795806884766, "loss": 1.2939, "rewards/accuracies": 0.0, "rewards/chosen": 1.8079814910888672, "rewards/margins": -1.4383561611175537, "rewards/rejected": 3.246337652206421, "step": 11306 }, { "epoch": 1.84, "learning_rate": 1.4896465030437677e-07, "logits/chosen": -0.5312943458557129, "logits/rejected": -0.5702224969863892, "logps/chosen": -188.12649536132812, "logps/rejected": -85.8994140625, "loss": 0.6321, "rewards/accuracies": 0.0, "rewards/chosen": 2.692744493484497, "rewards/margins": -0.8525245189666748, "rewards/rejected": 3.545269012451172, "step": 11307 }, { "epoch": 1.84, "learning_rate": 1.4887107345533422e-07, "logits/chosen": -0.6600798964500427, "logits/rejected": -0.7761449813842773, "logps/chosen": -252.087158203125, "logps/rejected": -85.171630859375, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": 5.176049709320068, "rewards/margins": 2.102961540222168, "rewards/rejected": 3.0730881690979004, "step": 11308 }, { "epoch": 1.84, "learning_rate": 1.4877752086597818e-07, "logits/chosen": -1.0642757415771484, "logits/rejected": -1.042478322982788, "logps/chosen": -101.81558990478516, "logps/rejected": -62.29624938964844, "loss": 0.5554, "rewards/accuracies": 0.0, "rewards/chosen": 1.606225609779358, "rewards/margins": -0.7061103582382202, "rewards/rejected": 2.312335968017578, "step": 11309 }, { "epoch": 1.84, "learning_rate": 1.4868399254277202e-07, "logits/chosen": -0.3396306037902832, "logits/rejected": -0.3396306037902832, "logps/chosen": -30.016029357910156, "logps/rejected": -30.016029357910156, "loss": 0.3744, "rewards/accuracies": 0.0, "rewards/chosen": 0.4364273250102997, "rewards/margins": 0.0, "rewards/rejected": 0.4364273250102997, "step": 11310 }, { "epoch": 1.84, "learning_rate": 1.4859048849217798e-07, "logits/chosen": -0.7957682609558105, "logits/rejected": -0.8859447836875916, "logps/chosen": -70.17134094238281, "logps/rejected": -87.29428100585938, "loss": 1.4529, "rewards/accuracies": 0.0, "rewards/chosen": 1.3698593378067017, "rewards/margins": -1.3581870794296265, "rewards/rejected": 2.728046417236328, "step": 11311 }, { "epoch": 1.84, "learning_rate": 1.484970087206559e-07, "logits/chosen": -0.9097650647163391, "logits/rejected": -0.88710618019104, "logps/chosen": -47.92283630371094, "logps/rejected": -96.49418640136719, "loss": 0.6572, "rewards/accuracies": 0.0, "rewards/chosen": 1.2369331121444702, "rewards/margins": -0.9917274713516235, "rewards/rejected": 2.2286605834960938, "step": 11312 }, { "epoch": 1.84, "learning_rate": 1.484035532346648e-07, "logits/chosen": -0.6511043906211853, "logits/rejected": -0.6511043906211853, "logps/chosen": -20.569971084594727, "logps/rejected": -20.569971084594727, "loss": 0.4086, "rewards/accuracies": 0.0, "rewards/chosen": -0.0044189454056322575, "rewards/margins": 0.0, "rewards/rejected": -0.0044189454056322575, "step": 11313 }, { "epoch": 1.84, "learning_rate": 1.4831012204066113e-07, "logits/chosen": -0.572201132774353, "logits/rejected": -0.5562453866004944, "logps/chosen": -119.7652816772461, "logps/rejected": -111.13259887695312, "loss": 0.7423, "rewards/accuracies": 1.0, "rewards/chosen": 1.490735650062561, "rewards/margins": 0.39636993408203125, "rewards/rejected": 1.0943657159805298, "step": 11314 }, { "epoch": 1.84, "learning_rate": 1.4821671514510048e-07, "logits/chosen": -0.8807979226112366, "logits/rejected": -0.8188751339912415, "logps/chosen": -77.27120208740234, "logps/rejected": -84.56028747558594, "loss": 0.9414, "rewards/accuracies": 0.0, "rewards/chosen": 1.4410370588302612, "rewards/margins": -1.1832793951034546, "rewards/rejected": 2.624316453933716, "step": 11315 }, { "epoch": 1.84, "learning_rate": 1.4812333255443605e-07, "logits/chosen": -0.41481903195381165, "logits/rejected": -0.3694656193256378, "logps/chosen": -36.64704895019531, "logps/rejected": -39.426246643066406, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 1.1054500341415405, "rewards/margins": 0.15103071928024292, "rewards/rejected": 0.9544193148612976, "step": 11316 }, { "epoch": 1.84, "learning_rate": 1.4802997427511997e-07, "logits/chosen": -0.5288564562797546, "logits/rejected": -0.5309640765190125, "logps/chosen": -45.57219314575195, "logps/rejected": -86.71798706054688, "loss": 0.5537, "rewards/accuracies": 1.0, "rewards/chosen": 1.1542065143585205, "rewards/margins": 0.6670788526535034, "rewards/rejected": 0.4871276915073395, "step": 11317 }, { "epoch": 1.84, "learning_rate": 1.4793664031360214e-07, "logits/chosen": -0.3123311698436737, "logits/rejected": -0.3475433886051178, "logps/chosen": -99.12081909179688, "logps/rejected": -51.8243408203125, "loss": 1.4818, "rewards/accuracies": 0.0, "rewards/chosen": 0.04613799974322319, "rewards/margins": -2.3623206615448, "rewards/rejected": 2.408458709716797, "step": 11318 }, { "epoch": 1.84, "learning_rate": 1.4784333067633142e-07, "logits/chosen": -0.7273861169815063, "logits/rejected": -0.5784607529640198, "logps/chosen": -78.35243225097656, "logps/rejected": -60.613643646240234, "loss": 0.6219, "rewards/accuracies": 0.0, "rewards/chosen": 2.325787305831909, "rewards/margins": -0.32581233978271484, "rewards/rejected": 2.651599645614624, "step": 11319 }, { "epoch": 1.84, "learning_rate": 1.4775004536975422e-07, "logits/chosen": -0.5264520049095154, "logits/rejected": -0.5856850743293762, "logps/chosen": -69.82798767089844, "logps/rejected": -58.420448303222656, "loss": 0.4677, "rewards/accuracies": 0.0, "rewards/chosen": 1.2415863275527954, "rewards/margins": -0.004210710525512695, "rewards/rejected": 1.245797038078308, "step": 11320 }, { "epoch": 1.84, "learning_rate": 1.4765678440031605e-07, "logits/chosen": -0.5879727602005005, "logits/rejected": -0.40743017196655273, "logps/chosen": -72.58566284179688, "logps/rejected": -23.16919708251953, "loss": 0.1005, "rewards/accuracies": 1.0, "rewards/chosen": 2.283149003982544, "rewards/margins": 2.2090394496917725, "rewards/rejected": 0.07410945743322372, "step": 11321 }, { "epoch": 1.84, "learning_rate": 1.4756354777446002e-07, "logits/chosen": -1.0057553052902222, "logits/rejected": -0.8958557844161987, "logps/chosen": -73.01748657226562, "logps/rejected": -80.70333862304688, "loss": 0.548, "rewards/accuracies": 0.0, "rewards/chosen": 1.4162285327911377, "rewards/margins": -0.3977423906326294, "rewards/rejected": 1.813970923423767, "step": 11322 }, { "epoch": 1.84, "learning_rate": 1.4747033549862819e-07, "logits/chosen": -0.41302400827407837, "logits/rejected": -0.41302400827407837, "logps/chosen": -54.57003402709961, "logps/rejected": -54.57003402709961, "loss": 0.3815, "rewards/accuracies": 0.0, "rewards/chosen": 2.0208606719970703, "rewards/margins": 0.0, "rewards/rejected": 2.0208606719970703, "step": 11323 }, { "epoch": 1.84, "learning_rate": 1.4737714757926033e-07, "logits/chosen": -0.8987575173377991, "logits/rejected": -0.8622395992279053, "logps/chosen": -79.00837707519531, "logps/rejected": -153.3224334716797, "loss": 0.7569, "rewards/accuracies": 0.0, "rewards/chosen": 1.5772713422775269, "rewards/margins": -1.0823570489883423, "rewards/rejected": 2.659628391265869, "step": 11324 }, { "epoch": 1.84, "learning_rate": 1.4728398402279525e-07, "logits/chosen": -0.6909945607185364, "logits/rejected": -0.773639976978302, "logps/chosen": -172.29150390625, "logps/rejected": -91.01593780517578, "loss": 1.6756, "rewards/accuracies": 0.0, "rewards/chosen": 4.077511787414551, "rewards/margins": -0.7897820472717285, "rewards/rejected": 4.867293834686279, "step": 11325 }, { "epoch": 1.84, "learning_rate": 1.4719084483566923e-07, "logits/chosen": -0.9255771040916443, "logits/rejected": -0.8741400837898254, "logps/chosen": -171.28842163085938, "logps/rejected": -41.89685821533203, "loss": 0.1385, "rewards/accuracies": 1.0, "rewards/chosen": 1.8590911626815796, "rewards/margins": 1.499387264251709, "rewards/rejected": 0.35970383882522583, "step": 11326 }, { "epoch": 1.84, "learning_rate": 1.470977300243177e-07, "logits/chosen": -1.2235690355300903, "logits/rejected": -1.1642693281173706, "logps/chosen": -73.78672790527344, "logps/rejected": -59.29827117919922, "loss": 0.2827, "rewards/accuracies": 1.0, "rewards/chosen": 2.265789747238159, "rewards/margins": 1.8534587621688843, "rewards/rejected": 0.4123310148715973, "step": 11327 }, { "epoch": 1.84, "learning_rate": 1.4700463959517363e-07, "logits/chosen": -0.6044449806213379, "logits/rejected": -0.563636839389801, "logps/chosen": -61.81089401245117, "logps/rejected": -44.41298294067383, "loss": 2.8636, "rewards/accuracies": 1.0, "rewards/chosen": 2.024203062057495, "rewards/margins": 0.14944064617156982, "rewards/rejected": 1.8747624158859253, "step": 11328 }, { "epoch": 1.84, "learning_rate": 1.469115735546691e-07, "logits/chosen": -0.6395710110664368, "logits/rejected": -0.511295735836029, "logps/chosen": -165.978271484375, "logps/rejected": -82.24949645996094, "loss": 0.859, "rewards/accuracies": 1.0, "rewards/chosen": 5.5767669677734375, "rewards/margins": 3.6049697399139404, "rewards/rejected": 1.971797227859497, "step": 11329 }, { "epoch": 1.84, "learning_rate": 1.468185319092337e-07, "logits/chosen": -0.47156527638435364, "logits/rejected": -0.4686131775379181, "logps/chosen": -29.979488372802734, "logps/rejected": -1.382589340209961, "loss": 0.781, "rewards/accuracies": 0.0, "rewards/chosen": -0.06639480590820312, "rewards/margins": -0.5307914018630981, "rewards/rejected": 0.46439656615257263, "step": 11330 }, { "epoch": 1.84, "learning_rate": 1.467255146652961e-07, "logits/chosen": -0.4056505262851715, "logits/rejected": -0.40213608741760254, "logps/chosen": -12.594484329223633, "logps/rejected": -4.968929767608643, "loss": 1.0506, "rewards/accuracies": 0.0, "rewards/chosen": 0.3608114421367645, "rewards/margins": -0.09054398536682129, "rewards/rejected": 0.4513554275035858, "step": 11331 }, { "epoch": 1.84, "learning_rate": 1.4663252182928255e-07, "logits/chosen": -0.6836231350898743, "logits/rejected": -0.6771066188812256, "logps/chosen": -71.69195556640625, "logps/rejected": -128.9937744140625, "loss": 0.7939, "rewards/accuracies": 0.0, "rewards/chosen": 0.3348281979560852, "rewards/margins": -1.2279372215270996, "rewards/rejected": 1.5627654790878296, "step": 11332 }, { "epoch": 1.84, "learning_rate": 1.465395534076183e-07, "logits/chosen": -0.8838114738464355, "logits/rejected": -0.6075430512428284, "logps/chosen": -308.33148193359375, "logps/rejected": -110.35374450683594, "loss": 0.1148, "rewards/accuracies": 1.0, "rewards/chosen": 6.07643461227417, "rewards/margins": 1.3885669708251953, "rewards/rejected": 4.687867641448975, "step": 11333 }, { "epoch": 1.84, "learning_rate": 1.4644660940672627e-07, "logits/chosen": -0.6019526124000549, "logits/rejected": -0.6409683227539062, "logps/chosen": -62.329856872558594, "logps/rejected": -64.9336166381836, "loss": 1.9955, "rewards/accuracies": 1.0, "rewards/chosen": 2.1343941688537598, "rewards/margins": 1.334346055984497, "rewards/rejected": 0.8000480532646179, "step": 11334 }, { "epoch": 1.84, "learning_rate": 1.463536898330282e-07, "logits/chosen": -0.4402497708797455, "logits/rejected": -0.4069983661174774, "logps/chosen": -60.46137237548828, "logps/rejected": -39.00578308105469, "loss": 0.3146, "rewards/accuracies": 1.0, "rewards/chosen": 2.3848214149475098, "rewards/margins": 0.6440705060958862, "rewards/rejected": 1.7407509088516235, "step": 11335 }, { "epoch": 1.84, "learning_rate": 1.4626079469294406e-07, "logits/chosen": -0.6320881843566895, "logits/rejected": -0.4110575020313263, "logps/chosen": -149.832275390625, "logps/rejected": -46.184104919433594, "loss": 2.1622, "rewards/accuracies": 1.0, "rewards/chosen": 4.472804546356201, "rewards/margins": 2.7112135887145996, "rewards/rejected": 1.7615909576416016, "step": 11336 }, { "epoch": 1.84, "learning_rate": 1.4616792399289173e-07, "logits/chosen": -0.611747145652771, "logits/rejected": -0.6164951324462891, "logps/chosen": -164.9817657470703, "logps/rejected": -79.84597778320312, "loss": 0.3871, "rewards/accuracies": 0.0, "rewards/chosen": 1.1610428094863892, "rewards/margins": -0.11616289615631104, "rewards/rejected": 1.2772057056427002, "step": 11337 }, { "epoch": 1.84, "learning_rate": 1.4607507773928806e-07, "logits/chosen": -0.7765859961509705, "logits/rejected": -0.7658092975616455, "logps/chosen": -70.0916976928711, "logps/rejected": -55.55834197998047, "loss": 0.2756, "rewards/accuracies": 1.0, "rewards/chosen": 2.779059648513794, "rewards/margins": 0.49620962142944336, "rewards/rejected": 2.2828500270843506, "step": 11338 }, { "epoch": 1.84, "learning_rate": 1.4598225593854746e-07, "logits/chosen": -0.4052559435367584, "logits/rejected": -0.16333246231079102, "logps/chosen": -77.47207641601562, "logps/rejected": -29.54938507080078, "loss": 0.8893, "rewards/accuracies": 1.0, "rewards/chosen": 2.1567139625549316, "rewards/margins": 1.9469497203826904, "rewards/rejected": 0.2097642868757248, "step": 11339 }, { "epoch": 1.84, "learning_rate": 1.458894585970834e-07, "logits/chosen": -0.768140971660614, "logits/rejected": -0.6853879690170288, "logps/chosen": -47.62959671020508, "logps/rejected": -78.63497924804688, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 2.535517454147339, "rewards/margins": 0.6039928197860718, "rewards/rejected": 1.931524634361267, "step": 11340 }, { "epoch": 1.84, "learning_rate": 1.45796685721307e-07, "logits/chosen": -0.6782490015029907, "logits/rejected": -0.6068539023399353, "logps/chosen": -108.86605834960938, "logps/rejected": -64.286865234375, "loss": 0.7173, "rewards/accuracies": 1.0, "rewards/chosen": 1.7801055908203125, "rewards/margins": 0.8561103940010071, "rewards/rejected": 0.9239951968193054, "step": 11341 }, { "epoch": 1.84, "learning_rate": 1.457039373176282e-07, "logits/chosen": -0.5363125205039978, "logits/rejected": -0.5653370022773743, "logps/chosen": -43.50038146972656, "logps/rejected": -94.90557861328125, "loss": 0.1824, "rewards/accuracies": 1.0, "rewards/chosen": 1.59703528881073, "rewards/margins": 0.8253906965255737, "rewards/rejected": 0.7716445922851562, "step": 11342 }, { "epoch": 1.84, "learning_rate": 1.4561121339245485e-07, "logits/chosen": -0.3621804118156433, "logits/rejected": -0.3473982512950897, "logps/chosen": -88.20716094970703, "logps/rejected": -49.92552947998047, "loss": 0.4221, "rewards/accuracies": 0.0, "rewards/chosen": 0.7618675231933594, "rewards/margins": -0.263569712638855, "rewards/rejected": 1.0254372358322144, "step": 11343 }, { "epoch": 1.84, "learning_rate": 1.455185139521935e-07, "logits/chosen": -0.5564873218536377, "logits/rejected": -0.5812667608261108, "logps/chosen": -73.32475280761719, "logps/rejected": -84.04734802246094, "loss": 0.4133, "rewards/accuracies": 0.0, "rewards/chosen": 1.1509217023849487, "rewards/margins": -0.014713287353515625, "rewards/rejected": 1.1656349897384644, "step": 11344 }, { "epoch": 1.84, "learning_rate": 1.4542583900324863e-07, "logits/chosen": -0.5163812637329102, "logits/rejected": -0.5157139897346497, "logps/chosen": -2.142820358276367, "logps/rejected": -3.362644672393799, "loss": 0.5314, "rewards/accuracies": 0.0, "rewards/chosen": 0.1287265568971634, "rewards/margins": -0.04001438617706299, "rewards/rejected": 0.16874094307422638, "step": 11345 }, { "epoch": 1.84, "learning_rate": 1.4533318855202338e-07, "logits/chosen": -1.0400469303131104, "logits/rejected": -1.2247551679611206, "logps/chosen": -182.5323486328125, "logps/rejected": -145.75912475585938, "loss": 0.7617, "rewards/accuracies": 0.0, "rewards/chosen": 4.145971775054932, "rewards/margins": -1.0726213455200195, "rewards/rejected": 5.218593120574951, "step": 11346 }, { "epoch": 1.84, "learning_rate": 1.4524056260491874e-07, "logits/chosen": -0.8973634839057922, "logits/rejected": -0.7792782187461853, "logps/chosen": -141.40292358398438, "logps/rejected": -80.62930297851562, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 4.625543117523193, "rewards/margins": 1.8900244235992432, "rewards/rejected": 2.73551869392395, "step": 11347 }, { "epoch": 1.84, "learning_rate": 1.451479611683346e-07, "logits/chosen": -0.4134123623371124, "logits/rejected": -0.4134123623371124, "logps/chosen": -45.28928756713867, "logps/rejected": -45.28928756713867, "loss": 1.7335, "rewards/accuracies": 0.0, "rewards/chosen": 0.3948570191860199, "rewards/margins": 0.0, "rewards/rejected": 0.3948570191860199, "step": 11348 }, { "epoch": 1.84, "learning_rate": 1.4505538424866859e-07, "logits/chosen": -0.8454746603965759, "logits/rejected": -0.7635281682014465, "logps/chosen": -100.53713989257812, "logps/rejected": -74.35098266601562, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": 4.23347806930542, "rewards/margins": 1.3048632144927979, "rewards/rejected": 2.928614854812622, "step": 11349 }, { "epoch": 1.84, "learning_rate": 1.4496283185231716e-07, "logits/chosen": -0.8447332978248596, "logits/rejected": -0.861598789691925, "logps/chosen": -70.94744873046875, "logps/rejected": -66.02233123779297, "loss": 0.7652, "rewards/accuracies": 0.0, "rewards/chosen": 1.8085708618164062, "rewards/margins": -1.1547966003417969, "rewards/rejected": 2.963367462158203, "step": 11350 }, { "epoch": 1.84, "learning_rate": 1.4487030398567447e-07, "logits/chosen": -0.7074865102767944, "logits/rejected": -0.694309413433075, "logps/chosen": -34.64224624633789, "logps/rejected": -18.12803077697754, "loss": 0.7333, "rewards/accuracies": 1.0, "rewards/chosen": 0.4340004026889801, "rewards/margins": 0.21746349334716797, "rewards/rejected": 0.21653690934181213, "step": 11351 }, { "epoch": 1.84, "learning_rate": 1.4477780065513368e-07, "logits/chosen": -0.7236183285713196, "logits/rejected": -0.6546159386634827, "logps/chosen": -61.060401916503906, "logps/rejected": -7.924022674560547, "loss": 0.5075, "rewards/accuracies": 1.0, "rewards/chosen": 1.8785256147384644, "rewards/margins": 1.3173069953918457, "rewards/rejected": 0.5612185597419739, "step": 11352 }, { "epoch": 1.84, "learning_rate": 1.446853218670856e-07, "logits/chosen": -0.7799981236457825, "logits/rejected": -0.7376332879066467, "logps/chosen": -49.6575813293457, "logps/rejected": -17.566530227661133, "loss": 0.3622, "rewards/accuracies": 1.0, "rewards/chosen": 1.9349071979522705, "rewards/margins": 0.788459062576294, "rewards/rejected": 1.1464481353759766, "step": 11353 }, { "epoch": 1.84, "learning_rate": 1.445928676279199e-07, "logits/chosen": -0.7853227853775024, "logits/rejected": -0.6372730135917664, "logps/chosen": -112.72604370117188, "logps/rejected": -52.08357238769531, "loss": 0.1175, "rewards/accuracies": 1.0, "rewards/chosen": 4.70718240737915, "rewards/margins": 2.8990044593811035, "rewards/rejected": 1.8081779479980469, "step": 11354 }, { "epoch": 1.84, "learning_rate": 1.4450043794402394e-07, "logits/chosen": -0.613145649433136, "logits/rejected": -0.5947643518447876, "logps/chosen": -69.33607482910156, "logps/rejected": -74.2587890625, "loss": 0.9406, "rewards/accuracies": 0.0, "rewards/chosen": 1.678242564201355, "rewards/margins": -0.1778937578201294, "rewards/rejected": 1.8561363220214844, "step": 11355 }, { "epoch": 1.84, "learning_rate": 1.4440803282178415e-07, "logits/chosen": -0.6992248892784119, "logits/rejected": -0.6525998115539551, "logps/chosen": -60.6121826171875, "logps/rejected": -84.33720397949219, "loss": 0.465, "rewards/accuracies": 1.0, "rewards/chosen": 1.922938585281372, "rewards/margins": 0.44444739818573, "rewards/rejected": 1.478491187095642, "step": 11356 }, { "epoch": 1.84, "learning_rate": 1.4431565226758453e-07, "logits/chosen": -0.6298972964286804, "logits/rejected": -0.657387375831604, "logps/chosen": -70.21004486083984, "logps/rejected": -57.212745666503906, "loss": 0.4337, "rewards/accuracies": 0.0, "rewards/chosen": 2.491140127182007, "rewards/margins": -0.14251708984375, "rewards/rejected": 2.633657217025757, "step": 11357 }, { "epoch": 1.84, "learning_rate": 1.4422329628780795e-07, "logits/chosen": -0.7333893179893494, "logits/rejected": -0.7606339454650879, "logps/chosen": -129.06582641601562, "logps/rejected": -155.9817657470703, "loss": 2.2865, "rewards/accuracies": 0.0, "rewards/chosen": 0.41147613525390625, "rewards/margins": -2.7926926612854004, "rewards/rejected": 3.2041687965393066, "step": 11358 }, { "epoch": 1.84, "learning_rate": 1.4413096488883504e-07, "logits/chosen": -0.8454529047012329, "logits/rejected": -0.8657314777374268, "logps/chosen": -103.1321029663086, "logps/rejected": -69.83114624023438, "loss": 1.1456, "rewards/accuracies": 0.0, "rewards/chosen": 1.2345589399337769, "rewards/margins": -1.5332649946212769, "rewards/rejected": 2.7678239345550537, "step": 11359 }, { "epoch": 1.84, "learning_rate": 1.4403865807704528e-07, "logits/chosen": -0.45088934898376465, "logits/rejected": -0.4357488453388214, "logps/chosen": -73.47570037841797, "logps/rejected": -67.90829467773438, "loss": 0.4003, "rewards/accuracies": 0.0, "rewards/chosen": 1.1411155462265015, "rewards/margins": -0.11922991275787354, "rewards/rejected": 1.260345458984375, "step": 11360 }, { "epoch": 1.84, "learning_rate": 1.4394637585881607e-07, "logits/chosen": -0.8438446521759033, "logits/rejected": -0.8189284801483154, "logps/chosen": -46.157501220703125, "logps/rejected": -73.1697998046875, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": 2.704982042312622, "rewards/margins": 0.931904673576355, "rewards/rejected": 1.773077368736267, "step": 11361 }, { "epoch": 1.84, "learning_rate": 1.438541182405234e-07, "logits/chosen": -1.0250561237335205, "logits/rejected": -0.9844791293144226, "logps/chosen": -146.05111694335938, "logps/rejected": -123.08026123046875, "loss": 0.6273, "rewards/accuracies": 0.0, "rewards/chosen": 4.816777229309082, "rewards/margins": -0.22797393798828125, "rewards/rejected": 5.044751167297363, "step": 11362 }, { "epoch": 1.84, "learning_rate": 1.437618852285411e-07, "logits/chosen": -0.668842077255249, "logits/rejected": -0.6730404496192932, "logps/chosen": -86.78070068359375, "logps/rejected": -99.63172149658203, "loss": 0.3944, "rewards/accuracies": 1.0, "rewards/chosen": 1.6743606328964233, "rewards/margins": 0.7666084170341492, "rewards/rejected": 0.9077522158622742, "step": 11363 }, { "epoch": 1.84, "learning_rate": 1.4366967682924192e-07, "logits/chosen": -0.4865706264972687, "logits/rejected": -0.41000065207481384, "logps/chosen": -49.574806213378906, "logps/rejected": -146.21275329589844, "loss": 1.674, "rewards/accuracies": 0.0, "rewards/chosen": 1.9826210737228394, "rewards/margins": -3.2257347106933594, "rewards/rejected": 5.208355903625488, "step": 11364 }, { "epoch": 1.84, "learning_rate": 1.435774930489963e-07, "logits/chosen": -0.9595449566841125, "logits/rejected": -0.8657196164131165, "logps/chosen": -108.69896697998047, "logps/rejected": -33.40990447998047, "loss": 0.5891, "rewards/accuracies": 1.0, "rewards/chosen": 1.1610344648361206, "rewards/margins": 0.9914169907569885, "rewards/rejected": 0.1696174591779709, "step": 11365 }, { "epoch": 1.84, "learning_rate": 1.4348533389417357e-07, "logits/chosen": -1.2774386405944824, "logits/rejected": -1.2281321287155151, "logps/chosen": -112.29539489746094, "logps/rejected": -19.605684280395508, "loss": 1.1644, "rewards/accuracies": 1.0, "rewards/chosen": 6.40045166015625, "rewards/margins": 5.973775863647461, "rewards/rejected": 0.42667561769485474, "step": 11366 }, { "epoch": 1.84, "learning_rate": 1.4339319937114075e-07, "logits/chosen": -0.7490713000297546, "logits/rejected": -0.661935567855835, "logps/chosen": -82.36354064941406, "logps/rejected": -20.406400680541992, "loss": 0.0645, "rewards/accuracies": 1.0, "rewards/chosen": 3.377650499343872, "rewards/margins": 2.8409340381622314, "rewards/rejected": 0.5367164611816406, "step": 11367 }, { "epoch": 1.85, "learning_rate": 1.4330108948626375e-07, "logits/chosen": -0.8979446887969971, "logits/rejected": -0.9020277261734009, "logps/chosen": -216.37472534179688, "logps/rejected": -71.90010070800781, "loss": 0.8205, "rewards/accuracies": 1.0, "rewards/chosen": 4.866186618804932, "rewards/margins": 2.126394748687744, "rewards/rejected": 2.7397918701171875, "step": 11368 }, { "epoch": 1.85, "learning_rate": 1.432090042459062e-07, "logits/chosen": -0.719878077507019, "logits/rejected": -0.8157313466072083, "logps/chosen": -72.20950317382812, "logps/rejected": -105.44776153564453, "loss": 0.6583, "rewards/accuracies": 0.0, "rewards/chosen": 0.7666992545127869, "rewards/margins": -0.07948070764541626, "rewards/rejected": 0.8461799621582031, "step": 11369 }, { "epoch": 1.85, "learning_rate": 1.4311694365643045e-07, "logits/chosen": -0.831558883190155, "logits/rejected": -0.8421531915664673, "logps/chosen": -76.68730926513672, "logps/rejected": -124.14165496826172, "loss": 0.2601, "rewards/accuracies": 1.0, "rewards/chosen": 1.237866997718811, "rewards/margins": 0.4598190188407898, "rewards/rejected": 0.7780479788780212, "step": 11370 }, { "epoch": 1.85, "learning_rate": 1.430249077241972e-07, "logits/chosen": -0.8215369582176208, "logits/rejected": -0.8002070188522339, "logps/chosen": -117.70269775390625, "logps/rejected": -68.5096435546875, "loss": 0.525, "rewards/accuracies": 0.0, "rewards/chosen": 1.8175048828125, "rewards/margins": -0.5908477306365967, "rewards/rejected": 2.4083526134490967, "step": 11371 }, { "epoch": 1.85, "learning_rate": 1.4293289645556494e-07, "logits/chosen": -0.11200175434350967, "logits/rejected": -0.10352662205696106, "logps/chosen": -0.8042572736740112, "logps/rejected": -10.452768325805664, "loss": 0.6413, "rewards/accuracies": 1.0, "rewards/chosen": 0.19043466448783875, "rewards/margins": 0.38321202993392944, "rewards/rejected": -0.1927773505449295, "step": 11372 }, { "epoch": 1.85, "learning_rate": 1.4284090985689113e-07, "logits/chosen": -0.9618673920631409, "logits/rejected": -0.8767044544219971, "logps/chosen": -62.192138671875, "logps/rejected": -41.019405364990234, "loss": 0.1562, "rewards/accuracies": 1.0, "rewards/chosen": 1.334498643875122, "rewards/margins": 1.192264199256897, "rewards/rejected": 0.1422344297170639, "step": 11373 }, { "epoch": 1.85, "learning_rate": 1.4274894793453073e-07, "logits/chosen": -1.405544638633728, "logits/rejected": -1.3821055889129639, "logps/chosen": -63.999046325683594, "logps/rejected": -83.50480651855469, "loss": 0.4901, "rewards/accuracies": 0.0, "rewards/chosen": 2.204021453857422, "rewards/margins": -0.4858543872833252, "rewards/rejected": 2.689875841140747, "step": 11374 }, { "epoch": 1.85, "learning_rate": 1.4265701069483792e-07, "logits/chosen": -1.0322927236557007, "logits/rejected": -0.9217232465744019, "logps/chosen": -110.82486724853516, "logps/rejected": -103.61128234863281, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": 4.652950286865234, "rewards/margins": 1.8007681369781494, "rewards/rejected": 2.852182149887085, "step": 11375 }, { "epoch": 1.85, "learning_rate": 1.4256509814416428e-07, "logits/chosen": -0.6167280077934265, "logits/rejected": -0.564723551273346, "logps/chosen": -81.97518920898438, "logps/rejected": -103.09162139892578, "loss": 1.0541, "rewards/accuracies": 0.0, "rewards/chosen": 0.8477516174316406, "rewards/margins": -1.9258010387420654, "rewards/rejected": 2.773552656173706, "step": 11376 }, { "epoch": 1.85, "learning_rate": 1.4247321028886045e-07, "logits/chosen": -0.6105285286903381, "logits/rejected": -0.5307050347328186, "logps/chosen": -88.80946350097656, "logps/rejected": -82.31398010253906, "loss": 0.5742, "rewards/accuracies": 0.0, "rewards/chosen": 1.9706436395645142, "rewards/margins": -0.5791679620742798, "rewards/rejected": 2.549811601638794, "step": 11377 }, { "epoch": 1.85, "learning_rate": 1.4238134713527465e-07, "logits/chosen": -0.6583214402198792, "logits/rejected": -0.8146758675575256, "logps/chosen": -241.73658752441406, "logps/rejected": -156.83319091796875, "loss": 1.084, "rewards/accuracies": 0.0, "rewards/chosen": 2.734513998031616, "rewards/margins": -1.9995834827423096, "rewards/rejected": 4.734097480773926, "step": 11378 }, { "epoch": 1.85, "learning_rate": 1.4228950868975415e-07, "logits/chosen": -0.7248523235321045, "logits/rejected": -0.34925732016563416, "logps/chosen": -145.7025909423828, "logps/rejected": -16.098628997802734, "loss": 0.0244, "rewards/accuracies": 1.0, "rewards/chosen": 4.286599636077881, "rewards/margins": 4.109376430511475, "rewards/rejected": 0.17722320556640625, "step": 11379 }, { "epoch": 1.85, "learning_rate": 1.421976949586437e-07, "logits/chosen": -0.9809831976890564, "logits/rejected": -0.7937321066856384, "logps/chosen": -48.10499572753906, "logps/rejected": -77.33493041992188, "loss": 0.9733, "rewards/accuracies": 0.0, "rewards/chosen": 2.619554281234741, "rewards/margins": -1.2505035400390625, "rewards/rejected": 3.8700578212738037, "step": 11380 }, { "epoch": 1.85, "learning_rate": 1.421059059482872e-07, "logits/chosen": -0.7922030091285706, "logits/rejected": -0.8263155817985535, "logps/chosen": -91.51333618164062, "logps/rejected": -52.76885223388672, "loss": 0.6518, "rewards/accuracies": 0.0, "rewards/chosen": 1.4464737176895142, "rewards/margins": -0.7130790948867798, "rewards/rejected": 2.159552812576294, "step": 11381 }, { "epoch": 1.85, "learning_rate": 1.4201414166502595e-07, "logits/chosen": -0.7500481605529785, "logits/rejected": -0.6144219040870667, "logps/chosen": -72.07493591308594, "logps/rejected": -16.64394187927246, "loss": 0.2066, "rewards/accuracies": 1.0, "rewards/chosen": 2.0032577514648438, "rewards/margins": 1.5031678676605225, "rewards/rejected": 0.5000898241996765, "step": 11382 }, { "epoch": 1.85, "learning_rate": 1.419224021152004e-07, "logits/chosen": -1.0343457460403442, "logits/rejected": -0.9973488450050354, "logps/chosen": -74.15470886230469, "logps/rejected": -78.07398986816406, "loss": 1.0175, "rewards/accuracies": 0.0, "rewards/chosen": 1.196600317955017, "rewards/margins": -1.7882798910140991, "rewards/rejected": 2.984880208969116, "step": 11383 }, { "epoch": 1.85, "learning_rate": 1.4183068730514851e-07, "logits/chosen": -0.6598756909370422, "logits/rejected": -0.7116633057594299, "logps/chosen": -75.73228454589844, "logps/rejected": -24.628646850585938, "loss": 0.5113, "rewards/accuracies": 1.0, "rewards/chosen": 1.6666511297225952, "rewards/margins": 0.7802249789237976, "rewards/rejected": 0.8864261507987976, "step": 11384 }, { "epoch": 1.85, "learning_rate": 1.417389972412073e-07, "logits/chosen": -0.6573067307472229, "logits/rejected": -0.6573067307472229, "logps/chosen": -41.37124252319336, "logps/rejected": -41.37124252319336, "loss": 0.3883, "rewards/accuracies": 0.0, "rewards/chosen": 3.287750005722046, "rewards/margins": 0.0, "rewards/rejected": 3.287750005722046, "step": 11385 }, { "epoch": 1.85, "learning_rate": 1.4164733192971134e-07, "logits/chosen": -0.9202872514724731, "logits/rejected": -0.8019713759422302, "logps/chosen": -98.3324203491211, "logps/rejected": -85.77680969238281, "loss": 0.2453, "rewards/accuracies": 1.0, "rewards/chosen": 3.3440773487091064, "rewards/margins": 0.5753226280212402, "rewards/rejected": 2.768754720687866, "step": 11386 }, { "epoch": 1.85, "learning_rate": 1.4155569137699408e-07, "logits/chosen": -0.6694118976593018, "logits/rejected": -0.616841197013855, "logps/chosen": -91.68620300292969, "logps/rejected": -65.63562774658203, "loss": 1.1778, "rewards/accuracies": 0.0, "rewards/chosen": 1.530554175376892, "rewards/margins": -1.079572319984436, "rewards/rejected": 2.610126495361328, "step": 11387 }, { "epoch": 1.85, "learning_rate": 1.4146407558938695e-07, "logits/chosen": -0.8760412931442261, "logits/rejected": -0.8543257713317871, "logps/chosen": -57.0806999206543, "logps/rejected": -97.07793426513672, "loss": 2.2008, "rewards/accuracies": 0.0, "rewards/chosen": 1.6520404815673828, "rewards/margins": -1.4039676189422607, "rewards/rejected": 3.0560081005096436, "step": 11388 }, { "epoch": 1.85, "learning_rate": 1.413724845732197e-07, "logits/chosen": -0.589230477809906, "logits/rejected": -0.5948511362075806, "logps/chosen": -96.37094116210938, "logps/rejected": -61.747520446777344, "loss": 1.1259, "rewards/accuracies": 0.0, "rewards/chosen": 1.189753770828247, "rewards/margins": -1.095160722732544, "rewards/rejected": 2.284914493560791, "step": 11389 }, { "epoch": 1.85, "learning_rate": 1.4128091833482019e-07, "logits/chosen": -0.7992709279060364, "logits/rejected": -0.7049241065979004, "logps/chosen": -87.52188110351562, "logps/rejected": -56.321170806884766, "loss": 0.8017, "rewards/accuracies": 0.0, "rewards/chosen": 0.777374267578125, "rewards/margins": -1.0138027667999268, "rewards/rejected": 1.7911770343780518, "step": 11390 }, { "epoch": 1.85, "learning_rate": 1.4118937688051507e-07, "logits/chosen": -0.6227903962135315, "logits/rejected": -0.5950967073440552, "logps/chosen": -86.06991577148438, "logps/rejected": -98.52543640136719, "loss": 1.1714, "rewards/accuracies": 0.0, "rewards/chosen": 0.8816162347793579, "rewards/margins": -1.2469924688339233, "rewards/rejected": 2.1286087036132812, "step": 11391 }, { "epoch": 1.85, "learning_rate": 1.4109786021662905e-07, "logits/chosen": -0.7135047912597656, "logits/rejected": -0.6133599281311035, "logps/chosen": -99.63990783691406, "logps/rejected": -36.36369705200195, "loss": 0.5089, "rewards/accuracies": 1.0, "rewards/chosen": 1.238433837890625, "rewards/margins": 0.03485751152038574, "rewards/rejected": 1.2035763263702393, "step": 11392 }, { "epoch": 1.85, "learning_rate": 1.4100636834948476e-07, "logits/chosen": -0.8313408493995667, "logits/rejected": -0.7407994270324707, "logps/chosen": -142.2228546142578, "logps/rejected": -91.98246765136719, "loss": 0.2147, "rewards/accuracies": 1.0, "rewards/chosen": 4.593602180480957, "rewards/margins": 1.1423859596252441, "rewards/rejected": 3.451216220855713, "step": 11393 }, { "epoch": 1.85, "learning_rate": 1.4091490128540373e-07, "logits/chosen": -0.7780497074127197, "logits/rejected": -0.7780497074127197, "logps/chosen": -23.13051986694336, "logps/rejected": -23.13051986694336, "loss": 0.7957, "rewards/accuracies": 0.0, "rewards/chosen": 0.7384248971939087, "rewards/margins": 0.0, "rewards/rejected": 0.7384248971939087, "step": 11394 }, { "epoch": 1.85, "learning_rate": 1.4082345903070514e-07, "logits/chosen": -0.7836156487464905, "logits/rejected": -0.7582037448883057, "logps/chosen": -185.05662536621094, "logps/rejected": -64.22314453125, "loss": 0.8291, "rewards/accuracies": 0.0, "rewards/chosen": 1.0778487920761108, "rewards/margins": -0.9665886163711548, "rewards/rejected": 2.0444374084472656, "step": 11395 }, { "epoch": 1.85, "learning_rate": 1.407320415917072e-07, "logits/chosen": -0.4182078242301941, "logits/rejected": -0.48523592948913574, "logps/chosen": -95.38485717773438, "logps/rejected": -95.08818054199219, "loss": 0.8683, "rewards/accuracies": 0.0, "rewards/chosen": 1.4736435413360596, "rewards/margins": -0.6808006763458252, "rewards/rejected": 2.1544442176818848, "step": 11396 }, { "epoch": 1.85, "learning_rate": 1.4064064897472556e-07, "logits/chosen": -0.5516433715820312, "logits/rejected": -0.4984854459762573, "logps/chosen": -68.57744598388672, "logps/rejected": -89.8826904296875, "loss": 0.7526, "rewards/accuracies": 0.0, "rewards/chosen": 2.3468682765960693, "rewards/margins": -0.25988006591796875, "rewards/rejected": 2.606748342514038, "step": 11397 }, { "epoch": 1.85, "learning_rate": 1.4054928118607496e-07, "logits/chosen": -0.6880096793174744, "logits/rejected": -0.6252647638320923, "logps/chosen": -59.97087860107422, "logps/rejected": -84.73363494873047, "loss": 1.6598, "rewards/accuracies": 0.0, "rewards/chosen": 1.7592735290527344, "rewards/margins": -1.8358445167541504, "rewards/rejected": 3.5951180458068848, "step": 11398 }, { "epoch": 1.85, "learning_rate": 1.404579382320677e-07, "logits/chosen": -0.9610713124275208, "logits/rejected": -0.8156501054763794, "logps/chosen": -141.65061950683594, "logps/rejected": -31.560081481933594, "loss": 1.2138, "rewards/accuracies": 1.0, "rewards/chosen": 4.5231032371521, "rewards/margins": 3.5819830894470215, "rewards/rejected": 0.9411201477050781, "step": 11399 }, { "epoch": 1.85, "learning_rate": 1.4036662011901506e-07, "logits/chosen": -0.8238208293914795, "logits/rejected": -0.7342936396598816, "logps/chosen": -93.17272186279297, "logps/rejected": -42.87016677856445, "loss": 0.895, "rewards/accuracies": 0.0, "rewards/chosen": 1.3383476734161377, "rewards/margins": -0.6862750053405762, "rewards/rejected": 2.024622678756714, "step": 11400 }, { "epoch": 1.85, "learning_rate": 1.4027532685322592e-07, "logits/chosen": -0.319623202085495, "logits/rejected": -0.2921290099620819, "logps/chosen": -46.270023345947266, "logps/rejected": -91.27473449707031, "loss": 1.1186, "rewards/accuracies": 0.0, "rewards/chosen": 0.8764629364013672, "rewards/margins": -1.986954927444458, "rewards/rejected": 2.863417863845825, "step": 11401 }, { "epoch": 1.85, "learning_rate": 1.401840584410081e-07, "logits/chosen": -0.44138774275779724, "logits/rejected": -0.0767354890704155, "logps/chosen": -139.7413330078125, "logps/rejected": -59.97752380371094, "loss": 0.1198, "rewards/accuracies": 1.0, "rewards/chosen": 4.6153106689453125, "rewards/margins": 2.6914565563201904, "rewards/rejected": 1.923854112625122, "step": 11402 }, { "epoch": 1.85, "learning_rate": 1.400928148886671e-07, "logits/chosen": -0.8149070739746094, "logits/rejected": -0.734229326248169, "logps/chosen": -98.40995788574219, "logps/rejected": -74.3003921508789, "loss": 0.2721, "rewards/accuracies": 1.0, "rewards/chosen": 4.7503557205200195, "rewards/margins": 1.4549570083618164, "rewards/rejected": 3.295398712158203, "step": 11403 }, { "epoch": 1.85, "learning_rate": 1.4000159620250724e-07, "logits/chosen": -1.022331714630127, "logits/rejected": -0.9946921467781067, "logps/chosen": -82.86410522460938, "logps/rejected": -79.50137329101562, "loss": 0.9169, "rewards/accuracies": 0.0, "rewards/chosen": 0.3709854185581207, "rewards/margins": -0.7555999755859375, "rewards/rejected": 1.1265853643417358, "step": 11404 }, { "epoch": 1.85, "learning_rate": 1.3991040238883062e-07, "logits/chosen": -0.6104128360748291, "logits/rejected": -0.4585359990596771, "logps/chosen": -46.1053352355957, "logps/rejected": -10.09924602508545, "loss": 0.8184, "rewards/accuracies": 1.0, "rewards/chosen": 1.630639672279358, "rewards/margins": 0.7083945274353027, "rewards/rejected": 0.9222451448440552, "step": 11405 }, { "epoch": 1.85, "learning_rate": 1.3981923345393815e-07, "logits/chosen": -0.6955955028533936, "logits/rejected": -0.6461337208747864, "logps/chosen": -102.25465393066406, "logps/rejected": -88.20748901367188, "loss": 0.4523, "rewards/accuracies": 0.0, "rewards/chosen": 4.449418544769287, "rewards/margins": -0.36460447311401367, "rewards/rejected": 4.814023017883301, "step": 11406 }, { "epoch": 1.85, "learning_rate": 1.397280894041284e-07, "logits/chosen": -0.8601740598678589, "logits/rejected": -0.8227463960647583, "logps/chosen": -81.61610412597656, "logps/rejected": -16.52495765686035, "loss": 0.2977, "rewards/accuracies": 1.0, "rewards/chosen": 1.321160912513733, "rewards/margins": 0.22252583503723145, "rewards/rejected": 1.0986350774765015, "step": 11407 }, { "epoch": 1.85, "learning_rate": 1.3963697024569892e-07, "logits/chosen": -0.8257835507392883, "logits/rejected": -0.888553261756897, "logps/chosen": -40.78033447265625, "logps/rejected": -164.15823364257812, "loss": 2.5598, "rewards/accuracies": 0.0, "rewards/chosen": 2.145848035812378, "rewards/margins": -5.079068183898926, "rewards/rejected": 7.224915981292725, "step": 11408 }, { "epoch": 1.85, "learning_rate": 1.395458759849449e-07, "logits/chosen": -0.6990830898284912, "logits/rejected": -0.6356613636016846, "logps/chosen": -49.182586669921875, "logps/rejected": -57.562103271484375, "loss": 0.6739, "rewards/accuracies": 0.0, "rewards/chosen": 1.313269853591919, "rewards/margins": -0.1134650707244873, "rewards/rejected": 1.4267349243164062, "step": 11409 }, { "epoch": 1.85, "learning_rate": 1.3945480662816028e-07, "logits/chosen": -0.8145794868469238, "logits/rejected": -0.8032000064849854, "logps/chosen": -80.19572448730469, "logps/rejected": -56.873008728027344, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": 2.883587598800659, "rewards/margins": 0.7443258762359619, "rewards/rejected": 2.1392617225646973, "step": 11410 }, { "epoch": 1.85, "learning_rate": 1.393637621816369e-07, "logits/chosen": -0.5962194204330444, "logits/rejected": -0.6090205311775208, "logps/chosen": -9.075682640075684, "logps/rejected": -3.0936617851257324, "loss": 1.6448, "rewards/accuracies": 0.0, "rewards/chosen": 0.22563134133815765, "rewards/margins": -0.499469518661499, "rewards/rejected": 0.7251008749008179, "step": 11411 }, { "epoch": 1.85, "learning_rate": 1.3927274265166532e-07, "logits/chosen": 0.0015965882921591401, "logits/rejected": 0.0015965882921591401, "logps/chosen": -9.50094985961914, "logps/rejected": -9.50094985961914, "loss": 0.4293, "rewards/accuracies": 0.0, "rewards/chosen": 0.3042319416999817, "rewards/margins": 0.0, "rewards/rejected": 0.3042319416999817, "step": 11412 }, { "epoch": 1.85, "learning_rate": 1.3918174804453386e-07, "logits/chosen": -0.819842517375946, "logits/rejected": -0.7748508453369141, "logps/chosen": -140.3651123046875, "logps/rejected": -106.72158813476562, "loss": 1.7387, "rewards/accuracies": 0.0, "rewards/chosen": 3.369528293609619, "rewards/margins": -0.8732209205627441, "rewards/rejected": 4.242749214172363, "step": 11413 }, { "epoch": 1.85, "learning_rate": 1.3909077836652966e-07, "logits/chosen": -0.426937073469162, "logits/rejected": -0.426937073469162, "logps/chosen": -0.9301338195800781, "logps/rejected": -0.9301338195800781, "loss": 1.0099, "rewards/accuracies": 0.0, "rewards/chosen": 0.2781859040260315, "rewards/margins": 0.0, "rewards/rejected": 0.2781859040260315, "step": 11414 }, { "epoch": 1.85, "learning_rate": 1.3899983362393754e-07, "logits/chosen": -0.17697858810424805, "logits/rejected": -0.19876737892627716, "logps/chosen": -3.6081230640411377, "logps/rejected": -27.279539108276367, "loss": 0.3318, "rewards/accuracies": 1.0, "rewards/chosen": 0.37597253918647766, "rewards/margins": 0.06032922863960266, "rewards/rejected": 0.315643310546875, "step": 11415 }, { "epoch": 1.85, "learning_rate": 1.3890891382304126e-07, "logits/chosen": -1.0082451105117798, "logits/rejected": -0.9939646124839783, "logps/chosen": -38.795066833496094, "logps/rejected": -88.02294921875, "loss": 0.7953, "rewards/accuracies": 0.0, "rewards/chosen": 2.583078145980835, "rewards/margins": -0.9586775302886963, "rewards/rejected": 3.5417556762695312, "step": 11416 }, { "epoch": 1.85, "learning_rate": 1.3881801897012224e-07, "logits/chosen": -0.5454999804496765, "logits/rejected": -0.5454999804496765, "logps/chosen": -22.585792541503906, "logps/rejected": -22.585792541503906, "loss": 0.3838, "rewards/accuracies": 0.0, "rewards/chosen": 0.23996658623218536, "rewards/margins": 0.0, "rewards/rejected": 0.23996658623218536, "step": 11417 }, { "epoch": 1.85, "learning_rate": 1.387271490714607e-07, "logits/chosen": -0.9321100115776062, "logits/rejected": -0.8464445471763611, "logps/chosen": -116.96804809570312, "logps/rejected": -41.133384704589844, "loss": 0.7048, "rewards/accuracies": 1.0, "rewards/chosen": 1.8195266723632812, "rewards/margins": 1.5863640308380127, "rewards/rejected": 0.23316268622875214, "step": 11418 }, { "epoch": 1.85, "learning_rate": 1.386363041333346e-07, "logits/chosen": -0.985298216342926, "logits/rejected": -0.968636691570282, "logps/chosen": -114.25922393798828, "logps/rejected": -177.82754516601562, "loss": 0.2009, "rewards/accuracies": 1.0, "rewards/chosen": 6.524938106536865, "rewards/margins": 0.9995412826538086, "rewards/rejected": 5.525396823883057, "step": 11419 }, { "epoch": 1.85, "learning_rate": 1.3854548416202084e-07, "logits/chosen": -0.8374383449554443, "logits/rejected": -0.8270090818405151, "logps/chosen": -88.95326232910156, "logps/rejected": -48.52458190917969, "loss": 1.8264, "rewards/accuracies": 0.0, "rewards/chosen": 2.7278671264648438, "rewards/margins": -0.045588016510009766, "rewards/rejected": 2.7734551429748535, "step": 11420 }, { "epoch": 1.85, "learning_rate": 1.3845468916379382e-07, "logits/chosen": -0.9187174439430237, "logits/rejected": -0.9187174439430237, "logps/chosen": -40.63238525390625, "logps/rejected": -40.63238525390625, "loss": 0.5858, "rewards/accuracies": 0.0, "rewards/chosen": 1.8134033679962158, "rewards/margins": 0.0, "rewards/rejected": 1.8134033679962158, "step": 11421 }, { "epoch": 1.85, "learning_rate": 1.3836391914492694e-07, "logits/chosen": -1.028207778930664, "logits/rejected": -0.9144845008850098, "logps/chosen": -82.73287963867188, "logps/rejected": -34.11474609375, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.1927002668380737, "rewards/margins": 0.8951191902160645, "rewards/rejected": 0.29758110642433167, "step": 11422 }, { "epoch": 1.85, "learning_rate": 1.3827317411169125e-07, "logits/chosen": -1.1432147026062012, "logits/rejected": -1.0520551204681396, "logps/chosen": -78.84158325195312, "logps/rejected": -41.35625076293945, "loss": 0.1789, "rewards/accuracies": 1.0, "rewards/chosen": 2.826495409011841, "rewards/margins": 0.8885974884033203, "rewards/rejected": 1.9378979206085205, "step": 11423 }, { "epoch": 1.85, "learning_rate": 1.3818245407035672e-07, "logits/chosen": -0.8724045753479004, "logits/rejected": -0.6537339091300964, "logps/chosen": -135.9581756591797, "logps/rejected": -86.31005859375, "loss": 0.1584, "rewards/accuracies": 1.0, "rewards/chosen": 5.294419765472412, "rewards/margins": 2.5064666271209717, "rewards/rejected": 2.7879531383514404, "step": 11424 }, { "epoch": 1.85, "learning_rate": 1.3809175902719094e-07, "logits/chosen": -0.28249090909957886, "logits/rejected": -0.2919692397117615, "logps/chosen": -9.87147331237793, "logps/rejected": -1.5725536346435547, "loss": 0.8532, "rewards/accuracies": 0.0, "rewards/chosen": -0.0405694954097271, "rewards/margins": -0.2705720365047455, "rewards/rejected": 0.23000255227088928, "step": 11425 }, { "epoch": 1.85, "learning_rate": 1.3800108898846018e-07, "logits/chosen": -0.6576612591743469, "logits/rejected": -0.6750317215919495, "logps/chosen": -24.86532974243164, "logps/rejected": -19.290725708007812, "loss": 0.7328, "rewards/accuracies": 0.0, "rewards/chosen": 0.11127644032239914, "rewards/margins": -0.15279462933540344, "rewards/rejected": 0.2640710771083832, "step": 11426 }, { "epoch": 1.85, "learning_rate": 1.3791044396042905e-07, "logits/chosen": -0.7173068523406982, "logits/rejected": -0.6558602452278137, "logps/chosen": -71.77850341796875, "logps/rejected": -70.43650817871094, "loss": 0.4123, "rewards/accuracies": 1.0, "rewards/chosen": 2.8300857543945312, "rewards/margins": 1.0183067321777344, "rewards/rejected": 1.8117790222167969, "step": 11427 }, { "epoch": 1.85, "learning_rate": 1.3781982394935997e-07, "logits/chosen": -0.5630871057510376, "logits/rejected": -0.6048763990402222, "logps/chosen": -52.778846740722656, "logps/rejected": -82.79386138916016, "loss": 0.808, "rewards/accuracies": 0.0, "rewards/chosen": 0.5179511904716492, "rewards/margins": -0.14100801944732666, "rewards/rejected": 0.6589592099189758, "step": 11428 }, { "epoch": 1.86, "learning_rate": 1.3772922896151422e-07, "logits/chosen": -0.6326856017112732, "logits/rejected": -0.601289689540863, "logps/chosen": -117.9866943359375, "logps/rejected": -145.19676208496094, "loss": 0.954, "rewards/accuracies": 0.0, "rewards/chosen": 2.7161407470703125, "rewards/margins": -1.3633761405944824, "rewards/rejected": 4.079516887664795, "step": 11429 }, { "epoch": 1.86, "learning_rate": 1.3763865900315074e-07, "logits/chosen": -0.7226067781448364, "logits/rejected": -0.559457004070282, "logps/chosen": -65.98957061767578, "logps/rejected": -35.981258392333984, "loss": 0.8023, "rewards/accuracies": 1.0, "rewards/chosen": 1.004163384437561, "rewards/margins": 0.4308105707168579, "rewards/rejected": 0.5733528137207031, "step": 11430 }, { "epoch": 1.86, "learning_rate": 1.3754811408052742e-07, "logits/chosen": -0.05809572711586952, "logits/rejected": -0.05809572711586952, "logps/chosen": -6.451813220977783, "logps/rejected": -6.451813220977783, "loss": 0.5907, "rewards/accuracies": 0.0, "rewards/chosen": 0.15521493554115295, "rewards/margins": 0.0, "rewards/rejected": 0.15521493554115295, "step": 11431 }, { "epoch": 1.86, "learning_rate": 1.3745759419989972e-07, "logits/chosen": -0.5344574451446533, "logits/rejected": -0.5518808960914612, "logps/chosen": -60.196537017822266, "logps/rejected": -84.84896087646484, "loss": 0.5802, "rewards/accuracies": 1.0, "rewards/chosen": 1.561727523803711, "rewards/margins": 0.6470966339111328, "rewards/rejected": 0.9146308898925781, "step": 11432 }, { "epoch": 1.86, "learning_rate": 1.3736709936752193e-07, "logits/chosen": -0.6304428577423096, "logits/rejected": -0.6304428577423096, "logps/chosen": -51.875247955322266, "logps/rejected": -51.875247955322266, "loss": 0.3698, "rewards/accuracies": 0.0, "rewards/chosen": 1.5244381427764893, "rewards/margins": 0.0, "rewards/rejected": 1.5244381427764893, "step": 11433 }, { "epoch": 1.86, "learning_rate": 1.3727662958964626e-07, "logits/chosen": -1.1796613931655884, "logits/rejected": -1.024701714515686, "logps/chosen": -71.07823181152344, "logps/rejected": -96.92584228515625, "loss": 0.0524, "rewards/accuracies": 1.0, "rewards/chosen": 5.573513984680176, "rewards/margins": 2.8221771717071533, "rewards/rejected": 2.7513368129730225, "step": 11434 }, { "epoch": 1.86, "learning_rate": 1.3718618487252343e-07, "logits/chosen": -0.818692684173584, "logits/rejected": -0.8436010479927063, "logps/chosen": -97.17317199707031, "logps/rejected": -85.24862670898438, "loss": 1.6397, "rewards/accuracies": 0.0, "rewards/chosen": 1.6543594598770142, "rewards/margins": -2.507965087890625, "rewards/rejected": 4.16232442855835, "step": 11435 }, { "epoch": 1.86, "learning_rate": 1.3709576522240213e-07, "logits/chosen": -0.3050232231616974, "logits/rejected": -0.25278571248054504, "logps/chosen": -74.37174224853516, "logps/rejected": -73.2350082397461, "loss": 0.4532, "rewards/accuracies": 1.0, "rewards/chosen": 1.9385483264923096, "rewards/margins": 0.37056195735931396, "rewards/rejected": 1.5679863691329956, "step": 11436 }, { "epoch": 1.86, "learning_rate": 1.370053706455298e-07, "logits/chosen": -0.4580352008342743, "logits/rejected": -0.4948135316371918, "logps/chosen": -112.13031005859375, "logps/rejected": -71.86239624023438, "loss": 0.0262, "rewards/accuracies": 1.0, "rewards/chosen": 5.819738864898682, "rewards/margins": 3.1996352672576904, "rewards/rejected": 2.620103597640991, "step": 11437 }, { "epoch": 1.86, "learning_rate": 1.3691500114815147e-07, "logits/chosen": -0.7734978199005127, "logits/rejected": -0.6365423202514648, "logps/chosen": -62.80601501464844, "logps/rejected": -73.99546813964844, "loss": 0.6398, "rewards/accuracies": 0.0, "rewards/chosen": 2.0473084449768066, "rewards/margins": -0.33134007453918457, "rewards/rejected": 2.378648519515991, "step": 11438 }, { "epoch": 1.86, "learning_rate": 1.3682465673651122e-07, "logits/chosen": -0.5222083330154419, "logits/rejected": -0.3527440130710602, "logps/chosen": -38.531002044677734, "logps/rejected": -20.703346252441406, "loss": 0.731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8476856350898743, "rewards/margins": 0.44524917006492615, "rewards/rejected": 0.4024364650249481, "step": 11439 }, { "epoch": 1.86, "learning_rate": 1.3673433741685058e-07, "logits/chosen": -0.7014433741569519, "logits/rejected": -0.6403865218162537, "logps/chosen": -48.85613250732422, "logps/rejected": -67.5135269165039, "loss": 0.2153, "rewards/accuracies": 1.0, "rewards/chosen": 1.8722877502441406, "rewards/margins": 0.6663970947265625, "rewards/rejected": 1.2058906555175781, "step": 11440 }, { "epoch": 1.86, "learning_rate": 1.3664404319541017e-07, "logits/chosen": -0.573625922203064, "logits/rejected": -0.58536696434021, "logps/chosen": -61.25577163696289, "logps/rejected": -36.134925842285156, "loss": 0.5624, "rewards/accuracies": 0.0, "rewards/chosen": 0.8185451626777649, "rewards/margins": -0.6692760586738586, "rewards/rejected": 1.4878212213516235, "step": 11441 }, { "epoch": 1.86, "learning_rate": 1.365537740784281e-07, "logits/chosen": -0.5819220542907715, "logits/rejected": -0.45786190032958984, "logps/chosen": -50.37826919555664, "logps/rejected": -18.406583786010742, "loss": 2.5722, "rewards/accuracies": 1.0, "rewards/chosen": 1.4241145849227905, "rewards/margins": 1.3214493989944458, "rewards/rejected": 0.10266514122486115, "step": 11442 }, { "epoch": 1.86, "learning_rate": 1.3646353007214146e-07, "logits/chosen": -0.8375149965286255, "logits/rejected": -0.7543168067932129, "logps/chosen": -185.40093994140625, "logps/rejected": -161.71522521972656, "loss": 0.0578, "rewards/accuracies": 1.0, "rewards/chosen": 5.564276218414307, "rewards/margins": 2.162001132965088, "rewards/rejected": 3.4022750854492188, "step": 11443 }, { "epoch": 1.86, "learning_rate": 1.3637331118278488e-07, "logits/chosen": -0.3375186622142792, "logits/rejected": -0.28115352988243103, "logps/chosen": -44.009857177734375, "logps/rejected": -40.6614990234375, "loss": 2.1878, "rewards/accuracies": 1.0, "rewards/chosen": 1.843000054359436, "rewards/margins": 0.4735591411590576, "rewards/rejected": 1.3694409132003784, "step": 11444 }, { "epoch": 1.86, "learning_rate": 1.3628311741659204e-07, "logits/chosen": -0.8785368204116821, "logits/rejected": -0.8088944554328918, "logps/chosen": -62.67765808105469, "logps/rejected": -19.398399353027344, "loss": 2.0161, "rewards/accuracies": 1.0, "rewards/chosen": 1.2990623712539673, "rewards/margins": 0.8431129455566406, "rewards/rejected": 0.4559493958950043, "step": 11445 }, { "epoch": 1.86, "learning_rate": 1.361929487797941e-07, "logits/chosen": -0.5486177802085876, "logits/rejected": -0.5545331835746765, "logps/chosen": -35.28711700439453, "logps/rejected": -73.80875396728516, "loss": 0.6114, "rewards/accuracies": 0.0, "rewards/chosen": 0.7047027945518494, "rewards/margins": -0.2967018485069275, "rewards/rejected": 1.0014046430587769, "step": 11446 }, { "epoch": 1.86, "learning_rate": 1.3610280527862116e-07, "logits/chosen": -0.8212268948554993, "logits/rejected": -0.779103696346283, "logps/chosen": -132.55056762695312, "logps/rejected": -131.94741821289062, "loss": 0.2548, "rewards/accuracies": 1.0, "rewards/chosen": 6.758264064788818, "rewards/margins": 0.4233365058898926, "rewards/rejected": 6.334927558898926, "step": 11447 }, { "epoch": 1.86, "learning_rate": 1.3601268691930097e-07, "logits/chosen": -0.6806210875511169, "logits/rejected": -0.5292232036590576, "logps/chosen": -50.66175842285156, "logps/rejected": -59.44664764404297, "loss": 1.1677, "rewards/accuracies": 0.0, "rewards/chosen": 1.6417824029922485, "rewards/margins": -1.0593024492263794, "rewards/rejected": 2.701084852218628, "step": 11448 }, { "epoch": 1.86, "learning_rate": 1.359225937080602e-07, "logits/chosen": -0.7765157222747803, "logits/rejected": -0.7812315225601196, "logps/chosen": -5.272309303283691, "logps/rejected": -1.3438639640808105, "loss": 0.8194, "rewards/accuracies": 0.0, "rewards/chosen": 0.1491931974887848, "rewards/margins": -0.2598140835762024, "rewards/rejected": 0.4090072810649872, "step": 11449 }, { "epoch": 1.86, "learning_rate": 1.3583252565112314e-07, "logits/chosen": -0.7611586451530457, "logits/rejected": -0.6904422044754028, "logps/chosen": -62.49897384643555, "logps/rejected": -58.27466583251953, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 2.3069379329681396, "rewards/margins": 1.5290204286575317, "rewards/rejected": 0.7779175043106079, "step": 11450 }, { "epoch": 1.86, "learning_rate": 1.3574248275471295e-07, "logits/chosen": -0.6346027255058289, "logits/rejected": -0.6608086824417114, "logps/chosen": -23.561336517333984, "logps/rejected": -32.40308380126953, "loss": 0.557, "rewards/accuracies": 0.0, "rewards/chosen": 0.6052536368370056, "rewards/margins": -0.09516370296478271, "rewards/rejected": 0.7004173398017883, "step": 11451 }, { "epoch": 1.86, "learning_rate": 1.3565246502505033e-07, "logits/chosen": -0.624478280544281, "logits/rejected": -0.6164962649345398, "logps/chosen": -56.613304138183594, "logps/rejected": -54.636627197265625, "loss": 0.5604, "rewards/accuracies": 1.0, "rewards/chosen": 2.0534331798553467, "rewards/margins": 0.13620901107788086, "rewards/rejected": 1.9172241687774658, "step": 11452 }, { "epoch": 1.86, "learning_rate": 1.3556247246835511e-07, "logits/chosen": -0.5205911993980408, "logits/rejected": -0.5173085927963257, "logps/chosen": -2.9530341625213623, "logps/rejected": -17.975711822509766, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": 0.6388394236564636, "rewards/margins": 0.337718665599823, "rewards/rejected": 0.3011207580566406, "step": 11453 }, { "epoch": 1.86, "learning_rate": 1.354725050908445e-07, "logits/chosen": -0.41383495926856995, "logits/rejected": -0.3724769651889801, "logps/chosen": -61.63026428222656, "logps/rejected": -56.78636932373047, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 0.8147056698799133, "rewards/margins": -0.39114612340927124, "rewards/rejected": 1.2058517932891846, "step": 11454 }, { "epoch": 1.86, "learning_rate": 1.3538256289873485e-07, "logits/chosen": -0.7821012139320374, "logits/rejected": -0.8413611650466919, "logps/chosen": -72.133056640625, "logps/rejected": -74.15837097167969, "loss": 1.0545, "rewards/accuracies": 0.0, "rewards/chosen": 1.5657227039337158, "rewards/margins": -1.6732077598571777, "rewards/rejected": 3.2389304637908936, "step": 11455 }, { "epoch": 1.86, "learning_rate": 1.352926458982398e-07, "logits/chosen": -0.8988362550735474, "logits/rejected": -0.8734205961227417, "logps/chosen": -84.79265594482422, "logps/rejected": -67.10564422607422, "loss": 0.3904, "rewards/accuracies": 1.0, "rewards/chosen": 1.1855705976486206, "rewards/margins": 0.23809820413589478, "rewards/rejected": 0.9474723935127258, "step": 11456 }, { "epoch": 1.86, "learning_rate": 1.3520275409557224e-07, "logits/chosen": -0.6609531044960022, "logits/rejected": -0.4445492625236511, "logps/chosen": -93.798095703125, "logps/rejected": -50.586631774902344, "loss": 0.2439, "rewards/accuracies": 1.0, "rewards/chosen": 1.7766441106796265, "rewards/margins": 0.4918341636657715, "rewards/rejected": 1.284809947013855, "step": 11457 }, { "epoch": 1.86, "learning_rate": 1.3511288749694244e-07, "logits/chosen": -1.1816974878311157, "logits/rejected": -1.1741894483566284, "logps/chosen": -71.83525085449219, "logps/rejected": -64.90727233886719, "loss": 0.5427, "rewards/accuracies": 0.0, "rewards/chosen": 0.8465057611465454, "rewards/margins": -0.5582993030548096, "rewards/rejected": 1.404805064201355, "step": 11458 }, { "epoch": 1.86, "learning_rate": 1.350230461085597e-07, "logits/chosen": -0.6040358543395996, "logits/rejected": -0.5425860285758972, "logps/chosen": -109.10687255859375, "logps/rejected": -60.66822814941406, "loss": 0.7157, "rewards/accuracies": 0.0, "rewards/chosen": 0.3771774470806122, "rewards/margins": -1.1032363176345825, "rewards/rejected": 1.480413794517517, "step": 11459 }, { "epoch": 1.86, "learning_rate": 1.3493322993663087e-07, "logits/chosen": -1.0909461975097656, "logits/rejected": -0.951583206653595, "logps/chosen": -144.21910095214844, "logps/rejected": -106.76422119140625, "loss": 0.2161, "rewards/accuracies": 1.0, "rewards/chosen": 5.993434429168701, "rewards/margins": 1.078495979309082, "rewards/rejected": 4.914938449859619, "step": 11460 }, { "epoch": 1.86, "learning_rate": 1.3484343898736168e-07, "logits/chosen": -1.0223236083984375, "logits/rejected": -0.7642237544059753, "logps/chosen": -228.22232055664062, "logps/rejected": -83.07078552246094, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": 7.372488498687744, "rewards/margins": 3.7071213722229004, "rewards/rejected": 3.6653671264648438, "step": 11461 }, { "epoch": 1.86, "learning_rate": 1.3475367326695558e-07, "logits/chosen": -0.9710422158241272, "logits/rejected": -0.8640992045402527, "logps/chosen": -118.55805206298828, "logps/rejected": -46.83885955810547, "loss": 0.3213, "rewards/accuracies": 1.0, "rewards/chosen": 0.6971634030342102, "rewards/margins": 0.6146743893623352, "rewards/rejected": 0.082489013671875, "step": 11462 }, { "epoch": 1.86, "learning_rate": 1.3466393278161465e-07, "logits/chosen": -0.5757001638412476, "logits/rejected": -0.5344811081886292, "logps/chosen": -41.11324691772461, "logps/rejected": -53.355003356933594, "loss": 1.0371, "rewards/accuracies": 0.0, "rewards/chosen": 1.669260025024414, "rewards/margins": -0.11268889904022217, "rewards/rejected": 1.7819489240646362, "step": 11463 }, { "epoch": 1.86, "learning_rate": 1.3457421753753928e-07, "logits/chosen": -0.3463667929172516, "logits/rejected": -0.28178906440734863, "logps/chosen": -106.02140808105469, "logps/rejected": -54.074378967285156, "loss": 0.9369, "rewards/accuracies": 0.0, "rewards/chosen": 1.6008208990097046, "rewards/margins": -0.7200967073440552, "rewards/rejected": 2.3209176063537598, "step": 11464 }, { "epoch": 1.86, "learning_rate": 1.3448452754092766e-07, "logits/chosen": -0.6611899733543396, "logits/rejected": -0.6591410636901855, "logps/chosen": -16.849021911621094, "logps/rejected": -46.633270263671875, "loss": 1.1611, "rewards/accuracies": 0.0, "rewards/chosen": 1.2213001251220703, "rewards/margins": -0.8237874507904053, "rewards/rejected": 2.0450875759124756, "step": 11465 }, { "epoch": 1.86, "learning_rate": 1.343948627979768e-07, "logits/chosen": -0.8172008395195007, "logits/rejected": -1.2083505392074585, "logps/chosen": -69.79635620117188, "logps/rejected": -36.917362213134766, "loss": 0.2507, "rewards/accuracies": 1.0, "rewards/chosen": 1.4059791564941406, "rewards/margins": 1.1296249628067017, "rewards/rejected": 0.27635422348976135, "step": 11466 }, { "epoch": 1.86, "learning_rate": 1.3430522331488137e-07, "logits/chosen": -0.5842581391334534, "logits/rejected": -0.6398255228996277, "logps/chosen": -87.71931457519531, "logps/rejected": -186.87857055664062, "loss": 1.5622, "rewards/accuracies": 0.0, "rewards/chosen": 2.653067111968994, "rewards/margins": -3.014556884765625, "rewards/rejected": 5.667623996734619, "step": 11467 }, { "epoch": 1.86, "learning_rate": 1.3421560909783493e-07, "logits/chosen": -0.4231729805469513, "logits/rejected": -0.28704479336738586, "logps/chosen": -38.313232421875, "logps/rejected": -62.28469467163086, "loss": 0.8014, "rewards/accuracies": 0.0, "rewards/chosen": 1.1476486921310425, "rewards/margins": -0.39428281784057617, "rewards/rejected": 1.5419315099716187, "step": 11468 }, { "epoch": 1.86, "learning_rate": 1.3412602015302866e-07, "logits/chosen": -0.8833233714103699, "logits/rejected": -0.8667752742767334, "logps/chosen": -95.19181060791016, "logps/rejected": -77.79698944091797, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 1.2125717401504517, "rewards/margins": -1.0919991731643677, "rewards/rejected": 2.3045709133148193, "step": 11469 }, { "epoch": 1.86, "learning_rate": 1.3403645648665263e-07, "logits/chosen": -0.6436437964439392, "logits/rejected": -0.6757658123970032, "logps/chosen": -99.90478515625, "logps/rejected": -59.29339599609375, "loss": 0.8765, "rewards/accuracies": 0.0, "rewards/chosen": 0.6810302734375, "rewards/margins": -0.8619660139083862, "rewards/rejected": 1.5429962873458862, "step": 11470 }, { "epoch": 1.86, "learning_rate": 1.339469181048945e-07, "logits/chosen": -0.9488769173622131, "logits/rejected": -0.9591962099075317, "logps/chosen": -108.94945526123047, "logps/rejected": -65.85493469238281, "loss": 0.6242, "rewards/accuracies": 0.0, "rewards/chosen": 1.661030650138855, "rewards/margins": -0.43185651302337646, "rewards/rejected": 2.0928871631622314, "step": 11471 }, { "epoch": 1.86, "learning_rate": 1.3385740501394083e-07, "logits/chosen": -1.1518218517303467, "logits/rejected": -1.0244059562683105, "logps/chosen": -102.0869140625, "logps/rejected": -84.46565246582031, "loss": 0.7464, "rewards/accuracies": 0.0, "rewards/chosen": 1.4082145690917969, "rewards/margins": -0.4435478448867798, "rewards/rejected": 1.8517624139785767, "step": 11472 }, { "epoch": 1.86, "learning_rate": 1.3376791721997589e-07, "logits/chosen": -0.710016667842865, "logits/rejected": -0.6999287605285645, "logps/chosen": -80.86836242675781, "logps/rejected": -93.51421356201172, "loss": 0.8211, "rewards/accuracies": 1.0, "rewards/chosen": 1.607397437095642, "rewards/margins": 0.8770270943641663, "rewards/rejected": 0.7303703427314758, "step": 11473 }, { "epoch": 1.86, "learning_rate": 1.336784547291827e-07, "logits/chosen": -0.6116765141487122, "logits/rejected": -0.5624629855155945, "logps/chosen": -63.44607162475586, "logps/rejected": -92.56573486328125, "loss": 1.6172, "rewards/accuracies": 1.0, "rewards/chosen": 2.558229446411133, "rewards/margins": 0.22995781898498535, "rewards/rejected": 2.3282716274261475, "step": 11474 }, { "epoch": 1.86, "learning_rate": 1.3358901754774198e-07, "logits/chosen": -0.718034029006958, "logits/rejected": -0.709966242313385, "logps/chosen": -197.800537109375, "logps/rejected": -75.85372924804688, "loss": 0.1226, "rewards/accuracies": 1.0, "rewards/chosen": 4.828013896942139, "rewards/margins": 1.33333158493042, "rewards/rejected": 3.4946823120117188, "step": 11475 }, { "epoch": 1.86, "learning_rate": 1.3349960568183327e-07, "logits/chosen": -0.8451626896858215, "logits/rejected": -0.7518230676651001, "logps/chosen": -89.69855499267578, "logps/rejected": -65.00780487060547, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 3.9744088649749756, "rewards/margins": 0.15994882583618164, "rewards/rejected": 3.814460039138794, "step": 11476 }, { "epoch": 1.86, "learning_rate": 1.3341021913763378e-07, "logits/chosen": -0.6316842436790466, "logits/rejected": -0.6136293411254883, "logps/chosen": -59.15314865112305, "logps/rejected": -80.31340026855469, "loss": 1.3703, "rewards/accuracies": 1.0, "rewards/chosen": 0.9648250937461853, "rewards/margins": 0.4625858664512634, "rewards/rejected": 0.5022392272949219, "step": 11477 }, { "epoch": 1.86, "learning_rate": 1.3332085792131963e-07, "logits/chosen": -0.6912398338317871, "logits/rejected": -0.6868970394134521, "logps/chosen": -39.854766845703125, "logps/rejected": -17.958925247192383, "loss": 0.2509, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491775631904602, "rewards/margins": 0.4675516188144684, "rewards/rejected": 0.4816259443759918, "step": 11478 }, { "epoch": 1.86, "learning_rate": 1.3323152203906447e-07, "logits/chosen": -0.6531491875648499, "logits/rejected": -0.6108934283256531, "logps/chosen": -54.82171630859375, "logps/rejected": -55.29956817626953, "loss": 0.3078, "rewards/accuracies": 1.0, "rewards/chosen": 2.0154755115509033, "rewards/margins": 0.18705976009368896, "rewards/rejected": 1.8284157514572144, "step": 11479 }, { "epoch": 1.86, "learning_rate": 1.331422114970409e-07, "logits/chosen": -0.6563776135444641, "logits/rejected": -0.6595805883407593, "logps/chosen": -59.48580551147461, "logps/rejected": -59.36872863769531, "loss": 0.2145, "rewards/accuracies": 1.0, "rewards/chosen": 1.0477222204208374, "rewards/margins": 0.641815185546875, "rewards/rejected": 0.4059070646762848, "step": 11480 }, { "epoch": 1.86, "learning_rate": 1.3305292630141913e-07, "logits/chosen": -0.8284559845924377, "logits/rejected": -0.7829935550689697, "logps/chosen": -173.72915649414062, "logps/rejected": -17.00286102294922, "loss": 0.8498, "rewards/accuracies": 1.0, "rewards/chosen": 1.2485138177871704, "rewards/margins": 1.0731430053710938, "rewards/rejected": 0.17537079751491547, "step": 11481 }, { "epoch": 1.86, "learning_rate": 1.329636664583682e-07, "logits/chosen": -0.3647329807281494, "logits/rejected": -0.4301359951496124, "logps/chosen": -10.12248420715332, "logps/rejected": -74.39759826660156, "loss": 3.079, "rewards/accuracies": 0.0, "rewards/chosen": 0.254287451505661, "rewards/margins": -4.265076160430908, "rewards/rejected": 4.5193634033203125, "step": 11482 }, { "epoch": 1.86, "learning_rate": 1.3287443197405483e-07, "logits/chosen": -0.7545638680458069, "logits/rejected": -0.7244402766227722, "logps/chosen": -107.09327697753906, "logps/rejected": -62.56928253173828, "loss": 0.5062, "rewards/accuracies": 1.0, "rewards/chosen": 2.1665802001953125, "rewards/margins": 0.3994094133377075, "rewards/rejected": 1.767170786857605, "step": 11483 }, { "epoch": 1.86, "learning_rate": 1.3278522285464462e-07, "logits/chosen": -0.8712173104286194, "logits/rejected": -0.8316109776496887, "logps/chosen": -69.5936279296875, "logps/rejected": -52.38872528076172, "loss": 0.2222, "rewards/accuracies": 1.0, "rewards/chosen": 1.7453125715255737, "rewards/margins": 1.1428794860839844, "rewards/rejected": 0.6024330258369446, "step": 11484 }, { "epoch": 1.86, "learning_rate": 1.3269603910630072e-07, "logits/chosen": -0.916538655757904, "logits/rejected": -0.9494458436965942, "logps/chosen": -216.2176055908203, "logps/rejected": -136.27989196777344, "loss": 0.9393, "rewards/accuracies": 0.0, "rewards/chosen": 5.604979038238525, "rewards/margins": -0.4412689208984375, "rewards/rejected": 6.046247959136963, "step": 11485 }, { "epoch": 1.86, "learning_rate": 1.3260688073518522e-07, "logits/chosen": -0.9044904112815857, "logits/rejected": -0.929833710193634, "logps/chosen": -51.432899475097656, "logps/rejected": -92.987548828125, "loss": 0.4101, "rewards/accuracies": 1.0, "rewards/chosen": 1.3632789850234985, "rewards/margins": 0.0674750804901123, "rewards/rejected": 1.2958039045333862, "step": 11486 }, { "epoch": 1.86, "learning_rate": 1.3251774774745782e-07, "logits/chosen": -0.6690415143966675, "logits/rejected": -0.6886975765228271, "logps/chosen": -60.750709533691406, "logps/rejected": -81.7236328125, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 1.5068260431289673, "rewards/margins": 0.029319047927856445, "rewards/rejected": 1.4775069952011108, "step": 11487 }, { "epoch": 1.86, "learning_rate": 1.3242864014927703e-07, "logits/chosen": -0.48006510734558105, "logits/rejected": -0.49163946509361267, "logps/chosen": -52.5525016784668, "logps/rejected": -175.91490173339844, "loss": 0.1488, "rewards/accuracies": 1.0, "rewards/chosen": 1.674476981163025, "rewards/margins": 1.2427035570144653, "rewards/rejected": 0.4317733943462372, "step": 11488 }, { "epoch": 1.86, "learning_rate": 1.3233955794679907e-07, "logits/chosen": -0.36666053533554077, "logits/rejected": -0.3672352731227875, "logps/chosen": -5.562762260437012, "logps/rejected": -4.173868656158447, "loss": 0.3922, "rewards/accuracies": 0.0, "rewards/chosen": 0.2259761393070221, "rewards/margins": -0.11067232489585876, "rewards/rejected": 0.33664846420288086, "step": 11489 }, { "epoch": 1.86, "learning_rate": 1.3225050114617896e-07, "logits/chosen": -0.819046676158905, "logits/rejected": -0.7037836909294128, "logps/chosen": -114.60899353027344, "logps/rejected": -61.94580841064453, "loss": 1.1775, "rewards/accuracies": 1.0, "rewards/chosen": 5.43440580368042, "rewards/margins": 3.9256272315979004, "rewards/rejected": 1.50877845287323, "step": 11490 }, { "epoch": 1.87, "learning_rate": 1.321614697535694e-07, "logits/chosen": -1.0754411220550537, "logits/rejected": -1.0443929433822632, "logps/chosen": -85.69194030761719, "logps/rejected": -30.18584442138672, "loss": 1.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.1105072498321533, "rewards/margins": 0.34094393253326416, "rewards/rejected": 0.7695633172988892, "step": 11491 }, { "epoch": 1.87, "learning_rate": 1.3207246377512187e-07, "logits/chosen": -0.9979800581932068, "logits/rejected": -0.9979800581932068, "logps/chosen": -52.114036560058594, "logps/rejected": -52.114036560058594, "loss": 0.498, "rewards/accuracies": 0.0, "rewards/chosen": 2.228532552719116, "rewards/margins": 0.0, "rewards/rejected": 2.228532552719116, "step": 11492 }, { "epoch": 1.87, "learning_rate": 1.3198348321698565e-07, "logits/chosen": -0.8210033178329468, "logits/rejected": -0.9002661108970642, "logps/chosen": -63.920555114746094, "logps/rejected": -102.67216491699219, "loss": 1.1434, "rewards/accuracies": 0.0, "rewards/chosen": 1.6430931091308594, "rewards/margins": -2.167649030685425, "rewards/rejected": 3.810742139816284, "step": 11493 }, { "epoch": 1.87, "learning_rate": 1.3189452808530864e-07, "logits/chosen": -0.8624351620674133, "logits/rejected": -0.844569206237793, "logps/chosen": -99.27557373046875, "logps/rejected": -76.49956512451172, "loss": 0.8656, "rewards/accuracies": 0.0, "rewards/chosen": 4.287939548492432, "rewards/margins": -0.14618158340454102, "rewards/rejected": 4.434121131896973, "step": 11494 }, { "epoch": 1.87, "learning_rate": 1.3180559838623673e-07, "logits/chosen": -0.6165164709091187, "logits/rejected": -0.5509655475616455, "logps/chosen": -104.36679077148438, "logps/rejected": -64.91447448730469, "loss": 0.7883, "rewards/accuracies": 0.0, "rewards/chosen": 2.213381290435791, "rewards/margins": -0.4873373508453369, "rewards/rejected": 2.700718641281128, "step": 11495 }, { "epoch": 1.87, "learning_rate": 1.317166941259139e-07, "logits/chosen": -0.32403257489204407, "logits/rejected": -0.32403257489204407, "logps/chosen": -45.92194747924805, "logps/rejected": -45.92194747924805, "loss": 1.3191, "rewards/accuracies": 0.0, "rewards/chosen": 0.937930703163147, "rewards/margins": 0.0, "rewards/rejected": 0.937930703163147, "step": 11496 }, { "epoch": 1.87, "learning_rate": 1.316278153104829e-07, "logits/chosen": -0.3961579501628876, "logits/rejected": -0.4255458414554596, "logps/chosen": -18.363780975341797, "logps/rejected": -56.261383056640625, "loss": 0.7263, "rewards/accuracies": 0.0, "rewards/chosen": 0.7197340130805969, "rewards/margins": -0.8695421814918518, "rewards/rejected": 1.5892761945724487, "step": 11497 }, { "epoch": 1.87, "learning_rate": 1.3153896194608423e-07, "logits/chosen": -0.2152019441127777, "logits/rejected": -0.21667468547821045, "logps/chosen": -2.1898014545440674, "logps/rejected": -32.141517639160156, "loss": 0.6038, "rewards/accuracies": 0.0, "rewards/chosen": 0.15864737331867218, "rewards/margins": -0.12777079641819, "rewards/rejected": 0.2864181697368622, "step": 11498 }, { "epoch": 1.87, "learning_rate": 1.3145013403885696e-07, "logits/chosen": -0.284213125705719, "logits/rejected": -0.2906150817871094, "logps/chosen": -21.918338775634766, "logps/rejected": -31.392559051513672, "loss": 0.3251, "rewards/accuracies": 1.0, "rewards/chosen": 0.2807178497314453, "rewards/margins": 0.25340691208839417, "rewards/rejected": 0.027310943230986595, "step": 11499 }, { "epoch": 1.87, "learning_rate": 1.31361331594938e-07, "logits/chosen": -0.9245923757553101, "logits/rejected": -0.9992844462394714, "logps/chosen": -123.07810974121094, "logps/rejected": -113.39163970947266, "loss": 1.7311, "rewards/accuracies": 0.0, "rewards/chosen": 0.4852401912212372, "rewards/margins": -3.427692413330078, "rewards/rejected": 3.9129326343536377, "step": 11500 }, { "epoch": 1.87, "learning_rate": 1.3127255462046316e-07, "logits/chosen": -0.7740252017974854, "logits/rejected": -0.8224750757217407, "logps/chosen": -177.5225067138672, "logps/rejected": -96.28195190429688, "loss": 0.2458, "rewards/accuracies": 1.0, "rewards/chosen": 2.6785829067230225, "rewards/margins": 0.5503861904144287, "rewards/rejected": 2.1281967163085938, "step": 11501 }, { "epoch": 1.87, "learning_rate": 1.3118380312156568e-07, "logits/chosen": -0.858774721622467, "logits/rejected": -0.798268735408783, "logps/chosen": -55.301109313964844, "logps/rejected": -45.07740020751953, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 2.2230606079101562, "rewards/margins": 0.2811836004257202, "rewards/rejected": 1.941877007484436, "step": 11502 }, { "epoch": 1.87, "learning_rate": 1.310950771043778e-07, "logits/chosen": -0.4534074664115906, "logits/rejected": -0.44099631905555725, "logps/chosen": -118.8121337890625, "logps/rejected": -84.85932922363281, "loss": 0.2472, "rewards/accuracies": 1.0, "rewards/chosen": 4.470658779144287, "rewards/margins": 1.083125114440918, "rewards/rejected": 3.387533664703369, "step": 11503 }, { "epoch": 1.87, "learning_rate": 1.3100637657502927e-07, "logits/chosen": -0.8943324089050293, "logits/rejected": -0.7497535347938538, "logps/chosen": -108.55937194824219, "logps/rejected": -71.92610168457031, "loss": 0.3814, "rewards/accuracies": 1.0, "rewards/chosen": 2.612497091293335, "rewards/margins": 0.5168046951293945, "rewards/rejected": 2.0956923961639404, "step": 11504 }, { "epoch": 1.87, "learning_rate": 1.3091770153964888e-07, "logits/chosen": -0.7302009463310242, "logits/rejected": -0.6021811366081238, "logps/chosen": -95.35160064697266, "logps/rejected": -60.35879898071289, "loss": 0.2896, "rewards/accuracies": 1.0, "rewards/chosen": 1.5893516540527344, "rewards/margins": 0.638528048992157, "rewards/rejected": 0.9508236050605774, "step": 11505 }, { "epoch": 1.87, "learning_rate": 1.308290520043629e-07, "logits/chosen": -0.5860735177993774, "logits/rejected": -0.6039655208587646, "logps/chosen": -68.70986938476562, "logps/rejected": -90.37300872802734, "loss": 0.946, "rewards/accuracies": 0.0, "rewards/chosen": 1.9479576349258423, "rewards/margins": -1.0632110834121704, "rewards/rejected": 3.0111687183380127, "step": 11506 }, { "epoch": 1.87, "learning_rate": 1.307404279752964e-07, "logits/chosen": -0.6044554114341736, "logits/rejected": -0.5622151494026184, "logps/chosen": -65.82735443115234, "logps/rejected": -65.36943054199219, "loss": 0.8124, "rewards/accuracies": 0.0, "rewards/chosen": 0.5389869809150696, "rewards/margins": -0.4308670163154602, "rewards/rejected": 0.9698539972305298, "step": 11507 }, { "epoch": 1.87, "learning_rate": 1.3065182945857216e-07, "logits/chosen": -0.7921929955482483, "logits/rejected": -0.8055535554885864, "logps/chosen": -107.52444458007812, "logps/rejected": -103.89117431640625, "loss": 1.5504, "rewards/accuracies": 0.0, "rewards/chosen": 1.398699164390564, "rewards/margins": -0.6006660461425781, "rewards/rejected": 1.999365210533142, "step": 11508 }, { "epoch": 1.87, "learning_rate": 1.3056325646031196e-07, "logits/chosen": -0.5036524534225464, "logits/rejected": -0.4258729815483093, "logps/chosen": -85.74630737304688, "logps/rejected": -77.19818115234375, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 2.0762970447540283, "rewards/margins": 0.6767189502716064, "rewards/rejected": 1.3995780944824219, "step": 11509 }, { "epoch": 1.87, "learning_rate": 1.3047470898663486e-07, "logits/chosen": -0.6036127805709839, "logits/rejected": -0.5540751814842224, "logps/chosen": -62.3066520690918, "logps/rejected": -72.49165344238281, "loss": 0.6518, "rewards/accuracies": 0.0, "rewards/chosen": 2.6276919841766357, "rewards/margins": -0.7168691158294678, "rewards/rejected": 3.3445611000061035, "step": 11510 }, { "epoch": 1.87, "learning_rate": 1.3038618704365913e-07, "logits/chosen": -0.6518073081970215, "logits/rejected": -0.6535660624504089, "logps/chosen": -233.6082000732422, "logps/rejected": -127.44612884521484, "loss": 0.1314, "rewards/accuracies": 1.0, "rewards/chosen": 4.074440002441406, "rewards/margins": 1.465796709060669, "rewards/rejected": 2.6086432933807373, "step": 11511 }, { "epoch": 1.87, "learning_rate": 1.302976906375003e-07, "logits/chosen": -0.5728131532669067, "logits/rejected": -0.5575131177902222, "logps/chosen": -32.52458572387695, "logps/rejected": -19.576053619384766, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 0.937460720539093, "rewards/margins": 0.5830141305923462, "rewards/rejected": 0.3544466197490692, "step": 11512 }, { "epoch": 1.87, "learning_rate": 1.3020921977427308e-07, "logits/chosen": -0.2783699631690979, "logits/rejected": -0.0824502557516098, "logps/chosen": -98.26412963867188, "logps/rejected": -18.144359588623047, "loss": 0.8203, "rewards/accuracies": 1.0, "rewards/chosen": 0.5481338500976562, "rewards/margins": 0.2779991030693054, "rewards/rejected": 0.27013474702835083, "step": 11513 }, { "epoch": 1.87, "learning_rate": 1.3012077446008967e-07, "logits/chosen": -0.6251764297485352, "logits/rejected": -0.6735848188400269, "logps/chosen": -64.6198501586914, "logps/rejected": -135.0081787109375, "loss": 0.7279, "rewards/accuracies": 1.0, "rewards/chosen": 1.1664108037948608, "rewards/margins": 0.07993161678314209, "rewards/rejected": 1.0864791870117188, "step": 11514 }, { "epoch": 1.87, "learning_rate": 1.30032354701061e-07, "logits/chosen": -0.9505405426025391, "logits/rejected": -0.96901535987854, "logps/chosen": -118.42750549316406, "logps/rejected": -102.48329162597656, "loss": 1.2872, "rewards/accuracies": 0.0, "rewards/chosen": 1.3592666387557983, "rewards/margins": -2.1091370582580566, "rewards/rejected": 3.4684035778045654, "step": 11515 }, { "epoch": 1.87, "learning_rate": 1.2994396050329586e-07, "logits/chosen": -0.5538065433502197, "logits/rejected": -0.4413647949695587, "logps/chosen": -126.28309631347656, "logps/rejected": -65.03646850585938, "loss": 0.0767, "rewards/accuracies": 1.0, "rewards/chosen": 4.694593906402588, "rewards/margins": 3.2260537147521973, "rewards/rejected": 1.4685401916503906, "step": 11516 }, { "epoch": 1.87, "learning_rate": 1.298555918729015e-07, "logits/chosen": -0.8626571297645569, "logits/rejected": -0.9028984308242798, "logps/chosen": -61.10773468017578, "logps/rejected": -101.53162384033203, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 1.7454742193222046, "rewards/margins": 0.0014121532440185547, "rewards/rejected": 1.744062066078186, "step": 11517 }, { "epoch": 1.87, "learning_rate": 1.297672488159836e-07, "logits/chosen": -0.6293845176696777, "logits/rejected": -0.6346429586410522, "logps/chosen": -9.289803504943848, "logps/rejected": -3.240823984146118, "loss": 0.461, "rewards/accuracies": 0.0, "rewards/chosen": 0.27138757705688477, "rewards/margins": -0.20125600695610046, "rewards/rejected": 0.47264358401298523, "step": 11518 }, { "epoch": 1.87, "learning_rate": 1.296789313386455e-07, "logits/chosen": -0.8053980469703674, "logits/rejected": -0.6996816992759705, "logps/chosen": -175.5614013671875, "logps/rejected": -113.36674499511719, "loss": 0.2552, "rewards/accuracies": 1.0, "rewards/chosen": 3.5691254138946533, "rewards/margins": 1.5633010864257812, "rewards/rejected": 2.005824327468872, "step": 11519 }, { "epoch": 1.87, "learning_rate": 1.2959063944698934e-07, "logits/chosen": -0.5712386965751648, "logits/rejected": -0.4818238914012909, "logps/chosen": -85.01622009277344, "logps/rejected": -31.190492630004883, "loss": 0.8472, "rewards/accuracies": 1.0, "rewards/chosen": 1.1339218616485596, "rewards/margins": 0.9225950837135315, "rewards/rejected": 0.21132679283618927, "step": 11520 }, { "epoch": 1.87, "learning_rate": 1.2950237314711498e-07, "logits/chosen": -0.7728708982467651, "logits/rejected": -0.7746349573135376, "logps/chosen": -40.73455047607422, "logps/rejected": -70.18608093261719, "loss": 0.5093, "rewards/accuracies": 0.0, "rewards/chosen": 1.0489829778671265, "rewards/margins": -0.2706718444824219, "rewards/rejected": 1.3196548223495483, "step": 11521 }, { "epoch": 1.87, "learning_rate": 1.294141324451211e-07, "logits/chosen": -0.5787820219993591, "logits/rejected": -0.5810136198997498, "logps/chosen": -82.47386169433594, "logps/rejected": -82.44032287597656, "loss": 1.0701, "rewards/accuracies": 1.0, "rewards/chosen": 1.318579077720642, "rewards/margins": 0.24856185913085938, "rewards/rejected": 1.0700172185897827, "step": 11522 }, { "epoch": 1.87, "learning_rate": 1.2932591734710406e-07, "logits/chosen": -0.72865891456604, "logits/rejected": -0.33024466037750244, "logps/chosen": -171.73065185546875, "logps/rejected": -22.20381736755371, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 4.517065525054932, "rewards/margins": 4.325150489807129, "rewards/rejected": 0.1919151395559311, "step": 11523 }, { "epoch": 1.87, "learning_rate": 1.2923772785915888e-07, "logits/chosen": -0.6349738240242004, "logits/rejected": -0.4850819408893585, "logps/chosen": -100.07791137695312, "logps/rejected": -101.06964111328125, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": 5.130154609680176, "rewards/margins": 2.1013567447662354, "rewards/rejected": 3.0287978649139404, "step": 11524 }, { "epoch": 1.87, "learning_rate": 1.291495639873784e-07, "logits/chosen": -0.9702062606811523, "logits/rejected": -0.6829701662063599, "logps/chosen": -139.93824768066406, "logps/rejected": -47.24323272705078, "loss": 1.1148, "rewards/accuracies": 1.0, "rewards/chosen": 5.313375949859619, "rewards/margins": 3.537769317626953, "rewards/rejected": 1.7756065130233765, "step": 11525 }, { "epoch": 1.87, "learning_rate": 1.2906142573785416e-07, "logits/chosen": -0.10489551723003387, "logits/rejected": -0.10489551723003387, "logps/chosen": -10.273548126220703, "logps/rejected": -10.273548126220703, "loss": 0.5447, "rewards/accuracies": 0.0, "rewards/chosen": 0.389430433511734, "rewards/margins": 0.0, "rewards/rejected": 0.389430433511734, "step": 11526 }, { "epoch": 1.87, "learning_rate": 1.2897331311667542e-07, "logits/chosen": -0.423162579536438, "logits/rejected": -0.423162579536438, "logps/chosen": -0.7612674832344055, "logps/rejected": -0.7612674832344055, "loss": 0.3812, "rewards/accuracies": 0.0, "rewards/chosen": 0.21756254136562347, "rewards/margins": 0.0, "rewards/rejected": 0.21756254136562347, "step": 11527 }, { "epoch": 1.87, "learning_rate": 1.288852261299302e-07, "logits/chosen": -0.7063391208648682, "logits/rejected": -0.7003402709960938, "logps/chosen": -53.745079040527344, "logps/rejected": -89.44525909423828, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 2.012998342514038, "rewards/margins": 0.2880791425704956, "rewards/rejected": 1.7249191999435425, "step": 11528 }, { "epoch": 1.87, "learning_rate": 1.2879716478370418e-07, "logits/chosen": -0.593941867351532, "logits/rejected": -0.593941867351532, "logps/chosen": -64.69151306152344, "logps/rejected": -64.69151306152344, "loss": 0.4403, "rewards/accuracies": 0.0, "rewards/chosen": 2.316577911376953, "rewards/margins": 0.0, "rewards/rejected": 2.316577911376953, "step": 11529 }, { "epoch": 1.87, "learning_rate": 1.2870912908408182e-07, "logits/chosen": -0.7229068279266357, "logits/rejected": -0.6962411403656006, "logps/chosen": -81.03868103027344, "logps/rejected": -100.501708984375, "loss": 0.4713, "rewards/accuracies": 1.0, "rewards/chosen": 1.1608695983886719, "rewards/margins": 0.9156020879745483, "rewards/rejected": 0.24526749551296234, "step": 11530 }, { "epoch": 1.87, "learning_rate": 1.286211190371454e-07, "logits/chosen": -0.8756198883056641, "logits/rejected": -0.8765790462493896, "logps/chosen": -122.83452606201172, "logps/rejected": -106.42926025390625, "loss": 0.3068, "rewards/accuracies": 1.0, "rewards/chosen": 0.7501549124717712, "rewards/margins": 0.21744006872177124, "rewards/rejected": 0.53271484375, "step": 11531 }, { "epoch": 1.87, "learning_rate": 1.285331346489757e-07, "logits/chosen": -0.8026325702667236, "logits/rejected": -0.7973563075065613, "logps/chosen": -54.27082061767578, "logps/rejected": -47.83979797363281, "loss": 1.2345, "rewards/accuracies": 1.0, "rewards/chosen": 2.3787193298339844, "rewards/margins": 0.13406586647033691, "rewards/rejected": 2.2446534633636475, "step": 11532 }, { "epoch": 1.87, "learning_rate": 1.2844517592565146e-07, "logits/chosen": -0.9750023484230042, "logits/rejected": -0.9175083637237549, "logps/chosen": -66.76661682128906, "logps/rejected": -81.95867919921875, "loss": 2.3478, "rewards/accuracies": 1.0, "rewards/chosen": 1.152764916419983, "rewards/margins": 0.2981048822402954, "rewards/rejected": 0.8546600341796875, "step": 11533 }, { "epoch": 1.87, "learning_rate": 1.2835724287325001e-07, "logits/chosen": -0.3613984286785126, "logits/rejected": -0.34453505277633667, "logps/chosen": -72.01005554199219, "logps/rejected": -41.13626480102539, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.8550995588302612, "rewards/margins": 0.2854717969894409, "rewards/rejected": 1.5696277618408203, "step": 11534 }, { "epoch": 1.87, "learning_rate": 1.2826933549784636e-07, "logits/chosen": -0.7712742686271667, "logits/rejected": -0.7263278961181641, "logps/chosen": -61.29094314575195, "logps/rejected": -31.564706802368164, "loss": 0.3987, "rewards/accuracies": 1.0, "rewards/chosen": 2.1125447750091553, "rewards/margins": 0.7666610479354858, "rewards/rejected": 1.3458837270736694, "step": 11535 }, { "epoch": 1.87, "learning_rate": 1.281814538055145e-07, "logits/chosen": -1.0860655307769775, "logits/rejected": -0.9521797299385071, "logps/chosen": -76.5141830444336, "logps/rejected": -88.96134185791016, "loss": 0.6309, "rewards/accuracies": 1.0, "rewards/chosen": 4.019345760345459, "rewards/margins": 1.963801383972168, "rewards/rejected": 2.055544376373291, "step": 11536 }, { "epoch": 1.87, "learning_rate": 1.280935978023258e-07, "logits/chosen": -0.6514899730682373, "logits/rejected": -0.674138605594635, "logps/chosen": -159.13352966308594, "logps/rejected": -65.29861450195312, "loss": 0.0638, "rewards/accuracies": 1.0, "rewards/chosen": 3.051457166671753, "rewards/margins": 2.0557639598846436, "rewards/rejected": 0.9956932067871094, "step": 11537 }, { "epoch": 1.87, "learning_rate": 1.2800576749435065e-07, "logits/chosen": -0.26427197456359863, "logits/rejected": -0.2378116250038147, "logps/chosen": -13.207489967346191, "logps/rejected": -31.546920776367188, "loss": 0.8095, "rewards/accuracies": 0.0, "rewards/chosen": 0.5865897536277771, "rewards/margins": -0.8251168131828308, "rewards/rejected": 1.411706566810608, "step": 11538 }, { "epoch": 1.87, "learning_rate": 1.27917962887657e-07, "logits/chosen": -0.37510648369789124, "logits/rejected": -0.3825758993625641, "logps/chosen": -13.414216041564941, "logps/rejected": -1.573281168937683, "loss": 0.8377, "rewards/accuracies": 0.0, "rewards/chosen": -0.15377692878246307, "rewards/margins": -0.37034478783607483, "rewards/rejected": 0.21656785905361176, "step": 11539 }, { "epoch": 1.87, "learning_rate": 1.2783018398831153e-07, "logits/chosen": -0.5832034945487976, "logits/rejected": -0.5739190578460693, "logps/chosen": -49.933345794677734, "logps/rejected": -93.03501892089844, "loss": 0.529, "rewards/accuracies": 1.0, "rewards/chosen": 2.002354145050049, "rewards/margins": 0.2984691858291626, "rewards/rejected": 1.7038849592208862, "step": 11540 }, { "epoch": 1.87, "learning_rate": 1.2774243080237872e-07, "logits/chosen": -0.7517885565757751, "logits/rejected": -0.7084436416625977, "logps/chosen": -70.75064849853516, "logps/rejected": -44.29922866821289, "loss": 1.2659, "rewards/accuracies": 1.0, "rewards/chosen": 1.085039496421814, "rewards/margins": 0.22421491146087646, "rewards/rejected": 0.8608245849609375, "step": 11541 }, { "epoch": 1.87, "learning_rate": 1.2765470333592177e-07, "logits/chosen": -1.1308637857437134, "logits/rejected": -1.0413225889205933, "logps/chosen": -99.81983947753906, "logps/rejected": -30.680910110473633, "loss": 0.2084, "rewards/accuracies": 1.0, "rewards/chosen": 0.9466003775596619, "rewards/margins": 0.812877893447876, "rewards/rejected": 0.13372249901294708, "step": 11542 }, { "epoch": 1.87, "learning_rate": 1.275670015950015e-07, "logits/chosen": -0.7995144128799438, "logits/rejected": -0.7511221766471863, "logps/chosen": -25.412147521972656, "logps/rejected": -51.85187530517578, "loss": 1.1972, "rewards/accuracies": 0.0, "rewards/chosen": 1.3833767175674438, "rewards/margins": -0.575534462928772, "rewards/rejected": 1.9589111804962158, "step": 11543 }, { "epoch": 1.87, "learning_rate": 1.2747932558567758e-07, "logits/chosen": -0.7712650299072266, "logits/rejected": -0.6766121983528137, "logps/chosen": -97.07878112792969, "logps/rejected": -58.16230773925781, "loss": 0.0838, "rewards/accuracies": 1.0, "rewards/chosen": 4.001552104949951, "rewards/margins": 2.546353340148926, "rewards/rejected": 1.4551986455917358, "step": 11544 }, { "epoch": 1.87, "learning_rate": 1.273916753140073e-07, "logits/chosen": -0.3402884900569916, "logits/rejected": -0.3402884900569916, "logps/chosen": -3.681710958480835, "logps/rejected": -3.681710958480835, "loss": 0.8429, "rewards/accuracies": 0.0, "rewards/chosen": 0.37704479694366455, "rewards/margins": 0.0, "rewards/rejected": 0.37704479694366455, "step": 11545 }, { "epoch": 1.87, "learning_rate": 1.2730405078604672e-07, "logits/chosen": -0.554040253162384, "logits/rejected": -0.5187534093856812, "logps/chosen": -38.69773864746094, "logps/rejected": -47.28445816040039, "loss": 1.1563, "rewards/accuracies": 1.0, "rewards/chosen": 2.3193154335021973, "rewards/margins": 0.10652971267700195, "rewards/rejected": 2.2127857208251953, "step": 11546 }, { "epoch": 1.87, "learning_rate": 1.2721645200784963e-07, "logits/chosen": -1.034906268119812, "logits/rejected": -1.0645720958709717, "logps/chosen": -88.32707214355469, "logps/rejected": -109.65098571777344, "loss": 1.7067, "rewards/accuracies": 0.0, "rewards/chosen": 1.9073173999786377, "rewards/margins": -2.3897058963775635, "rewards/rejected": 4.297023296356201, "step": 11547 }, { "epoch": 1.87, "learning_rate": 1.2712887898546853e-07, "logits/chosen": -0.6825747489929199, "logits/rejected": -0.7216712236404419, "logps/chosen": -154.87884521484375, "logps/rejected": -92.52266693115234, "loss": 0.32, "rewards/accuracies": 1.0, "rewards/chosen": 4.616995334625244, "rewards/margins": 0.23537540435791016, "rewards/rejected": 4.381619930267334, "step": 11548 }, { "epoch": 1.87, "learning_rate": 1.2704133172495362e-07, "logits/chosen": -0.802657961845398, "logits/rejected": -1.1148635149002075, "logps/chosen": -215.5780029296875, "logps/rejected": -35.44621276855469, "loss": 0.1141, "rewards/accuracies": 1.0, "rewards/chosen": 2.2490525245666504, "rewards/margins": 1.9077683687210083, "rewards/rejected": 0.3412841856479645, "step": 11549 }, { "epoch": 1.87, "learning_rate": 1.2695381023235385e-07, "logits/chosen": -0.6158490777015686, "logits/rejected": -0.5794515609741211, "logps/chosen": -37.53163146972656, "logps/rejected": -78.96382141113281, "loss": 0.1826, "rewards/accuracies": 1.0, "rewards/chosen": 1.9149792194366455, "rewards/margins": 0.8348751068115234, "rewards/rejected": 1.080104112625122, "step": 11550 }, { "epoch": 1.87, "learning_rate": 1.2686631451371587e-07, "logits/chosen": -0.4914800226688385, "logits/rejected": -0.4894782602787018, "logps/chosen": -2.534729242324829, "logps/rejected": -6.060302257537842, "loss": 0.3632, "rewards/accuracies": 1.0, "rewards/chosen": 0.2737700343132019, "rewards/margins": 0.2069142758846283, "rewards/rejected": 0.0668557658791542, "step": 11551 }, { "epoch": 1.88, "learning_rate": 1.2677884457508504e-07, "logits/chosen": -0.6424582004547119, "logits/rejected": -0.5770329833030701, "logps/chosen": -190.0758056640625, "logps/rejected": -80.8479995727539, "loss": 0.6136, "rewards/accuracies": 0.0, "rewards/chosen": 2.670483350753784, "rewards/margins": -0.38837146759033203, "rewards/rejected": 3.058854818344116, "step": 11552 }, { "epoch": 1.88, "learning_rate": 1.2669140042250448e-07, "logits/chosen": -0.47537606954574585, "logits/rejected": -0.4264293313026428, "logps/chosen": -76.28856658935547, "logps/rejected": -76.6236343383789, "loss": 0.2781, "rewards/accuracies": 1.0, "rewards/chosen": 2.461454153060913, "rewards/margins": 0.33033084869384766, "rewards/rejected": 2.1311233043670654, "step": 11553 }, { "epoch": 1.88, "learning_rate": 1.266039820620159e-07, "logits/chosen": -0.5457755923271179, "logits/rejected": -0.5239187479019165, "logps/chosen": -98.69131469726562, "logps/rejected": -109.79943084716797, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.4692405462265015, "rewards/margins": 0.327250599861145, "rewards/rejected": 1.1419899463653564, "step": 11554 }, { "epoch": 1.88, "learning_rate": 1.2651658949965916e-07, "logits/chosen": -0.8441911935806274, "logits/rejected": -0.7318044900894165, "logps/chosen": -74.15005493164062, "logps/rejected": -66.3249740600586, "loss": 1.7311, "rewards/accuracies": 1.0, "rewards/chosen": 4.029163360595703, "rewards/margins": 1.943596601486206, "rewards/rejected": 2.085566759109497, "step": 11555 }, { "epoch": 1.88, "learning_rate": 1.26429222741472e-07, "logits/chosen": -0.8783060908317566, "logits/rejected": -0.915780246257782, "logps/chosen": -116.24842834472656, "logps/rejected": -131.7222900390625, "loss": 0.6036, "rewards/accuracies": 0.0, "rewards/chosen": 0.6885917782783508, "rewards/margins": -0.77767413854599, "rewards/rejected": 1.4662659168243408, "step": 11556 }, { "epoch": 1.88, "learning_rate": 1.26341881793491e-07, "logits/chosen": -0.2379181683063507, "logits/rejected": -0.2379181683063507, "logps/chosen": -1.3247519731521606, "logps/rejected": -1.3247519731521606, "loss": 0.3576, "rewards/accuracies": 0.0, "rewards/chosen": 0.1914350688457489, "rewards/margins": 0.0, "rewards/rejected": 0.1914350688457489, "step": 11557 }, { "epoch": 1.88, "learning_rate": 1.2625456666175016e-07, "logits/chosen": -0.2066350281238556, "logits/rejected": -0.17017927765846252, "logps/chosen": -70.77656555175781, "logps/rejected": -61.13048553466797, "loss": 0.0754, "rewards/accuracies": 1.0, "rewards/chosen": 3.7477614879608154, "rewards/margins": 2.2925591468811035, "rewards/rejected": 1.4552024602890015, "step": 11558 }, { "epoch": 1.88, "learning_rate": 1.261672773522825e-07, "logits/chosen": -0.8020498752593994, "logits/rejected": -0.7210018038749695, "logps/chosen": -63.16832733154297, "logps/rejected": -144.76693725585938, "loss": 1.4254, "rewards/accuracies": 0.0, "rewards/chosen": 2.4113717079162598, "rewards/margins": -2.7812066078186035, "rewards/rejected": 5.192578315734863, "step": 11559 }, { "epoch": 1.88, "learning_rate": 1.2608001387111862e-07, "logits/chosen": -0.5512278079986572, "logits/rejected": -0.6139649748802185, "logps/chosen": -59.00929260253906, "logps/rejected": -114.20094299316406, "loss": 0.5651, "rewards/accuracies": 0.0, "rewards/chosen": 1.6332015991210938, "rewards/margins": -0.3566009998321533, "rewards/rejected": 1.989802598953247, "step": 11560 }, { "epoch": 1.88, "learning_rate": 1.2599277622428788e-07, "logits/chosen": -0.8242537379264832, "logits/rejected": -0.7344450950622559, "logps/chosen": -69.59394836425781, "logps/rejected": -64.4365234375, "loss": 1.3486, "rewards/accuracies": 1.0, "rewards/chosen": 1.9387367963790894, "rewards/margins": 0.558016300201416, "rewards/rejected": 1.3807204961776733, "step": 11561 }, { "epoch": 1.88, "learning_rate": 1.2590556441781723e-07, "logits/chosen": -0.8347870707511902, "logits/rejected": -0.7816959619522095, "logps/chosen": -92.61769104003906, "logps/rejected": -91.6968002319336, "loss": 1.1455, "rewards/accuracies": 0.0, "rewards/chosen": 5.119578838348389, "rewards/margins": -0.5328817367553711, "rewards/rejected": 5.65246057510376, "step": 11562 }, { "epoch": 1.88, "learning_rate": 1.2581837845773252e-07, "logits/chosen": -0.5848836898803711, "logits/rejected": -0.4154164493083954, "logps/chosen": -119.53044891357422, "logps/rejected": -44.53528594970703, "loss": 0.516, "rewards/accuracies": 1.0, "rewards/chosen": 3.082669973373413, "rewards/margins": 0.9911088943481445, "rewards/rejected": 2.0915610790252686, "step": 11563 }, { "epoch": 1.88, "learning_rate": 1.2573121835005717e-07, "logits/chosen": -1.0224556922912598, "logits/rejected": -1.1357141733169556, "logps/chosen": -201.20860290527344, "logps/rejected": -173.06051635742188, "loss": 1.6475, "rewards/accuracies": 0.0, "rewards/chosen": 3.1828765869140625, "rewards/margins": -2.8640780448913574, "rewards/rejected": 6.04695463180542, "step": 11564 }, { "epoch": 1.88, "learning_rate": 1.2564408410081346e-07, "logits/chosen": -0.6105933785438538, "logits/rejected": -0.5463402271270752, "logps/chosen": -66.58946990966797, "logps/rejected": -51.896949768066406, "loss": 0.8212, "rewards/accuracies": 0.0, "rewards/chosen": 1.9714339971542358, "rewards/margins": -0.2216416597366333, "rewards/rejected": 2.193075656890869, "step": 11565 }, { "epoch": 1.88, "learning_rate": 1.2555697571602115e-07, "logits/chosen": -0.8507751822471619, "logits/rejected": -0.7562176585197449, "logps/chosen": -47.47917556762695, "logps/rejected": -78.2213134765625, "loss": 0.1122, "rewards/accuracies": 1.0, "rewards/chosen": 3.913656234741211, "rewards/margins": 1.8079769611358643, "rewards/rejected": 2.1056792736053467, "step": 11566 }, { "epoch": 1.88, "learning_rate": 1.2546989320169899e-07, "logits/chosen": -0.9408379197120667, "logits/rejected": -0.8963870406150818, "logps/chosen": -117.56747436523438, "logps/rejected": -85.13752746582031, "loss": 1.2051, "rewards/accuracies": 1.0, "rewards/chosen": 3.9557526111602783, "rewards/margins": 1.0380568504333496, "rewards/rejected": 2.9176957607269287, "step": 11567 }, { "epoch": 1.88, "learning_rate": 1.2538283656386318e-07, "logits/chosen": -0.7804995179176331, "logits/rejected": -0.7085992693901062, "logps/chosen": -104.87702941894531, "logps/rejected": -39.73917007446289, "loss": 1.9364, "rewards/accuracies": 1.0, "rewards/chosen": 5.136518955230713, "rewards/margins": 2.6904513835906982, "rewards/rejected": 2.4460675716400146, "step": 11568 }, { "epoch": 1.88, "learning_rate": 1.252958058085289e-07, "logits/chosen": -0.9496018290519714, "logits/rejected": -0.9393016695976257, "logps/chosen": -59.07343292236328, "logps/rejected": -62.858585357666016, "loss": 0.5646, "rewards/accuracies": 1.0, "rewards/chosen": 2.8361122608184814, "rewards/margins": 0.9171352386474609, "rewards/rejected": 1.9189770221710205, "step": 11569 }, { "epoch": 1.88, "learning_rate": 1.2520880094170877e-07, "logits/chosen": -0.7205846309661865, "logits/rejected": -0.7283895611763, "logps/chosen": -133.75027465820312, "logps/rejected": -152.92031860351562, "loss": 2.5033, "rewards/accuracies": 0.0, "rewards/chosen": 0.748852550983429, "rewards/margins": -3.9825351238250732, "rewards/rejected": 4.731387615203857, "step": 11570 }, { "epoch": 1.88, "learning_rate": 1.2512182196941434e-07, "logits/chosen": -0.6669979691505432, "logits/rejected": -0.6098472476005554, "logps/chosen": -34.12187194824219, "logps/rejected": -58.962284088134766, "loss": 0.5768, "rewards/accuracies": 0.0, "rewards/chosen": 1.3573440313339233, "rewards/margins": -0.3133518695831299, "rewards/rejected": 1.6706959009170532, "step": 11571 }, { "epoch": 1.88, "learning_rate": 1.2503486889765474e-07, "logits/chosen": -1.0889980792999268, "logits/rejected": -1.0367181301116943, "logps/chosen": -83.21197509765625, "logps/rejected": -77.81332397460938, "loss": 1.304, "rewards/accuracies": 0.0, "rewards/chosen": 1.0566262006759644, "rewards/margins": -2.390941619873047, "rewards/rejected": 3.4475677013397217, "step": 11572 }, { "epoch": 1.88, "learning_rate": 1.249479417324379e-07, "logits/chosen": -0.30054980516433716, "logits/rejected": -0.30054980516433716, "logps/chosen": -37.45849609375, "logps/rejected": -37.45849609375, "loss": 0.4847, "rewards/accuracies": 0.0, "rewards/chosen": 1.7301349639892578, "rewards/margins": 0.0, "rewards/rejected": 1.7301349639892578, "step": 11573 }, { "epoch": 1.88, "learning_rate": 1.2486104047976937e-07, "logits/chosen": -0.9056329727172852, "logits/rejected": -0.8267223834991455, "logps/chosen": -36.05344009399414, "logps/rejected": -60.659210205078125, "loss": 0.3448, "rewards/accuracies": 1.0, "rewards/chosen": 1.7713794708251953, "rewards/margins": 0.232283353805542, "rewards/rejected": 1.5390961170196533, "step": 11574 }, { "epoch": 1.88, "learning_rate": 1.2477416514565348e-07, "logits/chosen": -0.36753326654434204, "logits/rejected": -0.23152804374694824, "logps/chosen": -85.47097778320312, "logps/rejected": -119.113037109375, "loss": 0.5161, "rewards/accuracies": 1.0, "rewards/chosen": 4.813481330871582, "rewards/margins": 0.7072405815124512, "rewards/rejected": 4.106240749359131, "step": 11575 }, { "epoch": 1.88, "learning_rate": 1.246873157360922e-07, "logits/chosen": -0.8598206639289856, "logits/rejected": -0.7320667505264282, "logps/chosen": -153.0273895263672, "logps/rejected": -50.943904876708984, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 4.859580993652344, "rewards/margins": 3.2347569465637207, "rewards/rejected": 1.6248241662979126, "step": 11576 }, { "epoch": 1.88, "learning_rate": 1.2460049225708636e-07, "logits/chosen": -0.46369725465774536, "logits/rejected": -0.48284226655960083, "logps/chosen": -69.2758560180664, "logps/rejected": -50.093353271484375, "loss": 0.9792, "rewards/accuracies": 0.0, "rewards/chosen": 1.832661509513855, "rewards/margins": -0.7766770124435425, "rewards/rejected": 2.6093385219573975, "step": 11577 }, { "epoch": 1.88, "learning_rate": 1.2451369471463423e-07, "logits/chosen": -1.0445841550827026, "logits/rejected": -1.0721707344055176, "logps/chosen": -111.99636840820312, "logps/rejected": -82.09928894042969, "loss": 0.5519, "rewards/accuracies": 0.0, "rewards/chosen": 1.2304130792617798, "rewards/margins": -0.6949164867401123, "rewards/rejected": 1.925329566001892, "step": 11578 }, { "epoch": 1.88, "learning_rate": 1.2442692311473307e-07, "logits/chosen": -0.4538159966468811, "logits/rejected": -0.4080781936645508, "logps/chosen": -34.26960754394531, "logps/rejected": -92.1998519897461, "loss": 1.4198, "rewards/accuracies": 0.0, "rewards/chosen": 1.116006851196289, "rewards/margins": -2.3683674335479736, "rewards/rejected": 3.4843742847442627, "step": 11579 }, { "epoch": 1.88, "learning_rate": 1.243401774633777e-07, "logits/chosen": -0.6707376837730408, "logits/rejected": -0.6707376837730408, "logps/chosen": -61.375431060791016, "logps/rejected": -61.375431060791016, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.8754131197929382, "rewards/margins": 0.0, "rewards/rejected": 0.8754131197929382, "step": 11580 }, { "epoch": 1.88, "learning_rate": 1.2425345776656165e-07, "logits/chosen": -0.6128578782081604, "logits/rejected": -0.5436263084411621, "logps/chosen": -51.819393157958984, "logps/rejected": -64.98324584960938, "loss": 0.2825, "rewards/accuracies": 1.0, "rewards/chosen": 1.8421863317489624, "rewards/margins": 0.3886340856552124, "rewards/rejected": 1.45355224609375, "step": 11581 }, { "epoch": 1.88, "learning_rate": 1.241667640302762e-07, "logits/chosen": -0.7328188419342041, "logits/rejected": -0.6529113054275513, "logps/chosen": -89.68569946289062, "logps/rejected": -66.8574447631836, "loss": 0.596, "rewards/accuracies": 1.0, "rewards/chosen": 3.442707061767578, "rewards/margins": 2.104733943939209, "rewards/rejected": 1.3379729986190796, "step": 11582 }, { "epoch": 1.88, "learning_rate": 1.2408009626051135e-07, "logits/chosen": -0.8112335801124573, "logits/rejected": -0.7771422863006592, "logps/chosen": -67.53472137451172, "logps/rejected": -107.52459716796875, "loss": 0.8179, "rewards/accuracies": 0.0, "rewards/chosen": 2.329427480697632, "rewards/margins": -1.2957098484039307, "rewards/rejected": 3.6251373291015625, "step": 11583 }, { "epoch": 1.88, "learning_rate": 1.239934544632547e-07, "logits/chosen": -1.0172674655914307, "logits/rejected": -0.9216132760047913, "logps/chosen": -232.30718994140625, "logps/rejected": -398.6591796875, "loss": 1.4755, "rewards/accuracies": 0.0, "rewards/chosen": 4.625683784484863, "rewards/margins": -2.7013120651245117, "rewards/rejected": 7.326995849609375, "step": 11584 }, { "epoch": 1.88, "learning_rate": 1.239068386444927e-07, "logits/chosen": -0.7131273746490479, "logits/rejected": -0.572401762008667, "logps/chosen": -96.05547332763672, "logps/rejected": -71.089111328125, "loss": 1.5658, "rewards/accuracies": 1.0, "rewards/chosen": 1.5688332319259644, "rewards/margins": 0.3170250654220581, "rewards/rejected": 1.2518081665039062, "step": 11585 }, { "epoch": 1.88, "learning_rate": 1.2382024881020936e-07, "logits/chosen": -0.8379786610603333, "logits/rejected": -0.840320885181427, "logps/chosen": -105.34407043457031, "logps/rejected": -75.39726257324219, "loss": 0.6923, "rewards/accuracies": 0.0, "rewards/chosen": 0.6096252799034119, "rewards/margins": -1.0318541526794434, "rewards/rejected": 1.6414794921875, "step": 11586 }, { "epoch": 1.88, "learning_rate": 1.2373368496638752e-07, "logits/chosen": -0.7324205636978149, "logits/rejected": -0.7554844617843628, "logps/chosen": -256.10394287109375, "logps/rejected": -133.91622924804688, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 3.172991991043091, "rewards/margins": 0.7668273448944092, "rewards/rejected": 2.4061646461486816, "step": 11587 }, { "epoch": 1.88, "learning_rate": 1.2364714711900765e-07, "logits/chosen": -0.7856605648994446, "logits/rejected": -0.6349999904632568, "logps/chosen": -73.29147338867188, "logps/rejected": -57.90819549560547, "loss": 0.7538, "rewards/accuracies": 0.0, "rewards/chosen": 1.7828963994979858, "rewards/margins": -0.4847222566604614, "rewards/rejected": 2.2676186561584473, "step": 11588 }, { "epoch": 1.88, "learning_rate": 1.235606352740488e-07, "logits/chosen": -0.7929086685180664, "logits/rejected": -0.7457414269447327, "logps/chosen": -60.658531188964844, "logps/rejected": -65.26263427734375, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 2.9489800930023193, "rewards/margins": 1.0790222883224487, "rewards/rejected": 1.8699578046798706, "step": 11589 }, { "epoch": 1.88, "learning_rate": 1.2347414943748836e-07, "logits/chosen": -0.7443480491638184, "logits/rejected": -0.7266851663589478, "logps/chosen": -66.3820571899414, "logps/rejected": -104.98063659667969, "loss": 0.5273, "rewards/accuracies": 0.0, "rewards/chosen": 1.1932014226913452, "rewards/margins": -0.4176872968673706, "rewards/rejected": 1.6108887195587158, "step": 11590 }, { "epoch": 1.88, "learning_rate": 1.2338768961530128e-07, "logits/chosen": -0.3964162766933441, "logits/rejected": -0.2868848741054535, "logps/chosen": -84.26872253417969, "logps/rejected": -68.02842712402344, "loss": 0.1632, "rewards/accuracies": 1.0, "rewards/chosen": 4.169686317443848, "rewards/margins": 1.7024788856506348, "rewards/rejected": 2.467207431793213, "step": 11591 }, { "epoch": 1.88, "learning_rate": 1.2330125581346146e-07, "logits/chosen": -0.7797145247459412, "logits/rejected": -1.0792574882507324, "logps/chosen": -48.186058044433594, "logps/rejected": -52.29581832885742, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 0.7351360321044922, "rewards/margins": 0.5555473566055298, "rewards/rejected": 0.1795887053012848, "step": 11592 }, { "epoch": 1.88, "learning_rate": 1.2321484803794037e-07, "logits/chosen": -0.46873605251312256, "logits/rejected": -0.4405055046081543, "logps/chosen": -24.748062133789062, "logps/rejected": -4.74742317199707, "loss": 0.4974, "rewards/accuracies": 0.0, "rewards/chosen": -0.14794807136058807, "rewards/margins": -0.4100152254104614, "rewards/rejected": 0.26206716895103455, "step": 11593 }, { "epoch": 1.88, "learning_rate": 1.2312846629470824e-07, "logits/chosen": -0.9056442379951477, "logits/rejected": -0.8893208503723145, "logps/chosen": -77.0665283203125, "logps/rejected": -53.96125793457031, "loss": 1.8819, "rewards/accuracies": 0.0, "rewards/chosen": 0.9423751831054688, "rewards/margins": -0.3381866216659546, "rewards/rejected": 1.2805618047714233, "step": 11594 }, { "epoch": 1.88, "learning_rate": 1.2304211058973295e-07, "logits/chosen": -0.6129536628723145, "logits/rejected": -0.6427484154701233, "logps/chosen": -43.72796630859375, "logps/rejected": -52.05876922607422, "loss": 1.0516, "rewards/accuracies": 0.0, "rewards/chosen": 1.556470513343811, "rewards/margins": -0.827372670173645, "rewards/rejected": 2.383843183517456, "step": 11595 }, { "epoch": 1.88, "learning_rate": 1.229557809289812e-07, "logits/chosen": -1.055938959121704, "logits/rejected": -1.0263400077819824, "logps/chosen": -148.69747924804688, "logps/rejected": -102.88921356201172, "loss": 0.2311, "rewards/accuracies": 1.0, "rewards/chosen": 4.224557399749756, "rewards/margins": 0.5932576656341553, "rewards/rejected": 3.6312997341156006, "step": 11596 }, { "epoch": 1.88, "learning_rate": 1.2286947731841712e-07, "logits/chosen": -0.6887428760528564, "logits/rejected": -0.7326096296310425, "logps/chosen": -44.80018615722656, "logps/rejected": -61.16051483154297, "loss": 0.8383, "rewards/accuracies": 0.0, "rewards/chosen": 1.2519844770431519, "rewards/margins": -0.2250518798828125, "rewards/rejected": 1.4770363569259644, "step": 11597 }, { "epoch": 1.88, "learning_rate": 1.2278319976400392e-07, "logits/chosen": -0.7545543909072876, "logits/rejected": -0.7545543909072876, "logps/chosen": -58.318443298339844, "logps/rejected": -58.318443298339844, "loss": 0.4787, "rewards/accuracies": 0.0, "rewards/chosen": 1.3609619140625, "rewards/margins": 0.0, "rewards/rejected": 1.3609619140625, "step": 11598 }, { "epoch": 1.88, "learning_rate": 1.2269694827170223e-07, "logits/chosen": -0.8175550103187561, "logits/rejected": -0.7149890661239624, "logps/chosen": -86.76161193847656, "logps/rejected": -72.84515380859375, "loss": 1.4902, "rewards/accuracies": 1.0, "rewards/chosen": 5.336813449859619, "rewards/margins": 1.533186435699463, "rewards/rejected": 3.8036270141601562, "step": 11599 }, { "epoch": 1.88, "learning_rate": 1.2261072284747148e-07, "logits/chosen": -1.1390862464904785, "logits/rejected": -1.0831772089004517, "logps/chosen": -77.29849243164062, "logps/rejected": -75.83639526367188, "loss": 0.3459, "rewards/accuracies": 1.0, "rewards/chosen": 3.3776283264160156, "rewards/margins": 0.06585144996643066, "rewards/rejected": 3.311776876449585, "step": 11600 }, { "epoch": 1.88, "learning_rate": 1.225245234972689e-07, "logits/chosen": -0.5393888354301453, "logits/rejected": -0.4219535291194916, "logps/chosen": -83.28507995605469, "logps/rejected": -23.782909393310547, "loss": 0.8733, "rewards/accuracies": 1.0, "rewards/chosen": 1.4257690906524658, "rewards/margins": 1.166832447052002, "rewards/rejected": 0.25893670320510864, "step": 11601 }, { "epoch": 1.88, "learning_rate": 1.2243835022705001e-07, "logits/chosen": -0.771363377571106, "logits/rejected": -0.7697678208351135, "logps/chosen": -135.26589965820312, "logps/rejected": -119.15202331542969, "loss": 0.4052, "rewards/accuracies": 0.0, "rewards/chosen": 4.673770427703857, "rewards/margins": -0.008701801300048828, "rewards/rejected": 4.682472229003906, "step": 11602 }, { "epoch": 1.88, "learning_rate": 1.2235220304276845e-07, "logits/chosen": -0.8100758790969849, "logits/rejected": -0.8967480659484863, "logps/chosen": -212.80728149414062, "logps/rejected": -90.96183776855469, "loss": 0.5539, "rewards/accuracies": 0.0, "rewards/chosen": 3.5267274379730225, "rewards/margins": -0.56502366065979, "rewards/rejected": 4.0917510986328125, "step": 11603 }, { "epoch": 1.88, "learning_rate": 1.2226608195037647e-07, "logits/chosen": -0.719662070274353, "logits/rejected": -0.6697070598602295, "logps/chosen": -116.75466918945312, "logps/rejected": -79.90650939941406, "loss": 1.6499, "rewards/accuracies": 0.0, "rewards/chosen": 1.4407501220703125, "rewards/margins": -0.822268009185791, "rewards/rejected": 2.2630181312561035, "step": 11604 }, { "epoch": 1.88, "learning_rate": 1.2217998695582393e-07, "logits/chosen": -0.4133656322956085, "logits/rejected": -0.27482131123542786, "logps/chosen": -148.07017517089844, "logps/rejected": -46.680694580078125, "loss": 0.8573, "rewards/accuracies": 0.0, "rewards/chosen": -0.3054001033306122, "rewards/margins": -0.46168291568756104, "rewards/rejected": 0.15628281235694885, "step": 11605 }, { "epoch": 1.88, "learning_rate": 1.2209391806505946e-07, "logits/chosen": -1.10813307762146, "logits/rejected": -1.0662744045257568, "logps/chosen": -70.48551177978516, "logps/rejected": -87.46719360351562, "loss": 0.6174, "rewards/accuracies": 1.0, "rewards/chosen": 2.14031982421875, "rewards/margins": 1.2871993780136108, "rewards/rejected": 0.8531204462051392, "step": 11606 }, { "epoch": 1.88, "learning_rate": 1.220078752840294e-07, "logits/chosen": -1.0236150026321411, "logits/rejected": -1.2612313032150269, "logps/chosen": -102.26148223876953, "logps/rejected": -34.07060623168945, "loss": 0.151, "rewards/accuracies": 1.0, "rewards/chosen": 1.6465263366699219, "rewards/margins": 1.3283374309539795, "rewards/rejected": 0.31818887591362, "step": 11607 }, { "epoch": 1.88, "learning_rate": 1.2192185861867865e-07, "logits/chosen": -0.6328271627426147, "logits/rejected": -0.5117208361625671, "logps/chosen": -63.39319610595703, "logps/rejected": -76.49641418457031, "loss": 0.263, "rewards/accuracies": 1.0, "rewards/chosen": 1.7871315479278564, "rewards/margins": 0.6721458435058594, "rewards/rejected": 1.114985704421997, "step": 11608 }, { "epoch": 1.88, "learning_rate": 1.218358680749499e-07, "logits/chosen": -0.774607241153717, "logits/rejected": -0.6949832439422607, "logps/chosen": -139.29234313964844, "logps/rejected": -104.34162139892578, "loss": 1.3824, "rewards/accuracies": 1.0, "rewards/chosen": 1.0745528936386108, "rewards/margins": 0.012100934982299805, "rewards/rejected": 1.062451958656311, "step": 11609 }, { "epoch": 1.88, "learning_rate": 1.2174990365878446e-07, "logits/chosen": -0.7970901727676392, "logits/rejected": -0.8683919906616211, "logps/chosen": -55.96485137939453, "logps/rejected": -87.36341857910156, "loss": 1.2247, "rewards/accuracies": 0.0, "rewards/chosen": 1.0043572187423706, "rewards/margins": -0.8508857488632202, "rewards/rejected": 1.8552429676055908, "step": 11610 }, { "epoch": 1.88, "learning_rate": 1.2166396537612178e-07, "logits/chosen": -0.68923419713974, "logits/rejected": -0.6505564451217651, "logps/chosen": -38.43588638305664, "logps/rejected": -63.452598571777344, "loss": 0.4243, "rewards/accuracies": 0.0, "rewards/chosen": 1.8619701862335205, "rewards/margins": -0.23372602462768555, "rewards/rejected": 2.095696210861206, "step": 11611 }, { "epoch": 1.88, "learning_rate": 1.215780532328991e-07, "logits/chosen": -0.8020870685577393, "logits/rejected": -0.593680202960968, "logps/chosen": -104.4825210571289, "logps/rejected": -72.83744812011719, "loss": 0.3999, "rewards/accuracies": 1.0, "rewards/chosen": 3.8518226146698, "rewards/margins": 1.1704115867614746, "rewards/rejected": 2.681411027908325, "step": 11612 }, { "epoch": 1.88, "learning_rate": 1.2149216723505245e-07, "logits/chosen": -0.7421135902404785, "logits/rejected": -0.7017305493354797, "logps/chosen": -135.96871948242188, "logps/rejected": -74.19467163085938, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 7.288577556610107, "rewards/margins": 5.252109527587891, "rewards/rejected": 2.0364677906036377, "step": 11613 }, { "epoch": 1.89, "learning_rate": 1.2140630738851543e-07, "logits/chosen": -0.8300330638885498, "logits/rejected": -0.7874186635017395, "logps/chosen": -75.86692810058594, "logps/rejected": -4.197689056396484, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 1.5863227844238281, "rewards/margins": 0.9632562398910522, "rewards/rejected": 0.6230665445327759, "step": 11614 }, { "epoch": 1.89, "learning_rate": 1.2132047369922038e-07, "logits/chosen": -0.856683075428009, "logits/rejected": -0.8451488018035889, "logps/chosen": -70.96764373779297, "logps/rejected": -69.76592254638672, "loss": 0.4494, "rewards/accuracies": 1.0, "rewards/chosen": 3.2114243507385254, "rewards/margins": 0.14175724983215332, "rewards/rejected": 3.069667100906372, "step": 11615 }, { "epoch": 1.89, "learning_rate": 1.2123466617309742e-07, "logits/chosen": -0.6089493036270142, "logits/rejected": -0.5686548948287964, "logps/chosen": -60.31694030761719, "logps/rejected": -60.939361572265625, "loss": 0.4552, "rewards/accuracies": 0.0, "rewards/chosen": 1.5412139892578125, "rewards/margins": -0.09613192081451416, "rewards/rejected": 1.6373459100723267, "step": 11616 }, { "epoch": 1.89, "learning_rate": 1.2114888481607521e-07, "logits/chosen": -0.6527578234672546, "logits/rejected": -0.6312196850776672, "logps/chosen": -119.52383422851562, "logps/rejected": -166.12469482421875, "loss": 1.0508, "rewards/accuracies": 0.0, "rewards/chosen": 4.522250652313232, "rewards/margins": -1.9695663452148438, "rewards/rejected": 6.491816997528076, "step": 11617 }, { "epoch": 1.89, "learning_rate": 1.2106312963408023e-07, "logits/chosen": -0.8644636273384094, "logits/rejected": -0.7760361433029175, "logps/chosen": -44.09744644165039, "logps/rejected": -84.43912506103516, "loss": 0.899, "rewards/accuracies": 0.0, "rewards/chosen": 2.2469851970672607, "rewards/margins": -1.080916404724121, "rewards/rejected": 3.327901601791382, "step": 11618 }, { "epoch": 1.89, "learning_rate": 1.2097740063303752e-07, "logits/chosen": -0.2524040639400482, "logits/rejected": -0.2524040639400482, "logps/chosen": -43.91230010986328, "logps/rejected": -43.91230010986328, "loss": 0.9987, "rewards/accuracies": 0.0, "rewards/chosen": 0.3639114499092102, "rewards/margins": 0.0, "rewards/rejected": 0.3639114499092102, "step": 11619 }, { "epoch": 1.89, "learning_rate": 1.2089169781886998e-07, "logits/chosen": -0.6853998303413391, "logits/rejected": -0.6853998303413391, "logps/chosen": -0.7409961223602295, "logps/rejected": -0.7409961223602295, "loss": 0.4195, "rewards/accuracies": 0.0, "rewards/chosen": 0.2465573102235794, "rewards/margins": 0.0, "rewards/rejected": 0.2465573102235794, "step": 11620 }, { "epoch": 1.89, "learning_rate": 1.2080602119749915e-07, "logits/chosen": -1.1141602993011475, "logits/rejected": -0.9845085740089417, "logps/chosen": -150.36778259277344, "logps/rejected": -129.6676025390625, "loss": 0.4706, "rewards/accuracies": 0.0, "rewards/chosen": 4.557827949523926, "rewards/margins": -0.080718994140625, "rewards/rejected": 4.638546943664551, "step": 11621 }, { "epoch": 1.89, "learning_rate": 1.2072037077484414e-07, "logits/chosen": -0.8767584562301636, "logits/rejected": -0.9244124889373779, "logps/chosen": -168.64242553710938, "logps/rejected": -127.33137512207031, "loss": 0.0493, "rewards/accuracies": 1.0, "rewards/chosen": 3.713177442550659, "rewards/margins": 2.5520598888397217, "rewards/rejected": 1.1611175537109375, "step": 11622 }, { "epoch": 1.89, "learning_rate": 1.206347465568228e-07, "logits/chosen": -0.754050076007843, "logits/rejected": -0.8099150061607361, "logps/chosen": -54.30979919433594, "logps/rejected": -97.94761657714844, "loss": 1.7787, "rewards/accuracies": 0.0, "rewards/chosen": 1.6731064319610596, "rewards/margins": -3.355961561203003, "rewards/rejected": 5.0290679931640625, "step": 11623 }, { "epoch": 1.89, "learning_rate": 1.2054914854935083e-07, "logits/chosen": -0.5760564208030701, "logits/rejected": -0.5760564208030701, "logps/chosen": -19.544382095336914, "logps/rejected": -19.544382095336914, "loss": 0.5253, "rewards/accuracies": 0.0, "rewards/chosen": 0.046767424792051315, "rewards/margins": 0.0, "rewards/rejected": 0.046767424792051315, "step": 11624 }, { "epoch": 1.89, "learning_rate": 1.204635767583424e-07, "logits/chosen": -1.0323474407196045, "logits/rejected": -0.8686875104904175, "logps/chosen": -108.8689956665039, "logps/rejected": -21.629383087158203, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": 6.802089691162109, "rewards/margins": 6.43452262878418, "rewards/rejected": 0.3675670623779297, "step": 11625 }, { "epoch": 1.89, "learning_rate": 1.2037803118970947e-07, "logits/chosen": -0.8163884878158569, "logits/rejected": -0.6220961809158325, "logps/chosen": -111.08460998535156, "logps/rejected": -96.02119445800781, "loss": 0.1253, "rewards/accuracies": 1.0, "rewards/chosen": 5.548994541168213, "rewards/margins": 3.804068088531494, "rewards/rejected": 1.7449264526367188, "step": 11626 }, { "epoch": 1.89, "learning_rate": 1.2029251184936274e-07, "logits/chosen": -0.507634162902832, "logits/rejected": -0.49583205580711365, "logps/chosen": -38.983585357666016, "logps/rejected": -91.91361236572266, "loss": 0.9881, "rewards/accuracies": 0.0, "rewards/chosen": 2.30944561958313, "rewards/margins": -1.5578525066375732, "rewards/rejected": 3.867298126220703, "step": 11627 }, { "epoch": 1.89, "learning_rate": 1.2020701874321042e-07, "logits/chosen": -0.4126286506652832, "logits/rejected": -0.4893696904182434, "logps/chosen": -68.84832763671875, "logps/rejected": -48.31932067871094, "loss": 2.5992, "rewards/accuracies": 1.0, "rewards/chosen": 1.5382232666015625, "rewards/margins": 0.09360349178314209, "rewards/rejected": 1.4446197748184204, "step": 11628 }, { "epoch": 1.89, "learning_rate": 1.2012155187715967e-07, "logits/chosen": -0.4124079644680023, "logits/rejected": -0.41532251238822937, "logps/chosen": -81.18051147460938, "logps/rejected": -43.296295166015625, "loss": 0.6595, "rewards/accuracies": 0.0, "rewards/chosen": 1.4405243396759033, "rewards/margins": -0.7159836292266846, "rewards/rejected": 2.156507968902588, "step": 11629 }, { "epoch": 1.89, "learning_rate": 1.2003611125711503e-07, "logits/chosen": -0.8675827980041504, "logits/rejected": -0.9493355751037598, "logps/chosen": -63.47386932373047, "logps/rejected": -154.34735107421875, "loss": 3.4323, "rewards/accuracies": 0.0, "rewards/chosen": 0.9747001528739929, "rewards/margins": -5.536585330963135, "rewards/rejected": 6.511285305023193, "step": 11630 }, { "epoch": 1.89, "learning_rate": 1.1995069688898003e-07, "logits/chosen": -0.49015671014785767, "logits/rejected": -0.4486449658870697, "logps/chosen": -45.248191833496094, "logps/rejected": -53.546234130859375, "loss": 0.5011, "rewards/accuracies": 0.0, "rewards/chosen": 1.217108964920044, "rewards/margins": -0.3993041515350342, "rewards/rejected": 1.6164131164550781, "step": 11631 }, { "epoch": 1.89, "learning_rate": 1.1986530877865568e-07, "logits/chosen": -0.7149522304534912, "logits/rejected": -0.6381545662879944, "logps/chosen": -126.4935302734375, "logps/rejected": -113.96279907226562, "loss": 2.751, "rewards/accuracies": 1.0, "rewards/chosen": 1.8589966297149658, "rewards/margins": 0.9933281540870667, "rewards/rejected": 0.8656684756278992, "step": 11632 }, { "epoch": 1.89, "learning_rate": 1.1977994693204175e-07, "logits/chosen": -0.6942201852798462, "logits/rejected": -0.6587759256362915, "logps/chosen": -75.94927978515625, "logps/rejected": -67.19720458984375, "loss": 1.2901, "rewards/accuracies": 0.0, "rewards/chosen": 1.4584388732910156, "rewards/margins": -0.36952435970306396, "rewards/rejected": 1.8279632329940796, "step": 11633 }, { "epoch": 1.89, "learning_rate": 1.1969461135503572e-07, "logits/chosen": -1.0444285869598389, "logits/rejected": -1.0937821865081787, "logps/chosen": -273.159423828125, "logps/rejected": -126.606689453125, "loss": 0.8767, "rewards/accuracies": 0.0, "rewards/chosen": 4.474841594696045, "rewards/margins": -1.3056349754333496, "rewards/rejected": 5.7804765701293945, "step": 11634 }, { "epoch": 1.89, "learning_rate": 1.1960930205353365e-07, "logits/chosen": -0.727512538433075, "logits/rejected": -0.8073199391365051, "logps/chosen": -159.29318237304688, "logps/rejected": -78.23908996582031, "loss": 0.224, "rewards/accuracies": 1.0, "rewards/chosen": 2.1354873180389404, "rewards/margins": 0.7587020397186279, "rewards/rejected": 1.3767852783203125, "step": 11635 }, { "epoch": 1.89, "learning_rate": 1.195240190334294e-07, "logits/chosen": -0.9388865232467651, "logits/rejected": -0.934692919254303, "logps/chosen": -74.85181427001953, "logps/rejected": -107.135498046875, "loss": 1.4063, "rewards/accuracies": 0.0, "rewards/chosen": 1.431378960609436, "rewards/margins": -2.424069881439209, "rewards/rejected": 3.8554489612579346, "step": 11636 }, { "epoch": 1.89, "learning_rate": 1.1943876230061546e-07, "logits/chosen": -0.8303847908973694, "logits/rejected": -0.633635938167572, "logps/chosen": -149.2301025390625, "logps/rejected": -32.284515380859375, "loss": 0.8072, "rewards/accuracies": 1.0, "rewards/chosen": 4.659924507141113, "rewards/margins": 4.058775424957275, "rewards/rejected": 0.6011490225791931, "step": 11637 }, { "epoch": 1.89, "learning_rate": 1.19353531860982e-07, "logits/chosen": -0.6356663107872009, "logits/rejected": -0.6356663107872009, "logps/chosen": -17.829896926879883, "logps/rejected": -17.829896926879883, "loss": 0.3513, "rewards/accuracies": 0.0, "rewards/chosen": 1.3617178201675415, "rewards/margins": 0.0, "rewards/rejected": 1.3617178201675415, "step": 11638 }, { "epoch": 1.89, "learning_rate": 1.1926832772041796e-07, "logits/chosen": -0.47834813594818115, "logits/rejected": -0.3532082438468933, "logps/chosen": -40.961273193359375, "logps/rejected": -25.606298446655273, "loss": 0.3076, "rewards/accuracies": 1.0, "rewards/chosen": 1.6890029907226562, "rewards/margins": 1.3613497018814087, "rewards/rejected": 0.32765331864356995, "step": 11639 }, { "epoch": 1.89, "learning_rate": 1.1918314988480977e-07, "logits/chosen": -0.7287468314170837, "logits/rejected": -0.7214418053627014, "logps/chosen": -33.826717376708984, "logps/rejected": -63.80438995361328, "loss": 0.6065, "rewards/accuracies": 0.0, "rewards/chosen": 1.094062089920044, "rewards/margins": -0.6882331371307373, "rewards/rejected": 1.7822952270507812, "step": 11640 }, { "epoch": 1.89, "learning_rate": 1.1909799836004275e-07, "logits/chosen": -0.5253287553787231, "logits/rejected": -0.37326255440711975, "logps/chosen": -64.24805450439453, "logps/rejected": -28.287633895874023, "loss": 0.7128, "rewards/accuracies": 1.0, "rewards/chosen": 2.99778151512146, "rewards/margins": 2.110774517059326, "rewards/rejected": 0.887006938457489, "step": 11641 }, { "epoch": 1.89, "learning_rate": 1.1901287315199976e-07, "logits/chosen": -0.5506341457366943, "logits/rejected": -0.3526662588119507, "logps/chosen": -58.956512451171875, "logps/rejected": -52.22423553466797, "loss": 0.2549, "rewards/accuracies": 1.0, "rewards/chosen": 2.08953857421875, "rewards/margins": 1.4040405750274658, "rewards/rejected": 0.685498058795929, "step": 11642 }, { "epoch": 1.89, "learning_rate": 1.1892777426656248e-07, "logits/chosen": -0.8105945587158203, "logits/rejected": -0.7094877362251282, "logps/chosen": -200.14346313476562, "logps/rejected": -106.72288513183594, "loss": 1.3596, "rewards/accuracies": 0.0, "rewards/chosen": 3.537501573562622, "rewards/margins": -2.6497042179107666, "rewards/rejected": 6.187205791473389, "step": 11643 }, { "epoch": 1.89, "learning_rate": 1.1884270170961009e-07, "logits/chosen": -0.5768867135047913, "logits/rejected": -0.513857901096344, "logps/chosen": -39.17871856689453, "logps/rejected": -69.7707290649414, "loss": 0.4677, "rewards/accuracies": 1.0, "rewards/chosen": 2.442265272140503, "rewards/margins": 0.47613978385925293, "rewards/rejected": 1.96612548828125, "step": 11644 }, { "epoch": 1.89, "learning_rate": 1.187576554870205e-07, "logits/chosen": -0.9894945025444031, "logits/rejected": -0.9595181345939636, "logps/chosen": -94.81854248046875, "logps/rejected": -80.7080307006836, "loss": 0.8451, "rewards/accuracies": 1.0, "rewards/chosen": 3.192108154296875, "rewards/margins": 0.19228124618530273, "rewards/rejected": 2.9998269081115723, "step": 11645 }, { "epoch": 1.89, "learning_rate": 1.1867263560466967e-07, "logits/chosen": -0.5266060829162598, "logits/rejected": -0.578640878200531, "logps/chosen": -51.47242736816406, "logps/rejected": -53.861717224121094, "loss": 0.7374, "rewards/accuracies": 0.0, "rewards/chosen": 1.3375412225723267, "rewards/margins": -1.0481399297714233, "rewards/rejected": 2.38568115234375, "step": 11646 }, { "epoch": 1.89, "learning_rate": 1.1858764206843148e-07, "logits/chosen": -0.33165672421455383, "logits/rejected": -0.33165672421455383, "logps/chosen": -75.22319793701172, "logps/rejected": -75.22319793701172, "loss": 0.3641, "rewards/accuracies": 0.0, "rewards/chosen": 1.9359581470489502, "rewards/margins": 0.0, "rewards/rejected": 1.9359581470489502, "step": 11647 }, { "epoch": 1.89, "learning_rate": 1.1850267488417837e-07, "logits/chosen": -0.9217329621315002, "logits/rejected": -0.8827310800552368, "logps/chosen": -120.59799194335938, "logps/rejected": -129.6405487060547, "loss": 1.4713, "rewards/accuracies": 0.0, "rewards/chosen": 4.429454326629639, "rewards/margins": -2.7839431762695312, "rewards/rejected": 7.21339750289917, "step": 11648 }, { "epoch": 1.89, "learning_rate": 1.184177340577805e-07, "logits/chosen": -0.4740142524242401, "logits/rejected": -0.4855649471282959, "logps/chosen": -57.03926086425781, "logps/rejected": -60.55025863647461, "loss": 1.0705, "rewards/accuracies": 0.0, "rewards/chosen": 0.8408737182617188, "rewards/margins": -0.22834360599517822, "rewards/rejected": 1.069217324256897, "step": 11649 }, { "epoch": 1.89, "learning_rate": 1.1833281959510683e-07, "logits/chosen": -0.6367549896240234, "logits/rejected": -0.6287513971328735, "logps/chosen": -57.306304931640625, "logps/rejected": -78.87374877929688, "loss": 1.2033, "rewards/accuracies": 1.0, "rewards/chosen": 1.6636794805526733, "rewards/margins": 0.01056361198425293, "rewards/rejected": 1.6531158685684204, "step": 11650 }, { "epoch": 1.89, "learning_rate": 1.1824793150202378e-07, "logits/chosen": -0.7940659523010254, "logits/rejected": -0.6675357818603516, "logps/chosen": -107.48146057128906, "logps/rejected": -79.23223114013672, "loss": 0.0744, "rewards/accuracies": 1.0, "rewards/chosen": 3.670942783355713, "rewards/margins": 2.47624135017395, "rewards/rejected": 1.1947014331817627, "step": 11651 }, { "epoch": 1.89, "learning_rate": 1.1816306978439666e-07, "logits/chosen": -0.7275574207305908, "logits/rejected": -0.8500006794929504, "logps/chosen": -92.14120483398438, "logps/rejected": -118.58635711669922, "loss": 0.6657, "rewards/accuracies": 0.0, "rewards/chosen": 0.7512580752372742, "rewards/margins": -0.8769837021827698, "rewards/rejected": 1.628241777420044, "step": 11652 }, { "epoch": 1.89, "learning_rate": 1.1807823444808829e-07, "logits/chosen": -0.8324255347251892, "logits/rejected": -0.7338116765022278, "logps/chosen": -83.38322448730469, "logps/rejected": -47.62589645385742, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": 1.9207061529159546, "rewards/margins": 1.9497824907302856, "rewards/rejected": -0.029076386243104935, "step": 11653 }, { "epoch": 1.89, "learning_rate": 1.1799342549896024e-07, "logits/chosen": -0.8741217851638794, "logits/rejected": -0.7112934589385986, "logps/chosen": -51.35990905761719, "logps/rejected": -21.375202178955078, "loss": 0.2727, "rewards/accuracies": 1.0, "rewards/chosen": 2.580246686935425, "rewards/margins": 2.35244083404541, "rewards/rejected": 0.22780589759349823, "step": 11654 }, { "epoch": 1.89, "learning_rate": 1.1790864294287184e-07, "logits/chosen": -0.5449258089065552, "logits/rejected": -0.4719969928264618, "logps/chosen": -89.90326690673828, "logps/rejected": -75.20591735839844, "loss": 1.215, "rewards/accuracies": 0.0, "rewards/chosen": 0.7988640069961548, "rewards/margins": -0.5355994701385498, "rewards/rejected": 1.3344634771347046, "step": 11655 }, { "epoch": 1.89, "learning_rate": 1.1782388678568095e-07, "logits/chosen": -1.1496435403823853, "logits/rejected": -1.1433179378509521, "logps/chosen": -49.32598114013672, "logps/rejected": -101.2837142944336, "loss": 0.2198, "rewards/accuracies": 1.0, "rewards/chosen": 1.2703536748886108, "rewards/margins": 0.6318854689598083, "rewards/rejected": 0.6384682059288025, "step": 11656 }, { "epoch": 1.89, "learning_rate": 1.1773915703324316e-07, "logits/chosen": -1.003902792930603, "logits/rejected": -0.981387197971344, "logps/chosen": -84.9486083984375, "logps/rejected": -58.99327087402344, "loss": 0.4557, "rewards/accuracies": 0.0, "rewards/chosen": 1.619361162185669, "rewards/margins": -0.36396098136901855, "rewards/rejected": 1.9833221435546875, "step": 11657 }, { "epoch": 1.89, "learning_rate": 1.1765445369141275e-07, "logits/chosen": -0.619090735912323, "logits/rejected": -0.46721333265304565, "logps/chosen": -65.13575744628906, "logps/rejected": -11.538476943969727, "loss": 0.0768, "rewards/accuracies": 1.0, "rewards/chosen": 2.5724258422851562, "rewards/margins": 2.1096441745758057, "rewards/rejected": 0.46278172731399536, "step": 11658 }, { "epoch": 1.89, "learning_rate": 1.1756977676604169e-07, "logits/chosen": -0.43278294801712036, "logits/rejected": -0.678011953830719, "logps/chosen": -99.87033081054688, "logps/rejected": -105.92829132080078, "loss": 0.8094, "rewards/accuracies": 0.0, "rewards/chosen": 2.0330886840820312, "rewards/margins": -1.3822121620178223, "rewards/rejected": 3.4153008460998535, "step": 11659 }, { "epoch": 1.89, "learning_rate": 1.1748512626298057e-07, "logits/chosen": -0.8938329219818115, "logits/rejected": -0.9444001913070679, "logps/chosen": -144.2528076171875, "logps/rejected": -61.63853454589844, "loss": 1.8929, "rewards/accuracies": 0.0, "rewards/chosen": 0.5154327750205994, "rewards/margins": -1.1955139636993408, "rewards/rejected": 1.7109466791152954, "step": 11660 }, { "epoch": 1.89, "learning_rate": 1.1740050218807774e-07, "logits/chosen": -0.7753884792327881, "logits/rejected": -0.7431581020355225, "logps/chosen": -152.4505615234375, "logps/rejected": -56.285430908203125, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": 5.605096340179443, "rewards/margins": 3.369061231613159, "rewards/rejected": 2.236035108566284, "step": 11661 }, { "epoch": 1.89, "learning_rate": 1.173159045471801e-07, "logits/chosen": -1.0291575193405151, "logits/rejected": -0.6538837552070618, "logps/chosen": -87.23431396484375, "logps/rejected": -106.71128845214844, "loss": 1.0592, "rewards/accuracies": 0.0, "rewards/chosen": 3.208308458328247, "rewards/margins": -1.9715516567230225, "rewards/rejected": 5.1798601150512695, "step": 11662 }, { "epoch": 1.89, "learning_rate": 1.172313333461324e-07, "logits/chosen": -0.5095446705818176, "logits/rejected": -0.5138133764266968, "logps/chosen": -2.7589592933654785, "logps/rejected": -1.5458523035049438, "loss": 0.9933, "rewards/accuracies": 0.0, "rewards/chosen": 0.11181330680847168, "rewards/margins": -0.1813080906867981, "rewards/rejected": 0.2931213974952698, "step": 11663 }, { "epoch": 1.89, "learning_rate": 1.1714678859077787e-07, "logits/chosen": -0.5530076026916504, "logits/rejected": -0.483784556388855, "logps/chosen": -55.99378967285156, "logps/rejected": -75.21685028076172, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 1.7723296880722046, "rewards/margins": 0.024655818939208984, "rewards/rejected": 1.7476738691329956, "step": 11664 }, { "epoch": 1.89, "learning_rate": 1.1706227028695754e-07, "logits/chosen": -0.6315398216247559, "logits/rejected": -0.7150574326515198, "logps/chosen": -44.415008544921875, "logps/rejected": -42.411033630371094, "loss": 1.2681, "rewards/accuracies": 0.0, "rewards/chosen": 0.7266872525215149, "rewards/margins": -1.4649136066436768, "rewards/rejected": 2.191600799560547, "step": 11665 }, { "epoch": 1.89, "learning_rate": 1.1697777844051104e-07, "logits/chosen": -0.5033214092254639, "logits/rejected": -0.6630927324295044, "logps/chosen": -61.010009765625, "logps/rejected": -151.87069702148438, "loss": 1.5318, "rewards/accuracies": 0.0, "rewards/chosen": 1.4315757751464844, "rewards/margins": -3.010373592376709, "rewards/rejected": 4.441949367523193, "step": 11666 }, { "epoch": 1.89, "learning_rate": 1.1689331305727573e-07, "logits/chosen": -0.9057408571243286, "logits/rejected": -0.8185393214225769, "logps/chosen": -87.07589721679688, "logps/rejected": -67.93939971923828, "loss": 0.4224, "rewards/accuracies": 0.0, "rewards/chosen": 1.1305214166641235, "rewards/margins": -0.2432861328125, "rewards/rejected": 1.3738075494766235, "step": 11667 }, { "epoch": 1.89, "learning_rate": 1.1680887414308765e-07, "logits/chosen": -0.8369062542915344, "logits/rejected": -0.8757204413414001, "logps/chosen": -105.32038879394531, "logps/rejected": -195.58047485351562, "loss": 0.6087, "rewards/accuracies": 0.0, "rewards/chosen": 1.6064544916152954, "rewards/margins": -0.13828730583190918, "rewards/rejected": 1.7447417974472046, "step": 11668 }, { "epoch": 1.89, "learning_rate": 1.1672446170378047e-07, "logits/chosen": -0.6904470920562744, "logits/rejected": -0.6839455366134644, "logps/chosen": -76.23484802246094, "logps/rejected": -58.6794548034668, "loss": 0.2358, "rewards/accuracies": 1.0, "rewards/chosen": 2.634937286376953, "rewards/margins": 0.5419933795928955, "rewards/rejected": 2.0929439067840576, "step": 11669 }, { "epoch": 1.89, "learning_rate": 1.1664007574518652e-07, "logits/chosen": -0.7622435092926025, "logits/rejected": -0.48234570026397705, "logps/chosen": -70.78226470947266, "logps/rejected": -23.964977264404297, "loss": 0.6152, "rewards/accuracies": 1.0, "rewards/chosen": 3.5729081630706787, "rewards/margins": 3.446939468383789, "rewards/rejected": 0.12596873939037323, "step": 11670 }, { "epoch": 1.89, "learning_rate": 1.1655571627313582e-07, "logits/chosen": -0.615932285785675, "logits/rejected": -0.5769526958465576, "logps/chosen": -49.62887191772461, "logps/rejected": -56.9119987487793, "loss": 0.2189, "rewards/accuracies": 1.0, "rewards/chosen": 1.6862537860870361, "rewards/margins": 1.0528297424316406, "rewards/rejected": 0.6334239840507507, "step": 11671 }, { "epoch": 1.89, "learning_rate": 1.1647138329345708e-07, "logits/chosen": -0.7471955418586731, "logits/rejected": -0.7471955418586731, "logps/chosen": -74.87364196777344, "logps/rejected": -74.87364196777344, "loss": 0.7163, "rewards/accuracies": 0.0, "rewards/chosen": 1.7016174793243408, "rewards/margins": 0.0, "rewards/rejected": 1.7016174793243408, "step": 11672 }, { "epoch": 1.89, "learning_rate": 1.1638707681197658e-07, "logits/chosen": -0.52665776014328, "logits/rejected": -0.3735813796520233, "logps/chosen": -109.02363586425781, "logps/rejected": -30.080968856811523, "loss": 0.9569, "rewards/accuracies": 1.0, "rewards/chosen": 1.1159042119979858, "rewards/margins": 1.000414490699768, "rewards/rejected": 0.11548977345228195, "step": 11673 }, { "epoch": 1.89, "learning_rate": 1.1630279683451949e-07, "logits/chosen": -0.30764034390449524, "logits/rejected": -0.2854340970516205, "logps/chosen": -53.31934356689453, "logps/rejected": -16.56072998046875, "loss": 0.8438, "rewards/accuracies": 0.0, "rewards/chosen": -0.019326401874423027, "rewards/margins": -0.20283947885036469, "rewards/rejected": 0.1835130751132965, "step": 11674 }, { "epoch": 1.89, "learning_rate": 1.162185433669084e-07, "logits/chosen": -0.9012700915336609, "logits/rejected": -0.8140912652015686, "logps/chosen": -98.62003326416016, "logps/rejected": -132.5013427734375, "loss": 0.1288, "rewards/accuracies": 1.0, "rewards/chosen": 2.453542470932007, "rewards/margins": 1.290108561515808, "rewards/rejected": 1.1634339094161987, "step": 11675 }, { "epoch": 1.9, "learning_rate": 1.1613431641496474e-07, "logits/chosen": -0.61250901222229, "logits/rejected": -0.61250901222229, "logps/chosen": -60.1697883605957, "logps/rejected": -60.1697883605957, "loss": 0.6577, "rewards/accuracies": 0.0, "rewards/chosen": 1.7406399250030518, "rewards/margins": 0.0, "rewards/rejected": 1.7406399250030518, "step": 11676 }, { "epoch": 1.9, "learning_rate": 1.1605011598450748e-07, "logits/chosen": -0.5666615962982178, "logits/rejected": -0.6111446619033813, "logps/chosen": -50.559471130371094, "logps/rejected": -67.55064392089844, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 1.1395301818847656, "rewards/margins": -0.09762656688690186, "rewards/rejected": 1.2371567487716675, "step": 11677 }, { "epoch": 1.9, "learning_rate": 1.159659420813544e-07, "logits/chosen": -0.16352172195911407, "logits/rejected": -0.16352172195911407, "logps/chosen": -0.5159684419631958, "logps/rejected": -0.5159684419631958, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.13133877515792847, "rewards/margins": 0.0, "rewards/rejected": 0.13133877515792847, "step": 11678 }, { "epoch": 1.9, "learning_rate": 1.1588179471132081e-07, "logits/chosen": -0.19978027045726776, "logits/rejected": -0.215072363615036, "logps/chosen": -57.903350830078125, "logps/rejected": -59.394893646240234, "loss": 0.3529, "rewards/accuracies": 1.0, "rewards/chosen": 1.177119493484497, "rewards/margins": 0.4627567529678345, "rewards/rejected": 0.7143627405166626, "step": 11679 }, { "epoch": 1.9, "learning_rate": 1.1579767388022066e-07, "logits/chosen": -0.520565927028656, "logits/rejected": -0.6089799404144287, "logps/chosen": -144.60003662109375, "logps/rejected": -126.69383239746094, "loss": 0.5013, "rewards/accuracies": 0.0, "rewards/chosen": 6.200730800628662, "rewards/margins": -0.31814002990722656, "rewards/rejected": 6.518870830535889, "step": 11680 }, { "epoch": 1.9, "learning_rate": 1.1571357959386602e-07, "logits/chosen": -0.551967978477478, "logits/rejected": -0.5797336101531982, "logps/chosen": -36.42247772216797, "logps/rejected": -55.30731201171875, "loss": 0.473, "rewards/accuracies": 0.0, "rewards/chosen": 1.8616936206817627, "rewards/margins": -0.16167140007019043, "rewards/rejected": 2.023365020751953, "step": 11681 }, { "epoch": 1.9, "learning_rate": 1.1562951185806674e-07, "logits/chosen": -1.1281781196594238, "logits/rejected": -0.7264554500579834, "logps/chosen": -118.0443115234375, "logps/rejected": -63.053672790527344, "loss": 0.7691, "rewards/accuracies": 1.0, "rewards/chosen": 4.912275791168213, "rewards/margins": 2.347553253173828, "rewards/rejected": 2.5647225379943848, "step": 11682 }, { "epoch": 1.9, "learning_rate": 1.1554547067863135e-07, "logits/chosen": -0.5936600565910339, "logits/rejected": -0.6072065830230713, "logps/chosen": -101.91717529296875, "logps/rejected": -6.710907459259033, "loss": 0.3527, "rewards/accuracies": 1.0, "rewards/chosen": 0.4282432496547699, "rewards/margins": 0.01826605200767517, "rewards/rejected": 0.4099771976470947, "step": 11683 }, { "epoch": 1.9, "learning_rate": 1.1546145606136608e-07, "logits/chosen": -0.8614732623100281, "logits/rejected": -0.8846705555915833, "logps/chosen": -129.26840209960938, "logps/rejected": -61.777713775634766, "loss": 1.3079, "rewards/accuracies": 0.0, "rewards/chosen": 0.8202438354492188, "rewards/margins": -0.8628902435302734, "rewards/rejected": 1.6831340789794922, "step": 11684 }, { "epoch": 1.9, "learning_rate": 1.1537746801207582e-07, "logits/chosen": -0.7522718906402588, "logits/rejected": -0.7522718906402588, "logps/chosen": -66.21620178222656, "logps/rejected": -66.21620178222656, "loss": 1.2239, "rewards/accuracies": 0.0, "rewards/chosen": 1.624261498451233, "rewards/margins": 0.0, "rewards/rejected": 1.624261498451233, "step": 11685 }, { "epoch": 1.9, "learning_rate": 1.1529350653656306e-07, "logits/chosen": -1.1432942152023315, "logits/rejected": -0.9808107614517212, "logps/chosen": -148.580810546875, "logps/rejected": -145.52230834960938, "loss": 0.5852, "rewards/accuracies": 1.0, "rewards/chosen": 5.312816143035889, "rewards/margins": 0.0012531280517578125, "rewards/rejected": 5.311563014984131, "step": 11686 }, { "epoch": 1.9, "learning_rate": 1.1520957164062894e-07, "logits/chosen": -0.6972618699073792, "logits/rejected": -0.6838293671607971, "logps/chosen": -84.73817443847656, "logps/rejected": -25.650392532348633, "loss": 0.9881, "rewards/accuracies": 0.0, "rewards/chosen": 0.9572113156318665, "rewards/margins": -0.751756489276886, "rewards/rejected": 1.7089678049087524, "step": 11687 }, { "epoch": 1.9, "learning_rate": 1.1512566333007245e-07, "logits/chosen": -0.6928248405456543, "logits/rejected": -0.7241729497909546, "logps/chosen": -51.479400634765625, "logps/rejected": -109.30998229980469, "loss": 0.3713, "rewards/accuracies": 1.0, "rewards/chosen": 1.3262802362442017, "rewards/margins": 1.1096619367599487, "rewards/rejected": 0.2166183441877365, "step": 11688 }, { "epoch": 1.9, "learning_rate": 1.15041781610691e-07, "logits/chosen": -0.6811549067497253, "logits/rejected": -0.4799891412258148, "logps/chosen": -56.626304626464844, "logps/rejected": -83.7777099609375, "loss": 1.1286, "rewards/accuracies": 0.0, "rewards/chosen": 1.8609329462051392, "rewards/margins": -1.2550727128982544, "rewards/rejected": 3.1160056591033936, "step": 11689 }, { "epoch": 1.9, "learning_rate": 1.1495792648827979e-07, "logits/chosen": -0.960904598236084, "logits/rejected": -0.9251266121864319, "logps/chosen": -67.91605377197266, "logps/rejected": -76.47106170654297, "loss": 0.3757, "rewards/accuracies": 1.0, "rewards/chosen": 2.181267499923706, "rewards/margins": 1.2425856590270996, "rewards/rejected": 0.9386817812919617, "step": 11690 }, { "epoch": 1.9, "learning_rate": 1.1487409796863267e-07, "logits/chosen": -1.0015757083892822, "logits/rejected": -0.968837559223175, "logps/chosen": -128.39024353027344, "logps/rejected": -65.77531433105469, "loss": 0.4002, "rewards/accuracies": 0.0, "rewards/chosen": 1.8261139392852783, "rewards/margins": -0.1912398338317871, "rewards/rejected": 2.0173537731170654, "step": 11691 }, { "epoch": 1.9, "learning_rate": 1.147902960575412e-07, "logits/chosen": -0.6841571927070618, "logits/rejected": -0.5942234992980957, "logps/chosen": -65.76087188720703, "logps/rejected": -85.33847045898438, "loss": 1.0979, "rewards/accuracies": 0.0, "rewards/chosen": 1.536446452140808, "rewards/margins": -1.6751641035079956, "rewards/rejected": 3.2116105556488037, "step": 11692 }, { "epoch": 1.9, "learning_rate": 1.147065207607955e-07, "logits/chosen": -0.9411917328834534, "logits/rejected": -0.883409857749939, "logps/chosen": -65.23133850097656, "logps/rejected": -56.02763748168945, "loss": 0.7759, "rewards/accuracies": 0.0, "rewards/chosen": 1.0001404285430908, "rewards/margins": -1.0904948711395264, "rewards/rejected": 2.090635299682617, "step": 11693 }, { "epoch": 1.9, "learning_rate": 1.1462277208418336e-07, "logits/chosen": -0.7992401719093323, "logits/rejected": -0.80253005027771, "logps/chosen": -52.18321228027344, "logps/rejected": -63.745635986328125, "loss": 0.5933, "rewards/accuracies": 0.0, "rewards/chosen": 1.7829933166503906, "rewards/margins": -0.042598724365234375, "rewards/rejected": 1.825592041015625, "step": 11694 }, { "epoch": 1.9, "learning_rate": 1.145390500334914e-07, "logits/chosen": -0.9682420492172241, "logits/rejected": -0.8702263236045837, "logps/chosen": -107.82561492919922, "logps/rejected": -68.440185546875, "loss": 0.0497, "rewards/accuracies": 1.0, "rewards/chosen": 5.114670753479004, "rewards/margins": 2.5788979530334473, "rewards/rejected": 2.5357728004455566, "step": 11695 }, { "epoch": 1.9, "learning_rate": 1.1445535461450362e-07, "logits/chosen": -0.9873366951942444, "logits/rejected": -0.6874087452888489, "logps/chosen": -133.49618530273438, "logps/rejected": -60.562652587890625, "loss": 0.158, "rewards/accuracies": 1.0, "rewards/chosen": 4.842990398406982, "rewards/margins": 1.2023303508758545, "rewards/rejected": 3.640660047531128, "step": 11696 }, { "epoch": 1.9, "learning_rate": 1.14371685833003e-07, "logits/chosen": -0.26203951239585876, "logits/rejected": -0.2607647180557251, "logps/chosen": -2.4447765350341797, "logps/rejected": -5.213596343994141, "loss": 0.3463, "rewards/accuracies": 1.0, "rewards/chosen": 0.2818801999092102, "rewards/margins": 0.22148729860782623, "rewards/rejected": 0.06039290502667427, "step": 11697 }, { "epoch": 1.9, "learning_rate": 1.142880436947698e-07, "logits/chosen": -0.6787979602813721, "logits/rejected": -0.8014881610870361, "logps/chosen": -108.05117797851562, "logps/rejected": -65.72611236572266, "loss": 0.794, "rewards/accuracies": 1.0, "rewards/chosen": 4.122605800628662, "rewards/margins": 0.6862151622772217, "rewards/rejected": 3.4363906383514404, "step": 11698 }, { "epoch": 1.9, "learning_rate": 1.1420442820558335e-07, "logits/chosen": -0.9885475635528564, "logits/rejected": -1.0150121450424194, "logps/chosen": -176.99505615234375, "logps/rejected": -122.07257843017578, "loss": 0.9584, "rewards/accuracies": 0.0, "rewards/chosen": 0.8360443115234375, "rewards/margins": -1.6613852977752686, "rewards/rejected": 2.497429609298706, "step": 11699 }, { "epoch": 1.9, "learning_rate": 1.1412083937122031e-07, "logits/chosen": -0.7975297570228577, "logits/rejected": -0.79073166847229, "logps/chosen": -7.155146598815918, "logps/rejected": -4.477722644805908, "loss": 0.4243, "rewards/accuracies": 0.0, "rewards/chosen": 0.1371167153120041, "rewards/margins": -0.11977611482143402, "rewards/rejected": 0.2568928301334381, "step": 11700 }, { "epoch": 1.9, "learning_rate": 1.1403727719745621e-07, "logits/chosen": -0.898583710193634, "logits/rejected": -0.8014177083969116, "logps/chosen": -72.59242248535156, "logps/rejected": -60.328392028808594, "loss": 0.4943, "rewards/accuracies": 1.0, "rewards/chosen": 2.364912509918213, "rewards/margins": 0.6454300880432129, "rewards/rejected": 1.719482421875, "step": 11701 }, { "epoch": 1.9, "learning_rate": 1.1395374169006406e-07, "logits/chosen": -0.9574736952781677, "logits/rejected": -0.8192434310913086, "logps/chosen": -82.7994155883789, "logps/rejected": -111.49952697753906, "loss": 0.428, "rewards/accuracies": 1.0, "rewards/chosen": 3.4913575649261475, "rewards/margins": 0.13129901885986328, "rewards/rejected": 3.360058546066284, "step": 11702 }, { "epoch": 1.9, "learning_rate": 1.1387023285481573e-07, "logits/chosen": -0.920796811580658, "logits/rejected": -0.9561989307403564, "logps/chosen": -63.429222106933594, "logps/rejected": -106.96125793457031, "loss": 1.5747, "rewards/accuracies": 0.0, "rewards/chosen": 2.386335849761963, "rewards/margins": -3.101994514465332, "rewards/rejected": 5.488330364227295, "step": 11703 }, { "epoch": 1.9, "learning_rate": 1.1378675069748056e-07, "logits/chosen": -0.6402412056922913, "logits/rejected": -0.6334133148193359, "logps/chosen": -1.2515854835510254, "logps/rejected": -8.347686767578125, "loss": 0.6538, "rewards/accuracies": 1.0, "rewards/chosen": 0.3429841697216034, "rewards/margins": 0.13139674067497253, "rewards/rejected": 0.21158742904663086, "step": 11704 }, { "epoch": 1.9, "learning_rate": 1.1370329522382672e-07, "logits/chosen": -0.6171875596046448, "logits/rejected": -0.4958409070968628, "logps/chosen": -72.53738403320312, "logps/rejected": -67.72377014160156, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 1.41435706615448, "rewards/margins": 0.3374375104904175, "rewards/rejected": 1.0769195556640625, "step": 11705 }, { "epoch": 1.9, "learning_rate": 1.136198664396198e-07, "logits/chosen": -0.7215297222137451, "logits/rejected": -0.7633630633354187, "logps/chosen": -176.2436065673828, "logps/rejected": -154.729248046875, "loss": 0.3891, "rewards/accuracies": 0.0, "rewards/chosen": 4.782818794250488, "rewards/margins": -0.15090608596801758, "rewards/rejected": 4.933724880218506, "step": 11706 }, { "epoch": 1.9, "learning_rate": 1.135364643506243e-07, "logits/chosen": -0.708093523979187, "logits/rejected": -0.6801979541778564, "logps/chosen": -65.9638671875, "logps/rejected": -75.4267578125, "loss": 0.6477, "rewards/accuracies": 0.0, "rewards/chosen": 1.5996185541152954, "rewards/margins": -0.5004714727401733, "rewards/rejected": 2.1000900268554688, "step": 11707 }, { "epoch": 1.9, "learning_rate": 1.1345308896260237e-07, "logits/chosen": -0.3883940577507019, "logits/rejected": -0.437831848859787, "logps/chosen": -44.191062927246094, "logps/rejected": -58.405921936035156, "loss": 0.7222, "rewards/accuracies": 0.0, "rewards/chosen": 1.2781356573104858, "rewards/margins": -0.4525322914123535, "rewards/rejected": 1.7306679487228394, "step": 11708 }, { "epoch": 1.9, "learning_rate": 1.1336974028131436e-07, "logits/chosen": -0.6689133644104004, "logits/rejected": -0.6689133644104004, "logps/chosen": -25.036867141723633, "logps/rejected": -25.036867141723633, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": -0.04138508066534996, "rewards/margins": 0.0, "rewards/rejected": -0.04138508066534996, "step": 11709 }, { "epoch": 1.9, "learning_rate": 1.1328641831251906e-07, "logits/chosen": -0.9294773936271667, "logits/rejected": -0.7650139331817627, "logps/chosen": -176.97299194335938, "logps/rejected": -123.02117156982422, "loss": 0.3069, "rewards/accuracies": 1.0, "rewards/chosen": 2.6892929077148438, "rewards/margins": 0.1674644947052002, "rewards/rejected": 2.5218284130096436, "step": 11710 }, { "epoch": 1.9, "learning_rate": 1.1320312306197299e-07, "logits/chosen": -0.6318548917770386, "logits/rejected": -0.6318548917770386, "logps/chosen": -58.62560272216797, "logps/rejected": -58.62560272216797, "loss": 0.5126, "rewards/accuracies": 0.0, "rewards/chosen": 1.3386353254318237, "rewards/margins": 0.0, "rewards/rejected": 1.3386353254318237, "step": 11711 }, { "epoch": 1.9, "learning_rate": 1.1311985453543132e-07, "logits/chosen": -0.6167559027671814, "logits/rejected": -0.6095376014709473, "logps/chosen": -47.13768768310547, "logps/rejected": -138.13299560546875, "loss": 1.1495, "rewards/accuracies": 1.0, "rewards/chosen": 1.44873046875, "rewards/margins": 0.7866104245185852, "rewards/rejected": 0.6621200442314148, "step": 11712 }, { "epoch": 1.9, "learning_rate": 1.1303661273864696e-07, "logits/chosen": -0.883317768573761, "logits/rejected": -0.7976413369178772, "logps/chosen": -99.6561279296875, "logps/rejected": -62.72053909301758, "loss": 0.9115, "rewards/accuracies": 0.0, "rewards/chosen": 1.2034943103790283, "rewards/margins": -0.978217601776123, "rewards/rejected": 2.1817119121551514, "step": 11713 }, { "epoch": 1.9, "learning_rate": 1.1295339767737122e-07, "logits/chosen": -0.44259318709373474, "logits/rejected": -0.4659120440483093, "logps/chosen": -66.1285400390625, "logps/rejected": -54.77320861816406, "loss": 1.8456, "rewards/accuracies": 0.0, "rewards/chosen": 1.0423545837402344, "rewards/margins": -0.6545028686523438, "rewards/rejected": 1.6968574523925781, "step": 11714 }, { "epoch": 1.9, "learning_rate": 1.1287020935735336e-07, "logits/chosen": -0.27288737893104553, "logits/rejected": -0.27568286657333374, "logps/chosen": -79.89077758789062, "logps/rejected": -55.51527786254883, "loss": 0.5131, "rewards/accuracies": 0.0, "rewards/chosen": 1.5097945928573608, "rewards/margins": -0.3750072717666626, "rewards/rejected": 1.8848018646240234, "step": 11715 }, { "epoch": 1.9, "learning_rate": 1.1278704778434112e-07, "logits/chosen": -0.3252882659435272, "logits/rejected": -0.3526678681373596, "logps/chosen": -94.18055725097656, "logps/rejected": -53.783138275146484, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 0.3111312985420227, "rewards/margins": -0.9678836464881897, "rewards/rejected": 1.2790149450302124, "step": 11716 }, { "epoch": 1.9, "learning_rate": 1.1270391296407982e-07, "logits/chosen": -0.6261312365531921, "logits/rejected": -0.7564933896064758, "logps/chosen": -152.17791748046875, "logps/rejected": -106.75914001464844, "loss": 1.0855, "rewards/accuracies": 0.0, "rewards/chosen": 3.7872161865234375, "rewards/margins": -1.0049943923950195, "rewards/rejected": 4.792210578918457, "step": 11717 }, { "epoch": 1.9, "learning_rate": 1.1262080490231374e-07, "logits/chosen": -0.7437707781791687, "logits/rejected": -0.819780170917511, "logps/chosen": -86.5725326538086, "logps/rejected": -73.25218200683594, "loss": 1.4623, "rewards/accuracies": 0.0, "rewards/chosen": 2.2640328407287598, "rewards/margins": -1.5210075378417969, "rewards/rejected": 3.7850403785705566, "step": 11718 }, { "epoch": 1.9, "learning_rate": 1.1253772360478442e-07, "logits/chosen": -1.010709524154663, "logits/rejected": -0.9731048941612244, "logps/chosen": -83.37799072265625, "logps/rejected": -169.42169189453125, "loss": 1.0608, "rewards/accuracies": 0.0, "rewards/chosen": 5.379769802093506, "rewards/margins": -1.8375244140625, "rewards/rejected": 7.217294216156006, "step": 11719 }, { "epoch": 1.9, "learning_rate": 1.1245466907723233e-07, "logits/chosen": -0.8759530186653137, "logits/rejected": -0.8382095098495483, "logps/chosen": -46.985713958740234, "logps/rejected": -95.70645141601562, "loss": 0.4811, "rewards/accuracies": 0.0, "rewards/chosen": 2.5318219661712646, "rewards/margins": -0.32599902153015137, "rewards/rejected": 2.857820987701416, "step": 11720 }, { "epoch": 1.9, "learning_rate": 1.123716413253955e-07, "logits/chosen": -0.47389543056488037, "logits/rejected": -0.3235640823841095, "logps/chosen": -68.5966796875, "logps/rejected": -29.747133255004883, "loss": 0.0981, "rewards/accuracies": 1.0, "rewards/chosen": 3.0235307216644287, "rewards/margins": 1.578291654586792, "rewards/rejected": 1.4452390670776367, "step": 11721 }, { "epoch": 1.9, "learning_rate": 1.1228864035501068e-07, "logits/chosen": -0.4657912850379944, "logits/rejected": -0.4982568323612213, "logps/chosen": -32.854637145996094, "logps/rejected": -66.61688232421875, "loss": 0.6815, "rewards/accuracies": 0.0, "rewards/chosen": 1.790360689163208, "rewards/margins": -0.48911619186401367, "rewards/rejected": 2.2794768810272217, "step": 11722 }, { "epoch": 1.9, "learning_rate": 1.1220566617181204e-07, "logits/chosen": -0.6632719039916992, "logits/rejected": -0.6191285252571106, "logps/chosen": -45.255645751953125, "logps/rejected": -92.20909118652344, "loss": 0.5869, "rewards/accuracies": 0.0, "rewards/chosen": 1.2943603992462158, "rewards/margins": -0.7000099420547485, "rewards/rejected": 1.9943703413009644, "step": 11723 }, { "epoch": 1.9, "learning_rate": 1.1212271878153267e-07, "logits/chosen": -0.14054597914218903, "logits/rejected": -0.13879674673080444, "logps/chosen": -2.2633790969848633, "logps/rejected": -1.1033496856689453, "loss": 0.4336, "rewards/accuracies": 0.0, "rewards/chosen": 0.16084514558315277, "rewards/margins": -0.17375673353672028, "rewards/rejected": 0.33460187911987305, "step": 11724 }, { "epoch": 1.9, "learning_rate": 1.1203979818990317e-07, "logits/chosen": -0.5710840225219727, "logits/rejected": -0.5850169062614441, "logps/chosen": -69.85507202148438, "logps/rejected": -58.72389221191406, "loss": 0.5664, "rewards/accuracies": 0.0, "rewards/chosen": 0.89666748046875, "rewards/margins": -0.407490611076355, "rewards/rejected": 1.304158091545105, "step": 11725 }, { "epoch": 1.9, "learning_rate": 1.1195690440265287e-07, "logits/chosen": -0.5926927924156189, "logits/rejected": -0.5761992335319519, "logps/chosen": -91.20851135253906, "logps/rejected": -152.16712951660156, "loss": 1.1352, "rewards/accuracies": 0.0, "rewards/chosen": 3.9986252784729004, "rewards/margins": -1.7690458297729492, "rewards/rejected": 5.76767110824585, "step": 11726 }, { "epoch": 1.9, "learning_rate": 1.118740374255086e-07, "logits/chosen": -1.0129973888397217, "logits/rejected": -0.9691630005836487, "logps/chosen": -62.236019134521484, "logps/rejected": -34.31296920776367, "loss": 1.0719, "rewards/accuracies": 1.0, "rewards/chosen": 1.1936542987823486, "rewards/margins": 1.1234909296035767, "rewards/rejected": 0.07016334682703018, "step": 11727 }, { "epoch": 1.9, "learning_rate": 1.1179119726419601e-07, "logits/chosen": -0.7444475889205933, "logits/rejected": -0.6208714246749878, "logps/chosen": -165.33407592773438, "logps/rejected": -46.43634796142578, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": 4.964940071105957, "rewards/margins": 3.7288403511047363, "rewards/rejected": 1.2360996007919312, "step": 11728 }, { "epoch": 1.9, "learning_rate": 1.1170838392443832e-07, "logits/chosen": -0.4641978144645691, "logits/rejected": -0.4641978144645691, "logps/chosen": -65.19181823730469, "logps/rejected": -65.19181823730469, "loss": 1.5201, "rewards/accuracies": 0.0, "rewards/chosen": 2.7121193408966064, "rewards/margins": 0.0, "rewards/rejected": 2.7121193408966064, "step": 11729 }, { "epoch": 1.9, "learning_rate": 1.1162559741195732e-07, "logits/chosen": -0.5041961073875427, "logits/rejected": -0.7091336846351624, "logps/chosen": -99.848876953125, "logps/rejected": -108.11412048339844, "loss": 2.3084, "rewards/accuracies": 0.0, "rewards/chosen": 1.0848244428634644, "rewards/margins": -4.377811431884766, "rewards/rejected": 5.4626359939575195, "step": 11730 }, { "epoch": 1.9, "learning_rate": 1.1154283773247264e-07, "logits/chosen": -0.8147207498550415, "logits/rejected": -0.8751996755599976, "logps/chosen": -45.54417419433594, "logps/rejected": -84.35066223144531, "loss": 1.2252, "rewards/accuracies": 0.0, "rewards/chosen": 1.3429473638534546, "rewards/margins": -2.339261054992676, "rewards/rejected": 3.682208299636841, "step": 11731 }, { "epoch": 1.9, "learning_rate": 1.1146010489170238e-07, "logits/chosen": -0.9110862016677856, "logits/rejected": -0.7614840269088745, "logps/chosen": -111.51812744140625, "logps/rejected": -59.327186584472656, "loss": 0.0515, "rewards/accuracies": 1.0, "rewards/chosen": 5.994911193847656, "rewards/margins": 5.144000053405762, "rewards/rejected": 0.8509109616279602, "step": 11732 }, { "epoch": 1.9, "learning_rate": 1.113773988953623e-07, "logits/chosen": -0.7131451964378357, "logits/rejected": -0.7214683890342712, "logps/chosen": -81.88276672363281, "logps/rejected": -86.23699951171875, "loss": 0.2132, "rewards/accuracies": 1.0, "rewards/chosen": 2.781475782394409, "rewards/margins": 0.6458404064178467, "rewards/rejected": 2.1356353759765625, "step": 11733 }, { "epoch": 1.9, "learning_rate": 1.1129471974916694e-07, "logits/chosen": -0.8045165538787842, "logits/rejected": -0.7886255979537964, "logps/chosen": -71.3635482788086, "logps/rejected": -55.117149353027344, "loss": 0.2459, "rewards/accuracies": 1.0, "rewards/chosen": 1.538044810295105, "rewards/margins": 0.4549546241760254, "rewards/rejected": 1.0830901861190796, "step": 11734 }, { "epoch": 1.9, "learning_rate": 1.1121206745882833e-07, "logits/chosen": -0.6417571306228638, "logits/rejected": -0.6118132472038269, "logps/chosen": -106.59056091308594, "logps/rejected": -103.85559844970703, "loss": 0.4295, "rewards/accuracies": 0.0, "rewards/chosen": 0.7731254696846008, "rewards/margins": -0.15297698974609375, "rewards/rejected": 0.9261024594306946, "step": 11735 }, { "epoch": 1.9, "learning_rate": 1.1112944203005709e-07, "logits/chosen": -0.5862842202186584, "logits/rejected": -0.3058745861053467, "logps/chosen": -126.52648162841797, "logps/rejected": -36.97670364379883, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 3.387441396713257, "rewards/margins": 3.259690523147583, "rewards/rejected": 0.12775078415870667, "step": 11736 }, { "epoch": 1.91, "learning_rate": 1.1104684346856208e-07, "logits/chosen": -0.7595144510269165, "logits/rejected": -0.7148433923721313, "logps/chosen": -52.57051086425781, "logps/rejected": -50.475730895996094, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 1.4731369018554688, "rewards/margins": 0.3290809392929077, "rewards/rejected": 1.144055962562561, "step": 11737 }, { "epoch": 1.91, "learning_rate": 1.1096427178004969e-07, "logits/chosen": -0.8629031777381897, "logits/rejected": -0.787096381187439, "logps/chosen": -59.225460052490234, "logps/rejected": -74.70231628417969, "loss": 0.2479, "rewards/accuracies": 1.0, "rewards/chosen": 2.3550541400909424, "rewards/margins": 0.4583927392959595, "rewards/rejected": 1.896661400794983, "step": 11738 }, { "epoch": 1.91, "learning_rate": 1.1088172697022519e-07, "logits/chosen": -0.38864269852638245, "logits/rejected": -0.38864269852638245, "logps/chosen": -28.524568557739258, "logps/rejected": -28.524568557739258, "loss": 1.6517, "rewards/accuracies": 0.0, "rewards/chosen": 1.128422737121582, "rewards/margins": 0.0, "rewards/rejected": 1.128422737121582, "step": 11739 }, { "epoch": 1.91, "learning_rate": 1.1079920904479134e-07, "logits/chosen": -0.8296487331390381, "logits/rejected": -0.8564133644104004, "logps/chosen": -41.2789192199707, "logps/rejected": -31.39911651611328, "loss": 0.3075, "rewards/accuracies": 1.0, "rewards/chosen": 2.231950521469116, "rewards/margins": 0.5099629163742065, "rewards/rejected": 1.7219876050949097, "step": 11740 }, { "epoch": 1.91, "learning_rate": 1.107167180094496e-07, "logits/chosen": -0.38321539759635925, "logits/rejected": -0.41131898760795593, "logps/chosen": -55.64522171020508, "logps/rejected": -41.44624710083008, "loss": 0.7637, "rewards/accuracies": 0.0, "rewards/chosen": 0.6853012442588806, "rewards/margins": -1.2179036140441895, "rewards/rejected": 1.9032047986984253, "step": 11741 }, { "epoch": 1.91, "learning_rate": 1.106342538698991e-07, "logits/chosen": -0.9008504748344421, "logits/rejected": -0.8452619910240173, "logps/chosen": -86.02255249023438, "logps/rejected": -45.952579498291016, "loss": 0.5897, "rewards/accuracies": 0.0, "rewards/chosen": 1.0333961248397827, "rewards/margins": -0.6577625274658203, "rewards/rejected": 1.691158652305603, "step": 11742 }, { "epoch": 1.91, "learning_rate": 1.105518166318376e-07, "logits/chosen": -0.8313692212104797, "logits/rejected": -0.7765263319015503, "logps/chosen": -63.53764343261719, "logps/rejected": -71.44166564941406, "loss": 1.4067, "rewards/accuracies": 1.0, "rewards/chosen": 2.929534912109375, "rewards/margins": 0.5219955444335938, "rewards/rejected": 2.4075393676757812, "step": 11743 }, { "epoch": 1.91, "learning_rate": 1.1046940630096047e-07, "logits/chosen": -0.6358234882354736, "logits/rejected": -0.5609181523323059, "logps/chosen": -128.41744995117188, "logps/rejected": -94.51299285888672, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 4.785199165344238, "rewards/margins": 3.681645393371582, "rewards/rejected": 1.1035537719726562, "step": 11744 }, { "epoch": 1.91, "learning_rate": 1.1038702288296164e-07, "logits/chosen": -0.7052370309829712, "logits/rejected": -0.6701204776763916, "logps/chosen": -94.6147689819336, "logps/rejected": -40.142120361328125, "loss": 0.671, "rewards/accuracies": 0.0, "rewards/chosen": 0.5430335998535156, "rewards/margins": -1.0012043714523315, "rewards/rejected": 1.5442379713058472, "step": 11745 }, { "epoch": 1.91, "learning_rate": 1.1030466638353292e-07, "logits/chosen": -0.821588933467865, "logits/rejected": -0.7930339574813843, "logps/chosen": -70.34153747558594, "logps/rejected": -132.407958984375, "loss": 0.1501, "rewards/accuracies": 1.0, "rewards/chosen": 1.461450219154358, "rewards/margins": 1.093042016029358, "rewards/rejected": 0.368408203125, "step": 11746 }, { "epoch": 1.91, "learning_rate": 1.1022233680836451e-07, "logits/chosen": -0.7509132623672485, "logits/rejected": -0.8928495049476624, "logps/chosen": -80.11063385009766, "logps/rejected": -120.98255157470703, "loss": 1.253, "rewards/accuracies": 0.0, "rewards/chosen": 1.645607829093933, "rewards/margins": -1.767694115638733, "rewards/rejected": 3.413301944732666, "step": 11747 }, { "epoch": 1.91, "learning_rate": 1.1014003416314438e-07, "logits/chosen": -0.918692409992218, "logits/rejected": -0.918692409992218, "logps/chosen": -13.727510452270508, "logps/rejected": -13.727510452270508, "loss": 0.4149, "rewards/accuracies": 0.0, "rewards/chosen": 1.2902554273605347, "rewards/margins": 0.0, "rewards/rejected": 1.2902554273605347, "step": 11748 }, { "epoch": 1.91, "learning_rate": 1.1005775845355918e-07, "logits/chosen": -0.672605037689209, "logits/rejected": -0.6393750309944153, "logps/chosen": -88.03534698486328, "logps/rejected": -62.13780212402344, "loss": 0.6396, "rewards/accuracies": 0.0, "rewards/chosen": 1.3149696588516235, "rewards/margins": -0.7245262861251831, "rewards/rejected": 2.0394959449768066, "step": 11749 }, { "epoch": 1.91, "learning_rate": 1.0997550968529301e-07, "logits/chosen": -0.5773959755897522, "logits/rejected": -0.527978241443634, "logps/chosen": -78.82577514648438, "logps/rejected": -118.9090347290039, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": 1.465571641921997, "rewards/margins": 0.9540115594863892, "rewards/rejected": 0.5115600824356079, "step": 11750 }, { "epoch": 1.91, "learning_rate": 1.0989328786402885e-07, "logits/chosen": -0.7011400461196899, "logits/rejected": -0.7465566992759705, "logps/chosen": -105.73880767822266, "logps/rejected": -119.4967269897461, "loss": 1.7428, "rewards/accuracies": 0.0, "rewards/chosen": 1.6660698652267456, "rewards/margins": -3.421739101409912, "rewards/rejected": 5.087809085845947, "step": 11751 }, { "epoch": 1.91, "learning_rate": 1.098110929954471e-07, "logits/chosen": -0.7202311754226685, "logits/rejected": -0.767114520072937, "logps/chosen": -71.90271759033203, "logps/rejected": -44.38106155395508, "loss": 0.6017, "rewards/accuracies": 0.0, "rewards/chosen": 2.4848687648773193, "rewards/margins": -0.19388985633850098, "rewards/rejected": 2.6787586212158203, "step": 11752 }, { "epoch": 1.91, "learning_rate": 1.0972892508522702e-07, "logits/chosen": -0.5348419547080994, "logits/rejected": -0.5348419547080994, "logps/chosen": -119.03477478027344, "logps/rejected": -119.03477478027344, "loss": 0.4017, "rewards/accuracies": 0.0, "rewards/chosen": 1.1132034063339233, "rewards/margins": 0.0, "rewards/rejected": 1.1132034063339233, "step": 11753 }, { "epoch": 1.91, "learning_rate": 1.0964678413904526e-07, "logits/chosen": -0.47771936655044556, "logits/rejected": -0.47206977009773254, "logps/chosen": -9.317116737365723, "logps/rejected": -20.427204132080078, "loss": 0.6546, "rewards/accuracies": 1.0, "rewards/chosen": 0.22084227204322815, "rewards/margins": 0.002300933003425598, "rewards/rejected": 0.21854133903980255, "step": 11754 }, { "epoch": 1.91, "learning_rate": 1.0956467016257732e-07, "logits/chosen": -0.7601279020309448, "logits/rejected": -0.7288001775741577, "logps/chosen": -98.13751220703125, "logps/rejected": -214.07308959960938, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": 5.574398994445801, "rewards/margins": 1.4856200218200684, "rewards/rejected": 4.088778972625732, "step": 11755 }, { "epoch": 1.91, "learning_rate": 1.0948258316149617e-07, "logits/chosen": -0.8876121044158936, "logits/rejected": -0.884378969669342, "logps/chosen": -53.46443176269531, "logps/rejected": -107.94658660888672, "loss": 0.1657, "rewards/accuracies": 1.0, "rewards/chosen": 1.9765182733535767, "rewards/margins": 1.986855387687683, "rewards/rejected": -0.01033706683665514, "step": 11756 }, { "epoch": 1.91, "learning_rate": 1.0940052314147358e-07, "logits/chosen": -0.5251397490501404, "logits/rejected": -0.5251397490501404, "logps/chosen": -63.22651672363281, "logps/rejected": -63.22651672363281, "loss": 0.3475, "rewards/accuracies": 0.0, "rewards/chosen": 1.1833343505859375, "rewards/margins": 0.0, "rewards/rejected": 1.1833343505859375, "step": 11757 }, { "epoch": 1.91, "learning_rate": 1.0931849010817879e-07, "logits/chosen": -0.4329746663570404, "logits/rejected": -0.42185354232788086, "logps/chosen": -57.414451599121094, "logps/rejected": -126.83255004882812, "loss": 1.6938, "rewards/accuracies": 1.0, "rewards/chosen": 1.1071709394454956, "rewards/margins": 0.6889458298683167, "rewards/rejected": 0.41822510957717896, "step": 11758 }, { "epoch": 1.91, "learning_rate": 1.0923648406727981e-07, "logits/chosen": -0.6263301372528076, "logits/rejected": -0.6218611598014832, "logps/chosen": -8.373457908630371, "logps/rejected": -5.42250394821167, "loss": 0.7747, "rewards/accuracies": 1.0, "rewards/chosen": 1.5300660133361816, "rewards/margins": 0.38620686531066895, "rewards/rejected": 1.1438591480255127, "step": 11759 }, { "epoch": 1.91, "learning_rate": 1.0915450502444224e-07, "logits/chosen": -0.8165866732597351, "logits/rejected": -0.6720833778381348, "logps/chosen": -168.293212890625, "logps/rejected": -56.927391052246094, "loss": 0.2422, "rewards/accuracies": 1.0, "rewards/chosen": 3.73785400390625, "rewards/margins": 0.5104713439941406, "rewards/rejected": 3.2273826599121094, "step": 11760 }, { "epoch": 1.91, "learning_rate": 1.0907255298533024e-07, "logits/chosen": -0.8267359733581543, "logits/rejected": -0.7517809271812439, "logps/chosen": -88.89468383789062, "logps/rejected": -92.63818359375, "loss": 0.0931, "rewards/accuracies": 1.0, "rewards/chosen": 4.456855773925781, "rewards/margins": 1.6980559825897217, "rewards/rejected": 2.7587997913360596, "step": 11761 }, { "epoch": 1.91, "learning_rate": 1.089906279556057e-07, "logits/chosen": -0.7905814051628113, "logits/rejected": -0.7126393914222717, "logps/chosen": -48.560264587402344, "logps/rejected": -56.19734191894531, "loss": 0.5752, "rewards/accuracies": 0.0, "rewards/chosen": 1.3503707647323608, "rewards/margins": -0.1456848382949829, "rewards/rejected": 1.4960556030273438, "step": 11762 }, { "epoch": 1.91, "learning_rate": 1.089087299409292e-07, "logits/chosen": -0.39927810430526733, "logits/rejected": -0.37867745757102966, "logps/chosen": -53.0013313293457, "logps/rejected": -86.92118835449219, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": 1.4482784271240234, "rewards/margins": 0.2669597864151001, "rewards/rejected": 1.1813186407089233, "step": 11763 }, { "epoch": 1.91, "learning_rate": 1.0882685894695876e-07, "logits/chosen": -1.2817939519882202, "logits/rejected": -1.243537425994873, "logps/chosen": -73.84255981445312, "logps/rejected": -35.38505935668945, "loss": 0.222, "rewards/accuracies": 1.0, "rewards/chosen": 2.552363634109497, "rewards/margins": 2.222597122192383, "rewards/rejected": 0.32976648211479187, "step": 11764 }, { "epoch": 1.91, "learning_rate": 1.0874501497935118e-07, "logits/chosen": -0.28981274366378784, "logits/rejected": -0.2876051068305969, "logps/chosen": -3.659543752670288, "logps/rejected": -9.906648635864258, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 0.20623062551021576, "rewards/margins": 0.15523679554462433, "rewards/rejected": 0.05099382624030113, "step": 11765 }, { "epoch": 1.91, "learning_rate": 1.0866319804376083e-07, "logits/chosen": -0.3699762225151062, "logits/rejected": -0.3699762225151062, "logps/chosen": -55.36283874511719, "logps/rejected": -55.36283874511719, "loss": 0.6085, "rewards/accuracies": 0.0, "rewards/chosen": 0.8384270071983337, "rewards/margins": 0.0, "rewards/rejected": 0.8384270071983337, "step": 11766 }, { "epoch": 1.91, "learning_rate": 1.0858140814584082e-07, "logits/chosen": -0.6995124816894531, "logits/rejected": -0.6523128747940063, "logps/chosen": -46.247528076171875, "logps/rejected": -52.55149841308594, "loss": 0.8607, "rewards/accuracies": 1.0, "rewards/chosen": 2.404839277267456, "rewards/margins": 0.112884521484375, "rewards/rejected": 2.291954755783081, "step": 11767 }, { "epoch": 1.91, "learning_rate": 1.0849964529124167e-07, "logits/chosen": -0.6033328175544739, "logits/rejected": -0.5986534953117371, "logps/chosen": -108.76727294921875, "logps/rejected": -85.69601440429688, "loss": 0.6572, "rewards/accuracies": 1.0, "rewards/chosen": 2.3192169666290283, "rewards/margins": 1.5529663562774658, "rewards/rejected": 0.7662506103515625, "step": 11768 }, { "epoch": 1.91, "learning_rate": 1.084179094856128e-07, "logits/chosen": -0.6918025016784668, "logits/rejected": -0.6246759295463562, "logps/chosen": -100.25770568847656, "logps/rejected": -49.631649017333984, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 1.8697357177734375, "rewards/margins": 0.7645622491836548, "rewards/rejected": 1.1051734685897827, "step": 11769 }, { "epoch": 1.91, "learning_rate": 1.0833620073460103e-07, "logits/chosen": -0.7586767077445984, "logits/rejected": -0.926902711391449, "logps/chosen": -75.2902603149414, "logps/rejected": -137.71388244628906, "loss": 1.5185, "rewards/accuracies": 0.0, "rewards/chosen": 1.224677324295044, "rewards/margins": -2.7601706981658936, "rewards/rejected": 3.9848480224609375, "step": 11770 }, { "epoch": 1.91, "learning_rate": 1.0825451904385197e-07, "logits/chosen": -0.23997245728969574, "logits/rejected": -0.7614027261734009, "logps/chosen": -38.79111099243164, "logps/rejected": -76.4169692993164, "loss": 1.1105, "rewards/accuracies": 1.0, "rewards/chosen": 1.620097041130066, "rewards/margins": 1.6026959419250488, "rewards/rejected": 0.01740112341940403, "step": 11771 }, { "epoch": 1.91, "learning_rate": 1.081728644190088e-07, "logits/chosen": -1.3027987480163574, "logits/rejected": -1.3018146753311157, "logps/chosen": -89.0419921875, "logps/rejected": -40.964134216308594, "loss": 0.1362, "rewards/accuracies": 1.0, "rewards/chosen": 1.296617865562439, "rewards/margins": 1.222586750984192, "rewards/rejected": 0.07403106987476349, "step": 11772 }, { "epoch": 1.91, "learning_rate": 1.080912368657132e-07, "logits/chosen": -0.7422753572463989, "logits/rejected": -0.8157647848129272, "logps/chosen": -78.55726623535156, "logps/rejected": -145.5736083984375, "loss": 2.5612, "rewards/accuracies": 0.0, "rewards/chosen": 1.5274513959884644, "rewards/margins": -4.417264461517334, "rewards/rejected": 5.944715976715088, "step": 11773 }, { "epoch": 1.91, "learning_rate": 1.0800963638960497e-07, "logits/chosen": -0.6834437251091003, "logits/rejected": -0.7228683829307556, "logps/chosen": -101.61656188964844, "logps/rejected": -112.09794616699219, "loss": 0.7259, "rewards/accuracies": 0.0, "rewards/chosen": 5.517489910125732, "rewards/margins": -0.1271681785583496, "rewards/rejected": 5.644658088684082, "step": 11774 }, { "epoch": 1.91, "learning_rate": 1.0792806299632174e-07, "logits/chosen": -1.109887957572937, "logits/rejected": -0.9824128746986389, "logps/chosen": -179.97984313964844, "logps/rejected": -94.33058166503906, "loss": 1.4956, "rewards/accuracies": 1.0, "rewards/chosen": 5.784254550933838, "rewards/margins": 0.24471282958984375, "rewards/rejected": 5.539541721343994, "step": 11775 }, { "epoch": 1.91, "learning_rate": 1.0784651669149958e-07, "logits/chosen": -0.8967671394348145, "logits/rejected": -0.8977416753768921, "logps/chosen": -70.3862075805664, "logps/rejected": -68.98420715332031, "loss": 0.9224, "rewards/accuracies": 0.0, "rewards/chosen": 1.142281413078308, "rewards/margins": -0.9434577226638794, "rewards/rejected": 2.0857391357421875, "step": 11776 }, { "epoch": 1.91, "learning_rate": 1.0776499748077245e-07, "logits/chosen": -1.1789151430130005, "logits/rejected": -1.0304814577102661, "logps/chosen": -191.8553466796875, "logps/rejected": -58.91608428955078, "loss": 0.2749, "rewards/accuracies": 1.0, "rewards/chosen": 4.633847236633301, "rewards/margins": 0.6203980445861816, "rewards/rejected": 4.013449192047119, "step": 11777 }, { "epoch": 1.91, "learning_rate": 1.0768350536977278e-07, "logits/chosen": -1.007201910018921, "logits/rejected": -0.9958837628364563, "logps/chosen": -227.76986694335938, "logps/rejected": -83.2710952758789, "loss": 0.1036, "rewards/accuracies": 1.0, "rewards/chosen": 4.543277263641357, "rewards/margins": 1.6921930313110352, "rewards/rejected": 2.8510842323303223, "step": 11778 }, { "epoch": 1.91, "learning_rate": 1.0760204036413057e-07, "logits/chosen": -0.8459051251411438, "logits/rejected": -0.6371657252311707, "logps/chosen": -119.26483154296875, "logps/rejected": -55.25004196166992, "loss": 0.1863, "rewards/accuracies": 1.0, "rewards/chosen": 4.406103610992432, "rewards/margins": 0.9556262493133545, "rewards/rejected": 3.450477361679077, "step": 11779 }, { "epoch": 1.91, "learning_rate": 1.0752060246947464e-07, "logits/chosen": -0.9028747081756592, "logits/rejected": -1.245605230331421, "logps/chosen": -99.90367126464844, "logps/rejected": -35.68473815917969, "loss": 0.1673, "rewards/accuracies": 1.0, "rewards/chosen": 1.3274002075195312, "rewards/margins": 0.9698406457901001, "rewards/rejected": 0.35755959153175354, "step": 11780 }, { "epoch": 1.91, "learning_rate": 1.0743919169143123e-07, "logits/chosen": -0.8801125288009644, "logits/rejected": -0.8801125288009644, "logps/chosen": -54.89910888671875, "logps/rejected": -54.89910888671875, "loss": 0.6368, "rewards/accuracies": 0.0, "rewards/chosen": 2.8579375743865967, "rewards/margins": 0.0, "rewards/rejected": 2.8579375743865967, "step": 11781 }, { "epoch": 1.91, "learning_rate": 1.0735780803562539e-07, "logits/chosen": -0.7495784163475037, "logits/rejected": -0.703890323638916, "logps/chosen": -44.22086715698242, "logps/rejected": -57.55097961425781, "loss": 0.4539, "rewards/accuracies": 0.0, "rewards/chosen": 1.8753608465194702, "rewards/margins": -0.3698540925979614, "rewards/rejected": 2.2452149391174316, "step": 11782 }, { "epoch": 1.91, "learning_rate": 1.0727645150767966e-07, "logits/chosen": -0.9929916262626648, "logits/rejected": -1.1091933250427246, "logps/chosen": -196.75477600097656, "logps/rejected": -129.3026123046875, "loss": 3.5774, "rewards/accuracies": 1.0, "rewards/chosen": 6.05703592300415, "rewards/margins": 0.3892073631286621, "rewards/rejected": 5.667828559875488, "step": 11783 }, { "epoch": 1.91, "learning_rate": 1.0719512211321529e-07, "logits/chosen": -0.7141842842102051, "logits/rejected": -0.686412513256073, "logps/chosen": -80.96939086914062, "logps/rejected": -61.374061584472656, "loss": 0.7699, "rewards/accuracies": 0.0, "rewards/chosen": 1.1286147832870483, "rewards/margins": -0.9020088911056519, "rewards/rejected": 2.0306236743927, "step": 11784 }, { "epoch": 1.91, "learning_rate": 1.0711381985785111e-07, "logits/chosen": -0.47771725058555603, "logits/rejected": -0.4159157872200012, "logps/chosen": -62.855098724365234, "logps/rejected": -105.29083251953125, "loss": 0.7107, "rewards/accuracies": 1.0, "rewards/chosen": 1.4768497943878174, "rewards/margins": 1.2950817346572876, "rewards/rejected": 0.1817680448293686, "step": 11785 }, { "epoch": 1.91, "learning_rate": 1.0703254474720457e-07, "logits/chosen": -0.8745284676551819, "logits/rejected": -0.8589250445365906, "logps/chosen": -26.149166107177734, "logps/rejected": -1.819547414779663, "loss": 1.0054, "rewards/accuracies": 0.0, "rewards/chosen": 0.5632978677749634, "rewards/margins": -0.046469271183013916, "rewards/rejected": 0.6097671389579773, "step": 11786 }, { "epoch": 1.91, "learning_rate": 1.0695129678689074e-07, "logits/chosen": -0.8283536434173584, "logits/rejected": -0.7903637886047363, "logps/chosen": -76.96075439453125, "logps/rejected": -67.53584289550781, "loss": 0.8051, "rewards/accuracies": 1.0, "rewards/chosen": 2.0311036109924316, "rewards/margins": 0.04330909252166748, "rewards/rejected": 1.9877945184707642, "step": 11787 }, { "epoch": 1.91, "learning_rate": 1.068700759825234e-07, "logits/chosen": -0.820020318031311, "logits/rejected": -0.9951741695404053, "logps/chosen": -69.605224609375, "logps/rejected": -201.44932556152344, "loss": 3.9082, "rewards/accuracies": 0.0, "rewards/chosen": 2.164019823074341, "rewards/margins": -7.733654022216797, "rewards/rejected": 9.897673606872559, "step": 11788 }, { "epoch": 1.91, "learning_rate": 1.0678888233971384e-07, "logits/chosen": -0.37973305583000183, "logits/rejected": -0.3776825964450836, "logps/chosen": -73.89714813232422, "logps/rejected": -90.90138244628906, "loss": 1.2675, "rewards/accuracies": 0.0, "rewards/chosen": 0.8242241144180298, "rewards/margins": -0.12478256225585938, "rewards/rejected": 0.9490066766738892, "step": 11789 }, { "epoch": 1.91, "learning_rate": 1.0670771586407206e-07, "logits/chosen": -0.4275316596031189, "logits/rejected": -0.4200189411640167, "logps/chosen": -0.5898573398590088, "logps/rejected": -7.314256191253662, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 0.18408648669719696, "rewards/margins": 0.16469790041446686, "rewards/rejected": 0.019388580694794655, "step": 11790 }, { "epoch": 1.91, "learning_rate": 1.066265765612056e-07, "logits/chosen": -0.6264863610267639, "logits/rejected": -0.6630602478981018, "logps/chosen": -98.2073974609375, "logps/rejected": -102.23748779296875, "loss": 1.5041, "rewards/accuracies": 0.0, "rewards/chosen": 2.1041336059570312, "rewards/margins": -2.211019992828369, "rewards/rejected": 4.3151535987854, "step": 11791 }, { "epoch": 1.91, "learning_rate": 1.0654546443672069e-07, "logits/chosen": -0.872962236404419, "logits/rejected": -0.8819959759712219, "logps/chosen": -128.21438598632812, "logps/rejected": -51.9233512878418, "loss": 2.2859, "rewards/accuracies": 0.0, "rewards/chosen": 1.0857819318771362, "rewards/margins": -1.2727917432785034, "rewards/rejected": 2.3585736751556396, "step": 11792 }, { "epoch": 1.91, "learning_rate": 1.0646437949622117e-07, "logits/chosen": -0.5827955603599548, "logits/rejected": -0.5827955603599548, "logps/chosen": -38.024208068847656, "logps/rejected": -38.024208068847656, "loss": 0.4147, "rewards/accuracies": 0.0, "rewards/chosen": 1.3202106952667236, "rewards/margins": 0.0, "rewards/rejected": 1.3202106952667236, "step": 11793 }, { "epoch": 1.91, "learning_rate": 1.0638332174530951e-07, "logits/chosen": -0.8508071899414062, "logits/rejected": -0.7820534706115723, "logps/chosen": -56.76996612548828, "logps/rejected": -24.1490478515625, "loss": 2.1214, "rewards/accuracies": 1.0, "rewards/chosen": 2.0938942432403564, "rewards/margins": 1.3509421348571777, "rewards/rejected": 0.7429521679878235, "step": 11794 }, { "epoch": 1.91, "learning_rate": 1.0630229118958572e-07, "logits/chosen": -0.6769036054611206, "logits/rejected": -0.7041611671447754, "logps/chosen": -34.41929626464844, "logps/rejected": -53.08219909667969, "loss": 0.503, "rewards/accuracies": 0.0, "rewards/chosen": 0.8768073916435242, "rewards/margins": -0.5374061465263367, "rewards/rejected": 1.4142135381698608, "step": 11795 }, { "epoch": 1.91, "learning_rate": 1.0622128783464851e-07, "logits/chosen": -0.36757951974868774, "logits/rejected": -0.36757951974868774, "logps/chosen": -72.8731918334961, "logps/rejected": -72.8731918334961, "loss": 0.3793, "rewards/accuracies": 0.0, "rewards/chosen": 0.4683113098144531, "rewards/margins": 0.0, "rewards/rejected": 0.4683113098144531, "step": 11796 }, { "epoch": 1.91, "learning_rate": 1.0614031168609428e-07, "logits/chosen": -0.9381327033042908, "logits/rejected": -0.8219466209411621, "logps/chosen": -198.20947265625, "logps/rejected": -79.38433837890625, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 2.1227967739105225, "rewards/margins": 0.7098557949066162, "rewards/rejected": 1.4129409790039062, "step": 11797 }, { "epoch": 1.91, "learning_rate": 1.0605936274951782e-07, "logits/chosen": -0.6190495491027832, "logits/rejected": -0.5939425230026245, "logps/chosen": -88.70891571044922, "logps/rejected": -60.66242980957031, "loss": 0.5239, "rewards/accuracies": 0.0, "rewards/chosen": 1.3000297546386719, "rewards/margins": -0.25781333446502686, "rewards/rejected": 1.5578430891036987, "step": 11798 }, { "epoch": 1.92, "learning_rate": 1.0597844103051184e-07, "logits/chosen": -0.33090800046920776, "logits/rejected": -0.33090800046920776, "logps/chosen": -27.485816955566406, "logps/rejected": -27.485816955566406, "loss": 0.351, "rewards/accuracies": 0.0, "rewards/chosen": 0.11503314971923828, "rewards/margins": 0.0, "rewards/rejected": 0.11503314971923828, "step": 11799 }, { "epoch": 1.92, "learning_rate": 1.0589754653466743e-07, "logits/chosen": -0.39406460523605347, "logits/rejected": -0.37652114033699036, "logps/chosen": -49.5565185546875, "logps/rejected": -237.18966674804688, "loss": 2.4746, "rewards/accuracies": 0.0, "rewards/chosen": 2.146592855453491, "rewards/margins": -2.7694809436798096, "rewards/rejected": 4.916073799133301, "step": 11800 }, { "epoch": 1.92, "learning_rate": 1.0581667926757337e-07, "logits/chosen": -0.6582254767417908, "logits/rejected": -0.6757080554962158, "logps/chosen": -126.27422332763672, "logps/rejected": -90.67794799804688, "loss": 1.061, "rewards/accuracies": 0.0, "rewards/chosen": 1.346900224685669, "rewards/margins": -0.6609222888946533, "rewards/rejected": 2.0078225135803223, "step": 11801 }, { "epoch": 1.92, "learning_rate": 1.0573583923481711e-07, "logits/chosen": -0.8416273593902588, "logits/rejected": -0.8584616780281067, "logps/chosen": -77.49522399902344, "logps/rejected": -50.991188049316406, "loss": 1.4148, "rewards/accuracies": 0.0, "rewards/chosen": 0.3604751527309418, "rewards/margins": -1.5627555847167969, "rewards/rejected": 1.923230767250061, "step": 11802 }, { "epoch": 1.92, "learning_rate": 1.056550264419837e-07, "logits/chosen": -1.0129917860031128, "logits/rejected": -0.7250153422355652, "logps/chosen": -174.91348266601562, "logps/rejected": -28.08742332458496, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": 6.108819484710693, "rewards/margins": 5.638400554656982, "rewards/rejected": 0.4704187512397766, "step": 11803 }, { "epoch": 1.92, "learning_rate": 1.0557424089465671e-07, "logits/chosen": -0.5869923830032349, "logits/rejected": -0.5443734526634216, "logps/chosen": -57.103538513183594, "logps/rejected": -74.53927612304688, "loss": 1.4378, "rewards/accuracies": 0.0, "rewards/chosen": 0.8212776184082031, "rewards/margins": -1.1756600141525269, "rewards/rejected": 1.99693763256073, "step": 11804 }, { "epoch": 1.92, "learning_rate": 1.054934825984175e-07, "logits/chosen": -0.1828298717737198, "logits/rejected": -0.18503338098526, "logps/chosen": -4.663690090179443, "logps/rejected": -1.6889578104019165, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": 0.30467668175697327, "rewards/margins": 0.005120426416397095, "rewards/rejected": 0.29955625534057617, "step": 11805 }, { "epoch": 1.92, "learning_rate": 1.0541275155884594e-07, "logits/chosen": -1.0517693758010864, "logits/rejected": -1.0095796585083008, "logps/chosen": -82.26823425292969, "logps/rejected": -88.7778549194336, "loss": 0.073, "rewards/accuracies": 1.0, "rewards/chosen": 5.461102485656738, "rewards/margins": 3.505788564682007, "rewards/rejected": 1.9553139209747314, "step": 11806 }, { "epoch": 1.92, "learning_rate": 1.0533204778151944e-07, "logits/chosen": -0.5790502429008484, "logits/rejected": -0.799839973449707, "logps/chosen": -32.90026092529297, "logps/rejected": -35.43678283691406, "loss": 1.8837, "rewards/accuracies": 0.0, "rewards/chosen": 1.600603461265564, "rewards/margins": -1.3126534223556519, "rewards/rejected": 2.913256883621216, "step": 11807 }, { "epoch": 1.92, "learning_rate": 1.0525137127201406e-07, "logits/chosen": -0.4762982428073883, "logits/rejected": -0.5311151742935181, "logps/chosen": -87.75580596923828, "logps/rejected": -101.30758666992188, "loss": 0.2947, "rewards/accuracies": 1.0, "rewards/chosen": 2.2885704040527344, "rewards/margins": 0.3513847589492798, "rewards/rejected": 1.9371856451034546, "step": 11808 }, { "epoch": 1.92, "learning_rate": 1.0517072203590393e-07, "logits/chosen": -0.7563748955726624, "logits/rejected": -0.7559429407119751, "logps/chosen": -41.41972351074219, "logps/rejected": -53.033390045166016, "loss": 0.3473, "rewards/accuracies": 1.0, "rewards/chosen": 1.5309470891952515, "rewards/margins": 0.024757742881774902, "rewards/rejected": 1.5061893463134766, "step": 11809 }, { "epoch": 1.92, "learning_rate": 1.0509010007876084e-07, "logits/chosen": -0.6225748062133789, "logits/rejected": -0.6272350549697876, "logps/chosen": -70.05690002441406, "logps/rejected": -92.0682373046875, "loss": 0.6351, "rewards/accuracies": 0.0, "rewards/chosen": 1.9590797424316406, "rewards/margins": -0.8356530666351318, "rewards/rejected": 2.7947328090667725, "step": 11810 }, { "epoch": 1.92, "learning_rate": 1.0500950540615533e-07, "logits/chosen": -0.25769463181495667, "logits/rejected": -0.26871341466903687, "logps/chosen": -2.894885301589966, "logps/rejected": -18.40753173828125, "loss": 1.2404, "rewards/accuracies": 0.0, "rewards/chosen": 0.22163227200508118, "rewards/margins": -0.13437053561210632, "rewards/rejected": 0.3560028076171875, "step": 11811 }, { "epoch": 1.92, "learning_rate": 1.0492893802365543e-07, "logits/chosen": -0.7768290638923645, "logits/rejected": -0.7768290638923645, "logps/chosen": -72.46800994873047, "logps/rejected": -72.46800994873047, "loss": 1.8906, "rewards/accuracies": 0.0, "rewards/chosen": 1.423682451248169, "rewards/margins": 0.0, "rewards/rejected": 1.423682451248169, "step": 11812 }, { "epoch": 1.92, "learning_rate": 1.0484839793682782e-07, "logits/chosen": -0.7775408029556274, "logits/rejected": -0.6391848921775818, "logps/chosen": -118.0638198852539, "logps/rejected": -82.91482543945312, "loss": 0.8025, "rewards/accuracies": 0.0, "rewards/chosen": 0.4775291383266449, "rewards/margins": -1.1877555847167969, "rewards/rejected": 1.6652847528457642, "step": 11813 }, { "epoch": 1.92, "learning_rate": 1.0476788515123685e-07, "logits/chosen": -0.6656622886657715, "logits/rejected": -0.6120317578315735, "logps/chosen": -43.35793685913086, "logps/rejected": -39.764827728271484, "loss": 0.8324, "rewards/accuracies": 1.0, "rewards/chosen": 1.608344316482544, "rewards/margins": 0.608123779296875, "rewards/rejected": 1.000220537185669, "step": 11814 }, { "epoch": 1.92, "learning_rate": 1.0468739967244555e-07, "logits/chosen": -0.6119797825813293, "logits/rejected": -0.6714391112327576, "logps/chosen": -94.11062622070312, "logps/rejected": -79.14299011230469, "loss": 0.6913, "rewards/accuracies": 0.0, "rewards/chosen": 1.2314170598983765, "rewards/margins": -0.8942917585372925, "rewards/rejected": 2.125708818435669, "step": 11815 }, { "epoch": 1.92, "learning_rate": 1.0460694150601418e-07, "logits/chosen": -0.7617058753967285, "logits/rejected": -0.6409595012664795, "logps/chosen": -97.14582824707031, "logps/rejected": -65.70515441894531, "loss": 0.0359, "rewards/accuracies": 1.0, "rewards/chosen": 5.6357622146606445, "rewards/margins": 3.197133779525757, "rewards/rejected": 2.4386284351348877, "step": 11816 }, { "epoch": 1.92, "learning_rate": 1.0452651065750201e-07, "logits/chosen": -0.5925073623657227, "logits/rejected": -0.5300756692886353, "logps/chosen": -50.789642333984375, "logps/rejected": -88.05291748046875, "loss": 0.5606, "rewards/accuracies": 1.0, "rewards/chosen": 0.6572643518447876, "rewards/margins": 0.10279655456542969, "rewards/rejected": 0.5544677972793579, "step": 11817 }, { "epoch": 1.92, "learning_rate": 1.0444610713246588e-07, "logits/chosen": -0.47067660093307495, "logits/rejected": -0.4360658526420593, "logps/chosen": -110.80488586425781, "logps/rejected": -87.51438903808594, "loss": 0.7725, "rewards/accuracies": 1.0, "rewards/chosen": 1.0357331037521362, "rewards/margins": 0.34486013650894165, "rewards/rejected": 0.6908729672431946, "step": 11818 }, { "epoch": 1.92, "learning_rate": 1.0436573093646105e-07, "logits/chosen": -0.8138810992240906, "logits/rejected": -0.9677770733833313, "logps/chosen": -175.60903930664062, "logps/rejected": -145.01953125, "loss": 1.5703, "rewards/accuracies": 0.0, "rewards/chosen": 3.4085144996643066, "rewards/margins": -2.4152207374572754, "rewards/rejected": 5.823735237121582, "step": 11819 }, { "epoch": 1.92, "learning_rate": 1.0428538207504056e-07, "logits/chosen": -0.8402383923530579, "logits/rejected": -0.8308928608894348, "logps/chosen": -35.42150115966797, "logps/rejected": -76.66291809082031, "loss": 0.6883, "rewards/accuracies": 0.0, "rewards/chosen": 1.6723793745040894, "rewards/margins": -1.0431221723556519, "rewards/rejected": 2.715501546859741, "step": 11820 }, { "epoch": 1.92, "learning_rate": 1.0420506055375605e-07, "logits/chosen": -0.9421605467796326, "logits/rejected": -0.9065402150154114, "logps/chosen": -74.7469253540039, "logps/rejected": -94.92488098144531, "loss": 0.5775, "rewards/accuracies": 0.0, "rewards/chosen": 2.476391553878784, "rewards/margins": -0.509528398513794, "rewards/rejected": 2.985919952392578, "step": 11821 }, { "epoch": 1.92, "learning_rate": 1.0412476637815665e-07, "logits/chosen": -0.3968523442745209, "logits/rejected": -0.3968523442745209, "logps/chosen": -17.493602752685547, "logps/rejected": -17.493602752685547, "loss": 0.6908, "rewards/accuracies": 0.0, "rewards/chosen": 0.26579952239990234, "rewards/margins": 0.0, "rewards/rejected": 0.26579952239990234, "step": 11822 }, { "epoch": 1.92, "learning_rate": 1.0404449955379025e-07, "logits/chosen": -0.7155968546867371, "logits/rejected": -0.6069477200508118, "logps/chosen": -97.63665771484375, "logps/rejected": -40.7650146484375, "loss": 0.185, "rewards/accuracies": 1.0, "rewards/chosen": 1.4013893604278564, "rewards/margins": 1.1246185302734375, "rewards/rejected": 0.27677080035209656, "step": 11823 }, { "epoch": 1.92, "learning_rate": 1.0396426008620223e-07, "logits/chosen": -0.9081193208694458, "logits/rejected": -0.7309329509735107, "logps/chosen": -78.6661376953125, "logps/rejected": -50.95494079589844, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": 3.4997329711914062, "rewards/margins": 0.7074370384216309, "rewards/rejected": 2.7922959327697754, "step": 11824 }, { "epoch": 1.92, "learning_rate": 1.0388404798093664e-07, "logits/chosen": -0.5337844491004944, "logits/rejected": -0.49527597427368164, "logps/chosen": -78.64273071289062, "logps/rejected": -71.82086181640625, "loss": 0.4762, "rewards/accuracies": 1.0, "rewards/chosen": 0.6756218075752258, "rewards/margins": 0.36690598726272583, "rewards/rejected": 0.3087158203125, "step": 11825 }, { "epoch": 1.92, "learning_rate": 1.0380386324353508e-07, "logits/chosen": -0.5546508431434631, "logits/rejected": -0.3238278329372406, "logps/chosen": -64.03672790527344, "logps/rejected": -71.67375946044922, "loss": 1.155, "rewards/accuracies": 0.0, "rewards/chosen": 1.7677841186523438, "rewards/margins": -0.7040526866912842, "rewards/rejected": 2.471836805343628, "step": 11826 }, { "epoch": 1.92, "learning_rate": 1.0372370587953777e-07, "logits/chosen": -0.8774147033691406, "logits/rejected": -0.7142716646194458, "logps/chosen": -174.72781372070312, "logps/rejected": -53.24945831298828, "loss": 0.1819, "rewards/accuracies": 1.0, "rewards/chosen": 4.713934421539307, "rewards/margins": 2.587620496749878, "rewards/rejected": 2.1263139247894287, "step": 11827 }, { "epoch": 1.92, "learning_rate": 1.0364357589448292e-07, "logits/chosen": -1.1423770189285278, "logits/rejected": -1.0274134874343872, "logps/chosen": -208.28785705566406, "logps/rejected": -218.65780639648438, "loss": 1.6002, "rewards/accuracies": 0.0, "rewards/chosen": 4.137889385223389, "rewards/margins": -2.737455368041992, "rewards/rejected": 6.875344753265381, "step": 11828 }, { "epoch": 1.92, "learning_rate": 1.0356347329390646e-07, "logits/chosen": -0.5106456279754639, "logits/rejected": -0.6240020394325256, "logps/chosen": -121.0946044921875, "logps/rejected": -124.42036437988281, "loss": 0.1839, "rewards/accuracies": 1.0, "rewards/chosen": 6.1925811767578125, "rewards/margins": 1.7454571723937988, "rewards/rejected": 4.447124004364014, "step": 11829 }, { "epoch": 1.92, "learning_rate": 1.03483398083343e-07, "logits/chosen": -0.4399983882904053, "logits/rejected": -0.4326901137828827, "logps/chosen": -22.752445220947266, "logps/rejected": -21.33709716796875, "loss": 0.9373, "rewards/accuracies": 0.0, "rewards/chosen": -0.05077362060546875, "rewards/margins": -0.6022470593452454, "rewards/rejected": 0.5514734387397766, "step": 11830 }, { "epoch": 1.92, "learning_rate": 1.0340335026832475e-07, "logits/chosen": -0.9005141854286194, "logits/rejected": -0.87040776014328, "logps/chosen": -65.88744354248047, "logps/rejected": -105.24434661865234, "loss": 1.4268, "rewards/accuracies": 0.0, "rewards/chosen": 2.849428653717041, "rewards/margins": -1.9502243995666504, "rewards/rejected": 4.799653053283691, "step": 11831 }, { "epoch": 1.92, "learning_rate": 1.0332332985438247e-07, "logits/chosen": -0.6406601071357727, "logits/rejected": -0.5089893937110901, "logps/chosen": -62.764217376708984, "logps/rejected": -35.50929641723633, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 2.297584295272827, "rewards/margins": 0.9647061824798584, "rewards/rejected": 1.3328781127929688, "step": 11832 }, { "epoch": 1.92, "learning_rate": 1.0324333684704462e-07, "logits/chosen": -0.3776525557041168, "logits/rejected": -0.39243653416633606, "logps/chosen": -15.593457221984863, "logps/rejected": -24.146135330200195, "loss": 1.3282, "rewards/accuracies": 0.0, "rewards/chosen": 0.33950167894363403, "rewards/margins": -0.12142744660377502, "rewards/rejected": 0.46092912554740906, "step": 11833 }, { "epoch": 1.92, "learning_rate": 1.0316337125183816e-07, "logits/chosen": -1.015889286994934, "logits/rejected": -0.9877206683158875, "logps/chosen": -212.29901123046875, "logps/rejected": -86.76121520996094, "loss": 1.6831, "rewards/accuracies": 0.0, "rewards/chosen": 3.919865369796753, "rewards/margins": -1.635568380355835, "rewards/rejected": 5.555433750152588, "step": 11834 }, { "epoch": 1.92, "learning_rate": 1.0308343307428768e-07, "logits/chosen": -1.0404658317565918, "logits/rejected": -1.0486302375793457, "logps/chosen": -55.66459274291992, "logps/rejected": -74.47758483886719, "loss": 0.4932, "rewards/accuracies": 1.0, "rewards/chosen": 1.4975231885910034, "rewards/margins": 0.08219718933105469, "rewards/rejected": 1.4153259992599487, "step": 11835 }, { "epoch": 1.92, "learning_rate": 1.030035223199165e-07, "logits/chosen": -0.7108986377716064, "logits/rejected": -0.6675548553466797, "logps/chosen": -84.25778198242188, "logps/rejected": -79.14411926269531, "loss": 0.8519, "rewards/accuracies": 0.0, "rewards/chosen": 1.674652099609375, "rewards/margins": -0.7641313076019287, "rewards/rejected": 2.4387834072113037, "step": 11836 }, { "epoch": 1.92, "learning_rate": 1.0292363899424533e-07, "logits/chosen": -0.8455472588539124, "logits/rejected": -0.703797459602356, "logps/chosen": -114.291015625, "logps/rejected": -60.12006378173828, "loss": 2.0781, "rewards/accuracies": 1.0, "rewards/chosen": 1.5314483642578125, "rewards/margins": 1.2893803119659424, "rewards/rejected": 0.2420680969953537, "step": 11837 }, { "epoch": 1.92, "learning_rate": 1.0284378310279368e-07, "logits/chosen": -0.41256487369537354, "logits/rejected": -0.3110431730747223, "logps/chosen": -59.805145263671875, "logps/rejected": -41.025630950927734, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 1.3501914739608765, "rewards/margins": 0.2651599645614624, "rewards/rejected": 1.085031509399414, "step": 11838 }, { "epoch": 1.92, "learning_rate": 1.0276395465107857e-07, "logits/chosen": -0.6943289637565613, "logits/rejected": -0.6513333320617676, "logps/chosen": -110.42837524414062, "logps/rejected": -80.69368743896484, "loss": 0.4895, "rewards/accuracies": 1.0, "rewards/chosen": 4.600654602050781, "rewards/margins": 2.70859432220459, "rewards/rejected": 1.8920601606369019, "step": 11839 }, { "epoch": 1.92, "learning_rate": 1.0268415364461564e-07, "logits/chosen": -0.5568723678588867, "logits/rejected": -0.5475800037384033, "logps/chosen": -1.31138277053833, "logps/rejected": -19.79009437561035, "loss": 0.378, "rewards/accuracies": 1.0, "rewards/chosen": 0.40571996569633484, "rewards/margins": 0.2678018808364868, "rewards/rejected": 0.13791809976100922, "step": 11840 }, { "epoch": 1.92, "learning_rate": 1.0260438008891814e-07, "logits/chosen": -0.48266586661338806, "logits/rejected": -0.5166873335838318, "logps/chosen": -44.8571662902832, "logps/rejected": -36.74310302734375, "loss": 0.581, "rewards/accuracies": 0.0, "rewards/chosen": -0.26825904846191406, "rewards/margins": -0.7635796070098877, "rewards/rejected": 0.49532052874565125, "step": 11841 }, { "epoch": 1.92, "learning_rate": 1.025246339894979e-07, "logits/chosen": -0.7171037197113037, "logits/rejected": -0.7171037197113037, "logps/chosen": -29.108543395996094, "logps/rejected": -29.108543395996094, "loss": 0.8346, "rewards/accuracies": 0.0, "rewards/chosen": 1.7619774341583252, "rewards/margins": 0.0, "rewards/rejected": 1.7619774341583252, "step": 11842 }, { "epoch": 1.92, "learning_rate": 1.0244491535186434e-07, "logits/chosen": -0.9553849101066589, "logits/rejected": -0.861169159412384, "logps/chosen": -86.94593811035156, "logps/rejected": -118.36469268798828, "loss": 0.9379, "rewards/accuracies": 1.0, "rewards/chosen": 2.191410779953003, "rewards/margins": 0.5178000926971436, "rewards/rejected": 1.6736106872558594, "step": 11843 }, { "epoch": 1.92, "learning_rate": 1.023652241815256e-07, "logits/chosen": -0.6433330774307251, "logits/rejected": -0.73888099193573, "logps/chosen": -142.39410400390625, "logps/rejected": -140.60604858398438, "loss": 1.3674, "rewards/accuracies": 0.0, "rewards/chosen": 0.4443679749965668, "rewards/margins": -2.4213106632232666, "rewards/rejected": 2.865678548812866, "step": 11844 }, { "epoch": 1.92, "learning_rate": 1.0228556048398729e-07, "logits/chosen": -0.4696301221847534, "logits/rejected": -0.5034607648849487, "logps/chosen": -73.75344848632812, "logps/rejected": -65.25161743164062, "loss": 0.9844, "rewards/accuracies": 0.0, "rewards/chosen": 0.9864959716796875, "rewards/margins": -0.49175572395324707, "rewards/rejected": 1.4782516956329346, "step": 11845 }, { "epoch": 1.92, "learning_rate": 1.0220592426475366e-07, "logits/chosen": -0.49414169788360596, "logits/rejected": -0.39411622285842896, "logps/chosen": -50.46076202392578, "logps/rejected": -5.95298957824707, "loss": 0.8281, "rewards/accuracies": 1.0, "rewards/chosen": 1.6914390325546265, "rewards/margins": 0.9698256254196167, "rewards/rejected": 0.7216134071350098, "step": 11846 }, { "epoch": 1.92, "learning_rate": 1.0212631552932655e-07, "logits/chosen": -0.2858220636844635, "logits/rejected": -0.2854364514350891, "logps/chosen": -16.05170249938965, "logps/rejected": -8.887167930603027, "loss": 0.4178, "rewards/accuracies": 0.0, "rewards/chosen": 0.2756376266479492, "rewards/margins": -0.0772404670715332, "rewards/rejected": 0.3528780937194824, "step": 11847 }, { "epoch": 1.92, "learning_rate": 1.0204673428320649e-07, "logits/chosen": -0.6355224251747131, "logits/rejected": -0.6355224251747131, "logps/chosen": -22.32927703857422, "logps/rejected": -22.32927703857422, "loss": 0.8472, "rewards/accuracies": 0.0, "rewards/chosen": -0.04314766079187393, "rewards/margins": 0.0, "rewards/rejected": -0.04314766079187393, "step": 11848 }, { "epoch": 1.92, "learning_rate": 1.0196718053189146e-07, "logits/chosen": -0.7877177000045776, "logits/rejected": -0.7193931341171265, "logps/chosen": -60.302085876464844, "logps/rejected": -55.036773681640625, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 1.6380332708358765, "rewards/margins": 0.06329262256622314, "rewards/rejected": 1.5747406482696533, "step": 11849 }, { "epoch": 1.92, "learning_rate": 1.0188765428087814e-07, "logits/chosen": -0.979697048664093, "logits/rejected": -0.9193154573440552, "logps/chosen": -84.81571960449219, "logps/rejected": -59.660953521728516, "loss": 0.9767, "rewards/accuracies": 1.0, "rewards/chosen": 2.0295753479003906, "rewards/margins": 1.561661958694458, "rewards/rejected": 0.4679134488105774, "step": 11850 }, { "epoch": 1.92, "learning_rate": 1.0180815553566086e-07, "logits/chosen": -0.8662232160568237, "logits/rejected": -0.8275941610336304, "logps/chosen": -103.3680419921875, "logps/rejected": -157.6741485595703, "loss": 2.4154, "rewards/accuracies": 0.0, "rewards/chosen": 4.660090923309326, "rewards/margins": -2.840013027191162, "rewards/rejected": 7.500103950500488, "step": 11851 }, { "epoch": 1.92, "learning_rate": 1.0172868430173242e-07, "logits/chosen": -0.291764497756958, "logits/rejected": -0.3020505905151367, "logps/chosen": -5.047194004058838, "logps/rejected": -25.13534927368164, "loss": 0.9208, "rewards/accuracies": 1.0, "rewards/chosen": 0.3322133719921112, "rewards/margins": 0.27397361397743225, "rewards/rejected": 0.05823974683880806, "step": 11852 }, { "epoch": 1.92, "learning_rate": 1.0164924058458329e-07, "logits/chosen": -0.5907755494117737, "logits/rejected": -0.5980142951011658, "logps/chosen": -84.93387603759766, "logps/rejected": -27.702096939086914, "loss": 0.5609, "rewards/accuracies": 1.0, "rewards/chosen": 1.0674537420272827, "rewards/margins": 0.7614461183547974, "rewards/rejected": 0.30600759387016296, "step": 11853 }, { "epoch": 1.92, "learning_rate": 1.0156982438970252e-07, "logits/chosen": -0.3574458956718445, "logits/rejected": -0.3342823088169098, "logps/chosen": -94.58277130126953, "logps/rejected": -58.736351013183594, "loss": 1.3451, "rewards/accuracies": 1.0, "rewards/chosen": 2.06754469871521, "rewards/margins": 1.0113290548324585, "rewards/rejected": 1.0562156438827515, "step": 11854 }, { "epoch": 1.92, "learning_rate": 1.0149043572257676e-07, "logits/chosen": -0.7633605599403381, "logits/rejected": -0.5318154096603394, "logps/chosen": -102.65315246582031, "logps/rejected": -85.67636108398438, "loss": 0.4155, "rewards/accuracies": 1.0, "rewards/chosen": 5.49435567855835, "rewards/margins": 3.053898572921753, "rewards/rejected": 2.4404571056365967, "step": 11855 }, { "epoch": 1.92, "learning_rate": 1.014110745886913e-07, "logits/chosen": -0.8674496412277222, "logits/rejected": -0.6418519616127014, "logps/chosen": -59.09598159790039, "logps/rejected": -24.04283905029297, "loss": 0.5771, "rewards/accuracies": 1.0, "rewards/chosen": 1.9502743482589722, "rewards/margins": 1.833890438079834, "rewards/rejected": 0.11638393253087997, "step": 11856 }, { "epoch": 1.92, "learning_rate": 1.0133174099352898e-07, "logits/chosen": -0.7763106822967529, "logits/rejected": -0.881101131439209, "logps/chosen": -45.40723419189453, "logps/rejected": -146.77920532226562, "loss": 2.0393, "rewards/accuracies": 0.0, "rewards/chosen": 1.0719448328018188, "rewards/margins": -3.786775588989258, "rewards/rejected": 4.858720302581787, "step": 11857 }, { "epoch": 1.92, "learning_rate": 1.012524349425713e-07, "logits/chosen": -0.9163644909858704, "logits/rejected": -0.8880410194396973, "logps/chosen": -65.67498779296875, "logps/rejected": -111.27471160888672, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": 1.1630477905273438, "rewards/margins": 0.10530698299407959, "rewards/rejected": 1.0577408075332642, "step": 11858 }, { "epoch": 1.92, "learning_rate": 1.011731564412972e-07, "logits/chosen": -1.2265126705169678, "logits/rejected": -1.1850155591964722, "logps/chosen": -128.50131225585938, "logps/rejected": -113.47776794433594, "loss": 1.1603, "rewards/accuracies": 0.0, "rewards/chosen": 4.754138469696045, "rewards/margins": -2.2150955200195312, "rewards/rejected": 6.969233989715576, "step": 11859 }, { "epoch": 1.93, "learning_rate": 1.0109390549518437e-07, "logits/chosen": -1.2182457447052002, "logits/rejected": -1.072607159614563, "logps/chosen": -109.77118682861328, "logps/rejected": -104.912353515625, "loss": 0.0334, "rewards/accuracies": 1.0, "rewards/chosen": 7.697017669677734, "rewards/margins": 5.242146968841553, "rewards/rejected": 2.4548707008361816, "step": 11860 }, { "epoch": 1.93, "learning_rate": 1.010146821097081e-07, "logits/chosen": -0.5809135437011719, "logits/rejected": -0.5809135437011719, "logps/chosen": -41.084251403808594, "logps/rejected": -41.084251403808594, "loss": 0.5324, "rewards/accuracies": 0.0, "rewards/chosen": 1.399317979812622, "rewards/margins": 0.0, "rewards/rejected": 1.399317979812622, "step": 11861 }, { "epoch": 1.93, "learning_rate": 1.0093548629034215e-07, "logits/chosen": -0.8827493786811829, "logits/rejected": -0.790905773639679, "logps/chosen": -146.99856567382812, "logps/rejected": -121.25166320800781, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 5.525622844696045, "rewards/margins": 0.41582679748535156, "rewards/rejected": 5.109796047210693, "step": 11862 }, { "epoch": 1.93, "learning_rate": 1.0085631804255801e-07, "logits/chosen": -0.791763186454773, "logits/rejected": -0.8566627502441406, "logps/chosen": -77.29714965820312, "logps/rejected": -60.280399322509766, "loss": 1.2021, "rewards/accuracies": 0.0, "rewards/chosen": 1.166455864906311, "rewards/margins": -1.224765419960022, "rewards/rejected": 2.391221284866333, "step": 11863 }, { "epoch": 1.93, "learning_rate": 1.0077717737182556e-07, "logits/chosen": -0.4367193579673767, "logits/rejected": -0.37564876675605774, "logps/chosen": -35.61735534667969, "logps/rejected": -27.92607307434082, "loss": 0.5766, "rewards/accuracies": 0.0, "rewards/chosen": 1.211862564086914, "rewards/margins": -0.693333625793457, "rewards/rejected": 1.905196189880371, "step": 11864 }, { "epoch": 1.93, "learning_rate": 1.0069806428361277e-07, "logits/chosen": -0.7270861864089966, "logits/rejected": -0.7270861864089966, "logps/chosen": -50.89435577392578, "logps/rejected": -50.89435577392578, "loss": 0.7239, "rewards/accuracies": 0.0, "rewards/chosen": 1.4653160572052002, "rewards/margins": 0.0, "rewards/rejected": 1.4653160572052002, "step": 11865 }, { "epoch": 1.93, "learning_rate": 1.0061897878338543e-07, "logits/chosen": -0.8477105498313904, "logits/rejected": -0.5960924625396729, "logps/chosen": -153.31716918945312, "logps/rejected": -58.20973205566406, "loss": 0.0339, "rewards/accuracies": 1.0, "rewards/chosen": 7.984870910644531, "rewards/margins": 4.884821891784668, "rewards/rejected": 3.100048780441284, "step": 11866 }, { "epoch": 1.93, "learning_rate": 1.0053992087660779e-07, "logits/chosen": -0.2504301965236664, "logits/rejected": -0.25210675597190857, "logps/chosen": -1.7970538139343262, "logps/rejected": -2.2511496543884277, "loss": 0.3902, "rewards/accuracies": 0.0, "rewards/chosen": 0.30266615748405457, "rewards/margins": -0.06116780638694763, "rewards/rejected": 0.3638339638710022, "step": 11867 }, { "epoch": 1.93, "learning_rate": 1.0046089056874174e-07, "logits/chosen": -0.7379550337791443, "logits/rejected": -0.7102826833724976, "logps/chosen": -21.110715866088867, "logps/rejected": -3.322009563446045, "loss": 0.8355, "rewards/accuracies": 1.0, "rewards/chosen": 0.9432073831558228, "rewards/margins": 0.3414384722709656, "rewards/rejected": 0.6017689108848572, "step": 11868 }, { "epoch": 1.93, "learning_rate": 1.0038188786524781e-07, "logits/chosen": -0.42056116461753845, "logits/rejected": -0.4221080541610718, "logps/chosen": -60.30681228637695, "logps/rejected": -61.261138916015625, "loss": 0.5828, "rewards/accuracies": 1.0, "rewards/chosen": 2.680840015411377, "rewards/margins": 0.06053662300109863, "rewards/rejected": 2.6203033924102783, "step": 11869 }, { "epoch": 1.93, "learning_rate": 1.0030291277158409e-07, "logits/chosen": -0.2579195499420166, "logits/rejected": -0.2579195499420166, "logps/chosen": -38.29604721069336, "logps/rejected": -38.29604721069336, "loss": 0.4119, "rewards/accuracies": 0.0, "rewards/chosen": 1.242491602897644, "rewards/margins": 0.0, "rewards/rejected": 1.242491602897644, "step": 11870 }, { "epoch": 1.93, "learning_rate": 1.0022396529320726e-07, "logits/chosen": -0.6754467487335205, "logits/rejected": -0.7063542604446411, "logps/chosen": -57.622108459472656, "logps/rejected": -131.68399047851562, "loss": 1.0738, "rewards/accuracies": 0.0, "rewards/chosen": 2.9818687438964844, "rewards/margins": -0.032438039779663086, "rewards/rejected": 3.0143067836761475, "step": 11871 }, { "epoch": 1.93, "learning_rate": 1.0014504543557156e-07, "logits/chosen": -0.7991854548454285, "logits/rejected": -0.7236083149909973, "logps/chosen": -88.48081970214844, "logps/rejected": -46.83685302734375, "loss": 0.3621, "rewards/accuracies": 0.0, "rewards/chosen": 1.8454437255859375, "rewards/margins": -0.01299893856048584, "rewards/rejected": 1.8584426641464233, "step": 11872 }, { "epoch": 1.93, "learning_rate": 1.0006615320412993e-07, "logits/chosen": -0.43318989872932434, "logits/rejected": -0.43318989872932434, "logps/chosen": -3.719729423522949, "logps/rejected": -3.719729423522949, "loss": 0.4946, "rewards/accuracies": 0.0, "rewards/chosen": 0.499526709318161, "rewards/margins": 0.0, "rewards/rejected": 0.499526709318161, "step": 11873 }, { "epoch": 1.93, "learning_rate": 9.998728860433275e-08, "logits/chosen": -0.5861944556236267, "logits/rejected": -0.5894675850868225, "logps/chosen": -125.80912780761719, "logps/rejected": -127.02789306640625, "loss": 1.5991, "rewards/accuracies": 1.0, "rewards/chosen": 4.993770122528076, "rewards/margins": 1.121403694152832, "rewards/rejected": 3.872366428375244, "step": 11874 }, { "epoch": 1.93, "learning_rate": 9.990845164162909e-08, "logits/chosen": -0.44910845160484314, "logits/rejected": -0.44910845160484314, "logps/chosen": -63.38878631591797, "logps/rejected": -63.38878631591797, "loss": 0.3932, "rewards/accuracies": 0.0, "rewards/chosen": 1.0563697814941406, "rewards/margins": 0.0, "rewards/rejected": 1.0563697814941406, "step": 11875 }, { "epoch": 1.93, "learning_rate": 9.982964232146563e-08, "logits/chosen": -0.8978087306022644, "logits/rejected": -0.8166835308074951, "logps/chosen": -103.28069305419922, "logps/rejected": -55.26805877685547, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 3.9860939979553223, "rewards/margins": 3.2866108417510986, "rewards/rejected": 0.6994830965995789, "step": 11876 }, { "epoch": 1.93, "learning_rate": 9.975086064928751e-08, "logits/chosen": -0.4893343150615692, "logits/rejected": -0.5439659953117371, "logps/chosen": -84.7047348022461, "logps/rejected": -80.963134765625, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 3.232123613357544, "rewards/margins": 0.841376543045044, "rewards/rejected": 2.3907470703125, "step": 11877 }, { "epoch": 1.93, "learning_rate": 9.967210663053766e-08, "logits/chosen": -1.0106918811798096, "logits/rejected": -1.016951560974121, "logps/chosen": -89.80010986328125, "logps/rejected": -94.55351257324219, "loss": 0.3876, "rewards/accuracies": 1.0, "rewards/chosen": 2.4868485927581787, "rewards/margins": 0.0020875930786132812, "rewards/rejected": 2.4847609996795654, "step": 11878 }, { "epoch": 1.93, "learning_rate": 9.959338027065739e-08, "logits/chosen": -0.6016711592674255, "logits/rejected": -0.6324681043624878, "logps/chosen": -114.40158081054688, "logps/rejected": -100.43733215332031, "loss": 0.0622, "rewards/accuracies": 1.0, "rewards/chosen": 2.6189498901367188, "rewards/margins": 2.3490989208221436, "rewards/rejected": 0.2698509395122528, "step": 11879 }, { "epoch": 1.93, "learning_rate": 9.951468157508575e-08, "logits/chosen": -0.6252140402793884, "logits/rejected": -0.6252140402793884, "logps/chosen": -90.45938110351562, "logps/rejected": -90.45938110351562, "loss": 0.7812, "rewards/accuracies": 0.0, "rewards/chosen": 1.9001435041427612, "rewards/margins": 0.0, "rewards/rejected": 1.9001435041427612, "step": 11880 }, { "epoch": 1.93, "learning_rate": 9.943601054926026e-08, "logits/chosen": -0.7788110971450806, "logits/rejected": -0.7062123417854309, "logps/chosen": -106.34819030761719, "logps/rejected": -141.21817016601562, "loss": 0.043, "rewards/accuracies": 1.0, "rewards/chosen": 5.6654558181762695, "rewards/margins": 2.6328752040863037, "rewards/rejected": 3.032580614089966, "step": 11881 }, { "epoch": 1.93, "learning_rate": 9.93573671986162e-08, "logits/chosen": -0.5541847944259644, "logits/rejected": -0.5551825761795044, "logps/chosen": -11.547545433044434, "logps/rejected": -22.238216400146484, "loss": 3.2732, "rewards/accuracies": 1.0, "rewards/chosen": 0.34928950667381287, "rewards/margins": 0.30728110671043396, "rewards/rejected": 0.042008399963378906, "step": 11882 }, { "epoch": 1.93, "learning_rate": 9.927875152858728e-08, "logits/chosen": -0.6144027709960938, "logits/rejected": -0.6370322704315186, "logps/chosen": -106.48461151123047, "logps/rejected": -126.54627227783203, "loss": 1.1001, "rewards/accuracies": 0.0, "rewards/chosen": 1.235845923423767, "rewards/margins": -2.0292840003967285, "rewards/rejected": 3.265129804611206, "step": 11883 }, { "epoch": 1.93, "learning_rate": 9.920016354460481e-08, "logits/chosen": -1.0581251382827759, "logits/rejected": -1.0379313230514526, "logps/chosen": -92.96092224121094, "logps/rejected": -80.77239990234375, "loss": 1.2586, "rewards/accuracies": 0.0, "rewards/chosen": 1.982934594154358, "rewards/margins": -1.3527497053146362, "rewards/rejected": 3.335684299468994, "step": 11884 }, { "epoch": 1.93, "learning_rate": 9.912160325209878e-08, "logits/chosen": -0.6464769244194031, "logits/rejected": -0.5752538442611694, "logps/chosen": -65.13528442382812, "logps/rejected": -75.233154296875, "loss": 0.5131, "rewards/accuracies": 1.0, "rewards/chosen": 2.3252274990081787, "rewards/margins": 0.55069899559021, "rewards/rejected": 1.7745285034179688, "step": 11885 }, { "epoch": 1.93, "learning_rate": 9.90430706564967e-08, "logits/chosen": -0.4506714940071106, "logits/rejected": -0.4350724220275879, "logps/chosen": -47.30963897705078, "logps/rejected": -42.450260162353516, "loss": 1.007, "rewards/accuracies": 1.0, "rewards/chosen": 1.8659813404083252, "rewards/margins": 0.12297821044921875, "rewards/rejected": 1.7430031299591064, "step": 11886 }, { "epoch": 1.93, "learning_rate": 9.896456576322471e-08, "logits/chosen": -0.4461343288421631, "logits/rejected": -0.4634883999824524, "logps/chosen": -0.9837449193000793, "logps/rejected": -34.49625015258789, "loss": 0.5718, "rewards/accuracies": 0.0, "rewards/chosen": 0.2940196692943573, "rewards/margins": -0.034056514501571655, "rewards/rejected": 0.32807618379592896, "step": 11887 }, { "epoch": 1.93, "learning_rate": 9.888608857770641e-08, "logits/chosen": -0.44383326172828674, "logits/rejected": -0.4458589553833008, "logps/chosen": -27.9040470123291, "logps/rejected": -41.29187774658203, "loss": 0.4223, "rewards/accuracies": 0.0, "rewards/chosen": 0.5647943615913391, "rewards/margins": -0.07999897003173828, "rewards/rejected": 0.6447933316230774, "step": 11888 }, { "epoch": 1.93, "learning_rate": 9.880763910536416e-08, "logits/chosen": -1.2230342626571655, "logits/rejected": -1.115659475326538, "logps/chosen": -101.77345275878906, "logps/rejected": -27.033323287963867, "loss": 0.7057, "rewards/accuracies": 1.0, "rewards/chosen": 4.2598466873168945, "rewards/margins": 3.9276766777038574, "rewards/rejected": 0.3321700990200043, "step": 11889 }, { "epoch": 1.93, "learning_rate": 9.872921735161777e-08, "logits/chosen": -0.862468421459198, "logits/rejected": -0.8267547488212585, "logps/chosen": -88.64604187011719, "logps/rejected": -251.03346252441406, "loss": 1.4319, "rewards/accuracies": 0.0, "rewards/chosen": 1.3587067127227783, "rewards/margins": -2.798701524734497, "rewards/rejected": 4.157408237457275, "step": 11890 }, { "epoch": 1.93, "learning_rate": 9.865082332188579e-08, "logits/chosen": -0.8266473412513733, "logits/rejected": -0.7877947092056274, "logps/chosen": -128.683349609375, "logps/rejected": -125.06253051757812, "loss": 1.1987, "rewards/accuracies": 1.0, "rewards/chosen": 2.361315965652466, "rewards/margins": 0.6643158197402954, "rewards/rejected": 1.6970001459121704, "step": 11891 }, { "epoch": 1.93, "learning_rate": 9.857245702158413e-08, "logits/chosen": -0.555243968963623, "logits/rejected": -0.7593784332275391, "logps/chosen": -131.31500244140625, "logps/rejected": -54.29408264160156, "loss": 0.5718, "rewards/accuracies": 1.0, "rewards/chosen": 3.9916152954101562, "rewards/margins": 1.4672348499298096, "rewards/rejected": 2.5243804454803467, "step": 11892 }, { "epoch": 1.93, "learning_rate": 9.849411845612748e-08, "logits/chosen": -0.7994118332862854, "logits/rejected": -0.8156381249427795, "logps/chosen": -57.38154220581055, "logps/rejected": -68.94575500488281, "loss": 0.4168, "rewards/accuracies": 1.0, "rewards/chosen": 0.9830555319786072, "rewards/margins": 0.3699604272842407, "rewards/rejected": 0.6130951046943665, "step": 11893 }, { "epoch": 1.93, "learning_rate": 9.841580763092811e-08, "logits/chosen": -0.6907013058662415, "logits/rejected": -0.6694464087486267, "logps/chosen": -56.67877960205078, "logps/rejected": -69.89521789550781, "loss": 0.4313, "rewards/accuracies": 0.0, "rewards/chosen": 1.5770081281661987, "rewards/margins": -0.2226271629333496, "rewards/rejected": 1.7996352910995483, "step": 11894 }, { "epoch": 1.93, "learning_rate": 9.833752455139666e-08, "logits/chosen": -0.6297916769981384, "logits/rejected": -0.6674911379814148, "logps/chosen": -47.68691635131836, "logps/rejected": -114.18025207519531, "loss": 1.6846, "rewards/accuracies": 0.0, "rewards/chosen": 1.697055459022522, "rewards/margins": -3.320744037628174, "rewards/rejected": 5.017799377441406, "step": 11895 }, { "epoch": 1.93, "learning_rate": 9.825926922294159e-08, "logits/chosen": -0.5867525339126587, "logits/rejected": -0.35034045577049255, "logps/chosen": -66.18177032470703, "logps/rejected": -51.14423751831055, "loss": 0.6225, "rewards/accuracies": 0.0, "rewards/chosen": 0.47916489839553833, "rewards/margins": -0.8281665444374084, "rewards/rejected": 1.3073314428329468, "step": 11896 }, { "epoch": 1.93, "learning_rate": 9.818104165096985e-08, "logits/chosen": -1.3083009719848633, "logits/rejected": -1.2556042671203613, "logps/chosen": -162.11044311523438, "logps/rejected": -45.039363861083984, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 2.972303867340088, "rewards/margins": 1.6960235834121704, "rewards/rejected": 1.2762802839279175, "step": 11897 }, { "epoch": 1.93, "learning_rate": 9.81028418408859e-08, "logits/chosen": -0.7337427735328674, "logits/rejected": -0.7224288582801819, "logps/chosen": -39.68677520751953, "logps/rejected": -19.808927536010742, "loss": 0.6351, "rewards/accuracies": 1.0, "rewards/chosen": 0.5835922360420227, "rewards/margins": 0.20254269242286682, "rewards/rejected": 0.3810495436191559, "step": 11898 }, { "epoch": 1.93, "learning_rate": 9.802466979809288e-08, "logits/chosen": -1.0038377046585083, "logits/rejected": -0.8615419864654541, "logps/chosen": -134.46678161621094, "logps/rejected": -23.18157196044922, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 6.165641784667969, "rewards/margins": 5.800760269165039, "rewards/rejected": 0.36488133668899536, "step": 11899 }, { "epoch": 1.93, "learning_rate": 9.794652552799171e-08, "logits/chosen": -1.010209560394287, "logits/rejected": -0.9442483186721802, "logps/chosen": -90.09456634521484, "logps/rejected": -103.49917602539062, "loss": 0.4578, "rewards/accuracies": 0.0, "rewards/chosen": 2.465325117111206, "rewards/margins": -0.37012577056884766, "rewards/rejected": 2.8354508876800537, "step": 11900 }, { "epoch": 1.93, "learning_rate": 9.786840903598126e-08, "logits/chosen": -0.7363753914833069, "logits/rejected": -0.6577917337417603, "logps/chosen": -125.20777893066406, "logps/rejected": -68.46290588378906, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 5.434822082519531, "rewards/margins": 2.8650214672088623, "rewards/rejected": 2.569800615310669, "step": 11901 }, { "epoch": 1.93, "learning_rate": 9.779032032745887e-08, "logits/chosen": -0.9049047827720642, "logits/rejected": -0.9108567237854004, "logps/chosen": -171.57296752929688, "logps/rejected": -120.75347137451172, "loss": 1.1594, "rewards/accuracies": 0.0, "rewards/chosen": 4.120144844055176, "rewards/margins": -0.9729437828063965, "rewards/rejected": 5.093088626861572, "step": 11902 }, { "epoch": 1.93, "learning_rate": 9.771225940781947e-08, "logits/chosen": -0.8909656405448914, "logits/rejected": -0.9363327026367188, "logps/chosen": -97.16532897949219, "logps/rejected": -51.39718246459961, "loss": 1.042, "rewards/accuracies": 0.0, "rewards/chosen": 2.290058135986328, "rewards/margins": -0.5064747333526611, "rewards/rejected": 2.7965328693389893, "step": 11903 }, { "epoch": 1.93, "learning_rate": 9.76342262824566e-08, "logits/chosen": -0.811745285987854, "logits/rejected": -0.7413957118988037, "logps/chosen": -84.26152038574219, "logps/rejected": -46.41509246826172, "loss": 0.9269, "rewards/accuracies": 0.0, "rewards/chosen": 0.498025506734848, "rewards/margins": -0.9272720813751221, "rewards/rejected": 1.4252976179122925, "step": 11904 }, { "epoch": 1.93, "learning_rate": 9.755622095676136e-08, "logits/chosen": -0.9452055096626282, "logits/rejected": -0.9452055096626282, "logps/chosen": -84.73765563964844, "logps/rejected": -84.73765563964844, "loss": 2.415, "rewards/accuracies": 0.0, "rewards/chosen": 2.4509079456329346, "rewards/margins": 0.0, "rewards/rejected": 2.4509079456329346, "step": 11905 }, { "epoch": 1.93, "learning_rate": 9.747824343612337e-08, "logits/chosen": -0.8939543962478638, "logits/rejected": -0.9002323746681213, "logps/chosen": -102.02661895751953, "logps/rejected": -59.81986618041992, "loss": 0.7568, "rewards/accuracies": 0.0, "rewards/chosen": 0.6212882995605469, "rewards/margins": -0.6798046827316284, "rewards/rejected": 1.3010929822921753, "step": 11906 }, { "epoch": 1.93, "learning_rate": 9.740029372593e-08, "logits/chosen": -0.6703159809112549, "logits/rejected": -0.5010877251625061, "logps/chosen": -123.05496978759766, "logps/rejected": -88.85810089111328, "loss": 0.5236, "rewards/accuracies": 1.0, "rewards/chosen": 1.7279350757598877, "rewards/margins": 0.23483121395111084, "rewards/rejected": 1.4931038618087769, "step": 11907 }, { "epoch": 1.93, "learning_rate": 9.7322371831567e-08, "logits/chosen": -0.8361826539039612, "logits/rejected": -0.7818004488945007, "logps/chosen": -68.41267395019531, "logps/rejected": -82.40296936035156, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 1.4390159845352173, "rewards/margins": -0.9989098310470581, "rewards/rejected": 2.4379258155822754, "step": 11908 }, { "epoch": 1.93, "learning_rate": 9.724447775841782e-08, "logits/chosen": -0.6954197287559509, "logits/rejected": -0.6508256793022156, "logps/chosen": -74.76824188232422, "logps/rejected": -100.31193542480469, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": 1.6701980829238892, "rewards/margins": 0.42090916633605957, "rewards/rejected": 1.2492889165878296, "step": 11909 }, { "epoch": 1.93, "learning_rate": 9.71666115118644e-08, "logits/chosen": -0.9127214550971985, "logits/rejected": -0.8747753500938416, "logps/chosen": -72.86428833007812, "logps/rejected": -74.44758605957031, "loss": 1.0309, "rewards/accuracies": 0.0, "rewards/chosen": 1.455910563468933, "rewards/margins": -1.7699462175369263, "rewards/rejected": 3.2258567810058594, "step": 11910 }, { "epoch": 1.93, "learning_rate": 9.708877309728635e-08, "logits/chosen": -0.453259140253067, "logits/rejected": -0.46987709403038025, "logps/chosen": -4.593564510345459, "logps/rejected": -36.43778991699219, "loss": 0.4253, "rewards/accuracies": 0.0, "rewards/chosen": 0.13065095245838165, "rewards/margins": -0.136565163731575, "rewards/rejected": 0.26721611618995667, "step": 11911 }, { "epoch": 1.93, "learning_rate": 9.701096252006191e-08, "logits/chosen": -0.8685910701751709, "logits/rejected": -0.8406307697296143, "logps/chosen": -12.28861141204834, "logps/rejected": -18.398967742919922, "loss": 0.4814, "rewards/accuracies": 1.0, "rewards/chosen": 1.6173813343048096, "rewards/margins": 0.2077697515487671, "rewards/rejected": 1.4096115827560425, "step": 11912 }, { "epoch": 1.93, "learning_rate": 9.693317978556664e-08, "logits/chosen": -0.5972754955291748, "logits/rejected": -0.6508347988128662, "logps/chosen": -57.7725715637207, "logps/rejected": -94.75675964355469, "loss": 0.3914, "rewards/accuracies": 0.0, "rewards/chosen": 1.7147899866104126, "rewards/margins": -0.16644024848937988, "rewards/rejected": 1.8812302350997925, "step": 11913 }, { "epoch": 1.93, "learning_rate": 9.685542489917492e-08, "logits/chosen": -0.6374155282974243, "logits/rejected": -0.6307255625724792, "logps/chosen": -59.026588439941406, "logps/rejected": -54.56800079345703, "loss": 0.8165, "rewards/accuracies": 1.0, "rewards/chosen": 2.600543975830078, "rewards/margins": 0.6089431047439575, "rewards/rejected": 1.9916008710861206, "step": 11914 }, { "epoch": 1.93, "learning_rate": 9.677769786625867e-08, "logits/chosen": -0.5353924632072449, "logits/rejected": -0.5353924632072449, "logps/chosen": -48.9156494140625, "logps/rejected": -48.9156494140625, "loss": 1.2994, "rewards/accuracies": 0.0, "rewards/chosen": 2.1502976417541504, "rewards/margins": 0.0, "rewards/rejected": 2.1502976417541504, "step": 11915 }, { "epoch": 1.93, "learning_rate": 9.669999869218826e-08, "logits/chosen": -0.9574795365333557, "logits/rejected": -1.0183789730072021, "logps/chosen": -74.58575439453125, "logps/rejected": -51.26934814453125, "loss": 0.7032, "rewards/accuracies": 1.0, "rewards/chosen": 2.091524600982666, "rewards/margins": 0.31855249404907227, "rewards/rejected": 1.7729721069335938, "step": 11916 }, { "epoch": 1.93, "learning_rate": 9.662232738233178e-08, "logits/chosen": -0.5021554827690125, "logits/rejected": -0.5257421135902405, "logps/chosen": -69.9825439453125, "logps/rejected": -88.416015625, "loss": 0.5921, "rewards/accuracies": 1.0, "rewards/chosen": 1.822302222251892, "rewards/margins": 0.030654072761535645, "rewards/rejected": 1.7916481494903564, "step": 11917 }, { "epoch": 1.93, "learning_rate": 9.654468394205579e-08, "logits/chosen": -1.1151293516159058, "logits/rejected": -1.0259392261505127, "logps/chosen": -111.67720031738281, "logps/rejected": -54.195701599121094, "loss": 0.7781, "rewards/accuracies": 1.0, "rewards/chosen": 1.140541911125183, "rewards/margins": 0.14049077033996582, "rewards/rejected": 1.0000511407852173, "step": 11918 }, { "epoch": 1.93, "learning_rate": 9.646706837672448e-08, "logits/chosen": -0.7457703948020935, "logits/rejected": -0.7111895680427551, "logps/chosen": -38.07117462158203, "logps/rejected": -32.106868743896484, "loss": 1.4876, "rewards/accuracies": 1.0, "rewards/chosen": 1.6770095825195312, "rewards/margins": 0.441969633102417, "rewards/rejected": 1.2350399494171143, "step": 11919 }, { "epoch": 1.93, "learning_rate": 9.63894806917006e-08, "logits/chosen": -0.5532790422439575, "logits/rejected": -0.4802663028240204, "logps/chosen": -153.37979125976562, "logps/rejected": -48.91161346435547, "loss": 0.322, "rewards/accuracies": 1.0, "rewards/chosen": 4.407635688781738, "rewards/margins": 1.535688877105713, "rewards/rejected": 2.8719468116760254, "step": 11920 }, { "epoch": 1.93, "learning_rate": 9.631192089234463e-08, "logits/chosen": -0.4592054784297943, "logits/rejected": -0.4230583608150482, "logps/chosen": -80.50164031982422, "logps/rejected": -39.04666519165039, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": 1.2069625854492188, "rewards/margins": 0.8298381567001343, "rewards/rejected": 0.3771243989467621, "step": 11921 }, { "epoch": 1.94, "learning_rate": 9.62343889840151e-08, "logits/chosen": -0.7028193473815918, "logits/rejected": -0.6674145460128784, "logps/chosen": -103.58439636230469, "logps/rejected": -61.21187210083008, "loss": 0.922, "rewards/accuracies": 0.0, "rewards/chosen": 1.8793495893478394, "rewards/margins": -0.5804814100265503, "rewards/rejected": 2.4598309993743896, "step": 11922 }, { "epoch": 1.94, "learning_rate": 9.615688497206891e-08, "logits/chosen": -0.7144103050231934, "logits/rejected": -0.5743896961212158, "logps/chosen": -38.693878173828125, "logps/rejected": -19.778064727783203, "loss": 0.6172, "rewards/accuracies": 1.0, "rewards/chosen": 2.0925705432891846, "rewards/margins": 2.0002448558807373, "rewards/rejected": 0.09232578426599503, "step": 11923 }, { "epoch": 1.94, "learning_rate": 9.607940886186061e-08, "logits/chosen": -1.0410935878753662, "logits/rejected": -1.042447805404663, "logps/chosen": -42.02947235107422, "logps/rejected": -38.79634475708008, "loss": 0.601, "rewards/accuracies": 1.0, "rewards/chosen": 1.8937256336212158, "rewards/margins": 0.4101825952529907, "rewards/rejected": 1.483543038368225, "step": 11924 }, { "epoch": 1.94, "learning_rate": 9.600196065874339e-08, "logits/chosen": -0.895315945148468, "logits/rejected": -0.9988263845443726, "logps/chosen": -78.26985168457031, "logps/rejected": -91.5296401977539, "loss": 2.2717, "rewards/accuracies": 0.0, "rewards/chosen": 1.1834274530410767, "rewards/margins": -3.822455883026123, "rewards/rejected": 5.00588321685791, "step": 11925 }, { "epoch": 1.94, "learning_rate": 9.59245403680679e-08, "logits/chosen": -0.7014181613922119, "logits/rejected": -0.7014181613922119, "logps/chosen": -92.39299011230469, "logps/rejected": -92.39299011230469, "loss": 0.6854, "rewards/accuracies": 0.0, "rewards/chosen": 2.9717559814453125, "rewards/margins": 0.0, "rewards/rejected": 2.9717559814453125, "step": 11926 }, { "epoch": 1.94, "learning_rate": 9.584714799518339e-08, "logits/chosen": -0.6010406017303467, "logits/rejected": -0.5059186220169067, "logps/chosen": -57.481876373291016, "logps/rejected": -49.749183654785156, "loss": 0.4903, "rewards/accuracies": 0.0, "rewards/chosen": 1.4721935987472534, "rewards/margins": -0.20214271545410156, "rewards/rejected": 1.674336314201355, "step": 11927 }, { "epoch": 1.94, "learning_rate": 9.576978354543669e-08, "logits/chosen": -0.018196890130639076, "logits/rejected": -0.014623506926000118, "logps/chosen": -11.921056747436523, "logps/rejected": -8.20332145690918, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.4145212173461914, "rewards/margins": 0.1780960112810135, "rewards/rejected": 0.23642520606517792, "step": 11928 }, { "epoch": 1.94, "learning_rate": 9.569244702417322e-08, "logits/chosen": -0.38964417576789856, "logits/rejected": -0.2587066888809204, "logps/chosen": -102.15054321289062, "logps/rejected": -37.98378372192383, "loss": 0.7039, "rewards/accuracies": 1.0, "rewards/chosen": 0.8977004885673523, "rewards/margins": 0.5862236022949219, "rewards/rejected": 0.3114769160747528, "step": 11929 }, { "epoch": 1.94, "learning_rate": 9.561513843673597e-08, "logits/chosen": -0.5054470300674438, "logits/rejected": -0.5054470300674438, "logps/chosen": -17.791748046875, "logps/rejected": -17.791748046875, "loss": 0.4324, "rewards/accuracies": 0.0, "rewards/chosen": 0.38803407549858093, "rewards/margins": 0.0, "rewards/rejected": 0.38803407549858093, "step": 11930 }, { "epoch": 1.94, "learning_rate": 9.553785778846646e-08, "logits/chosen": -0.9038496613502502, "logits/rejected": -0.9467637538909912, "logps/chosen": -38.863033294677734, "logps/rejected": -55.22724533081055, "loss": 0.2619, "rewards/accuracies": 1.0, "rewards/chosen": 2.255708694458008, "rewards/margins": 0.6765624284744263, "rewards/rejected": 1.5791462659835815, "step": 11931 }, { "epoch": 1.94, "learning_rate": 9.546060508470371e-08, "logits/chosen": -0.6326327919960022, "logits/rejected": -0.6905797719955444, "logps/chosen": -90.35838317871094, "logps/rejected": -88.56936645507812, "loss": 0.4194, "rewards/accuracies": 0.0, "rewards/chosen": 5.251898288726807, "rewards/margins": -0.2563905715942383, "rewards/rejected": 5.508288860321045, "step": 11932 }, { "epoch": 1.94, "learning_rate": 9.538338033078557e-08, "logits/chosen": -0.8400660157203674, "logits/rejected": -0.8004693388938904, "logps/chosen": -52.88923645019531, "logps/rejected": -98.81059265136719, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 2.0492348670959473, "rewards/margins": 0.33620917797088623, "rewards/rejected": 1.713025689125061, "step": 11933 }, { "epoch": 1.94, "learning_rate": 9.530618353204717e-08, "logits/chosen": -0.6288493871688843, "logits/rejected": -0.7806476354598999, "logps/chosen": -66.16834259033203, "logps/rejected": -128.6974639892578, "loss": 1.9057, "rewards/accuracies": 0.0, "rewards/chosen": 1.5364655256271362, "rewards/margins": -3.4723892211914062, "rewards/rejected": 5.008854866027832, "step": 11934 }, { "epoch": 1.94, "learning_rate": 9.522901469382238e-08, "logits/chosen": -1.0039676427841187, "logits/rejected": -0.9127343893051147, "logps/chosen": -114.59122467041016, "logps/rejected": -82.27529907226562, "loss": 0.106, "rewards/accuracies": 1.0, "rewards/chosen": 6.59883975982666, "rewards/margins": 3.49430775642395, "rewards/rejected": 3.10453200340271, "step": 11935 }, { "epoch": 1.94, "learning_rate": 9.515187382144258e-08, "logits/chosen": -0.7323036193847656, "logits/rejected": -0.7624378800392151, "logps/chosen": -73.02167510986328, "logps/rejected": -109.89541625976562, "loss": 1.7508, "rewards/accuracies": 0.0, "rewards/chosen": 1.4233306646347046, "rewards/margins": -0.430908203125, "rewards/rejected": 1.8542388677597046, "step": 11936 }, { "epoch": 1.94, "learning_rate": 9.507476092023769e-08, "logits/chosen": -0.3545878827571869, "logits/rejected": -0.3803841769695282, "logps/chosen": -1.0857806205749512, "logps/rejected": -23.104154586791992, "loss": 0.7396, "rewards/accuracies": 1.0, "rewards/chosen": 0.28069210052490234, "rewards/margins": 0.0987127274274826, "rewards/rejected": 0.18197937309741974, "step": 11937 }, { "epoch": 1.94, "learning_rate": 9.499767599553527e-08, "logits/chosen": -0.31465640664100647, "logits/rejected": -0.3056270480155945, "logps/chosen": -1.0469346046447754, "logps/rejected": -16.218276977539062, "loss": 0.6381, "rewards/accuracies": 1.0, "rewards/chosen": 0.3996763825416565, "rewards/margins": 0.049591511487960815, "rewards/rejected": 0.3500848710536957, "step": 11938 }, { "epoch": 1.94, "learning_rate": 9.492061905266135e-08, "logits/chosen": -0.6375481486320496, "logits/rejected": -0.6244679689407349, "logps/chosen": -51.96271514892578, "logps/rejected": -46.573875427246094, "loss": 1.8093, "rewards/accuracies": 1.0, "rewards/chosen": 0.8656318783760071, "rewards/margins": 0.24508440494537354, "rewards/rejected": 0.6205474734306335, "step": 11939 }, { "epoch": 1.94, "learning_rate": 9.484359009693971e-08, "logits/chosen": -1.0585706233978271, "logits/rejected": -1.017484188079834, "logps/chosen": -94.36875915527344, "logps/rejected": -48.83585739135742, "loss": 0.3818, "rewards/accuracies": 0.0, "rewards/chosen": 1.1049118041992188, "rewards/margins": -0.02728426456451416, "rewards/rejected": 1.132196068763733, "step": 11940 }, { "epoch": 1.94, "learning_rate": 9.476658913369246e-08, "logits/chosen": -0.47947725653648376, "logits/rejected": -0.8174441456794739, "logps/chosen": -105.29135131835938, "logps/rejected": -73.17991638183594, "loss": 0.5035, "rewards/accuracies": 0.0, "rewards/chosen": 2.3522064685821533, "rewards/margins": -0.2609550952911377, "rewards/rejected": 2.613161563873291, "step": 11941 }, { "epoch": 1.94, "learning_rate": 9.46896161682394e-08, "logits/chosen": -0.6601895689964294, "logits/rejected": -0.6580075621604919, "logps/chosen": -56.49850082397461, "logps/rejected": -83.422607421875, "loss": 0.5501, "rewards/accuracies": 0.0, "rewards/chosen": 1.5779438018798828, "rewards/margins": -0.3972667455673218, "rewards/rejected": 1.9752105474472046, "step": 11942 }, { "epoch": 1.94, "learning_rate": 9.461267120589894e-08, "logits/chosen": -0.73555988073349, "logits/rejected": -0.7875213027000427, "logps/chosen": -78.32492065429688, "logps/rejected": -77.65266418457031, "loss": 0.7146, "rewards/accuracies": 0.0, "rewards/chosen": 1.542260766029358, "rewards/margins": -0.9675263166427612, "rewards/rejected": 2.509787082672119, "step": 11943 }, { "epoch": 1.94, "learning_rate": 9.45357542519869e-08, "logits/chosen": -0.4223329424858093, "logits/rejected": -0.4193874001502991, "logps/chosen": -1.3399564027786255, "logps/rejected": -20.09307861328125, "loss": 0.946, "rewards/accuracies": 1.0, "rewards/chosen": 0.23941364884376526, "rewards/margins": 0.03397999703884125, "rewards/rejected": 0.205433651804924, "step": 11944 }, { "epoch": 1.94, "learning_rate": 9.445886531181791e-08, "logits/chosen": -0.7925892472267151, "logits/rejected": -0.840522289276123, "logps/chosen": -76.2806167602539, "logps/rejected": -129.31155395507812, "loss": 0.4957, "rewards/accuracies": 0.0, "rewards/chosen": 5.368030548095703, "rewards/margins": -0.1221628189086914, "rewards/rejected": 5.4901933670043945, "step": 11945 }, { "epoch": 1.94, "learning_rate": 9.438200439070388e-08, "logits/chosen": -0.5451471209526062, "logits/rejected": -0.5332595705986023, "logps/chosen": -70.20372009277344, "logps/rejected": -25.70737648010254, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9320091605186462, "rewards/margins": 0.03695470094680786, "rewards/rejected": 0.8950544595718384, "step": 11946 }, { "epoch": 1.94, "learning_rate": 9.43051714939555e-08, "logits/chosen": -0.7537257671356201, "logits/rejected": -0.704244077205658, "logps/chosen": -214.85910034179688, "logps/rejected": -27.801227569580078, "loss": 0.0338, "rewards/accuracies": 1.0, "rewards/chosen": 5.522885322570801, "rewards/margins": 3.4896671772003174, "rewards/rejected": 2.0332181453704834, "step": 11947 }, { "epoch": 1.94, "learning_rate": 9.422836662688094e-08, "logits/chosen": -0.7712678909301758, "logits/rejected": -0.5959751009941101, "logps/chosen": -93.19902801513672, "logps/rejected": -63.3641471862793, "loss": 0.7792, "rewards/accuracies": 1.0, "rewards/chosen": 6.349929332733154, "rewards/margins": 3.663922071456909, "rewards/rejected": 2.686007261276245, "step": 11948 }, { "epoch": 1.94, "learning_rate": 9.415158979478688e-08, "logits/chosen": -0.9985109567642212, "logits/rejected": -1.028588891029358, "logps/chosen": -78.13568115234375, "logps/rejected": -109.33430480957031, "loss": 0.4772, "rewards/accuracies": 1.0, "rewards/chosen": 1.8232132196426392, "rewards/margins": 0.9602851867675781, "rewards/rejected": 0.862928032875061, "step": 11949 }, { "epoch": 1.94, "learning_rate": 9.407484100297768e-08, "logits/chosen": -0.7672258615493774, "logits/rejected": -0.733229398727417, "logps/chosen": -52.792877197265625, "logps/rejected": -87.03123474121094, "loss": 0.7356, "rewards/accuracies": 0.0, "rewards/chosen": 2.178905487060547, "rewards/margins": -1.100595235824585, "rewards/rejected": 3.279500722885132, "step": 11950 }, { "epoch": 1.94, "learning_rate": 9.399812025675624e-08, "logits/chosen": -0.8615189790725708, "logits/rejected": -0.8609782457351685, "logps/chosen": -45.54343032836914, "logps/rejected": -53.12201690673828, "loss": 0.393, "rewards/accuracies": 1.0, "rewards/chosen": 2.4272701740264893, "rewards/margins": 0.48368728160858154, "rewards/rejected": 1.9435828924179077, "step": 11951 }, { "epoch": 1.94, "learning_rate": 9.392142756142291e-08, "logits/chosen": -0.38035154342651367, "logits/rejected": -0.4398656189441681, "logps/chosen": -77.671630859375, "logps/rejected": -80.34976196289062, "loss": 2.3797, "rewards/accuracies": 0.0, "rewards/chosen": 0.2072601318359375, "rewards/margins": -3.139392137527466, "rewards/rejected": 3.3466522693634033, "step": 11952 }, { "epoch": 1.94, "learning_rate": 9.384476292227672e-08, "logits/chosen": -1.0649605989456177, "logits/rejected": -0.35763949155807495, "logps/chosen": -73.22708892822266, "logps/rejected": -97.35083770751953, "loss": 0.3607, "rewards/accuracies": 1.0, "rewards/chosen": 5.1338019371032715, "rewards/margins": 2.4553465843200684, "rewards/rejected": 2.678455352783203, "step": 11953 }, { "epoch": 1.94, "learning_rate": 9.376812634461418e-08, "logits/chosen": -0.8793814182281494, "logits/rejected": -0.8276253342628479, "logps/chosen": -137.2342071533203, "logps/rejected": -81.37554168701172, "loss": 0.7761, "rewards/accuracies": 0.0, "rewards/chosen": 0.938397228717804, "rewards/margins": -1.2348403930664062, "rewards/rejected": 2.1732375621795654, "step": 11954 }, { "epoch": 1.94, "learning_rate": 9.36915178337303e-08, "logits/chosen": -0.6616339087486267, "logits/rejected": -0.6313443183898926, "logps/chosen": -64.16941833496094, "logps/rejected": -77.13064575195312, "loss": 0.5401, "rewards/accuracies": 0.0, "rewards/chosen": 1.564357042312622, "rewards/margins": -0.28260958194732666, "rewards/rejected": 1.8469666242599487, "step": 11955 }, { "epoch": 1.94, "learning_rate": 9.361493739491816e-08, "logits/chosen": -0.9127293229103088, "logits/rejected": -0.890347421169281, "logps/chosen": -147.1230010986328, "logps/rejected": -79.48025512695312, "loss": 0.4511, "rewards/accuracies": 1.0, "rewards/chosen": 2.899523973464966, "rewards/margins": 0.028920650482177734, "rewards/rejected": 2.870603322982788, "step": 11956 }, { "epoch": 1.94, "learning_rate": 9.35383850334685e-08, "logits/chosen": -0.6839186549186707, "logits/rejected": -0.7672905325889587, "logps/chosen": -30.71779441833496, "logps/rejected": -86.23946380615234, "loss": 0.531, "rewards/accuracies": 0.0, "rewards/chosen": 0.627995491027832, "rewards/margins": -0.5603086948394775, "rewards/rejected": 1.1883041858673096, "step": 11957 }, { "epoch": 1.94, "learning_rate": 9.346186075467056e-08, "logits/chosen": -0.5915738344192505, "logits/rejected": -0.5911673903465271, "logps/chosen": -85.73389434814453, "logps/rejected": -48.35798645019531, "loss": 0.5373, "rewards/accuracies": 0.0, "rewards/chosen": 2.8720688819885254, "rewards/margins": -0.5761237144470215, "rewards/rejected": 3.448192596435547, "step": 11958 }, { "epoch": 1.94, "learning_rate": 9.338536456381118e-08, "logits/chosen": -0.6278408169746399, "logits/rejected": -0.5748350620269775, "logps/chosen": -47.2763557434082, "logps/rejected": -47.91569900512695, "loss": 1.0378, "rewards/accuracies": 0.0, "rewards/chosen": 1.2410179376602173, "rewards/margins": -0.5120518207550049, "rewards/rejected": 1.7530697584152222, "step": 11959 }, { "epoch": 1.94, "learning_rate": 9.330889646617584e-08, "logits/chosen": -1.1432950496673584, "logits/rejected": -1.078567624092102, "logps/chosen": -108.12773895263672, "logps/rejected": -82.8048095703125, "loss": 1.4412, "rewards/accuracies": 0.0, "rewards/chosen": 2.053830862045288, "rewards/margins": -0.6673872470855713, "rewards/rejected": 2.7212181091308594, "step": 11960 }, { "epoch": 1.94, "learning_rate": 9.323245646704753e-08, "logits/chosen": -0.7900518774986267, "logits/rejected": -0.7509507536888123, "logps/chosen": -52.05956268310547, "logps/rejected": -119.97689819335938, "loss": 2.2331, "rewards/accuracies": 0.0, "rewards/chosen": 2.5201127529144287, "rewards/margins": -2.697300672531128, "rewards/rejected": 5.217413425445557, "step": 11961 }, { "epoch": 1.94, "learning_rate": 9.315604457170767e-08, "logits/chosen": -0.7216457724571228, "logits/rejected": -0.7807708382606506, "logps/chosen": -148.08853149414062, "logps/rejected": -128.60235595703125, "loss": 0.074, "rewards/accuracies": 1.0, "rewards/chosen": 3.2365996837615967, "rewards/margins": 2.1135191917419434, "rewards/rejected": 1.1230804920196533, "step": 11962 }, { "epoch": 1.94, "learning_rate": 9.307966078543545e-08, "logits/chosen": -0.40428945422172546, "logits/rejected": -0.4037293791770935, "logps/chosen": -94.15579223632812, "logps/rejected": -94.91193389892578, "loss": 1.2814, "rewards/accuracies": 0.0, "rewards/chosen": 0.8406692743301392, "rewards/margins": -1.5394920110702515, "rewards/rejected": 2.3801612854003906, "step": 11963 }, { "epoch": 1.94, "learning_rate": 9.300330511350841e-08, "logits/chosen": -0.8681370615959167, "logits/rejected": -0.8210800886154175, "logps/chosen": -62.03570556640625, "logps/rejected": -23.465011596679688, "loss": 0.3071, "rewards/accuracies": 1.0, "rewards/chosen": 2.2019150257110596, "rewards/margins": 2.218953847885132, "rewards/rejected": -0.017038917168974876, "step": 11964 }, { "epoch": 1.94, "learning_rate": 9.292697756120188e-08, "logits/chosen": -1.077731728553772, "logits/rejected": -1.0291575193405151, "logps/chosen": -88.16329193115234, "logps/rejected": -95.5269775390625, "loss": 0.4169, "rewards/accuracies": 0.0, "rewards/chosen": 4.144660472869873, "rewards/margins": -0.18304967880249023, "rewards/rejected": 4.327710151672363, "step": 11965 }, { "epoch": 1.94, "learning_rate": 9.285067813378955e-08, "logits/chosen": -0.7849155068397522, "logits/rejected": -0.7860330939292908, "logps/chosen": -49.342350006103516, "logps/rejected": -52.926570892333984, "loss": 1.7486, "rewards/accuracies": 0.0, "rewards/chosen": 1.046270728111267, "rewards/margins": -0.7171070575714111, "rewards/rejected": 1.7633777856826782, "step": 11966 }, { "epoch": 1.94, "learning_rate": 9.277440683654275e-08, "logits/chosen": -0.7869026064872742, "logits/rejected": -0.7888332605361938, "logps/chosen": -66.20372009277344, "logps/rejected": -112.60101318359375, "loss": 1.2851, "rewards/accuracies": 0.0, "rewards/chosen": 0.6459686160087585, "rewards/margins": -1.3291871547698975, "rewards/rejected": 1.9751557111740112, "step": 11967 }, { "epoch": 1.94, "learning_rate": 9.269816367473138e-08, "logits/chosen": -0.691417932510376, "logits/rejected": -0.6899713277816772, "logps/chosen": -2.330115795135498, "logps/rejected": -1.6241559982299805, "loss": 0.4833, "rewards/accuracies": 0.0, "rewards/chosen": 0.3688000738620758, "rewards/margins": -0.05696845054626465, "rewards/rejected": 0.42576852440834045, "step": 11968 }, { "epoch": 1.94, "learning_rate": 9.262194865362282e-08, "logits/chosen": -0.7333093285560608, "logits/rejected": -0.6611231565475464, "logps/chosen": -52.022865295410156, "logps/rejected": -27.70965003967285, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 1.581468939781189, "rewards/margins": 0.8808640837669373, "rewards/rejected": 0.7006048560142517, "step": 11969 }, { "epoch": 1.94, "learning_rate": 9.254576177848311e-08, "logits/chosen": -0.802331268787384, "logits/rejected": -0.8545743227005005, "logps/chosen": -92.03298950195312, "logps/rejected": -127.70442962646484, "loss": 2.0907, "rewards/accuracies": 0.0, "rewards/chosen": 0.6931518912315369, "rewards/margins": -1.3204047679901123, "rewards/rejected": 2.013556718826294, "step": 11970 }, { "epoch": 1.94, "learning_rate": 9.24696030545758e-08, "logits/chosen": -0.3170687258243561, "logits/rejected": -0.32121938467025757, "logps/chosen": -1.7611440420150757, "logps/rejected": -13.848889350891113, "loss": 0.3794, "rewards/accuracies": 1.0, "rewards/chosen": 0.2477150410413742, "rewards/margins": 0.1190451979637146, "rewards/rejected": 0.1286698430776596, "step": 11971 }, { "epoch": 1.94, "learning_rate": 9.239347248716289e-08, "logits/chosen": -1.0882469415664673, "logits/rejected": -1.1862279176712036, "logps/chosen": -99.82064819335938, "logps/rejected": -156.4485321044922, "loss": 0.4808, "rewards/accuracies": 0.0, "rewards/chosen": 4.868295192718506, "rewards/margins": -0.3238801956176758, "rewards/rejected": 5.192175388336182, "step": 11972 }, { "epoch": 1.94, "learning_rate": 9.231737008150414e-08, "logits/chosen": -0.5941299200057983, "logits/rejected": -0.6978634595870972, "logps/chosen": -51.20751190185547, "logps/rejected": -90.53692626953125, "loss": 0.5399, "rewards/accuracies": 0.0, "rewards/chosen": 1.2968437671661377, "rewards/margins": -0.6524986028671265, "rewards/rejected": 1.9493423700332642, "step": 11973 }, { "epoch": 1.94, "learning_rate": 9.224129584285767e-08, "logits/chosen": -0.8936188220977783, "logits/rejected": -0.7804757356643677, "logps/chosen": -126.40286254882812, "logps/rejected": -56.25480651855469, "loss": 1.0837, "rewards/accuracies": 1.0, "rewards/chosen": 1.0534149408340454, "rewards/margins": 0.848670244216919, "rewards/rejected": 0.20474472641944885, "step": 11974 }, { "epoch": 1.94, "learning_rate": 9.216524977647932e-08, "logits/chosen": -0.348755419254303, "logits/rejected": -0.3571983575820923, "logps/chosen": -112.42555236816406, "logps/rejected": -48.555091857910156, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": 1.1429588794708252, "rewards/margins": 0.027946829795837402, "rewards/rejected": 1.1150120496749878, "step": 11975 }, { "epoch": 1.94, "learning_rate": 9.208923188762335e-08, "logits/chosen": -1.0070551633834839, "logits/rejected": -1.0552879571914673, "logps/chosen": -75.6502685546875, "logps/rejected": -168.60952758789062, "loss": 2.9712, "rewards/accuracies": 0.0, "rewards/chosen": 1.1221832036972046, "rewards/margins": -5.0837860107421875, "rewards/rejected": 6.205969333648682, "step": 11976 }, { "epoch": 1.94, "learning_rate": 9.201324218154166e-08, "logits/chosen": -0.5152060985565186, "logits/rejected": -0.5388356447219849, "logps/chosen": -55.038185119628906, "logps/rejected": -65.93405151367188, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": 2.6162400245666504, "rewards/margins": 1.1101242303848267, "rewards/rejected": 1.5061157941818237, "step": 11977 }, { "epoch": 1.94, "learning_rate": 9.193728066348466e-08, "logits/chosen": -0.7714394927024841, "logits/rejected": -0.734208881855011, "logps/chosen": -68.9647445678711, "logps/rejected": -73.94685363769531, "loss": 2.4726, "rewards/accuracies": 0.0, "rewards/chosen": 0.3725547790527344, "rewards/margins": -0.3376861810684204, "rewards/rejected": 0.7102409601211548, "step": 11978 }, { "epoch": 1.94, "learning_rate": 9.186134733870026e-08, "logits/chosen": -0.8341919183731079, "logits/rejected": -0.5454668998718262, "logps/chosen": -124.83647155761719, "logps/rejected": -105.74475860595703, "loss": 0.0826, "rewards/accuracies": 1.0, "rewards/chosen": 6.975535869598389, "rewards/margins": 3.3497843742370605, "rewards/rejected": 3.625751495361328, "step": 11979 }, { "epoch": 1.94, "learning_rate": 9.178544221243512e-08, "logits/chosen": -0.7135293483734131, "logits/rejected": -0.6775833964347839, "logps/chosen": -99.64720916748047, "logps/rejected": -160.0614471435547, "loss": 2.556, "rewards/accuracies": 0.0, "rewards/chosen": 1.400776743888855, "rewards/margins": -4.57489013671875, "rewards/rejected": 5.9756669998168945, "step": 11980 }, { "epoch": 1.94, "learning_rate": 9.170956528993317e-08, "logits/chosen": -0.8948107361793518, "logits/rejected": -0.849319577217102, "logps/chosen": -201.1180419921875, "logps/rejected": -194.2281494140625, "loss": 1.6733, "rewards/accuracies": 0.0, "rewards/chosen": 4.483975410461426, "rewards/margins": -3.2852935791015625, "rewards/rejected": 7.769268989562988, "step": 11981 }, { "epoch": 1.94, "learning_rate": 9.163371657643714e-08, "logits/chosen": -0.8305497169494629, "logits/rejected": -0.8326170444488525, "logps/chosen": -69.54146575927734, "logps/rejected": -92.80223083496094, "loss": 0.5598, "rewards/accuracies": 0.0, "rewards/chosen": 2.3310112953186035, "rewards/margins": -0.7090010643005371, "rewards/rejected": 3.0400123596191406, "step": 11982 }, { "epoch": 1.94, "learning_rate": 9.155789607718717e-08, "logits/chosen": -0.7585640549659729, "logits/rejected": -0.7629197239875793, "logps/chosen": -65.70368194580078, "logps/rejected": -46.42353057861328, "loss": 0.8885, "rewards/accuracies": 0.0, "rewards/chosen": 0.19853821396827698, "rewards/margins": -1.1317780017852783, "rewards/rejected": 1.330316185951233, "step": 11983 }, { "epoch": 1.95, "learning_rate": 9.148210379742199e-08, "logits/chosen": -0.7025759220123291, "logits/rejected": -0.6946561336517334, "logps/chosen": -24.924863815307617, "logps/rejected": -40.26359939575195, "loss": 0.1955, "rewards/accuracies": 1.0, "rewards/chosen": 1.583860993385315, "rewards/margins": 0.7633703351020813, "rewards/rejected": 0.8204906582832336, "step": 11984 }, { "epoch": 1.95, "learning_rate": 9.140633974237787e-08, "logits/chosen": -0.5200751423835754, "logits/rejected": -0.3749898672103882, "logps/chosen": -49.74455261230469, "logps/rejected": -76.69898986816406, "loss": 2.4314, "rewards/accuracies": 1.0, "rewards/chosen": 2.224807024002075, "rewards/margins": 0.3722633123397827, "rewards/rejected": 1.8525437116622925, "step": 11985 }, { "epoch": 1.95, "learning_rate": 9.133060391728963e-08, "logits/chosen": -0.8708399534225464, "logits/rejected": -0.682403028011322, "logps/chosen": -200.42578125, "logps/rejected": -52.54990005493164, "loss": 1.5679, "rewards/accuracies": 1.0, "rewards/chosen": 1.8425781726837158, "rewards/margins": 0.11383938789367676, "rewards/rejected": 1.728738784790039, "step": 11986 }, { "epoch": 1.95, "learning_rate": 9.12548963273897e-08, "logits/chosen": -1.1926960945129395, "logits/rejected": -1.2348666191101074, "logps/chosen": -152.14183044433594, "logps/rejected": -100.43943786621094, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 4.510402202606201, "rewards/margins": 2.9554178714752197, "rewards/rejected": 1.5549843311309814, "step": 11987 }, { "epoch": 1.95, "learning_rate": 9.117921697790898e-08, "logits/chosen": -0.5976588726043701, "logits/rejected": -0.567501425743103, "logps/chosen": -89.25360107421875, "logps/rejected": -124.6778564453125, "loss": 1.2198, "rewards/accuracies": 0.0, "rewards/chosen": 1.7638977766036987, "rewards/margins": -0.9142745733261108, "rewards/rejected": 2.6781723499298096, "step": 11988 }, { "epoch": 1.95, "learning_rate": 9.110356587407592e-08, "logits/chosen": -0.5536943674087524, "logits/rejected": -0.549552857875824, "logps/chosen": -45.60930633544922, "logps/rejected": -51.48302459716797, "loss": 1.9246, "rewards/accuracies": 1.0, "rewards/chosen": 1.035257339477539, "rewards/margins": 0.10983926057815552, "rewards/rejected": 0.9254180788993835, "step": 11989 }, { "epoch": 1.95, "learning_rate": 9.102794302111749e-08, "logits/chosen": -1.0663992166519165, "logits/rejected": -0.9248499274253845, "logps/chosen": -126.58296203613281, "logps/rejected": -65.55311584472656, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 7.335324287414551, "rewards/margins": 4.678565979003906, "rewards/rejected": 2.6567580699920654, "step": 11990 }, { "epoch": 1.95, "learning_rate": 9.095234842425853e-08, "logits/chosen": -0.7916374206542969, "logits/rejected": -0.8285171389579773, "logps/chosen": -89.7130355834961, "logps/rejected": -80.18409729003906, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 1.2343460321426392, "rewards/margins": -0.9618188142776489, "rewards/rejected": 2.196164846420288, "step": 11991 }, { "epoch": 1.95, "learning_rate": 9.087678208872174e-08, "logits/chosen": -0.6092061996459961, "logits/rejected": -0.6264184713363647, "logps/chosen": -70.55525207519531, "logps/rejected": -136.45066833496094, "loss": 1.8869, "rewards/accuracies": 0.0, "rewards/chosen": 1.094502329826355, "rewards/margins": -3.7398529052734375, "rewards/rejected": 4.834355354309082, "step": 11992 }, { "epoch": 1.95, "learning_rate": 9.080124401972833e-08, "logits/chosen": -0.5726468563079834, "logits/rejected": -0.6908096671104431, "logps/chosen": -83.37855529785156, "logps/rejected": -117.77293395996094, "loss": 1.2885, "rewards/accuracies": 0.0, "rewards/chosen": 2.247950792312622, "rewards/margins": -2.3472535610198975, "rewards/rejected": 4.5952043533325195, "step": 11993 }, { "epoch": 1.95, "learning_rate": 9.072573422249691e-08, "logits/chosen": -0.4950157105922699, "logits/rejected": -0.4841710031032562, "logps/chosen": -10.591960906982422, "logps/rejected": -19.188312530517578, "loss": 0.335, "rewards/accuracies": 1.0, "rewards/chosen": 1.3930339813232422, "rewards/margins": 0.5542880892753601, "rewards/rejected": 0.8387458920478821, "step": 11994 }, { "epoch": 1.95, "learning_rate": 9.06502527022448e-08, "logits/chosen": -0.912032961845398, "logits/rejected": -0.9328086972236633, "logps/chosen": -74.68388366699219, "logps/rejected": -85.29425811767578, "loss": 1.6484, "rewards/accuracies": 0.0, "rewards/chosen": 1.8600791692733765, "rewards/margins": -1.5794755220413208, "rewards/rejected": 3.4395546913146973, "step": 11995 }, { "epoch": 1.95, "learning_rate": 9.057479946418677e-08, "logits/chosen": -1.000955581665039, "logits/rejected": -1.0583809614181519, "logps/chosen": -222.59335327148438, "logps/rejected": -82.5029525756836, "loss": 0.1229, "rewards/accuracies": 1.0, "rewards/chosen": 3.691821336746216, "rewards/margins": 1.2914237976074219, "rewards/rejected": 2.400397539138794, "step": 11996 }, { "epoch": 1.95, "learning_rate": 9.049937451353623e-08, "logits/chosen": -0.36428189277648926, "logits/rejected": -0.3462294340133667, "logps/chosen": -15.53455638885498, "logps/rejected": -2.9403436183929443, "loss": 0.5459, "rewards/accuracies": 0.0, "rewards/chosen": 0.21086759865283966, "rewards/margins": -0.0771745890378952, "rewards/rejected": 0.28804218769073486, "step": 11997 }, { "epoch": 1.95, "learning_rate": 9.042397785550404e-08, "logits/chosen": -0.7345975637435913, "logits/rejected": -0.6193386912345886, "logps/chosen": -65.20864868164062, "logps/rejected": -29.727325439453125, "loss": 0.6945, "rewards/accuracies": 1.0, "rewards/chosen": 2.6299424171447754, "rewards/margins": 2.0866386890411377, "rewards/rejected": 0.5433036684989929, "step": 11998 }, { "epoch": 1.95, "learning_rate": 9.03486094952997e-08, "logits/chosen": -0.936001181602478, "logits/rejected": -0.8294017314910889, "logps/chosen": -121.57089233398438, "logps/rejected": -43.698081970214844, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 4.107513427734375, "rewards/margins": 2.0915472507476807, "rewards/rejected": 2.0159661769866943, "step": 11999 }, { "epoch": 1.95, "learning_rate": 9.027326943813013e-08, "logits/chosen": -0.9244321584701538, "logits/rejected": -0.6491371393203735, "logps/chosen": -129.44044494628906, "logps/rejected": -23.562299728393555, "loss": 0.1093, "rewards/accuracies": 1.0, "rewards/chosen": 4.012519836425781, "rewards/margins": 3.5998141765594482, "rewards/rejected": 0.4127056300640106, "step": 12000 } ], "logging_steps": 1.0, "max_steps": 12322, "num_train_epochs": 2, "save_steps": 2000, "total_flos": 0.0, "trial_name": null, "trial_params": null }