diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,3209 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.9952, - "eval_steps": 500, - "global_step": 936, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.016, - "grad_norm": 38.78816510379247, - "learning_rate": 2.6595744680851066e-07, - "logits/chosen": -0.9742305874824524, - "logits/rejected": -1.0986480712890625, - "logps/chosen": -1.3971922397613525, - "logps/rejected": -2.0653083324432373, - "loss": 1.7192, - "odds_ratio_loss": 0.5222790241241455, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.06985961645841599, - "rewards/margins": 0.03340579569339752, - "rewards/rejected": -0.1032654196023941, - "sft_loss": 1.3971922397613525, - "step": 5 - }, - { - "epoch": 0.032, - "grad_norm": 34.938822021789555, - "learning_rate": 5.319148936170213e-07, - "logits/chosen": -0.9504960775375366, - "logits/rejected": -0.9644749760627747, - "logps/chosen": -1.5183446407318115, - "logps/rejected": -1.7423614263534546, - "loss": 1.7, - "odds_ratio_loss": 0.6908475756645203, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.07591722905635834, - "rewards/margins": 0.011200849898159504, - "rewards/rejected": -0.08711807429790497, - "sft_loss": 1.5183446407318115, - "step": 10 - }, - { - "epoch": 0.048, - "grad_norm": 16.339902176867668, - "learning_rate": 7.97872340425532e-07, - "logits/chosen": -0.9059197306632996, - "logits/rejected": -0.9509723782539368, - "logps/chosen": -1.3207223415374756, - "logps/rejected": -2.0547330379486084, - "loss": 1.6089, - "odds_ratio_loss": 0.6031222343444824, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.06603612005710602, - "rewards/margins": 0.036700539290905, - "rewards/rejected": -0.10273666679859161, - "sft_loss": 1.3207223415374756, - "step": 15 - }, - { - "epoch": 0.064, - "grad_norm": 20.917533328584984, - "learning_rate": 1.0638297872340427e-06, - "logits/chosen": -0.9647526741027832, - "logits/rejected": -1.0629401206970215, - "logps/chosen": -1.4611456394195557, - "logps/rejected": -2.1091372966766357, - "loss": 1.4991, - "odds_ratio_loss": 0.5197792649269104, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.07305728644132614, - "rewards/margins": 0.03239959105849266, - "rewards/rejected": -0.1054568737745285, - "sft_loss": 1.4611456394195557, - "step": 20 - }, - { - "epoch": 0.08, - "grad_norm": 18.664826990233166, - "learning_rate": 1.3297872340425533e-06, - "logits/chosen": -1.054176688194275, - "logits/rejected": -1.3495676517486572, - "logps/chosen": -1.3905874490737915, - "logps/rejected": -1.7380883693695068, - "loss": 1.3634, - "odds_ratio_loss": 0.5891755819320679, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.06952936947345734, - "rewards/margins": 0.017375053837895393, - "rewards/rejected": -0.08690442144870758, - "sft_loss": 1.3905874490737915, - "step": 25 - }, - { - "epoch": 0.096, - "grad_norm": 10.604261736039259, - "learning_rate": 1.595744680851064e-06, - "logits/chosen": -0.9233123660087585, - "logits/rejected": -0.9947792887687683, - "logps/chosen": -1.3364614248275757, - "logps/rejected": -1.4293016195297241, - "loss": 1.3477, - "odds_ratio_loss": 0.7917504906654358, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.0668230801820755, - "rewards/margins": 0.0046419971622526646, - "rewards/rejected": -0.07146507501602173, - "sft_loss": 1.3364614248275757, - "step": 30 - }, - { - "epoch": 0.112, - "grad_norm": 9.23627597385138, - "learning_rate": 1.8617021276595745e-06, - "logits/chosen": -1.1354405879974365, - "logits/rejected": -1.1504920721054077, - "logps/chosen": -1.2876451015472412, - "logps/rejected": -1.394587755203247, - "loss": 1.3323, - "odds_ratio_loss": 0.7012001276016235, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.06438224762678146, - "rewards/margins": 0.005347140599042177, - "rewards/rejected": -0.06972938776016235, - "sft_loss": 1.2876451015472412, - "step": 35 - }, - { - "epoch": 0.128, - "grad_norm": 10.737738874884705, - "learning_rate": 2.1276595744680853e-06, - "logits/chosen": -1.0701355934143066, - "logits/rejected": -1.2617167234420776, - "logps/chosen": -1.345517873764038, - "logps/rejected": -1.5351002216339111, - "loss": 1.3194, - "odds_ratio_loss": 0.6834388971328735, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.06727589666843414, - "rewards/margins": 0.009479111060500145, - "rewards/rejected": -0.07675500959157944, - "sft_loss": 1.345517873764038, - "step": 40 - }, - { - "epoch": 0.144, - "grad_norm": 3.92140979781429, - "learning_rate": 2.393617021276596e-06, - "logits/chosen": -0.8892070651054382, - "logits/rejected": -0.9303566217422485, - "logps/chosen": -1.1515204906463623, - "logps/rejected": -1.311240315437317, - "loss": 1.3284, - "odds_ratio_loss": 0.6429694890975952, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05757603049278259, - "rewards/margins": 0.007985993288457394, - "rewards/rejected": -0.06556202471256256, - "sft_loss": 1.1515204906463623, - "step": 45 - }, - { - "epoch": 0.16, - "grad_norm": 12.131879118457386, - "learning_rate": 2.6595744680851065e-06, - "logits/chosen": -0.9589262008666992, - "logits/rejected": -1.0993459224700928, - "logps/chosen": -1.3476245403289795, - "logps/rejected": -1.4529037475585938, - "loss": 1.3627, - "odds_ratio_loss": 0.6757664680480957, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.06738122552633286, - "rewards/margins": 0.005263959523290396, - "rewards/rejected": -0.07264517992734909, - "sft_loss": 1.3476245403289795, - "step": 50 - }, - { - "epoch": 0.176, - "grad_norm": 6.920195139152275, - "learning_rate": 2.9255319148936174e-06, - "logits/chosen": -0.9547260403633118, - "logits/rejected": -1.137032151222229, - "logps/chosen": -1.3528764247894287, - "logps/rejected": -1.5240767002105713, - "loss": 1.2606, - "odds_ratio_loss": 0.6684664487838745, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.06764382123947144, - "rewards/margins": 0.008560018613934517, - "rewards/rejected": -0.0762038379907608, - "sft_loss": 1.3528764247894287, - "step": 55 - }, - { - "epoch": 0.192, - "grad_norm": 5.7098162603211104, - "learning_rate": 3.191489361702128e-06, - "logits/chosen": -0.982739269733429, - "logits/rejected": -1.1074873208999634, - "logps/chosen": -1.4583510160446167, - "logps/rejected": -1.5452958345413208, - "loss": 1.3274, - "odds_ratio_loss": 0.7071143388748169, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.07291755080223083, - "rewards/margins": 0.00434724148362875, - "rewards/rejected": -0.07726480066776276, - "sft_loss": 1.4583510160446167, - "step": 60 - }, - { - "epoch": 0.208, - "grad_norm": 4.824404122964849, - "learning_rate": 3.457446808510639e-06, - "logits/chosen": -0.9817106127738953, - "logits/rejected": -1.111875295639038, - "logps/chosen": -1.3345075845718384, - "logps/rejected": -1.5089738368988037, - "loss": 1.2879, - "odds_ratio_loss": 0.6608881950378418, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.06672537326812744, - "rewards/margins": 0.008723323233425617, - "rewards/rejected": -0.07544869929552078, - "sft_loss": 1.3345075845718384, - "step": 65 - }, - { - "epoch": 0.224, - "grad_norm": 6.021701349784469, - "learning_rate": 3.723404255319149e-06, - "logits/chosen": -1.1069040298461914, - "logits/rejected": -1.075409173965454, - "logps/chosen": -1.2757076025009155, - "logps/rejected": -1.4533952474594116, - "loss": 1.3134, - "odds_ratio_loss": 0.7258908152580261, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.0637853816151619, - "rewards/margins": 0.008884383365511894, - "rewards/rejected": -0.07266975939273834, - "sft_loss": 1.2757076025009155, - "step": 70 - }, - { - "epoch": 0.24, - "grad_norm": 5.02350299089495, - "learning_rate": 3.98936170212766e-06, - "logits/chosen": -0.9449170231819153, - "logits/rejected": -1.1291000843048096, - "logps/chosen": -1.2914139032363892, - "logps/rejected": -1.484882116317749, - "loss": 1.2797, - "odds_ratio_loss": 0.6280355453491211, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.06457068026065826, - "rewards/margins": 0.009673424996435642, - "rewards/rejected": -0.07424411177635193, - "sft_loss": 1.2914139032363892, - "step": 75 - }, - { - "epoch": 0.256, - "grad_norm": 5.308377873815178, - "learning_rate": 4.255319148936171e-06, - "logits/chosen": -0.9751418828964233, - "logits/rejected": -1.094626545906067, - "logps/chosen": -1.0752859115600586, - "logps/rejected": -1.334636926651001, - "loss": 1.2678, - "odds_ratio_loss": 0.5908417701721191, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.05376429483294487, - "rewards/margins": 0.012967551127076149, - "rewards/rejected": -0.06673184782266617, - "sft_loss": 1.0752859115600586, - "step": 80 - }, - { - "epoch": 0.272, - "grad_norm": 6.346400541372991, - "learning_rate": 4.521276595744681e-06, - "logits/chosen": -0.9909063577651978, - "logits/rejected": -1.0453189611434937, - "logps/chosen": -1.0384023189544678, - "logps/rejected": -1.2738986015319824, - "loss": 1.2615, - "odds_ratio_loss": 0.6149815320968628, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.051920123398303986, - "rewards/margins": 0.011774817481637001, - "rewards/rejected": -0.06369493901729584, - "sft_loss": 1.0384023189544678, - "step": 85 - }, - { - "epoch": 0.288, - "grad_norm": 6.055157418934933, - "learning_rate": 4.787234042553192e-06, - "logits/chosen": -0.9102107882499695, - "logits/rejected": -0.9935488700866699, - "logps/chosen": -1.1961522102355957, - "logps/rejected": -1.415398359298706, - "loss": 1.212, - "odds_ratio_loss": 0.6041514277458191, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.059807609766721725, - "rewards/margins": 0.010962305590510368, - "rewards/rejected": -0.07076992094516754, - "sft_loss": 1.1961522102355957, - "step": 90 - }, - { - "epoch": 0.304, - "grad_norm": 5.552038237166853, - "learning_rate": 4.999982598564682e-06, - "logits/chosen": -0.9868474006652832, - "logits/rejected": -1.0573745965957642, - "logps/chosen": -1.2718483209609985, - "logps/rejected": -1.5264861583709717, - "loss": 1.2618, - "odds_ratio_loss": 0.640211820602417, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.06359241157770157, - "rewards/margins": 0.012731892988085747, - "rewards/rejected": -0.07632430642843246, - "sft_loss": 1.2718483209609985, - "step": 95 - }, - { - "epoch": 0.32, - "grad_norm": 7.727039615086689, - "learning_rate": 4.999373573764188e-06, - "logits/chosen": -1.0175714492797852, - "logits/rejected": -0.8734873533248901, - "logps/chosen": -1.199947476387024, - "logps/rejected": -1.3050801753997803, - "loss": 1.2807, - "odds_ratio_loss": 0.7060422897338867, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.059997379779815674, - "rewards/margins": 0.005256640259176493, - "rewards/rejected": -0.06525401026010513, - "sft_loss": 1.199947476387024, - "step": 100 - }, - { - "epoch": 0.336, - "grad_norm": 6.365580704740442, - "learning_rate": 4.997894719430564e-06, - "logits/chosen": -0.8132748603820801, - "logits/rejected": -0.9986258745193481, - "logps/chosen": -1.1854135990142822, - "logps/rejected": -1.5025197267532349, - "loss": 1.2805, - "odds_ratio_loss": 0.6134302020072937, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.05927068740129471, - "rewards/margins": 0.015855297446250916, - "rewards/rejected": -0.07512599229812622, - "sft_loss": 1.1854135990142822, - "step": 105 - }, - { - "epoch": 0.352, - "grad_norm": 6.860280491199452, - "learning_rate": 4.995546550233241e-06, - "logits/chosen": -0.9532275199890137, - "logits/rejected": -1.136918306350708, - "logps/chosen": -1.408894658088684, - "logps/rejected": -1.558379888534546, - "loss": 1.2769, - "odds_ratio_loss": 0.7106048464775085, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.0704447329044342, - "rewards/margins": 0.007474270649254322, - "rewards/rejected": -0.07791899144649506, - "sft_loss": 1.408894658088684, - "step": 110 - }, - { - "epoch": 0.368, - "grad_norm": 5.854849184237901, - "learning_rate": 4.992329883379755e-06, - "logits/chosen": -0.9923961758613586, - "logits/rejected": -1.2076470851898193, - "logps/chosen": -1.4104652404785156, - "logps/rejected": -1.4293034076690674, - "loss": 1.2514, - "odds_ratio_loss": 0.7473000288009644, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.07052326202392578, - "rewards/margins": 0.0009419171256013215, - "rewards/rejected": -0.07146517932415009, - "sft_loss": 1.4104652404785156, - "step": 115 - }, - { - "epoch": 0.384, - "grad_norm": 4.334718029265899, - "learning_rate": 4.988245838331339e-06, - "logits/chosen": -0.9599634408950806, - "logits/rejected": -0.9879687428474426, - "logps/chosen": -1.2545793056488037, - "logps/rejected": -1.4001166820526123, - "loss": 1.2204, - "odds_ratio_loss": 0.6919530630111694, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.06272897124290466, - "rewards/margins": 0.007276860065758228, - "rewards/rejected": -0.07000583410263062, - "sft_loss": 1.2545793056488037, - "step": 120 - }, - { - "epoch": 0.4, - "grad_norm": 6.640886415637415, - "learning_rate": 4.983295836413337e-06, - "logits/chosen": -0.8892742395401001, - "logits/rejected": -0.8854384422302246, - "logps/chosen": -1.1642634868621826, - "logps/rejected": -1.2482268810272217, - "loss": 1.225, - "odds_ratio_loss": 0.7551355361938477, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.05821318179368973, - "rewards/margins": 0.004198164213448763, - "rewards/rejected": -0.0624113492667675, - "sft_loss": 1.1642634868621826, - "step": 125 - }, - { - "epoch": 0.416, - "grad_norm": 3.3977384471741727, - "learning_rate": 4.977481600320545e-06, - "logits/chosen": -1.0043504238128662, - "logits/rejected": -1.0277382135391235, - "logps/chosen": -1.2277835607528687, - "logps/rejected": -1.3244746923446655, - "loss": 1.2696, - "odds_ratio_loss": 0.726096510887146, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.06138917803764343, - "rewards/margins": 0.0048345597460865974, - "rewards/rejected": -0.06622373312711716, - "sft_loss": 1.2277835607528687, - "step": 130 - }, - { - "epoch": 0.432, - "grad_norm": 29.84907862640638, - "learning_rate": 4.970805153517692e-06, - "logits/chosen": -0.906798243522644, - "logits/rejected": -1.0836145877838135, - "logps/chosen": -1.273900032043457, - "logps/rejected": -1.6986215114593506, - "loss": 1.2562, - "odds_ratio_loss": 0.6722120642662048, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.06369500607252121, - "rewards/margins": 0.02123607136309147, - "rewards/rejected": -0.08493108302354813, - "sft_loss": 1.273900032043457, - "step": 135 - }, - { - "epoch": 0.448, - "grad_norm": 9.009223423852758, - "learning_rate": 4.963268819535228e-06, - "logits/chosen": -0.9239026308059692, - "logits/rejected": -0.9300885200500488, - "logps/chosen": -1.0778651237487793, - "logps/rejected": -1.2015340328216553, - "loss": 1.2297, - "odds_ratio_loss": 0.6974226832389832, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": -0.053893256932497025, - "rewards/margins": 0.006183440797030926, - "rewards/rejected": -0.06007670238614082, - "sft_loss": 1.0778651237487793, - "step": 140 - }, - { - "epoch": 0.464, - "grad_norm": 6.881851804175557, - "learning_rate": 4.954875221160695e-06, - "logits/chosen": -0.9062795639038086, - "logits/rejected": -0.9437534213066101, - "logps/chosen": -1.151301622390747, - "logps/rejected": -1.376366376876831, - "loss": 1.1857, - "odds_ratio_loss": 0.6390897035598755, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.05756508558988571, - "rewards/margins": 0.01125323586165905, - "rewards/rejected": -0.06881832331418991, - "sft_loss": 1.151301622390747, - "step": 145 - }, - { - "epoch": 0.48, - "grad_norm": 6.024680716397588, - "learning_rate": 4.945627279525943e-06, - "logits/chosen": -0.8047590255737305, - "logits/rejected": -1.0275001525878906, - "logps/chosen": -1.1844927072525024, - "logps/rejected": -1.1749188899993896, - "loss": 1.2559, - "odds_ratio_loss": 0.7716752886772156, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.05922463536262512, - "rewards/margins": -0.0004786819336004555, - "rewards/rejected": -0.05874595046043396, - "sft_loss": 1.1844927072525024, - "step": 150 - }, - { - "epoch": 0.496, - "grad_norm": 4.969594803216186, - "learning_rate": 4.935528213090526e-06, - "logits/chosen": -0.8657525181770325, - "logits/rejected": -0.9782077074050903, - "logps/chosen": -1.2747188806533813, - "logps/rejected": -1.331703543663025, - "loss": 1.2301, - "odds_ratio_loss": 0.720513641834259, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.06373593956232071, - "rewards/margins": 0.0028492447454482317, - "rewards/rejected": -0.06658518314361572, - "sft_loss": 1.2747188806533813, - "step": 155 - }, - { - "epoch": 0.512, - "grad_norm": 5.691340876461069, - "learning_rate": 4.9245815365216115e-06, - "logits/chosen": -0.7511823773384094, - "logits/rejected": -1.0508475303649902, - "logps/chosen": -1.1776434183120728, - "logps/rejected": -1.6007537841796875, - "loss": 1.221, - "odds_ratio_loss": 0.5590518116950989, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.05888216570019722, - "rewards/margins": 0.021155523136258125, - "rewards/rejected": -0.08003769814968109, - "sft_loss": 1.1776434183120728, - "step": 160 - }, - { - "epoch": 0.528, - "grad_norm": 6.0040479808042235, - "learning_rate": 4.912791059470815e-06, - "logits/chosen": -0.9242817163467407, - "logits/rejected": -0.8517144918441772, - "logps/chosen": -1.0615520477294922, - "logps/rejected": -1.288195252418518, - "loss": 1.1745, - "odds_ratio_loss": 0.6426531076431274, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.05307760834693909, - "rewards/margins": 0.011332156136631966, - "rewards/rejected": -0.0644097551703453, - "sft_loss": 1.0615520477294922, - "step": 165 - }, - { - "epoch": 0.544, - "grad_norm": 5.529927712259793, - "learning_rate": 4.900160885248363e-06, - "logits/chosen": -0.9031238555908203, - "logits/rejected": -0.8960297703742981, - "logps/chosen": -1.2523874044418335, - "logps/rejected": -1.4036993980407715, - "loss": 1.194, - "odds_ratio_loss": 0.6845428347587585, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.06261937320232391, - "rewards/margins": 0.007565596606582403, - "rewards/rejected": -0.07018496841192245, - "sft_loss": 1.2523874044418335, - "step": 170 - }, - { - "epoch": 0.56, - "grad_norm": 4.1316911858664644, - "learning_rate": 4.886695409395068e-06, - "logits/chosen": -0.853635311126709, - "logits/rejected": -1.0007998943328857, - "logps/chosen": -1.158414602279663, - "logps/rejected": -1.1915799379348755, - "loss": 1.2303, - "odds_ratio_loss": 0.7887060046195984, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.05792072415351868, - "rewards/margins": 0.0016582731623202562, - "rewards/rejected": -0.059578996151685715, - "sft_loss": 1.158414602279663, - "step": 175 - }, - { - "epoch": 0.576, - "grad_norm": 4.185731700055316, - "learning_rate": 4.872399318152594e-06, - "logits/chosen": -0.9244221448898315, - "logits/rejected": -0.8243404626846313, - "logps/chosen": -1.124448537826538, - "logps/rejected": -1.1855733394622803, - "loss": 1.2359, - "odds_ratio_loss": 0.6966110467910767, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.05622243881225586, - "rewards/margins": 0.0030562314204871655, - "rewards/rejected": -0.059278666973114014, - "sft_loss": 1.124448537826538, - "step": 180 - }, - { - "epoch": 0.592, - "grad_norm": 5.534373355221194, - "learning_rate": 4.857277586832556e-06, - "logits/chosen": -0.8700960278511047, - "logits/rejected": -1.0795601606369019, - "logps/chosen": -1.1355112791061401, - "logps/rejected": -1.3763678073883057, - "loss": 1.2299, - "odds_ratio_loss": 0.61272132396698, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.05677555873990059, - "rewards/margins": 0.012042825110256672, - "rewards/rejected": -0.06881839036941528, - "sft_loss": 1.1355112791061401, - "step": 185 - }, - { - "epoch": 0.608, - "grad_norm": 8.320294434791975, - "learning_rate": 4.841335478085015e-06, - "logits/chosen": -0.837981104850769, - "logits/rejected": -1.041286587715149, - "logps/chosen": -1.0703322887420654, - "logps/rejected": -1.6396074295043945, - "loss": 1.1951, - "odds_ratio_loss": 0.5467146635055542, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05351661518216133, - "rewards/margins": 0.028463756665587425, - "rewards/rejected": -0.08198036998510361, - "sft_loss": 1.0703322887420654, - "step": 190 - }, - { - "epoch": 0.624, - "grad_norm": 4.60123509876027, - "learning_rate": 4.824578540066981e-06, - "logits/chosen": -0.93339604139328, - "logits/rejected": -0.8643118739128113, - "logps/chosen": -1.3019145727157593, - "logps/rejected": -1.2324789762496948, - "loss": 1.2285, - "odds_ratio_loss": 0.8142224550247192, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.06509573012590408, - "rewards/margins": -0.003471782896667719, - "rewards/rejected": -0.061623942106962204, - "sft_loss": 1.3019145727157593, - "step": 195 - }, - { - "epoch": 0.64, - "grad_norm": 21.66363438026074, - "learning_rate": 4.807012604511542e-06, - "logits/chosen": -0.6915689706802368, - "logits/rejected": -0.897596538066864, - "logps/chosen": -1.1985743045806885, - "logps/rejected": -1.3054250478744507, - "loss": 1.2092, - "odds_ratio_loss": 0.6641479730606079, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.059928715229034424, - "rewards/margins": 0.005342531017959118, - "rewards/rejected": -0.06527124345302582, - "sft_loss": 1.1985743045806885, - "step": 200 - }, - { - "epoch": 0.656, - "grad_norm": 5.945812709802962, - "learning_rate": 4.788643784698316e-06, - "logits/chosen": -0.8415448069572449, - "logits/rejected": -0.8126734495162964, - "logps/chosen": -1.1990188360214233, - "logps/rejected": -1.1923567056655884, - "loss": 1.1919, - "odds_ratio_loss": 0.7749138474464417, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.05995094031095505, - "rewards/margins": -0.0003331051266286522, - "rewards/rejected": -0.05961783602833748, - "sft_loss": 1.1990188360214233, - "step": 205 - }, - { - "epoch": 0.672, - "grad_norm": 6.175015091799187, - "learning_rate": 4.769478473325908e-06, - "logits/chosen": -0.9676336050033569, - "logits/rejected": -1.0954523086547852, - "logps/chosen": -1.0937362909317017, - "logps/rejected": -1.169339895248413, - "loss": 1.2244, - "odds_ratio_loss": 0.7255644798278809, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.054686807096004486, - "rewards/margins": 0.003780178725719452, - "rewards/rejected": -0.058466989547014236, - "sft_loss": 1.0937362909317017, - "step": 210 - }, - { - "epoch": 0.688, - "grad_norm": 4.1222127125265935, - "learning_rate": 4.7495233402871336e-06, - "logits/chosen": -0.7574236392974854, - "logits/rejected": -0.9107638597488403, - "logps/chosen": -1.0908468961715698, - "logps/rejected": -1.3144160509109497, - "loss": 1.2612, - "odds_ratio_loss": 0.6122642159461975, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05454235151410103, - "rewards/margins": 0.011178453452885151, - "rewards/rejected": -0.0657208114862442, - "sft_loss": 1.0908468961715698, - "step": 215 - }, - { - "epoch": 0.704, - "grad_norm": 6.629592523219295, - "learning_rate": 4.728785330347771e-06, - "logits/chosen": -1.0126041173934937, - "logits/rejected": -1.1902836561203003, - "logps/chosen": -1.0830891132354736, - "logps/rejected": -1.2832982540130615, - "loss": 1.1849, - "odds_ratio_loss": 0.6457486748695374, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05415446311235428, - "rewards/margins": 0.010010452009737492, - "rewards/rejected": -0.0641649141907692, - "sft_loss": 1.0830891132354736, - "step": 220 - }, - { - "epoch": 0.72, - "grad_norm": 4.769856705836216, - "learning_rate": 4.70727166072964e-06, - "logits/chosen": -0.9934102296829224, - "logits/rejected": -1.0441877841949463, - "logps/chosen": -1.1385221481323242, - "logps/rejected": -1.3962942361831665, - "loss": 1.2004, - "odds_ratio_loss": 0.631730318069458, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.05692611262202263, - "rewards/margins": 0.012888607569038868, - "rewards/rejected": -0.06981472671031952, - "sft_loss": 1.1385221481323242, - "step": 225 - }, - { - "epoch": 0.736, - "grad_norm": 9.602754215145033, - "learning_rate": 4.684989818598887e-06, - "logits/chosen": -0.8686431050300598, - "logits/rejected": -0.9307680130004883, - "logps/chosen": -1.2431721687316895, - "logps/rejected": -1.3505405187606812, - "loss": 1.1679, - "odds_ratio_loss": 0.7377614974975586, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.06215860694646835, - "rewards/margins": 0.005368414800614119, - "rewards/rejected": -0.06752702593803406, - "sft_loss": 1.2431721687316895, - "step": 230 - }, - { - "epoch": 0.752, - "grad_norm": 4.991520073276977, - "learning_rate": 4.661947558460296e-06, - "logits/chosen": -0.893735408782959, - "logits/rejected": -1.037488579750061, - "logps/chosen": -1.2366321086883545, - "logps/rejected": -1.422572135925293, - "loss": 1.1988, - "odds_ratio_loss": 0.6442903280258179, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.06183161213994026, - "rewards/margins": 0.009297001175582409, - "rewards/rejected": -0.07112861424684525, - "sft_loss": 1.2366321086883545, - "step": 235 - }, - { - "epoch": 0.768, - "grad_norm": 5.765863831511136, - "learning_rate": 4.63815289945858e-06, - "logits/chosen": -0.8988453149795532, - "logits/rejected": -1.1190879344940186, - "logps/chosen": -1.1124114990234375, - "logps/rejected": -1.2776150703430176, - "loss": 1.2519, - "odds_ratio_loss": 0.697569727897644, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.055620573461055756, - "rewards/margins": 0.008260180242359638, - "rewards/rejected": -0.06388075649738312, - "sft_loss": 1.1124114990234375, - "step": 240 - }, - { - "epoch": 0.784, - "grad_norm": 4.2480178226924465, - "learning_rate": 4.613614122587563e-06, - "logits/chosen": -0.8890430331230164, - "logits/rejected": -0.993693470954895, - "logps/chosen": -1.1632936000823975, - "logps/rejected": -1.2893292903900146, - "loss": 1.2055, - "odds_ratio_loss": 0.7878369092941284, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.058164678514003754, - "rewards/margins": 0.0063017881475389, - "rewards/rejected": -0.0644664615392685, - "sft_loss": 1.1632936000823975, - "step": 245 - }, - { - "epoch": 0.8, - "grad_norm": 4.066953723548615, - "learning_rate": 4.5883397678082385e-06, - "logits/chosen": -0.8833068013191223, - "logits/rejected": -1.0138885974884033, - "logps/chosen": -1.2117881774902344, - "logps/rejected": -1.1593796014785767, - "loss": 1.1903, - "odds_ratio_loss": 0.7823435664176941, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.06058941036462784, - "rewards/margins": -0.002620431361719966, - "rewards/rejected": -0.05796898156404495, - "sft_loss": 1.2117881774902344, - "step": 250 - }, - { - "epoch": 0.816, - "grad_norm": 5.662450247054175, - "learning_rate": 4.562338631076703e-06, - "logits/chosen": -1.0948327779769897, - "logits/rejected": -1.1347147226333618, - "logps/chosen": -1.0046937465667725, - "logps/rejected": -1.2093970775604248, - "loss": 1.1825, - "odds_ratio_loss": 0.6374476552009583, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.05023468658328056, - "rewards/margins": 0.010235178284347057, - "rewards/rejected": -0.060469865798950195, - "sft_loss": 1.0046937465667725, - "step": 255 - }, - { - "epoch": 0.832, - "grad_norm": 5.116135672128333, - "learning_rate": 4.535619761282989e-06, - "logits/chosen": -0.9458154439926147, - "logits/rejected": -1.224017858505249, - "logps/chosen": -1.202074646949768, - "logps/rejected": -1.3588545322418213, - "loss": 1.23, - "odds_ratio_loss": 0.668530285358429, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.06010373681783676, - "rewards/margins": 0.007838994264602661, - "rewards/rejected": -0.06794272363185883, - "sft_loss": 1.202074646949768, - "step": 260 - }, - { - "epoch": 0.848, - "grad_norm": 8.619415579204428, - "learning_rate": 4.508192457101886e-06, - "logits/chosen": -1.0687763690948486, - "logits/rejected": -1.213895320892334, - "logps/chosen": -1.2021548748016357, - "logps/rejected": -1.541288137435913, - "loss": 1.2482, - "odds_ratio_loss": 0.658495306968689, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.060107748955488205, - "rewards/margins": 0.016956666484475136, - "rewards/rejected": -0.0770644098520279, - "sft_loss": 1.2021548748016357, - "step": 265 - }, - { - "epoch": 0.864, - "grad_norm": 5.000713262682842, - "learning_rate": 4.480066263756821e-06, - "logits/chosen": -0.9636927843093872, - "logits/rejected": -1.1979575157165527, - "logps/chosen": -1.220902442932129, - "logps/rejected": -1.5320258140563965, - "loss": 1.1889, - "odds_ratio_loss": 0.5880663990974426, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.06104512885212898, - "rewards/margins": 0.01555616408586502, - "rewards/rejected": -0.0766012892127037, - "sft_loss": 1.220902442932129, - "step": 270 - }, - { - "epoch": 0.88, - "grad_norm": 6.295390583446683, - "learning_rate": 4.451250969697944e-06, - "logits/chosen": -0.9099109768867493, - "logits/rejected": -1.1926413774490356, - "logps/chosen": -1.1539406776428223, - "logps/rejected": -1.4602700471878052, - "loss": 1.2078, - "odds_ratio_loss": 0.5688539743423462, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05769703909754753, - "rewards/margins": 0.015316471457481384, - "rewards/rejected": -0.07301349937915802, - "sft_loss": 1.1539406776428223, - "step": 275 - }, - { - "epoch": 0.896, - "grad_norm": 3.602560879317352, - "learning_rate": 4.42175660319555e-06, - "logits/chosen": -1.159501314163208, - "logits/rejected": -1.2406375408172607, - "logps/chosen": -1.192801594734192, - "logps/rejected": -1.3772008419036865, - "loss": 1.1898, - "odds_ratio_loss": 0.7453117966651917, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": -0.059640079736709595, - "rewards/margins": 0.00921996496617794, - "rewards/rejected": -0.06886004656553268, - "sft_loss": 1.192801594734192, - "step": 280 - }, - { - "epoch": 0.912, - "grad_norm": 8.783987785350908, - "learning_rate": 4.391593428850069e-06, - "logits/chosen": -0.9893490076065063, - "logits/rejected": -1.1613223552703857, - "logps/chosen": -1.0789204835891724, - "logps/rejected": -1.1841206550598145, - "loss": 1.2078, - "odds_ratio_loss": 0.7538634538650513, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.05394602566957474, - "rewards/margins": 0.005260012112557888, - "rewards/rejected": -0.0592060312628746, - "sft_loss": 1.0789204835891724, - "step": 285 - }, - { - "epoch": 0.928, - "grad_norm": 6.736280205252853, - "learning_rate": 4.360771944019767e-06, - "logits/chosen": -1.0662925243377686, - "logits/rejected": -1.1013010740280151, - "logps/chosen": -1.2803829908370972, - "logps/rejected": -1.6290887594223022, - "loss": 1.2225, - "odds_ratio_loss": 0.6528395414352417, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.06401915848255157, - "rewards/margins": 0.017435286194086075, - "rewards/rejected": -0.08145444095134735, - "sft_loss": 1.2803829908370972, - "step": 290 - }, - { - "epoch": 0.944, - "grad_norm": 7.2196796373916365, - "learning_rate": 4.329302875167486e-06, - "logits/chosen": -1.0543363094329834, - "logits/rejected": -1.2899835109710693, - "logps/chosen": -1.1794860363006592, - "logps/rejected": -1.4053813219070435, - "loss": 1.2094, - "odds_ratio_loss": 0.6699637174606323, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.05897430330514908, - "rewards/margins": 0.011294771917164326, - "rewards/rejected": -0.07026907801628113, - "sft_loss": 1.1794860363006592, - "step": 295 - }, - { - "epoch": 0.96, - "grad_norm": 5.962697902517566, - "learning_rate": 4.297197174127619e-06, - "logits/chosen": -1.0228321552276611, - "logits/rejected": -1.2345274686813354, - "logps/chosen": -1.0752114057540894, - "logps/rejected": -1.2755658626556396, - "loss": 1.212, - "odds_ratio_loss": 0.6680848002433777, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.05376056954264641, - "rewards/margins": 0.01001772377640009, - "rewards/rejected": -0.06377829611301422, - "sft_loss": 1.0752114057540894, - "step": 300 - }, - { - "epoch": 0.976, - "grad_norm": 3.4092575585850917, - "learning_rate": 4.2644660142946685e-06, - "logits/chosen": -0.9500937461853027, - "logits/rejected": -1.0103486776351929, - "logps/chosen": -1.0463281869888306, - "logps/rejected": -1.4567468166351318, - "loss": 1.2082, - "odds_ratio_loss": 0.5394900441169739, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.05231640487909317, - "rewards/margins": 0.020520929247140884, - "rewards/rejected": -0.07283733785152435, - "sft_loss": 1.0463281869888306, - "step": 305 - }, - { - "epoch": 0.992, - "grad_norm": 4.944690721660475, - "learning_rate": 4.231120786734689e-06, - "logits/chosen": -1.0554213523864746, - "logits/rejected": -1.2123026847839355, - "logps/chosen": -1.0677235126495361, - "logps/rejected": -1.427178978919983, - "loss": 1.1548, - "odds_ratio_loss": 0.5765484571456909, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.05338617414236069, - "rewards/margins": 0.01797277294099331, - "rewards/rejected": -0.07135894149541855, - "sft_loss": 1.0677235126495361, - "step": 310 - }, - { - "epoch": 1.008, - "grad_norm": 3.9331589062338344, - "learning_rate": 4.197173096220983e-06, - "logits/chosen": -1.0465214252471924, - "logits/rejected": -1.0950441360473633, - "logps/chosen": -1.0325384140014648, - "logps/rejected": -1.4658787250518799, - "loss": 1.1007, - "odds_ratio_loss": 0.5475068688392639, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.05162692070007324, - "rewards/margins": 0.02166702225804329, - "rewards/rejected": -0.07329393923282623, - "sft_loss": 1.0325384140014648, - "step": 315 - }, - { - "epoch": 1.024, - "grad_norm": 2.810636096476599, - "learning_rate": 4.162634757195418e-06, - "logits/chosen": -0.9247652888298035, - "logits/rejected": -1.031118392944336, - "logps/chosen": -0.9556388854980469, - "logps/rejected": -1.2451565265655518, - "loss": 0.9386, - "odds_ratio_loss": 0.594650149345398, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.047781944274902344, - "rewards/margins": 0.014475872740149498, - "rewards/rejected": -0.06225781887769699, - "sft_loss": 0.9556388854980469, - "step": 320 - }, - { - "epoch": 1.04, - "grad_norm": 3.0448051628925454, - "learning_rate": 4.127517789656772e-06, - "logits/chosen": -0.9287153482437134, - "logits/rejected": -1.0283979177474976, - "logps/chosen": -1.0202507972717285, - "logps/rejected": -1.2847683429718018, - "loss": 1.0353, - "odds_ratio_loss": 0.5959427952766418, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.051012538373470306, - "rewards/margins": 0.01322587113827467, - "rewards/rejected": -0.06423841416835785, - "sft_loss": 1.0202507972717285, - "step": 325 - }, - { - "epoch": 1.056, - "grad_norm": 4.1771500190151825, - "learning_rate": 4.091834414977556e-06, - "logits/chosen": -1.0129518508911133, - "logits/rejected": -1.271144986152649, - "logps/chosen": -1.018913984298706, - "logps/rejected": -1.554570198059082, - "loss": 0.9511, - "odds_ratio_loss": 0.47632694244384766, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.0509456992149353, - "rewards/margins": 0.026782814413309097, - "rewards/rejected": -0.0777285099029541, - "sft_loss": 1.018913984298706, - "step": 330 - }, - { - "epoch": 1.072, - "grad_norm": 6.50448197903886, - "learning_rate": 4.055597051650731e-06, - "logits/chosen": -1.0193812847137451, - "logits/rejected": -1.1772067546844482, - "logps/chosen": -0.9281299710273743, - "logps/rejected": -1.3907569646835327, - "loss": 0.9662, - "odds_ratio_loss": 0.5411213040351868, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.046406496316194534, - "rewards/margins": 0.023131350055336952, - "rewards/rejected": -0.06953784823417664, - "sft_loss": 0.9281299710273743, - "step": 335 - }, - { - "epoch": 1.088, - "grad_norm": 5.600060917092442, - "learning_rate": 4.018818310967843e-06, - "logits/chosen": -1.1713335514068604, - "logits/rejected": -1.2236130237579346, - "logps/chosen": -1.0068949460983276, - "logps/rejected": -1.1553858518600464, - "loss": 0.9817, - "odds_ratio_loss": 0.7148981094360352, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.05034474655985832, - "rewards/margins": 0.007424544543027878, - "rewards/rejected": -0.0577692911028862, - "sft_loss": 1.0068949460983276, - "step": 340 - }, - { - "epoch": 1.104, - "grad_norm": 5.092643762749835, - "learning_rate": 3.981510992630055e-06, - "logits/chosen": -0.9599639177322388, - "logits/rejected": -1.189029574394226, - "logps/chosen": -0.9695978164672852, - "logps/rejected": -1.4548832178115845, - "loss": 0.9953, - "odds_ratio_loss": 0.5450643301010132, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.048479896038770676, - "rewards/margins": 0.024264268577098846, - "rewards/rejected": -0.07274416834115982, - "sft_loss": 0.9695978164672852, - "step": 345 - }, - { - "epoch": 1.12, - "grad_norm": 4.234304914475125, - "learning_rate": 3.943688080293607e-06, - "logits/chosen": -0.9754691123962402, - "logits/rejected": -1.1199839115142822, - "logps/chosen": -0.894666314125061, - "logps/rejected": -1.577906847000122, - "loss": 0.9707, - "odds_ratio_loss": 0.45880669355392456, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.044733308255672455, - "rewards/margins": 0.03416203707456589, - "rewards/rejected": -0.07889535278081894, - "sft_loss": 0.894666314125061, - "step": 350 - }, - { - "epoch": 1.1360000000000001, - "grad_norm": 4.426257939120987, - "learning_rate": 3.905362737051252e-06, - "logits/chosen": -1.0120149850845337, - "logits/rejected": -1.1284050941467285, - "logps/chosen": -0.8394268751144409, - "logps/rejected": -1.3487886190414429, - "loss": 0.9606, - "odds_ratio_loss": 0.5031847357749939, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.041971348226070404, - "rewards/margins": 0.02546808496117592, - "rewards/rejected": -0.06743943691253662, - "sft_loss": 0.8394268751144409, - "step": 355 - }, - { - "epoch": 1.152, - "grad_norm": 6.846072733066629, - "learning_rate": 3.866548300851254e-06, - "logits/chosen": -1.0661356449127197, - "logits/rejected": -1.1536469459533691, - "logps/chosen": -0.9550708532333374, - "logps/rejected": -1.400836706161499, - "loss": 0.9989, - "odds_ratio_loss": 0.5404989123344421, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.04775355011224747, - "rewards/margins": 0.02228829450905323, - "rewards/rejected": -0.07004183530807495, - "sft_loss": 0.9550708532333374, - "step": 360 - }, - { - "epoch": 1.168, - "grad_norm": 4.238159230888541, - "learning_rate": 3.827258279855527e-06, - "logits/chosen": -1.1802518367767334, - "logits/rejected": -1.206186056137085, - "logps/chosen": -1.0155624151229858, - "logps/rejected": -1.3900431394577026, - "loss": 0.9456, - "odds_ratio_loss": 0.6698687672615051, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.05077812820672989, - "rewards/margins": 0.01872403547167778, - "rewards/rejected": -0.06950215995311737, - "sft_loss": 1.0155624151229858, - "step": 365 - }, - { - "epoch": 1.184, - "grad_norm": 3.7353385077354337, - "learning_rate": 3.787506347738538e-06, - "logits/chosen": -0.9678732752799988, - "logits/rejected": -1.1043659448623657, - "logps/chosen": -0.9759182929992676, - "logps/rejected": -1.3616999387741089, - "loss": 0.9721, - "odds_ratio_loss": 0.6270595788955688, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.0487959161400795, - "rewards/margins": 0.019289087504148483, - "rewards/rejected": -0.06808499991893768, - "sft_loss": 0.9759182929992676, - "step": 370 - }, - { - "epoch": 1.2, - "grad_norm": 4.7540211314956204, - "learning_rate": 3.747306338928609e-06, - "logits/chosen": -1.049630045890808, - "logits/rejected": -1.2382904291152954, - "logps/chosen": -0.938238263130188, - "logps/rejected": -1.2308955192565918, - "loss": 1.0158, - "odds_ratio_loss": 0.5973559021949768, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.04691191390156746, - "rewards/margins": 0.01463286578655243, - "rewards/rejected": -0.06154477596282959, - "sft_loss": 0.938238263130188, - "step": 375 - }, - { - "epoch": 1.216, - "grad_norm": 5.565360284878935, - "learning_rate": 3.706672243793271e-06, - "logits/chosen": -0.9532748460769653, - "logits/rejected": -1.03403902053833, - "logps/chosen": -0.9681148529052734, - "logps/rejected": -1.2792482376098633, - "loss": 0.9464, - "odds_ratio_loss": 0.5441717505455017, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.04840574413537979, - "rewards/margins": 0.015556666068732738, - "rewards/rejected": -0.0639624148607254, - "sft_loss": 0.9681148529052734, - "step": 380 - }, - { - "epoch": 1.232, - "grad_norm": 3.447185504214593, - "learning_rate": 3.665618203770352e-06, - "logits/chosen": -1.0210788249969482, - "logits/rejected": -1.1396093368530273, - "logps/chosen": -0.8457318544387817, - "logps/rejected": -1.2301608324050903, - "loss": 0.9478, - "odds_ratio_loss": 0.5166088342666626, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.04228659346699715, - "rewards/margins": 0.01922144927084446, - "rewards/rejected": -0.06150804087519646, - "sft_loss": 0.8457318544387817, - "step": 385 - }, - { - "epoch": 1.248, - "grad_norm": 3.3338981917851047, - "learning_rate": 3.6241585064464846e-06, - "logits/chosen": -1.2911275625228882, - "logits/rejected": -1.1232960224151611, - "logps/chosen": -0.8720508813858032, - "logps/rejected": -1.23488187789917, - "loss": 0.9435, - "odds_ratio_loss": 0.5066360831260681, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.04360254481434822, - "rewards/margins": 0.01814154163002968, - "rewards/rejected": -0.0617440864443779, - "sft_loss": 0.8720508813858032, - "step": 390 - }, - { - "epoch": 1.264, - "grad_norm": 3.567835515660478, - "learning_rate": 3.582307580584759e-06, - "logits/chosen": -0.959577739238739, - "logits/rejected": -1.2563647031784058, - "logps/chosen": -0.8968993425369263, - "logps/rejected": -1.2443525791168213, - "loss": 0.9669, - "odds_ratio_loss": 0.521682620048523, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04484497010707855, - "rewards/margins": 0.01737266220152378, - "rewards/rejected": -0.062217630445957184, - "sft_loss": 0.8968993425369263, - "step": 395 - }, - { - "epoch": 1.28, - "grad_norm": 3.9755484861044956, - "learning_rate": 3.5400799911032357e-06, - "logits/chosen": -1.054391860961914, - "logits/rejected": -1.017757534980774, - "logps/chosen": -0.8325561285018921, - "logps/rejected": -1.2529734373092651, - "loss": 0.9323, - "odds_ratio_loss": 0.4909485876560211, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.041627805680036545, - "rewards/margins": 0.021020859479904175, - "rewards/rejected": -0.06264867633581161, - "sft_loss": 0.8325561285018921, - "step": 400 - }, - { - "epoch": 1.296, - "grad_norm": 3.4685607506272476, - "learning_rate": 3.4974904340060756e-06, - "logits/chosen": -1.004164218902588, - "logits/rejected": -1.320536732673645, - "logps/chosen": -1.0670472383499146, - "logps/rejected": -1.2909588813781738, - "loss": 0.9978, - "odds_ratio_loss": 0.6580184102058411, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.05335236340761185, - "rewards/margins": 0.011195586062967777, - "rewards/rejected": -0.06454795598983765, - "sft_loss": 1.0670472383499146, - "step": 405 - }, - { - "epoch": 1.312, - "grad_norm": 4.3019598153539516, - "learning_rate": 3.4545537312690565e-06, - "logits/chosen": -1.1336114406585693, - "logits/rejected": -1.1372759342193604, - "logps/chosen": -0.9065093994140625, - "logps/rejected": -1.415405511856079, - "loss": 0.9679, - "odds_ratio_loss": 0.4388805031776428, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.045325469225645065, - "rewards/margins": 0.02544480562210083, - "rewards/rejected": -0.0707702785730362, - "sft_loss": 0.9065093994140625, - "step": 410 - }, - { - "epoch": 1.328, - "grad_norm": 3.490824264107295, - "learning_rate": 3.4112848256812374e-06, - "logits/chosen": -1.0414103269577026, - "logits/rejected": -1.164736032485962, - "logps/chosen": -0.8454300761222839, - "logps/rejected": -1.311577558517456, - "loss": 0.9274, - "odds_ratio_loss": 0.503560483455658, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.04227150231599808, - "rewards/margins": 0.02330738492310047, - "rewards/rejected": -0.0655788853764534, - "sft_loss": 0.8454300761222839, - "step": 415 - }, - { - "epoch": 1.3439999999999999, - "grad_norm": 8.789242453958117, - "learning_rate": 3.3676987756445894e-06, - "logits/chosen": -0.935681164264679, - "logits/rejected": -1.1432523727416992, - "logps/chosen": -0.9042528867721558, - "logps/rejected": -1.587890386581421, - "loss": 0.9137, - "odds_ratio_loss": 0.47100192308425903, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04521264508366585, - "rewards/margins": 0.0341818742454052, - "rewards/rejected": -0.07939452677965164, - "sft_loss": 0.9042528867721558, - "step": 420 - }, - { - "epoch": 1.3599999999999999, - "grad_norm": 4.930177739224116, - "learning_rate": 3.323810749933381e-06, - "logits/chosen": -1.0698130130767822, - "logits/rejected": -1.226738691329956, - "logps/chosen": -0.8717530369758606, - "logps/rejected": -1.4678089618682861, - "loss": 0.9124, - "odds_ratio_loss": 0.4427013397216797, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.04358765110373497, - "rewards/margins": 0.02980278991162777, - "rewards/rejected": -0.07339043915271759, - "sft_loss": 0.8717530369758606, - "step": 425 - }, - { - "epoch": 1.376, - "grad_norm": 7.570978097521412, - "learning_rate": 3.2796360224151587e-06, - "logits/chosen": -1.2224234342575073, - "logits/rejected": -1.3439141511917114, - "logps/chosen": -0.9194822311401367, - "logps/rejected": -1.3092845678329468, - "loss": 0.9594, - "odds_ratio_loss": 0.5272402763366699, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.045974113047122955, - "rewards/margins": 0.019490113481879234, - "rewards/rejected": -0.06546422094106674, - "sft_loss": 0.9194822311401367, - "step": 430 - }, - { - "epoch": 1.392, - "grad_norm": 4.982061148370284, - "learning_rate": 3.235189966735148e-06, - "logits/chosen": -1.2072560787200928, - "logits/rejected": -1.1339049339294434, - "logps/chosen": -0.805392861366272, - "logps/rejected": -1.594913125038147, - "loss": 0.9259, - "odds_ratio_loss": 0.406252384185791, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.0402696393430233, - "rewards/margins": 0.03947601094841957, - "rewards/rejected": -0.07974565029144287, - "sft_loss": 0.805392861366272, - "step": 435 - }, - { - "epoch": 1.408, - "grad_norm": 7.692151482525084, - "learning_rate": 3.1904880509659397e-06, - "logits/chosen": -1.0890783071517944, - "logits/rejected": -1.177278757095337, - "logps/chosen": -0.9858812093734741, - "logps/rejected": -1.204777479171753, - "loss": 0.9667, - "odds_ratio_loss": 0.637945830821991, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.049294065684080124, - "rewards/margins": 0.010944806039333344, - "rewards/rejected": -0.060238875448703766, - "sft_loss": 0.9858812093734741, - "step": 440 - }, - { - "epoch": 1.424, - "grad_norm": 4.492542501706703, - "learning_rate": 3.1455458322242943e-06, - "logits/chosen": -1.004298448562622, - "logits/rejected": -1.1840426921844482, - "logps/chosen": -0.9921313524246216, - "logps/rejected": -1.2852861881256104, - "loss": 0.9938, - "odds_ratio_loss": 0.6350023150444031, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.0496065691113472, - "rewards/margins": 0.014657738618552685, - "rewards/rejected": -0.06426431238651276, - "sft_loss": 0.9921313524246216, - "step": 445 - }, - { - "epoch": 1.44, - "grad_norm": 4.069388708509856, - "learning_rate": 3.100378951256981e-06, - "logits/chosen": -1.2047039270401, - "logits/rejected": -1.2526731491088867, - "logps/chosen": -0.8970128297805786, - "logps/rejected": -1.087720274925232, - "loss": 0.9649, - "odds_ratio_loss": 0.6119378805160522, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.04485064372420311, - "rewards/margins": 0.009535368531942368, - "rewards/rejected": -0.05438600853085518, - "sft_loss": 0.8970128297805786, - "step": 450 - }, - { - "epoch": 1.456, - "grad_norm": 4.1796047584641745, - "learning_rate": 3.055003126997495e-06, - "logits/chosen": -1.100693941116333, - "logits/rejected": -1.1093151569366455, - "logps/chosen": -0.9937971234321594, - "logps/rejected": -1.3282194137573242, - "loss": 0.9642, - "odds_ratio_loss": 0.5750898122787476, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.04968985915184021, - "rewards/margins": 0.01672111637890339, - "rewards/rejected": -0.06641098111867905, - "sft_loss": 0.9937971234321594, - "step": 455 - }, - { - "epoch": 1.472, - "grad_norm": 4.0507990154789075, - "learning_rate": 3.0094341510955697e-06, - "logits/chosen": -1.1363385915756226, - "logits/rejected": -1.1738272905349731, - "logps/chosen": -0.877064049243927, - "logps/rejected": -1.4007574319839478, - "loss": 0.959, - "odds_ratio_loss": 0.5275009870529175, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04385320469737053, - "rewards/margins": 0.026184672489762306, - "rewards/rejected": -0.07003787159919739, - "sft_loss": 0.877064049243927, - "step": 460 - }, - { - "epoch": 1.488, - "grad_norm": 5.088249905544024, - "learning_rate": 2.963687882421383e-06, - "logits/chosen": -1.096651554107666, - "logits/rejected": -1.1972352266311646, - "logps/chosen": -0.9147326350212097, - "logps/rejected": -1.253187894821167, - "loss": 0.9591, - "odds_ratio_loss": 0.5869191288948059, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.04573662951588631, - "rewards/margins": 0.01692276820540428, - "rewards/rejected": -0.06265939772129059, - "sft_loss": 0.9147326350212097, - "step": 465 - }, - { - "epoch": 1.504, - "grad_norm": 4.206721558657063, - "learning_rate": 2.9177802415463714e-06, - "logits/chosen": -0.9876214265823364, - "logits/rejected": -1.073717713356018, - "logps/chosen": -0.7661324143409729, - "logps/rejected": -1.2461497783660889, - "loss": 0.9619, - "odds_ratio_loss": 0.45304179191589355, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.038306623697280884, - "rewards/margins": 0.0240008607506752, - "rewards/rejected": -0.062307484447956085, - "sft_loss": 0.7661324143409729, - "step": 470 - }, - { - "epoch": 1.52, - "grad_norm": 3.67250528125703, - "learning_rate": 2.871727205202563e-06, - "logits/chosen": -1.0482373237609863, - "logits/rejected": -1.1120043992996216, - "logps/chosen": -0.950273871421814, - "logps/rejected": -1.2787425518035889, - "loss": 0.9498, - "odds_ratio_loss": 0.599994421005249, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.047513701021671295, - "rewards/margins": 0.016423430293798447, - "rewards/rejected": -0.06393712759017944, - "sft_loss": 0.950273871421814, - "step": 475 - }, - { - "epoch": 1.536, - "grad_norm": 4.735321963888987, - "learning_rate": 2.825544800722376e-06, - "logits/chosen": -1.1098222732543945, - "logits/rejected": -1.0207080841064453, - "logps/chosen": -0.9046823382377625, - "logps/rejected": -1.3911302089691162, - "loss": 0.9629, - "odds_ratio_loss": 0.5132350921630859, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.04523410648107529, - "rewards/margins": 0.0243223924189806, - "rewards/rejected": -0.06955650448799133, - "sft_loss": 0.9046823382377625, - "step": 480 - }, - { - "epoch": 1.552, - "grad_norm": 3.159767853907868, - "learning_rate": 2.7792491004607984e-06, - "logits/chosen": -1.0096070766448975, - "logits/rejected": -1.0770386457443237, - "logps/chosen": -0.9277147054672241, - "logps/rejected": -1.344773530960083, - "loss": 0.9485, - "odds_ratio_loss": 0.5044125318527222, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.046385735273361206, - "rewards/margins": 0.020852940157055855, - "rewards/rejected": -0.06723867356777191, - "sft_loss": 0.9277147054672241, - "step": 485 - }, - { - "epoch": 1.568, - "grad_norm": 4.013671913402214, - "learning_rate": 2.732856216201906e-06, - "logits/chosen": -1.063245415687561, - "logits/rejected": -1.1233893632888794, - "logps/chosen": -0.9281940460205078, - "logps/rejected": -1.3444063663482666, - "loss": 0.9529, - "odds_ratio_loss": 0.489690363407135, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04640969634056091, - "rewards/margins": 0.020810618996620178, - "rewards/rejected": -0.06722031533718109, - "sft_loss": 0.9281940460205078, - "step": 490 - }, - { - "epoch": 1.584, - "grad_norm": 4.31314553790671, - "learning_rate": 2.6863822935516546e-06, - "logits/chosen": -1.0234626531600952, - "logits/rejected": -1.1978650093078613, - "logps/chosen": -0.9632134437561035, - "logps/rejected": -1.206189751625061, - "loss": 0.9733, - "odds_ratio_loss": 0.5884974002838135, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.048160675913095474, - "rewards/margins": 0.012148816138505936, - "rewards/rejected": -0.06030949205160141, - "sft_loss": 0.9632134437561035, - "step": 495 - }, - { - "epoch": 1.6, - "grad_norm": 3.3125411606876445, - "learning_rate": 2.639843506318899e-06, - "logits/chosen": -0.9256556630134583, - "logits/rejected": -1.1392202377319336, - "logps/chosen": -0.8793741464614868, - "logps/rejected": -1.3441039323806763, - "loss": 0.9562, - "odds_ratio_loss": 0.510412335395813, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04396871104836464, - "rewards/margins": 0.023236487060785294, - "rewards/rejected": -0.06720519065856934, - "sft_loss": 0.8793741464614868, - "step": 500 - }, - { - "epoch": 1.616, - "grad_norm": 5.866811891887456, - "learning_rate": 2.593256050886603e-06, - "logits/chosen": -0.8647163510322571, - "logits/rejected": -1.0798470973968506, - "logps/chosen": -0.9747751951217651, - "logps/rejected": -1.2537850141525269, - "loss": 0.9531, - "odds_ratio_loss": 0.5470961332321167, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.0487387590110302, - "rewards/margins": 0.01395049411803484, - "rewards/rejected": -0.06268925964832306, - "sft_loss": 0.9747751951217651, - "step": 505 - }, - { - "epoch": 1.6320000000000001, - "grad_norm": 4.074172203592654, - "learning_rate": 2.5466361405751914e-06, - "logits/chosen": -0.9186518788337708, - "logits/rejected": -1.1423577070236206, - "logps/chosen": -0.9840022921562195, - "logps/rejected": -1.3202232122421265, - "loss": 0.949, - "odds_ratio_loss": 0.564594566822052, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.049200110137462616, - "rewards/margins": 0.01681104302406311, - "rewards/rejected": -0.06601114571094513, - "sft_loss": 0.9840022921562195, - "step": 510 - }, - { - "epoch": 1.6480000000000001, - "grad_norm": 5.002204843627367, - "learning_rate": 2.5e-06, - "logits/chosen": -0.8605943918228149, - "logits/rejected": -0.9710782766342163, - "logps/chosen": -0.8691279292106628, - "logps/rejected": -1.2410802841186523, - "loss": 0.9795, - "odds_ratio_loss": 0.5254562497138977, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.04345639795064926, - "rewards/margins": 0.018597617745399475, - "rewards/rejected": -0.06205401569604874, - "sft_loss": 0.8691279292106628, - "step": 515 - }, - { - "epoch": 1.6640000000000001, - "grad_norm": 8.858448372423531, - "learning_rate": 2.4533638594248094e-06, - "logits/chosen": -0.9867407083511353, - "logits/rejected": -1.0888783931732178, - "logps/chosen": -0.9163697957992554, - "logps/rejected": -1.1703190803527832, - "loss": 0.9221, - "odds_ratio_loss": 0.5501306056976318, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.045818496495485306, - "rewards/margins": 0.012697461061179638, - "rewards/rejected": -0.05851595476269722, - "sft_loss": 0.9163697957992554, - "step": 520 - }, - { - "epoch": 1.6800000000000002, - "grad_norm": 4.445755031880296, - "learning_rate": 2.406743949113397e-06, - "logits/chosen": -0.9282479286193848, - "logits/rejected": -1.1179710626602173, - "logps/chosen": -0.872515082359314, - "logps/rejected": -1.3933911323547363, - "loss": 0.9773, - "odds_ratio_loss": 0.5091356039047241, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.04362575337290764, - "rewards/margins": 0.026043808087706566, - "rewards/rejected": -0.06966955959796906, - "sft_loss": 0.872515082359314, - "step": 525 - }, - { - "epoch": 1.696, - "grad_norm": 3.6413598914526486, - "learning_rate": 2.360156493681102e-06, - "logits/chosen": -0.910436749458313, - "logits/rejected": -1.0247265100479126, - "logps/chosen": -0.9496439695358276, - "logps/rejected": -1.3968040943145752, - "loss": 0.9491, - "odds_ratio_loss": 0.48868894577026367, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.0474822036921978, - "rewards/margins": 0.0223580040037632, - "rewards/rejected": -0.069840207695961, - "sft_loss": 0.9496439695358276, - "step": 530 - }, - { - "epoch": 1.712, - "grad_norm": 4.294695857238959, - "learning_rate": 2.3136177064483462e-06, - "logits/chosen": -1.0878788232803345, - "logits/rejected": -1.283992052078247, - "logps/chosen": -1.003148078918457, - "logps/rejected": -1.4421515464782715, - "loss": 0.9586, - "odds_ratio_loss": 0.49956098198890686, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.05015740543603897, - "rewards/margins": 0.021950174123048782, - "rewards/rejected": -0.07210757583379745, - "sft_loss": 1.003148078918457, - "step": 535 - }, - { - "epoch": 1.728, - "grad_norm": 4.183928338953188, - "learning_rate": 2.2671437837980943e-06, - "logits/chosen": -1.0223979949951172, - "logits/rejected": -0.9681426286697388, - "logps/chosen": -0.8269376754760742, - "logps/rejected": -1.141745924949646, - "loss": 0.8999, - "odds_ratio_loss": 0.5333009958267212, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.04134688898921013, - "rewards/margins": 0.0157404113560915, - "rewards/rejected": -0.05708730220794678, - "sft_loss": 0.8269376754760742, - "step": 540 - }, - { - "epoch": 1.744, - "grad_norm": 4.65432905249252, - "learning_rate": 2.2207508995392024e-06, - "logits/chosen": -0.9679327011108398, - "logits/rejected": -1.0990087985992432, - "logps/chosen": -0.9268666505813599, - "logps/rejected": -1.2932831048965454, - "loss": 0.9613, - "odds_ratio_loss": 0.5248704552650452, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.04634333401918411, - "rewards/margins": 0.018320811912417412, - "rewards/rejected": -0.06466414779424667, - "sft_loss": 0.9268666505813599, - "step": 545 - }, - { - "epoch": 1.76, - "grad_norm": 3.9586982529561205, - "learning_rate": 2.1744551992776247e-06, - "logits/chosen": -1.0059576034545898, - "logits/rejected": -1.0268685817718506, - "logps/chosen": -0.9964353442192078, - "logps/rejected": -1.6557565927505493, - "loss": 0.9815, - "odds_ratio_loss": 0.49777206778526306, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.04982176423072815, - "rewards/margins": 0.032966069877147675, - "rewards/rejected": -0.08278782665729523, - "sft_loss": 0.9964353442192078, - "step": 550 - }, - { - "epoch": 1.776, - "grad_norm": 9.562472852188419, - "learning_rate": 2.1282727947974373e-06, - "logits/chosen": -0.9853806495666504, - "logits/rejected": -1.1351251602172852, - "logps/chosen": -0.8706343770027161, - "logps/rejected": -1.4279866218566895, - "loss": 0.9807, - "odds_ratio_loss": 0.4323902130126953, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.043531715869903564, - "rewards/margins": 0.02786761149764061, - "rewards/rejected": -0.07139933109283447, - "sft_loss": 0.8706343770027161, - "step": 555 - }, - { - "epoch": 1.792, - "grad_norm": 4.3083896592948845, - "learning_rate": 2.082219758453629e-06, - "logits/chosen": -0.8532622456550598, - "logits/rejected": -1.0556373596191406, - "logps/chosen": -0.9265901446342468, - "logps/rejected": -1.6611661911010742, - "loss": 0.9808, - "odds_ratio_loss": 0.47050419449806213, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.04632951319217682, - "rewards/margins": 0.03672880306839943, - "rewards/rejected": -0.08305831253528595, - "sft_loss": 0.9265901446342468, - "step": 560 - }, - { - "epoch": 1.808, - "grad_norm": 4.988596811372773, - "learning_rate": 2.036312117578617e-06, - "logits/chosen": -0.8543407320976257, - "logits/rejected": -1.035388708114624, - "logps/chosen": -0.9570035934448242, - "logps/rejected": -1.4836640357971191, - "loss": 0.9573, - "odds_ratio_loss": 0.4832231104373932, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04785018041729927, - "rewards/margins": 0.02633301541209221, - "rewards/rejected": -0.07418319582939148, - "sft_loss": 0.9570035934448242, - "step": 565 - }, - { - "epoch": 1.8239999999999998, - "grad_norm": 3.4762921171914463, - "learning_rate": 1.990565848904431e-06, - "logits/chosen": -0.8208405375480652, - "logits/rejected": -0.940590500831604, - "logps/chosen": -0.8431119918823242, - "logps/rejected": -1.3455002307891846, - "loss": 0.9561, - "odds_ratio_loss": 0.4739263653755188, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04215560108423233, - "rewards/margins": 0.025119412690401077, - "rewards/rejected": -0.06727501004934311, - "sft_loss": 0.8431119918823242, - "step": 570 - }, - { - "epoch": 1.8399999999999999, - "grad_norm": 4.955237689773966, - "learning_rate": 1.9449968730025055e-06, - "logits/chosen": -0.9368654489517212, - "logits/rejected": -1.018002986907959, - "logps/chosen": -1.0393065214157104, - "logps/rejected": -1.3489172458648682, - "loss": 0.9946, - "odds_ratio_loss": 0.5891560316085815, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.05196532607078552, - "rewards/margins": 0.01548053603619337, - "rewards/rejected": -0.06744585931301117, - "sft_loss": 1.0393065214157104, - "step": 575 - }, - { - "epoch": 1.8559999999999999, - "grad_norm": 4.122518133723736, - "learning_rate": 1.899621048743019e-06, - "logits/chosen": -0.8472858667373657, - "logits/rejected": -0.9123756289482117, - "logps/chosen": -0.8696669340133667, - "logps/rejected": -1.3730722665786743, - "loss": 0.9332, - "odds_ratio_loss": 0.504239559173584, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.043483346700668335, - "rewards/margins": 0.025170262902975082, - "rewards/rejected": -0.06865362077951431, - "sft_loss": 0.8696669340133667, - "step": 580 - }, - { - "epoch": 1.8719999999999999, - "grad_norm": 5.090413419223807, - "learning_rate": 1.854454167775706e-06, - "logits/chosen": -0.8803795576095581, - "logits/rejected": -1.011498212814331, - "logps/chosen": -0.8195604085922241, - "logps/rejected": -1.5066629648208618, - "loss": 0.9053, - "odds_ratio_loss": 0.4221072793006897, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.040978025645017624, - "rewards/margins": 0.03435512259602547, - "rewards/rejected": -0.07533314824104309, - "sft_loss": 0.8195604085922241, - "step": 585 - }, - { - "epoch": 1.888, - "grad_norm": 3.962979618406243, - "learning_rate": 1.8095119490340618e-06, - "logits/chosen": -0.9934213757514954, - "logits/rejected": -1.028438925743103, - "logps/chosen": -0.9060677289962769, - "logps/rejected": -1.4857866764068604, - "loss": 0.9753, - "odds_ratio_loss": 0.6394127011299133, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.04530338943004608, - "rewards/margins": 0.028985943645238876, - "rewards/rejected": -0.07428933680057526, - "sft_loss": 0.9060677289962769, - "step": 590 - }, - { - "epoch": 1.904, - "grad_norm": 4.384891130460857, - "learning_rate": 1.764810033264852e-06, - "logits/chosen": -0.907455563545227, - "logits/rejected": -1.0589368343353271, - "logps/chosen": -1.0206191539764404, - "logps/rejected": -1.4559767246246338, - "loss": 0.9771, - "odds_ratio_loss": 0.5931872129440308, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.0510309636592865, - "rewards/margins": 0.02176786959171295, - "rewards/rejected": -0.07279883325099945, - "sft_loss": 1.0206191539764404, - "step": 595 - }, - { - "epoch": 1.92, - "grad_norm": 3.4134689622305383, - "learning_rate": 1.7203639775848423e-06, - "logits/chosen": -1.0145207643508911, - "logits/rejected": -0.9972650408744812, - "logps/chosen": -0.956381618976593, - "logps/rejected": -1.3823316097259521, - "loss": 0.9631, - "odds_ratio_loss": 0.5243911743164062, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.04781908541917801, - "rewards/margins": 0.021297505125403404, - "rewards/rejected": -0.06911659240722656, - "sft_loss": 0.956381618976593, - "step": 600 - }, - { - "epoch": 1.936, - "grad_norm": 4.505148742615646, - "learning_rate": 1.6761892500666195e-06, - "logits/chosen": -1.023581624031067, - "logits/rejected": -1.027942419052124, - "logps/chosen": -0.933574378490448, - "logps/rejected": -1.2963403463363647, - "loss": 0.9786, - "odds_ratio_loss": 0.5001336336135864, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04667872190475464, - "rewards/margins": 0.018138296902179718, - "rewards/rejected": -0.06481701880693436, - "sft_loss": 0.933574378490448, - "step": 605 - }, - { - "epoch": 1.952, - "grad_norm": 4.83889942986056, - "learning_rate": 1.632301224355411e-06, - "logits/chosen": -0.8887473940849304, - "logits/rejected": -0.9829289317131042, - "logps/chosen": -1.0025461912155151, - "logps/rejected": -1.2573713064193726, - "loss": 0.9546, - "odds_ratio_loss": 0.6103852987289429, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.0501273088157177, - "rewards/margins": 0.012741250917315483, - "rewards/rejected": -0.06286855787038803, - "sft_loss": 1.0025461912155151, - "step": 610 - }, - { - "epoch": 1.968, - "grad_norm": 4.517273923883564, - "learning_rate": 1.5887151743187634e-06, - "logits/chosen": -0.9073827862739563, - "logits/rejected": -1.0887978076934814, - "logps/chosen": -0.9604595899581909, - "logps/rejected": -1.4289729595184326, - "loss": 0.9811, - "odds_ratio_loss": 0.545965850353241, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.048022981733083725, - "rewards/margins": 0.023425664752721786, - "rewards/rejected": -0.07144864648580551, - "sft_loss": 0.9604595899581909, - "step": 615 - }, - { - "epoch": 1.984, - "grad_norm": 3.1757477627169908, - "learning_rate": 1.5454462687309445e-06, - "logits/chosen": -1.015500783920288, - "logits/rejected": -1.0785764455795288, - "logps/chosen": -0.9297005534172058, - "logps/rejected": -1.5386453866958618, - "loss": 0.9563, - "odds_ratio_loss": 0.542974054813385, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.04648502543568611, - "rewards/margins": 0.03044723905622959, - "rewards/rejected": -0.07693226635456085, - "sft_loss": 0.9297005534172058, - "step": 620 - }, - { - "epoch": 2.0, - "grad_norm": 4.242978905390128, - "learning_rate": 1.502509565993925e-06, - "logits/chosen": -0.9173771142959595, - "logits/rejected": -1.0122209787368774, - "logps/chosen": -0.9267832040786743, - "logps/rejected": -1.5153340101242065, - "loss": 0.8898, - "odds_ratio_loss": 0.45579010248184204, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.04633915796875954, - "rewards/margins": 0.02942754328250885, - "rewards/rejected": -0.07576669752597809, - "sft_loss": 0.9267832040786743, - "step": 625 - }, - { - "epoch": 2.016, - "grad_norm": 3.553798975041404, - "learning_rate": 1.4599200088967652e-06, - "logits/chosen": -0.8619723320007324, - "logits/rejected": -1.0356481075286865, - "logps/chosen": -0.8099665641784668, - "logps/rejected": -1.3428175449371338, - "loss": 0.8054, - "odds_ratio_loss": 0.42676934599876404, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04049833118915558, - "rewards/margins": 0.02664254978299141, - "rewards/rejected": -0.06714087724685669, - "sft_loss": 0.8099665641784668, - "step": 630 - }, - { - "epoch": 2.032, - "grad_norm": 4.569902596370669, - "learning_rate": 1.4176924194152405e-06, - "logits/chosen": -0.9947516322135925, - "logits/rejected": -1.0760139226913452, - "logps/chosen": -0.822666347026825, - "logps/rejected": -1.4580928087234497, - "loss": 0.8194, - "odds_ratio_loss": 0.408845990896225, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.041133321821689606, - "rewards/margins": 0.03177132084965706, - "rewards/rejected": -0.07290463894605637, - "sft_loss": 0.822666347026825, - "step": 635 - }, - { - "epoch": 2.048, - "grad_norm": 3.58850101082042, - "learning_rate": 1.3758414935535147e-06, - "logits/chosen": -0.8080762624740601, - "logits/rejected": -0.8731328845024109, - "logps/chosen": -0.6912453174591064, - "logps/rejected": -1.2947412729263306, - "loss": 0.7991, - "odds_ratio_loss": 0.39434918761253357, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.03456226736307144, - "rewards/margins": 0.030174797400832176, - "rewards/rejected": -0.06473706662654877, - "sft_loss": 0.6912453174591064, - "step": 640 - }, - { - "epoch": 2.064, - "grad_norm": 3.1095303302424204, - "learning_rate": 1.3343817962296485e-06, - "logits/chosen": -0.9426826238632202, - "logits/rejected": -0.9611290097236633, - "logps/chosen": -0.7455964088439941, - "logps/rejected": -1.3733656406402588, - "loss": 0.8219, - "odds_ratio_loss": 0.400754451751709, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.037279821932315826, - "rewards/margins": 0.03138846904039383, - "rewards/rejected": -0.06866829097270966, - "sft_loss": 0.7455964088439941, - "step": 645 - }, - { - "epoch": 2.08, - "grad_norm": 3.045256483819498, - "learning_rate": 1.293327756206729e-06, - "logits/chosen": -0.8983833193778992, - "logits/rejected": -0.9661597013473511, - "logps/chosen": -0.6603522300720215, - "logps/rejected": -1.5160930156707764, - "loss": 0.7579, - "odds_ratio_loss": 0.3125734031200409, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.033017612993717194, - "rewards/margins": 0.042787034064531326, - "rewards/rejected": -0.07580464333295822, - "sft_loss": 0.6603522300720215, - "step": 650 - }, - { - "epoch": 2.096, - "grad_norm": 3.1717779800652592, - "learning_rate": 1.252693661071391e-06, - "logits/chosen": -1.157251238822937, - "logits/rejected": -1.0623748302459717, - "logps/chosen": -0.7086284160614014, - "logps/rejected": -1.514593243598938, - "loss": 0.7865, - "odds_ratio_loss": 0.33622443675994873, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.035431426018476486, - "rewards/margins": 0.04029824212193489, - "rewards/rejected": -0.07572966068983078, - "sft_loss": 0.7086284160614014, - "step": 655 - }, - { - "epoch": 2.112, - "grad_norm": 2.7205852811983093, - "learning_rate": 1.2124936522614622e-06, - "logits/chosen": -1.0289185047149658, - "logits/rejected": -1.098934292793274, - "logps/chosen": -0.7381170988082886, - "logps/rejected": -1.4106049537658691, - "loss": 0.7903, - "odds_ratio_loss": 0.45428943634033203, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.03690585121512413, - "rewards/margins": 0.033624399453401566, - "rewards/rejected": -0.0705302506685257, - "sft_loss": 0.7381170988082886, - "step": 660 - }, - { - "epoch": 2.128, - "grad_norm": 2.8528967143088666, - "learning_rate": 1.1727417201444735e-06, - "logits/chosen": -1.0716520547866821, - "logits/rejected": -1.0720608234405518, - "logps/chosen": -0.7356234788894653, - "logps/rejected": -1.2771320343017578, - "loss": 0.797, - "odds_ratio_loss": 0.41821280121803284, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.036781176924705505, - "rewards/margins": 0.027075421065092087, - "rewards/rejected": -0.0638565942645073, - "sft_loss": 0.7356234788894653, - "step": 665 - }, - { - "epoch": 2.144, - "grad_norm": 3.7706399394026615, - "learning_rate": 1.1334516991487473e-06, - "logits/chosen": -0.7660879492759705, - "logits/rejected": -1.0078188180923462, - "logps/chosen": -0.7483173608779907, - "logps/rejected": -1.2434841394424438, - "loss": 0.7863, - "odds_ratio_loss": 0.3927895426750183, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.037415869534015656, - "rewards/margins": 0.02475833147764206, - "rewards/rejected": -0.062174201011657715, - "sft_loss": 0.7483173608779907, - "step": 670 - }, - { - "epoch": 2.16, - "grad_norm": 3.5703943571070482, - "learning_rate": 1.094637262948749e-06, - "logits/chosen": -0.9008601307868958, - "logits/rejected": -1.0412832498550415, - "logps/chosen": -0.821381688117981, - "logps/rejected": -1.3886059522628784, - "loss": 0.7675, - "odds_ratio_loss": 0.4320302903652191, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04106908664107323, - "rewards/margins": 0.02836121991276741, - "rewards/rejected": -0.06943030655384064, - "sft_loss": 0.821381688117981, - "step": 675 - }, - { - "epoch": 2.176, - "grad_norm": 2.9837572151294065, - "learning_rate": 1.0563119197063934e-06, - "logits/chosen": -1.058257818222046, - "logits/rejected": -1.0895675420761108, - "logps/chosen": -0.914572536945343, - "logps/rejected": -1.3927881717681885, - "loss": 0.7969, - "odds_ratio_loss": 0.46075814962387085, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04572862759232521, - "rewards/margins": 0.023910781368613243, - "rewards/rejected": -0.0696394145488739, - "sft_loss": 0.914572536945343, - "step": 680 - }, - { - "epoch": 2.192, - "grad_norm": 3.2497337072742414, - "learning_rate": 1.018489007369945e-06, - "logits/chosen": -0.8853160738945007, - "logits/rejected": -0.8999375104904175, - "logps/chosen": -0.7777100801467896, - "logps/rejected": -1.448999047279358, - "loss": 0.804, - "odds_ratio_loss": 0.39251843094825745, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.03888550028204918, - "rewards/margins": 0.03356444090604782, - "rewards/rejected": -0.0724499449133873, - "sft_loss": 0.7777100801467896, - "step": 685 - }, - { - "epoch": 2.208, - "grad_norm": 3.029373367507165, - "learning_rate": 9.81181689032158e-07, - "logits/chosen": -0.8918782472610474, - "logits/rejected": -0.8753920793533325, - "logps/chosen": -0.6909304857254028, - "logps/rejected": -1.2901936769485474, - "loss": 0.7751, - "odds_ratio_loss": 0.4143063426017761, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.03454653173685074, - "rewards/margins": 0.029963159933686256, - "rewards/rejected": -0.06450968980789185, - "sft_loss": 0.6909304857254028, - "step": 690 - }, - { - "epoch": 2.224, - "grad_norm": 4.5746404005970405, - "learning_rate": 9.444029483492703e-07, - "logits/chosen": -1.0063061714172363, - "logits/rejected": -1.0239533185958862, - "logps/chosen": -0.6867491006851196, - "logps/rejected": -1.245091199874878, - "loss": 0.7732, - "odds_ratio_loss": 0.4617982804775238, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.03433745354413986, - "rewards/margins": 0.027917101979255676, - "rewards/rejected": -0.06225455552339554, - "sft_loss": 0.6867491006851196, - "step": 695 - }, - { - "epoch": 2.24, - "grad_norm": 3.694255891644154, - "learning_rate": 9.081655850224449e-07, - "logits/chosen": -1.0207760334014893, - "logits/rejected": -1.1230969429016113, - "logps/chosen": -0.8350968360900879, - "logps/rejected": -1.5500547885894775, - "loss": 0.7583, - "odds_ratio_loss": 0.42910847067832947, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.041754838079214096, - "rewards/margins": 0.03574790060520172, - "rewards/rejected": -0.07750274240970612, - "sft_loss": 0.8350968360900879, - "step": 700 - }, - { - "epoch": 2.2560000000000002, - "grad_norm": 3.994487822092531, - "learning_rate": 8.724822103432282e-07, - "logits/chosen": -0.8827289342880249, - "logits/rejected": -0.9654695391654968, - "logps/chosen": -0.7088466286659241, - "logps/rejected": -1.281082034111023, - "loss": 0.7482, - "odds_ratio_loss": 0.3990407884120941, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.035442329943180084, - "rewards/margins": 0.028611773625016212, - "rewards/rejected": -0.06405410915613174, - "sft_loss": 0.7088466286659241, - "step": 705 - }, - { - "epoch": 2.2720000000000002, - "grad_norm": 3.3050191320581166, - "learning_rate": 8.373652428045831e-07, - "logits/chosen": -1.0878264904022217, - "logits/rejected": -1.0405784845352173, - "logps/chosen": -0.65968257188797, - "logps/rejected": -1.3744075298309326, - "loss": 0.7722, - "odds_ratio_loss": 0.3181590139865875, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.03298413008451462, - "rewards/margins": 0.03573625162243843, - "rewards/rejected": -0.06872038543224335, - "sft_loss": 0.65968257188797, - "step": 710 - }, - { - "epoch": 2.288, - "grad_norm": 3.8117555833594943, - "learning_rate": 8.028269037790171e-07, - "logits/chosen": -0.8991669416427612, - "logits/rejected": -0.9795140027999878, - "logps/chosen": -0.8547693490982056, - "logps/rejected": -1.5010745525360107, - "loss": 0.7688, - "odds_ratio_loss": 0.42176875472068787, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04273846372961998, - "rewards/margins": 0.03231526166200638, - "rewards/rejected": -0.07505372911691666, - "sft_loss": 0.8547693490982056, - "step": 715 - }, - { - "epoch": 2.304, - "grad_norm": 6.060881007391729, - "learning_rate": 7.688792132653111e-07, - "logits/chosen": -0.8518077731132507, - "logits/rejected": -0.9652704000473022, - "logps/chosen": -0.7316969633102417, - "logps/rejected": -1.5251853466033936, - "loss": 0.7621, - "odds_ratio_loss": 0.3367399573326111, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03658485412597656, - "rewards/margins": 0.03967442363500595, - "rewards/rejected": -0.07625927031040192, - "sft_loss": 0.7316969633102417, - "step": 720 - }, - { - "epoch": 2.32, - "grad_norm": 2.9281573250934363, - "learning_rate": 7.355339857053318e-07, - "logits/chosen": -0.8477774858474731, - "logits/rejected": -1.018747329711914, - "logps/chosen": -0.7693859934806824, - "logps/rejected": -1.3998419046401978, - "loss": 0.7714, - "odds_ratio_loss": 0.3711521625518799, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.03846929594874382, - "rewards/margins": 0.031522803008556366, - "rewards/rejected": -0.06999210268259048, - "sft_loss": 0.7693859934806824, - "step": 725 - }, - { - "epoch": 2.336, - "grad_norm": 3.3575685063670213, - "learning_rate": 7.028028258723818e-07, - "logits/chosen": -0.9588859677314758, - "logits/rejected": -1.0327568054199219, - "logps/chosen": -0.7077163457870483, - "logps/rejected": -1.3171709775924683, - "loss": 0.7932, - "odds_ratio_loss": 0.3643357455730438, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03538581356406212, - "rewards/margins": 0.030472736805677414, - "rewards/rejected": -0.06585855782032013, - "sft_loss": 0.7077163457870483, - "step": 730 - }, - { - "epoch": 2.352, - "grad_norm": 3.07955633424655, - "learning_rate": 6.706971248325151e-07, - "logits/chosen": -0.8883659243583679, - "logits/rejected": -1.0441168546676636, - "logps/chosen": -0.8074674606323242, - "logps/rejected": -1.4699620008468628, - "loss": 0.776, - "odds_ratio_loss": 0.39879804849624634, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.04037337377667427, - "rewards/margins": 0.033124733716249466, - "rewards/rejected": -0.07349809259176254, - "sft_loss": 0.8074674606323242, - "step": 735 - }, - { - "epoch": 2.368, - "grad_norm": 3.6213614434043593, - "learning_rate": 6.392280559802341e-07, - "logits/chosen": -0.9162646532058716, - "logits/rejected": -0.9279802441596985, - "logps/chosen": -0.7218812704086304, - "logps/rejected": -1.4897241592407227, - "loss": 0.7637, - "odds_ratio_loss": 0.411416620016098, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.0360940620303154, - "rewards/margins": 0.03839214891195297, - "rewards/rejected": -0.07448621094226837, - "sft_loss": 0.7218812704086304, - "step": 740 - }, - { - "epoch": 2.384, - "grad_norm": 4.1017905530653245, - "learning_rate": 6.084065711499326e-07, - "logits/chosen": -0.84368896484375, - "logits/rejected": -0.9179951548576355, - "logps/chosen": -0.6008902192115784, - "logps/rejected": -1.4554692506790161, - "loss": 0.7279, - "odds_ratio_loss": 0.3157138526439667, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -0.030044514685869217, - "rewards/margins": 0.04272894933819771, - "rewards/rejected": -0.07277346402406693, - "sft_loss": 0.6008902192115784, - "step": 745 - }, - { - "epoch": 2.4, - "grad_norm": 3.8487814271047758, - "learning_rate": 5.782433968044495e-07, - "logits/chosen": -0.8399477005004883, - "logits/rejected": -0.9413140416145325, - "logps/chosen": -0.8139437437057495, - "logps/rejected": -1.5214980840682983, - "loss": 0.7662, - "odds_ratio_loss": 0.5000792741775513, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.040697190910577774, - "rewards/margins": 0.03537772223353386, - "rewards/rejected": -0.07607491314411163, - "sft_loss": 0.8139437437057495, - "step": 750 - }, - { - "epoch": 2.416, - "grad_norm": 3.305488495552487, - "learning_rate": 5.487490303020576e-07, - "logits/chosen": -0.9991554021835327, - "logits/rejected": -0.944214940071106, - "logps/chosen": -0.7551851272583008, - "logps/rejected": -1.2736141681671143, - "loss": 0.7806, - "odds_ratio_loss": 0.4188924729824066, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03775925561785698, - "rewards/margins": 0.025921454653143883, - "rewards/rejected": -0.06368071585893631, - "sft_loss": 0.7551851272583008, - "step": 755 - }, - { - "epoch": 2.432, - "grad_norm": 3.6380522040854566, - "learning_rate": 5.199337362431792e-07, - "logits/chosen": -0.8715981245040894, - "logits/rejected": -1.0095371007919312, - "logps/chosen": -0.716741681098938, - "logps/rejected": -1.7061433792114258, - "loss": 0.7998, - "odds_ratio_loss": 0.40209442377090454, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.0358370803296566, - "rewards/margins": 0.04947008565068245, - "rewards/rejected": -0.08530716598033905, - "sft_loss": 0.716741681098938, - "step": 760 - }, - { - "epoch": 2.448, - "grad_norm": 4.799248860716653, - "learning_rate": 4.918075428981148e-07, - "logits/chosen": -0.9698484539985657, - "logits/rejected": -1.0252515077590942, - "logps/chosen": -0.7276674509048462, - "logps/rejected": -1.2364145517349243, - "loss": 0.7519, - "odds_ratio_loss": 0.4512792229652405, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.03638336807489395, - "rewards/margins": 0.025437360629439354, - "rewards/rejected": -0.06182073429226875, - "sft_loss": 0.7276674509048462, - "step": 765 - }, - { - "epoch": 2.464, - "grad_norm": 3.975888652914653, - "learning_rate": 4.643802387170118e-07, - "logits/chosen": -0.784801185131073, - "logits/rejected": -0.8751401901245117, - "logps/chosen": -0.736864447593689, - "logps/rejected": -1.4113752841949463, - "loss": 0.7689, - "odds_ratio_loss": 0.41703367233276367, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.03684321790933609, - "rewards/margins": 0.0337255522608757, - "rewards/rejected": -0.07056877762079239, - "sft_loss": 0.736864447593689, - "step": 770 - }, - { - "epoch": 2.48, - "grad_norm": 3.7017186080610873, - "learning_rate": 4.376613689232978e-07, - "logits/chosen": -0.9608728289604187, - "logits/rejected": -1.000018835067749, - "logps/chosen": -0.8199416399002075, - "logps/rejected": -1.469847321510315, - "loss": 0.7588, - "odds_ratio_loss": 0.43195098638534546, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04099708050489426, - "rewards/margins": 0.03249528631567955, - "rewards/rejected": -0.0734923705458641, - "sft_loss": 0.8199416399002075, - "step": 775 - }, - { - "epoch": 2.496, - "grad_norm": 3.1981116471330764, - "learning_rate": 4.1166023219176176e-07, - "logits/chosen": -0.8377124667167664, - "logits/rejected": -1.0116181373596191, - "logps/chosen": -0.7824350595474243, - "logps/rejected": -1.6547685861587524, - "loss": 0.7523, - "odds_ratio_loss": 0.32834020256996155, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.039121754467487335, - "rewards/margins": 0.043616678565740585, - "rewards/rejected": -0.08273844420909882, - "sft_loss": 0.7824350595474243, - "step": 780 - }, - { - "epoch": 2.512, - "grad_norm": 3.7082914287151945, - "learning_rate": 3.863858774124385e-07, - "logits/chosen": -1.0026713609695435, - "logits/rejected": -1.0367608070373535, - "logps/chosen": -0.6959080696105957, - "logps/rejected": -1.4358288049697876, - "loss": 0.7563, - "odds_ratio_loss": 0.3291171193122864, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -0.03479539602994919, - "rewards/margins": 0.036996036767959595, - "rewards/rejected": -0.07179144024848938, - "sft_loss": 0.6959080696105957, - "step": 785 - }, - { - "epoch": 2.528, - "grad_norm": 2.9819183882511004, - "learning_rate": 3.618471005414215e-07, - "logits/chosen": -1.1593316793441772, - "logits/rejected": -1.1186132431030273, - "logps/chosen": -0.5521804094314575, - "logps/rejected": -1.3308180570602417, - "loss": 0.7382, - "odds_ratio_loss": 0.40159231424331665, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.027609020471572876, - "rewards/margins": 0.03893188014626503, - "rewards/rejected": -0.0665409117937088, - "sft_loss": 0.5521804094314575, - "step": 790 - }, - { - "epoch": 2.544, - "grad_norm": 3.3567110818810755, - "learning_rate": 3.380524415397049e-07, - "logits/chosen": -0.8402732014656067, - "logits/rejected": -1.0374484062194824, - "logps/chosen": -0.803017795085907, - "logps/rejected": -1.2986352443695068, - "loss": 0.7742, - "odds_ratio_loss": 0.46145597100257874, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04015089198946953, - "rewards/margins": 0.0247808750718832, - "rewards/rejected": -0.06493176519870758, - "sft_loss": 0.803017795085907, - "step": 795 - }, - { - "epoch": 2.56, - "grad_norm": 3.3420577148113924, - "learning_rate": 3.150101814011136e-07, - "logits/chosen": -1.031750202178955, - "logits/rejected": -1.02663254737854, - "logps/chosen": -0.7611913681030273, - "logps/rejected": -1.1853117942810059, - "loss": 0.7785, - "odds_ratio_loss": 0.5039734244346619, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.03805956989526749, - "rewards/margins": 0.021206015720963478, - "rewards/rejected": -0.05926559120416641, - "sft_loss": 0.7611913681030273, - "step": 800 - }, - { - "epoch": 2.576, - "grad_norm": 3.246898163699306, - "learning_rate": 2.927283392703606e-07, - "logits/chosen": -0.8511770367622375, - "logits/rejected": -0.9470258951187134, - "logps/chosen": -0.7670743465423584, - "logps/rejected": -1.1585172414779663, - "loss": 0.7977, - "odds_ratio_loss": 0.4781486988067627, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.03835371881723404, - "rewards/margins": 0.019572149962186813, - "rewards/rejected": -0.057925861328840256, - "sft_loss": 0.7670743465423584, - "step": 805 - }, - { - "epoch": 2.592, - "grad_norm": 3.9992460548702438, - "learning_rate": 2.712146696522305e-07, - "logits/chosen": -1.022859811782837, - "logits/rejected": -1.129640817642212, - "logps/chosen": -0.716964840888977, - "logps/rejected": -1.246201992034912, - "loss": 0.7552, - "odds_ratio_loss": 0.4771757125854492, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.03584824129939079, - "rewards/margins": 0.026461860164999962, - "rewards/rejected": -0.062310099601745605, - "sft_loss": 0.716964840888977, - "step": 810 - }, - { - "epoch": 2.608, - "grad_norm": 3.5945402118846403, - "learning_rate": 2.504766597128666e-07, - "logits/chosen": -0.9580585360527039, - "logits/rejected": -0.9554399251937866, - "logps/chosen": -0.8108547925949097, - "logps/rejected": -1.315863847732544, - "loss": 0.7575, - "odds_ratio_loss": 0.44800853729248047, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.040542736649513245, - "rewards/margins": 0.025250453501939774, - "rewards/rejected": -0.06579320132732391, - "sft_loss": 0.8108547925949097, - "step": 815 - }, - { - "epoch": 2.624, - "grad_norm": 3.288445101598415, - "learning_rate": 2.3052152667409289e-07, - "logits/chosen": -0.9488247036933899, - "logits/rejected": -0.8991857767105103, - "logps/chosen": -0.7179723978042603, - "logps/rejected": -1.3861563205718994, - "loss": 0.7752, - "odds_ratio_loss": 0.3779214322566986, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.03589861840009689, - "rewards/margins": 0.03340919688344002, - "rewards/rejected": -0.06930781900882721, - "sft_loss": 0.7179723978042603, - "step": 820 - }, - { - "epoch": 2.64, - "grad_norm": 4.333903778456222, - "learning_rate": 2.1135621530168488e-07, - "logits/chosen": -1.028704285621643, - "logits/rejected": -0.9637089967727661, - "logps/chosen": -0.6309131383895874, - "logps/rejected": -1.2894703149795532, - "loss": 0.7935, - "odds_ratio_loss": 0.3709859549999237, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.03154565393924713, - "rewards/margins": 0.03292786702513695, - "rewards/rejected": -0.06447352468967438, - "sft_loss": 0.6309131383895874, - "step": 825 - }, - { - "epoch": 2.656, - "grad_norm": 3.1671393797255414, - "learning_rate": 1.9298739548845813e-07, - "logits/chosen": -0.9645661115646362, - "logits/rejected": -0.9816401600837708, - "logps/chosen": -0.8160637021064758, - "logps/rejected": -1.374236822128296, - "loss": 0.8091, - "odds_ratio_loss": 0.4034562110900879, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.04080318659543991, - "rewards/margins": 0.027908671647310257, - "rewards/rejected": -0.06871185451745987, - "sft_loss": 0.8160637021064758, - "step": 830 - }, - { - "epoch": 2.672, - "grad_norm": 3.093675577889326, - "learning_rate": 1.7542145993301896e-07, - "logits/chosen": -0.9770193099975586, - "logits/rejected": -1.0430421829223633, - "logps/chosen": -0.6873968839645386, - "logps/rejected": -1.183712124824524, - "loss": 0.7636, - "odds_ratio_loss": 0.42723220586776733, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.03436984866857529, - "rewards/margins": 0.02481575682759285, - "rewards/rejected": -0.05918560549616814, - "sft_loss": 0.6873968839645386, - "step": 835 - }, - { - "epoch": 2.6879999999999997, - "grad_norm": 3.044985818298562, - "learning_rate": 1.5866452191498488e-07, - "logits/chosen": -0.8661419153213501, - "logits/rejected": -1.0225311517715454, - "logps/chosen": -0.7636631727218628, - "logps/rejected": -1.324355959892273, - "loss": 0.7742, - "odds_ratio_loss": 0.3950451910495758, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03818316012620926, - "rewards/margins": 0.028034633025527, - "rewards/rejected": -0.06621779501438141, - "sft_loss": 0.7636631727218628, - "step": 840 - }, - { - "epoch": 2.7039999999999997, - "grad_norm": 3.49139680447691, - "learning_rate": 1.4272241316744456e-07, - "logits/chosen": -0.9768983125686646, - "logits/rejected": -1.0676887035369873, - "logps/chosen": -0.8691689372062683, - "logps/rejected": -1.5023279190063477, - "loss": 0.7386, - "odds_ratio_loss": 0.36283236742019653, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -0.043458450585603714, - "rewards/margins": 0.03165794909000397, - "rewards/rejected": -0.07511639595031738, - "sft_loss": 0.8691689372062683, - "step": 845 - }, - { - "epoch": 2.7199999999999998, - "grad_norm": 3.4065842852697674, - "learning_rate": 1.2760068184740597e-07, - "logits/chosen": -0.9277766942977905, - "logits/rejected": -0.9539043307304382, - "logps/chosen": -0.6972528100013733, - "logps/rejected": -1.4623371362686157, - "loss": 0.711, - "odds_ratio_loss": 0.32903754711151123, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.03486264497041702, - "rewards/margins": 0.03825421258807182, - "rewards/rejected": -0.07311685383319855, - "sft_loss": 0.6972528100013733, - "step": 850 - }, - { - "epoch": 2.7359999999999998, - "grad_norm": 4.258492525068464, - "learning_rate": 1.133045906049321e-07, - "logits/chosen": -0.9535223245620728, - "logits/rejected": -0.967192530632019, - "logps/chosen": -0.7654585242271423, - "logps/rejected": -1.2852718830108643, - "loss": 0.7165, - "odds_ratio_loss": 0.4885888695716858, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.038272932171821594, - "rewards/margins": 0.025990664958953857, - "rewards/rejected": -0.06426359713077545, - "sft_loss": 0.7654585242271423, - "step": 855 - }, - { - "epoch": 2.752, - "grad_norm": 3.0541533378277403, - "learning_rate": 9.983911475163727e-08, - "logits/chosen": -0.9215459823608398, - "logits/rejected": -1.025499939918518, - "logps/chosen": -0.6536693572998047, - "logps/rejected": -1.464961051940918, - "loss": 0.7491, - "odds_ratio_loss": 0.32264453172683716, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -0.032683465629816055, - "rewards/margins": 0.04056458920240402, - "rewards/rejected": -0.07324805110692978, - "sft_loss": 0.6536693572998047, - "step": 860 - }, - { - "epoch": 2.768, - "grad_norm": 3.5262579401831413, - "learning_rate": 8.720894052918566e-08, - "logits/chosen": -0.9398951530456543, - "logits/rejected": -1.0038102865219116, - "logps/chosen": -0.6747088432312012, - "logps/rejected": -1.5684466361999512, - "loss": 0.7404, - "odds_ratio_loss": 0.35855698585510254, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03373543918132782, - "rewards/margins": 0.044686902314424515, - "rewards/rejected": -0.07842233031988144, - "sft_loss": 0.6747088432312012, - "step": 865 - }, - { - "epoch": 2.784, - "grad_norm": 4.66998911832725, - "learning_rate": 7.541846347838915e-08, - "logits/chosen": -0.9948604702949524, - "logits/rejected": -0.9804432988166809, - "logps/chosen": -0.7318006753921509, - "logps/rejected": -1.360406517982483, - "loss": 0.7496, - "odds_ratio_loss": 0.4134978652000427, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.036590032279491425, - "rewards/margins": 0.03143029659986496, - "rewards/rejected": -0.06802032887935638, - "sft_loss": 0.7318006753921509, - "step": 870 - }, - { - "epoch": 2.8, - "grad_norm": 3.657269423224737, - "learning_rate": 6.447178690947492e-08, - "logits/chosen": -0.8408433794975281, - "logits/rejected": -1.0504786968231201, - "logps/chosen": -0.6820616126060486, - "logps/rejected": -1.6608028411865234, - "loss": 0.7865, - "odds_ratio_loss": 0.2992148697376251, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": -0.03410308063030243, - "rewards/margins": 0.04893706366419792, - "rewards/rejected": -0.08304014801979065, - "sft_loss": 0.6820616126060486, - "step": 875 - }, - { - "epoch": 2.816, - "grad_norm": 3.377129562618715, - "learning_rate": 5.437272047405712e-08, - "logits/chosen": -0.8684722781181335, - "logits/rejected": -0.871597945690155, - "logps/chosen": -0.7219066619873047, - "logps/rejected": -1.350375771522522, - "loss": 0.7529, - "odds_ratio_loss": 0.3972117006778717, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": -0.036095328629016876, - "rewards/margins": 0.031423456966876984, - "rewards/rejected": -0.06751878559589386, - "sft_loss": 0.7219066619873047, - "step": 880 - }, - { - "epoch": 2.832, - "grad_norm": 3.1663560374696633, - "learning_rate": 4.512477883930527e-08, - "logits/chosen": -0.9551021456718445, - "logits/rejected": -1.0277870893478394, - "logps/chosen": -0.7899004817008972, - "logps/rejected": -1.5054200887680054, - "loss": 0.8151, - "odds_ratio_loss": 0.44721174240112305, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.03949502483010292, - "rewards/margins": 0.03577598184347153, - "rewards/rejected": -0.07527101784944534, - "sft_loss": 0.7899004817008972, - "step": 885 - }, - { - "epoch": 2.848, - "grad_norm": 4.553277973933315, - "learning_rate": 3.673118046477159e-08, - "logits/chosen": -0.9583977460861206, - "logits/rejected": -0.9128350019454956, - "logps/chosen": -0.7262309789657593, - "logps/rejected": -1.2458374500274658, - "loss": 0.7505, - "odds_ratio_loss": 0.4251146912574768, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.0363115519285202, - "rewards/margins": 0.025980323553085327, - "rewards/rejected": -0.06229187175631523, - "sft_loss": 0.7262309789657593, - "step": 890 - }, - { - "epoch": 2.864, - "grad_norm": 3.38523058441021, - "learning_rate": 2.9194846482308026e-08, - "logits/chosen": -0.9198935627937317, - "logits/rejected": -1.0665795803070068, - "logps/chosen": -0.7491655349731445, - "logps/rejected": -1.439427137374878, - "loss": 0.7622, - "odds_ratio_loss": 0.3993758261203766, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.037458278238773346, - "rewards/margins": 0.03451308235526085, - "rewards/rejected": -0.0719713643193245, - "sft_loss": 0.7491655349731445, - "step": 895 - }, - { - "epoch": 2.88, - "grad_norm": 5.053660983515992, - "learning_rate": 2.251839967945535e-08, - "logits/chosen": -0.9375904202461243, - "logits/rejected": -1.060706377029419, - "logps/chosen": -0.8767011761665344, - "logps/rejected": -1.344328761100769, - "loss": 0.766, - "odds_ratio_loss": 0.5356343984603882, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.04383506253361702, - "rewards/margins": 0.02338137850165367, - "rewards/rejected": -0.06721644103527069, - "sft_loss": 0.8767011761665344, - "step": 900 - }, - { - "epoch": 2.896, - "grad_norm": 4.052840811239487, - "learning_rate": 1.6704163586663825e-08, - "logits/chosen": -0.9922626614570618, - "logits/rejected": -0.952767014503479, - "logps/chosen": -0.7710371017456055, - "logps/rejected": -1.4237831830978394, - "loss": 0.8119, - "odds_ratio_loss": 0.37999871373176575, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.03855185955762863, - "rewards/margins": 0.032637301832437515, - "rewards/rejected": -0.07118916511535645, - "sft_loss": 0.7710371017456055, - "step": 905 - }, - { - "epoch": 2.912, - "grad_norm": 3.5836749421401146, - "learning_rate": 1.1754161668660612e-08, - "logits/chosen": -0.8529298901557922, - "logits/rejected": -0.905608057975769, - "logps/chosen": -0.7730112671852112, - "logps/rejected": -1.4924733638763428, - "loss": 0.7818, - "odds_ratio_loss": 0.40126991271972656, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.03865056484937668, - "rewards/margins": 0.035973113030195236, - "rewards/rejected": -0.07462368160486221, - "sft_loss": 0.7730112671852112, - "step": 910 - }, - { - "epoch": 2.928, - "grad_norm": 4.257912492562235, - "learning_rate": 7.670116620245304e-09, - "logits/chosen": -0.9177722930908203, - "logits/rejected": -1.0528053045272827, - "logps/chosen": -0.7789991497993469, - "logps/rejected": -1.4093081951141357, - "loss": 0.7862, - "odds_ratio_loss": 0.41386303305625916, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.03894995525479317, - "rewards/margins": 0.0315154567360878, - "rewards/rejected": -0.07046540826559067, - "sft_loss": 0.7789991497993469, - "step": 915 - }, - { - "epoch": 2.944, - "grad_norm": 4.5782153540538575, - "learning_rate": 4.453449766758933e-09, - "logits/chosen": -1.0013253688812256, - "logits/rejected": -0.9696464538574219, - "logps/chosen": -0.7406843900680542, - "logps/rejected": -1.4851348400115967, - "loss": 0.7435, - "odds_ratio_loss": 0.3701634407043457, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.037034209817647934, - "rewards/margins": 0.03722252696752548, - "rewards/rejected": -0.07425674051046371, - "sft_loss": 0.7406843900680542, - "step": 920 - }, - { - "epoch": 2.96, - "grad_norm": 3.8405956220288906, - "learning_rate": 2.1052805694365964e-09, - "logits/chosen": -0.9112046360969543, - "logits/rejected": -0.953746497631073, - "logps/chosen": -0.9097617864608765, - "logps/rejected": -1.2347737550735474, - "loss": 0.787, - "odds_ratio_loss": 0.5556334853172302, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.04548809304833412, - "rewards/margins": 0.016250593587756157, - "rewards/rejected": -0.061738692224025726, - "sft_loss": 0.9097617864608765, - "step": 925 - }, - { - "epoch": 2.976, - "grad_norm": 3.3132646219123676, - "learning_rate": 6.264262358129936e-10, - "logits/chosen": -0.8271144032478333, - "logits/rejected": -1.076704502105713, - "logps/chosen": -0.8304189443588257, - "logps/rejected": -1.3862361907958984, - "loss": 0.7539, - "odds_ratio_loss": 0.4186479449272156, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.04152094945311546, - "rewards/margins": 0.027790868654847145, - "rewards/rejected": -0.06931181252002716, - "sft_loss": 0.8304189443588257, - "step": 930 - }, - { - "epoch": 2.992, - "grad_norm": 3.846388656199648, - "learning_rate": 1.7401435318531444e-11, - "logits/chosen": -1.0561763048171997, - "logits/rejected": -1.0017073154449463, - "logps/chosen": -0.6843768358230591, - "logps/rejected": -1.4440100193023682, - "loss": 0.7977, - "odds_ratio_loss": 0.4135671555995941, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": -0.034218840301036835, - "rewards/margins": 0.03798165172338486, - "rewards/rejected": -0.07220049947500229, - "sft_loss": 0.6843768358230591, - "step": 935 - }, - { - "epoch": 2.9952, - "step": 936, - "total_flos": 172904605286400.0, - "train_loss": 0.9994224820636276, - "train_runtime": 29817.6595, - "train_samples_per_second": 2.012, - "train_steps_per_second": 0.031 - } - ], - "logging_steps": 5, - "max_steps": 936, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 100.0, - "total_flos": 172904605286400.0, - "train_batch_size": 2, - "trial_name": null, - "trial_params": null -}