{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 84, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -3.2206931114196777, "debug/policy_chosen_logps": -200.86012268066406, "debug/policy_rejected_logits": -2.898437738418579, "debug/policy_rejected_logps": -208.0646514892578, "debug/reference_chosen_logps": -200.86012268066406, "debug/reference_rejected_logps": -208.0646514892578, "epoch": 0.011904761904761904, "grad_norm": 7.909108829449386, "learning_rate": 1e-06, "logits/chosen": -3.2206931114196777, "logits/rejected": -2.898437738418579, "logps/chosen": -200.86012268066406, "logps/rejected": -208.0646514892578, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -3.1326141357421875, "debug/policy_chosen_logps": -219.65463256835938, "debug/policy_rejected_logits": -3.1044466495513916, "debug/policy_rejected_logps": -218.29165649414062, "debug/reference_chosen_logps": -219.87649536132812, "debug/reference_rejected_logps": -218.21566772460938, "epoch": 0.023809523809523808, "grad_norm": 7.063786345467847, "learning_rate": 1e-06, "logits/chosen": -3.1326141357421875, "logits/rejected": -3.1044466495513916, "logps/chosen": -219.65463256835938, "logps/rejected": -218.29165649414062, "loss": 0.4983, "rewards/accuracies": 0.625, "rewards/chosen": 0.0022186278365552425, "rewards/margins": 0.0029784394428133965, "rewards/rejected": -0.0007598113734275103, "step": 2 }, { "debug/policy_chosen_logits": -3.135671377182007, "debug/policy_chosen_logps": -208.30648803710938, "debug/policy_rejected_logits": -3.0028326511383057, "debug/policy_rejected_logps": -239.4796142578125, "debug/reference_chosen_logps": -208.16616821289062, "debug/reference_rejected_logps": -239.21315002441406, "epoch": 0.03571428571428571, "grad_norm": 7.814141071934522, "learning_rate": 1e-06, "logits/chosen": -3.135671377182007, "logits/rejected": -3.0028326511383057, "logps/chosen": -208.30648803710938, "logps/rejected": -239.4796142578125, "loss": 0.4933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014033315237611532, "rewards/margins": 0.0012612915597856045, "rewards/rejected": -0.0026646230835467577, "step": 3 }, { "debug/policy_chosen_logits": -3.187422275543213, "debug/policy_chosen_logps": -217.25140380859375, "debug/policy_rejected_logits": -3.1444270610809326, "debug/policy_rejected_logps": -219.7812957763672, "debug/reference_chosen_logps": -217.35531616210938, "debug/reference_rejected_logps": -218.98512268066406, "epoch": 0.047619047619047616, "grad_norm": 7.09702591230904, "learning_rate": 1e-06, "logits/chosen": -3.187422275543213, "logits/rejected": -3.1444270610809326, "logps/chosen": -217.25140380859375, "logps/rejected": -219.7812957763672, "loss": 0.4896, "rewards/accuracies": 0.625, "rewards/chosen": 0.001039218856021762, "rewards/margins": 0.009000949561595917, "rewards/rejected": -0.007961731404066086, "step": 4 }, { "debug/policy_chosen_logits": -3.0982069969177246, "debug/policy_chosen_logps": -216.79022216796875, "debug/policy_rejected_logits": -2.9893712997436523, "debug/policy_rejected_logps": -236.919189453125, "debug/reference_chosen_logps": -217.1226806640625, "debug/reference_rejected_logps": -235.52293395996094, "epoch": 0.05952380952380952, "grad_norm": 5.630512315088139, "learning_rate": 1e-06, "logits/chosen": -3.0982069969177246, "logits/rejected": -2.9893712997436523, "logps/chosen": -216.79022216796875, "logps/rejected": -236.919189453125, "loss": 0.4832, "rewards/accuracies": 0.875, "rewards/chosen": 0.0033246802631765604, "rewards/margins": 0.01728721708059311, "rewards/rejected": -0.013962535187602043, "step": 5 }, { "debug/policy_chosen_logits": -3.1938157081604004, "debug/policy_chosen_logps": -207.0302734375, "debug/policy_rejected_logits": -3.085257053375244, "debug/policy_rejected_logps": -217.99813842773438, "debug/reference_chosen_logps": -207.5093536376953, "debug/reference_rejected_logps": -216.325927734375, "epoch": 0.07142857142857142, "grad_norm": 5.70424011476702, "learning_rate": 1e-06, "logits/chosen": -3.1938157081604004, "logits/rejected": -3.085257053375244, "logps/chosen": -207.0302734375, "logps/rejected": -217.99813842773438, "loss": 0.4773, "rewards/accuracies": 0.625, "rewards/chosen": 0.004790782928466797, "rewards/margins": 0.02151281200349331, "rewards/rejected": -0.01672203093767166, "step": 6 }, { "debug/policy_chosen_logits": -3.163257122039795, "debug/policy_chosen_logps": -211.92942810058594, "debug/policy_rejected_logits": -3.0449893474578857, "debug/policy_rejected_logps": -214.0583953857422, "debug/reference_chosen_logps": -212.87844848632812, "debug/reference_rejected_logps": -212.81723022460938, "epoch": 0.08333333333333333, "grad_norm": 6.547599704469646, "learning_rate": 1e-06, "logits/chosen": -3.163257122039795, "logits/rejected": -3.0449893474578857, "logps/chosen": -211.92942810058594, "logps/rejected": -214.0583953857422, "loss": 0.4684, "rewards/accuracies": 0.75, "rewards/chosen": 0.00949014537036419, "rewards/margins": 0.021901872009038925, "rewards/rejected": -0.012411728501319885, "step": 7 }, { "debug/policy_chosen_logits": -3.2458667755126953, "debug/policy_chosen_logps": -197.6811065673828, "debug/policy_rejected_logits": -3.0591673851013184, "debug/policy_rejected_logps": -231.7470245361328, "debug/reference_chosen_logps": -199.1888427734375, "debug/reference_rejected_logps": -228.80226135253906, "epoch": 0.09523809523809523, "grad_norm": 7.248047931654466, "learning_rate": 1e-06, "logits/chosen": -3.2458667755126953, "logits/rejected": -3.0591673851013184, "logps/chosen": -197.6811065673828, "logps/rejected": -231.7470245361328, "loss": 0.4588, "rewards/accuracies": 0.75, "rewards/chosen": 0.015077304095029831, "rewards/margins": 0.04452499374747276, "rewards/rejected": -0.029447689652442932, "step": 8 }, { "debug/policy_chosen_logits": -3.212696075439453, "debug/policy_chosen_logps": -196.501708984375, "debug/policy_rejected_logits": -3.0544850826263428, "debug/policy_rejected_logps": -217.50942993164062, "debug/reference_chosen_logps": -198.6233673095703, "debug/reference_rejected_logps": -214.6073760986328, "epoch": 0.10714285714285714, "grad_norm": 6.028275040076952, "learning_rate": 1e-06, "logits/chosen": -3.212696075439453, "logits/rejected": -3.0544850826263428, "logps/chosen": -196.501708984375, "logps/rejected": -217.50942993164062, "loss": 0.4341, "rewards/accuracies": 0.75, "rewards/chosen": 0.021216563880443573, "rewards/margins": 0.050237104296684265, "rewards/rejected": -0.029020538553595543, "step": 9 }, { "debug/policy_chosen_logits": -3.1316659450531006, "debug/policy_chosen_logps": -216.9590301513672, "debug/policy_rejected_logits": -3.0151987075805664, "debug/policy_rejected_logps": -225.83355712890625, "debug/reference_chosen_logps": -221.61790466308594, "debug/reference_rejected_logps": -219.59786987304688, "epoch": 0.11904761904761904, "grad_norm": 5.4813634886972515, "learning_rate": 1e-06, "logits/chosen": -3.1316659450531006, "logits/rejected": -3.0151987075805664, "logps/chosen": -216.9590301513672, "logps/rejected": -225.83355712890625, "loss": 0.4064, "rewards/accuracies": 0.625, "rewards/chosen": 0.046588800847530365, "rewards/margins": 0.10894560813903809, "rewards/rejected": -0.06235681474208832, "step": 10 }, { "debug/policy_chosen_logits": -3.2891530990600586, "debug/policy_chosen_logps": -196.72274780273438, "debug/policy_rejected_logits": -3.048377513885498, "debug/policy_rejected_logps": -233.62039184570312, "debug/reference_chosen_logps": -202.43775939941406, "debug/reference_rejected_logps": -216.876708984375, "epoch": 0.13095238095238096, "grad_norm": 4.876535878960536, "learning_rate": 1e-06, "logits/chosen": -3.2891530990600586, "logits/rejected": -3.048377513885498, "logps/chosen": -196.72274780273438, "logps/rejected": -233.62039184570312, "loss": 0.4083, "rewards/accuracies": 0.875, "rewards/chosen": 0.05715004354715347, "rewards/margins": 0.22458696365356445, "rewards/rejected": -0.16743692755699158, "step": 11 }, { "debug/policy_chosen_logits": -3.1717939376831055, "debug/policy_chosen_logps": -210.72622680664062, "debug/policy_rejected_logits": -3.060811758041382, "debug/policy_rejected_logps": -230.44659423828125, "debug/reference_chosen_logps": -219.1645050048828, "debug/reference_rejected_logps": -223.36727905273438, "epoch": 0.14285714285714285, "grad_norm": 5.9567820612609585, "learning_rate": 1e-06, "logits/chosen": -3.1717939376831055, "logits/rejected": -3.060811758041382, "logps/chosen": -210.72622680664062, "logps/rejected": -230.44659423828125, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 0.08438283205032349, "rewards/margins": 0.155176043510437, "rewards/rejected": -0.07079321146011353, "step": 12 }, { "debug/policy_chosen_logits": -3.175966739654541, "debug/policy_chosen_logps": -198.75677490234375, "debug/policy_rejected_logits": -3.0943973064422607, "debug/policy_rejected_logps": -209.6800994873047, "debug/reference_chosen_logps": -206.38829040527344, "debug/reference_rejected_logps": -198.82118225097656, "epoch": 0.15476190476190477, "grad_norm": 4.372108578101748, "learning_rate": 1e-06, "logits/chosen": -3.175966739654541, "logits/rejected": -3.0943973064422607, "logps/chosen": -198.75677490234375, "logps/rejected": -209.6800994873047, "loss": 0.378, "rewards/accuracies": 0.75, "rewards/chosen": 0.07631513476371765, "rewards/margins": 0.1849043220281601, "rewards/rejected": -0.10858918726444244, "step": 13 }, { "debug/policy_chosen_logits": -3.17907977104187, "debug/policy_chosen_logps": -204.30484008789062, "debug/policy_rejected_logits": -3.0033490657806396, "debug/policy_rejected_logps": -224.59976196289062, "debug/reference_chosen_logps": -215.46783447265625, "debug/reference_rejected_logps": -213.1477813720703, "epoch": 0.16666666666666666, "grad_norm": 4.951824369660157, "learning_rate": 1e-06, "logits/chosen": -3.17907977104187, "logits/rejected": -3.0033490657806396, "logps/chosen": -204.30484008789062, "logps/rejected": -224.59976196289062, "loss": 0.354, "rewards/accuracies": 1.0, "rewards/chosen": 0.1116299033164978, "rewards/margins": 0.22614973783493042, "rewards/rejected": -0.11451983451843262, "step": 14 }, { "debug/policy_chosen_logits": -3.0961408615112305, "debug/policy_chosen_logps": -209.14535522460938, "debug/policy_rejected_logits": -3.00311279296875, "debug/policy_rejected_logps": -224.08718872070312, "debug/reference_chosen_logps": -220.22549438476562, "debug/reference_rejected_logps": -222.09124755859375, "epoch": 0.17857142857142858, "grad_norm": 4.835613329592286, "learning_rate": 1e-06, "logits/chosen": -3.0961408615112305, "logits/rejected": -3.00311279296875, "logps/chosen": -209.14535522460938, "logps/rejected": -224.08718872070312, "loss": 0.386, "rewards/accuracies": 0.75, "rewards/chosen": 0.11080135405063629, "rewards/margins": 0.13076072931289673, "rewards/rejected": -0.019959375262260437, "step": 15 }, { "debug/policy_chosen_logits": -3.2088327407836914, "debug/policy_chosen_logps": -206.8312530517578, "debug/policy_rejected_logits": -3.0719218254089355, "debug/policy_rejected_logps": -226.76593017578125, "debug/reference_chosen_logps": -218.99668884277344, "debug/reference_rejected_logps": -211.85366821289062, "epoch": 0.19047619047619047, "grad_norm": 4.544057800793572, "learning_rate": 1e-06, "logits/chosen": -3.2088327407836914, "logits/rejected": -3.0719218254089355, "logps/chosen": -206.8312530517578, "logps/rejected": -226.76593017578125, "loss": 0.3723, "rewards/accuracies": 0.875, "rewards/chosen": 0.12165433168411255, "rewards/margins": 0.27077698707580566, "rewards/rejected": -0.1491226851940155, "step": 16 }, { "debug/policy_chosen_logits": -3.1967031955718994, "debug/policy_chosen_logps": -206.11988830566406, "debug/policy_rejected_logits": -3.192483425140381, "debug/policy_rejected_logps": -230.90736389160156, "debug/reference_chosen_logps": -221.86349487304688, "debug/reference_rejected_logps": -223.01187133789062, "epoch": 0.20238095238095238, "grad_norm": 4.376140545216712, "learning_rate": 1e-06, "logits/chosen": -3.1967031955718994, "logits/rejected": -3.192483425140381, "logps/chosen": -206.11988830566406, "logps/rejected": -230.90736389160156, "loss": 0.3998, "rewards/accuracies": 0.75, "rewards/chosen": 0.15743595361709595, "rewards/margins": 0.23639078438282013, "rewards/rejected": -0.07895481586456299, "step": 17 }, { "debug/policy_chosen_logits": -3.1152353286743164, "debug/policy_chosen_logps": -193.357177734375, "debug/policy_rejected_logits": -2.959299087524414, "debug/policy_rejected_logps": -263.7359313964844, "debug/reference_chosen_logps": -210.5402069091797, "debug/reference_rejected_logps": -232.1613311767578, "epoch": 0.21428571428571427, "grad_norm": 7.866472140538224, "learning_rate": 1e-06, "logits/chosen": -3.1152353286743164, "logits/rejected": -2.959299087524414, "logps/chosen": -193.357177734375, "logps/rejected": -263.7359313964844, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 0.17183025181293488, "rewards/margins": 0.4875761568546295, "rewards/rejected": -0.31574589014053345, "step": 18 }, { "debug/policy_chosen_logits": -3.2761292457580566, "debug/policy_chosen_logps": -185.4181365966797, "debug/policy_rejected_logits": -3.269240379333496, "debug/policy_rejected_logps": -225.70327758789062, "debug/reference_chosen_logps": -202.86187744140625, "debug/reference_rejected_logps": -228.69943237304688, "epoch": 0.2261904761904762, "grad_norm": 7.929981237103111, "learning_rate": 1e-06, "logits/chosen": -3.2761292457580566, "logits/rejected": -3.269240379333496, "logps/chosen": -185.4181365966797, "logps/rejected": -225.70327758789062, "loss": 0.3953, "rewards/accuracies": 0.625, "rewards/chosen": 0.17443738877773285, "rewards/margins": 0.14447586238384247, "rewards/rejected": 0.02996152639389038, "step": 19 }, { "debug/policy_chosen_logits": -3.2340097427368164, "debug/policy_chosen_logps": -193.43096923828125, "debug/policy_rejected_logits": -3.099465847015381, "debug/policy_rejected_logps": -230.0125732421875, "debug/reference_chosen_logps": -216.61485290527344, "debug/reference_rejected_logps": -211.0814208984375, "epoch": 0.23809523809523808, "grad_norm": 8.640304701773179, "learning_rate": 1e-06, "logits/chosen": -3.2340097427368164, "logits/rejected": -3.099465847015381, "logps/chosen": -193.43096923828125, "logps/rejected": -230.0125732421875, "loss": 0.3575, "rewards/accuracies": 1.0, "rewards/chosen": 0.23183873295783997, "rewards/margins": 0.42115041613578796, "rewards/rejected": -0.189311683177948, "step": 20 }, { "debug/policy_chosen_logits": -3.162797689437866, "debug/policy_chosen_logps": -191.775146484375, "debug/policy_rejected_logits": -3.2479846477508545, "debug/policy_rejected_logps": -217.1043243408203, "debug/reference_chosen_logps": -215.61062622070312, "debug/reference_rejected_logps": -209.42953491210938, "epoch": 0.25, "grad_norm": 7.114249349006281, "learning_rate": 1e-06, "logits/chosen": -3.162797689437866, "logits/rejected": -3.2479846477508545, "logps/chosen": -191.775146484375, "logps/rejected": -217.1043243408203, "loss": 0.4064, "rewards/accuracies": 0.875, "rewards/chosen": 0.23835478723049164, "rewards/margins": 0.3151026964187622, "rewards/rejected": -0.07674790918827057, "step": 21 }, { "debug/policy_chosen_logits": -3.3014657497406006, "debug/policy_chosen_logps": -185.117919921875, "debug/policy_rejected_logits": -3.2064030170440674, "debug/policy_rejected_logps": -231.311279296875, "debug/reference_chosen_logps": -208.03945922851562, "debug/reference_rejected_logps": -204.46234130859375, "epoch": 0.2619047619047619, "grad_norm": 6.717051925734186, "learning_rate": 1e-06, "logits/chosen": -3.3014657497406006, "logits/rejected": -3.2064030170440674, "logps/chosen": -185.117919921875, "logps/rejected": -231.311279296875, "loss": 0.4124, "rewards/accuracies": 0.875, "rewards/chosen": 0.22921523451805115, "rewards/margins": 0.4977045953273773, "rewards/rejected": -0.26848936080932617, "step": 22 }, { "debug/policy_chosen_logits": -3.2503294944763184, "debug/policy_chosen_logps": -193.4159698486328, "debug/policy_rejected_logits": -3.1396243572235107, "debug/policy_rejected_logps": -254.65135192871094, "debug/reference_chosen_logps": -217.694580078125, "debug/reference_rejected_logps": -212.81295776367188, "epoch": 0.27380952380952384, "grad_norm": 7.339797017549244, "learning_rate": 1e-06, "logits/chosen": -3.2503294944763184, "logits/rejected": -3.1396243572235107, "logps/chosen": -193.4159698486328, "logps/rejected": -254.65135192871094, "loss": 0.3197, "rewards/accuracies": 1.0, "rewards/chosen": 0.24278610944747925, "rewards/margins": 0.6611701250076294, "rewards/rejected": -0.41838398575782776, "step": 23 }, { "debug/policy_chosen_logits": -3.299981117248535, "debug/policy_chosen_logps": -185.9925537109375, "debug/policy_rejected_logits": -3.0819525718688965, "debug/policy_rejected_logps": -258.153076171875, "debug/reference_chosen_logps": -212.69705200195312, "debug/reference_rejected_logps": -221.9197235107422, "epoch": 0.2857142857142857, "grad_norm": 6.4780070021425145, "learning_rate": 1e-06, "logits/chosen": -3.299981117248535, "logits/rejected": -3.0819525718688965, "logps/chosen": -185.9925537109375, "logps/rejected": -258.153076171875, "loss": 0.3351, "rewards/accuracies": 0.875, "rewards/chosen": 0.2670450806617737, "rewards/margins": 0.629378616809845, "rewards/rejected": -0.3623335361480713, "step": 24 }, { "debug/policy_chosen_logits": -3.3384597301483154, "debug/policy_chosen_logps": -179.01553344726562, "debug/policy_rejected_logits": -3.2596333026885986, "debug/policy_rejected_logps": -230.0491180419922, "debug/reference_chosen_logps": -201.91806030273438, "debug/reference_rejected_logps": -212.782958984375, "epoch": 0.2976190476190476, "grad_norm": 5.646156232636996, "learning_rate": 1e-06, "logits/chosen": -3.3384597301483154, "logits/rejected": -3.2596333026885986, "logps/chosen": -179.01553344726562, "logps/rejected": -230.0491180419922, "loss": 0.3597, "rewards/accuracies": 0.875, "rewards/chosen": 0.2290252447128296, "rewards/margins": 0.40168675780296326, "rewards/rejected": -0.17266148328781128, "step": 25 }, { "debug/policy_chosen_logits": -3.2951643466949463, "debug/policy_chosen_logps": -187.70423889160156, "debug/policy_rejected_logits": -3.064873218536377, "debug/policy_rejected_logps": -259.9974365234375, "debug/reference_chosen_logps": -209.17364501953125, "debug/reference_rejected_logps": -230.99514770507812, "epoch": 0.30952380952380953, "grad_norm": 5.004804914940331, "learning_rate": 1e-06, "logits/chosen": -3.2951643466949463, "logits/rejected": -3.064873218536377, "logps/chosen": -187.70423889160156, "logps/rejected": -259.9974365234375, "loss": 0.3607, "rewards/accuracies": 0.75, "rewards/chosen": 0.21469400823116302, "rewards/margins": 0.5047171115875244, "rewards/rejected": -0.2900230884552002, "step": 26 }, { "debug/policy_chosen_logits": -3.205540895462036, "debug/policy_chosen_logps": -202.82135009765625, "debug/policy_rejected_logits": -3.1168112754821777, "debug/policy_rejected_logps": -220.36105346679688, "debug/reference_chosen_logps": -219.64862060546875, "debug/reference_rejected_logps": -217.84548950195312, "epoch": 0.32142857142857145, "grad_norm": 7.861811148122209, "learning_rate": 1e-06, "logits/chosen": -3.205540895462036, "logits/rejected": -3.1168112754821777, "logps/chosen": -202.82135009765625, "logps/rejected": -220.36105346679688, "loss": 0.3706, "rewards/accuracies": 0.75, "rewards/chosen": 0.16827276349067688, "rewards/margins": 0.19342824816703796, "rewards/rejected": -0.02515549585223198, "step": 27 }, { "debug/policy_chosen_logits": -3.266860246658325, "debug/policy_chosen_logps": -190.13174438476562, "debug/policy_rejected_logits": -3.2241365909576416, "debug/policy_rejected_logps": -217.80401611328125, "debug/reference_chosen_logps": -207.9426727294922, "debug/reference_rejected_logps": -213.35520935058594, "epoch": 0.3333333333333333, "grad_norm": 5.17181036610514, "learning_rate": 1e-06, "logits/chosen": -3.266860246658325, "logits/rejected": -3.2241365909576416, "logps/chosen": -190.13174438476562, "logps/rejected": -217.80401611328125, "loss": 0.381, "rewards/accuracies": 0.875, "rewards/chosen": 0.17810919880867004, "rewards/margins": 0.22259722650051117, "rewards/rejected": -0.044488027691841125, "step": 28 }, { "debug/policy_chosen_logits": -3.138282060623169, "debug/policy_chosen_logps": -194.2283935546875, "debug/policy_rejected_logits": -3.242144823074341, "debug/policy_rejected_logps": -240.74034118652344, "debug/reference_chosen_logps": -205.63931274414062, "debug/reference_rejected_logps": -223.89479064941406, "epoch": 0.34523809523809523, "grad_norm": 13.702732402282919, "learning_rate": 1e-06, "logits/chosen": -3.138282060623169, "logits/rejected": -3.242144823074341, "logps/chosen": -194.2283935546875, "logps/rejected": -240.74034118652344, "loss": 0.3319, "rewards/accuracies": 0.875, "rewards/chosen": 0.11410927027463913, "rewards/margins": 0.2825648784637451, "rewards/rejected": -0.1684555858373642, "step": 29 }, { "debug/policy_chosen_logits": -3.2786245346069336, "debug/policy_chosen_logps": -196.5978546142578, "debug/policy_rejected_logits": -3.1386148929595947, "debug/policy_rejected_logps": -242.57998657226562, "debug/reference_chosen_logps": -202.89393615722656, "debug/reference_rejected_logps": -212.0622100830078, "epoch": 0.35714285714285715, "grad_norm": 18.8411217134156, "learning_rate": 1e-06, "logits/chosen": -3.2786245346069336, "logits/rejected": -3.1386148929595947, "logps/chosen": -196.5978546142578, "logps/rejected": -242.57998657226562, "loss": 0.3392, "rewards/accuracies": 0.875, "rewards/chosen": 0.0629609078168869, "rewards/margins": 0.36813876032829285, "rewards/rejected": -0.30517783761024475, "step": 30 }, { "debug/policy_chosen_logits": -3.220453977584839, "debug/policy_chosen_logps": -210.29718017578125, "debug/policy_rejected_logits": -3.0406980514526367, "debug/policy_rejected_logps": -254.49639892578125, "debug/reference_chosen_logps": -217.5068359375, "debug/reference_rejected_logps": -222.9973907470703, "epoch": 0.36904761904761907, "grad_norm": 17.63556570482219, "learning_rate": 1e-06, "logits/chosen": -3.220453977584839, "logits/rejected": -3.0406980514526367, "logps/chosen": -210.29718017578125, "logps/rejected": -254.49639892578125, "loss": 0.3337, "rewards/accuracies": 0.875, "rewards/chosen": 0.0720965787768364, "rewards/margins": 0.38708674907684326, "rewards/rejected": -0.31499019265174866, "step": 31 }, { "debug/policy_chosen_logits": -3.283477544784546, "debug/policy_chosen_logps": -195.7086944580078, "debug/policy_rejected_logits": -3.2998650074005127, "debug/policy_rejected_logps": -248.77716064453125, "debug/reference_chosen_logps": -205.69100952148438, "debug/reference_rejected_logps": -238.83279418945312, "epoch": 0.38095238095238093, "grad_norm": 3.545596888211082, "learning_rate": 1e-06, "logits/chosen": -3.283477544784546, "logits/rejected": -3.2998650074005127, "logps/chosen": -195.7086944580078, "logps/rejected": -248.77716064453125, "loss": 0.3367, "rewards/accuracies": 0.875, "rewards/chosen": 0.09982330352067947, "rewards/margins": 0.19926708936691284, "rewards/rejected": -0.09944379329681396, "step": 32 }, { "debug/policy_chosen_logits": -3.2302284240722656, "debug/policy_chosen_logps": -189.14857482910156, "debug/policy_rejected_logits": -3.1394155025482178, "debug/policy_rejected_logps": -231.4539031982422, "debug/reference_chosen_logps": -204.17739868164062, "debug/reference_rejected_logps": -209.3110809326172, "epoch": 0.39285714285714285, "grad_norm": 3.835026591129013, "learning_rate": 1e-06, "logits/chosen": -3.2302284240722656, "logits/rejected": -3.1394155025482178, "logps/chosen": -189.14857482910156, "logps/rejected": -231.4539031982422, "loss": 0.337, "rewards/accuracies": 0.875, "rewards/chosen": 0.15028832852840424, "rewards/margins": 0.3717164993286133, "rewards/rejected": -0.22142818570137024, "step": 33 }, { "debug/policy_chosen_logits": -3.2129104137420654, "debug/policy_chosen_logps": -199.6995849609375, "debug/policy_rejected_logits": -3.0475525856018066, "debug/policy_rejected_logps": -242.47573852539062, "debug/reference_chosen_logps": -211.38697814941406, "debug/reference_rejected_logps": -218.95986938476562, "epoch": 0.40476190476190477, "grad_norm": 3.6065017129358563, "learning_rate": 1e-06, "logits/chosen": -3.2129104137420654, "logits/rejected": -3.0475525856018066, "logps/chosen": -199.6995849609375, "logps/rejected": -242.47573852539062, "loss": 0.3252, "rewards/accuracies": 0.75, "rewards/chosen": 0.11687390506267548, "rewards/margins": 0.35203248262405396, "rewards/rejected": -0.23515859246253967, "step": 34 }, { "debug/policy_chosen_logits": -3.0805389881134033, "debug/policy_chosen_logps": -200.73974609375, "debug/policy_rejected_logits": -3.04288649559021, "debug/policy_rejected_logps": -218.5482940673828, "debug/reference_chosen_logps": -213.4292449951172, "debug/reference_rejected_logps": -223.4329071044922, "epoch": 0.4166666666666667, "grad_norm": 5.765726016207518, "learning_rate": 1e-06, "logits/chosen": -3.0805389881134033, "logits/rejected": -3.04288649559021, "logps/chosen": -200.73974609375, "logps/rejected": -218.5482940673828, "loss": 0.3783, "rewards/accuracies": 0.75, "rewards/chosen": 0.12689509987831116, "rewards/margins": 0.07804906368255615, "rewards/rejected": 0.048846036195755005, "step": 35 }, { "debug/policy_chosen_logits": -3.339388370513916, "debug/policy_chosen_logps": -175.0019073486328, "debug/policy_rejected_logits": -3.125450611114502, "debug/policy_rejected_logps": -234.52488708496094, "debug/reference_chosen_logps": -191.13693237304688, "debug/reference_rejected_logps": -211.32481384277344, "epoch": 0.42857142857142855, "grad_norm": 3.4118668773069563, "learning_rate": 1e-06, "logits/chosen": -3.339388370513916, "logits/rejected": -3.125450611114502, "logps/chosen": -175.0019073486328, "logps/rejected": -234.52488708496094, "loss": 0.324, "rewards/accuracies": 0.875, "rewards/chosen": 0.16135042905807495, "rewards/margins": 0.3933510482311249, "rewards/rejected": -0.23200063407421112, "step": 36 }, { "debug/policy_chosen_logits": -3.1920435428619385, "debug/policy_chosen_logps": -200.70831298828125, "debug/policy_rejected_logits": -3.0675861835479736, "debug/policy_rejected_logps": -240.75643920898438, "debug/reference_chosen_logps": -220.32891845703125, "debug/reference_rejected_logps": -224.6131591796875, "epoch": 0.44047619047619047, "grad_norm": 3.775080381030047, "learning_rate": 1e-06, "logits/chosen": -3.1920435428619385, "logits/rejected": -3.0675861835479736, "logps/chosen": -200.70831298828125, "logps/rejected": -240.75643920898438, "loss": 0.3004, "rewards/accuracies": 1.0, "rewards/chosen": 0.19620609283447266, "rewards/margins": 0.357638955116272, "rewards/rejected": -0.16143286228179932, "step": 37 }, { "debug/policy_chosen_logits": -3.173511505126953, "debug/policy_chosen_logps": -203.63047790527344, "debug/policy_rejected_logits": -3.0617382526397705, "debug/policy_rejected_logps": -252.37109375, "debug/reference_chosen_logps": -214.03150939941406, "debug/reference_rejected_logps": -230.29051208496094, "epoch": 0.4523809523809524, "grad_norm": 4.456940286801414, "learning_rate": 1e-06, "logits/chosen": -3.173511505126953, "logits/rejected": -3.0617382526397705, "logps/chosen": -203.63047790527344, "logps/rejected": -252.37109375, "loss": 0.3421, "rewards/accuracies": 1.0, "rewards/chosen": 0.10401026904582977, "rewards/margins": 0.3248162269592285, "rewards/rejected": -0.22080595791339874, "step": 38 }, { "debug/policy_chosen_logits": -3.153968095779419, "debug/policy_chosen_logps": -215.73043823242188, "debug/policy_rejected_logits": -3.1119275093078613, "debug/policy_rejected_logps": -230.99270629882812, "debug/reference_chosen_logps": -224.58657836914062, "debug/reference_rejected_logps": -230.7893829345703, "epoch": 0.4642857142857143, "grad_norm": 3.7503647290224547, "learning_rate": 1e-06, "logits/chosen": -3.153968095779419, "logits/rejected": -3.1119275093078613, "logps/chosen": -215.73043823242188, "logps/rejected": -230.99270629882812, "loss": 0.3678, "rewards/accuracies": 0.625, "rewards/chosen": 0.08856132626533508, "rewards/margins": 0.09059463441371918, "rewards/rejected": -0.0020333081483840942, "step": 39 }, { "debug/policy_chosen_logits": -3.0754432678222656, "debug/policy_chosen_logps": -196.66781616210938, "debug/policy_rejected_logits": -2.9984235763549805, "debug/policy_rejected_logps": -238.45590209960938, "debug/reference_chosen_logps": -210.78643798828125, "debug/reference_rejected_logps": -208.17205810546875, "epoch": 0.47619047619047616, "grad_norm": 4.009273704463349, "learning_rate": 1e-06, "logits/chosen": -3.0754432678222656, "logits/rejected": -2.9984235763549805, "logps/chosen": -196.66781616210938, "logps/rejected": -238.45590209960938, "loss": 0.327, "rewards/accuracies": 0.875, "rewards/chosen": 0.14118614792823792, "rewards/margins": 0.4440246522426605, "rewards/rejected": -0.3028385043144226, "step": 40 }, { "debug/policy_chosen_logits": -3.180971622467041, "debug/policy_chosen_logps": -211.52041625976562, "debug/policy_rejected_logits": -3.1294796466827393, "debug/policy_rejected_logps": -226.37899780273438, "debug/reference_chosen_logps": -221.50608825683594, "debug/reference_rejected_logps": -220.98367309570312, "epoch": 0.4880952380952381, "grad_norm": 4.554216689284069, "learning_rate": 1e-06, "logits/chosen": -3.180971622467041, "logits/rejected": -3.1294796466827393, "logps/chosen": -211.52041625976562, "logps/rejected": -226.37899780273438, "loss": 0.3403, "rewards/accuracies": 0.875, "rewards/chosen": 0.09985677897930145, "rewards/margins": 0.15380997955799103, "rewards/rejected": -0.05395320802927017, "step": 41 }, { "debug/policy_chosen_logits": -3.1375789642333984, "debug/policy_chosen_logps": -213.60757446289062, "debug/policy_rejected_logits": -3.1117618083953857, "debug/policy_rejected_logps": -256.28436279296875, "debug/reference_chosen_logps": -226.36074829101562, "debug/reference_rejected_logps": -242.12509155273438, "epoch": 0.5, "grad_norm": 4.775957985916952, "learning_rate": 1e-06, "logits/chosen": -3.1375789642333984, "logits/rejected": -3.1117618083953857, "logps/chosen": -213.60757446289062, "logps/rejected": -256.28436279296875, "loss": 0.3186, "rewards/accuracies": 0.625, "rewards/chosen": 0.1275317668914795, "rewards/margins": 0.26912426948547363, "rewards/rejected": -0.14159251749515533, "step": 42 }, { "debug/policy_chosen_logits": -3.1112849712371826, "debug/policy_chosen_logps": -208.876220703125, "debug/policy_rejected_logits": -3.0731041431427, "debug/policy_rejected_logps": -213.07131958007812, "debug/reference_chosen_logps": -226.13137817382812, "debug/reference_rejected_logps": -202.86383056640625, "epoch": 0.5119047619047619, "grad_norm": 3.169314540293903, "learning_rate": 1e-06, "logits/chosen": -3.1112849712371826, "logits/rejected": -3.0731041431427, "logps/chosen": -208.876220703125, "logps/rejected": -213.07131958007812, "loss": 0.3156, "rewards/accuracies": 0.75, "rewards/chosen": 0.17255139350891113, "rewards/margins": 0.2746262848377228, "rewards/rejected": -0.10207486897706985, "step": 43 }, { "debug/policy_chosen_logits": -3.1239426136016846, "debug/policy_chosen_logps": -201.20697021484375, "debug/policy_rejected_logits": -3.087494373321533, "debug/policy_rejected_logps": -259.4544372558594, "debug/reference_chosen_logps": -221.05288696289062, "debug/reference_rejected_logps": -240.31405639648438, "epoch": 0.5238095238095238, "grad_norm": 3.436483143809452, "learning_rate": 1e-06, "logits/chosen": -3.1239426136016846, "logits/rejected": -3.087494373321533, "logps/chosen": -201.20697021484375, "logps/rejected": -259.4544372558594, "loss": 0.3123, "rewards/accuracies": 1.0, "rewards/chosen": 0.19845911860466003, "rewards/margins": 0.3898632526397705, "rewards/rejected": -0.19140410423278809, "step": 44 }, { "debug/policy_chosen_logits": -3.138341188430786, "debug/policy_chosen_logps": -175.82582092285156, "debug/policy_rejected_logits": -2.981558084487915, "debug/policy_rejected_logps": -231.8880615234375, "debug/reference_chosen_logps": -196.50595092773438, "debug/reference_rejected_logps": -216.26844787597656, "epoch": 0.5357142857142857, "grad_norm": 3.953231116742769, "learning_rate": 1e-06, "logits/chosen": -3.138341188430786, "logits/rejected": -2.981558084487915, "logps/chosen": -175.82582092285156, "logps/rejected": -231.8880615234375, "loss": 0.3425, "rewards/accuracies": 0.875, "rewards/chosen": 0.2068011611700058, "rewards/margins": 0.3629972040653229, "rewards/rejected": -0.15619604289531708, "step": 45 }, { "debug/policy_chosen_logits": -3.097372531890869, "debug/policy_chosen_logps": -190.10743713378906, "debug/policy_rejected_logits": -2.9607269763946533, "debug/policy_rejected_logps": -255.23684692382812, "debug/reference_chosen_logps": -207.03024291992188, "debug/reference_rejected_logps": -224.69744873046875, "epoch": 0.5476190476190477, "grad_norm": 4.130780539545194, "learning_rate": 1e-06, "logits/chosen": -3.097372531890869, "logits/rejected": -2.9607269763946533, "logps/chosen": -190.10743713378906, "logps/rejected": -255.23684692382812, "loss": 0.334, "rewards/accuracies": 0.875, "rewards/chosen": 0.16922807693481445, "rewards/margins": 0.4746219217777252, "rewards/rejected": -0.30539384484291077, "step": 46 }, { "debug/policy_chosen_logits": -3.1764347553253174, "debug/policy_chosen_logps": -204.2672119140625, "debug/policy_rejected_logits": -3.070920467376709, "debug/policy_rejected_logps": -236.72357177734375, "debug/reference_chosen_logps": -222.78765869140625, "debug/reference_rejected_logps": -211.966552734375, "epoch": 0.5595238095238095, "grad_norm": 2.9027881991083926, "learning_rate": 1e-06, "logits/chosen": -3.1764347553253174, "logits/rejected": -3.070920467376709, "logps/chosen": -204.2672119140625, "logps/rejected": -236.72357177734375, "loss": 0.3134, "rewards/accuracies": 0.875, "rewards/chosen": 0.18520456552505493, "rewards/margins": 0.4327746033668518, "rewards/rejected": -0.24757003784179688, "step": 47 }, { "debug/policy_chosen_logits": -3.0339467525482178, "debug/policy_chosen_logps": -214.1638641357422, "debug/policy_rejected_logits": -3.061377763748169, "debug/policy_rejected_logps": -234.35415649414062, "debug/reference_chosen_logps": -219.6826171875, "debug/reference_rejected_logps": -225.13951110839844, "epoch": 0.5714285714285714, "grad_norm": 3.492138091854823, "learning_rate": 1e-06, "logits/chosen": -3.0339467525482178, "logits/rejected": -3.061377763748169, "logps/chosen": -214.1638641357422, "logps/rejected": -234.35415649414062, "loss": 0.3227, "rewards/accuracies": 0.75, "rewards/chosen": 0.05518750846385956, "rewards/margins": 0.14733393490314484, "rewards/rejected": -0.09214641898870468, "step": 48 }, { "debug/policy_chosen_logits": -3.1581661701202393, "debug/policy_chosen_logps": -185.55007934570312, "debug/policy_rejected_logits": -3.0453264713287354, "debug/policy_rejected_logps": -262.7301025390625, "debug/reference_chosen_logps": -206.8365478515625, "debug/reference_rejected_logps": -242.17652893066406, "epoch": 0.5833333333333334, "grad_norm": 3.1731020410352615, "learning_rate": 1e-06, "logits/chosen": -3.1581661701202393, "logits/rejected": -3.0453264713287354, "logps/chosen": -185.55007934570312, "logps/rejected": -262.7301025390625, "loss": 0.3134, "rewards/accuracies": 0.875, "rewards/chosen": 0.21286450326442719, "rewards/margins": 0.4184000492095947, "rewards/rejected": -0.20553553104400635, "step": 49 }, { "debug/policy_chosen_logits": -3.206784725189209, "debug/policy_chosen_logps": -170.53402709960938, "debug/policy_rejected_logits": -2.9109795093536377, "debug/policy_rejected_logps": -258.2296142578125, "debug/reference_chosen_logps": -196.68124389648438, "debug/reference_rejected_logps": -230.39051818847656, "epoch": 0.5952380952380952, "grad_norm": 2.934981229097639, "learning_rate": 1e-06, "logits/chosen": -3.206784725189209, "logits/rejected": -2.9109795093536377, "logps/chosen": -170.53402709960938, "logps/rejected": -258.2296142578125, "loss": 0.2659, "rewards/accuracies": 1.0, "rewards/chosen": 0.26147204637527466, "rewards/margins": 0.5398629307746887, "rewards/rejected": -0.27839091420173645, "step": 50 }, { "debug/policy_chosen_logits": -3.0622358322143555, "debug/policy_chosen_logps": -200.63052368164062, "debug/policy_rejected_logits": -2.947862148284912, "debug/policy_rejected_logps": -252.6195068359375, "debug/reference_chosen_logps": -221.46884155273438, "debug/reference_rejected_logps": -228.0500030517578, "epoch": 0.6071428571428571, "grad_norm": 3.5442225737215387, "learning_rate": 1e-06, "logits/chosen": -3.0622358322143555, "logits/rejected": -2.947862148284912, "logps/chosen": -200.63052368164062, "logps/rejected": -252.6195068359375, "loss": 0.3124, "rewards/accuracies": 1.0, "rewards/chosen": 0.20838311314582825, "rewards/margins": 0.4540780782699585, "rewards/rejected": -0.24569493532180786, "step": 51 }, { "debug/policy_chosen_logits": -3.1601505279541016, "debug/policy_chosen_logps": -182.13150024414062, "debug/policy_rejected_logits": -2.988621711730957, "debug/policy_rejected_logps": -228.06826782226562, "debug/reference_chosen_logps": -201.15792846679688, "debug/reference_rejected_logps": -213.58132934570312, "epoch": 0.6190476190476191, "grad_norm": 4.536944665819507, "learning_rate": 1e-06, "logits/chosen": -3.1601505279541016, "logits/rejected": -2.988621711730957, "logps/chosen": -182.13150024414062, "logps/rejected": -228.06826782226562, "loss": 0.3305, "rewards/accuracies": 1.0, "rewards/chosen": 0.19026409089565277, "rewards/margins": 0.33513346314430237, "rewards/rejected": -0.1448693871498108, "step": 52 }, { "debug/policy_chosen_logits": -3.198798894882202, "debug/policy_chosen_logps": -189.36521911621094, "debug/policy_rejected_logits": -3.0948612689971924, "debug/policy_rejected_logps": -224.48573303222656, "debug/reference_chosen_logps": -206.97372436523438, "debug/reference_rejected_logps": -216.28659057617188, "epoch": 0.6309523809523809, "grad_norm": 3.1233076322027067, "learning_rate": 1e-06, "logits/chosen": -3.198798894882202, "logits/rejected": -3.0948612689971924, "logps/chosen": -189.36521911621094, "logps/rejected": -224.48573303222656, "loss": 0.2912, "rewards/accuracies": 0.625, "rewards/chosen": 0.17608505487442017, "rewards/margins": 0.25807660818099976, "rewards/rejected": -0.08199156075716019, "step": 53 }, { "debug/policy_chosen_logits": -3.0483901500701904, "debug/policy_chosen_logps": -220.47686767578125, "debug/policy_rejected_logits": -3.0346791744232178, "debug/policy_rejected_logps": -233.89801025390625, "debug/reference_chosen_logps": -234.3529052734375, "debug/reference_rejected_logps": -234.83953857421875, "epoch": 0.6428571428571429, "grad_norm": 4.209474032104615, "learning_rate": 1e-06, "logits/chosen": -3.0483901500701904, "logits/rejected": -3.0346791744232178, "logps/chosen": -220.47686767578125, "logps/rejected": -233.89801025390625, "loss": 0.3198, "rewards/accuracies": 0.875, "rewards/chosen": 0.1387602984905243, "rewards/margins": 0.12934501469135284, "rewards/rejected": 0.00941528007388115, "step": 54 }, { "debug/policy_chosen_logits": -3.171710968017578, "debug/policy_chosen_logps": -177.91751098632812, "debug/policy_rejected_logits": -3.0502405166625977, "debug/policy_rejected_logps": -286.4560852050781, "debug/reference_chosen_logps": -200.41464233398438, "debug/reference_rejected_logps": -257.74066162109375, "epoch": 0.6547619047619048, "grad_norm": 4.118989628914695, "learning_rate": 1e-06, "logits/chosen": -3.171710968017578, "logits/rejected": -3.0502405166625977, "logps/chosen": -177.91751098632812, "logps/rejected": -286.4560852050781, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 0.22497142851352692, "rewards/margins": 0.5121252536773682, "rewards/rejected": -0.28715386986732483, "step": 55 }, { "debug/policy_chosen_logits": -3.09352445602417, "debug/policy_chosen_logps": -201.69338989257812, "debug/policy_rejected_logits": -2.858083486557007, "debug/policy_rejected_logps": -237.42831420898438, "debug/reference_chosen_logps": -221.5948944091797, "debug/reference_rejected_logps": -233.11854553222656, "epoch": 0.6666666666666666, "grad_norm": 5.4156894530313835, "learning_rate": 1e-06, "logits/chosen": -3.09352445602417, "logits/rejected": -2.858083486557007, "logps/chosen": -201.69338989257812, "logps/rejected": -237.42831420898438, "loss": 0.3487, "rewards/accuracies": 1.0, "rewards/chosen": 0.19901515543460846, "rewards/margins": 0.24211297929286957, "rewards/rejected": -0.04309781268239021, "step": 56 }, { "debug/policy_chosen_logits": -3.0208303928375244, "debug/policy_chosen_logps": -204.16375732421875, "debug/policy_rejected_logits": -2.9121055603027344, "debug/policy_rejected_logps": -266.00836181640625, "debug/reference_chosen_logps": -221.49114990234375, "debug/reference_rejected_logps": -238.82809448242188, "epoch": 0.6785714285714286, "grad_norm": 3.4258355626960837, "learning_rate": 1e-06, "logits/chosen": -3.0208303928375244, "logits/rejected": -2.9121055603027344, "logps/chosen": -204.16375732421875, "logps/rejected": -266.00836181640625, "loss": 0.3158, "rewards/accuracies": 0.875, "rewards/chosen": 0.17327386140823364, "rewards/margins": 0.44507646560668945, "rewards/rejected": -0.2718026041984558, "step": 57 }, { "debug/policy_chosen_logits": -3.2102866172790527, "debug/policy_chosen_logps": -218.61669921875, "debug/policy_rejected_logits": -3.0547966957092285, "debug/policy_rejected_logps": -265.02374267578125, "debug/reference_chosen_logps": -230.63229370117188, "debug/reference_rejected_logps": -222.15362548828125, "epoch": 0.6904761904761905, "grad_norm": 6.73796058848574, "learning_rate": 1e-06, "logits/chosen": -3.2102866172790527, "logits/rejected": -3.0547966957092285, "logps/chosen": -218.61669921875, "logps/rejected": -265.02374267578125, "loss": 0.2995, "rewards/accuracies": 0.875, "rewards/chosen": 0.12015601992607117, "rewards/margins": 0.5488572120666504, "rewards/rejected": -0.42870116233825684, "step": 58 }, { "debug/policy_chosen_logits": -3.1211562156677246, "debug/policy_chosen_logps": -186.32528686523438, "debug/policy_rejected_logits": -3.046443462371826, "debug/policy_rejected_logps": -230.16293334960938, "debug/reference_chosen_logps": -207.93145751953125, "debug/reference_rejected_logps": -210.10281372070312, "epoch": 0.7023809523809523, "grad_norm": 3.5427381092079506, "learning_rate": 1e-06, "logits/chosen": -3.1211562156677246, "logits/rejected": -3.046443462371826, "logps/chosen": -186.32528686523438, "logps/rejected": -230.16293334960938, "loss": 0.3172, "rewards/accuracies": 0.875, "rewards/chosen": 0.21606168150901794, "rewards/margins": 0.4166628122329712, "rewards/rejected": -0.20060113072395325, "step": 59 }, { "debug/policy_chosen_logits": -3.099186420440674, "debug/policy_chosen_logps": -179.74508666992188, "debug/policy_rejected_logits": -3.098525047302246, "debug/policy_rejected_logps": -218.37451171875, "debug/reference_chosen_logps": -204.20477294921875, "debug/reference_rejected_logps": -208.37628173828125, "epoch": 0.7142857142857143, "grad_norm": 3.822141253561726, "learning_rate": 1e-06, "logits/chosen": -3.099186420440674, "logits/rejected": -3.098525047302246, "logps/chosen": -179.74508666992188, "logps/rejected": -218.37451171875, "loss": 0.2797, "rewards/accuracies": 0.875, "rewards/chosen": 0.2445967197418213, "rewards/margins": 0.3445791006088257, "rewards/rejected": -0.0999823734164238, "step": 60 }, { "debug/policy_chosen_logits": -3.1759068965911865, "debug/policy_chosen_logps": -189.11534118652344, "debug/policy_rejected_logits": -3.130244255065918, "debug/policy_rejected_logps": -233.64544677734375, "debug/reference_chosen_logps": -214.55841064453125, "debug/reference_rejected_logps": -213.53517150878906, "epoch": 0.7261904761904762, "grad_norm": 3.7708296837344957, "learning_rate": 1e-06, "logits/chosen": -3.1759068965911865, "logits/rejected": -3.130244255065918, "logps/chosen": -189.11534118652344, "logps/rejected": -233.64544677734375, "loss": 0.2868, "rewards/accuracies": 1.0, "rewards/chosen": 0.25443071126937866, "rewards/margins": 0.45553332567214966, "rewards/rejected": -0.201102614402771, "step": 61 }, { "debug/policy_chosen_logits": -3.1119725704193115, "debug/policy_chosen_logps": -178.9134063720703, "debug/policy_rejected_logits": -3.0561046600341797, "debug/policy_rejected_logps": -235.5465545654297, "debug/reference_chosen_logps": -201.8980712890625, "debug/reference_rejected_logps": -204.71258544921875, "epoch": 0.7380952380952381, "grad_norm": 3.6489245962660175, "learning_rate": 1e-06, "logits/chosen": -3.1119725704193115, "logits/rejected": -3.0561046600341797, "logps/chosen": -178.9134063720703, "logps/rejected": -235.5465545654297, "loss": 0.2636, "rewards/accuracies": 1.0, "rewards/chosen": 0.22984656691551208, "rewards/margins": 0.5381861925125122, "rewards/rejected": -0.3083396553993225, "step": 62 }, { "debug/policy_chosen_logits": -3.1469156742095947, "debug/policy_chosen_logps": -201.1051025390625, "debug/policy_rejected_logits": -3.169464111328125, "debug/policy_rejected_logps": -232.58746337890625, "debug/reference_chosen_logps": -221.8560028076172, "debug/reference_rejected_logps": -219.1409149169922, "epoch": 0.75, "grad_norm": 4.929161871830666, "learning_rate": 1e-06, "logits/chosen": -3.1469156742095947, "logits/rejected": -3.169464111328125, "logps/chosen": -201.1051025390625, "logps/rejected": -232.58746337890625, "loss": 0.2553, "rewards/accuracies": 0.875, "rewards/chosen": 0.20750907063484192, "rewards/margins": 0.34197473526000977, "rewards/rejected": -0.13446564972400665, "step": 63 }, { "debug/policy_chosen_logits": -3.264002799987793, "debug/policy_chosen_logps": -176.3603515625, "debug/policy_rejected_logits": -3.2041733264923096, "debug/policy_rejected_logps": -237.74607849121094, "debug/reference_chosen_logps": -194.56317138671875, "debug/reference_rejected_logps": -220.99493408203125, "epoch": 0.7619047619047619, "grad_norm": 3.818382564446503, "learning_rate": 1e-06, "logits/chosen": -3.264002799987793, "logits/rejected": -3.2041733264923096, "logps/chosen": -176.3603515625, "logps/rejected": -237.74607849121094, "loss": 0.2587, "rewards/accuracies": 0.875, "rewards/chosen": 0.18202824890613556, "rewards/margins": 0.3495399057865143, "rewards/rejected": -0.16751165688037872, "step": 64 }, { "debug/policy_chosen_logits": -3.119412899017334, "debug/policy_chosen_logps": -203.42031860351562, "debug/policy_rejected_logits": -3.1157796382904053, "debug/policy_rejected_logps": -233.96710205078125, "debug/reference_chosen_logps": -222.92288208007812, "debug/reference_rejected_logps": -212.2803955078125, "epoch": 0.7738095238095238, "grad_norm": 3.2445663415245227, "learning_rate": 1e-06, "logits/chosen": -3.119412899017334, "logits/rejected": -3.1157796382904053, "logps/chosen": -203.42031860351562, "logps/rejected": -233.96710205078125, "loss": 0.2657, "rewards/accuracies": 1.0, "rewards/chosen": 0.19502560794353485, "rewards/margins": 0.41189298033714294, "rewards/rejected": -0.2168673723936081, "step": 65 }, { "debug/policy_chosen_logits": -3.1231942176818848, "debug/policy_chosen_logps": -187.53953552246094, "debug/policy_rejected_logits": -2.986668109893799, "debug/policy_rejected_logps": -240.6594696044922, "debug/reference_chosen_logps": -203.42880249023438, "debug/reference_rejected_logps": -214.08140563964844, "epoch": 0.7857142857142857, "grad_norm": 3.993146507021109, "learning_rate": 1e-06, "logits/chosen": -3.1231942176818848, "logits/rejected": -2.986668109893799, "logps/chosen": -187.53953552246094, "logps/rejected": -240.6594696044922, "loss": 0.2497, "rewards/accuracies": 0.875, "rewards/chosen": 0.15889252722263336, "rewards/margins": 0.4246731102466583, "rewards/rejected": -0.26578059792518616, "step": 66 }, { "debug/policy_chosen_logits": -3.1682167053222656, "debug/policy_chosen_logps": -205.82066345214844, "debug/policy_rejected_logits": -3.2192001342773438, "debug/policy_rejected_logps": -218.02267456054688, "debug/reference_chosen_logps": -218.94798278808594, "debug/reference_rejected_logps": -207.19180297851562, "epoch": 0.7976190476190477, "grad_norm": 4.631060327525655, "learning_rate": 1e-06, "logits/chosen": -3.1682167053222656, "logits/rejected": -3.2192001342773438, "logps/chosen": -205.82066345214844, "logps/rejected": -218.02267456054688, "loss": 0.256, "rewards/accuracies": 0.625, "rewards/chosen": 0.1312730759382248, "rewards/margins": 0.23958177864551544, "rewards/rejected": -0.10830870270729065, "step": 67 }, { "debug/policy_chosen_logits": -3.203059196472168, "debug/policy_chosen_logps": -178.87045288085938, "debug/policy_rejected_logits": -2.987100839614868, "debug/policy_rejected_logps": -232.19949340820312, "debug/reference_chosen_logps": -203.59388732910156, "debug/reference_rejected_logps": -210.4298553466797, "epoch": 0.8095238095238095, "grad_norm": 3.521359819189684, "learning_rate": 1e-06, "logits/chosen": -3.203059196472168, "logits/rejected": -2.987100839614868, "logps/chosen": -178.87045288085938, "logps/rejected": -232.19949340820312, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": 0.24723434448242188, "rewards/margins": 0.4649306535720825, "rewards/rejected": -0.21769630908966064, "step": 68 }, { "debug/policy_chosen_logits": -3.0755350589752197, "debug/policy_chosen_logps": -187.50741577148438, "debug/policy_rejected_logits": -3.0124213695526123, "debug/policy_rejected_logps": -249.99533081054688, "debug/reference_chosen_logps": -211.29078674316406, "debug/reference_rejected_logps": -215.66058349609375, "epoch": 0.8214285714285714, "grad_norm": 4.192433976871342, "learning_rate": 1e-06, "logits/chosen": -3.0755350589752197, "logits/rejected": -3.0124213695526123, "logps/chosen": -187.50741577148438, "logps/rejected": -249.99533081054688, "loss": 0.2695, "rewards/accuracies": 1.0, "rewards/chosen": 0.23783370852470398, "rewards/margins": 0.5811812281608582, "rewards/rejected": -0.3433475196361542, "step": 69 }, { "debug/policy_chosen_logits": -3.1850805282592773, "debug/policy_chosen_logps": -188.49639892578125, "debug/policy_rejected_logits": -3.084040880203247, "debug/policy_rejected_logps": -220.804931640625, "debug/reference_chosen_logps": -212.1209716796875, "debug/reference_rejected_logps": -211.52203369140625, "epoch": 0.8333333333333334, "grad_norm": 3.2739721314248973, "learning_rate": 1e-06, "logits/chosen": -3.1850805282592773, "logits/rejected": -3.084040880203247, "logps/chosen": -188.49639892578125, "logps/rejected": -220.804931640625, "loss": 0.267, "rewards/accuracies": 0.875, "rewards/chosen": 0.23624567687511444, "rewards/margins": 0.32907477021217346, "rewards/rejected": -0.09282909333705902, "step": 70 }, { "debug/policy_chosen_logits": -3.096564292907715, "debug/policy_chosen_logps": -207.81924438476562, "debug/policy_rejected_logits": -3.0242645740509033, "debug/policy_rejected_logps": -229.8334503173828, "debug/reference_chosen_logps": -227.1575164794922, "debug/reference_rejected_logps": -214.90184020996094, "epoch": 0.8452380952380952, "grad_norm": 3.0819588522165686, "learning_rate": 1e-06, "logits/chosen": -3.096564292907715, "logits/rejected": -3.0242645740509033, "logps/chosen": -207.81924438476562, "logps/rejected": -229.8334503173828, "loss": 0.2744, "rewards/accuracies": 0.75, "rewards/chosen": 0.1933828443288803, "rewards/margins": 0.34269896149635315, "rewards/rejected": -0.14931611716747284, "step": 71 }, { "debug/policy_chosen_logits": -3.1674182415008545, "debug/policy_chosen_logps": -192.69192504882812, "debug/policy_rejected_logits": -3.0628364086151123, "debug/policy_rejected_logps": -225.1174774169922, "debug/reference_chosen_logps": -215.3536376953125, "debug/reference_rejected_logps": -209.8455352783203, "epoch": 0.8571428571428571, "grad_norm": 3.773640551428593, "learning_rate": 1e-06, "logits/chosen": -3.1674182415008545, "logits/rejected": -3.0628364086151123, "logps/chosen": -192.69192504882812, "logps/rejected": -225.1174774169922, "loss": 0.2596, "rewards/accuracies": 0.75, "rewards/chosen": 0.2266169786453247, "rewards/margins": 0.37933632731437683, "rewards/rejected": -0.15271936357021332, "step": 72 }, { "debug/policy_chosen_logits": -3.077094793319702, "debug/policy_chosen_logps": -187.55560302734375, "debug/policy_rejected_logits": -3.0116915702819824, "debug/policy_rejected_logps": -248.090087890625, "debug/reference_chosen_logps": -208.9130096435547, "debug/reference_rejected_logps": -221.7311248779297, "epoch": 0.8690476190476191, "grad_norm": 3.2776627025025333, "learning_rate": 1e-06, "logits/chosen": -3.077094793319702, "logits/rejected": -3.0116915702819824, "logps/chosen": -187.55560302734375, "logps/rejected": -248.090087890625, "loss": 0.2596, "rewards/accuracies": 1.0, "rewards/chosen": 0.21357415616512299, "rewards/margins": 0.47716373205184937, "rewards/rejected": -0.2635895609855652, "step": 73 }, { "debug/policy_chosen_logits": -3.182755470275879, "debug/policy_chosen_logps": -205.19509887695312, "debug/policy_rejected_logits": -2.9780564308166504, "debug/policy_rejected_logps": -253.59747314453125, "debug/reference_chosen_logps": -220.27328491210938, "debug/reference_rejected_logps": -215.6810302734375, "epoch": 0.8809523809523809, "grad_norm": 3.5105144029616446, "learning_rate": 1e-06, "logits/chosen": -3.182755470275879, "logits/rejected": -2.9780564308166504, "logps/chosen": -205.19509887695312, "logps/rejected": -253.59747314453125, "loss": 0.3096, "rewards/accuracies": 0.875, "rewards/chosen": 0.1507818102836609, "rewards/margins": 0.5299463272094727, "rewards/rejected": -0.37916454672813416, "step": 74 }, { "debug/policy_chosen_logits": -3.0349855422973633, "debug/policy_chosen_logps": -197.20550537109375, "debug/policy_rejected_logits": -2.968355178833008, "debug/policy_rejected_logps": -260.9197998046875, "debug/reference_chosen_logps": -217.93182373046875, "debug/reference_rejected_logps": -250.27139282226562, "epoch": 0.8928571428571429, "grad_norm": 5.618897079839302, "learning_rate": 1e-06, "logits/chosen": -3.0349855422973633, "logits/rejected": -2.968355178833008, "logps/chosen": -197.20550537109375, "logps/rejected": -260.9197998046875, "loss": 0.3018, "rewards/accuracies": 0.75, "rewards/chosen": 0.20726338028907776, "rewards/margins": 0.3137475848197937, "rewards/rejected": -0.10648422688245773, "step": 75 }, { "debug/policy_chosen_logits": -3.1911251544952393, "debug/policy_chosen_logps": -184.67440795898438, "debug/policy_rejected_logits": -3.182987928390503, "debug/policy_rejected_logps": -210.40711975097656, "debug/reference_chosen_logps": -203.4073486328125, "debug/reference_rejected_logps": -204.89013671875, "epoch": 0.9047619047619048, "grad_norm": 3.696876592657335, "learning_rate": 1e-06, "logits/chosen": -3.1911251544952393, "logits/rejected": -3.182987928390503, "logps/chosen": -184.67440795898438, "logps/rejected": -210.40711975097656, "loss": 0.3278, "rewards/accuracies": 0.75, "rewards/chosen": 0.1873292624950409, "rewards/margins": 0.24249891936779022, "rewards/rejected": -0.05516962707042694, "step": 76 }, { "debug/policy_chosen_logits": -3.178257465362549, "debug/policy_chosen_logps": -187.74302673339844, "debug/policy_rejected_logits": -3.0253536701202393, "debug/policy_rejected_logps": -255.27618408203125, "debug/reference_chosen_logps": -204.26242065429688, "debug/reference_rejected_logps": -219.57601928710938, "epoch": 0.9166666666666666, "grad_norm": 6.676177800896766, "learning_rate": 1e-06, "logits/chosen": -3.178257465362549, "logits/rejected": -3.0253536701202393, "logps/chosen": -187.74302673339844, "logps/rejected": -255.27618408203125, "loss": 0.2808, "rewards/accuracies": 1.0, "rewards/chosen": 0.16519393026828766, "rewards/margins": 0.5221953988075256, "rewards/rejected": -0.35700148344039917, "step": 77 }, { "debug/policy_chosen_logits": -3.172184705734253, "debug/policy_chosen_logps": -197.2301788330078, "debug/policy_rejected_logits": -3.122185230255127, "debug/policy_rejected_logps": -227.55764770507812, "debug/reference_chosen_logps": -208.9789276123047, "debug/reference_rejected_logps": -220.4404296875, "epoch": 0.9285714285714286, "grad_norm": 3.121642412627353, "learning_rate": 1e-06, "logits/chosen": -3.172184705734253, "logits/rejected": -3.122185230255127, "logps/chosen": -197.2301788330078, "logps/rejected": -227.55764770507812, "loss": 0.2861, "rewards/accuracies": 0.75, "rewards/chosen": 0.11748749017715454, "rewards/margins": 0.18865975737571716, "rewards/rejected": -0.07117227464914322, "step": 78 }, { "debug/policy_chosen_logits": -3.149040937423706, "debug/policy_chosen_logps": -180.52720642089844, "debug/policy_rejected_logits": -3.1451313495635986, "debug/policy_rejected_logps": -242.54115295410156, "debug/reference_chosen_logps": -205.92019653320312, "debug/reference_rejected_logps": -219.41590881347656, "epoch": 0.9404761904761905, "grad_norm": 3.1134708691492845, "learning_rate": 1e-06, "logits/chosen": -3.149040937423706, "logits/rejected": -3.1451313495635986, "logps/chosen": -180.52720642089844, "logps/rejected": -242.54115295410156, "loss": 0.2408, "rewards/accuracies": 1.0, "rewards/chosen": 0.25392991304397583, "rewards/margins": 0.4851823151111603, "rewards/rejected": -0.23125241696834564, "step": 79 }, { "debug/policy_chosen_logits": -3.215009927749634, "debug/policy_chosen_logps": -161.9908447265625, "debug/policy_rejected_logits": -2.9049410820007324, "debug/policy_rejected_logps": -237.18609619140625, "debug/reference_chosen_logps": -198.1100311279297, "debug/reference_rejected_logps": -205.93499755859375, "epoch": 0.9523809523809523, "grad_norm": 6.27678156865478, "learning_rate": 1e-06, "logits/chosen": -3.215009927749634, "logits/rejected": -2.9049410820007324, "logps/chosen": -161.9908447265625, "logps/rejected": -237.18609619140625, "loss": 0.3087, "rewards/accuracies": 1.0, "rewards/chosen": 0.361191987991333, "rewards/margins": 0.6737030744552612, "rewards/rejected": -0.3125110864639282, "step": 80 }, { "debug/policy_chosen_logits": -3.0824358463287354, "debug/policy_chosen_logps": -185.10894775390625, "debug/policy_rejected_logits": -2.936793804168701, "debug/policy_rejected_logps": -268.28875732421875, "debug/reference_chosen_logps": -200.82766723632812, "debug/reference_rejected_logps": -233.51441955566406, "epoch": 0.9642857142857143, "grad_norm": 3.3522970674177937, "learning_rate": 1e-06, "logits/chosen": -3.0824358463287354, "logits/rejected": -2.936793804168701, "logps/chosen": -185.10894775390625, "logps/rejected": -268.28875732421875, "loss": 0.2521, "rewards/accuracies": 1.0, "rewards/chosen": 0.15718725323677063, "rewards/margins": 0.5049305558204651, "rewards/rejected": -0.34774333238601685, "step": 81 }, { "debug/policy_chosen_logits": -3.135251760482788, "debug/policy_chosen_logps": -170.79705810546875, "debug/policy_rejected_logits": -3.0056710243225098, "debug/policy_rejected_logps": -227.4779052734375, "debug/reference_chosen_logps": -200.08804321289062, "debug/reference_rejected_logps": -197.99270629882812, "epoch": 0.9761904761904762, "grad_norm": 3.284253852501879, "learning_rate": 1e-06, "logits/chosen": -3.135251760482788, "logits/rejected": -3.0056710243225098, "logps/chosen": -170.79705810546875, "logps/rejected": -227.4779052734375, "loss": 0.2355, "rewards/accuracies": 0.875, "rewards/chosen": 0.2929098308086395, "rewards/margins": 0.5877617597579956, "rewards/rejected": -0.2948519289493561, "step": 82 }, { "debug/policy_chosen_logits": -3.0462417602539062, "debug/policy_chosen_logps": -195.46688842773438, "debug/policy_rejected_logits": -2.9659104347229004, "debug/policy_rejected_logps": -253.25881958007812, "debug/reference_chosen_logps": -216.95620727539062, "debug/reference_rejected_logps": -213.9208984375, "epoch": 0.9880952380952381, "grad_norm": 3.3816029123776024, "learning_rate": 1e-06, "logits/chosen": -3.0462417602539062, "logits/rejected": -2.9659104347229004, "logps/chosen": -195.46688842773438, "logps/rejected": -253.25881958007812, "loss": 0.2725, "rewards/accuracies": 1.0, "rewards/chosen": 0.21489323675632477, "rewards/margins": 0.6082723140716553, "rewards/rejected": -0.3933790922164917, "step": 83 }, { "debug/policy_chosen_logits": -3.181661605834961, "debug/policy_chosen_logps": -201.0180206298828, "debug/policy_rejected_logits": -3.114567518234253, "debug/policy_rejected_logps": -248.83258056640625, "debug/reference_chosen_logps": -217.26596069335938, "debug/reference_rejected_logps": -230.24380493164062, "epoch": 1.0, "grad_norm": 3.6750878689595683, "learning_rate": 1e-06, "logits/chosen": -3.181661605834961, "logits/rejected": -3.114567518234253, "logps/chosen": -201.0180206298828, "logps/rejected": -248.83258056640625, "loss": 0.2642, "rewards/accuracies": 0.75, "rewards/chosen": 0.16247960925102234, "rewards/margins": 0.3483673632144928, "rewards/rejected": -0.18588775396347046, "step": 84 }, { "epoch": 1.0, "step": 84, "total_flos": 0.0, "train_loss": 0.33750108753641445, "train_runtime": 246.5194, "train_samples_per_second": 21.613, "train_steps_per_second": 0.341 } ], "logging_steps": 1, "max_steps": 84, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }