{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988751406074241, "eval_steps": 100, "global_step": 444, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.829373545547037, "learning_rate": 1.111111111111111e-08, "logits/chosen": -1.8433172702789307, "logits/rejected": -2.1778242588043213, "logps/chosen": -155.12074279785156, "logps/rejected": -108.14129638671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 6.4818389334129645, "learning_rate": 1.111111111111111e-07, "logits/chosen": -1.727405071258545, "logits/rejected": -1.8230912685394287, "logps/chosen": -143.81710815429688, "logps/rejected": -170.6587371826172, "loss": 0.693, "rewards/accuracies": 0.4652777910232544, "rewards/chosen": 8.138448174577206e-05, "rewards/margins": 0.0009054330294020474, "rewards/rejected": -0.0008240485331043601, "step": 10 }, { "epoch": 0.04, "grad_norm": 6.216353393457572, "learning_rate": 2.222222222222222e-07, "logits/chosen": -1.7563774585723877, "logits/rejected": -1.8175561428070068, "logps/chosen": -156.39651489257812, "logps/rejected": -182.17941284179688, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0005323028308339417, "rewards/margins": 0.0004471595457289368, "rewards/rejected": 8.51431759656407e-05, "step": 20 }, { "epoch": 0.07, "grad_norm": 6.0623601927922826, "learning_rate": 3.333333333333333e-07, "logits/chosen": -1.7707617282867432, "logits/rejected": -1.9445222616195679, "logps/chosen": -162.476318359375, "logps/rejected": -180.51072692871094, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.003538253251463175, "rewards/margins": 0.0020595293026417494, "rewards/rejected": 0.0014787239488214254, "step": 30 }, { "epoch": 0.09, "grad_norm": 6.203147518363453, "learning_rate": 4.444444444444444e-07, "logits/chosen": -1.7849353551864624, "logits/rejected": -1.9426301717758179, "logps/chosen": -175.6881866455078, "logps/rejected": -160.2828369140625, "loss": 0.6905, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.010533371940255165, "rewards/margins": 0.0045619565062224865, "rewards/rejected": 0.005971415434032679, "step": 40 }, { "epoch": 0.11, "grad_norm": 5.821555258105456, "learning_rate": 4.998062918544441e-07, "logits/chosen": -1.6218881607055664, "logits/rejected": -1.7974551916122437, "logps/chosen": -140.06240844726562, "logps/rejected": -163.06736755371094, "loss": 0.687, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.024222631007432938, "rewards/margins": 0.012534504756331444, "rewards/rejected": 0.011688126251101494, "step": 50 }, { "epoch": 0.13, "grad_norm": 6.110757227734316, "learning_rate": 4.98258427321406e-07, "logits/chosen": -1.7357165813446045, "logits/rejected": -1.8816426992416382, "logps/chosen": -164.33438110351562, "logps/rejected": -165.95216369628906, "loss": 0.6807, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04671463742852211, "rewards/margins": 0.026966657489538193, "rewards/rejected": 0.019747978076338768, "step": 60 }, { "epoch": 0.16, "grad_norm": 6.155653196810327, "learning_rate": 4.951722892251762e-07, "logits/chosen": -1.6737648248672485, "logits/rejected": -1.7360236644744873, "logps/chosen": -158.34616088867188, "logps/rejected": -189.7154998779297, "loss": 0.6759, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.06381961703300476, "rewards/margins": 0.03640252351760864, "rewards/rejected": 0.02741708979010582, "step": 70 }, { "epoch": 0.18, "grad_norm": 6.083689058170866, "learning_rate": 4.905670000773126e-07, "logits/chosen": -1.577292799949646, "logits/rejected": -1.6474878787994385, "logps/chosen": -174.16554260253906, "logps/rejected": -139.80081176757812, "loss": 0.6741, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08402051031589508, "rewards/margins": 0.05129547044634819, "rewards/rejected": 0.03272503241896629, "step": 80 }, { "epoch": 0.2, "grad_norm": 5.617574112691242, "learning_rate": 4.844710954430464e-07, "logits/chosen": -1.6551265716552734, "logits/rejected": -1.710513710975647, "logps/chosen": -155.87420654296875, "logps/rejected": -184.04806518554688, "loss": 0.6661, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08863753080368042, "rewards/margins": 0.054521817713975906, "rewards/rejected": 0.034115713089704514, "step": 90 }, { "epoch": 0.22, "grad_norm": 5.627048741895505, "learning_rate": 4.769223471275234e-07, "logits/chosen": -1.5745666027069092, "logits/rejected": -1.6258203983306885, "logps/chosen": -147.27999877929688, "logps/rejected": -151.06619262695312, "loss": 0.6595, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1112900972366333, "rewards/margins": 0.0667150542140007, "rewards/rejected": 0.04457502439618111, "step": 100 }, { "epoch": 0.22, "eval_logits/chosen": -1.7720075845718384, "eval_logits/rejected": -1.495701789855957, "eval_logps/chosen": -124.06204986572266, "eval_logps/rejected": -139.30418395996094, "eval_loss": 0.6646677255630493, "eval_rewards/accuracies": 0.6875, "eval_rewards/chosen": 0.1106695607304573, "eval_rewards/margins": 0.06266607344150543, "eval_rewards/rejected": 0.04800347983837128, "eval_runtime": 107.3775, "eval_samples_per_second": 10.654, "eval_steps_per_second": 0.335, "step": 100 }, { "epoch": 0.25, "grad_norm": 5.866012556456834, "learning_rate": 4.6796752913190956e-07, "logits/chosen": -1.5874210596084595, "logits/rejected": -1.6103451251983643, "logps/chosen": -155.8997039794922, "logps/rejected": -162.63836669921875, "loss": 0.6579, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.12256599962711334, "rewards/margins": 0.07956713438034058, "rewards/rejected": 0.04299888014793396, "step": 110 }, { "epoch": 0.27, "grad_norm": 5.609183180938371, "learning_rate": 4.576621278295557e-07, "logits/chosen": -1.5197416543960571, "logits/rejected": -1.572852373123169, "logps/chosen": -147.88705444335938, "logps/rejected": -145.33999633789062, "loss": 0.6512, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.13489821553230286, "rewards/margins": 0.0956321507692337, "rewards/rejected": 0.03926606848835945, "step": 120 }, { "epoch": 0.29, "grad_norm": 5.6105044223251355, "learning_rate": 4.4606999815804657e-07, "logits/chosen": -1.4735063314437866, "logits/rejected": -1.662398338317871, "logps/chosen": -146.32366943359375, "logps/rejected": -139.0260009765625, "loss": 0.6492, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1404353231191635, "rewards/margins": 0.10020889341831207, "rewards/rejected": 0.04022643715143204, "step": 130 }, { "epoch": 0.31, "grad_norm": 5.559220884446223, "learning_rate": 4.332629679574565e-07, "logits/chosen": -1.4670491218566895, "logits/rejected": -1.6285909414291382, "logps/chosen": -148.60751342773438, "logps/rejected": -174.4378204345703, "loss": 0.6456, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.12996497750282288, "rewards/margins": 0.07584364712238312, "rewards/rejected": 0.05412132665514946, "step": 140 }, { "epoch": 0.34, "grad_norm": 5.496701088365727, "learning_rate": 4.193203929064353e-07, "logits/chosen": -1.4563395977020264, "logits/rejected": -1.5474860668182373, "logps/chosen": -142.05953979492188, "logps/rejected": -161.18702697753906, "loss": 0.6406, "rewards/accuracies": 0.793749988079071, "rewards/chosen": 0.16432908177375793, "rewards/margins": 0.11925216019153595, "rewards/rejected": 0.04507693648338318, "step": 150 }, { "epoch": 0.36, "grad_norm": 5.87004256418159, "learning_rate": 4.043286648138538e-07, "logits/chosen": -1.4940943717956543, "logits/rejected": -1.5696378946304321, "logps/chosen": -144.10693359375, "logps/rejected": -174.38937377929688, "loss": 0.6446, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.14990444481372833, "rewards/margins": 0.10559757798910141, "rewards/rejected": 0.04430687427520752, "step": 160 }, { "epoch": 0.38, "grad_norm": 6.736125236884026, "learning_rate": 3.883806763127647e-07, "logits/chosen": -1.4956731796264648, "logits/rejected": -1.5208299160003662, "logps/chosen": -154.81716918945312, "logps/rejected": -155.2576904296875, "loss": 0.6388, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17543208599090576, "rewards/margins": 0.13297812640666962, "rewards/rejected": 0.04245396703481674, "step": 170 }, { "epoch": 0.4, "grad_norm": 5.520870747493312, "learning_rate": 3.715752452735703e-07, "logits/chosen": -1.518593192100525, "logits/rejected": -1.6800349950790405, "logps/chosen": -140.48988342285156, "logps/rejected": -161.6919708251953, "loss": 0.6328, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.17410950362682343, "rewards/margins": 0.13047902286052704, "rewards/rejected": 0.04363049194216728, "step": 180 }, { "epoch": 0.43, "grad_norm": 5.706390091330182, "learning_rate": 3.540165025028843e-07, "logits/chosen": -1.5428271293640137, "logits/rejected": -1.6062263250350952, "logps/chosen": -159.2704315185547, "logps/rejected": -173.2039031982422, "loss": 0.6286, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.18463760614395142, "rewards/margins": 0.13972006738185883, "rewards/rejected": 0.04491753131151199, "step": 190 }, { "epoch": 0.45, "grad_norm": 5.847027641584725, "learning_rate": 3.358132465220639e-07, "logits/chosen": -1.4393140077590942, "logits/rejected": -1.5474971532821655, "logps/chosen": -148.5250244140625, "logps/rejected": -156.9046173095703, "loss": 0.6273, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1776243895292282, "rewards/margins": 0.1414380818605423, "rewards/rejected": 0.03618631511926651, "step": 200 }, { "epoch": 0.45, "eval_logits/chosen": -1.7316410541534424, "eval_logits/rejected": -1.465333342552185, "eval_logps/chosen": -119.38525390625, "eval_logps/rejected": -138.4956817626953, "eval_loss": 0.6494045853614807, "eval_rewards/accuracies": 0.6979166865348816, "eval_rewards/chosen": 0.1574375331401825, "eval_rewards/margins": 0.10134916752576828, "eval_rewards/rejected": 0.05608838051557541, "eval_runtime": 106.286, "eval_samples_per_second": 10.763, "eval_steps_per_second": 0.339, "step": 200 }, { "epoch": 0.47, "grad_norm": 5.808496828700401, "learning_rate": 3.170782694233712e-07, "logits/chosen": -1.4331612586975098, "logits/rejected": -1.62355637550354, "logps/chosen": -132.7198944091797, "logps/rejected": -162.63983154296875, "loss": 0.6259, "rewards/accuracies": 0.8125, "rewards/chosen": 0.1818162202835083, "rewards/margins": 0.14457334578037262, "rewards/rejected": 0.03724289312958717, "step": 210 }, { "epoch": 0.49, "grad_norm": 6.450138104412953, "learning_rate": 2.979276579809346e-07, "logits/chosen": -1.567256212234497, "logits/rejected": -1.662076711654663, "logps/chosen": -139.86077880859375, "logps/rejected": -170.76498413085938, "loss": 0.6226, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.17449909448623657, "rewards/margins": 0.1373990774154663, "rewards/rejected": 0.03710002452135086, "step": 220 }, { "epoch": 0.52, "grad_norm": 5.305590860314572, "learning_rate": 2.78480074347007e-07, "logits/chosen": -1.4688160419464111, "logits/rejected": -1.6507971286773682, "logps/chosen": -155.4250030517578, "logps/rejected": -139.8217010498047, "loss": 0.6187, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.185538649559021, "rewards/margins": 0.1848856508731842, "rewards/rejected": 0.000652993272524327, "step": 230 }, { "epoch": 0.54, "grad_norm": 5.538376518324725, "learning_rate": 2.588560207905135e-07, "logits/chosen": -1.5921481847763062, "logits/rejected": -1.6697231531143188, "logps/chosen": -163.6059112548828, "logps/rejected": -150.09193420410156, "loss": 0.6086, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.21226021647453308, "rewards/margins": 0.19374233484268188, "rewards/rejected": 0.018517881631851196, "step": 240 }, { "epoch": 0.56, "grad_norm": 5.203072441741653, "learning_rate": 2.391770930337597e-07, "logits/chosen": -1.5545365810394287, "logits/rejected": -1.5908061265945435, "logps/chosen": -140.0444793701172, "logps/rejected": -160.80111694335938, "loss": 0.6191, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.1901070922613144, "rewards/margins": 0.17807592451572418, "rewards/rejected": 0.012031197547912598, "step": 250 }, { "epoch": 0.58, "grad_norm": 5.475288227895773, "learning_rate": 2.195652268138194e-07, "logits/chosen": -1.567275047302246, "logits/rejected": -1.6613355875015259, "logps/chosen": -150.01036071777344, "logps/rejected": -157.63027954101562, "loss": 0.6119, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1891297549009323, "rewards/margins": 0.18906521797180176, "rewards/rejected": 6.455164111685008e-05, "step": 260 }, { "epoch": 0.61, "grad_norm": 5.992354186949266, "learning_rate": 2.001419423371019e-07, "logits/chosen": -1.47898268699646, "logits/rejected": -1.5700337886810303, "logps/chosen": -134.41952514648438, "logps/rejected": -160.8531494140625, "loss": 0.6083, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.18836051225662231, "rewards/margins": 0.16941113770008087, "rewards/rejected": 0.018949372693896294, "step": 270 }, { "epoch": 0.63, "grad_norm": 6.043059967391702, "learning_rate": 1.810275913086562e-07, "logits/chosen": -1.482757329940796, "logits/rejected": -1.648633599281311, "logps/chosen": -158.1710968017578, "logps/rejected": -164.2964324951172, "loss": 0.6129, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17465534806251526, "rewards/margins": 0.1687730997800827, "rewards/rejected": 0.0058822231367230415, "step": 280 }, { "epoch": 0.65, "grad_norm": 5.822282255796662, "learning_rate": 1.6234061120181143e-07, "logits/chosen": -1.5249128341674805, "logits/rejected": -1.6839654445648193, "logps/chosen": -130.04713439941406, "logps/rejected": -178.07696533203125, "loss": 0.6064, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.20361635088920593, "rewards/margins": 0.20471592247486115, "rewards/rejected": -0.0010995581978932023, "step": 290 }, { "epoch": 0.67, "grad_norm": 5.917241220768849, "learning_rate": 1.4419679138889375e-07, "logits/chosen": -1.4709835052490234, "logits/rejected": -1.7355806827545166, "logps/chosen": -156.6675262451172, "logps/rejected": -172.46078491210938, "loss": 0.6009, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.1793396770954132, "rewards/margins": 0.20549102127552032, "rewards/rejected": -0.02615133859217167, "step": 300 }, { "epoch": 0.67, "eval_logits/chosen": -1.777042269706726, "eval_logits/rejected": -1.5097768306732178, "eval_logps/chosen": -120.27433013916016, "eval_logps/rejected": -141.86488342285156, "eval_loss": 0.6398369669914246, "eval_rewards/accuracies": 0.71875, "eval_rewards/chosen": 0.14854662120342255, "eval_rewards/margins": 0.12615016102790833, "eval_rewards/rejected": 0.022396454587578773, "eval_runtime": 111.134, "eval_samples_per_second": 10.294, "eval_steps_per_second": 0.324, "step": 300 }, { "epoch": 0.7, "grad_norm": 5.942805799246918, "learning_rate": 1.2670855568026362e-07, "logits/chosen": -1.552185297012329, "logits/rejected": -1.6878124475479126, "logps/chosen": -135.38902282714844, "logps/rejected": -174.15255737304688, "loss": 0.6031, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.17009037733078003, "rewards/margins": 0.18223796784877777, "rewards/rejected": -0.012147602625191212, "step": 310 }, { "epoch": 0.72, "grad_norm": 5.410484098522815, "learning_rate": 1.0998426571724643e-07, "logits/chosen": -1.5845314264297485, "logits/rejected": -1.6747452020645142, "logps/chosen": -146.5388641357422, "logps/rejected": -157.44863891601562, "loss": 0.5989, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.1947994828224182, "rewards/margins": 0.21365301311016083, "rewards/rejected": -0.01885353960096836, "step": 320 }, { "epoch": 0.74, "grad_norm": 5.323259223525621, "learning_rate": 9.412754953531663e-08, "logits/chosen": -1.560361623764038, "logits/rejected": -1.6760743856430054, "logps/chosen": -147.3408966064453, "logps/rejected": -164.4519500732422, "loss": 0.5956, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.18070648610591888, "rewards/margins": 0.22251346707344055, "rewards/rejected": -0.04180694743990898, "step": 330 }, { "epoch": 0.76, "grad_norm": 6.0603812493866025, "learning_rate": 7.923665945792943e-08, "logits/chosen": -1.542307734489441, "logits/rejected": -1.6773264408111572, "logps/chosen": -132.24139404296875, "logps/rejected": -148.74737548828125, "loss": 0.6007, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.17221280932426453, "rewards/margins": 0.211787611246109, "rewards/rejected": -0.03957480937242508, "step": 340 }, { "epoch": 0.79, "grad_norm": 5.800803684922802, "learning_rate": 6.540386329965863e-08, "logits/chosen": -1.613059639930725, "logits/rejected": -1.6966331005096436, "logps/chosen": -155.21559143066406, "logps/rejected": -161.65882873535156, "loss": 0.597, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.17716926336288452, "rewards/margins": 0.21501335501670837, "rewards/rejected": -0.03784411773085594, "step": 350 }, { "epoch": 0.81, "grad_norm": 5.294198300700879, "learning_rate": 5.271487265090163e-08, "logits/chosen": -1.605891466140747, "logits/rejected": -1.6633691787719727, "logps/chosen": -133.00123596191406, "logps/rejected": -176.7678680419922, "loss": 0.5888, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.17767903208732605, "rewards/margins": 0.228514164686203, "rewards/rejected": -0.05083512142300606, "step": 360 }, { "epoch": 0.83, "grad_norm": 6.853203056351755, "learning_rate": 4.1248311786649394e-08, "logits/chosen": -1.6259254217147827, "logits/rejected": -1.7257139682769775, "logps/chosen": -135.5113525390625, "logps/rejected": -180.2209014892578, "loss": 0.5989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.17052185535430908, "rewards/margins": 0.2071322202682495, "rewards/rejected": -0.03661039471626282, "step": 370 }, { "epoch": 0.85, "grad_norm": 5.58249557148135, "learning_rate": 3.107523049009983e-08, "logits/chosen": -1.5495421886444092, "logits/rejected": -1.6909148693084717, "logps/chosen": -148.41799926757812, "logps/rejected": -188.6688995361328, "loss": 0.5986, "rewards/accuracies": 0.90625, "rewards/chosen": 0.18436935544013977, "rewards/margins": 0.21894951164722443, "rewards/rejected": -0.03458016738295555, "step": 380 }, { "epoch": 0.88, "grad_norm": 6.065104934238928, "learning_rate": 2.2258663809784888e-08, "logits/chosen": -1.556806206703186, "logits/rejected": -1.6664282083511353, "logps/chosen": -134.76539611816406, "logps/rejected": -166.59054565429688, "loss": 0.598, "rewards/accuracies": 0.875, "rewards/chosen": 0.19658346474170685, "rewards/margins": 0.23869290947914124, "rewards/rejected": -0.04210943728685379, "step": 390 }, { "epoch": 0.9, "grad_norm": 6.519921387019466, "learning_rate": 1.4853241478071599e-08, "logits/chosen": -1.5817980766296387, "logits/rejected": -1.6547319889068604, "logps/chosen": -132.71343994140625, "logps/rejected": -159.65066528320312, "loss": 0.6003, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.16476558148860931, "rewards/margins": 0.182787224650383, "rewards/rejected": -0.018021635711193085, "step": 400 }, { "epoch": 0.9, "eval_logits/chosen": -1.8108444213867188, "eval_logits/rejected": -1.544880986213684, "eval_logps/chosen": -121.37197875976562, "eval_logps/rejected": -144.05641174316406, "eval_loss": 0.6354950666427612, "eval_rewards/accuracies": 0.7326388955116272, "eval_rewards/chosen": 0.1375703364610672, "eval_rewards/margins": 0.13708928227424622, "eval_rewards/rejected": 0.00048106827307492495, "eval_runtime": 109.3237, "eval_samples_per_second": 10.464, "eval_steps_per_second": 0.329, "step": 400 }, { "epoch": 0.92, "grad_norm": 5.560808880302564, "learning_rate": 8.904849411180748e-09, "logits/chosen": -1.5504529476165771, "logits/rejected": -1.675254464149475, "logps/chosen": -141.06692504882812, "logps/rejected": -165.89645385742188, "loss": 0.6023, "rewards/accuracies": 0.84375, "rewards/chosen": 0.1476067751646042, "rewards/margins": 0.19318901002407074, "rewards/rejected": -0.04558226466178894, "step": 410 }, { "epoch": 0.94, "grad_norm": 6.1731600663059005, "learning_rate": 4.45034538815614e-09, "logits/chosen": -1.56648850440979, "logits/rejected": -1.7188094854354858, "logps/chosen": -161.48452758789062, "logps/rejected": -189.99099731445312, "loss": 0.593, "rewards/accuracies": 0.8125, "rewards/chosen": 0.16634421050548553, "rewards/margins": 0.21816936135292053, "rewards/rejected": -0.051825135946273804, "step": 420 }, { "epoch": 0.97, "grad_norm": 5.916412317020735, "learning_rate": 1.5173306705126287e-09, "logits/chosen": -1.5939347743988037, "logits/rejected": -1.6984974145889282, "logps/chosen": -147.58717346191406, "logps/rejected": -158.92880249023438, "loss": 0.5973, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.18250404298305511, "rewards/margins": 0.252483069896698, "rewards/rejected": -0.06997901946306229, "step": 430 }, { "epoch": 0.99, "grad_norm": 5.628332108752967, "learning_rate": 1.239789776653899e-10, "logits/chosen": -1.5746687650680542, "logits/rejected": -1.7450227737426758, "logps/chosen": -137.1623077392578, "logps/rejected": -198.08595275878906, "loss": 0.5957, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1561325490474701, "rewards/margins": 0.1958049237728119, "rewards/rejected": -0.03967234492301941, "step": 440 }, { "epoch": 1.0, "step": 444, "total_flos": 0.0, "train_loss": 0.05915545343278764, "train_runtime": 553.7697, "train_samples_per_second": 51.35, "train_steps_per_second": 0.802 } ], "logging_steps": 10, "max_steps": 444, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }