Minbyul's picture
Model save
0639750 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988751406074241,
"eval_steps": 100,
"global_step": 444,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 5.829373545547037,
"learning_rate": 1.111111111111111e-08,
"logits/chosen": -1.8433172702789307,
"logits/rejected": -2.1778242588043213,
"logps/chosen": -155.12074279785156,
"logps/rejected": -108.14129638671875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 6.4818389334129645,
"learning_rate": 1.111111111111111e-07,
"logits/chosen": -1.727405071258545,
"logits/rejected": -1.8230912685394287,
"logps/chosen": -143.81710815429688,
"logps/rejected": -170.6587371826172,
"loss": 0.693,
"rewards/accuracies": 0.4652777910232544,
"rewards/chosen": 8.138448174577206e-05,
"rewards/margins": 0.0009054330294020474,
"rewards/rejected": -0.0008240485331043601,
"step": 10
},
{
"epoch": 0.04,
"grad_norm": 6.216353393457572,
"learning_rate": 2.222222222222222e-07,
"logits/chosen": -1.7563774585723877,
"logits/rejected": -1.8175561428070068,
"logps/chosen": -156.39651489257812,
"logps/rejected": -182.17941284179688,
"loss": 0.6929,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0005323028308339417,
"rewards/margins": 0.0004471595457289368,
"rewards/rejected": 8.51431759656407e-05,
"step": 20
},
{
"epoch": 0.07,
"grad_norm": 6.0623601927922826,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -1.7707617282867432,
"logits/rejected": -1.9445222616195679,
"logps/chosen": -162.476318359375,
"logps/rejected": -180.51072692871094,
"loss": 0.6926,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.003538253251463175,
"rewards/margins": 0.0020595293026417494,
"rewards/rejected": 0.0014787239488214254,
"step": 30
},
{
"epoch": 0.09,
"grad_norm": 6.203147518363453,
"learning_rate": 4.444444444444444e-07,
"logits/chosen": -1.7849353551864624,
"logits/rejected": -1.9426301717758179,
"logps/chosen": -175.6881866455078,
"logps/rejected": -160.2828369140625,
"loss": 0.6905,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.010533371940255165,
"rewards/margins": 0.0045619565062224865,
"rewards/rejected": 0.005971415434032679,
"step": 40
},
{
"epoch": 0.11,
"grad_norm": 5.821555258105456,
"learning_rate": 4.998062918544441e-07,
"logits/chosen": -1.6218881607055664,
"logits/rejected": -1.7974551916122437,
"logps/chosen": -140.06240844726562,
"logps/rejected": -163.06736755371094,
"loss": 0.687,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.024222631007432938,
"rewards/margins": 0.012534504756331444,
"rewards/rejected": 0.011688126251101494,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 6.110757227734316,
"learning_rate": 4.98258427321406e-07,
"logits/chosen": -1.7357165813446045,
"logits/rejected": -1.8816426992416382,
"logps/chosen": -164.33438110351562,
"logps/rejected": -165.95216369628906,
"loss": 0.6807,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.04671463742852211,
"rewards/margins": 0.026966657489538193,
"rewards/rejected": 0.019747978076338768,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 6.155653196810327,
"learning_rate": 4.951722892251762e-07,
"logits/chosen": -1.6737648248672485,
"logits/rejected": -1.7360236644744873,
"logps/chosen": -158.34616088867188,
"logps/rejected": -189.7154998779297,
"loss": 0.6759,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": 0.06381961703300476,
"rewards/margins": 0.03640252351760864,
"rewards/rejected": 0.02741708979010582,
"step": 70
},
{
"epoch": 0.18,
"grad_norm": 6.083689058170866,
"learning_rate": 4.905670000773126e-07,
"logits/chosen": -1.577292799949646,
"logits/rejected": -1.6474878787994385,
"logps/chosen": -174.16554260253906,
"logps/rejected": -139.80081176757812,
"loss": 0.6741,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.08402051031589508,
"rewards/margins": 0.05129547044634819,
"rewards/rejected": 0.03272503241896629,
"step": 80
},
{
"epoch": 0.2,
"grad_norm": 5.617574112691242,
"learning_rate": 4.844710954430464e-07,
"logits/chosen": -1.6551265716552734,
"logits/rejected": -1.710513710975647,
"logps/chosen": -155.87420654296875,
"logps/rejected": -184.04806518554688,
"loss": 0.6661,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.08863753080368042,
"rewards/margins": 0.054521817713975906,
"rewards/rejected": 0.034115713089704514,
"step": 90
},
{
"epoch": 0.22,
"grad_norm": 5.627048741895505,
"learning_rate": 4.769223471275234e-07,
"logits/chosen": -1.5745666027069092,
"logits/rejected": -1.6258203983306885,
"logps/chosen": -147.27999877929688,
"logps/rejected": -151.06619262695312,
"loss": 0.6595,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.1112900972366333,
"rewards/margins": 0.0667150542140007,
"rewards/rejected": 0.04457502439618111,
"step": 100
},
{
"epoch": 0.22,
"eval_logits/chosen": -1.7720075845718384,
"eval_logits/rejected": -1.495701789855957,
"eval_logps/chosen": -124.06204986572266,
"eval_logps/rejected": -139.30418395996094,
"eval_loss": 0.6646677255630493,
"eval_rewards/accuracies": 0.6875,
"eval_rewards/chosen": 0.1106695607304573,
"eval_rewards/margins": 0.06266607344150543,
"eval_rewards/rejected": 0.04800347983837128,
"eval_runtime": 107.3775,
"eval_samples_per_second": 10.654,
"eval_steps_per_second": 0.335,
"step": 100
},
{
"epoch": 0.25,
"grad_norm": 5.866012556456834,
"learning_rate": 4.6796752913190956e-07,
"logits/chosen": -1.5874210596084595,
"logits/rejected": -1.6103451251983643,
"logps/chosen": -155.8997039794922,
"logps/rejected": -162.63836669921875,
"loss": 0.6579,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.12256599962711334,
"rewards/margins": 0.07956713438034058,
"rewards/rejected": 0.04299888014793396,
"step": 110
},
{
"epoch": 0.27,
"grad_norm": 5.609183180938371,
"learning_rate": 4.576621278295557e-07,
"logits/chosen": -1.5197416543960571,
"logits/rejected": -1.572852373123169,
"logps/chosen": -147.88705444335938,
"logps/rejected": -145.33999633789062,
"loss": 0.6512,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": 0.13489821553230286,
"rewards/margins": 0.0956321507692337,
"rewards/rejected": 0.03926606848835945,
"step": 120
},
{
"epoch": 0.29,
"grad_norm": 5.6105044223251355,
"learning_rate": 4.4606999815804657e-07,
"logits/chosen": -1.4735063314437866,
"logits/rejected": -1.662398338317871,
"logps/chosen": -146.32366943359375,
"logps/rejected": -139.0260009765625,
"loss": 0.6492,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.1404353231191635,
"rewards/margins": 0.10020889341831207,
"rewards/rejected": 0.04022643715143204,
"step": 130
},
{
"epoch": 0.31,
"grad_norm": 5.559220884446223,
"learning_rate": 4.332629679574565e-07,
"logits/chosen": -1.4670491218566895,
"logits/rejected": -1.6285909414291382,
"logps/chosen": -148.60751342773438,
"logps/rejected": -174.4378204345703,
"loss": 0.6456,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.12996497750282288,
"rewards/margins": 0.07584364712238312,
"rewards/rejected": 0.05412132665514946,
"step": 140
},
{
"epoch": 0.34,
"grad_norm": 5.496701088365727,
"learning_rate": 4.193203929064353e-07,
"logits/chosen": -1.4563395977020264,
"logits/rejected": -1.5474860668182373,
"logps/chosen": -142.05953979492188,
"logps/rejected": -161.18702697753906,
"loss": 0.6406,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.16432908177375793,
"rewards/margins": 0.11925216019153595,
"rewards/rejected": 0.04507693648338318,
"step": 150
},
{
"epoch": 0.36,
"grad_norm": 5.87004256418159,
"learning_rate": 4.043286648138538e-07,
"logits/chosen": -1.4940943717956543,
"logits/rejected": -1.5696378946304321,
"logps/chosen": -144.10693359375,
"logps/rejected": -174.38937377929688,
"loss": 0.6446,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.14990444481372833,
"rewards/margins": 0.10559757798910141,
"rewards/rejected": 0.04430687427520752,
"step": 160
},
{
"epoch": 0.38,
"grad_norm": 6.736125236884026,
"learning_rate": 3.883806763127647e-07,
"logits/chosen": -1.4956731796264648,
"logits/rejected": -1.5208299160003662,
"logps/chosen": -154.81716918945312,
"logps/rejected": -155.2576904296875,
"loss": 0.6388,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.17543208599090576,
"rewards/margins": 0.13297812640666962,
"rewards/rejected": 0.04245396703481674,
"step": 170
},
{
"epoch": 0.4,
"grad_norm": 5.520870747493312,
"learning_rate": 3.715752452735703e-07,
"logits/chosen": -1.518593192100525,
"logits/rejected": -1.6800349950790405,
"logps/chosen": -140.48988342285156,
"logps/rejected": -161.6919708251953,
"loss": 0.6328,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.17410950362682343,
"rewards/margins": 0.13047902286052704,
"rewards/rejected": 0.04363049194216728,
"step": 180
},
{
"epoch": 0.43,
"grad_norm": 5.706390091330182,
"learning_rate": 3.540165025028843e-07,
"logits/chosen": -1.5428271293640137,
"logits/rejected": -1.6062263250350952,
"logps/chosen": -159.2704315185547,
"logps/rejected": -173.2039031982422,
"loss": 0.6286,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.18463760614395142,
"rewards/margins": 0.13972006738185883,
"rewards/rejected": 0.04491753131151199,
"step": 190
},
{
"epoch": 0.45,
"grad_norm": 5.847027641584725,
"learning_rate": 3.358132465220639e-07,
"logits/chosen": -1.4393140077590942,
"logits/rejected": -1.5474971532821655,
"logps/chosen": -148.5250244140625,
"logps/rejected": -156.9046173095703,
"loss": 0.6273,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.1776243895292282,
"rewards/margins": 0.1414380818605423,
"rewards/rejected": 0.03618631511926651,
"step": 200
},
{
"epoch": 0.45,
"eval_logits/chosen": -1.7316410541534424,
"eval_logits/rejected": -1.465333342552185,
"eval_logps/chosen": -119.38525390625,
"eval_logps/rejected": -138.4956817626953,
"eval_loss": 0.6494045853614807,
"eval_rewards/accuracies": 0.6979166865348816,
"eval_rewards/chosen": 0.1574375331401825,
"eval_rewards/margins": 0.10134916752576828,
"eval_rewards/rejected": 0.05608838051557541,
"eval_runtime": 106.286,
"eval_samples_per_second": 10.763,
"eval_steps_per_second": 0.339,
"step": 200
},
{
"epoch": 0.47,
"grad_norm": 5.808496828700401,
"learning_rate": 3.170782694233712e-07,
"logits/chosen": -1.4331612586975098,
"logits/rejected": -1.62355637550354,
"logps/chosen": -132.7198944091797,
"logps/rejected": -162.63983154296875,
"loss": 0.6259,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.1818162202835083,
"rewards/margins": 0.14457334578037262,
"rewards/rejected": 0.03724289312958717,
"step": 210
},
{
"epoch": 0.49,
"grad_norm": 6.450138104412953,
"learning_rate": 2.979276579809346e-07,
"logits/chosen": -1.567256212234497,
"logits/rejected": -1.662076711654663,
"logps/chosen": -139.86077880859375,
"logps/rejected": -170.76498413085938,
"loss": 0.6226,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.17449909448623657,
"rewards/margins": 0.1373990774154663,
"rewards/rejected": 0.03710002452135086,
"step": 220
},
{
"epoch": 0.52,
"grad_norm": 5.305590860314572,
"learning_rate": 2.78480074347007e-07,
"logits/chosen": -1.4688160419464111,
"logits/rejected": -1.6507971286773682,
"logps/chosen": -155.4250030517578,
"logps/rejected": -139.8217010498047,
"loss": 0.6187,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.185538649559021,
"rewards/margins": 0.1848856508731842,
"rewards/rejected": 0.000652993272524327,
"step": 230
},
{
"epoch": 0.54,
"grad_norm": 5.538376518324725,
"learning_rate": 2.588560207905135e-07,
"logits/chosen": -1.5921481847763062,
"logits/rejected": -1.6697231531143188,
"logps/chosen": -163.6059112548828,
"logps/rejected": -150.09193420410156,
"loss": 0.6086,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.21226021647453308,
"rewards/margins": 0.19374233484268188,
"rewards/rejected": 0.018517881631851196,
"step": 240
},
{
"epoch": 0.56,
"grad_norm": 5.203072441741653,
"learning_rate": 2.391770930337597e-07,
"logits/chosen": -1.5545365810394287,
"logits/rejected": -1.5908061265945435,
"logps/chosen": -140.0444793701172,
"logps/rejected": -160.80111694335938,
"loss": 0.6191,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.1901070922613144,
"rewards/margins": 0.17807592451572418,
"rewards/rejected": 0.012031197547912598,
"step": 250
},
{
"epoch": 0.58,
"grad_norm": 5.475288227895773,
"learning_rate": 2.195652268138194e-07,
"logits/chosen": -1.567275047302246,
"logits/rejected": -1.6613355875015259,
"logps/chosen": -150.01036071777344,
"logps/rejected": -157.63027954101562,
"loss": 0.6119,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.1891297549009323,
"rewards/margins": 0.18906521797180176,
"rewards/rejected": 6.455164111685008e-05,
"step": 260
},
{
"epoch": 0.61,
"grad_norm": 5.992354186949266,
"learning_rate": 2.001419423371019e-07,
"logits/chosen": -1.47898268699646,
"logits/rejected": -1.5700337886810303,
"logps/chosen": -134.41952514648438,
"logps/rejected": -160.8531494140625,
"loss": 0.6083,
"rewards/accuracies": 0.8062499761581421,
"rewards/chosen": 0.18836051225662231,
"rewards/margins": 0.16941113770008087,
"rewards/rejected": 0.018949372693896294,
"step": 270
},
{
"epoch": 0.63,
"grad_norm": 6.043059967391702,
"learning_rate": 1.810275913086562e-07,
"logits/chosen": -1.482757329940796,
"logits/rejected": -1.648633599281311,
"logps/chosen": -158.1710968017578,
"logps/rejected": -164.2964324951172,
"loss": 0.6129,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17465534806251526,
"rewards/margins": 0.1687730997800827,
"rewards/rejected": 0.0058822231367230415,
"step": 280
},
{
"epoch": 0.65,
"grad_norm": 5.822282255796662,
"learning_rate": 1.6234061120181143e-07,
"logits/chosen": -1.5249128341674805,
"logits/rejected": -1.6839654445648193,
"logps/chosen": -130.04713439941406,
"logps/rejected": -178.07696533203125,
"loss": 0.6064,
"rewards/accuracies": 0.856249988079071,
"rewards/chosen": 0.20361635088920593,
"rewards/margins": 0.20471592247486115,
"rewards/rejected": -0.0010995581978932023,
"step": 290
},
{
"epoch": 0.67,
"grad_norm": 5.917241220768849,
"learning_rate": 1.4419679138889375e-07,
"logits/chosen": -1.4709835052490234,
"logits/rejected": -1.7355806827545166,
"logps/chosen": -156.6675262451172,
"logps/rejected": -172.46078491210938,
"loss": 0.6009,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.1793396770954132,
"rewards/margins": 0.20549102127552032,
"rewards/rejected": -0.02615133859217167,
"step": 300
},
{
"epoch": 0.67,
"eval_logits/chosen": -1.777042269706726,
"eval_logits/rejected": -1.5097768306732178,
"eval_logps/chosen": -120.27433013916016,
"eval_logps/rejected": -141.86488342285156,
"eval_loss": 0.6398369669914246,
"eval_rewards/accuracies": 0.71875,
"eval_rewards/chosen": 0.14854662120342255,
"eval_rewards/margins": 0.12615016102790833,
"eval_rewards/rejected": 0.022396454587578773,
"eval_runtime": 111.134,
"eval_samples_per_second": 10.294,
"eval_steps_per_second": 0.324,
"step": 300
},
{
"epoch": 0.7,
"grad_norm": 5.942805799246918,
"learning_rate": 1.2670855568026362e-07,
"logits/chosen": -1.552185297012329,
"logits/rejected": -1.6878124475479126,
"logps/chosen": -135.38902282714844,
"logps/rejected": -174.15255737304688,
"loss": 0.6031,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.17009037733078003,
"rewards/margins": 0.18223796784877777,
"rewards/rejected": -0.012147602625191212,
"step": 310
},
{
"epoch": 0.72,
"grad_norm": 5.410484098522815,
"learning_rate": 1.0998426571724643e-07,
"logits/chosen": -1.5845314264297485,
"logits/rejected": -1.6747452020645142,
"logps/chosen": -146.5388641357422,
"logps/rejected": -157.44863891601562,
"loss": 0.5989,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.1947994828224182,
"rewards/margins": 0.21365301311016083,
"rewards/rejected": -0.01885353960096836,
"step": 320
},
{
"epoch": 0.74,
"grad_norm": 5.323259223525621,
"learning_rate": 9.412754953531663e-08,
"logits/chosen": -1.560361623764038,
"logits/rejected": -1.6760743856430054,
"logps/chosen": -147.3408966064453,
"logps/rejected": -164.4519500732422,
"loss": 0.5956,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.18070648610591888,
"rewards/margins": 0.22251346707344055,
"rewards/rejected": -0.04180694743990898,
"step": 330
},
{
"epoch": 0.76,
"grad_norm": 6.0603812493866025,
"learning_rate": 7.923665945792943e-08,
"logits/chosen": -1.542307734489441,
"logits/rejected": -1.6773264408111572,
"logps/chosen": -132.24139404296875,
"logps/rejected": -148.74737548828125,
"loss": 0.6007,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.17221280932426453,
"rewards/margins": 0.211787611246109,
"rewards/rejected": -0.03957480937242508,
"step": 340
},
{
"epoch": 0.79,
"grad_norm": 5.800803684922802,
"learning_rate": 6.540386329965863e-08,
"logits/chosen": -1.613059639930725,
"logits/rejected": -1.6966331005096436,
"logps/chosen": -155.21559143066406,
"logps/rejected": -161.65882873535156,
"loss": 0.597,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.17716926336288452,
"rewards/margins": 0.21501335501670837,
"rewards/rejected": -0.03784411773085594,
"step": 350
},
{
"epoch": 0.81,
"grad_norm": 5.294198300700879,
"learning_rate": 5.271487265090163e-08,
"logits/chosen": -1.605891466140747,
"logits/rejected": -1.6633691787719727,
"logps/chosen": -133.00123596191406,
"logps/rejected": -176.7678680419922,
"loss": 0.5888,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.17767903208732605,
"rewards/margins": 0.228514164686203,
"rewards/rejected": -0.05083512142300606,
"step": 360
},
{
"epoch": 0.83,
"grad_norm": 6.853203056351755,
"learning_rate": 4.1248311786649394e-08,
"logits/chosen": -1.6259254217147827,
"logits/rejected": -1.7257139682769775,
"logps/chosen": -135.5113525390625,
"logps/rejected": -180.2209014892578,
"loss": 0.5989,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17052185535430908,
"rewards/margins": 0.2071322202682495,
"rewards/rejected": -0.03661039471626282,
"step": 370
},
{
"epoch": 0.85,
"grad_norm": 5.58249557148135,
"learning_rate": 3.107523049009983e-08,
"logits/chosen": -1.5495421886444092,
"logits/rejected": -1.6909148693084717,
"logps/chosen": -148.41799926757812,
"logps/rejected": -188.6688995361328,
"loss": 0.5986,
"rewards/accuracies": 0.90625,
"rewards/chosen": 0.18436935544013977,
"rewards/margins": 0.21894951164722443,
"rewards/rejected": -0.03458016738295555,
"step": 380
},
{
"epoch": 0.88,
"grad_norm": 6.065104934238928,
"learning_rate": 2.2258663809784888e-08,
"logits/chosen": -1.556806206703186,
"logits/rejected": -1.6664282083511353,
"logps/chosen": -134.76539611816406,
"logps/rejected": -166.59054565429688,
"loss": 0.598,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.19658346474170685,
"rewards/margins": 0.23869290947914124,
"rewards/rejected": -0.04210943728685379,
"step": 390
},
{
"epoch": 0.9,
"grad_norm": 6.519921387019466,
"learning_rate": 1.4853241478071599e-08,
"logits/chosen": -1.5817980766296387,
"logits/rejected": -1.6547319889068604,
"logps/chosen": -132.71343994140625,
"logps/rejected": -159.65066528320312,
"loss": 0.6003,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.16476558148860931,
"rewards/margins": 0.182787224650383,
"rewards/rejected": -0.018021635711193085,
"step": 400
},
{
"epoch": 0.9,
"eval_logits/chosen": -1.8108444213867188,
"eval_logits/rejected": -1.544880986213684,
"eval_logps/chosen": -121.37197875976562,
"eval_logps/rejected": -144.05641174316406,
"eval_loss": 0.6354950666427612,
"eval_rewards/accuracies": 0.7326388955116272,
"eval_rewards/chosen": 0.1375703364610672,
"eval_rewards/margins": 0.13708928227424622,
"eval_rewards/rejected": 0.00048106827307492495,
"eval_runtime": 109.3237,
"eval_samples_per_second": 10.464,
"eval_steps_per_second": 0.329,
"step": 400
},
{
"epoch": 0.92,
"grad_norm": 5.560808880302564,
"learning_rate": 8.904849411180748e-09,
"logits/chosen": -1.5504529476165771,
"logits/rejected": -1.675254464149475,
"logps/chosen": -141.06692504882812,
"logps/rejected": -165.89645385742188,
"loss": 0.6023,
"rewards/accuracies": 0.84375,
"rewards/chosen": 0.1476067751646042,
"rewards/margins": 0.19318901002407074,
"rewards/rejected": -0.04558226466178894,
"step": 410
},
{
"epoch": 0.94,
"grad_norm": 6.1731600663059005,
"learning_rate": 4.45034538815614e-09,
"logits/chosen": -1.56648850440979,
"logits/rejected": -1.7188094854354858,
"logps/chosen": -161.48452758789062,
"logps/rejected": -189.99099731445312,
"loss": 0.593,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.16634421050548553,
"rewards/margins": 0.21816936135292053,
"rewards/rejected": -0.051825135946273804,
"step": 420
},
{
"epoch": 0.97,
"grad_norm": 5.916412317020735,
"learning_rate": 1.5173306705126287e-09,
"logits/chosen": -1.5939347743988037,
"logits/rejected": -1.6984974145889282,
"logps/chosen": -147.58717346191406,
"logps/rejected": -158.92880249023438,
"loss": 0.5973,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.18250404298305511,
"rewards/margins": 0.252483069896698,
"rewards/rejected": -0.06997901946306229,
"step": 430
},
{
"epoch": 0.99,
"grad_norm": 5.628332108752967,
"learning_rate": 1.239789776653899e-10,
"logits/chosen": -1.5746687650680542,
"logits/rejected": -1.7450227737426758,
"logps/chosen": -137.1623077392578,
"logps/rejected": -198.08595275878906,
"loss": 0.5957,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.1561325490474701,
"rewards/margins": 0.1958049237728119,
"rewards/rejected": -0.03967234492301941,
"step": 440
},
{
"epoch": 1.0,
"step": 444,
"total_flos": 0.0,
"train_loss": 0.05915545343278764,
"train_runtime": 553.7697,
"train_samples_per_second": 51.35,
"train_steps_per_second": 0.802
}
],
"logging_steps": 10,
"max_steps": 444,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}