zephyr-gemma-2-9b-dpo-4k / trainer_state.json
tanliboy's picture
Model save
772f9e4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 100,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002093692750588851,
"grad_norm": 4.870537596158184,
"learning_rate": 4.166666666666666e-09,
"logits/chosen": -7.072341442108154,
"logits/rejected": -6.540944576263428,
"logps/chosen": -346.17401123046875,
"logps/rejected": -373.49456787109375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02093692750588851,
"grad_norm": 5.556671250581048,
"learning_rate": 4.166666666666667e-08,
"logits/chosen": -6.266731262207031,
"logits/rejected": -6.149946212768555,
"logps/chosen": -350.89019775390625,
"logps/rejected": -355.3866271972656,
"loss": 0.6932,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 0.000294171943096444,
"rewards/margins": 0.0003653134626802057,
"rewards/rejected": -7.114150503184646e-05,
"step": 10
},
{
"epoch": 0.04187385501177702,
"grad_norm": 5.947484840987328,
"learning_rate": 8.333333333333334e-08,
"logits/chosen": -6.901098728179932,
"logits/rejected": -6.428101539611816,
"logps/chosen": -368.4015197753906,
"logps/rejected": -332.3041687011719,
"loss": 0.6929,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.000805316842161119,
"rewards/margins": 0.0013799300650134683,
"rewards/rejected": -0.0005746129900217056,
"step": 20
},
{
"epoch": 0.06281078251766553,
"grad_norm": 4.7415695218728455,
"learning_rate": 1.25e-07,
"logits/chosen": -6.833685874938965,
"logits/rejected": -6.713122367858887,
"logps/chosen": -381.8787536621094,
"logps/rejected": -335.8555603027344,
"loss": 0.6922,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0064066750928759575,
"rewards/margins": 0.0018916327971965075,
"rewards/rejected": 0.004515042062848806,
"step": 30
},
{
"epoch": 0.08374771002355404,
"grad_norm": 4.440630542343055,
"learning_rate": 1.6666666666666668e-07,
"logits/chosen": -7.030734062194824,
"logits/rejected": -6.716243743896484,
"logps/chosen": -374.63531494140625,
"logps/rejected": -338.505126953125,
"loss": 0.6894,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.025013629347085953,
"rewards/margins": 0.007752637378871441,
"rewards/rejected": 0.017260991036891937,
"step": 40
},
{
"epoch": 0.10468463752944256,
"grad_norm": 4.761237773867083,
"learning_rate": 1.9998927475076103e-07,
"logits/chosen": -6.727326393127441,
"logits/rejected": -6.21535587310791,
"logps/chosen": -338.68768310546875,
"logps/rejected": -324.8238830566406,
"loss": 0.6853,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.04892030358314514,
"rewards/margins": 0.014415493234992027,
"rewards/rejected": 0.034504808485507965,
"step": 50
},
{
"epoch": 0.12562156503533106,
"grad_norm": 5.119072453981655,
"learning_rate": 1.9961413253717213e-07,
"logits/chosen": -7.186664581298828,
"logits/rejected": -6.633403778076172,
"logps/chosen": -326.31439208984375,
"logps/rejected": -301.0088195800781,
"loss": 0.6768,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": 0.07451288402080536,
"rewards/margins": 0.03354056924581528,
"rewards/rejected": 0.04097231104969978,
"step": 60
},
{
"epoch": 0.14655849254121958,
"grad_norm": 5.007390249181196,
"learning_rate": 1.9870502626379125e-07,
"logits/chosen": -7.373200416564941,
"logits/rejected": -7.114656925201416,
"logps/chosen": -356.11444091796875,
"logps/rejected": -357.88812255859375,
"loss": 0.6678,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.04506208747625351,
"rewards/margins": 0.0459139421582222,
"rewards/rejected": -0.0008518520626239479,
"step": 70
},
{
"epoch": 0.16749542004710807,
"grad_norm": 5.277515253273024,
"learning_rate": 1.9726682903510838e-07,
"logits/chosen": -8.06989860534668,
"logits/rejected": -7.776799201965332,
"logps/chosen": -383.57763671875,
"logps/rejected": -360.22412109375,
"loss": 0.6567,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.008996413089334965,
"rewards/margins": 0.0934341698884964,
"rewards/rejected": -0.08443775773048401,
"step": 80
},
{
"epoch": 0.1884323475529966,
"grad_norm": 5.718904864627125,
"learning_rate": 1.9530725005474194e-07,
"logits/chosen": -8.55430793762207,
"logits/rejected": -8.116801261901855,
"logps/chosen": -343.30426025390625,
"logps/rejected": -344.4271545410156,
"loss": 0.6418,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.09966368973255157,
"rewards/margins": 0.11764608323574066,
"rewards/rejected": -0.21730978786945343,
"step": 90
},
{
"epoch": 0.2093692750588851,
"grad_norm": 6.2104459729271335,
"learning_rate": 1.9283679330160724e-07,
"logits/chosen": -8.740680694580078,
"logits/rejected": -8.447057723999023,
"logps/chosen": -355.4083557128906,
"logps/rejected": -336.2757263183594,
"loss": 0.6273,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.18184241652488708,
"rewards/margins": 0.17646023631095886,
"rewards/rejected": -0.35830265283584595,
"step": 100
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -9.086737632751465,
"eval_logits/rejected": -8.745743751525879,
"eval_logps/chosen": -391.9050598144531,
"eval_logps/rejected": -385.3993835449219,
"eval_loss": 0.6256434321403503,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": -0.2307606041431427,
"eval_rewards/margins": 0.16034501791000366,
"eval_rewards/rejected": -0.391105592250824,
"eval_runtime": 241.3872,
"eval_samples_per_second": 8.285,
"eval_steps_per_second": 1.036,
"step": 100
},
{
"epoch": 0.23030620256477363,
"grad_norm": 6.2752590431966375,
"learning_rate": 1.898687012251826e-07,
"logits/chosen": -9.274371147155762,
"logits/rejected": -9.027790069580078,
"logps/chosen": -357.4116516113281,
"logps/rejected": -360.20465087890625,
"loss": 0.6273,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.2726437747478485,
"rewards/margins": 0.17124271392822266,
"rewards/rejected": -0.44388651847839355,
"step": 110
},
{
"epoch": 0.2512431300706621,
"grad_norm": 6.345691652928505,
"learning_rate": 1.8641888376168482e-07,
"logits/chosen": -9.697042465209961,
"logits/rejected": -9.629292488098145,
"logps/chosen": -399.30670166015625,
"logps/rejected": -386.90557861328125,
"loss": 0.6059,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28612643480300903,
"rewards/margins": 0.24121908843517303,
"rewards/rejected": -0.5273455381393433,
"step": 120
},
{
"epoch": 0.2721800575765506,
"grad_norm": 7.047283511946962,
"learning_rate": 1.8250583305165094e-07,
"logits/chosen": -10.05250358581543,
"logits/rejected": -9.847970008850098,
"logps/chosen": -378.30462646484375,
"logps/rejected": -395.7349548339844,
"loss": 0.5913,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.3639756739139557,
"rewards/margins": 0.2538781762123108,
"rewards/rejected": -0.6178538799285889,
"step": 130
},
{
"epoch": 0.29311698508243916,
"grad_norm": 8.250566588104485,
"learning_rate": 1.78150524316067e-07,
"logits/chosen": -10.428335189819336,
"logits/rejected": -10.18110179901123,
"logps/chosen": -423.37896728515625,
"logps/rejected": -418.40814208984375,
"loss": 0.5918,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.36636847257614136,
"rewards/margins": 0.3312264084815979,
"rewards/rejected": -0.6975948214530945,
"step": 140
},
{
"epoch": 0.31405391258832765,
"grad_norm": 7.160872331336618,
"learning_rate": 1.7337630342238038e-07,
"logits/chosen": -10.668364524841309,
"logits/rejected": -10.247892379760742,
"logps/chosen": -438.568603515625,
"logps/rejected": -439.31243896484375,
"loss": 0.5943,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.44665470719337463,
"rewards/margins": 0.35836517810821533,
"rewards/rejected": -0.8050198554992676,
"step": 150
},
{
"epoch": 0.33499084009421615,
"grad_norm": 8.012025922238234,
"learning_rate": 1.682087617430782e-07,
"logits/chosen": -10.813860893249512,
"logits/rejected": -10.625129699707031,
"logps/chosen": -402.5640563964844,
"logps/rejected": -415.65032958984375,
"loss": 0.5833,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.584823489189148,
"rewards/margins": 0.2518795430660248,
"rewards/rejected": -0.8367029428482056,
"step": 160
},
{
"epoch": 0.3559277676001047,
"grad_norm": 8.371912815572413,
"learning_rate": 1.6267559897763025e-07,
"logits/chosen": -10.72344970703125,
"logits/rejected": -10.272022247314453,
"logps/chosen": -365.57806396484375,
"logps/rejected": -445.9237365722656,
"loss": 0.5881,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5119096040725708,
"rewards/margins": 0.36096295714378357,
"rewards/rejected": -0.872872531414032,
"step": 170
},
{
"epoch": 0.3768646951059932,
"grad_norm": 8.291785272423704,
"learning_rate": 1.5680647467311557e-07,
"logits/chosen": -11.077351570129395,
"logits/rejected": -10.810079574584961,
"logps/chosen": -403.3762512207031,
"logps/rejected": -436.90411376953125,
"loss": 0.5611,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.5242770910263062,
"rewards/margins": 0.4779755473136902,
"rewards/rejected": -1.0022525787353516,
"step": 180
},
{
"epoch": 0.39780162261188173,
"grad_norm": 8.768402559709461,
"learning_rate": 1.506328492394303e-07,
"logits/chosen": -10.848445892333984,
"logits/rejected": -10.747730255126953,
"logps/chosen": -361.96917724609375,
"logps/rejected": -427.53131103515625,
"loss": 0.5678,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6229602098464966,
"rewards/margins": 0.3965635299682617,
"rewards/rejected": -1.0195238590240479,
"step": 190
},
{
"epoch": 0.4187385501177702,
"grad_norm": 10.10996155431003,
"learning_rate": 1.4418781531128634e-07,
"logits/chosen": -11.236587524414062,
"logits/rejected": -11.044486045837402,
"logps/chosen": -414.62225341796875,
"logps/rejected": -419.2083435058594,
"loss": 0.5701,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.6426066160202026,
"rewards/margins": 0.4211527705192566,
"rewards/rejected": -1.0637595653533936,
"step": 200
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -11.197042465209961,
"eval_logits/rejected": -10.930534362792969,
"eval_logps/chosen": -442.0617370605469,
"eval_logps/rejected": -458.7671203613281,
"eval_loss": 0.5679408311843872,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": -0.7323274612426758,
"eval_rewards/margins": 0.39245596528053284,
"eval_rewards/rejected": -1.1247833967208862,
"eval_runtime": 238.8989,
"eval_samples_per_second": 8.372,
"eval_steps_per_second": 1.046,
"step": 200
},
{
"epoch": 0.4396754776236587,
"grad_norm": 8.838084489326292,
"learning_rate": 1.375059203609562e-07,
"logits/chosen": -11.455360412597656,
"logits/rejected": -11.19542121887207,
"logps/chosen": -448.05706787109375,
"logps/rejected": -442.106689453125,
"loss": 0.579,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.6968704462051392,
"rewards/margins": 0.5110000967979431,
"rewards/rejected": -1.207870364189148,
"step": 210
},
{
"epoch": 0.46061240512954726,
"grad_norm": 11.66423950154552,
"learning_rate": 1.306229815126159e-07,
"logits/chosen": -10.951299667358398,
"logits/rejected": -10.760453224182129,
"logps/chosen": -387.4031066894531,
"logps/rejected": -438.4293518066406,
"loss": 0.5669,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6763008832931519,
"rewards/margins": 0.459463506937027,
"rewards/rejected": -1.1357643604278564,
"step": 220
},
{
"epoch": 0.48154933263543576,
"grad_norm": 8.962170857968381,
"learning_rate": 1.2357589355094274e-07,
"logits/chosen": -11.39826488494873,
"logits/rejected": -11.283321380615234,
"logps/chosen": -385.0740966796875,
"logps/rejected": -439.373291015625,
"loss": 0.5679,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6969426870346069,
"rewards/margins": 0.4374031126499176,
"rewards/rejected": -1.1343457698822021,
"step": 230
},
{
"epoch": 0.5024862601413242,
"grad_norm": 9.228744844021563,
"learning_rate": 1.1640243115310217e-07,
"logits/chosen": -11.526562690734863,
"logits/rejected": -11.38569450378418,
"logps/chosen": -431.7037048339844,
"logps/rejected": -451.6121520996094,
"loss": 0.5411,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.6818773150444031,
"rewards/margins": 0.4736524522304535,
"rewards/rejected": -1.1555297374725342,
"step": 240
},
{
"epoch": 0.5234231876472127,
"grad_norm": 9.475835105940853,
"learning_rate": 1.0914104640422679e-07,
"logits/chosen": -11.973138809204102,
"logits/rejected": -11.678539276123047,
"logps/chosen": -438.6128845214844,
"logps/rejected": -494.81475830078125,
"loss": 0.5498,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7578285932540894,
"rewards/margins": 0.5199133157730103,
"rewards/rejected": -1.2777419090270996,
"step": 250
},
{
"epoch": 0.5443601151531012,
"grad_norm": 10.55001524910548,
"learning_rate": 1.0183066268176774e-07,
"logits/chosen": -11.676985740661621,
"logits/rejected": -11.725804328918457,
"logps/chosen": -406.6064453125,
"logps/rejected": -454.85015869140625,
"loss": 0.5528,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6437905430793762,
"rewards/margins": 0.597993791103363,
"rewards/rejected": -1.2417843341827393,
"step": 260
},
{
"epoch": 0.5652970426589898,
"grad_norm": 10.40303673883758,
"learning_rate": 9.451046601356724e-08,
"logits/chosen": -11.600053787231445,
"logits/rejected": -11.353797912597656,
"logps/chosen": -433.164794921875,
"logps/rejected": -443.1178283691406,
"loss": 0.5492,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8422502279281616,
"rewards/margins": 0.5629727244377136,
"rewards/rejected": -1.40522301197052,
"step": 270
},
{
"epoch": 0.5862339701648783,
"grad_norm": 10.92362637516926,
"learning_rate": 8.721969502803953e-08,
"logits/chosen": -11.430012702941895,
"logits/rejected": -11.168180465698242,
"logps/chosen": -443.4029846191406,
"logps/rejected": -476.8294982910156,
"loss": 0.5538,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.9203767776489258,
"rewards/margins": 0.4110565781593323,
"rewards/rejected": -1.3314332962036133,
"step": 280
},
{
"epoch": 0.6071708976707668,
"grad_norm": 9.76250775652993,
"learning_rate": 7.999743062239557e-08,
"logits/chosen": -11.536198616027832,
"logits/rejected": -11.403619766235352,
"logps/chosen": -425.9610900878906,
"logps/rejected": -467.23291015625,
"loss": 0.5452,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7125166654586792,
"rewards/margins": 0.6340970396995544,
"rewards/rejected": -1.3466136455535889,
"step": 290
},
{
"epoch": 0.6281078251766553,
"grad_norm": 12.956036139139355,
"learning_rate": 7.28823864763583e-08,
"logits/chosen": -11.646090507507324,
"logits/rejected": -11.440874099731445,
"logps/chosen": -433.30120849609375,
"logps/rejected": -482.5865173339844,
"loss": 0.5398,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.790636420249939,
"rewards/margins": 0.5609198808670044,
"rewards/rejected": -1.351556420326233,
"step": 300
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -11.52165412902832,
"eval_logits/rejected": -11.35400676727295,
"eval_logps/chosen": -458.75299072265625,
"eval_logps/rejected": -483.2173156738281,
"eval_loss": 0.5490842461585999,
"eval_rewards/accuracies": 0.6840000152587891,
"eval_rewards/chosen": -0.8992397785186768,
"eval_rewards/margins": 0.47004497051239014,
"eval_rewards/rejected": -1.369284749031067,
"eval_runtime": 222.3735,
"eval_samples_per_second": 8.994,
"eval_steps_per_second": 1.124,
"step": 300
},
{
"epoch": 0.6490447526825438,
"grad_norm": 11.066017903277775,
"learning_rate": 6.591270153428288e-08,
"logits/chosen": -11.68547248840332,
"logits/rejected": -11.603824615478516,
"logps/chosen": -465.458740234375,
"logps/rejected": -478.3704528808594,
"loss": 0.5356,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8401340246200562,
"rewards/margins": 0.5598424673080444,
"rewards/rejected": -1.3999764919281006,
"step": 310
},
{
"epoch": 0.6699816801884323,
"grad_norm": 11.177050758944128,
"learning_rate": 5.912573556804452e-08,
"logits/chosen": -11.293741226196289,
"logits/rejected": -11.169164657592773,
"logps/chosen": -391.907958984375,
"logps/rejected": -435.26190185546875,
"loss": 0.5324,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7623863220214844,
"rewards/margins": 0.5452743768692017,
"rewards/rejected": -1.3076608180999756,
"step": 320
},
{
"epoch": 0.6909186076943209,
"grad_norm": 13.652856840334575,
"learning_rate": 5.255786891654399e-08,
"logits/chosen": -11.769269943237305,
"logits/rejected": -11.3505220413208,
"logps/chosen": -447.8871154785156,
"logps/rejected": -496.74029541015625,
"loss": 0.537,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8857334852218628,
"rewards/margins": 0.6645745038986206,
"rewards/rejected": -1.5503078699111938,
"step": 330
},
{
"epoch": 0.7118555352002094,
"grad_norm": 11.172216313095676,
"learning_rate": 4.624430747529102e-08,
"logits/chosen": -11.618499755859375,
"logits/rejected": -11.424800872802734,
"logps/chosen": -420.2229919433594,
"logps/rejected": -472.84033203125,
"loss": 0.5363,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.7942701578140259,
"rewards/margins": 0.6123701333999634,
"rewards/rejected": -1.4066402912139893,
"step": 340
},
{
"epoch": 0.7327924627060979,
"grad_norm": 12.255527795835215,
"learning_rate": 4.0218893981385925e-08,
"logits/chosen": -11.575703620910645,
"logits/rejected": -11.422430038452148,
"logps/chosen": -432.9644470214844,
"logps/rejected": -470.8561096191406,
"loss": 0.5185,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8515139818191528,
"rewards/margins": 0.5291236042976379,
"rewards/rejected": -1.380637526512146,
"step": 350
},
{
"epoch": 0.7537293902119864,
"grad_norm": 12.049137551707718,
"learning_rate": 3.45139266054715e-08,
"logits/chosen": -11.708954811096191,
"logits/rejected": -11.447134017944336,
"logps/chosen": -482.3265686035156,
"logps/rejected": -521.8561401367188,
"loss": 0.547,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.885657787322998,
"rewards/margins": 0.5782418251037598,
"rewards/rejected": -1.4638997316360474,
"step": 360
},
{
"epoch": 0.7746663177178749,
"grad_norm": 10.956723652484643,
"learning_rate": 2.9159985823062993e-08,
"logits/chosen": -11.063775062561035,
"logits/rejected": -11.105363845825195,
"logps/chosen": -469.81201171875,
"logps/rejected": -485.8446350097656,
"loss": 0.54,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8363800048828125,
"rewards/margins": 0.5975291132926941,
"rewards/rejected": -1.4339090585708618,
"step": 370
},
{
"epoch": 0.7956032452237635,
"grad_norm": 13.069162308279825,
"learning_rate": 2.4185770493280577e-08,
"logits/chosen": -11.262879371643066,
"logits/rejected": -11.126230239868164,
"logps/chosen": -455.2259216308594,
"logps/rejected": -489.8641052246094,
"loss": 0.5364,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9804290533065796,
"rewards/margins": 0.48143139481544495,
"rewards/rejected": -1.4618604183197021,
"step": 380
},
{
"epoch": 0.816540172729652,
"grad_norm": 13.1101074898303,
"learning_rate": 1.9617944023656108e-08,
"logits/chosen": -11.852777481079102,
"logits/rejected": -11.575521469116211,
"logps/chosen": -470.88604736328125,
"logps/rejected": -511.36767578125,
"loss": 0.532,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.876979649066925,
"rewards/margins": 0.5152709484100342,
"rewards/rejected": -1.3922507762908936,
"step": 390
},
{
"epoch": 0.8374771002355405,
"grad_norm": 13.484360798114164,
"learning_rate": 1.5480991445620538e-08,
"logits/chosen": -11.511263847351074,
"logits/rejected": -11.029703140258789,
"logps/chosen": -427.28448486328125,
"logps/rejected": -486.9627990722656,
"loss": 0.54,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8370214700698853,
"rewards/margins": 0.5594587922096252,
"rewards/rejected": -1.3964803218841553,
"step": 400
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -11.385127067565918,
"eval_logits/rejected": -11.129400253295898,
"eval_logps/chosen": -459.30474853515625,
"eval_logps/rejected": -487.0408020019531,
"eval_loss": 0.5449032783508301,
"eval_rewards/accuracies": 0.6759999990463257,
"eval_rewards/chosen": -0.9047574996948242,
"eval_rewards/margins": 0.5027627944946289,
"eval_rewards/rejected": -1.4075202941894531,
"eval_runtime": 222.5645,
"eval_samples_per_second": 8.986,
"eval_steps_per_second": 1.123,
"step": 400
},
{
"epoch": 0.8584140277414289,
"grad_norm": 10.485330642188247,
"learning_rate": 1.1797088166794e-08,
"logits/chosen": -11.086836814880371,
"logits/rejected": -10.8326997756958,
"logps/chosen": -452.47235107421875,
"logps/rejected": -453.66986083984375,
"loss": 0.5401,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.9103538393974304,
"rewards/margins": 0.4986189007759094,
"rewards/rejected": -1.4089727401733398,
"step": 410
},
{
"epoch": 0.8793509552473174,
"grad_norm": 10.664608353191511,
"learning_rate": 8.585981103608341e-09,
"logits/chosen": -11.340084075927734,
"logits/rejected": -10.990386962890625,
"logps/chosen": -402.55157470703125,
"logps/rejected": -500.97467041015625,
"loss": 0.531,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8159112930297852,
"rewards/margins": 0.6232641339302063,
"rewards/rejected": -1.4391753673553467,
"step": 420
},
{
"epoch": 0.9002878827532059,
"grad_norm": 11.582461841758496,
"learning_rate": 5.864882831430273e-09,
"logits/chosen": -11.550569534301758,
"logits/rejected": -11.248498916625977,
"logps/chosen": -449.3023986816406,
"logps/rejected": -463.66510009765625,
"loss": 0.5232,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8865207433700562,
"rewards/margins": 0.47660237550735474,
"rewards/rejected": -1.3631231784820557,
"step": 430
},
{
"epoch": 0.9212248102590945,
"grad_norm": 11.301516689755074,
"learning_rate": 3.6483793195745682e-09,
"logits/chosen": -11.318781852722168,
"logits/rejected": -10.964235305786133,
"logps/chosen": -413.6148986816406,
"logps/rejected": -463.11114501953125,
"loss": 0.5351,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.8221774101257324,
"rewards/margins": 0.663087010383606,
"rewards/rejected": -1.4852644205093384,
"step": 440
},
{
"epoch": 0.942161737764983,
"grad_norm": 12.872962412868715,
"learning_rate": 1.9483517457776433e-09,
"logits/chosen": -11.27305793762207,
"logits/rejected": -10.98169994354248,
"logps/chosen": -440.1998596191406,
"logps/rejected": -488.0940856933594,
"loss": 0.5273,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.8431445956230164,
"rewards/margins": 0.6032007932662964,
"rewards/rejected": -1.446345329284668,
"step": 450
},
{
"epoch": 0.9630986652708715,
"grad_norm": 11.52042506596002,
"learning_rate": 7.739128092312918e-10,
"logits/chosen": -11.074236869812012,
"logits/rejected": -10.837442398071289,
"logps/chosen": -452.1619567871094,
"logps/rejected": -474.42620849609375,
"loss": 0.5402,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8461356163024902,
"rewards/margins": 0.6280852556228638,
"rewards/rejected": -1.4742207527160645,
"step": 460
},
{
"epoch": 0.98403559277676,
"grad_norm": 11.458055654937185,
"learning_rate": 1.313578835593465e-10,
"logits/chosen": -11.382277488708496,
"logits/rejected": -11.017861366271973,
"logps/chosen": -448.569091796875,
"logps/rejected": -476.3909606933594,
"loss": 0.5184,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9083318710327148,
"rewards/margins": 0.5944851636886597,
"rewards/rejected": -1.502817153930664,
"step": 470
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 0.5785154289169632,
"train_runtime": 17436.3201,
"train_samples_per_second": 3.506,
"train_steps_per_second": 0.027
}
],
"logging_steps": 10,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}