NicholasCorrado committed on
Commit
e7af3f7
·
verified ·
1 Parent(s): 34a8d27

Model save

Browse files
Files changed (4) hide show
  1. README.md +15 -21
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +430 -430
README.md CHANGED
@@ -3,16 +3,10 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
6
- - alignment-handbook
7
- - trl
8
- - dpo
9
- - generated_from_trainer
10
  - trl
11
  - dpo
12
  - alignment-handbook
13
  - generated_from_trainer
14
- datasets:
15
- - HuggingFaceH4/ultrafeedback_binarized
16
  model-index:
17
  - name: zephyr-7b-dpo-full
18
  results: []
@@ -23,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  # zephyr-7b-dpo-full
25
 
26
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
27
  It achieves the following results on the evaluation set:
28
- - Loss: 0.2473
29
- - Rewards/chosen: -4.6815
30
- - Rewards/rejected: -10.5131
31
- - Rewards/accuracies: 0.8525
32
- - Rewards/margins: 5.8316
33
- - Logps/rejected: -1354.8135
34
- - Logps/chosen: -759.6055
35
- - Logits/rejected: -1.2709
36
- - Logits/chosen: -1.7157
37
 
38
  ## Model description
39
 
@@ -68,11 +62,11 @@ The following hyperparameters were used during training:
68
 
69
  ### Training results
70
 
71
- | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
72
- |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
73
- | 0.3043 | 0.2559 | 100 | -2.9645 | -2.9613 | -589.3575 | -935.3794 | 0.3080 | 0.8245 | -2.9790 | 3.3398 | -6.3188 |
74
- | 0.2557 | 0.5118 | 200 | -2.2748 | -2.0707 | -709.4976 | -1222.8809 | 0.2607 | 0.8470 | -4.1804 | 5.0134 | -9.1938 |
75
- | 0.2515 | 0.7678 | 300 | -1.8309 | -1.4322 | -726.7409 | -1277.9103 | 0.2493 | 0.8509 | -4.3528 | 5.3912 | -9.7441 |
76
 
77
 
78
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
 
 
 
 
6
  - trl
7
  - dpo
8
  - alignment-handbook
9
  - generated_from_trainer
 
 
10
  model-index:
11
  - name: zephyr-7b-dpo-full
12
  results: []
 
17
 
18
  # zephyr-7b-dpo-full
19
 
20
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.2490
23
+ - Rewards/chosen: -4.6026
24
+ - Rewards/rejected: -9.7319
25
+ - Rewards/accuracies: 0.8463
26
+ - Rewards/margins: 5.1294
27
+ - Logps/rejected: -1276.6984
28
+ - Logps/chosen: -751.7161
29
+ - Logits/rejected: -1.1260
30
+ - Logits/chosen: -1.5108
31
 
32
  ## Model description
33
 
 
62
 
63
  ### Training results
64
 
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 0.3131 | 0.2559 | 100 | 0.3047 | -2.7883 | -6.0602 | 0.8292 | 3.2718 | -909.5203 | -570.2890 | -2.9402 | -2.9498 |
68
+ | 0.2541 | 0.5118 | 200 | 0.2608 | -5.2681 | -10.7685 | 0.8463 | 5.5004 | -1380.3505 | -818.2686 | -1.4771 | -1.8230 |
69
+ | 0.2604 | 0.7678 | 300 | 0.2490 | -4.6026 | -9.7319 | 0.8463 | 5.1294 | -1276.6984 | -751.7161 | -1.1260 | -1.5108 |
70
 
71
 
72
  ### Framework versions
all_results.json CHANGED
@@ -14,9 +14,9 @@
14
  "eval_samples_per_second": 11.099,
15
  "eval_steps_per_second": 0.349,
16
  "total_flos": 0.0,
17
- "train_loss": 0.0,
18
- "train_runtime": 0.0175,
19
  "train_samples": 50000,
20
- "train_samples_per_second": 2864609.543,
21
- "train_steps_per_second": 22343.954
22
  }
 
14
  "eval_samples_per_second": 11.099,
15
  "eval_steps_per_second": 0.349,
16
  "total_flos": 0.0,
17
+ "train_loss": 0.32177775089557353,
18
+ "train_runtime": 13656.3788,
19
  "train_samples": 50000,
20
+ "train_samples_per_second": 3.661,
21
+ "train_steps_per_second": 0.029
22
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9980806142034548,
3
  "total_flos": 0.0,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0175,
6
  "train_samples": 50000,
7
- "train_samples_per_second": 2864609.543,
8
- "train_steps_per_second": 22343.954
9
  }
 
1
  {
2
  "epoch": 0.9980806142034548,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.32177775089557353,
5
+ "train_runtime": 13656.3788,
6
  "train_samples": 50000,
7
+ "train_samples_per_second": 3.661,
8
+ "train_steps_per_second": 0.029
9
  }
trainer_state.json CHANGED
@@ -10,12 +10,12 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0025591810620601407,
13
- "grad_norm": 8.372040796681393,
14
  "learning_rate": 1.282051282051282e-08,
15
- "logits/chosen": -2.9558680057525635,
16
- "logits/rejected": -2.9835896492004395,
17
- "logps/chosen": -287.1746520996094,
18
- "logps/rejected": -318.6817626953125,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -25,645 +25,645 @@
25
  },
26
  {
27
  "epoch": 0.025591810620601407,
28
- "grad_norm": 9.212524406222368,
29
  "learning_rate": 1.2820512820512818e-07,
30
- "logits/chosen": -3.0168228149414062,
31
- "logits/rejected": -3.0099453926086426,
32
- "logps/chosen": -286.0946044921875,
33
- "logps/rejected": -304.9287414550781,
34
- "loss": 0.693,
35
  "rewards/accuracies": 0.4097222089767456,
36
- "rewards/chosen": -0.00034835602855309844,
37
- "rewards/margins": -5.7743654906516895e-05,
38
- "rewards/rejected": -0.00029061237000860274,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.05118362124120281,
43
- "grad_norm": 8.914663082765845,
44
  "learning_rate": 2.5641025641025636e-07,
45
- "logits/chosen": -3.006526470184326,
46
- "logits/rejected": -2.9971041679382324,
47
- "logps/chosen": -283.1783447265625,
48
- "logps/rejected": -298.82427978515625,
49
- "loss": 0.69,
50
- "rewards/accuracies": 0.6468750238418579,
51
- "rewards/chosen": 0.0020886282436549664,
52
- "rewards/margins": 0.00550027284771204,
53
- "rewards/rejected": -0.0034116446040570736,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.07677543186180422,
58
- "grad_norm": 9.211645338164717,
59
  "learning_rate": 3.8461538461538463e-07,
60
- "logits/chosen": -3.0022165775299072,
61
- "logits/rejected": -2.997166156768799,
62
- "logps/chosen": -280.0216369628906,
63
- "logps/rejected": -295.76959228515625,
64
- "loss": 0.6714,
65
- "rewards/accuracies": 0.7593749761581421,
66
- "rewards/chosen": 0.009928617626428604,
67
- "rewards/margins": 0.04372577741742134,
68
- "rewards/rejected": -0.03379715979099274,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.10236724248240563,
73
- "grad_norm": 10.081033679769522,
74
  "learning_rate": 4.99989986344963e-07,
75
- "logits/chosen": -3.0331904888153076,
76
- "logits/rejected": -3.0275347232818604,
77
- "logps/chosen": -285.2835998535156,
78
- "logps/rejected": -319.5372314453125,
79
- "loss": 0.615,
80
- "rewards/accuracies": 0.828125,
81
- "rewards/chosen": -0.042057085782289505,
82
- "rewards/margins": 0.17162299156188965,
83
- "rewards/rejected": -0.21368007361888885,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.12795905310300704,
88
- "grad_norm": 13.895203665337698,
89
  "learning_rate": 4.987893180827479e-07,
90
- "logits/chosen": -3.0868072509765625,
91
- "logits/rejected": -3.0783658027648926,
92
- "logps/chosen": -368.4632263183594,
93
- "logps/rejected": -446.92608642578125,
94
- "loss": 0.5101,
95
- "rewards/accuracies": 0.784375011920929,
96
- "rewards/chosen": -0.7513679265975952,
97
- "rewards/margins": 0.5743271112442017,
98
- "rewards/rejected": -1.3256951570510864,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.15355086372360843,
103
- "grad_norm": 17.530401994512683,
104
  "learning_rate": 4.955969343539162e-07,
105
- "logits/chosen": -3.0945253372192383,
106
- "logits/rejected": -3.077105760574341,
107
- "logps/chosen": -525.1121215820312,
108
- "logps/rejected": -675.7432861328125,
109
- "loss": 0.4203,
110
- "rewards/accuracies": 0.7875000238418579,
111
- "rewards/chosen": -2.4605610370635986,
112
- "rewards/margins": 1.4077235460281372,
113
- "rewards/rejected": -3.8682847023010254,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.17914267434420986,
118
- "grad_norm": 20.655420138174726,
119
  "learning_rate": 4.90438392204474e-07,
120
- "logits/chosen": -3.1209347248077393,
121
- "logits/rejected": -3.1095337867736816,
122
- "logps/chosen": -557.0096435546875,
123
- "logps/rejected": -769.7672119140625,
124
- "loss": 0.3553,
125
- "rewards/accuracies": 0.824999988079071,
126
- "rewards/chosen": -2.73195219039917,
127
- "rewards/margins": 1.957765817642212,
128
- "rewards/rejected": -4.6897172927856445,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.20473448496481125,
133
- "grad_norm": 34.20074706717272,
134
  "learning_rate": 4.83354989019146e-07,
135
- "logits/chosen": -3.0491955280303955,
136
- "logits/rejected": -3.05132794380188,
137
- "logps/chosen": -558.9640502929688,
138
- "logps/rejected": -801.9044799804688,
139
- "loss": 0.3429,
140
- "rewards/accuracies": 0.778124988079071,
141
- "rewards/chosen": -2.753610134124756,
142
- "rewards/margins": 2.3364205360412598,
143
- "rewards/rejected": -5.090030193328857,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.23032629558541268,
148
- "grad_norm": 25.187798132399784,
149
  "learning_rate": 4.7440343190975353e-07,
150
- "logits/chosen": -3.0432305335998535,
151
- "logits/rejected": -3.041344165802002,
152
- "logps/chosen": -570.7512817382812,
153
- "logps/rejected": -843.0035400390625,
154
- "loss": 0.3284,
155
- "rewards/accuracies": 0.8343750238418579,
156
- "rewards/chosen": -2.7428793907165527,
157
- "rewards/margins": 2.6616787910461426,
158
- "rewards/rejected": -5.404558181762695,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.2559181062060141,
163
- "grad_norm": 39.301413976140616,
164
  "learning_rate": 4.6365538373900506e-07,
165
- "logits/chosen": -3.000190019607544,
166
- "logits/rejected": -3.000822067260742,
167
- "logps/chosen": -588.419677734375,
168
- "logps/rejected": -898.4981689453125,
169
- "loss": 0.3043,
170
- "rewards/accuracies": 0.8125,
171
- "rewards/chosen": -2.9578394889831543,
172
- "rewards/margins": 3.0813093185424805,
173
- "rewards/rejected": -6.039149284362793,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2559181062060141,
178
- "eval_logits/chosen": -2.96449875831604,
179
- "eval_logits/rejected": -2.961296796798706,
180
- "eval_logps/chosen": -589.3575439453125,
181
- "eval_logps/rejected": -935.37939453125,
182
- "eval_loss": 0.3079955577850342,
183
- "eval_rewards/accuracies": 0.8245341777801514,
184
- "eval_rewards/chosen": -2.9790048599243164,
185
- "eval_rewards/margins": 3.339751958847046,
186
- "eval_rewards/rejected": -6.318756580352783,
187
- "eval_runtime": 475.0898,
188
- "eval_samples_per_second": 10.79,
189
- "eval_steps_per_second": 0.339,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.28150991682661547,
194
- "grad_norm": 23.2464532886725,
195
  "learning_rate": 4.5119688941406386e-07,
196
- "logits/chosen": -2.9700212478637695,
197
- "logits/rejected": -2.974587917327881,
198
- "logps/chosen": -593.6639404296875,
199
- "logps/rejected": -905.212890625,
200
- "loss": 0.3259,
201
- "rewards/accuracies": 0.8125,
202
- "rewards/chosen": -3.0440046787261963,
203
- "rewards/margins": 2.9606070518493652,
204
- "rewards/rejected": -6.004611492156982,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.30710172744721687,
209
- "grad_norm": 20.92968428786604,
210
  "learning_rate": 4.3712768704277524e-07,
211
- "logits/chosen": -2.954521417617798,
212
- "logits/rejected": -2.959869861602783,
213
- "logps/chosen": -536.2525634765625,
214
- "logps/rejected": -830.0895385742188,
215
- "loss": 0.3015,
216
  "rewards/accuracies": 0.8125,
217
- "rewards/chosen": -2.4020705223083496,
218
- "rewards/margins": 2.877993106842041,
219
- "rewards/rejected": -5.280063629150391,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.3326935380678183,
224
- "grad_norm": 28.437529952019855,
225
  "learning_rate": 4.2156040946718343e-07,
226
- "logits/chosen": -2.862247943878174,
227
- "logits/rejected": -2.8785834312438965,
228
- "logps/chosen": -636.6513061523438,
229
- "logps/rejected": -1044.999755859375,
230
- "loss": 0.2841,
231
- "rewards/accuracies": 0.824999988079071,
232
- "rewards/chosen": -3.4623122215270996,
233
- "rewards/margins": 3.8583245277404785,
234
- "rewards/rejected": -7.320636749267578,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.3582853486884197,
239
- "grad_norm": 19.315717522096396,
240
  "learning_rate": 4.046196825665637e-07,
241
- "logits/chosen": -2.832946538925171,
242
- "logits/rejected": -2.8259646892547607,
243
- "logps/chosen": -593.8186645507812,
244
- "logps/rejected": -979.2440185546875,
245
- "loss": 0.2622,
246
- "rewards/accuracies": 0.8374999761581421,
247
- "rewards/chosen": -2.943942070007324,
248
- "rewards/margins": 3.7794156074523926,
249
- "rewards/rejected": -6.723358154296875,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.3838771593090211,
254
- "grad_norm": 22.976543958848772,
255
  "learning_rate": 3.864411275486261e-07,
256
- "logits/chosen": -2.760894536972046,
257
- "logits/rejected": -2.7563464641571045,
258
- "logps/chosen": -634.1366577148438,
259
- "logps/rejected": -1078.597412109375,
260
- "loss": 0.2794,
261
- "rewards/accuracies": 0.8218749761581421,
262
- "rewards/chosen": -3.5211944580078125,
263
- "rewards/margins": 4.23276424407959,
264
- "rewards/rejected": -7.753958702087402,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.4094689699296225,
269
- "grad_norm": 21.849223996398678,
270
  "learning_rate": 3.671702752161759e-07,
271
- "logits/chosen": -2.6357340812683105,
272
- "logits/rejected": -2.5901741981506348,
273
- "logps/chosen": -729.1080322265625,
274
- "logps/rejected": -1168.1109619140625,
275
- "loss": 0.2781,
276
- "rewards/accuracies": 0.8374999761581421,
277
- "rewards/chosen": -4.447979927062988,
278
- "rewards/margins": 4.25943660736084,
279
- "rewards/rejected": -8.707415580749512,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.4350607805502239,
284
- "grad_norm": 34.268492856409395,
285
  "learning_rate": 3.4696140090121375e-07,
286
- "logits/chosen": -2.5432353019714355,
287
- "logits/rejected": -2.4383697509765625,
288
- "logps/chosen": -768.44775390625,
289
- "logps/rejected": -1241.2236328125,
290
- "loss": 0.2592,
291
- "rewards/accuracies": 0.8531249761581421,
292
- "rewards/chosen": -4.733465194702148,
293
- "rewards/margins": 4.677088737487793,
294
- "rewards/rejected": -9.410554885864258,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.46065259117082535,
299
- "grad_norm": 24.22372375688885,
300
  "learning_rate": 3.259762893935617e-07,
301
- "logits/chosen": -2.5379650592803955,
302
- "logits/rejected": -2.4227848052978516,
303
- "logps/chosen": -643.3690185546875,
304
- "logps/rejected": -1086.7647705078125,
305
- "loss": 0.2982,
306
- "rewards/accuracies": 0.8218749761581421,
307
- "rewards/chosen": -3.5777480602264404,
308
- "rewards/margins": 4.262465476989746,
309
- "rewards/rejected": -7.840213775634766,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.48624440179142675,
314
- "grad_norm": 22.276548976639525,
315
  "learning_rate": 3.0438293975154184e-07,
316
- "logits/chosen": -2.4349989891052246,
317
- "logits/rejected": -2.2799932956695557,
318
- "logps/chosen": -682.0303955078125,
319
- "logps/rejected": -1134.5205078125,
320
- "loss": 0.2551,
321
  "rewards/accuracies": 0.8531249761581421,
322
- "rewards/chosen": -3.8549671173095703,
323
- "rewards/margins": 4.42364501953125,
324
- "rewards/rejected": -8.27861213684082,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.5118362124120281,
329
- "grad_norm": 23.143027388197456,
330
  "learning_rate": 2.823542203635138e-07,
331
- "logits/chosen": -2.3481929302215576,
332
- "logits/rejected": -2.147021770477295,
333
- "logps/chosen": -698.4183959960938,
334
- "logps/rejected": -1117.36962890625,
335
- "loss": 0.2557,
336
- "rewards/accuracies": 0.8031250238418579,
337
- "rewards/chosen": -3.897473096847534,
338
- "rewards/margins": 4.184942722320557,
339
- "rewards/rejected": -8.082415580749512,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.5118362124120281,
344
- "eval_logits/chosen": -2.2748405933380127,
345
- "eval_logits/rejected": -2.0707473754882812,
346
- "eval_logps/chosen": -709.49755859375,
347
- "eval_logps/rejected": -1222.880859375,
348
- "eval_loss": 0.2607395350933075,
349
- "eval_rewards/accuracies": 0.8470497131347656,
350
- "eval_rewards/chosen": -4.1804046630859375,
351
- "eval_rewards/margins": 5.013367652893066,
352
- "eval_rewards/rejected": -9.193772315979004,
353
- "eval_runtime": 467.0944,
354
- "eval_samples_per_second": 10.974,
355
- "eval_steps_per_second": 0.345,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.5374280230326296,
360
- "grad_norm": 29.714729651434116,
361
  "learning_rate": 2.600664850273538e-07,
362
- "logits/chosen": -2.220996379852295,
363
- "logits/rejected": -2.0096168518066406,
364
- "logps/chosen": -736.2384033203125,
365
- "logps/rejected": -1215.465576171875,
366
- "loss": 0.265,
367
- "rewards/accuracies": 0.8187500238418579,
368
- "rewards/chosen": -4.440661907196045,
369
- "rewards/margins": 4.793159484863281,
370
- "rewards/rejected": -9.2338228225708,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.5630198336532309,
375
- "grad_norm": 30.279152935247957,
376
  "learning_rate": 2.3769816112703045e-07,
377
- "logits/chosen": -2.0213561058044434,
378
- "logits/rejected": -1.708433747291565,
379
- "logps/chosen": -803.655029296875,
380
- "logps/rejected": -1369.3001708984375,
381
- "loss": 0.253,
382
- "rewards/accuracies": 0.828125,
383
- "rewards/chosen": -5.206329345703125,
384
- "rewards/margins": 5.589818000793457,
385
- "rewards/rejected": -10.796146392822266,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.5886116442738324,
390
- "grad_norm": 25.614713397243474,
391
  "learning_rate": 2.1542832120881677e-07,
392
- "logits/chosen": -1.8582950830459595,
393
- "logits/rejected": -1.4825233221054077,
394
- "logps/chosen": -796.53857421875,
395
- "logps/rejected": -1305.88818359375,
396
- "loss": 0.2601,
397
- "rewards/accuracies": 0.8187500238418579,
398
- "rewards/chosen": -5.0762786865234375,
399
- "rewards/margins": 5.091577053070068,
400
- "rewards/rejected": -10.167856216430664,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.6142034548944337,
405
- "grad_norm": 24.31506504955288,
406
  "learning_rate": 1.934352493925695e-07,
407
- "logits/chosen": -1.9886703491210938,
408
- "logits/rejected": -1.6142040491104126,
409
- "logps/chosen": -769.7489013671875,
410
- "logps/rejected": -1311.3548583984375,
411
- "loss": 0.2748,
412
- "rewards/accuracies": 0.8062499761581421,
413
- "rewards/chosen": -4.7199506759643555,
414
- "rewards/margins": 5.420409202575684,
415
- "rewards/rejected": -10.140359878540039,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.6397952655150352,
420
- "grad_norm": 31.574156427087846,
421
  "learning_rate": 1.7189501409486059e-07,
422
- "logits/chosen": -2.0121378898620605,
423
- "logits/rejected": -1.6347030401229858,
424
- "logps/chosen": -716.2213134765625,
425
- "logps/rejected": -1249.190185546875,
426
- "loss": 0.2809,
427
- "rewards/accuracies": 0.840624988079071,
428
- "rewards/chosen": -4.3439483642578125,
429
- "rewards/margins": 5.274473667144775,
430
- "rewards/rejected": -9.61842155456543,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.6653870761356366,
435
- "grad_norm": 19.51529401796244,
436
  "learning_rate": 1.5098005849021078e-07,
437
- "logits/chosen": -2.051848888397217,
438
- "logits/rejected": -1.7610851526260376,
439
- "logps/chosen": -730.6099853515625,
440
- "logps/rejected": -1209.3929443359375,
441
- "loss": 0.2457,
442
- "rewards/accuracies": 0.800000011920929,
443
- "rewards/chosen": -4.3370680809021,
444
- "rewards/margins": 4.67025089263916,
445
- "rewards/rejected": -9.007319450378418,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.690978886756238,
450
- "grad_norm": 34.445303465962446,
451
  "learning_rate": 1.30857819994673e-07,
452
- "logits/chosen": -1.923056960105896,
453
- "logits/rejected": -1.6418602466583252,
454
- "logps/chosen": -726.4392700195312,
455
- "logps/rejected": -1264.95458984375,
456
- "loss": 0.256,
457
- "rewards/accuracies": 0.831250011920929,
458
- "rewards/chosen": -4.332821369171143,
459
- "rewards/margins": 5.251183032989502,
460
- "rewards/rejected": -9.584003448486328,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.7165706973768394,
465
- "grad_norm": 36.88942757740681,
466
  "learning_rate": 1.116893898236716e-07,
467
- "logits/chosen": -1.9537960290908813,
468
- "logits/rejected": -1.6011472940444946,
469
- "logps/chosen": -746.1478271484375,
470
- "logps/rejected": -1309.915283203125,
471
- "loss": 0.2386,
472
- "rewards/accuracies": 0.856249988079071,
473
- "rewards/chosen": -4.4233832359313965,
474
- "rewards/margins": 5.588069438934326,
475
- "rewards/rejected": -10.011453628540039,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.7421625079974408,
480
- "grad_norm": 49.4114473741805,
481
  "learning_rate": 9.362822335518062e-08,
482
- "logits/chosen": -1.8809627294540405,
483
- "logits/rejected": -1.427119493484497,
484
- "logps/chosen": -769.44140625,
485
- "logps/rejected": -1329.4346923828125,
486
- "loss": 0.2622,
487
- "rewards/accuracies": 0.8843749761581421,
488
- "rewards/chosen": -4.582036972045898,
489
- "rewards/margins": 5.65748929977417,
490
- "rewards/rejected": -10.239526748657227,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.7677543186180422,
495
- "grad_norm": 19.047743120052225,
496
  "learning_rate": 7.681891162260015e-08,
497
- "logits/chosen": -1.828704833984375,
498
- "logits/rejected": -1.5141593217849731,
499
- "logps/chosen": -717.1990966796875,
500
- "logps/rejected": -1231.6229248046875,
501
- "loss": 0.2515,
502
- "rewards/accuracies": 0.8187500238418579,
503
- "rewards/chosen": -4.390562057495117,
504
- "rewards/margins": 4.981083869934082,
505
- "rewards/rejected": -9.3716459274292,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.7677543186180422,
510
- "eval_logits/chosen": -1.830853819847107,
511
- "eval_logits/rejected": -1.4321902990341187,
512
- "eval_logps/chosen": -726.7409057617188,
513
- "eval_logps/rejected": -1277.9102783203125,
514
- "eval_loss": 0.24932526051998138,
515
- "eval_rewards/accuracies": 0.850931704044342,
516
- "eval_rewards/chosen": -4.352837562561035,
517
- "eval_rewards/margins": 5.391228675842285,
518
- "eval_rewards/rejected": -9.74406623840332,
519
- "eval_runtime": 468.6767,
520
- "eval_samples_per_second": 10.937,
521
- "eval_steps_per_second": 0.344,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.7933461292386437,
526
- "grad_norm": 27.67097540824916,
527
  "learning_rate": 6.139602377230247e-08,
528
- "logits/chosen": -1.7593371868133545,
529
- "logits/rejected": -1.3604390621185303,
530
- "logps/chosen": -744.55078125,
531
- "logps/rejected": -1289.401611328125,
532
- "loss": 0.2523,
533
- "rewards/accuracies": 0.8500000238418579,
534
- "rewards/chosen": -4.4812517166137695,
535
- "rewards/margins": 5.370087623596191,
536
- "rewards/rejected": -9.851339340209961,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.818937939859245,
541
- "grad_norm": 27.140457734231973,
542
  "learning_rate": 4.748302975270837e-08,
543
- "logits/chosen": -1.7739003896713257,
544
- "logits/rejected": -1.296608805656433,
545
- "logps/chosen": -752.3242797851562,
546
- "logps/rejected": -1264.2474365234375,
547
- "loss": 0.2396,
548
- "rewards/accuracies": 0.8343750238418579,
549
- "rewards/chosen": -4.523016452789307,
550
- "rewards/margins": 5.08230447769165,
551
- "rewards/rejected": -9.605320930480957,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.8445297504798465,
556
- "grad_norm": 19.26284094768001,
557
  "learning_rate": 3.5191311859445795e-08,
558
- "logits/chosen": -1.762459397315979,
559
- "logits/rejected": -1.3729654550552368,
560
- "logps/chosen": -762.8904418945312,
561
- "logps/rejected": -1324.27197265625,
562
- "loss": 0.2321,
563
- "rewards/accuracies": 0.828125,
564
- "rewards/chosen": -4.7744550704956055,
565
- "rewards/margins": 5.492222785949707,
566
- "rewards/rejected": -10.266677856445312,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.8701215611004478,
571
- "grad_norm": 24.386284614263385,
572
  "learning_rate": 2.4619273049795996e-08,
573
- "logits/chosen": -1.7161592245101929,
574
- "logits/rejected": -1.374194860458374,
575
- "logps/chosen": -761.0625,
576
- "logps/rejected": -1358.326416015625,
577
- "loss": 0.2605,
578
- "rewards/accuracies": 0.8656250238418579,
579
- "rewards/chosen": -4.776429653167725,
580
- "rewards/margins": 5.739912986755371,
581
- "rewards/rejected": -10.516342163085938,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.8957133717210493,
586
- "grad_norm": 25.575727288966945,
587
  "learning_rate": 1.5851549164932115e-08,
588
- "logits/chosen": -1.698293924331665,
589
- "logits/rejected": -1.249987006187439,
590
- "logps/chosen": -781.341552734375,
591
- "logps/rejected": -1365.3270263671875,
592
- "loss": 0.2447,
593
- "rewards/accuracies": 0.8218749761581421,
594
- "rewards/chosen": -4.80244255065918,
595
- "rewards/margins": 5.769272804260254,
596
- "rewards/rejected": -10.571714401245117,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.9213051823416507,
601
- "grad_norm": 28.833379096024903,
602
  "learning_rate": 8.958331366609423e-09,
603
- "logits/chosen": -1.625765085220337,
604
- "logits/rejected": -1.215453863143921,
605
- "logps/chosen": -729.2706298828125,
606
- "logps/rejected": -1337.571044921875,
607
- "loss": 0.2386,
608
  "rewards/accuracies": 0.84375,
609
- "rewards/chosen": -4.483765602111816,
610
- "rewards/margins": 5.966723442077637,
611
- "rewards/rejected": -10.450489044189453,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.946896992962252,
616
- "grad_norm": 32.57067435027107,
617
  "learning_rate": 3.994804212627461e-09,
618
- "logits/chosen": -1.7305755615234375,
619
- "logits/rejected": -1.2781140804290771,
620
- "logps/chosen": -772.3482666015625,
621
- "logps/rejected": -1352.6353759765625,
622
- "loss": 0.2442,
623
  "rewards/accuracies": 0.8374999761581421,
624
- "rewards/chosen": -4.808593273162842,
625
- "rewards/margins": 5.681990623474121,
626
- "rewards/rejected": -10.490584373474121,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.9724888035828535,
631
- "grad_norm": 52.329250984527555,
632
  "learning_rate": 1.0007038696262516e-09,
633
- "logits/chosen": -1.7279059886932373,
634
- "logits/rejected": -1.3428099155426025,
635
- "logps/chosen": -765.5159301757812,
636
- "logps/rejected": -1355.879150390625,
637
- "loss": 0.2353,
638
- "rewards/accuracies": 0.84375,
639
- "rewards/chosen": -4.687448024749756,
640
- "rewards/margins": 5.837033271789551,
641
- "rewards/rejected": -10.524479866027832,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.9980806142034548,
646
- "grad_norm": 22.836707059503702,
647
  "learning_rate": 0.0,
648
- "logits/chosen": -1.6817991733551025,
649
- "logits/rejected": -1.2504949569702148,
650
- "logps/chosen": -785.3814086914062,
651
- "logps/rejected": -1332.812744140625,
652
- "loss": 0.2478,
653
- "rewards/accuracies": 0.856249988079071,
654
- "rewards/chosen": -4.990485191345215,
655
- "rewards/margins": 5.404683589935303,
656
- "rewards/rejected": -10.395169258117676,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.9980806142034548,
661
  "step": 390,
662
  "total_flos": 0.0,
663
- "train_loss": 0.0,
664
- "train_runtime": 0.0175,
665
- "train_samples_per_second": 2864609.543,
666
- "train_steps_per_second": 22343.954
667
  }
668
  ],
669
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0025591810620601407,
13
+ "grad_norm": 9.300729627928584,
14
  "learning_rate": 1.282051282051282e-08,
15
+ "logits/chosen": -3.076528787612915,
16
+ "logits/rejected": -3.05928111076355,
17
+ "logps/chosen": -298.5160217285156,
18
+ "logps/rejected": -280.6475524902344,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.025591810620601407,
28
+ "grad_norm": 8.91999986583984,
29
  "learning_rate": 1.2820512820512818e-07,
30
+ "logits/chosen": -3.0055878162384033,
31
+ "logits/rejected": -3.006002187728882,
32
+ "logps/chosen": -295.4976806640625,
33
+ "logps/rejected": -299.0694885253906,
34
+ "loss": 0.6932,
35
  "rewards/accuracies": 0.4097222089767456,
36
+ "rewards/chosen": -0.00027675420278683305,
37
+ "rewards/margins": -0.00019084251835010946,
38
+ "rewards/rejected": -8.591161895310506e-05,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.05118362124120281,
43
+ "grad_norm": 8.795980264234538,
44
  "learning_rate": 2.5641025641025636e-07,
45
+ "logits/chosen": -3.013876438140869,
46
+ "logits/rejected": -2.9846599102020264,
47
+ "logps/chosen": -278.1435546875,
48
+ "logps/rejected": -280.6507568359375,
49
+ "loss": 0.6894,
50
+ "rewards/accuracies": 0.746874988079071,
51
+ "rewards/chosen": 0.004018495324999094,
52
+ "rewards/margins": 0.008613762445747852,
53
+ "rewards/rejected": -0.004595267120748758,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.07677543186180422,
58
+ "grad_norm": 8.739321551260307,
59
  "learning_rate": 3.8461538461538463e-07,
60
+ "logits/chosen": -2.9977359771728516,
61
+ "logits/rejected": -2.9877142906188965,
62
+ "logps/chosen": -286.0644836425781,
63
+ "logps/rejected": -299.9364318847656,
64
+ "loss": 0.6715,
65
+ "rewards/accuracies": 0.796875,
66
+ "rewards/chosen": 0.01204680372029543,
67
+ "rewards/margins": 0.042642779648303986,
68
+ "rewards/rejected": -0.03059597872197628,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.10236724248240563,
73
+ "grad_norm": 10.25775431193367,
74
  "learning_rate": 4.99989986344963e-07,
75
+ "logits/chosen": -3.0262513160705566,
76
+ "logits/rejected": -3.018004894256592,
77
+ "logps/chosen": -292.0692138671875,
78
+ "logps/rejected": -319.06201171875,
79
+ "loss": 0.6172,
80
+ "rewards/accuracies": 0.7906249761581421,
81
+ "rewards/chosen": -0.040852729231119156,
82
+ "rewards/margins": 0.15366603434085846,
83
+ "rewards/rejected": -0.19451875984668732,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.12795905310300704,
88
+ "grad_norm": 13.197584682554714,
89
  "learning_rate": 4.987893180827479e-07,
90
+ "logits/chosen": -3.042942762374878,
91
+ "logits/rejected": -3.0136711597442627,
92
+ "logps/chosen": -354.50311279296875,
93
+ "logps/rejected": -408.25933837890625,
94
+ "loss": 0.5182,
95
+ "rewards/accuracies": 0.768750011920929,
96
+ "rewards/chosen": -0.7526591420173645,
97
+ "rewards/margins": 0.5003793239593506,
98
+ "rewards/rejected": -1.2530385255813599,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.15355086372360843,
103
+ "grad_norm": 17.586826775035103,
104
  "learning_rate": 4.955969343539162e-07,
105
+ "logits/chosen": -3.068596363067627,
106
+ "logits/rejected": -3.05415415763855,
107
+ "logps/chosen": -529.9071044921875,
108
+ "logps/rejected": -668.5487670898438,
109
+ "loss": 0.4414,
110
+ "rewards/accuracies": 0.7562500238418579,
111
+ "rewards/chosen": -2.2428505420684814,
112
+ "rewards/margins": 1.1905735731124878,
113
+ "rewards/rejected": -3.4334239959716797,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.17914267434420986,
118
+ "grad_norm": 29.094385760333942,
119
  "learning_rate": 4.90438392204474e-07,
120
+ "logits/chosen": -3.1505279541015625,
121
+ "logits/rejected": -3.1332974433898926,
122
+ "logps/chosen": -545.3878173828125,
123
+ "logps/rejected": -748.8613891601562,
124
+ "loss": 0.3467,
125
+ "rewards/accuracies": 0.84375,
126
+ "rewards/chosen": -2.668983221054077,
127
+ "rewards/margins": 1.957241415977478,
128
+ "rewards/rejected": -4.626224994659424,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.20473448496481125,
133
+ "grad_norm": 19.23827111496606,
134
  "learning_rate": 4.83354989019146e-07,
135
+ "logits/chosen": -3.1376254558563232,
136
+ "logits/rejected": -3.1242246627807617,
137
+ "logps/chosen": -567.9331665039062,
138
+ "logps/rejected": -824.6702880859375,
139
+ "loss": 0.3436,
140
+ "rewards/accuracies": 0.8218749761581421,
141
+ "rewards/chosen": -2.782257556915283,
142
+ "rewards/margins": 2.4667036533355713,
143
+ "rewards/rejected": -5.248961448669434,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.23032629558541268,
148
+ "grad_norm": 19.580017636565856,
149
  "learning_rate": 4.7440343190975353e-07,
150
+ "logits/chosen": -3.0505166053771973,
151
+ "logits/rejected": -3.042973279953003,
152
+ "logps/chosen": -611.1217041015625,
153
+ "logps/rejected": -919.2169799804688,
154
+ "loss": 0.3345,
155
+ "rewards/accuracies": 0.828125,
156
+ "rewards/chosen": -3.080747604370117,
157
+ "rewards/margins": 3.0196213722229004,
158
+ "rewards/rejected": -6.100369453430176,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.2559181062060141,
163
+ "grad_norm": 25.054112621298355,
164
  "learning_rate": 4.6365538373900506e-07,
165
+ "logits/chosen": -2.9996213912963867,
166
+ "logits/rejected": -2.990790843963623,
167
+ "logps/chosen": -554.5274658203125,
168
+ "logps/rejected": -862.6036987304688,
169
+ "loss": 0.3131,
170
+ "rewards/accuracies": 0.84375,
171
+ "rewards/chosen": -2.494730234146118,
172
+ "rewards/margins": 2.9665369987487793,
173
+ "rewards/rejected": -5.461266994476318,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2559181062060141,
178
+ "eval_logits/chosen": -2.949781656265259,
179
+ "eval_logits/rejected": -2.940216541290283,
180
+ "eval_logps/chosen": -570.2890014648438,
181
+ "eval_logps/rejected": -909.520263671875,
182
+ "eval_loss": 0.3047349452972412,
183
+ "eval_rewards/accuracies": 0.8291925191879272,
184
+ "eval_rewards/chosen": -2.7883195877075195,
185
+ "eval_rewards/margins": 3.2718467712402344,
186
+ "eval_rewards/rejected": -6.060166358947754,
187
+ "eval_runtime": 463.0554,
188
+ "eval_samples_per_second": 11.07,
189
+ "eval_steps_per_second": 0.348,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.28150991682661547,
194
+ "grad_norm": 28.064453271068697,
195
  "learning_rate": 4.5119688941406386e-07,
196
+ "logits/chosen": -2.917959451675415,
197
+ "logits/rejected": -2.907701253890991,
198
+ "logps/chosen": -648.3985595703125,
199
+ "logps/rejected": -1051.9510498046875,
200
+ "loss": 0.313,
201
+ "rewards/accuracies": 0.84375,
202
+ "rewards/chosen": -3.600653886795044,
203
+ "rewards/margins": 3.9409751892089844,
204
+ "rewards/rejected": -7.541630744934082,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.30710172744721687,
209
+ "grad_norm": 24.128631827809503,
210
  "learning_rate": 4.3712768704277524e-07,
211
+ "logits/chosen": -2.886887550354004,
212
+ "logits/rejected": -2.8801417350769043,
213
+ "logps/chosen": -532.00048828125,
214
+ "logps/rejected": -861.47802734375,
215
+ "loss": 0.2992,
216
  "rewards/accuracies": 0.8125,
217
+ "rewards/chosen": -2.393364429473877,
218
+ "rewards/margins": 3.2483468055725098,
219
+ "rewards/rejected": -5.641711235046387,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.3326935380678183,
224
+ "grad_norm": 41.03517320506039,
225
  "learning_rate": 4.2156040946718343e-07,
226
+ "logits/chosen": -2.7591373920440674,
227
+ "logits/rejected": -2.7252399921417236,
228
+ "logps/chosen": -654.140380859375,
229
+ "logps/rejected": -1031.537109375,
230
+ "loss": 0.2747,
231
+ "rewards/accuracies": 0.846875011920929,
232
+ "rewards/chosen": -3.605952501296997,
233
+ "rewards/margins": 3.7166149616241455,
234
+ "rewards/rejected": -7.322567939758301,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.3582853486884197,
239
+ "grad_norm": 29.778477903537254,
240
  "learning_rate": 4.046196825665637e-07,
241
+ "logits/chosen": -2.559828519821167,
242
+ "logits/rejected": -2.479792594909668,
243
+ "logps/chosen": -711.0362548828125,
244
+ "logps/rejected": -1145.183837890625,
245
+ "loss": 0.2739,
246
+ "rewards/accuracies": 0.846875011920929,
247
+ "rewards/chosen": -4.220881938934326,
248
+ "rewards/margins": 4.242983341217041,
249
+ "rewards/rejected": -8.463865280151367,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.3838771593090211,
254
+ "grad_norm": 26.95404487166528,
255
  "learning_rate": 3.864411275486261e-07,
256
+ "logits/chosen": -2.3361523151397705,
257
+ "logits/rejected": -2.17087721824646,
258
+ "logps/chosen": -759.3307495117188,
259
+ "logps/rejected": -1217.0687255859375,
260
+ "loss": 0.2708,
261
+ "rewards/accuracies": 0.8343750238418579,
262
+ "rewards/chosen": -4.6963276863098145,
263
+ "rewards/margins": 4.567781925201416,
264
+ "rewards/rejected": -9.264108657836914,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.4094689699296225,
269
+ "grad_norm": 36.77841932269942,
270
  "learning_rate": 3.671702752161759e-07,
271
+ "logits/chosen": -2.2813222408294678,
272
+ "logits/rejected": -2.133829116821289,
273
+ "logps/chosen": -743.0094604492188,
274
+ "logps/rejected": -1178.709716796875,
275
+ "loss": 0.2646,
276
+ "rewards/accuracies": 0.8812500238418579,
277
+ "rewards/chosen": -4.425657272338867,
278
+ "rewards/margins": 4.283566951751709,
279
+ "rewards/rejected": -8.709224700927734,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.4350607805502239,
284
+ "grad_norm": 31.269061340589527,
285
  "learning_rate": 3.4696140090121375e-07,
286
+ "logits/chosen": -2.316734790802002,
287
+ "logits/rejected": -2.122131824493408,
288
+ "logps/chosen": -777.4652099609375,
289
+ "logps/rejected": -1272.863525390625,
290
+ "loss": 0.2719,
291
+ "rewards/accuracies": 0.8343750238418579,
292
+ "rewards/chosen": -4.7845916748046875,
293
+ "rewards/margins": 4.937060356140137,
294
+ "rewards/rejected": -9.721651077270508,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.46065259117082535,
299
+ "grad_norm": 20.7412696930549,
300
  "learning_rate": 3.259762893935617e-07,
301
+ "logits/chosen": -2.1940865516662598,
302
+ "logits/rejected": -1.979815125465393,
303
+ "logps/chosen": -725.8430786132812,
304
+ "logps/rejected": -1174.8927001953125,
305
+ "loss": 0.2459,
306
+ "rewards/accuracies": 0.8343750238418579,
307
+ "rewards/chosen": -4.354685306549072,
308
+ "rewards/margins": 4.337596893310547,
309
+ "rewards/rejected": -8.692281723022461,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.48624440179142675,
314
+ "grad_norm": 22.840820454830215,
315
  "learning_rate": 3.0438293975154184e-07,
316
+ "logits/chosen": -2.1304101943969727,
317
+ "logits/rejected": -1.8553383350372314,
318
+ "logps/chosen": -694.48193359375,
319
+ "logps/rejected": -1045.50927734375,
320
+ "loss": 0.2737,
321
  "rewards/accuracies": 0.8531249761581421,
322
+ "rewards/chosen": -4.092527866363525,
323
+ "rewards/margins": 3.4944236278533936,
324
+ "rewards/rejected": -7.58695125579834,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.5118362124120281,
329
+ "grad_norm": 48.315922100487725,
330
  "learning_rate": 2.823542203635138e-07,
331
+ "logits/chosen": -1.982703447341919,
332
+ "logits/rejected": -1.6738717555999756,
333
+ "logps/chosen": -708.0183715820312,
334
+ "logps/rejected": -1162.70556640625,
335
+ "loss": 0.2541,
336
+ "rewards/accuracies": 0.8531249761581421,
337
+ "rewards/chosen": -4.193836688995361,
338
+ "rewards/margins": 4.424074649810791,
339
+ "rewards/rejected": -8.617910385131836,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.5118362124120281,
344
+ "eval_logits/chosen": -1.8229694366455078,
345
+ "eval_logits/rejected": -1.4771463871002197,
346
+ "eval_logps/chosen": -818.2685546875,
347
+ "eval_logps/rejected": -1380.3504638671875,
348
+ "eval_loss": 0.2607860863208771,
349
+ "eval_rewards/accuracies": 0.8462733030319214,
350
+ "eval_rewards/chosen": -5.268115043640137,
351
+ "eval_rewards/margins": 5.500354290008545,
352
+ "eval_rewards/rejected": -10.768467903137207,
353
+ "eval_runtime": 462.7794,
354
+ "eval_samples_per_second": 11.077,
355
+ "eval_steps_per_second": 0.348,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.5374280230326296,
360
+ "grad_norm": 31.413529562290883,
361
  "learning_rate": 2.600664850273538e-07,
362
+ "logits/chosen": -1.7659847736358643,
363
+ "logits/rejected": -1.4307024478912354,
364
+ "logps/chosen": -810.8310546875,
365
+ "logps/rejected": -1331.2171630859375,
366
+ "loss": 0.25,
367
+ "rewards/accuracies": 0.8343750238418579,
368
+ "rewards/chosen": -5.195899486541748,
369
+ "rewards/margins": 5.097092151641846,
370
+ "rewards/rejected": -10.292991638183594,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.5630198336532309,
375
+ "grad_norm": 27.081174599726534,
376
  "learning_rate": 2.3769816112703045e-07,
377
+ "logits/chosen": -1.6221271753311157,
378
+ "logits/rejected": -1.2494385242462158,
379
+ "logps/chosen": -812.7271728515625,
380
+ "logps/rejected": -1295.7265625,
381
+ "loss": 0.2522,
382
+ "rewards/accuracies": 0.846875011920929,
383
+ "rewards/chosen": -5.059515953063965,
384
+ "rewards/margins": 4.7599945068359375,
385
+ "rewards/rejected": -9.819511413574219,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.5886116442738324,
390
+ "grad_norm": 17.333742936417842,
391
  "learning_rate": 2.1542832120881677e-07,
392
+ "logits/chosen": -1.681460976600647,
393
+ "logits/rejected": -1.3104435205459595,
394
+ "logps/chosen": -759.7596435546875,
395
+ "logps/rejected": -1202.4127197265625,
396
+ "loss": 0.2744,
397
+ "rewards/accuracies": 0.809374988079071,
398
+ "rewards/chosen": -4.643033027648926,
399
+ "rewards/margins": 4.396432876586914,
400
+ "rewards/rejected": -9.039464950561523,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.6142034548944337,
405
+ "grad_norm": 26.442974160839057,
406
  "learning_rate": 1.934352493925695e-07,
407
+ "logits/chosen": -1.5410155057907104,
408
+ "logits/rejected": -1.2406994104385376,
409
+ "logps/chosen": -764.8502197265625,
410
+ "logps/rejected": -1239.8050537109375,
411
+ "loss": 0.3065,
412
+ "rewards/accuracies": 0.828125,
413
+ "rewards/chosen": -4.678874492645264,
414
+ "rewards/margins": 4.619391441345215,
415
+ "rewards/rejected": -9.29826545715332,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.6397952655150352,
420
+ "grad_norm": 25.27878081759285,
421
  "learning_rate": 1.7189501409486059e-07,
422
+ "logits/chosen": -1.6293185949325562,
423
+ "logits/rejected": -1.3014212846755981,
424
+ "logps/chosen": -790.1380004882812,
425
+ "logps/rejected": -1304.321044921875,
426
+ "loss": 0.243,
427
+ "rewards/accuracies": 0.859375,
428
+ "rewards/chosen": -4.863690376281738,
429
+ "rewards/margins": 5.004323482513428,
430
+ "rewards/rejected": -9.868013381958008,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.6653870761356366,
435
+ "grad_norm": 33.57879613298401,
436
  "learning_rate": 1.5098005849021078e-07,
437
+ "logits/chosen": -1.5503108501434326,
438
+ "logits/rejected": -1.2434519529342651,
439
+ "logps/chosen": -808.1937255859375,
440
+ "logps/rejected": -1313.421875,
441
+ "loss": 0.2634,
442
+ "rewards/accuracies": 0.809374988079071,
443
+ "rewards/chosen": -5.169132709503174,
444
+ "rewards/margins": 4.917426109313965,
445
+ "rewards/rejected": -10.086559295654297,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.690978886756238,
450
+ "grad_norm": 29.480034521734677,
451
  "learning_rate": 1.30857819994673e-07,
452
+ "logits/chosen": -1.4918172359466553,
453
+ "logits/rejected": -1.1054009199142456,
454
+ "logps/chosen": -813.1369018554688,
455
+ "logps/rejected": -1310.81640625,
456
+ "loss": 0.2593,
457
+ "rewards/accuracies": 0.828125,
458
+ "rewards/chosen": -5.231433868408203,
459
+ "rewards/margins": 4.96165132522583,
460
+ "rewards/rejected": -10.193084716796875,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.7165706973768394,
465
+ "grad_norm": 31.86429892286265,
466
  "learning_rate": 1.116893898236716e-07,
467
+ "logits/chosen": -1.7091087102890015,
468
+ "logits/rejected": -1.4138612747192383,
469
+ "logps/chosen": -746.4378051757812,
470
+ "logps/rejected": -1237.638427734375,
471
+ "loss": 0.2394,
472
+ "rewards/accuracies": 0.831250011920929,
473
+ "rewards/chosen": -4.613304615020752,
474
+ "rewards/margins": 4.798872947692871,
475
+ "rewards/rejected": -9.412178039550781,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.7421625079974408,
480
+ "grad_norm": 17.438412834057193,
481
  "learning_rate": 9.362822335518062e-08,
482
+ "logits/chosen": -1.6236486434936523,
483
+ "logits/rejected": -1.2768752574920654,
484
+ "logps/chosen": -760.7342529296875,
485
+ "logps/rejected": -1256.605712890625,
486
+ "loss": 0.2524,
487
+ "rewards/accuracies": 0.840624988079071,
488
+ "rewards/chosen": -4.530442237854004,
489
+ "rewards/margins": 4.826045036315918,
490
+ "rewards/rejected": -9.356486320495605,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.7677543186180422,
495
+ "grad_norm": 16.423055035755162,
496
  "learning_rate": 7.681891162260015e-08,
497
+ "logits/chosen": -1.4279536008834839,
498
+ "logits/rejected": -1.0810502767562866,
499
+ "logps/chosen": -718.3739013671875,
500
+ "logps/rejected": -1134.891357421875,
501
+ "loss": 0.2604,
502
+ "rewards/accuracies": 0.815625011920929,
503
+ "rewards/chosen": -4.416540622711182,
504
+ "rewards/margins": 4.05845832824707,
505
+ "rewards/rejected": -8.47499942779541,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.7677543186180422,
510
+ "eval_logits/chosen": -1.5107975006103516,
511
+ "eval_logits/rejected": -1.1260188817977905,
512
+ "eval_logps/chosen": -751.716064453125,
513
+ "eval_logps/rejected": -1276.6983642578125,
514
+ "eval_loss": 0.24895673990249634,
515
+ "eval_rewards/accuracies": 0.8462733030319214,
516
+ "eval_rewards/chosen": -4.602590084075928,
517
+ "eval_rewards/margins": 5.129357814788818,
518
+ "eval_rewards/rejected": -9.73194694519043,
519
+ "eval_runtime": 461.9008,
520
+ "eval_samples_per_second": 11.098,
521
+ "eval_steps_per_second": 0.349,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.7933461292386437,
526
+ "grad_norm": 24.363324466471035,
527
  "learning_rate": 6.139602377230247e-08,
528
+ "logits/chosen": -1.552236557006836,
529
+ "logits/rejected": -1.2730642557144165,
530
+ "logps/chosen": -777.7206420898438,
531
+ "logps/rejected": -1248.6849365234375,
532
+ "loss": 0.2592,
533
+ "rewards/accuracies": 0.828125,
534
+ "rewards/chosen": -4.85605001449585,
535
+ "rewards/margins": 4.622461318969727,
536
+ "rewards/rejected": -9.478510856628418,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.818937939859245,
541
+ "grad_norm": 27.20082426255916,
542
  "learning_rate": 4.748302975270837e-08,
543
+ "logits/chosen": -1.5342400074005127,
544
+ "logits/rejected": -1.1317594051361084,
545
+ "logps/chosen": -757.9732666015625,
546
+ "logps/rejected": -1286.5670166015625,
547
+ "loss": 0.2424,
548
+ "rewards/accuracies": 0.831250011920929,
549
+ "rewards/chosen": -4.64093542098999,
550
+ "rewards/margins": 5.158129692077637,
551
+ "rewards/rejected": -9.799064636230469,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.8445297504798465,
556
+ "grad_norm": 25.05529887885906,
557
  "learning_rate": 3.5191311859445795e-08,
558
+ "logits/chosen": -1.6275312900543213,
559
+ "logits/rejected": -1.1424721479415894,
560
+ "logps/chosen": -807.8544311523438,
561
+ "logps/rejected": -1404.4608154296875,
562
+ "loss": 0.2424,
563
+ "rewards/accuracies": 0.8656250238418579,
564
+ "rewards/chosen": -5.022496223449707,
565
+ "rewards/margins": 5.926790237426758,
566
+ "rewards/rejected": -10.949285507202148,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.8701215611004478,
571
+ "grad_norm": 26.4529069102665,
572
  "learning_rate": 2.4619273049795996e-08,
573
+ "logits/chosen": -1.4998157024383545,
574
+ "logits/rejected": -1.1587274074554443,
575
+ "logps/chosen": -786.7511596679688,
576
+ "logps/rejected": -1342.9837646484375,
577
+ "loss": 0.2452,
578
+ "rewards/accuracies": 0.859375,
579
+ "rewards/chosen": -5.053293704986572,
580
+ "rewards/margins": 5.45106840133667,
581
+ "rewards/rejected": -10.504362106323242,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.8957133717210493,
586
+ "grad_norm": 29.52957594878875,
587
  "learning_rate": 1.5851549164932115e-08,
588
+ "logits/chosen": -1.5455517768859863,
589
+ "logits/rejected": -1.0819389820098877,
590
+ "logps/chosen": -793.215087890625,
591
+ "logps/rejected": -1346.713623046875,
592
+ "loss": 0.2349,
593
+ "rewards/accuracies": 0.8374999761581421,
594
+ "rewards/chosen": -5.030755519866943,
595
+ "rewards/margins": 5.420226573944092,
596
+ "rewards/rejected": -10.450983047485352,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.9213051823416507,
601
+ "grad_norm": 25.548406142288144,
602
  "learning_rate": 8.958331366609423e-09,
603
+ "logits/chosen": -1.5059678554534912,
604
+ "logits/rejected": -1.0360510349273682,
605
+ "logps/chosen": -829.7648315429688,
606
+ "logps/rejected": -1369.6993408203125,
607
+ "loss": 0.2633,
608
  "rewards/accuracies": 0.84375,
609
+ "rewards/chosen": -5.2703986167907715,
610
+ "rewards/margins": 5.340662479400635,
611
+ "rewards/rejected": -10.611061096191406,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.946896992962252,
616
+ "grad_norm": 21.11557499289859,
617
  "learning_rate": 3.994804212627461e-09,
618
+ "logits/chosen": -1.5344655513763428,
619
+ "logits/rejected": -1.0639396905899048,
620
+ "logps/chosen": -785.5733642578125,
621
+ "logps/rejected": -1357.8092041015625,
622
+ "loss": 0.2495,
623
  "rewards/accuracies": 0.8374999761581421,
624
+ "rewards/chosen": -4.919853210449219,
625
+ "rewards/margins": 5.675114154815674,
626
+ "rewards/rejected": -10.594966888427734,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.9724888035828535,
631
+ "grad_norm": 36.07951763974912,
632
  "learning_rate": 1.0007038696262516e-09,
633
+ "logits/chosen": -1.5291458368301392,
634
+ "logits/rejected": -0.9540739059448242,
635
+ "logps/chosen": -803.43017578125,
636
+ "logps/rejected": -1367.823486328125,
637
+ "loss": 0.2557,
638
+ "rewards/accuracies": 0.871874988079071,
639
+ "rewards/chosen": -5.007359504699707,
640
+ "rewards/margins": 5.686868667602539,
641
+ "rewards/rejected": -10.69422721862793,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.9980806142034548,
646
+ "grad_norm": 36.10595960803006,
647
  "learning_rate": 0.0,
648
+ "logits/chosen": -1.6231262683868408,
649
+ "logits/rejected": -1.1367409229278564,
650
+ "logps/chosen": -790.3123779296875,
651
+ "logps/rejected": -1342.495849609375,
652
+ "loss": 0.2454,
653
+ "rewards/accuracies": 0.840624988079071,
654
+ "rewards/chosen": -4.837635040283203,
655
+ "rewards/margins": 5.484267234802246,
656
+ "rewards/rejected": -10.32190227508545,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.9980806142034548,
661
  "step": 390,
662
  "total_flos": 0.0,
663
+ "train_loss": 0.32177775089557353,
664
+ "train_runtime": 13656.3788,
665
+ "train_samples_per_second": 3.661,
666
+ "train_steps_per_second": 0.029
667
  }
668
  ],
669
  "logging_steps": 10,