NicholasCorrado committed on
Commit
ea41ddd
·
verified ·
1 Parent(s): dadbb6b

Model save

Browse files
Files changed (4) hide show
  1. README.md +23 -25
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +963 -963
README.md CHANGED
@@ -3,33 +3,31 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
6
- - alignment-handbook
7
  - trl
8
  - dpo
 
9
  - generated_from_trainer
10
- datasets:
11
- - HuggingFaceH4/ultrafeedback_binarized
12
  model-index:
13
- - name: mixed_zephyr-7b-dpo-full
14
  results: []
15
  ---
16
 
17
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
  should probably proofread and complete it, then remove this comment. -->
19
 
20
- # mixed_zephyr-7b-dpo-full
21
 
22
- This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
23
  It achieves the following results on the evaluation set:
24
- - Logits/chosen: -0.3096
25
- - Logits/rejected: 0.6049
26
- - Logps/chosen: -755.9323
27
- - Logps/rejected: -1192.5621
28
- - Loss: 0.3152
29
- - Rewards/accuracies: 0.8184
30
- - Rewards/chosen: -4.6496
31
- - Rewards/margins: 4.3751
32
- - Rewards/rejected: -9.0247
33
 
34
  ## Model description
35
 
@@ -64,16 +62,16 @@ The following hyperparameters were used during training:
64
 
65
  ### Training results
66
 
67
- | Training Loss | Epoch | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
68
- |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
69
- | 0.5385 | 0.1152 | 100 | -2.9012 | -2.8749 | -433.4271 | -527.4997 | 0.4593 | 0.7539 | -1.4246 | 0.9495 | -2.3741 |
70
- | 0.4369 | 0.2303 | 200 | -1.5078 | -1.1798 | -594.2914 | -823.1062 | 0.3590 | 0.7915 | -3.0332 | 2.2969 | -5.3301 |
71
- | 0.4119 | 0.3455 | 300 | -0.6166 | -0.1140 | -677.2002 | -996.9340 | 0.3369 | 0.8156 | -3.8623 | 3.2061 | -7.0684 |
72
- | 0.3964 | 0.4607 | 400 | -0.6209 | 0.2313 | -753.4187 | -1128.0946 | 0.3311 | 0.8178 | -4.6245 | 3.7555 | -8.3800 |
73
- | 0.3858 | 0.5759 | 500 | -0.7776 | 0.1893 | -694.4181 | -1049.8429 | 0.3247 | 0.8167 | -4.0345 | 3.5630 | -7.5975 |
74
- | 0.4031 | 0.6910 | 600 | -0.2605 | 0.6163 | -748.3096 | -1143.1573 | 0.3191 | 0.8201 | -4.5734 | 3.9572 | -8.5306 |
75
- | 0.4007 | 0.8062 | 700 | -0.4982 | 0.4411 | -753.0112 | -1189.4250 | 0.3171 | 0.8178 | -4.6204 | 4.3729 | -8.9933 |
76
- | 0.3644 | 0.9214 | 800 | -0.3096 | 0.6049 | -755.9323 | -1192.5621 | 0.3152 | 0.8184 | -4.6496 | 4.3751 | -9.0247 |
77
 
78
 
79
  ### Framework versions
 
3
  license: apache-2.0
4
  base_model: alignment-handbook/zephyr-7b-sft-full
5
  tags:
 
6
  - trl
7
  - dpo
8
+ - alignment-handbook
9
  - generated_from_trainer
 
 
10
  model-index:
11
+ - name: zephyr-7b-dpo-full
12
  results: []
13
  ---
14
 
15
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
  should probably proofread and complete it, then remove this comment. -->
17
 
18
+ # zephyr-7b-dpo-full
19
 
20
+ This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.3161
23
+ - Rewards/chosen: -4.1174
24
+ - Rewards/rejected: -8.3387
25
+ - Rewards/accuracies: 0.8212
26
+ - Rewards/margins: 4.2213
27
+ - Logps/rejected: -1123.9625
28
+ - Logps/chosen: -702.7068
29
+ - Logits/rejected: 0.5558
30
+ - Logits/chosen: -0.4246
31
 
32
  ## Model description
33
 
 
62
 
63
  ### Training results
64
 
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 0.5381 | 0.1152 | 100 | 0.4758 | -1.9882 | -2.9171 | 0.7270 | 0.9288 | -581.7981 | -489.7893 | -2.8822 | -2.9045 |
68
+ | 0.4268 | 0.2303 | 200 | 0.3577 | -3.9068 | -6.8487 | 0.7976 | 2.9419 | -974.9606 | -681.6494 | -0.6781 | -0.9791 |
69
+ | 0.4067 | 0.3455 | 300 | 0.3411 | -3.9757 | -7.6481 | 0.8094 | 3.6724 | -1054.9027 | -688.5351 | -0.6642 | -1.2474 |
70
+ | 0.4011 | 0.4607 | 400 | 0.3295 | -4.4449 | -8.4011 | 0.8156 | 3.9562 | -1130.1991 | -735.4550 | 0.1183 | -0.7429 |
71
+ | 0.3727 | 0.5759 | 500 | 0.3260 | -3.7203 | -7.6540 | 0.8161 | 3.9337 | -1055.4913 | -662.9987 | -0.4066 | -1.3009 |
72
+ | 0.3933 | 0.6910 | 600 | 0.3190 | -3.7331 | -7.5182 | 0.8257 | 3.7851 | -1041.9088 | -664.2776 | 0.3247 | -0.5819 |
73
+ | 0.3858 | 0.8062 | 700 | 0.3166 | -3.9569 | -8.0356 | 0.8246 | 4.0787 | -1093.6547 | -686.6614 | 0.3586 | -0.6058 |
74
+ | 0.3785 | 0.9214 | 800 | 0.3161 | -4.1174 | -8.3387 | 0.8212 | 4.2213 | -1123.9625 | -702.7068 | 0.5558 | -0.4246 |
75
 
76
 
77
  ### Framework versions
all_results.json CHANGED
@@ -14,9 +14,9 @@
14
  "eval_samples_per_second": 11.048,
15
  "eval_steps_per_second": 0.346,
16
  "total_flos": 0.0,
17
- "train_loss": 0.0,
18
- "train_runtime": 0.0211,
19
  "train_samples": 111134,
20
- "train_samples_per_second": 5273498.215,
21
- "train_steps_per_second": 41188.083
22
  }
 
14
  "eval_samples_per_second": 11.048,
15
  "eval_steps_per_second": 0.346,
16
  "total_flos": 0.0,
17
+ "train_loss": 0.42924998652550483,
18
+ "train_runtime": 32201.1967,
19
  "train_samples": 111134,
20
+ "train_samples_per_second": 3.451,
21
+ "train_steps_per_second": 0.027
22
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9997120644975526,
3
  "total_flos": 0.0,
4
- "train_loss": 0.0,
5
- "train_runtime": 0.0211,
6
  "train_samples": 111134,
7
- "train_samples_per_second": 5273498.215,
8
- "train_steps_per_second": 41188.083
9
  }
 
1
  {
2
  "epoch": 0.9997120644975526,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.42924998652550483,
5
+ "train_runtime": 32201.1967,
6
  "train_samples": 111134,
7
+ "train_samples_per_second": 3.451,
8
+ "train_steps_per_second": 0.027
9
  }
trainer_state.json CHANGED
@@ -9,13 +9,13 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "grad_norm": 13.015832712288159,
14
- "learning_rate": 5e-07,
15
- "logits/chosen": -2.605381965637207,
16
- "logits/rejected": -2.5362534523010254,
17
- "logps/chosen": -197.4033660888672,
18
- "logps/rejected": -176.15130615234375,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -25,1430 +25,1430 @@
25
  },
26
  {
27
  "epoch": 0.01151742009789807,
28
- "grad_norm": 8.230031374538095,
29
  "learning_rate": 5.747126436781609e-08,
30
- "logits/chosen": -2.797184705734253,
31
- "logits/rejected": -2.768812417984009,
32
- "logps/chosen": -266.24053955078125,
33
- "logps/rejected": -265.971923828125,
34
- "loss": 0.6928,
35
- "rewards/accuracies": 0.4305555522441864,
36
- "rewards/chosen": -0.00021778659720439464,
37
- "rewards/margins": -0.00010571091843303293,
38
- "rewards/rejected": -0.00011207569332327694,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.02303484019579614,
43
- "grad_norm": 8.31009452460146,
44
  "learning_rate": 1.1494252873563217e-07,
45
- "logits/chosen": -2.802431583404541,
46
- "logits/rejected": -2.773219347000122,
47
- "logps/chosen": -287.32781982421875,
48
- "logps/rejected": -273.28900146484375,
49
- "loss": 0.6923,
50
  "rewards/accuracies": 0.5406249761581421,
51
- "rewards/chosen": 0.0006046505295671523,
52
- "rewards/margins": 0.0014849099097773433,
53
- "rewards/rejected": -0.0008802594384178519,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.03455226029369421,
58
- "grad_norm": 8.066889291282722,
59
  "learning_rate": 1.7241379310344828e-07,
60
- "logits/chosen": -2.804356098175049,
61
- "logits/rejected": -2.7821590900421143,
62
- "logps/chosen": -278.156494140625,
63
- "logps/rejected": -270.8301086425781,
64
- "loss": 0.6905,
65
- "rewards/accuracies": 0.684374988079071,
66
- "rewards/chosen": 0.0025812473613768816,
67
- "rewards/margins": 0.005251543130725622,
68
- "rewards/rejected": -0.0026702960021793842,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.04606968039159228,
73
- "grad_norm": 8.939044393747595,
74
  "learning_rate": 2.2988505747126435e-07,
75
- "logits/chosen": -2.8080034255981445,
76
- "logits/rejected": -2.7811412811279297,
77
- "logps/chosen": -272.1091003417969,
78
- "logps/rejected": -268.6837158203125,
79
- "loss": 0.6858,
80
- "rewards/accuracies": 0.703125,
81
- "rewards/chosen": 0.007119017653167248,
82
- "rewards/margins": 0.016155635938048363,
83
- "rewards/rejected": -0.009036618284881115,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.05758710048949035,
88
- "grad_norm": 9.805284456793881,
89
  "learning_rate": 2.873563218390804e-07,
90
- "logits/chosen": -2.834063768386841,
91
- "logits/rejected": -2.7892394065856934,
92
- "logps/chosen": -284.08453369140625,
93
- "logps/rejected": -282.91802978515625,
94
- "loss": 0.6762,
95
- "rewards/accuracies": 0.75,
96
- "rewards/chosen": 0.014029329642653465,
97
- "rewards/margins": 0.03542623296380043,
98
- "rewards/rejected": -0.021396907046437263,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.06910452058738842,
103
- "grad_norm": 8.352607046334498,
104
  "learning_rate": 3.4482758620689656e-07,
105
- "logits/chosen": -2.805022716522217,
106
- "logits/rejected": -2.796321392059326,
107
- "logps/chosen": -292.1920166015625,
108
- "logps/rejected": -302.4415588378906,
109
- "loss": 0.6624,
110
- "rewards/accuracies": 0.71875,
111
- "rewards/chosen": 0.006681998260319233,
112
- "rewards/margins": 0.06977846473455429,
113
- "rewards/rejected": -0.06309647113084793,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.0806219406852865,
118
- "grad_norm": 9.823702522936284,
119
  "learning_rate": 4.0229885057471266e-07,
120
- "logits/chosen": -2.75339674949646,
121
- "logits/rejected": -2.751986026763916,
122
- "logps/chosen": -281.77618408203125,
123
- "logps/rejected": -300.4095153808594,
124
- "loss": 0.6322,
125
- "rewards/accuracies": 0.734375,
126
- "rewards/chosen": -0.07559685409069061,
127
- "rewards/margins": 0.150864839553833,
128
- "rewards/rejected": -0.22646169364452362,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.09213936078318456,
133
- "grad_norm": 11.550756640744595,
134
  "learning_rate": 4.597701149425287e-07,
135
- "logits/chosen": -2.8751022815704346,
136
- "logits/rejected": -2.8525900840759277,
137
- "logps/chosen": -316.79888916015625,
138
- "logps/rejected": -340.1561584472656,
139
- "loss": 0.5913,
140
- "rewards/accuracies": 0.765625,
141
- "rewards/chosen": -0.24550755321979523,
142
- "rewards/margins": 0.29761967062950134,
143
- "rewards/rejected": -0.5431272387504578,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.10365678088108264,
148
- "grad_norm": 18.102002209139584,
149
  "learning_rate": 4.999817969178237e-07,
150
- "logits/chosen": -2.8152594566345215,
151
- "logits/rejected": -2.7724924087524414,
152
- "logps/chosen": -363.1444396972656,
153
- "logps/rejected": -401.7603759765625,
154
- "loss": 0.5547,
155
- "rewards/accuracies": 0.746874988079071,
156
- "rewards/chosen": -0.6734243631362915,
157
- "rewards/margins": 0.5091755986213684,
158
- "rewards/rejected": -1.1825997829437256,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.1151742009789807,
163
- "grad_norm": 15.592173368744417,
164
  "learning_rate": 4.996582603056428e-07,
165
- "logits/chosen": -2.7807068824768066,
166
- "logits/rejected": -2.75152325630188,
167
- "logps/chosen": -403.1298828125,
168
- "logps/rejected": -451.24072265625,
169
- "loss": 0.5385,
170
- "rewards/accuracies": 0.7093750238418579,
171
- "rewards/chosen": -1.0208370685577393,
172
- "rewards/margins": 0.6278557181358337,
173
- "rewards/rejected": -1.6486928462982178,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.1151742009789807,
178
- "eval_logits/chosen": -2.9011571407318115,
179
- "eval_logits/rejected": -2.874889373779297,
180
- "eval_logps/chosen": -433.42706298828125,
181
- "eval_logps/rejected": -527.4996948242188,
182
- "eval_loss": 0.45933064818382263,
183
- "eval_rewards/accuracies": 0.753923773765564,
184
- "eval_rewards/chosen": -1.424589991569519,
185
- "eval_rewards/margins": 0.9494837522506714,
186
- "eval_rewards/rejected": -2.3740737438201904,
187
- "eval_runtime": 651.6627,
188
- "eval_samples_per_second": 10.935,
189
- "eval_steps_per_second": 0.342,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.12669162107687879,
194
- "grad_norm": 17.802035855151065,
195
  "learning_rate": 4.989308132738126e-07,
196
- "logits/chosen": -2.731767416000366,
197
- "logits/rejected": -2.702854633331299,
198
- "logps/chosen": -390.03009033203125,
199
- "logps/rejected": -461.499755859375,
200
- "loss": 0.4959,
201
- "rewards/accuracies": 0.6812499761581421,
202
- "rewards/chosen": -1.160954236984253,
203
- "rewards/margins": 0.7525253295898438,
204
- "rewards/rejected": -1.9134795665740967,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.13820904117477684,
209
- "grad_norm": 26.847609346017396,
210
  "learning_rate": 4.978006327248536e-07,
211
- "logits/chosen": -2.6494832038879395,
212
- "logits/rejected": -2.6402511596679688,
213
- "logps/chosen": -438.6656799316406,
214
- "logps/rejected": -550.1033325195312,
215
- "loss": 0.475,
216
- "rewards/accuracies": 0.7437499761581421,
217
- "rewards/chosen": -1.6023308038711548,
218
- "rewards/margins": 1.078300952911377,
219
- "rewards/rejected": -2.680631637573242,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.14972646127267492,
224
- "grad_norm": 25.087856993190254,
225
  "learning_rate": 4.962695471250032e-07,
226
- "logits/chosen": -2.4692533016204834,
227
- "logits/rejected": -2.435044050216675,
228
- "logps/chosen": -499.8922424316406,
229
- "logps/rejected": -645.5679931640625,
230
- "loss": 0.468,
231
- "rewards/accuracies": 0.746874988079071,
232
- "rewards/chosen": -2.0733580589294434,
233
- "rewards/margins": 1.5583977699279785,
234
- "rewards/rejected": -3.631755828857422,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.161243881370573,
239
- "grad_norm": 26.974432330966298,
240
  "learning_rate": 4.94340033546025e-07,
241
- "logits/chosen": -1.697016716003418,
242
- "logits/rejected": -1.593400239944458,
243
- "logps/chosen": -511.65814208984375,
244
- "logps/rejected": -659.9658813476562,
245
- "loss": 0.4654,
246
- "rewards/accuracies": 0.746874988079071,
247
- "rewards/chosen": -2.305452823638916,
248
- "rewards/margins": 1.5949369668960571,
249
- "rewards/rejected": -3.9003894329071045,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.17276130146847107,
254
- "grad_norm": 21.115401587052915,
255
  "learning_rate": 4.920152136576705e-07,
256
- "logits/chosen": -1.4327126741409302,
257
- "logits/rejected": -1.2659103870391846,
258
- "logps/chosen": -538.796630859375,
259
- "logps/rejected": -664.4251098632812,
260
- "loss": 0.4789,
261
- "rewards/accuracies": 0.753125011920929,
262
- "rewards/chosen": -2.33674955368042,
263
- "rewards/margins": 1.4603914022445679,
264
- "rewards/rejected": -3.7971413135528564,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.18427872156636912,
269
- "grad_norm": 24.637364700318916,
270
  "learning_rate": 4.892988486772756e-07,
271
- "logits/chosen": -1.4591898918151855,
272
- "logits/rejected": -1.3274848461151123,
273
- "logps/chosen": -468.7333068847656,
274
- "logps/rejected": -612.8162841796875,
275
- "loss": 0.4462,
276
- "rewards/accuracies": 0.7749999761581421,
277
- "rewards/chosen": -1.950823187828064,
278
- "rewards/margins": 1.431302785873413,
279
- "rewards/rejected": -3.3821263313293457,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.1957961416642672,
284
- "grad_norm": 27.13923752480491,
285
  "learning_rate": 4.861953332846629e-07,
286
- "logits/chosen": -1.2759544849395752,
287
- "logits/rejected": -1.0808634757995605,
288
- "logps/chosen": -469.6282653808594,
289
- "logps/rejected": -628.2378540039062,
290
- "loss": 0.444,
291
- "rewards/accuracies": 0.7593749761581421,
292
- "rewards/chosen": -2.0090174674987793,
293
- "rewards/margins": 1.5872033834457397,
294
- "rewards/rejected": -3.5962207317352295,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.20731356176216528,
299
- "grad_norm": 22.29941288426432,
300
  "learning_rate": 4.827096885121953e-07,
301
- "logits/chosen": -0.8839688301086426,
302
- "logits/rejected": -0.664128839969635,
303
- "logps/chosen": -591.6177978515625,
304
- "logps/rejected": -778.9203491210938,
305
- "loss": 0.4486,
306
- "rewards/accuracies": 0.7406250238418579,
307
- "rewards/chosen": -3.2479281425476074,
308
- "rewards/margins": 1.83078134059906,
309
- "rewards/rejected": -5.078709125518799,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.21883098186006333,
314
- "grad_norm": 20.150152801800882,
315
  "learning_rate": 4.788475536214821e-07,
316
- "logits/chosen": -1.1295298337936401,
317
- "logits/rejected": -0.8731690645217896,
318
- "logps/chosen": -518.4920654296875,
319
- "logps/rejected": -677.3343505859375,
320
- "loss": 0.4248,
321
- "rewards/accuracies": 0.778124988079071,
322
- "rewards/chosen": -2.0726380348205566,
323
- "rewards/margins": 1.7125848531723022,
324
- "rewards/rejected": -3.7852234840393066,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.2303484019579614,
329
- "grad_norm": 24.4341951464939,
330
  "learning_rate": 4.746151769798818e-07,
331
- "logits/chosen": -0.9307588338851929,
332
- "logits/rejected": -0.6262258291244507,
333
- "logps/chosen": -524.0397338867188,
334
- "logps/rejected": -701.8967895507812,
335
- "loss": 0.4369,
336
- "rewards/accuracies": 0.7437499761581421,
337
- "rewards/chosen": -2.360715389251709,
338
- "rewards/margins": 1.8277909755706787,
339
- "rewards/rejected": -4.188506603240967,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.2303484019579614,
344
- "eval_logits/chosen": -1.5077687501907349,
345
- "eval_logits/rejected": -1.1797598600387573,
346
- "eval_logps/chosen": -594.2913818359375,
347
- "eval_logps/rejected": -823.106201171875,
348
- "eval_loss": 0.3589639961719513,
349
- "eval_rewards/accuracies": 0.7914798259735107,
350
- "eval_rewards/chosen": -3.033234119415283,
351
- "eval_rewards/margins": 2.2969048023223877,
352
- "eval_rewards/rejected": -5.330138683319092,
353
- "eval_runtime": 650.6064,
354
- "eval_samples_per_second": 10.953,
355
- "eval_steps_per_second": 0.343,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.2418658220558595,
360
- "grad_norm": 27.352856519591263,
361
  "learning_rate": 4.7001940595156055e-07,
362
- "logits/chosen": -0.7815187573432922,
363
- "logits/rejected": -0.46700936555862427,
364
- "logps/chosen": -518.8436279296875,
365
- "logps/rejected": -683.1966552734375,
366
- "loss": 0.4274,
367
- "rewards/accuracies": 0.7250000238418579,
368
- "rewards/chosen": -2.385855197906494,
369
- "rewards/margins": 1.621694564819336,
370
- "rewards/rejected": -4.00754976272583,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.25338324215375757,
375
- "grad_norm": 29.897947419384028,
376
  "learning_rate": 4.650676758194623e-07,
377
- "logits/chosen": -0.5421683192253113,
378
- "logits/rejected": -0.02623056247830391,
379
- "logps/chosen": -606.1685791015625,
380
- "logps/rejected": -831.0916137695312,
381
- "loss": 0.4012,
382
- "rewards/accuracies": 0.765625,
383
- "rewards/chosen": -3.0587515830993652,
384
- "rewards/margins": 2.499514102935791,
385
- "rewards/rejected": -5.558266639709473,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.26490066225165565,
390
- "grad_norm": 28.31850344555953,
391
  "learning_rate": 4.5976799775611215e-07,
392
- "logits/chosen": -0.28304657340049744,
393
- "logits/rejected": 0.2166980504989624,
394
- "logps/chosen": -565.9539794921875,
395
- "logps/rejected": -766.6756591796875,
396
- "loss": 0.4392,
397
- "rewards/accuracies": 0.796875,
398
- "rewards/chosen": -2.7024905681610107,
399
- "rewards/margins": 2.067142963409424,
400
- "rewards/rejected": -4.7696332931518555,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.2764180823495537,
405
- "grad_norm": 25.790552553148434,
406
  "learning_rate": 4.5412894586271543e-07,
407
- "logits/chosen": -0.3281463384628296,
408
- "logits/rejected": 0.12199939787387848,
409
- "logps/chosen": -534.4832763671875,
410
- "logps/rejected": -700.3882446289062,
411
- "loss": 0.4403,
412
- "rewards/accuracies": 0.793749988079071,
413
- "rewards/chosen": -2.3464341163635254,
414
- "rewards/margins": 1.9047329425811768,
415
- "rewards/rejected": -4.251167297363281,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.28793550244745175,
420
- "grad_norm": 29.425669097369397,
421
  "learning_rate": 4.481596432975201e-07,
422
- "logits/chosen": -0.6021678447723389,
423
- "logits/rejected": -0.20536144077777863,
424
- "logps/chosen": -615.7349853515625,
425
- "logps/rejected": -839.0997924804688,
426
- "loss": 0.4298,
427
- "rewards/accuracies": 0.7593749761581421,
428
- "rewards/chosen": -3.1481640338897705,
429
- "rewards/margins": 2.2502574920654297,
430
- "rewards/rejected": -5.398421764373779,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.29945292254534983,
435
- "grad_norm": 23.62933629230091,
436
  "learning_rate": 4.41869747515886e-07,
437
- "logits/chosen": -0.2845512330532074,
438
- "logits/rejected": 0.14756298065185547,
439
- "logps/chosen": -572.5442504882812,
440
- "logps/rejected": -812.703125,
441
- "loss": 0.3968,
442
- "rewards/accuracies": 0.768750011920929,
443
- "rewards/chosen": -2.856945514678955,
444
- "rewards/margins": 2.3578898906707764,
445
- "rewards/rejected": -5.214835166931152,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.3109703426432479,
450
- "grad_norm": 36.01630964835951,
451
  "learning_rate": 4.352694346459396e-07,
452
- "logits/chosen": -0.057602040469646454,
453
- "logits/rejected": 0.40555334091186523,
454
- "logps/chosen": -587.2971801757812,
455
- "logps/rejected": -866.1613159179688,
456
- "loss": 0.4006,
457
  "rewards/accuracies": 0.784375011920929,
458
- "rewards/chosen": -3.123883008956909,
459
- "rewards/margins": 2.7192797660827637,
460
- "rewards/rejected": -5.84316349029541,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.322487762741146,
465
- "grad_norm": 26.73415377993604,
466
  "learning_rate": 4.2836938302509256e-07,
467
- "logits/chosen": -0.25706934928894043,
468
- "logits/rejected": 0.16837282478809357,
469
- "logps/chosen": -575.8345947265625,
470
- "logps/rejected": -808.24267578125,
471
- "loss": 0.4075,
472
- "rewards/accuracies": 0.778124988079071,
473
- "rewards/chosen": -2.9973578453063965,
474
- "rewards/margins": 2.355498790740967,
475
- "rewards/rejected": -5.352856636047363,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.33400518283904407,
480
- "grad_norm": 29.332592595497015,
481
  "learning_rate": 4.2118075592405874e-07,
482
- "logits/chosen": -0.3039420247077942,
483
- "logits/rejected": 0.07993211597204208,
484
- "logps/chosen": -582.0941162109375,
485
- "logps/rejected": -830.5714111328125,
486
- "loss": 0.3976,
487
- "rewards/accuracies": 0.793749988079071,
488
- "rewards/chosen": -2.9472875595092773,
489
- "rewards/margins": 2.5015506744384766,
490
- "rewards/rejected": -5.448838233947754,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.34552260293694215,
495
- "grad_norm": 30.91612291215343,
496
  "learning_rate": 4.137151834863213e-07,
497
- "logits/chosen": -0.10641048848628998,
498
- "logits/rejected": 0.6166712641716003,
499
- "logps/chosen": -632.7642822265625,
500
- "logps/rejected": -849.4898681640625,
501
- "loss": 0.4119,
502
- "rewards/accuracies": 0.778124988079071,
503
- "rewards/chosen": -3.399906873703003,
504
- "rewards/margins": 2.507375478744507,
505
- "rewards/rejected": -5.90728235244751,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.34552260293694215,
510
- "eval_logits/chosen": -0.6165890693664551,
511
- "eval_logits/rejected": -0.11399216204881668,
512
- "eval_logps/chosen": -677.2001953125,
513
- "eval_logps/rejected": -996.9340209960938,
514
- "eval_loss": 0.336904913187027,
515
- "eval_rewards/accuracies": 0.8155829310417175,
516
- "eval_rewards/chosen": -3.862321615219116,
517
- "eval_rewards/margins": 3.206094741821289,
518
- "eval_rewards/rejected": -7.068417072296143,
519
- "eval_runtime": 656.6921,
520
- "eval_samples_per_second": 10.851,
521
- "eval_steps_per_second": 0.34,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.35704002303484017,
526
- "grad_norm": 22.38837991601497,
527
  "learning_rate": 4.059847439122671e-07,
528
- "logits/chosen": -0.46659454703330994,
529
- "logits/rejected": 0.0826030969619751,
530
- "logps/chosen": -515.8815307617188,
531
- "logps/rejected": -717.310302734375,
532
- "loss": 0.4112,
533
- "rewards/accuracies": 0.765625,
534
- "rewards/chosen": -2.256371021270752,
535
- "rewards/margins": 2.008225679397583,
536
- "rewards/rejected": -4.264596462249756,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.36855744313273825,
541
- "grad_norm": 21.515754430109986,
542
  "learning_rate": 3.98001943918432e-07,
543
- "logits/chosen": -0.8846302032470703,
544
- "logits/rejected": -0.03813103586435318,
545
- "logps/chosen": -544.3895263671875,
546
- "logps/rejected": -746.3841552734375,
547
- "loss": 0.3939,
548
- "rewards/accuracies": 0.809374988079071,
549
- "rewards/chosen": -2.379772663116455,
550
- "rewards/margins": 2.3421151638031006,
551
- "rewards/rejected": -4.721888542175293,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.38007486323063633,
556
- "grad_norm": 33.71230207361674,
557
  "learning_rate": 3.8977969850346866e-07,
558
- "logits/chosen": 0.13661722838878632,
559
- "logits/rejected": 0.7041386365890503,
560
- "logps/chosen": -666.94482421875,
561
- "logps/rejected": -926.0341796875,
562
- "loss": 0.3873,
563
- "rewards/accuracies": 0.75,
564
- "rewards/chosen": -3.78490948677063,
565
- "rewards/margins": 2.646435022354126,
566
- "rewards/rejected": -6.431344509124756,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.3915922833285344,
571
- "grad_norm": 28.524858622055092,
572
  "learning_rate": 3.8133131005357465e-07,
573
- "logits/chosen": -0.015070567838847637,
574
- "logits/rejected": 0.6914359927177429,
575
- "logps/chosen": -646.4139404296875,
576
- "logps/rejected": -965.0103759765625,
577
- "loss": 0.3971,
578
- "rewards/accuracies": 0.78125,
579
- "rewards/chosen": -3.5984835624694824,
580
- "rewards/margins": 3.210897922515869,
581
- "rewards/rejected": -6.809381008148193,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.4031097034264325,
586
- "grad_norm": 32.078697347416266,
587
  "learning_rate": 3.7267044682118435e-07,
588
- "logits/chosen": -0.002132108900696039,
589
- "logits/rejected": 0.7953078150749207,
590
- "logps/chosen": -604.9791259765625,
591
- "logps/rejected": -838.1949462890625,
592
- "loss": 0.4191,
593
- "rewards/accuracies": 0.768750011920929,
594
- "rewards/chosen": -3.1062846183776855,
595
- "rewards/margins": 2.339332342147827,
596
- "rewards/rejected": -5.445616722106934,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.41462712352433057,
601
- "grad_norm": 28.020517011807925,
602
  "learning_rate": 3.638111208117425e-07,
603
- "logits/chosen": -0.1473531574010849,
604
- "logits/rejected": 0.490295946598053,
605
- "logps/chosen": -583.7153930664062,
606
- "logps/rejected": -761.9363403320312,
607
- "loss": 0.4035,
608
- "rewards/accuracies": 0.765625,
609
- "rewards/chosen": -3.0424270629882812,
610
- "rewards/margins": 1.7358585596084595,
611
- "rewards/rejected": -4.778285026550293,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.42614454362222864,
616
- "grad_norm": 25.853288738352997,
617
  "learning_rate": 3.5476766511433605e-07,
618
- "logits/chosen": -0.25570568442344666,
619
- "logits/rejected": 0.6842668652534485,
620
- "logps/chosen": -590.0350341796875,
621
- "logps/rejected": -811.4537353515625,
622
- "loss": 0.3968,
623
- "rewards/accuracies": 0.793749988079071,
624
- "rewards/chosen": -3.002671480178833,
625
- "rewards/margins": 2.369654655456543,
626
- "rewards/rejected": -5.372325897216797,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.43766196372012667,
631
- "grad_norm": 21.591809702398923,
632
  "learning_rate": 3.455547107128602e-07,
633
- "logits/chosen": -0.12841393053531647,
634
- "logits/rejected": 0.6481091380119324,
635
- "logps/chosen": -580.2199096679688,
636
- "logps/rejected": -826.1383666992188,
637
- "loss": 0.3958,
638
- "rewards/accuracies": 0.800000011920929,
639
- "rewards/chosen": -3.195159435272217,
640
- "rewards/margins": 2.441926956176758,
641
- "rewards/rejected": -5.637085914611816,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.44917938381802475,
646
- "grad_norm": 39.83795352564531,
647
  "learning_rate": 3.361871628152338e-07,
648
- "logits/chosen": -0.23047828674316406,
649
- "logits/rejected": 0.7577739953994751,
650
- "logps/chosen": -605.4849853515625,
651
- "logps/rejected": -883.64501953125,
652
- "loss": 0.4085,
653
- "rewards/accuracies": 0.809374988079071,
654
- "rewards/chosen": -3.1104187965393066,
655
- "rewards/margins": 3.0135536193847656,
656
- "rewards/rejected": -6.123971939086914,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.4606968039159228,
661
- "grad_norm": 22.463302227367222,
662
  "learning_rate": 3.2668017673896077e-07,
663
- "logits/chosen": -0.22118684649467468,
664
- "logits/rejected": 0.6193957924842834,
665
- "logps/chosen": -640.8189697265625,
666
- "logps/rejected": -955.4924926757812,
667
- "loss": 0.3964,
668
- "rewards/accuracies": 0.809374988079071,
669
- "rewards/chosen": -3.495349884033203,
670
- "rewards/margins": 3.084470748901367,
671
- "rewards/rejected": -6.579820156097412,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.4606968039159228,
676
- "eval_logits/chosen": -0.6209221482276917,
677
- "eval_logits/rejected": 0.23131267726421356,
678
- "eval_logps/chosen": -753.418701171875,
679
- "eval_logps/rejected": -1128.0946044921875,
680
- "eval_loss": 0.33106523752212524,
681
- "eval_rewards/accuracies": 0.8178251385688782,
682
- "eval_rewards/chosen": -4.624506950378418,
683
- "eval_rewards/margins": 3.7555172443389893,
684
- "eval_rewards/rejected": -8.380023956298828,
685
- "eval_runtime": 655.865,
686
- "eval_samples_per_second": 10.865,
687
- "eval_steps_per_second": 0.34,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.4722142240138209,
692
- "grad_norm": 27.33004967085911,
693
  "learning_rate": 3.1704913339205103e-07,
694
- "logits/chosen": 0.38320040702819824,
695
- "logits/rejected": 1.2441421747207642,
696
- "logps/chosen": -592.7208862304688,
697
- "logps/rejected": -816.4508666992188,
698
- "loss": 0.407,
699
- "rewards/accuracies": 0.7875000238418579,
700
- "rewards/chosen": -3.1608848571777344,
701
- "rewards/margins": 2.369687080383301,
702
- "rewards/rejected": -5.530571937561035,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.483731644111719,
707
- "grad_norm": 29.22769569320565,
708
  "learning_rate": 3.0730961438896885e-07,
709
- "logits/chosen": -0.32711368799209595,
710
- "logits/rejected": 0.6167188882827759,
711
- "logps/chosen": -647.5065307617188,
712
- "logps/rejected": -920.5850830078125,
713
- "loss": 0.3864,
714
- "rewards/accuracies": 0.815625011920929,
715
- "rewards/chosen": -3.577653408050537,
716
- "rewards/margins": 2.697723865509033,
717
- "rewards/rejected": -6.27537727355957,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.49524906420961706,
722
- "grad_norm": 29.1628367265211,
723
  "learning_rate": 2.9747737684186795e-07,
724
- "logits/chosen": -0.8004047274589539,
725
- "logits/rejected": 0.0654061958193779,
726
- "logps/chosen": -586.2633056640625,
727
- "logps/rejected": -828.8479614257812,
728
- "loss": 0.4008,
729
- "rewards/accuracies": 0.800000011920929,
730
- "rewards/chosen": -3.018512487411499,
731
- "rewards/margins": 2.515615701675415,
732
- "rewards/rejected": -5.534128189086914,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.5067664843075151,
737
- "grad_norm": 43.05788588925481,
738
  "learning_rate": 2.8756832786789663e-07,
739
- "logits/chosen": -0.7165388464927673,
740
- "logits/rejected": 0.3907933533191681,
741
- "logps/chosen": -558.6912231445312,
742
- "logps/rejected": -839.3739013671875,
743
- "loss": 0.3988,
744
- "rewards/accuracies": 0.831250011920929,
745
- "rewards/chosen": -2.8957457542419434,
746
- "rewards/margins": 2.8470349311828613,
747
- "rewards/rejected": -5.742780685424805,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.5182839044054132,
752
- "grad_norm": 26.95003512302597,
753
  "learning_rate": 2.7759849885381747e-07,
754
- "logits/chosen": -0.43579286336898804,
755
- "logits/rejected": 0.7088162302970886,
756
- "logps/chosen": -564.5299072265625,
757
- "logps/rejected": -807.0545043945312,
758
- "loss": 0.3965,
759
- "rewards/accuracies": 0.778124988079071,
760
- "rewards/chosen": -2.7139556407928467,
761
- "rewards/margins": 2.583310127258301,
762
- "rewards/rejected": -5.297266483306885,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.5298013245033113,
767
- "grad_norm": 37.40829093424466,
768
  "learning_rate": 2.675840195195762e-07,
769
- "logits/chosen": -0.4753951132297516,
770
- "logits/rejected": 0.5207837224006653,
771
- "logps/chosen": -559.075927734375,
772
- "logps/rejected": -858.9351806640625,
773
- "loss": 0.3858,
774
- "rewards/accuracies": 0.809374988079071,
775
- "rewards/chosen": -2.8625900745391846,
776
- "rewards/margins": 2.9560627937316895,
777
- "rewards/rejected": -5.818652153015137,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.5413187446012093,
782
- "grad_norm": 28.860389068235733,
783
  "learning_rate": 2.575410918227829e-07,
784
- "logits/chosen": -0.4289991855621338,
785
- "logits/rejected": 0.41408976912498474,
786
- "logps/chosen": -583.07763671875,
787
- "logps/rejected": -848.7003784179688,
788
- "loss": 0.3851,
789
- "rewards/accuracies": 0.75,
790
- "rewards/chosen": -2.932926654815674,
791
- "rewards/margins": 2.7647881507873535,
792
- "rewards/rejected": -5.697714805603027,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 0.5528361646991073,
797
- "grad_norm": 25.478968182398468,
798
  "learning_rate": 2.474859637463226e-07,
799
- "logits/chosen": 0.019112158566713333,
800
- "logits/rejected": 0.9573495984077454,
801
- "logps/chosen": -578.31005859375,
802
- "logps/rejected": -817.9622192382812,
803
- "loss": 0.4001,
804
- "rewards/accuracies": 0.753125011920929,
805
- "rewards/chosen": -3.071147918701172,
806
- "rewards/margins": 2.528298854827881,
807
- "rewards/rejected": -5.599446773529053,
808
  "step": 480
809
  },
810
  {
811
  "epoch": 0.5643535847970055,
812
- "grad_norm": 22.69267875960799,
813
  "learning_rate": 2.3743490301150355e-07,
814
- "logits/chosen": 0.03456907719373703,
815
- "logits/rejected": 0.9821624755859375,
816
- "logps/chosen": -616.0007934570312,
817
- "logps/rejected": -855.6959228515625,
818
- "loss": 0.395,
819
- "rewards/accuracies": 0.78125,
820
- "rewards/chosen": -3.1994495391845703,
821
- "rewards/margins": 2.4747273921966553,
822
- "rewards/rejected": -5.6741766929626465,
823
  "step": 490
824
  },
825
  {
826
  "epoch": 0.5758710048949035,
827
- "grad_norm": 26.70832967985792,
828
  "learning_rate": 2.274041707592724e-07,
829
- "logits/chosen": -0.4122609496116638,
830
- "logits/rejected": 0.6060948371887207,
831
- "logps/chosen": -594.303466796875,
832
- "logps/rejected": -892.1234130859375,
833
- "loss": 0.3858,
834
- "rewards/accuracies": 0.8125,
835
- "rewards/chosen": -3.213183879852295,
836
- "rewards/margins": 2.860560894012451,
837
- "rewards/rejected": -6.0737457275390625,
838
  "step": 500
839
  },
840
  {
841
  "epoch": 0.5758710048949035,
842
- "eval_logits/chosen": -0.7776147127151489,
843
- "eval_logits/rejected": 0.18928049504756927,
844
- "eval_logps/chosen": -694.4180908203125,
845
- "eval_logps/rejected": -1049.8428955078125,
846
- "eval_loss": 0.3246955871582031,
847
- "eval_rewards/accuracies": 0.8167040348052979,
848
- "eval_rewards/chosen": -4.034500598907471,
849
- "eval_rewards/margins": 3.563004732131958,
850
- "eval_rewards/rejected": -7.59750509262085,
851
- "eval_runtime": 874.6942,
852
- "eval_samples_per_second": 8.147,
853
- "eval_steps_per_second": 0.255,
854
  "step": 500
855
  },
856
  {
857
  "epoch": 0.5873884249928016,
858
- "grad_norm": 21.857166040982808,
859
  "learning_rate": 2.17409995242075e-07,
860
- "logits/chosen": -0.3013337552547455,
861
- "logits/rejected": 0.687148928642273,
862
- "logps/chosen": -590.9053955078125,
863
- "logps/rejected": -848.279296875,
864
- "loss": 0.3623,
865
- "rewards/accuracies": 0.800000011920929,
866
- "rewards/chosen": -3.1760306358337402,
867
- "rewards/margins": 2.6910133361816406,
868
- "rewards/rejected": -5.867043972015381,
869
  "step": 510
870
  },
871
  {
872
  "epoch": 0.5989058450906997,
873
- "grad_norm": 32.93018464240502,
874
  "learning_rate": 2.0746854556892544e-07,
875
- "logits/chosen": -0.28416475653648376,
876
- "logits/rejected": 0.760982871055603,
877
- "logps/chosen": -584.7510986328125,
878
- "logps/rejected": -825.0016479492188,
879
- "loss": 0.3654,
880
  "rewards/accuracies": 0.8187500238418579,
881
- "rewards/chosen": -3.0093677043914795,
882
- "rewards/margins": 2.5775859355926514,
883
- "rewards/rejected": -5.586953639984131,
884
  "step": 520
885
  },
886
  {
887
  "epoch": 0.6104232651885978,
888
- "grad_norm": 31.84439684571111,
889
  "learning_rate": 1.9759590554616173e-07,
890
- "logits/chosen": -0.21416716277599335,
891
- "logits/rejected": 0.8462156057357788,
892
- "logps/chosen": -591.3154296875,
893
- "logps/rejected": -826.24853515625,
894
- "loss": 0.39,
895
- "rewards/accuracies": 0.7749999761581421,
896
- "rewards/chosen": -3.1136324405670166,
897
- "rewards/margins": 2.453207015991211,
898
- "rewards/rejected": -5.56683874130249,
899
  "step": 530
900
  },
901
  {
902
  "epoch": 0.6219406852864958,
903
- "grad_norm": 28.506645648712848,
904
  "learning_rate": 1.8780804765620746e-07,
905
- "logits/chosen": -0.06838655471801758,
906
- "logits/rejected": 1.0294172763824463,
907
- "logps/chosen": -577.9981689453125,
908
- "logps/rejected": -835.3642578125,
909
- "loss": 0.3793,
910
- "rewards/accuracies": 0.8062499761581421,
911
- "rewards/chosen": -2.834435224533081,
912
- "rewards/margins": 2.704789876937866,
913
- "rewards/rejected": -5.539225101470947,
914
  "step": 540
915
  },
916
  {
917
  "epoch": 0.6334581053843938,
918
- "grad_norm": 30.179970032375,
919
  "learning_rate": 1.7812080721643973e-07,
920
- "logits/chosen": -0.30736392736434937,
921
- "logits/rejected": 0.8852709531784058,
922
- "logps/chosen": -576.0755615234375,
923
- "logps/rejected": -836.8414306640625,
924
- "loss": 0.381,
925
- "rewards/accuracies": 0.8125,
926
- "rewards/chosen": -2.8606371879577637,
927
- "rewards/margins": 2.6888041496276855,
928
- "rewards/rejected": -5.549441337585449,
929
  "step": 550
930
  },
931
  {
932
  "epoch": 0.644975525482292,
933
- "grad_norm": 26.709457513505647,
934
  "learning_rate": 1.6854985675997063e-07,
935
- "logits/chosen": -0.26044386625289917,
936
- "logits/rejected": 0.7742006778717041,
937
- "logps/chosen": -582.1048583984375,
938
- "logps/rejected": -819.34033203125,
939
- "loss": 0.4007,
940
- "rewards/accuracies": 0.7875000238418579,
941
- "rewards/chosen": -2.9853100776672363,
942
- "rewards/margins": 2.4650139808654785,
943
- "rewards/rejected": -5.450324058532715,
944
  "step": 560
945
  },
946
  {
947
  "epoch": 0.65649294558019,
948
- "grad_norm": 27.543745008054035,
949
  "learning_rate": 1.5911068067978818e-07,
950
- "logits/chosen": -0.05375183746218681,
951
- "logits/rejected": 1.1043269634246826,
952
- "logps/chosen": -581.2166748046875,
953
- "logps/rejected": -818.9441528320312,
954
- "loss": 0.3971,
955
- "rewards/accuracies": 0.8187500238418579,
956
- "rewards/chosen": -2.8354713916778564,
957
- "rewards/margins": 2.655651330947876,
958
- "rewards/rejected": -5.491122245788574,
959
  "step": 570
960
  },
961
  {
962
  "epoch": 0.6680103656780881,
963
- "grad_norm": 22.093767953647365,
964
  "learning_rate": 1.4981855017728197e-07,
965
- "logits/chosen": 0.0580272376537323,
966
- "logits/rejected": 0.7513723373413086,
967
- "logps/chosen": -571.5791625976562,
968
- "logps/rejected": -858.3342895507812,
969
- "loss": 0.3701,
970
- "rewards/accuracies": 0.784375011920929,
971
- "rewards/chosen": -3.0100650787353516,
972
- "rewards/margins": 2.6902260780334473,
973
- "rewards/rejected": -5.700291633605957,
974
  "step": 580
975
  },
976
  {
977
  "epoch": 0.6795277857759862,
978
- "grad_norm": 36.73163562183304,
979
  "learning_rate": 1.406884985556804e-07,
980
- "logits/chosen": -0.005457936320453882,
981
- "logits/rejected": 1.047271490097046,
982
- "logps/chosen": -635.6920166015625,
983
- "logps/rejected": -881.1370849609375,
984
- "loss": 0.3825,
985
- "rewards/accuracies": 0.7437499761581421,
986
- "rewards/chosen": -3.439662456512451,
987
- "rewards/margins": 2.575695037841797,
988
- "rewards/rejected": -6.01535701751709,
989
  "step": 590
990
  },
991
  {
992
  "epoch": 0.6910452058738843,
993
- "grad_norm": 30.080057939243627,
994
  "learning_rate": 1.3173529689837354e-07,
995
- "logits/chosen": -0.23538751900196075,
996
- "logits/rejected": 0.9952915906906128,
997
- "logps/chosen": -625.2637939453125,
998
- "logps/rejected": -905.7393798828125,
999
- "loss": 0.4031,
1000
- "rewards/accuracies": 0.765625,
1001
- "rewards/chosen": -3.336890459060669,
1002
- "rewards/margins": 3.089966058731079,
1003
- "rewards/rejected": -6.42685604095459,
1004
  "step": 600
1005
  },
1006
  {
1007
  "epoch": 0.6910452058738843,
1008
- "eval_logits/chosen": -0.26048585772514343,
1009
- "eval_logits/rejected": 0.6162645220756531,
1010
- "eval_logps/chosen": -748.3095703125,
1011
- "eval_logps/rejected": -1143.1573486328125,
1012
- "eval_loss": 0.3190823495388031,
1013
- "eval_rewards/accuracies": 0.820067286491394,
1014
- "eval_rewards/chosen": -4.573415279388428,
1015
- "eval_rewards/margins": 3.9572343826293945,
1016
- "eval_rewards/rejected": -8.530649185180664,
1017
- "eval_runtime": 651.1572,
1018
- "eval_samples_per_second": 10.944,
1019
- "eval_steps_per_second": 0.342,
1020
  "step": 600
1021
  },
1022
  {
1023
  "epoch": 0.7025626259717823,
1024
- "grad_norm": 28.901245277836818,
1025
  "learning_rate": 1.2297343017146726e-07,
1026
- "logits/chosen": 0.07719476521015167,
1027
- "logits/rejected": 1.148842453956604,
1028
- "logps/chosen": -615.670166015625,
1029
- "logps/rejected": -902.8016357421875,
1030
- "loss": 0.385,
1031
- "rewards/accuracies": 0.78125,
1032
- "rewards/chosen": -3.2692692279815674,
1033
- "rewards/margins": 2.9544837474823,
1034
- "rewards/rejected": -6.223752975463867,
1035
  "step": 610
1036
  },
1037
  {
1038
  "epoch": 0.7140800460696803,
1039
- "grad_norm": 26.881220630663055,
1040
  "learning_rate": 1.1441707378923474e-07,
1041
- "logits/chosen": 0.3414779305458069,
1042
- "logits/rejected": 1.1920559406280518,
1043
- "logps/chosen": -611.7634887695312,
1044
- "logps/rejected": -883.4708862304688,
1045
- "loss": 0.4032,
1046
- "rewards/accuracies": 0.7593749761581421,
1047
- "rewards/chosen": -3.267723798751831,
1048
- "rewards/margins": 2.798133373260498,
1049
- "rewards/rejected": -6.065857410430908,
1050
  "step": 620
1051
  },
1052
  {
1053
  "epoch": 0.7255974661675785,
1054
- "grad_norm": 22.92522846442678,
1055
  "learning_rate": 1.06080070680377e-07,
1056
- "logits/chosen": 0.059290122240781784,
1057
- "logits/rejected": 1.0623096227645874,
1058
- "logps/chosen": -614.2824096679688,
1059
- "logps/rejected": -868.0498046875,
1060
- "loss": 0.372,
1061
- "rewards/accuracies": 0.7718750238418579,
1062
- "rewards/chosen": -3.2304539680480957,
1063
- "rewards/margins": 2.7317616939544678,
1064
- "rewards/rejected": -5.962214946746826,
1065
  "step": 630
1066
  },
1067
  {
1068
  "epoch": 0.7371148862654765,
1069
- "grad_norm": 18.474464704704374,
1070
  "learning_rate": 9.797590889219587e-08,
1071
- "logits/chosen": -0.07347230613231659,
1072
- "logits/rejected": 0.7878081798553467,
1073
- "logps/chosen": -598.407958984375,
1074
- "logps/rejected": -922.5660400390625,
1075
- "loss": 0.3733,
1076
- "rewards/accuracies": 0.796875,
1077
- "rewards/chosen": -3.226865768432617,
1078
- "rewards/margins": 3.285538911819458,
1079
- "rewards/rejected": -6.5124053955078125,
1080
  "step": 640
1081
  },
1082
  {
1083
  "epoch": 0.7486323063633746,
1084
- "grad_norm": 25.105406106031534,
1085
  "learning_rate": 9.011769976891367e-08,
1086
- "logits/chosen": 0.06855427473783493,
1087
- "logits/rejected": 1.2701406478881836,
1088
- "logps/chosen": -594.861083984375,
1089
- "logps/rejected": -820.2781372070312,
1090
- "loss": 0.3929,
1091
- "rewards/accuracies": 0.828125,
1092
- "rewards/chosen": -2.9540488719940186,
1093
- "rewards/margins": 2.544384241104126,
1094
- "rewards/rejected": -5.4984331130981445,
1095
  "step": 650
1096
  },
1097
  {
1098
  "epoch": 0.7601497264612727,
1099
- "grad_norm": 25.930812393377074,
1100
  "learning_rate": 8.251815673944218e-08,
1101
- "logits/chosen": -0.13862136006355286,
1102
- "logits/rejected": 0.950897216796875,
1103
- "logps/chosen": -660.083740234375,
1104
- "logps/rejected": -984.3076171875,
1105
- "loss": 0.3798,
1106
- "rewards/accuracies": 0.815625011920929,
1107
- "rewards/chosen": -3.4895882606506348,
1108
- "rewards/margins": 3.4641425609588623,
1109
- "rewards/rejected": -6.953730583190918,
1110
  "step": 660
1111
  },
1112
  {
1113
  "epoch": 0.7716671465591708,
1114
- "grad_norm": 22.848572550568402,
1115
  "learning_rate": 7.518957474892148e-08,
1116
- "logits/chosen": 0.03879556804895401,
1117
- "logits/rejected": 0.8222616314888,
1118
- "logps/chosen": -593.1844482421875,
1119
- "logps/rejected": -868.5235595703125,
1120
- "loss": 0.3716,
1121
- "rewards/accuracies": 0.809374988079071,
1122
- "rewards/chosen": -3.1576333045959473,
1123
- "rewards/margins": 2.791321277618408,
1124
- "rewards/rejected": -5.9489545822143555,
1125
  "step": 670
1126
  },
1127
  {
1128
  "epoch": 0.7831845666570688,
1129
- "grad_norm": 37.77871708341422,
1130
  "learning_rate": 6.814381036730274e-08,
1131
- "logits/chosen": -0.06809209287166595,
1132
- "logits/rejected": 0.9388583898544312,
1133
- "logps/chosen": -602.1769409179688,
1134
- "logps/rejected": -918.8854370117188,
1135
- "loss": 0.4027,
1136
- "rewards/accuracies": 0.768750011920929,
1137
- "rewards/chosen": -3.232111692428589,
1138
- "rewards/margins": 3.247992753982544,
1139
- "rewards/rejected": -6.480103969573975,
1140
  "step": 680
1141
  },
1142
  {
1143
  "epoch": 0.7947019867549668,
1144
- "grad_norm": 25.302552395916596,
1145
  "learning_rate": 6.139226260715872e-08,
1146
- "logits/chosen": -0.06320186704397202,
1147
- "logits/rejected": 0.8823334574699402,
1148
- "logps/chosen": -625.1529541015625,
1149
- "logps/rejected": -898.3558349609375,
1150
- "loss": 0.3655,
1151
- "rewards/accuracies": 0.778124988079071,
1152
- "rewards/chosen": -3.5077052116394043,
1153
- "rewards/margins": 2.779940366744995,
1154
- "rewards/rejected": -6.2876458168029785,
1155
  "step": 690
1156
  },
1157
  {
1158
  "epoch": 0.806219406852865,
1159
- "grad_norm": 41.487105287447704,
1160
  "learning_rate": 5.4945854481754734e-08,
1161
- "logits/chosen": 0.07060976326465607,
1162
- "logits/rejected": 0.9207429885864258,
1163
- "logps/chosen": -644.9542236328125,
1164
- "logps/rejected": -981.1370849609375,
1165
- "loss": 0.4007,
1166
- "rewards/accuracies": 0.7749999761581421,
1167
- "rewards/chosen": -3.7041306495666504,
1168
- "rewards/margins": 3.448993682861328,
1169
- "rewards/rejected": -7.1531243324279785,
1170
  "step": 700
1171
  },
1172
  {
1173
  "epoch": 0.806219406852865,
1174
- "eval_logits/chosen": -0.4981551170349121,
1175
- "eval_logits/rejected": 0.44106799364089966,
1176
- "eval_logps/chosen": -753.01123046875,
1177
- "eval_logps/rejected": -1189.425048828125,
1178
- "eval_loss": 0.31710898876190186,
1179
- "eval_rewards/accuracies": 0.8178251385688782,
1180
- "eval_rewards/chosen": -4.620431900024414,
1181
- "eval_rewards/margins": 4.372895240783691,
1182
- "eval_rewards/rejected": -8.993328094482422,
1183
- "eval_runtime": 653.0396,
1184
- "eval_samples_per_second": 10.912,
1185
- "eval_steps_per_second": 0.341,
1186
  "step": 700
1187
  },
1188
  {
1189
  "epoch": 0.817736826950763,
1190
- "grad_norm": 26.15798738128027,
1191
  "learning_rate": 4.881501533321605e-08,
1192
- "logits/chosen": -0.3350176513195038,
1193
- "logits/rejected": 0.5944274663925171,
1194
- "logps/chosen": -611.4078369140625,
1195
- "logps/rejected": -894.845703125,
1196
- "loss": 0.3819,
1197
- "rewards/accuracies": 0.796875,
1198
- "rewards/chosen": -3.2478299140930176,
1199
- "rewards/margins": 2.9104466438293457,
1200
- "rewards/rejected": -6.158276557922363,
1201
  "step": 710
1202
  },
1203
  {
1204
  "epoch": 0.8292542470486611,
1205
- "grad_norm": 28.210401445519196,
1206
  "learning_rate": 4.300966395938377e-08,
1207
- "logits/chosen": -0.47553783655166626,
1208
- "logits/rejected": 0.6052624583244324,
1209
- "logps/chosen": -642.4817504882812,
1210
- "logps/rejected": -950.6018676757812,
1211
- "loss": 0.3724,
1212
- "rewards/accuracies": 0.8062499761581421,
1213
- "rewards/chosen": -3.420116901397705,
1214
- "rewards/margins": 3.2143654823303223,
1215
- "rewards/rejected": -6.634482383728027,
1216
  "step": 720
1217
  },
1218
  {
1219
  "epoch": 0.8407716671465592,
1220
- "grad_norm": 27.28999144486062,
1221
  "learning_rate": 3.7539192566655246e-08,
1222
- "logits/chosen": -0.0816282406449318,
1223
- "logits/rejected": 0.8518702387809753,
1224
- "logps/chosen": -626.6390991210938,
1225
- "logps/rejected": -941.8958129882812,
1226
- "loss": 0.3713,
1227
- "rewards/accuracies": 0.8343750238418579,
1228
- "rewards/chosen": -3.378054141998291,
1229
- "rewards/margins": 3.2282519340515137,
1230
- "rewards/rejected": -6.606306552886963,
1231
  "step": 730
1232
  },
1233
  {
1234
  "epoch": 0.8522890872444573,
1235
- "grad_norm": 27.71621255798267,
1236
  "learning_rate": 3.24124515747731e-08,
1237
- "logits/chosen": -0.0028346062172204256,
1238
- "logits/rejected": 1.1290369033813477,
1239
- "logps/chosen": -672.173828125,
1240
- "logps/rejected": -975.3580322265625,
1241
- "loss": 0.374,
1242
- "rewards/accuracies": 0.796875,
1243
- "rewards/chosen": -3.6781773567199707,
1244
- "rewards/margins": 3.3185067176818848,
1245
- "rewards/rejected": -6.996683597564697,
1246
  "step": 740
1247
  },
1248
  {
1249
  "epoch": 0.8638065073423553,
1250
- "grad_norm": 35.482960030400996,
1251
  "learning_rate": 2.763773529814506e-08,
1252
- "logits/chosen": 0.17448297142982483,
1253
- "logits/rejected": 0.9923737645149231,
1254
- "logps/chosen": -603.94970703125,
1255
- "logps/rejected": -925.9880981445312,
1256
- "loss": 0.3918,
1257
- "rewards/accuracies": 0.78125,
1258
- "rewards/chosen": -3.315547466278076,
1259
- "rewards/margins": 3.1224138736724854,
1260
- "rewards/rejected": -6.437961578369141,
1261
  "step": 750
1262
  },
1263
  {
1264
  "epoch": 0.8753239274402533,
1265
- "grad_norm": 28.184713620117034,
1266
  "learning_rate": 2.3222768526860698e-08,
1267
- "logits/chosen": 0.0647897943854332,
1268
- "logits/rejected": 0.9855157136917114,
1269
- "logps/chosen": -613.9244995117188,
1270
- "logps/rejected": -901.00927734375,
1271
- "loss": 0.3741,
1272
- "rewards/accuracies": 0.7749999761581421,
1273
- "rewards/chosen": -3.254974365234375,
1274
- "rewards/margins": 2.7708938121795654,
1275
- "rewards/rejected": -6.0258684158325195,
1276
  "step": 760
1277
  },
1278
  {
1279
  "epoch": 0.8868413475381515,
1280
- "grad_norm": 35.13633269103924,
1281
  "learning_rate": 1.9174694029115146e-08,
1282
- "logits/chosen": 0.19886977970600128,
1283
- "logits/rejected": 1.013934850692749,
1284
- "logps/chosen": -620.3436279296875,
1285
- "logps/rejected": -958.3701171875,
1286
- "loss": 0.3682,
1287
- "rewards/accuracies": 0.8218749761581421,
1288
- "rewards/chosen": -3.4876530170440674,
1289
- "rewards/margins": 3.2890784740448,
1290
- "rewards/rejected": -6.776731967926025,
1291
  "step": 770
1292
  },
1293
  {
1294
  "epoch": 0.8983587676360495,
1295
- "grad_norm": 29.350577487943855,
1296
  "learning_rate": 1.5500060995258134e-08,
1297
- "logits/chosen": 0.12428224086761475,
1298
- "logits/rejected": 1.2418944835662842,
1299
- "logps/chosen": -604.94873046875,
1300
- "logps/rejected": -891.98828125,
1301
- "loss": 0.37,
1302
- "rewards/accuracies": 0.778124988079071,
1303
- "rewards/chosen": -3.2872474193573,
1304
- "rewards/margins": 2.9589405059814453,
1305
- "rewards/rejected": -6.24618673324585,
1306
  "step": 780
1307
  },
1308
  {
1309
  "epoch": 0.9098761877339476,
1310
- "grad_norm": 31.77954056090223,
1311
  "learning_rate": 1.2204814442165812e-08,
1312
- "logits/chosen": 0.1471497118473053,
1313
- "logits/rejected": 1.0471051931381226,
1314
- "logps/chosen": -657.8414306640625,
1315
- "logps/rejected": -977.1556396484375,
1316
- "loss": 0.3992,
1317
- "rewards/accuracies": 0.784375011920929,
1318
- "rewards/chosen": -3.6439871788024902,
1319
- "rewards/margins": 3.291074752807617,
1320
- "rewards/rejected": -6.935061454772949,
1321
  "step": 790
1322
  },
1323
  {
1324
  "epoch": 0.9213936078318457,
1325
- "grad_norm": 35.231363022526715,
1326
  "learning_rate": 9.294285595075669e-09,
1327
- "logits/chosen": 0.23517772555351257,
1328
- "logits/rejected": 1.1635137796401978,
1329
- "logps/chosen": -621.228515625,
1330
- "logps/rejected": -941.3455200195312,
1331
- "loss": 0.3644,
1332
- "rewards/accuracies": 0.7906249761581421,
1333
- "rewards/chosen": -3.437223434448242,
1334
- "rewards/margins": 3.3359901905059814,
1335
- "rewards/rejected": -6.7732133865356445,
1336
  "step": 800
1337
  },
1338
  {
1339
  "epoch": 0.9213936078318457,
1340
- "eval_logits/chosen": -0.3096068501472473,
1341
- "eval_logits/rejected": 0.6049354672431946,
1342
- "eval_logps/chosen": -755.9322509765625,
1343
- "eval_logps/rejected": -1192.5621337890625,
1344
- "eval_loss": 0.31517288088798523,
1345
- "eval_rewards/accuracies": 0.818385660648346,
1346
- "eval_rewards/chosen": -4.649641990661621,
1347
- "eval_rewards/margins": 4.37505578994751,
1348
- "eval_rewards/rejected": -9.024698257446289,
1349
- "eval_runtime": 652.0187,
1350
- "eval_samples_per_second": 10.929,
1351
- "eval_steps_per_second": 0.342,
1352
  "step": 800
1353
  },
1354
  {
1355
  "epoch": 0.9329110279297438,
1356
- "grad_norm": 35.70619984366826,
1357
  "learning_rate": 6.773183262446914e-09,
1358
- "logits/chosen": 0.08351641893386841,
1359
- "logits/rejected": 1.0455710887908936,
1360
- "logps/chosen": -619.4977416992188,
1361
- "logps/rejected": -918.3001098632812,
1362
- "loss": 0.4056,
1363
- "rewards/accuracies": 0.78125,
1364
- "rewards/chosen": -3.3363006114959717,
1365
- "rewards/margins": 3.10974383354187,
1366
- "rewards/rejected": -6.446043968200684,
1367
  "step": 810
1368
  },
1369
  {
1370
  "epoch": 0.9444284480276418,
1371
- "grad_norm": 32.90474966984876,
1372
  "learning_rate": 4.645586217799452e-09,
1373
- "logits/chosen": -0.05233382433652878,
1374
- "logits/rejected": 0.976836085319519,
1375
- "logps/chosen": -630.252685546875,
1376
- "logps/rejected": -968.9739379882812,
1377
- "loss": 0.3685,
1378
- "rewards/accuracies": 0.824999988079071,
1379
- "rewards/chosen": -3.2664294242858887,
1380
- "rewards/margins": 3.563570022583008,
1381
- "rewards/rejected": -6.8299994468688965,
1382
  "step": 820
1383
  },
1384
  {
1385
  "epoch": 0.9559458681255398,
1386
- "grad_norm": 36.38359169566316,
1387
  "learning_rate": 2.9149366008568987e-09,
1388
- "logits/chosen": 0.14797405898571014,
1389
- "logits/rejected": 0.9976932406425476,
1390
- "logps/chosen": -601.341552734375,
1391
- "logps/rejected": -791.478271484375,
1392
- "loss": 0.4137,
1393
  "rewards/accuracies": 0.778124988079071,
1394
- "rewards/chosen": -3.2392711639404297,
1395
- "rewards/margins": 2.041738986968994,
1396
- "rewards/rejected": -5.281010150909424,
1397
  "step": 830
1398
  },
1399
  {
1400
  "epoch": 0.967463288223438,
1401
- "grad_norm": 36.38873535658025,
1402
  "learning_rate": 1.5840343486700215e-09,
1403
- "logits/chosen": 0.11059533059597015,
1404
- "logits/rejected": 1.2648974657058716,
1405
- "logps/chosen": -640.0452880859375,
1406
- "logps/rejected": -975.3658447265625,
1407
- "loss": 0.394,
1408
- "rewards/accuracies": 0.8125,
1409
- "rewards/chosen": -3.4839179515838623,
1410
- "rewards/margins": 3.5232937335968018,
1411
- "rewards/rejected": -7.007212162017822,
1412
  "step": 840
1413
  },
1414
  {
1415
  "epoch": 0.978980708321336,
1416
- "grad_norm": 40.1741036497201,
1417
  "learning_rate": 6.550326657293881e-10,
1418
- "logits/chosen": 0.27081722021102905,
1419
- "logits/rejected": 1.2972664833068848,
1420
- "logps/chosen": -605.1519775390625,
1421
- "logps/rejected": -885.68896484375,
1422
- "loss": 0.4039,
1423
- "rewards/accuracies": 0.7906249761581421,
1424
- "rewards/chosen": -3.3234386444091797,
1425
- "rewards/margins": 2.970566511154175,
1426
- "rewards/rejected": -6.294005870819092,
1427
  "step": 850
1428
  },
1429
  {
1430
  "epoch": 0.9904981284192341,
1431
- "grad_norm": 31.693047329975887,
1432
  "learning_rate": 1.2943454039654467e-10,
1433
- "logits/chosen": 0.14183056354522705,
1434
- "logits/rejected": 1.1139782667160034,
1435
- "logps/chosen": -605.5335693359375,
1436
- "logps/rejected": -855.60595703125,
1437
- "loss": 0.3858,
1438
- "rewards/accuracies": 0.753125011920929,
1439
- "rewards/chosen": -3.16903018951416,
1440
- "rewards/margins": 2.619706869125366,
1441
- "rewards/rejected": -5.7887372970581055,
1442
  "step": 860
1443
  },
1444
  {
1445
  "epoch": 0.9997120644975526,
1446
  "step": 868,
1447
  "total_flos": 0.0,
1448
- "train_loss": 0.0,
1449
- "train_runtime": 0.0211,
1450
- "train_samples_per_second": 5273498.215,
1451
- "train_steps_per_second": 41188.083
1452
  }
1453
  ],
1454
  "logging_steps": 10,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.001151742009789807,
13
+ "grad_norm": 8.954344956678737,
14
+ "learning_rate": 5.747126436781609e-09,
15
+ "logits/chosen": -2.759351968765259,
16
+ "logits/rejected": -2.762708902359009,
17
+ "logps/chosen": -256.92901611328125,
18
+ "logps/rejected": -268.4883728027344,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.01151742009789807,
28
+ "grad_norm": 8.455865944439507,
29
  "learning_rate": 5.747126436781609e-08,
30
+ "logits/chosen": -2.8186073303222656,
31
+ "logits/rejected": -2.7818901538848877,
32
+ "logps/chosen": -280.477783203125,
33
+ "logps/rejected": -273.6023864746094,
34
+ "loss": 0.6932,
35
+ "rewards/accuracies": 0.3854166567325592,
36
+ "rewards/chosen": 0.0001983554830076173,
37
+ "rewards/margins": -0.00018551234097685665,
38
+ "rewards/rejected": 0.0003838678530883044,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.02303484019579614,
43
+ "grad_norm": 8.281991111156,
44
  "learning_rate": 1.1494252873563217e-07,
45
+ "logits/chosen": -2.780491828918457,
46
+ "logits/rejected": -2.7574048042297363,
47
+ "logps/chosen": -288.68408203125,
48
+ "logps/rejected": -274.33233642578125,
49
+ "loss": 0.6929,
50
  "rewards/accuracies": 0.5406249761581421,
51
+ "rewards/chosen": 0.000404498161515221,
52
+ "rewards/margins": 0.0008455432835035026,
53
+ "rewards/rejected": -0.00044104509288445115,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.03455226029369421,
58
+ "grad_norm": 8.294840577575956,
59
  "learning_rate": 1.7241379310344828e-07,
60
+ "logits/chosen": -2.7976105213165283,
61
+ "logits/rejected": -2.771127462387085,
62
+ "logps/chosen": -282.4824523925781,
63
+ "logps/rejected": -267.7333984375,
64
+ "loss": 0.6915,
65
+ "rewards/accuracies": 0.6499999761581421,
66
+ "rewards/chosen": 0.002527938922867179,
67
+ "rewards/margins": 0.0036745104007422924,
68
+ "rewards/rejected": -0.0011465717107057571,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.04606968039159228,
73
+ "grad_norm": 8.712130617823888,
74
  "learning_rate": 2.2988505747126435e-07,
75
+ "logits/chosen": -2.7874083518981934,
76
+ "logits/rejected": -2.7691152095794678,
77
+ "logps/chosen": -276.18780517578125,
78
+ "logps/rejected": -266.81781005859375,
79
+ "loss": 0.6874,
80
+ "rewards/accuracies": 0.706250011920929,
81
+ "rewards/chosen": 0.0077004628255963326,
82
+ "rewards/margins": 0.010546171106398106,
83
+ "rewards/rejected": -0.0028457094449549913,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.05758710048949035,
88
+ "grad_norm": 10.31218146006529,
89
  "learning_rate": 2.873563218390804e-07,
90
+ "logits/chosen": -2.8169891834259033,
91
+ "logits/rejected": -2.780351161956787,
92
+ "logps/chosen": -288.07293701171875,
93
+ "logps/rejected": -277.56207275390625,
94
+ "loss": 0.6779,
95
+ "rewards/accuracies": 0.746874988079071,
96
+ "rewards/chosen": 0.016184702515602112,
97
+ "rewards/margins": 0.033592261373996735,
98
+ "rewards/rejected": -0.01740756258368492,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.06910452058738842,
103
+ "grad_norm": 8.630622631020962,
104
  "learning_rate": 3.4482758620689656e-07,
105
+ "logits/chosen": -2.8177428245544434,
106
+ "logits/rejected": -2.7997875213623047,
107
+ "logps/chosen": -284.9295959472656,
108
+ "logps/rejected": -300.76239013671875,
109
+ "loss": 0.6642,
110
+ "rewards/accuracies": 0.7250000238418579,
111
+ "rewards/chosen": 0.014625328592956066,
112
+ "rewards/margins": 0.06306789815425873,
113
+ "rewards/rejected": -0.04844257980585098,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.0806219406852865,
118
+ "grad_norm": 10.734027101004664,
119
  "learning_rate": 4.0229885057471266e-07,
120
+ "logits/chosen": -2.7529358863830566,
121
+ "logits/rejected": -2.7322020530700684,
122
+ "logps/chosen": -290.8133850097656,
123
+ "logps/rejected": -287.9371643066406,
124
+ "loss": 0.6367,
125
+ "rewards/accuracies": 0.715624988079071,
126
+ "rewards/chosen": -0.06497061997652054,
127
+ "rewards/margins": 0.13046525418758392,
128
+ "rewards/rejected": -0.19543585181236267,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.09213936078318456,
133
+ "grad_norm": 11.33124626301948,
134
  "learning_rate": 4.597701149425287e-07,
135
+ "logits/chosen": -2.861381769180298,
136
+ "logits/rejected": -2.830373525619507,
137
+ "logps/chosen": -326.0346374511719,
138
+ "logps/rejected": -342.4071960449219,
139
+ "loss": 0.5943,
140
+ "rewards/accuracies": 0.7718750238418579,
141
+ "rewards/chosen": -0.21955294907093048,
142
+ "rewards/margins": 0.28150954842567444,
143
+ "rewards/rejected": -0.5010625123977661,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.10365678088108264,
148
+ "grad_norm": 20.262969117511794,
149
  "learning_rate": 4.999817969178237e-07,
150
+ "logits/chosen": -2.802300214767456,
151
+ "logits/rejected": -2.766491174697876,
152
+ "logps/chosen": -352.80755615234375,
153
+ "logps/rejected": -383.49725341796875,
154
+ "loss": 0.5561,
155
+ "rewards/accuracies": 0.7124999761581421,
156
+ "rewards/chosen": -0.6216552257537842,
157
+ "rewards/margins": 0.41057389974594116,
158
+ "rewards/rejected": -1.0322291851043701,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.1151742009789807,
163
+ "grad_norm": 21.31118240013579,
164
  "learning_rate": 4.996582603056428e-07,
165
+ "logits/chosen": -2.789545774459839,
166
+ "logits/rejected": -2.761136293411255,
167
+ "logps/chosen": -439.2823791503906,
168
+ "logps/rejected": -501.868408203125,
169
+ "loss": 0.5381,
170
+ "rewards/accuracies": 0.699999988079071,
171
+ "rewards/chosen": -1.3656787872314453,
172
+ "rewards/margins": 0.7182655334472656,
173
+ "rewards/rejected": -2.08394455909729,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.1151742009789807,
178
+ "eval_logits/chosen": -2.9044759273529053,
179
+ "eval_logits/rejected": -2.882232189178467,
180
+ "eval_logps/chosen": -489.789306640625,
181
+ "eval_logps/rejected": -581.798095703125,
182
+ "eval_loss": 0.4757886528968811,
183
+ "eval_rewards/accuracies": 0.7270179390907288,
184
+ "eval_rewards/chosen": -1.9882127046585083,
185
+ "eval_rewards/margins": 0.9288455843925476,
186
+ "eval_rewards/rejected": -2.9170584678649902,
187
+ "eval_runtime": 641.6076,
188
+ "eval_samples_per_second": 11.106,
189
+ "eval_steps_per_second": 0.348,
190
  "step": 100
191
  },
192
  {
193
  "epoch": 0.12669162107687879,
194
+ "grad_norm": 19.990999467237586,
195
  "learning_rate": 4.989308132738126e-07,
196
+ "logits/chosen": -2.7357916831970215,
197
+ "logits/rejected": -2.711280345916748,
198
+ "logps/chosen": -394.81048583984375,
199
+ "logps/rejected": -465.7745056152344,
200
+ "loss": 0.5215,
201
+ "rewards/accuracies": 0.690625011920929,
202
+ "rewards/chosen": -1.2416951656341553,
203
+ "rewards/margins": 0.7623735666275024,
204
+ "rewards/rejected": -2.0040688514709473,
205
  "step": 110
206
  },
207
  {
208
  "epoch": 0.13820904117477684,
209
+ "grad_norm": 22.41253635277838,
210
  "learning_rate": 4.978006327248536e-07,
211
+ "logits/chosen": -2.7071425914764404,
212
+ "logits/rejected": -2.6982598304748535,
213
+ "logps/chosen": -415.25634765625,
214
+ "logps/rejected": -507.17376708984375,
215
+ "loss": 0.4829,
216
+ "rewards/accuracies": 0.768750011920929,
217
+ "rewards/chosen": -1.099837303161621,
218
+ "rewards/margins": 0.9606343507766724,
219
+ "rewards/rejected": -2.060471773147583,
220
  "step": 120
221
  },
222
  {
223
  "epoch": 0.14972646127267492,
224
+ "grad_norm": 28.3351388035627,
225
  "learning_rate": 4.962695471250032e-07,
226
+ "logits/chosen": -2.509568929672241,
227
+ "logits/rejected": -2.4706530570983887,
228
+ "logps/chosen": -475.9378356933594,
229
+ "logps/rejected": -565.5125732421875,
230
+ "loss": 0.4913,
231
+ "rewards/accuracies": 0.6937500238418579,
232
+ "rewards/chosen": -1.896477460861206,
233
+ "rewards/margins": 1.053924798965454,
234
+ "rewards/rejected": -2.950402021408081,
235
  "step": 130
236
  },
237
  {
238
  "epoch": 0.161243881370573,
239
+ "grad_norm": 28.57986478281954,
240
  "learning_rate": 4.94340033546025e-07,
241
+ "logits/chosen": -1.8837181329727173,
242
+ "logits/rejected": -1.7757914066314697,
243
+ "logps/chosen": -481.201416015625,
244
+ "logps/rejected": -605.1053466796875,
245
+ "loss": 0.4783,
246
+ "rewards/accuracies": 0.753125011920929,
247
+ "rewards/chosen": -2.0652859210968018,
248
+ "rewards/margins": 1.3595540523529053,
249
+ "rewards/rejected": -3.424839735031128,
250
  "step": 140
251
  },
252
  {
253
  "epoch": 0.17276130146847107,
254
+ "grad_norm": 19.68624301597443,
255
  "learning_rate": 4.920152136576705e-07,
256
+ "logits/chosen": -1.7746684551239014,
257
+ "logits/rejected": -1.6012051105499268,
258
+ "logps/chosen": -478.12646484375,
259
+ "logps/rejected": -569.0491333007812,
260
+ "loss": 0.456,
261
+ "rewards/accuracies": 0.721875011920929,
262
+ "rewards/chosen": -1.7061989307403564,
263
+ "rewards/margins": 1.1741154193878174,
264
+ "rewards/rejected": -2.880314350128174,
265
  "step": 150
266
  },
267
  {
268
  "epoch": 0.18427872156636912,
269
+ "grad_norm": 22.179872671918822,
270
  "learning_rate": 4.892988486772756e-07,
271
+ "logits/chosen": -1.5278130769729614,
272
+ "logits/rejected": -1.3506165742874146,
273
+ "logps/chosen": -477.80853271484375,
274
+ "logps/rejected": -651.3321533203125,
275
+ "loss": 0.4295,
276
+ "rewards/accuracies": 0.778124988079071,
277
+ "rewards/chosen": -2.0188517570495605,
278
+ "rewards/margins": 1.7111384868621826,
279
+ "rewards/rejected": -3.7299904823303223,
280
  "step": 160
281
  },
282
  {
283
  "epoch": 0.1957961416642672,
284
+ "grad_norm": 22.622363623676517,
285
  "learning_rate": 4.861953332846629e-07,
286
+ "logits/chosen": -1.2652065753936768,
287
+ "logits/rejected": -1.1065785884857178,
288
+ "logps/chosen": -485.0794372558594,
289
+ "logps/rejected": -644.7564697265625,
290
+ "loss": 0.4356,
291
+ "rewards/accuracies": 0.7406250238418579,
292
+ "rewards/chosen": -2.1487345695495605,
293
+ "rewards/margins": 1.5242230892181396,
294
+ "rewards/rejected": -3.6729576587677,
295
  "step": 170
296
  },
297
  {
298
  "epoch": 0.20731356176216528,
299
+ "grad_norm": 26.087556223120203,
300
  "learning_rate": 4.827096885121953e-07,
301
+ "logits/chosen": -1.0748474597930908,
302
+ "logits/rejected": -0.7963994145393372,
303
+ "logps/chosen": -554.6792602539062,
304
+ "logps/rejected": -734.0240478515625,
305
+ "loss": 0.4457,
306
+ "rewards/accuracies": 0.75,
307
+ "rewards/chosen": -2.8119685649871826,
308
+ "rewards/margins": 1.7995468378067017,
309
+ "rewards/rejected": -4.611515045166016,
310
  "step": 180
311
  },
312
  {
313
  "epoch": 0.21883098186006333,
314
+ "grad_norm": 23.82706622737464,
315
  "learning_rate": 4.788475536214821e-07,
316
+ "logits/chosen": -0.7968783974647522,
317
+ "logits/rejected": -0.5444242358207703,
318
+ "logps/chosen": -570.3772583007812,
319
+ "logps/rejected": -743.9293212890625,
320
+ "loss": 0.404,
321
+ "rewards/accuracies": 0.800000011920929,
322
+ "rewards/chosen": -2.7484724521636963,
323
+ "rewards/margins": 1.847896933555603,
324
+ "rewards/rejected": -4.59636926651001,
325
  "step": 190
326
  },
327
  {
328
  "epoch": 0.2303484019579614,
329
+ "grad_norm": 22.12633066649394,
330
  "learning_rate": 4.746151769798818e-07,
331
+ "logits/chosen": -0.33320680260658264,
332
+ "logits/rejected": 0.0012704581022262573,
333
+ "logps/chosen": -587.2708740234375,
334
+ "logps/rejected": -802.9293212890625,
335
+ "loss": 0.4268,
336
+ "rewards/accuracies": 0.7593749761581421,
337
+ "rewards/chosen": -3.0321972370147705,
338
+ "rewards/margins": 2.202017068862915,
339
+ "rewards/rejected": -5.2342143058776855,
340
  "step": 200
341
  },
342
  {
343
  "epoch": 0.2303484019579614,
344
+ "eval_logits/chosen": -0.9790740013122559,
345
+ "eval_logits/rejected": -0.6780607104301453,
346
+ "eval_logps/chosen": -681.6493530273438,
347
+ "eval_logps/rejected": -974.9606323242188,
348
+ "eval_loss": 0.3576536774635315,
349
+ "eval_rewards/accuracies": 0.7976457476615906,
350
+ "eval_rewards/chosen": -3.906813621520996,
351
+ "eval_rewards/margins": 2.9418699741363525,
352
+ "eval_rewards/rejected": -6.848682403564453,
353
+ "eval_runtime": 642.8141,
354
+ "eval_samples_per_second": 11.086,
355
+ "eval_steps_per_second": 0.347,
356
  "step": 200
357
  },
358
  {
359
  "epoch": 0.2418658220558595,
360
+ "grad_norm": 28.55770787025379,
361
  "learning_rate": 4.7001940595156055e-07,
362
+ "logits/chosen": -0.15793052315711975,
363
+ "logits/rejected": 0.05452694371342659,
364
+ "logps/chosen": -531.8611450195312,
365
+ "logps/rejected": -715.461181640625,
366
+ "loss": 0.4386,
367
+ "rewards/accuracies": 0.731249988079071,
368
+ "rewards/chosen": -2.547398328781128,
369
+ "rewards/margins": 1.7863715887069702,
370
+ "rewards/rejected": -4.333769798278809,
371
  "step": 210
372
  },
373
  {
374
  "epoch": 0.25338324215375757,
375
+ "grad_norm": 34.18952768278615,
376
  "learning_rate": 4.650676758194623e-07,
377
+ "logits/chosen": -0.5843445062637329,
378
+ "logits/rejected": -0.08485187590122223,
379
+ "logps/chosen": -509.3975524902344,
380
+ "logps/rejected": -702.2587280273438,
381
+ "loss": 0.4169,
382
+ "rewards/accuracies": 0.78125,
383
+ "rewards/chosen": -2.2745628356933594,
384
+ "rewards/margins": 2.145096778869629,
385
+ "rewards/rejected": -4.419659614562988,
386
  "step": 220
387
  },
388
  {
389
  "epoch": 0.26490066225165565,
390
+ "grad_norm": 33.36988457876833,
391
  "learning_rate": 4.5976799775611215e-07,
392
+ "logits/chosen": -0.3160017728805542,
393
+ "logits/rejected": 0.20093998312950134,
394
+ "logps/chosen": -516.4368896484375,
395
+ "logps/rejected": -678.5955200195312,
396
+ "loss": 0.4413,
397
+ "rewards/accuracies": 0.75,
398
+ "rewards/chosen": -2.2482521533966064,
399
+ "rewards/margins": 1.7269527912139893,
400
+ "rewards/rejected": -3.9752049446105957,
401
  "step": 230
402
  },
403
  {
404
  "epoch": 0.2764180823495537,
405
+ "grad_norm": 25.800936315454518,
406
  "learning_rate": 4.5412894586271543e-07,
407
+ "logits/chosen": -0.25688791275024414,
408
+ "logits/rejected": 0.23107607662677765,
409
+ "logps/chosen": -507.2642517089844,
410
+ "logps/rejected": -678.391357421875,
411
+ "loss": 0.42,
412
+ "rewards/accuracies": 0.765625,
413
+ "rewards/chosen": -2.0940096378326416,
414
+ "rewards/margins": 1.9648818969726562,
415
+ "rewards/rejected": -4.058891773223877,
416
  "step": 240
417
  },
418
  {
419
  "epoch": 0.28793550244745175,
420
+ "grad_norm": 25.786461976756396,
421
  "learning_rate": 4.481596432975201e-07,
422
+ "logits/chosen": -1.0171256065368652,
423
+ "logits/rejected": -0.5706368684768677,
424
+ "logps/chosen": -517.12451171875,
425
+ "logps/rejected": -737.7791137695312,
426
+ "loss": 0.4331,
427
+ "rewards/accuracies": 0.768750011920929,
428
+ "rewards/chosen": -2.2817940711975098,
429
+ "rewards/margins": 2.181729793548584,
430
+ "rewards/rejected": -4.463524341583252,
431
  "step": 250
432
  },
433
  {
434
  "epoch": 0.29945292254534983,
435
+ "grad_norm": 47.2235188357849,
436
  "learning_rate": 4.41869747515886e-07,
437
+ "logits/chosen": -1.1118611097335815,
438
+ "logits/rejected": -0.6688307523727417,
439
+ "logps/chosen": -620.9531860351562,
440
+ "logps/rejected": -872.623046875,
441
+ "loss": 0.4144,
442
+ "rewards/accuracies": 0.753125011920929,
443
+ "rewards/chosen": -3.3720595836639404,
444
+ "rewards/margins": 2.488572359085083,
445
+ "rewards/rejected": -5.860631465911865,
446
  "step": 260
447
  },
448
  {
449
  "epoch": 0.3109703426432479,
450
+ "grad_norm": 26.838626915471743,
451
  "learning_rate": 4.352694346459396e-07,
452
+ "logits/chosen": -1.2210652828216553,
453
+ "logits/rejected": -0.7195647358894348,
454
+ "logps/chosen": -534.5902099609375,
455
+ "logps/rejected": -761.0294189453125,
456
+ "loss": 0.4083,
457
  "rewards/accuracies": 0.784375011920929,
458
+ "rewards/chosen": -2.6057441234588623,
459
+ "rewards/margins": 2.184957504272461,
460
+ "rewards/rejected": -4.790701866149902,
461
  "step": 270
462
  },
463
  {
464
  "epoch": 0.322487762741146,
465
+ "grad_norm": 26.363036179443302,
466
  "learning_rate": 4.2836938302509256e-07,
467
+ "logits/chosen": -1.2423580884933472,
468
+ "logits/rejected": -0.7629604935646057,
469
+ "logps/chosen": -559.0585327148438,
470
+ "logps/rejected": -827.2657470703125,
471
+ "loss": 0.3904,
472
+ "rewards/accuracies": 0.7718750238418579,
473
+ "rewards/chosen": -2.7791800498962402,
474
+ "rewards/margins": 2.684356689453125,
475
+ "rewards/rejected": -5.463536739349365,
476
  "step": 280
477
  },
478
  {
479
  "epoch": 0.33400518283904407,
480
+ "grad_norm": 25.387456130173874,
481
  "learning_rate": 4.2118075592405874e-07,
482
+ "logits/chosen": -0.6111725568771362,
483
+ "logits/rejected": -0.15174560248851776,
484
+ "logps/chosen": -587.7760009765625,
485
+ "logps/rejected": -839.3088989257812,
486
+ "loss": 0.4086,
487
+ "rewards/accuracies": 0.8125,
488
+ "rewards/chosen": -2.9200491905212402,
489
+ "rewards/margins": 2.618988037109375,
490
+ "rewards/rejected": -5.539036750793457,
491
  "step": 290
492
  },
493
  {
494
  "epoch": 0.34552260293694215,
495
+ "grad_norm": 24.386747936025703,
496
  "learning_rate": 4.137151834863213e-07,
497
+ "logits/chosen": -0.8588908910751343,
498
+ "logits/rejected": -0.17885461449623108,
499
+ "logps/chosen": -585.371337890625,
500
+ "logps/rejected": -798.4925537109375,
501
+ "loss": 0.4067,
502
+ "rewards/accuracies": 0.7875000238418579,
503
+ "rewards/chosen": -3.0237374305725098,
504
+ "rewards/margins": 2.431835651397705,
505
+ "rewards/rejected": -5.455573558807373,
506
  "step": 300
507
  },
508
  {
509
  "epoch": 0.34552260293694215,
510
+ "eval_logits/chosen": -1.2473794221878052,
511
+ "eval_logits/rejected": -0.6641976833343506,
512
+ "eval_logps/chosen": -688.5350952148438,
513
+ "eval_logps/rejected": -1054.9027099609375,
514
+ "eval_loss": 0.34113186597824097,
515
+ "eval_rewards/accuracies": 0.8094170689582825,
516
+ "eval_rewards/chosen": -3.9756710529327393,
517
+ "eval_rewards/margins": 3.6724324226379395,
518
+ "eval_rewards/rejected": -7.648103713989258,
519
+ "eval_runtime": 641.4555,
520
+ "eval_samples_per_second": 11.109,
521
+ "eval_steps_per_second": 0.348,
522
  "step": 300
523
  },
524
  {
525
  "epoch": 0.35704002303484017,
526
+ "grad_norm": 31.743815715159577,
527
  "learning_rate": 4.059847439122671e-07,
528
+ "logits/chosen": -0.9840304255485535,
529
+ "logits/rejected": -0.4123767018318176,
530
+ "logps/chosen": -538.8982543945312,
531
+ "logps/rejected": -785.3285522460938,
532
+ "loss": 0.4144,
533
+ "rewards/accuracies": 0.7718750238418579,
534
+ "rewards/chosen": -2.550431489944458,
535
+ "rewards/margins": 2.4609429836273193,
536
+ "rewards/rejected": -5.011374473571777,
537
  "step": 310
538
  },
539
  {
540
  "epoch": 0.36855744313273825,
541
+ "grad_norm": 24.806762291664178,
542
  "learning_rate": 3.98001943918432e-07,
543
+ "logits/chosen": -1.2074978351593018,
544
+ "logits/rejected": -0.2775883674621582,
545
+ "logps/chosen": -509.53887939453125,
546
+ "logps/rejected": -702.5384521484375,
547
+ "loss": 0.4077,
548
+ "rewards/accuracies": 0.7875000238418579,
549
+ "rewards/chosen": -2.1766436100006104,
550
+ "rewards/margins": 2.217984914779663,
551
+ "rewards/rejected": -4.394628524780273,
552
  "step": 320
553
  },
554
  {
555
  "epoch": 0.38007486323063633,
556
+ "grad_norm": 31.303160160987616,
557
  "learning_rate": 3.8977969850346866e-07,
558
+ "logits/chosen": -0.7918741106987,
559
+ "logits/rejected": 0.004956415388733149,
560
+ "logps/chosen": -578.205078125,
561
+ "logps/rejected": -870.2096557617188,
562
+ "loss": 0.3752,
563
+ "rewards/accuracies": 0.765625,
564
+ "rewards/chosen": -2.821563720703125,
565
+ "rewards/margins": 3.012451648712158,
566
+ "rewards/rejected": -5.834015369415283,
567
  "step": 330
568
  },
569
  {
570
  "epoch": 0.3915922833285344,
571
+ "grad_norm": 27.692313832407383,
572
  "learning_rate": 3.8133131005357465e-07,
573
+ "logits/chosen": -0.9087894558906555,
574
+ "logits/rejected": -0.1920526772737503,
575
+ "logps/chosen": -602.9824829101562,
576
+ "logps/rejected": -872.3701171875,
577
+ "loss": 0.4233,
578
+ "rewards/accuracies": 0.7906249761581421,
579
+ "rewards/chosen": -3.1107020378112793,
580
+ "rewards/margins": 2.777942180633545,
581
+ "rewards/rejected": -5.888644218444824,
582
  "step": 340
583
  },
584
  {
585
  "epoch": 0.4031097034264325,
586
+ "grad_norm": 22.459786517410773,
587
  "learning_rate": 3.7267044682118435e-07,
588
+ "logits/chosen": -0.6368467211723328,
589
+ "logits/rejected": 0.05047903582453728,
590
+ "logps/chosen": -546.3031616210938,
591
+ "logps/rejected": -768.5010986328125,
592
+ "loss": 0.4038,
593
+ "rewards/accuracies": 0.753125011920929,
594
+ "rewards/chosen": -2.607321262359619,
595
+ "rewards/margins": 2.1534266471862793,
596
+ "rewards/rejected": -4.76074743270874,
597
  "step": 350
598
  },
599
  {
600
  "epoch": 0.41462712352433057,
601
+ "grad_norm": 54.30836966222414,
602
  "learning_rate": 3.638111208117425e-07,
603
+ "logits/chosen": -0.4448986053466797,
604
+ "logits/rejected": 0.26401039958000183,
605
+ "logps/chosen": -660.1737060546875,
606
+ "logps/rejected": -936.7528076171875,
607
+ "loss": 0.3945,
608
+ "rewards/accuracies": 0.7562500238418579,
609
+ "rewards/chosen": -3.779724597930908,
610
+ "rewards/margins": 2.7942070960998535,
611
+ "rewards/rejected": -6.573931694030762,
612
  "step": 360
613
  },
614
  {
615
  "epoch": 0.42614454362222864,
616
+ "grad_norm": 30.943136192122527,
617
  "learning_rate": 3.5476766511433605e-07,
618
+ "logits/chosen": -0.6570574641227722,
619
+ "logits/rejected": 0.12815245985984802,
620
+ "logps/chosen": -589.9466552734375,
621
+ "logps/rejected": -808.5029296875,
622
+ "loss": 0.4008,
623
+ "rewards/accuracies": 0.7562500238418579,
624
+ "rewards/chosen": -3.0944387912750244,
625
+ "rewards/margins": 2.3009159564971924,
626
+ "rewards/rejected": -5.395354270935059,
627
  "step": 370
628
  },
629
  {
630
  "epoch": 0.43766196372012667,
631
+ "grad_norm": 19.09835141181145,
632
  "learning_rate": 3.455547107128602e-07,
633
+ "logits/chosen": -0.45787113904953003,
634
+ "logits/rejected": 0.38415655493736267,
635
+ "logps/chosen": -527.8428955078125,
636
+ "logps/rejected": -821.2969970703125,
637
+ "loss": 0.3953,
638
+ "rewards/accuracies": 0.8125,
639
+ "rewards/chosen": -2.6933910846710205,
640
+ "rewards/margins": 2.8653125762939453,
641
+ "rewards/rejected": -5.558703422546387,
642
  "step": 380
643
  },
644
  {
645
  "epoch": 0.44917938381802475,
646
+ "grad_norm": 25.986133062638014,
647
  "learning_rate": 3.361871628152338e-07,
648
+ "logits/chosen": 0.1055336743593216,
649
+ "logits/rejected": 0.8421304821968079,
650
+ "logps/chosen": -574.3500366210938,
651
+ "logps/rejected": -818.9120483398438,
652
+ "loss": 0.4116,
653
+ "rewards/accuracies": 0.800000011920929,
654
+ "rewards/chosen": -2.960313558578491,
655
+ "rewards/margins": 2.6313090324401855,
656
+ "rewards/rejected": -5.591622829437256,
657
  "step": 390
658
  },
659
  {
660
  "epoch": 0.4606968039159228,
661
+ "grad_norm": 33.121972667621534,
662
  "learning_rate": 3.2668017673896077e-07,
663
+ "logits/chosen": -0.19368359446525574,
664
+ "logits/rejected": 0.5146237015724182,
665
+ "logps/chosen": -617.7755737304688,
666
+ "logps/rejected": -860.82861328125,
667
+ "loss": 0.4011,
668
+ "rewards/accuracies": 0.762499988079071,
669
+ "rewards/chosen": -3.3203601837158203,
670
+ "rewards/margins": 2.387831211090088,
671
+ "rewards/rejected": -5.708191871643066,
672
  "step": 400
673
  },
674
  {
675
  "epoch": 0.4606968039159228,
676
+ "eval_logits/chosen": -0.7429019808769226,
677
+ "eval_logits/rejected": 0.1183277890086174,
678
+ "eval_logps/chosen": -735.4549560546875,
679
+ "eval_logps/rejected": -1130.1990966796875,
680
+ "eval_loss": 0.32945430278778076,
681
+ "eval_rewards/accuracies": 0.8155829310417175,
682
+ "eval_rewards/chosen": -4.444869518280029,
683
+ "eval_rewards/margins": 3.9561986923217773,
684
+ "eval_rewards/rejected": -8.401067733764648,
685
+ "eval_runtime": 645.3491,
686
+ "eval_samples_per_second": 11.042,
687
+ "eval_steps_per_second": 0.346,
688
  "step": 400
689
  },
690
  {
691
  "epoch": 0.4722142240138209,
692
+ "grad_norm": 28.677116827222893,
693
  "learning_rate": 3.1704913339205103e-07,
694
+ "logits/chosen": -0.49749231338500977,
695
+ "logits/rejected": 0.5208367109298706,
696
+ "logps/chosen": -578.421875,
697
+ "logps/rejected": -866.33544921875,
698
+ "loss": 0.4082,
699
+ "rewards/accuracies": 0.8031250238418579,
700
+ "rewards/chosen": -2.9447290897369385,
701
+ "rewards/margins": 3.001622438430786,
702
+ "rewards/rejected": -5.946351528167725,
703
  "step": 410
704
  },
705
  {
706
  "epoch": 0.483731644111719,
707
+ "grad_norm": 25.93977899429838,
708
  "learning_rate": 3.0730961438896885e-07,
709
+ "logits/chosen": -1.0381872653961182,
710
+ "logits/rejected": -0.06657940149307251,
711
+ "logps/chosen": -550.3687744140625,
712
+ "logps/rejected": -820.9139404296875,
713
+ "loss": 0.4,
714
+ "rewards/accuracies": 0.7875000238418579,
715
+ "rewards/chosen": -2.6255903244018555,
716
+ "rewards/margins": 2.71159029006958,
717
+ "rewards/rejected": -5.3371806144714355,
718
  "step": 420
719
  },
720
  {
721
  "epoch": 0.49524906420961706,
722
+ "grad_norm": 24.865636277888157,
723
  "learning_rate": 2.9747737684186795e-07,
724
+ "logits/chosen": -1.131136417388916,
725
+ "logits/rejected": -0.22055275738239288,
726
+ "logps/chosen": -534.1951293945312,
727
+ "logps/rejected": -773.7057495117188,
728
+ "loss": 0.3973,
729
+ "rewards/accuracies": 0.793749988079071,
730
+ "rewards/chosen": -2.4572598934173584,
731
+ "rewards/margins": 2.522524356842041,
732
+ "rewards/rejected": -4.9797844886779785,
733
  "step": 430
734
  },
735
  {
736
  "epoch": 0.5067664843075151,
737
+ "grad_norm": 36.423518981463275,
738
  "learning_rate": 2.8756832786789663e-07,
739
+ "logits/chosen": -1.2586928606033325,
740
+ "logits/rejected": -0.14189645648002625,
741
+ "logps/chosen": -513.0911254882812,
742
+ "logps/rejected": -821.0222778320312,
743
+ "loss": 0.4053,
744
+ "rewards/accuracies": 0.815625011920929,
745
+ "rewards/chosen": -2.5397918224334717,
746
+ "rewards/margins": 3.0112507343292236,
747
+ "rewards/rejected": -5.551042556762695,
748
  "step": 440
749
  },
750
  {
751
  "epoch": 0.5182839044054132,
752
+ "grad_norm": 27.76293529869272,
753
  "learning_rate": 2.7759849885381747e-07,
754
+ "logits/chosen": -1.2538650035858154,
755
+ "logits/rejected": -0.25470593571662903,
756
+ "logps/chosen": -517.45751953125,
757
+ "logps/rejected": -752.2701416015625,
758
+ "loss": 0.3845,
759
+ "rewards/accuracies": 0.793749988079071,
760
+ "rewards/chosen": -2.279754877090454,
761
+ "rewards/margins": 2.4887535572052,
762
+ "rewards/rejected": -4.768507957458496,
763
  "step": 450
764
  },
765
  {
766
  "epoch": 0.5298013245033113,
767
+ "grad_norm": 24.085341189256496,
768
  "learning_rate": 2.675840195195762e-07,
769
+ "logits/chosen": -1.382927417755127,
770
+ "logits/rejected": -0.48569250106811523,
771
+ "logps/chosen": -513.8163452148438,
772
+ "logps/rejected": -766.2885131835938,
773
+ "loss": 0.3799,
774
+ "rewards/accuracies": 0.793749988079071,
775
+ "rewards/chosen": -2.302953004837036,
776
+ "rewards/margins": 2.5440595149993896,
777
+ "rewards/rejected": -4.847012519836426,
778
  "step": 460
779
  },
780
  {
781
  "epoch": 0.5413187446012093,
782
+ "grad_norm": 29.101645338578166,
783
  "learning_rate": 2.575410918227829e-07,
784
+ "logits/chosen": -0.9476648569107056,
785
+ "logits/rejected": -0.29435592889785767,
786
+ "logps/chosen": -573.229736328125,
787
+ "logps/rejected": -794.494140625,
788
+ "loss": 0.3896,
789
+ "rewards/accuracies": 0.71875,
790
+ "rewards/chosen": -2.8009214401245117,
791
+ "rewards/margins": 2.2908072471618652,
792
+ "rewards/rejected": -5.091729164123535,
793
  "step": 470
794
  },
795
  {
796
  "epoch": 0.5528361646991073,
797
+ "grad_norm": 30.198868045523568,
798
  "learning_rate": 2.474859637463226e-07,
799
+ "logits/chosen": -0.4918448328971863,
800
+ "logits/rejected": 0.3972582221031189,
801
+ "logps/chosen": -556.0670776367188,
802
+ "logps/rejected": -804.6024169921875,
803
+ "loss": 0.3886,
804
+ "rewards/accuracies": 0.8218749761581421,
805
+ "rewards/chosen": -2.8543224334716797,
806
+ "rewards/margins": 2.5813965797424316,
807
+ "rewards/rejected": -5.435718536376953,
808
  "step": 480
809
  },
810
  {
811
  "epoch": 0.5643535847970055,
812
+ "grad_norm": 28.52494731969291,
813
  "learning_rate": 2.3743490301150355e-07,
814
+ "logits/chosen": -0.5242654085159302,
815
+ "logits/rejected": 0.405693382024765,
816
+ "logps/chosen": -570.1974487304688,
817
+ "logps/rejected": -811.2410278320312,
818
+ "loss": 0.3959,
819
+ "rewards/accuracies": 0.778124988079071,
820
+ "rewards/chosen": -2.862198829650879,
821
+ "rewards/margins": 2.4608964920043945,
822
+ "rewards/rejected": -5.323095321655273,
823
  "step": 490
824
  },
825
  {
826
  "epoch": 0.5758710048949035,
827
+ "grad_norm": 27.925871853424944,
828
  "learning_rate": 2.274041707592724e-07,
829
+ "logits/chosen": -1.0720345973968506,
830
+ "logits/rejected": -0.10801704227924347,
831
+ "logps/chosen": -542.1129150390625,
832
+ "logps/rejected": -843.2058715820312,
833
+ "loss": 0.3727,
834
+ "rewards/accuracies": 0.800000011920929,
835
+ "rewards/chosen": -2.6700572967529297,
836
+ "rewards/margins": 2.9847068786621094,
837
+ "rewards/rejected": -5.654764175415039,
838
  "step": 500
839
  },
840
  {
841
  "epoch": 0.5758710048949035,
842
+ "eval_logits/chosen": -1.3008555173873901,
843
+ "eval_logits/rejected": -0.40656155347824097,
844
+ "eval_logps/chosen": -662.9987182617188,
845
+ "eval_logps/rejected": -1055.4913330078125,
846
+ "eval_loss": 0.3260224759578705,
847
+ "eval_rewards/accuracies": 0.8161435127258301,
848
+ "eval_rewards/chosen": -3.720306634902954,
849
+ "eval_rewards/margins": 3.9336841106414795,
850
+ "eval_rewards/rejected": -7.653989791870117,
851
+ "eval_runtime": 644.1686,
852
+ "eval_samples_per_second": 11.062,
853
+ "eval_steps_per_second": 0.346,
854
  "step": 500
855
  },
856
  {
857
  "epoch": 0.5873884249928016,
858
+ "grad_norm": 23.857944610065903,
859
  "learning_rate": 2.17409995242075e-07,
860
+ "logits/chosen": -0.8062444925308228,
861
+ "logits/rejected": 0.046298883855342865,
862
+ "logps/chosen": -568.9137573242188,
863
+ "logps/rejected": -859.3053588867188,
864
+ "loss": 0.3716,
865
+ "rewards/accuracies": 0.7906249761581421,
866
+ "rewards/chosen": -2.9011380672454834,
867
+ "rewards/margins": 3.026390552520752,
868
+ "rewards/rejected": -5.927529335021973,
869
  "step": 510
870
  },
871
  {
872
  "epoch": 0.5989058450906997,
873
+ "grad_norm": 34.73513393097343,
874
  "learning_rate": 2.0746854556892544e-07,
875
+ "logits/chosen": -0.23699383437633514,
876
+ "logits/rejected": 0.7075563073158264,
877
+ "logps/chosen": -630.9346313476562,
878
+ "logps/rejected": -914.3643798828125,
879
+ "loss": 0.3669,
880
  "rewards/accuracies": 0.8187500238418579,
881
+ "rewards/chosen": -3.393051862716675,
882
+ "rewards/margins": 2.9863784313201904,
883
+ "rewards/rejected": -6.379430294036865,
884
  "step": 520
885
  },
886
  {
887
  "epoch": 0.6104232651885978,
888
+ "grad_norm": 31.950736125786133,
889
  "learning_rate": 1.9759590554616173e-07,
890
+ "logits/chosen": -0.5588125586509705,
891
+ "logits/rejected": 0.502726674079895,
892
+ "logps/chosen": -579.3170776367188,
893
+ "logps/rejected": -851.6565551757812,
894
+ "loss": 0.3844,
895
+ "rewards/accuracies": 0.78125,
896
+ "rewards/chosen": -3.0607731342315674,
897
+ "rewards/margins": 2.8013463020324707,
898
+ "rewards/rejected": -5.862119674682617,
899
  "step": 530
900
  },
901
  {
902
  "epoch": 0.6219406852864958,
903
+ "grad_norm": 31.535492973313232,
904
  "learning_rate": 1.8780804765620746e-07,
905
+ "logits/chosen": -0.7882386445999146,
906
+ "logits/rejected": 0.33689114451408386,
907
+ "logps/chosen": -505.6971740722656,
908
+ "logps/rejected": -770.9715576171875,
909
+ "loss": 0.3714,
910
+ "rewards/accuracies": 0.8125,
911
+ "rewards/chosen": -2.2458343505859375,
912
+ "rewards/margins": 2.7756423950195312,
913
+ "rewards/rejected": -5.021476745605469,
914
  "step": 540
915
  },
916
  {
917
  "epoch": 0.6334581053843938,
918
+ "grad_norm": 24.518899300596807,
919
  "learning_rate": 1.7812080721643973e-07,
920
+ "logits/chosen": -0.6544414758682251,
921
+ "logits/rejected": 0.6189062595367432,
922
+ "logps/chosen": -587.9449462890625,
923
+ "logps/rejected": -899.134765625,
924
+ "loss": 0.367,
925
+ "rewards/accuracies": 0.828125,
926
+ "rewards/chosen": -2.8350448608398438,
927
+ "rewards/margins": 3.2712531089782715,
928
+ "rewards/rejected": -6.106298446655273,
929
  "step": 550
930
  },
931
  {
932
  "epoch": 0.644975525482292,
933
+ "grad_norm": 30.6763264648463,
934
  "learning_rate": 1.6854985675997063e-07,
935
+ "logits/chosen": -0.7213582396507263,
936
+ "logits/rejected": 0.4497779905796051,
937
+ "logps/chosen": -577.9434814453125,
938
+ "logps/rejected": -847.3308715820312,
939
+ "loss": 0.3895,
940
+ "rewards/accuracies": 0.800000011920929,
941
+ "rewards/chosen": -2.8931756019592285,
942
+ "rewards/margins": 2.903951644897461,
943
+ "rewards/rejected": -5.797126293182373,
944
  "step": 560
945
  },
946
  {
947
  "epoch": 0.65649294558019,
948
+ "grad_norm": 25.945256821159443,
949
  "learning_rate": 1.5911068067978818e-07,
950
+ "logits/chosen": -0.48560142517089844,
951
+ "logits/rejected": 0.6249833106994629,
952
+ "logps/chosen": -559.338623046875,
953
+ "logps/rejected": -765.5469970703125,
954
+ "loss": 0.4061,
955
+ "rewards/accuracies": 0.737500011920929,
956
+ "rewards/chosen": -2.6765408515930176,
957
+ "rewards/margins": 2.3505935668945312,
958
+ "rewards/rejected": -5.027134895324707,
959
  "step": 570
960
  },
961
  {
962
  "epoch": 0.6680103656780881,
963
+ "grad_norm": 27.330253535761116,
964
  "learning_rate": 1.4981855017728197e-07,
965
+ "logits/chosen": -0.3748374879360199,
966
+ "logits/rejected": 0.31157660484313965,
967
+ "logps/chosen": -530.5482177734375,
968
+ "logps/rejected": -811.3858032226562,
969
+ "loss": 0.3952,
970
+ "rewards/accuracies": 0.778124988079071,
971
+ "rewards/chosen": -2.6320903301239014,
972
+ "rewards/margins": 2.578404188156128,
973
+ "rewards/rejected": -5.2104949951171875,
974
  "step": 580
975
  },
976
  {
977
  "epoch": 0.6795277857759862,
978
+ "grad_norm": 31.016503489930162,
979
  "learning_rate": 1.406884985556804e-07,
980
+ "logits/chosen": -0.366117388010025,
981
+ "logits/rejected": 0.6988335251808167,
982
+ "logps/chosen": -560.417236328125,
983
+ "logps/rejected": -811.5693359375,
984
+ "loss": 0.3842,
985
+ "rewards/accuracies": 0.7593749761581421,
986
+ "rewards/chosen": -2.7148842811584473,
987
+ "rewards/margins": 2.6223959922790527,
988
+ "rewards/rejected": -5.337280750274658,
989
  "step": 590
990
  },
991
  {
992
  "epoch": 0.6910452058738843,
993
+ "grad_norm": 33.49108857296992,
994
  "learning_rate": 1.3173529689837354e-07,
995
+ "logits/chosen": -0.42413753271102905,
996
+ "logits/rejected": 0.8382568359375,
997
+ "logps/chosen": -592.3516845703125,
998
+ "logps/rejected": -904.6168212890625,
999
+ "loss": 0.3933,
1000
+ "rewards/accuracies": 0.800000011920929,
1001
+ "rewards/chosen": -2.979618549346924,
1002
+ "rewards/margins": 3.4002768993377686,
1003
+ "rewards/rejected": -6.3798956871032715,
1004
  "step": 600
1005
  },
1006
  {
1007
  "epoch": 0.6910452058738843,
1008
+ "eval_logits/chosen": -0.5819162130355835,
1009
+ "eval_logits/rejected": 0.32466375827789307,
1010
+ "eval_logps/chosen": -664.277587890625,
1011
+ "eval_logps/rejected": -1041.9088134765625,
1012
+ "eval_loss": 0.3190486431121826,
1013
+ "eval_rewards/accuracies": 0.8256726264953613,
1014
+ "eval_rewards/chosen": -3.733095169067383,
1015
+ "eval_rewards/margins": 3.7850706577301025,
1016
+ "eval_rewards/rejected": -7.5181660652160645,
1017
+ "eval_runtime": 642.0739,
1018
+ "eval_samples_per_second": 11.098,
1019
+ "eval_steps_per_second": 0.347,
1020
  "step": 600
1021
  },
1022
  {
1023
  "epoch": 0.7025626259717823,
1024
+ "grad_norm": 33.29931962582806,
1025
  "learning_rate": 1.2297343017146726e-07,
1026
+ "logits/chosen": -0.12166979163885117,
1027
+ "logits/rejected": 0.9423264265060425,
1028
+ "logps/chosen": -573.4078979492188,
1029
+ "logps/rejected": -842.9945068359375,
1030
+ "loss": 0.38,
1031
+ "rewards/accuracies": 0.8062499761581421,
1032
+ "rewards/chosen": -2.8864645957946777,
1033
+ "rewards/margins": 2.795393705368042,
1034
+ "rewards/rejected": -5.681858539581299,
1035
  "step": 610
1036
  },
1037
  {
1038
  "epoch": 0.7140800460696803,
1039
+ "grad_norm": 27.906297465558936,
1040
  "learning_rate": 1.1441707378923474e-07,
1041
+ "logits/chosen": 0.07007602602243423,
1042
+ "logits/rejected": 0.9558390378952026,
1043
+ "logps/chosen": -573.7945556640625,
1044
+ "logps/rejected": -877.9972534179688,
1045
+ "loss": 0.4068,
1046
+ "rewards/accuracies": 0.8031250238418579,
1047
+ "rewards/chosen": -2.903010368347168,
1048
+ "rewards/margins": 3.0875983238220215,
1049
+ "rewards/rejected": -5.990609169006348,
1050
  "step": 620
1051
  },
1052
  {
1053
  "epoch": 0.7255974661675785,
1054
+ "grad_norm": 22.67579385883027,
1055
  "learning_rate": 1.06080070680377e-07,
1056
+ "logits/chosen": 0.022813748568296432,
1057
+ "logits/rejected": 0.8800600171089172,
1058
+ "logps/chosen": -573.94482421875,
1059
+ "logps/rejected": -831.4425048828125,
1060
+ "loss": 0.3693,
1061
+ "rewards/accuracies": 0.765625,
1062
+ "rewards/chosen": -2.894199848175049,
1063
+ "rewards/margins": 2.732788324356079,
1064
+ "rewards/rejected": -5.626988410949707,
1065
  "step": 630
1066
  },
1067
  {
1068
  "epoch": 0.7371148862654765,
1069
+ "grad_norm": 24.736748332046908,
1070
  "learning_rate": 9.797590889219587e-08,
1071
+ "logits/chosen": 0.08141092956066132,
1072
+ "logits/rejected": 0.9846137166023254,
1073
+ "logps/chosen": -573.6690673828125,
1074
+ "logps/rejected": -854.0896606445312,
1075
+ "loss": 0.3694,
1076
+ "rewards/accuracies": 0.7718750238418579,
1077
+ "rewards/chosen": -3.009357213973999,
1078
+ "rewards/margins": 2.850132465362549,
1079
+ "rewards/rejected": -5.859489440917969,
1080
  "step": 640
1081
  },
1082
  {
1083
  "epoch": 0.7486323063633746,
1084
+ "grad_norm": 19.955009545836916,
1085
  "learning_rate": 9.011769976891367e-08,
1086
+ "logits/chosen": 0.18348722159862518,
1087
+ "logits/rejected": 1.4492700099945068,
1088
+ "logps/chosen": -584.7655029296875,
1089
+ "logps/rejected": -836.3447265625,
1090
+ "loss": 0.3854,
1091
+ "rewards/accuracies": 0.809374988079071,
1092
+ "rewards/chosen": -2.9547104835510254,
1093
+ "rewards/margins": 2.7604758739471436,
1094
+ "rewards/rejected": -5.71518611907959,
1095
  "step": 650
1096
  },
1097
  {
1098
  "epoch": 0.7601497264612727,
1099
+ "grad_norm": 30.274437174678905,
1100
  "learning_rate": 8.251815673944218e-08,
1101
+ "logits/chosen": -0.350322425365448,
1102
+ "logits/rejected": 0.7839977741241455,
1103
+ "logps/chosen": -641.0626220703125,
1104
+ "logps/rejected": -957.6082763671875,
1105
+ "loss": 0.3594,
1106
+ "rewards/accuracies": 0.8218749761581421,
1107
+ "rewards/chosen": -3.33996844291687,
1108
+ "rewards/margins": 3.434263229370117,
1109
+ "rewards/rejected": -6.774231910705566,
1110
  "step": 660
1111
  },
1112
  {
1113
  "epoch": 0.7716671465591708,
1114
+ "grad_norm": 27.553152868967715,
1115
  "learning_rate": 7.518957474892148e-08,
1116
+ "logits/chosen": -0.08678195625543594,
1117
+ "logits/rejected": 0.7337055206298828,
1118
+ "logps/chosen": -591.7120971679688,
1119
+ "logps/rejected": -890.9392700195312,
1120
+ "loss": 0.3827,
1121
+ "rewards/accuracies": 0.7875000238418579,
1122
+ "rewards/chosen": -2.974543333053589,
1123
+ "rewards/margins": 3.0520572662353516,
1124
+ "rewards/rejected": -6.0266008377075195,
1125
  "step": 670
1126
  },
1127
  {
1128
  "epoch": 0.7831845666570688,
1129
+ "grad_norm": 30.232454471420272,
1130
  "learning_rate": 6.814381036730274e-08,
1131
+ "logits/chosen": -0.2323370724916458,
1132
+ "logits/rejected": 0.9590624570846558,
1133
+ "logps/chosen": -585.7045288085938,
1134
+ "logps/rejected": -862.7799072265625,
1135
+ "loss": 0.4032,
1136
+ "rewards/accuracies": 0.7749999761581421,
1137
+ "rewards/chosen": -3.0005805492401123,
1138
+ "rewards/margins": 2.9160284996032715,
1139
+ "rewards/rejected": -5.916609287261963,
1140
  "step": 680
1141
  },
1142
  {
1143
  "epoch": 0.7947019867549668,
1144
+ "grad_norm": 28.297784072359498,
1145
  "learning_rate": 6.139226260715872e-08,
1146
+ "logits/chosen": -0.23143115639686584,
1147
+ "logits/rejected": 0.6265262365341187,
1148
+ "logps/chosen": -598.2958374023438,
1149
+ "logps/rejected": -911.5656127929688,
1150
+ "loss": 0.3747,
1151
+ "rewards/accuracies": 0.7562500238418579,
1152
+ "rewards/chosen": -3.1650335788726807,
1153
+ "rewards/margins": 3.1202616691589355,
1154
+ "rewards/rejected": -6.285294532775879,
1155
  "step": 690
1156
  },
1157
  {
1158
  "epoch": 0.806219406852865,
1159
+ "grad_norm": 27.17313115678408,
1160
  "learning_rate": 5.4945854481754734e-08,
1161
+ "logits/chosen": -0.07786539942026138,
1162
+ "logits/rejected": 0.7690663933753967,
1163
+ "logps/chosen": -588.6981201171875,
1164
+ "logps/rejected": -892.0836181640625,
1165
+ "loss": 0.3858,
1166
+ "rewards/accuracies": 0.7562500238418579,
1167
+ "rewards/chosen": -3.089141845703125,
1168
+ "rewards/margins": 3.093146800994873,
1169
+ "rewards/rejected": -6.182288646697998,
1170
  "step": 700
1171
  },
1172
  {
1173
  "epoch": 0.806219406852865,
1174
+ "eval_logits/chosen": -0.6058293581008911,
1175
+ "eval_logits/rejected": 0.35863998532295227,
1176
+ "eval_logps/chosen": -686.6614379882812,
1177
+ "eval_logps/rejected": -1093.6546630859375,
1178
+ "eval_loss": 0.31664812564849854,
1179
+ "eval_rewards/accuracies": 0.8245515823364258,
1180
+ "eval_rewards/chosen": -3.9569337368011475,
1181
+ "eval_rewards/margins": 4.078691005706787,
1182
+ "eval_rewards/rejected": -8.035624504089355,
1183
+ "eval_runtime": 644.0322,
1184
+ "eval_samples_per_second": 11.065,
1185
+ "eval_steps_per_second": 0.346,
1186
  "step": 700
1187
  },
1188
  {
1189
  "epoch": 0.817736826950763,
1190
+ "grad_norm": 26.61646646409602,
1191
  "learning_rate": 4.881501533321605e-08,
1192
+ "logits/chosen": -0.4410906732082367,
1193
+ "logits/rejected": 0.5729657411575317,
1194
+ "logps/chosen": -575.347900390625,
1195
+ "logps/rejected": -934.3646240234375,
1196
+ "loss": 0.3728,
1197
+ "rewards/accuracies": 0.8187500238418579,
1198
+ "rewards/chosen": -2.907287359237671,
1199
+ "rewards/margins": 3.6123032569885254,
1200
+ "rewards/rejected": -6.519589900970459,
1201
  "step": 710
1202
  },
1203
  {
1204
  "epoch": 0.8292542470486611,
1205
+ "grad_norm": 29.443243927116985,
1206
  "learning_rate": 4.300966395938377e-08,
1207
+ "logits/chosen": -0.38687664270401,
1208
+ "logits/rejected": 0.5803302526473999,
1209
+ "logps/chosen": -589.1840209960938,
1210
+ "logps/rejected": -924.7605590820312,
1211
+ "loss": 0.3718,
1212
+ "rewards/accuracies": 0.8125,
1213
+ "rewards/chosen": -3.089477062225342,
1214
+ "rewards/margins": 3.404120683670044,
1215
+ "rewards/rejected": -6.493597507476807,
1216
  "step": 720
1217
  },
1218
  {
1219
  "epoch": 0.8407716671465592,
1220
+ "grad_norm": 27.075379532779515,
1221
  "learning_rate": 3.7539192566655246e-08,
1222
+ "logits/chosen": -0.2331455647945404,
1223
+ "logits/rejected": 0.7184351086616516,
1224
+ "logps/chosen": -597.6744384765625,
1225
+ "logps/rejected": -883.7164916992188,
1226
+ "loss": 0.3743,
1227
+ "rewards/accuracies": 0.8062499761581421,
1228
+ "rewards/chosen": -3.0997419357299805,
1229
+ "rewards/margins": 2.9383721351623535,
1230
+ "rewards/rejected": -6.038114547729492,
1231
  "step": 730
1232
  },
1233
  {
1234
  "epoch": 0.8522890872444573,
1235
+ "grad_norm": 34.009147703744425,
1236
  "learning_rate": 3.24124515747731e-08,
1237
+ "logits/chosen": 0.15024222433567047,
1238
+ "logits/rejected": 1.2283384799957275,
1239
+ "logps/chosen": -628.3982543945312,
1240
+ "logps/rejected": -985.0546875,
1241
+ "loss": 0.3708,
1242
+ "rewards/accuracies": 0.7906249761581421,
1243
+ "rewards/chosen": -3.4668242931365967,
1244
+ "rewards/margins": 3.758965253829956,
1245
+ "rewards/rejected": -7.2257890701293945,
1246
  "step": 740
1247
  },
1248
  {
1249
  "epoch": 0.8638065073423553,
1250
+ "grad_norm": 32.52714315668832,
1251
  "learning_rate": 2.763773529814506e-08,
1252
+ "logits/chosen": 0.2603093981742859,
1253
+ "logits/rejected": 1.1775026321411133,
1254
+ "logps/chosen": -583.00439453125,
1255
+ "logps/rejected": -904.5720825195312,
1256
+ "loss": 0.3746,
1257
+ "rewards/accuracies": 0.778124988079071,
1258
+ "rewards/chosen": -3.006887912750244,
1259
+ "rewards/margins": 3.236215114593506,
1260
+ "rewards/rejected": -6.24310302734375,
1261
  "step": 750
1262
  },
1263
  {
1264
  "epoch": 0.8753239274402533,
1265
+ "grad_norm": 27.98558075342816,
1266
  "learning_rate": 2.3222768526860698e-08,
1267
+ "logits/chosen": 0.1701681762933731,
1268
+ "logits/rejected": 1.1987775564193726,
1269
+ "logps/chosen": -613.1824951171875,
1270
+ "logps/rejected": -900.89111328125,
1271
+ "loss": 0.3922,
1272
+ "rewards/accuracies": 0.753125011920929,
1273
+ "rewards/chosen": -3.226149320602417,
1274
+ "rewards/margins": 2.7997944355010986,
1275
+ "rewards/rejected": -6.025943756103516,
1276
  "step": 760
1277
  },
1278
  {
1279
  "epoch": 0.8868413475381515,
1280
+ "grad_norm": 29.982562987973527,
1281
  "learning_rate": 1.9174694029115146e-08,
1282
+ "logits/chosen": 0.12872493267059326,
1283
+ "logits/rejected": 1.1761186122894287,
1284
+ "logps/chosen": -600.1962890625,
1285
+ "logps/rejected": -947.6414794921875,
1286
+ "loss": 0.3739,
1287
+ "rewards/accuracies": 0.8500000238418579,
1288
+ "rewards/chosen": -3.291241407394409,
1289
+ "rewards/margins": 3.434460401535034,
1290
+ "rewards/rejected": -6.725701808929443,
1291
  "step": 770
1292
  },
1293
  {
1294
  "epoch": 0.8983587676360495,
1295
+ "grad_norm": 29.902322430965125,
1296
  "learning_rate": 1.5500060995258134e-08,
1297
+ "logits/chosen": 0.07875040918588638,
1298
+ "logits/rejected": 1.1803662776947021,
1299
+ "logps/chosen": -587.24951171875,
1300
+ "logps/rejected": -848.0086059570312,
1301
+ "loss": 0.3833,
1302
+ "rewards/accuracies": 0.7437499761581421,
1303
+ "rewards/chosen": -3.145979642868042,
1304
+ "rewards/margins": 2.7399418354034424,
1305
+ "rewards/rejected": -5.885921955108643,
1306
  "step": 780
1307
  },
1308
  {
1309
  "epoch": 0.9098761877339476,
1310
+ "grad_norm": 31.87265554134813,
1311
  "learning_rate": 1.2204814442165812e-08,
1312
+ "logits/chosen": 0.0372554175555706,
1313
+ "logits/rejected": 0.9403896331787109,
1314
+ "logps/chosen": -605.0985107421875,
1315
+ "logps/rejected": -893.2574462890625,
1316
+ "loss": 0.3836,
1317
+ "rewards/accuracies": 0.8031250238418579,
1318
+ "rewards/chosen": -3.219792127609253,
1319
+ "rewards/margins": 2.947915554046631,
1320
+ "rewards/rejected": -6.1677069664001465,
1321
  "step": 790
1322
  },
1323
  {
1324
  "epoch": 0.9213936078318457,
1325
+ "grad_norm": 28.29913492171739,
1326
  "learning_rate": 9.294285595075669e-09,
1327
+ "logits/chosen": 0.1543000489473343,
1328
+ "logits/rejected": 1.00014328956604,
1329
+ "logps/chosen": -594.6954956054688,
1330
+ "logps/rejected": -909.7340087890625,
1331
+ "loss": 0.3785,
1332
+ "rewards/accuracies": 0.7562500238418579,
1333
+ "rewards/chosen": -3.133711338043213,
1334
+ "rewards/margins": 3.2508773803710938,
1335
+ "rewards/rejected": -6.384588241577148,
1336
  "step": 800
1337
  },
1338
  {
1339
  "epoch": 0.9213936078318457,
1340
+ "eval_logits/chosen": -0.424582302570343,
1341
+ "eval_logits/rejected": 0.5558030605316162,
1342
+ "eval_logps/chosen": -702.706787109375,
1343
+ "eval_logps/rejected": -1123.9625244140625,
1344
+ "eval_loss": 0.31607937812805176,
1345
+ "eval_rewards/accuracies": 0.8211883306503296,
1346
+ "eval_rewards/chosen": -4.117387771606445,
1347
+ "eval_rewards/margins": 4.221314430236816,
1348
+ "eval_rewards/rejected": -8.338702201843262,
1349
+ "eval_runtime": 644.1715,
1350
+ "eval_samples_per_second": 11.062,
1351
+ "eval_steps_per_second": 0.346,
1352
  "step": 800
1353
  },
1354
  {
1355
  "epoch": 0.9329110279297438,
1356
+ "grad_norm": 28.271393767746897,
1357
  "learning_rate": 6.773183262446914e-09,
1358
+ "logits/chosen": 0.015259919688105583,
1359
+ "logits/rejected": 1.0111353397369385,
1360
+ "logps/chosen": -610.5919189453125,
1361
+ "logps/rejected": -938.6796875,
1362
+ "loss": 0.3958,
1363
+ "rewards/accuracies": 0.778124988079071,
1364
+ "rewards/chosen": -3.225220203399658,
1365
+ "rewards/margins": 3.3511810302734375,
1366
+ "rewards/rejected": -6.5764007568359375,
1367
  "step": 810
1368
  },
1369
  {
1370
  "epoch": 0.9444284480276418,
1371
+ "grad_norm": 36.747504831392355,
1372
  "learning_rate": 4.645586217799452e-09,
1373
+ "logits/chosen": -0.05728811025619507,
1374
+ "logits/rejected": 1.0568214654922485,
1375
+ "logps/chosen": -602.8958129882812,
1376
+ "logps/rejected": -914.1162109375,
1377
+ "loss": 0.392,
1378
+ "rewards/accuracies": 0.784375011920929,
1379
+ "rewards/chosen": -2.9873690605163574,
1380
+ "rewards/margins": 3.2894864082336426,
1381
+ "rewards/rejected": -6.27685546875,
1382
  "step": 820
1383
  },
1384
  {
1385
  "epoch": 0.9559458681255398,
1386
+ "grad_norm": 30.708105872731245,
1387
  "learning_rate": 2.9149366008568987e-09,
1388
+ "logits/chosen": 0.1167169064283371,
1389
+ "logits/rejected": 1.0058209896087646,
1390
+ "logps/chosen": -580.9910278320312,
1391
+ "logps/rejected": -903.1005859375,
1392
+ "loss": 0.405,
1393
  "rewards/accuracies": 0.778124988079071,
1394
+ "rewards/chosen": -2.9704513549804688,
1395
+ "rewards/margins": 3.3174147605895996,
1396
+ "rewards/rejected": -6.287866115570068,
1397
  "step": 830
1398
  },
1399
  {
1400
  "epoch": 0.967463288223438,
1401
+ "grad_norm": 26.852523446236688,
1402
  "learning_rate": 1.5840343486700215e-09,
1403
+ "logits/chosen": 0.10119374096393585,
1404
+ "logits/rejected": 1.3030340671539307,
1405
+ "logps/chosen": -624.4727783203125,
1406
+ "logps/rejected": -907.2607421875,
1407
+ "loss": 0.4136,
1408
+ "rewards/accuracies": 0.7875000238418579,
1409
+ "rewards/chosen": -3.329115629196167,
1410
+ "rewards/margins": 3.0215249061584473,
1411
+ "rewards/rejected": -6.350640296936035,
1412
  "step": 840
1413
  },
1414
  {
1415
  "epoch": 0.978980708321336,
1416
+ "grad_norm": 29.02811217591628,
1417
  "learning_rate": 6.550326657293881e-10,
1418
+ "logits/chosen": 0.16462787985801697,
1419
+ "logits/rejected": 1.183337926864624,
1420
+ "logps/chosen": -576.1724853515625,
1421
+ "logps/rejected": -849.2581176757812,
1422
+ "loss": 0.4037,
1423
+ "rewards/accuracies": 0.793749988079071,
1424
+ "rewards/chosen": -3.109165668487549,
1425
+ "rewards/margins": 2.8826241493225098,
1426
+ "rewards/rejected": -5.991789817810059,
1427
  "step": 850
1428
  },
1429
  {
1430
  "epoch": 0.9904981284192341,
1431
+ "grad_norm": 38.83754661046954,
1432
  "learning_rate": 1.2943454039654467e-10,
1433
+ "logits/chosen": 0.04645358771085739,
1434
+ "logits/rejected": 1.1813570261001587,
1435
+ "logps/chosen": -563.9542236328125,
1436
+ "logps/rejected": -806.0048828125,
1437
+ "loss": 0.385,
1438
+ "rewards/accuracies": 0.784375011920929,
1439
+ "rewards/chosen": -2.8706278800964355,
1440
+ "rewards/margins": 2.5064380168914795,
1441
+ "rewards/rejected": -5.377066135406494,
1442
  "step": 860
1443
  },
1444
  {
1445
  "epoch": 0.9997120644975526,
1446
  "step": 868,
1447
  "total_flos": 0.0,
1448
+ "train_loss": 0.42924998652550483,
1449
+ "train_runtime": 32201.1967,
1450
+ "train_samples_per_second": 3.451,
1451
+ "train_steps_per_second": 0.027
1452
  }
1453
  ],
1454
  "logging_steps": 10,