NicholasCorrado committed on
Commit d214813 · verified · 1 Parent(s): b606e18

Model save

README.md CHANGED
@@ -3,16 +3,10 @@ library_name: transformers
 license: apache-2.0
 base_model: alignment-handbook/zephyr-7b-sft-full
 tags:
-- alignment-handbook
-- trl
-- dpo
-- generated_from_trainer
 - trl
 - dpo
 - alignment-handbook
 - generated_from_trainer
-datasets:
-- HuggingFaceH4/ultrafeedback_binarized
 model-index:
 - name: zephyr-7b-dpo-full
   results: []
@@ -23,17 +17,17 @@ should probably proofread and complete it, then remove this comment. -->
 
 # zephyr-7b-dpo-full
 
-This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the HuggingFaceH4/ultrafeedback_binarized dataset.
+This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.2473
-- Rewards/chosen: -4.6815
-- Rewards/rejected: -10.5131
-- Rewards/accuracies: 0.8525
-- Rewards/margins: 5.8316
-- Logps/rejected: -1354.8135
-- Logps/chosen: -759.6055
-- Logits/rejected: -1.2709
-- Logits/chosen: -1.7157
+- Logits/chosen: -0.3096
+- Logits/rejected: 0.6049
+- Logps/chosen: -755.9323
+- Logps/rejected: -1192.5621
+- Loss: 0.3152
+- Rewards/accuracies: 0.8184
+- Rewards/chosen: -4.6496
+- Rewards/margins: 4.3751
+- Rewards/rejected: -9.0247
 
 ## Model description
 
@@ -70,9 +64,14 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch  | Step | Logits/chosen | Logits/rejected | Logps/chosen | Logps/rejected | Validation Loss | Rewards/accuracies | Rewards/chosen | Rewards/margins | Rewards/rejected |
 |:-------------:|:------:|:----:|:-------------:|:---------------:|:------------:|:--------------:|:---------------:|:------------------:|:--------------:|:---------------:|:----------------:|
-| 0.3043        | 0.2559 | 100  | -2.9645       | -2.9613         | -589.3575    | -935.3794      | 0.3080          | 0.8245             | -2.9790        | 3.3398          | -6.3188          |
-| 0.2557        | 0.5118 | 200  | -2.2748       | -2.0707         | -709.4976    | -1222.8809     | 0.2607          | 0.8470             | -4.1804        | 5.0134          | -9.1938          |
-| 0.2515        | 0.7678 | 300  | -1.8309       | -1.4322         | -726.7409    | -1277.9103     | 0.2493          | 0.8509             | -4.3528        | 5.3912          | -9.7441          |
+| 0.5385        | 0.1152 | 100  | -2.9012       | -2.8749         | -433.4271    | -527.4997      | 0.4593          | 0.7539             | -1.4246        | 0.9495          | -2.3741          |
+| 0.4369        | 0.2303 | 200  | -1.5078       | -1.1798         | -594.2914    | -823.1062      | 0.3590          | 0.7915             | -3.0332        | 2.2969          | -5.3301          |
+| 0.4119        | 0.3455 | 300  | -0.6166       | -0.1140         | -677.2002    | -996.9340      | 0.3369          | 0.8156             | -3.8623        | 3.2061          | -7.0684          |
+| 0.3964        | 0.4607 | 400  | -0.6209       | 0.2313          | -753.4187    | -1128.0946     | 0.3311          | 0.8178             | -4.6245        | 3.7555          | -8.3800          |
+| 0.3858        | 0.5759 | 500  | -0.7776       | 0.1893          | -694.4181    | -1049.8429     | 0.3247          | 0.8167             | -4.0345        | 3.5630          | -7.5975          |
+| 0.4031        | 0.6910 | 600  | -0.2605       | 0.6163          | -748.3096    | -1143.1573     | 0.3191          | 0.8201             | -4.5734        | 3.9572          | -8.5306          |
+| 0.4007        | 0.8062 | 700  | -0.4982       | 0.4411          | -753.0112    | -1189.4250     | 0.3171          | 0.8178             | -4.6204        | 4.3729          | -8.9933          |
+| 0.3644        | 0.9214 | 800  | -0.3096       | 0.6049          | -755.9323    | -1192.5621     | 0.3152          | 0.8184             | -4.6496        | 4.3751          | -9.0247          |
 
 
 ### Framework versions
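
For readers of the model card above, here is a minimal usage sketch with the Transformers API. It is not part of this commit; the repo id is an assumption for illustration, and the chat template comes from the SFT base model.

```python
# Minimal usage sketch (assumed repo id, not part of this commit).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "NicholasCorrado/zephyr-7b-dpo-full"  # hypothetical repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map="auto")

# The SFT base model ships a chat template, so format the prompt through it.
messages = [{"role": "user", "content": "Summarize what DPO training does."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```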
all_results.json CHANGED
@@ -1,22 +1,22 @@
 {
-    "epoch": 0.9980806142034548,
-    "eval_logits/chosen": -1.7157304286956787,
-    "eval_logits/rejected": -1.2709392309188843,
-    "eval_logps/chosen": -759.6055297851562,
-    "eval_logps/rejected": -1354.8134765625,
-    "eval_loss": 0.24732786417007446,
-    "eval_rewards/accuracies": 0.8524844646453857,
-    "eval_rewards/chosen": -4.681485176086426,
-    "eval_rewards/margins": 5.831614017486572,
-    "eval_rewards/rejected": -10.513099670410156,
-    "eval_runtime": 461.8444,
-    "eval_samples": 5126,
-    "eval_samples_per_second": 11.099,
-    "eval_steps_per_second": 0.349,
+    "epoch": 0.9997120644975526,
+    "eval_logits/chosen": -0.29694831371307373,
+    "eval_logits/rejected": 0.6135479807853699,
+    "eval_logps/chosen": -754.9666748046875,
+    "eval_logps/rejected": -1189.8031005859375,
+    "eval_loss": 0.3148016333580017,
+    "eval_rewards/accuracies": 0.8178251385688782,
+    "eval_rewards/chosen": -4.639986038208008,
+    "eval_rewards/margins": 4.357123374938965,
+    "eval_rewards/rejected": -8.997109413146973,
+    "eval_runtime": 645.0187,
+    "eval_samples": 7126,
+    "eval_samples_per_second": 11.048,
+    "eval_steps_per_second": 0.346,
     "total_flos": 0.0,
     "train_loss": 0.0,
-    "train_runtime": 0.0175,
-    "train_samples": 50000,
-    "train_samples_per_second": 2864609.543,
-    "train_steps_per_second": 22343.954
+    "train_runtime": 0.0211,
+    "train_samples": 111134,
+    "train_samples_per_second": 5273498.215,
+    "train_steps_per_second": 41188.083
 }
config.json CHANGED
@@ -22,6 +22,6 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.44.1",
-  "use_cache": true,
+  "use_cache": false,
   "vocab_size": 32000
 }
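
The only change here flips `use_cache` from `true` to `false`. The commit does not say why, but this flag is commonly disabled during training because the KV cache is incompatible with gradient checkpointing, so checkpoints saved by trainers often ship with it off; for generation you would typically switch it back on. A small sketch, with an assumed repo id:

```python
# Sketch: re-enable the KV cache for generation (the saved config has use_cache=false).
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("NicholasCorrado/zephyr-7b-dpo-full")  # hypothetical repo id
model.config.use_cache = True  # reuse past key/values so generate() does not recompute the prefix
```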
eval_results.json CHANGED
@@ -1,16 +1,16 @@
 {
-    "epoch": 0.9980806142034548,
-    "eval_logits/chosen": -1.7157304286956787,
-    "eval_logits/rejected": -1.2709392309188843,
-    "eval_logps/chosen": -759.6055297851562,
-    "eval_logps/rejected": -1354.8134765625,
-    "eval_loss": 0.24732786417007446,
-    "eval_rewards/accuracies": 0.8524844646453857,
-    "eval_rewards/chosen": -4.681485176086426,
-    "eval_rewards/margins": 5.831614017486572,
-    "eval_rewards/rejected": -10.513099670410156,
-    "eval_runtime": 461.8444,
-    "eval_samples": 5126,
-    "eval_samples_per_second": 11.099,
-    "eval_steps_per_second": 0.349
+    "epoch": 0.9997120644975526,
+    "eval_logits/chosen": -0.29694831371307373,
+    "eval_logits/rejected": 0.6135479807853699,
+    "eval_logps/chosen": -754.9666748046875,
+    "eval_logps/rejected": -1189.8031005859375,
+    "eval_loss": 0.3148016333580017,
+    "eval_rewards/accuracies": 0.8178251385688782,
+    "eval_rewards/chosen": -4.639986038208008,
+    "eval_rewards/margins": 4.357123374938965,
+    "eval_rewards/rejected": -8.997109413146973,
+    "eval_runtime": 645.0187,
+    "eval_samples": 7126,
+    "eval_samples_per_second": 11.048,
+    "eval_steps_per_second": 0.346
 }
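
The `eval_rewards/*` keys in these results follow the usual DPO bookkeeping: the implicit reward of a completion is the beta-scaled log-probability ratio between the trained policy and the frozen reference model, the margin is chosen minus rejected, and accuracy is the fraction of pairs with a positive margin. A short sketch of that convention (beta = 0.1 is an assumed value, not read from this commit):

```python
# Sketch of the standard DPO metric convention (beta is an assumed value).
import torch
import torch.nn.functional as F

def dpo_metrics(policy_chosen_logps, policy_rejected_logps,
                ref_chosen_logps, ref_rejected_logps, beta=0.1):
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)        # rewards/chosen
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)  # rewards/rejected
    margins = chosen_rewards - rejected_rewards                             # rewards/margins
    accuracy = (margins > 0).float().mean()                                 # rewards/accuracies
    loss = -F.logsigmoid(margins).mean()                                    # sigmoid DPO loss
    return chosen_rewards.mean(), rejected_rewards.mean(), margins.mean(), accuracy, loss
```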
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4fb52896daf5ed8eb8941bcce3b0f886228633d39d7dd833b32693271e26ec8b
+oid sha256:897f55e46c5eba22bbc53c9f48bf32d4d7a8dce6b4e774d89b4ccce301997e8b
 size 4943162336
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6052514b704355da0bd67161adf45ce888741ffbf7639d96c8c61ca48c402768
+oid sha256:3bd18ed0c2802d7611637a96ee62089ef6667152f6fab83d0922666bf04f1e0f
 size 4999819336
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fee6b3e918791993ece5b563d2121f65fe38698c5b48f06f1543da6c7fffa15e
+oid sha256:8636350f0d86db4f7f6444dab2e4d9577031e3af1bedb354a872a725f1d2071b
 size 4540516344
runs/Aug25_01-32-44_ip-10-0-9-154.ec2.internal/events.out.tfevents.1724550949.ip-10-0-9-154.ec2.internal.80146.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4a26a27eab8959c5e6c16945f90a815c480d360ccfd2fa5bffe3ae23eebee680
+size 6511
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 0.9980806142034548,
+    "epoch": 0.9997120644975526,
     "total_flos": 0.0,
     "train_loss": 0.0,
-    "train_runtime": 0.0175,
-    "train_samples": 50000,
-    "train_samples_per_second": 2864609.543,
-    "train_steps_per_second": 22343.954
+    "train_runtime": 0.0211,
+    "train_samples": 111134,
+    "train_samples_per_second": 5273498.215,
+    "train_steps_per_second": 41188.083
 }
trainer_state.json CHANGED
@@ -1,21 +1,21 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9980806142034548,
5
  "eval_steps": 100,
6
- "global_step": 390,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0025591810620601407,
13
- "grad_norm": 8.372040796681393,
14
- "learning_rate": 1.282051282051282e-08,
15
- "logits/chosen": -2.9558680057525635,
16
- "logits/rejected": -2.9835896492004395,
17
- "logps/chosen": -287.1746520996094,
18
- "logps/rejected": -318.6817626953125,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,650 +24,1435 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.025591810620601407,
28
- "grad_norm": 9.212524406222368,
29
- "learning_rate": 1.2820512820512818e-07,
30
- "logits/chosen": -3.0168228149414062,
31
- "logits/rejected": -3.0099453926086426,
32
- "logps/chosen": -286.0946044921875,
33
- "logps/rejected": -304.9287414550781,
34
- "loss": 0.693,
35
- "rewards/accuracies": 0.4097222089767456,
36
- "rewards/chosen": -0.00034835602855309844,
37
- "rewards/margins": -5.7743654906516895e-05,
38
- "rewards/rejected": -0.00029061237000860274,
39
  "step": 10
40
  },
41
  {
42
- "epoch": 0.05118362124120281,
43
- "grad_norm": 8.914663082765845,
44
- "learning_rate": 2.5641025641025636e-07,
45
- "logits/chosen": -3.006526470184326,
46
- "logits/rejected": -2.9971041679382324,
47
- "logps/chosen": -283.1783447265625,
48
- "logps/rejected": -298.82427978515625,
49
- "loss": 0.69,
50
- "rewards/accuracies": 0.6468750238418579,
51
- "rewards/chosen": 0.0020886282436549664,
52
- "rewards/margins": 0.00550027284771204,
53
- "rewards/rejected": -0.0034116446040570736,
54
  "step": 20
55
  },
56
  {
57
- "epoch": 0.07677543186180422,
58
- "grad_norm": 9.211645338164717,
59
- "learning_rate": 3.8461538461538463e-07,
60
- "logits/chosen": -3.0022165775299072,
61
- "logits/rejected": -2.997166156768799,
62
- "logps/chosen": -280.0216369628906,
63
- "logps/rejected": -295.76959228515625,
64
- "loss": 0.6714,
65
- "rewards/accuracies": 0.7593749761581421,
66
- "rewards/chosen": 0.009928617626428604,
67
- "rewards/margins": 0.04372577741742134,
68
- "rewards/rejected": -0.03379715979099274,
69
  "step": 30
70
  },
71
  {
72
- "epoch": 0.10236724248240563,
73
- "grad_norm": 10.081033679769522,
74
- "learning_rate": 4.99989986344963e-07,
75
- "logits/chosen": -3.0331904888153076,
76
- "logits/rejected": -3.0275347232818604,
77
- "logps/chosen": -285.2835998535156,
78
- "logps/rejected": -319.5372314453125,
79
- "loss": 0.615,
80
- "rewards/accuracies": 0.828125,
81
- "rewards/chosen": -0.042057085782289505,
82
- "rewards/margins": 0.17162299156188965,
83
- "rewards/rejected": -0.21368007361888885,
84
  "step": 40
85
  },
86
  {
87
- "epoch": 0.12795905310300704,
88
- "grad_norm": 13.895203665337698,
89
- "learning_rate": 4.987893180827479e-07,
90
- "logits/chosen": -3.0868072509765625,
91
- "logits/rejected": -3.0783658027648926,
92
- "logps/chosen": -368.4632263183594,
93
- "logps/rejected": -446.92608642578125,
94
- "loss": 0.5101,
95
- "rewards/accuracies": 0.784375011920929,
96
- "rewards/chosen": -0.7513679265975952,
97
- "rewards/margins": 0.5743271112442017,
98
- "rewards/rejected": -1.3256951570510864,
99
  "step": 50
100
  },
101
  {
102
- "epoch": 0.15355086372360843,
103
- "grad_norm": 17.530401994512683,
104
- "learning_rate": 4.955969343539162e-07,
105
- "logits/chosen": -3.0945253372192383,
106
- "logits/rejected": -3.077105760574341,
107
- "logps/chosen": -525.1121215820312,
108
- "logps/rejected": -675.7432861328125,
109
- "loss": 0.4203,
110
- "rewards/accuracies": 0.7875000238418579,
111
- "rewards/chosen": -2.4605610370635986,
112
- "rewards/margins": 1.4077235460281372,
113
- "rewards/rejected": -3.8682847023010254,
114
  "step": 60
115
  },
116
  {
117
- "epoch": 0.17914267434420986,
118
- "grad_norm": 20.655420138174726,
119
- "learning_rate": 4.90438392204474e-07,
120
- "logits/chosen": -3.1209347248077393,
121
- "logits/rejected": -3.1095337867736816,
122
- "logps/chosen": -557.0096435546875,
123
- "logps/rejected": -769.7672119140625,
124
- "loss": 0.3553,
125
- "rewards/accuracies": 0.824999988079071,
126
- "rewards/chosen": -2.73195219039917,
127
- "rewards/margins": 1.957765817642212,
128
- "rewards/rejected": -4.6897172927856445,
129
  "step": 70
130
  },
131
  {
132
- "epoch": 0.20473448496481125,
133
- "grad_norm": 34.20074706717272,
134
- "learning_rate": 4.83354989019146e-07,
135
- "logits/chosen": -3.0491955280303955,
136
- "logits/rejected": -3.05132794380188,
137
- "logps/chosen": -558.9640502929688,
138
- "logps/rejected": -801.9044799804688,
139
- "loss": 0.3429,
140
- "rewards/accuracies": 0.778124988079071,
141
- "rewards/chosen": -2.753610134124756,
142
- "rewards/margins": 2.3364205360412598,
143
- "rewards/rejected": -5.090030193328857,
144
  "step": 80
145
  },
146
  {
147
- "epoch": 0.23032629558541268,
148
- "grad_norm": 25.187798132399784,
149
- "learning_rate": 4.7440343190975353e-07,
150
- "logits/chosen": -3.0432305335998535,
151
- "logits/rejected": -3.041344165802002,
152
- "logps/chosen": -570.7512817382812,
153
- "logps/rejected": -843.0035400390625,
154
- "loss": 0.3284,
155
- "rewards/accuracies": 0.8343750238418579,
156
- "rewards/chosen": -2.7428793907165527,
157
- "rewards/margins": 2.6616787910461426,
158
- "rewards/rejected": -5.404558181762695,
159
  "step": 90
160
  },
161
  {
162
- "epoch": 0.2559181062060141,
163
- "grad_norm": 39.301413976140616,
164
- "learning_rate": 4.6365538373900506e-07,
165
- "logits/chosen": -3.000190019607544,
166
- "logits/rejected": -3.000822067260742,
167
- "logps/chosen": -588.419677734375,
168
- "logps/rejected": -898.4981689453125,
169
- "loss": 0.3043,
170
- "rewards/accuracies": 0.8125,
171
- "rewards/chosen": -2.9578394889831543,
172
- "rewards/margins": 3.0813093185424805,
173
- "rewards/rejected": -6.039149284362793,
174
  "step": 100
175
  },
176
  {
177
- "epoch": 0.2559181062060141,
178
- "eval_logits/chosen": -2.96449875831604,
179
- "eval_logits/rejected": -2.961296796798706,
180
- "eval_logps/chosen": -589.3575439453125,
181
- "eval_logps/rejected": -935.37939453125,
182
- "eval_loss": 0.3079955577850342,
183
- "eval_rewards/accuracies": 0.8245341777801514,
184
- "eval_rewards/chosen": -2.9790048599243164,
185
- "eval_rewards/margins": 3.339751958847046,
186
- "eval_rewards/rejected": -6.318756580352783,
187
- "eval_runtime": 475.0898,
188
- "eval_samples_per_second": 10.79,
189
- "eval_steps_per_second": 0.339,
190
  "step": 100
191
  },
192
  {
193
- "epoch": 0.28150991682661547,
194
- "grad_norm": 23.2464532886725,
195
- "learning_rate": 4.5119688941406386e-07,
196
- "logits/chosen": -2.9700212478637695,
197
- "logits/rejected": -2.974587917327881,
198
- "logps/chosen": -593.6639404296875,
199
- "logps/rejected": -905.212890625,
200
- "loss": 0.3259,
201
- "rewards/accuracies": 0.8125,
202
- "rewards/chosen": -3.0440046787261963,
203
- "rewards/margins": 2.9606070518493652,
204
- "rewards/rejected": -6.004611492156982,
205
  "step": 110
206
  },
207
  {
208
- "epoch": 0.30710172744721687,
209
- "grad_norm": 20.92968428786604,
210
- "learning_rate": 4.3712768704277524e-07,
211
- "logits/chosen": -2.954521417617798,
212
- "logits/rejected": -2.959869861602783,
213
- "logps/chosen": -536.2525634765625,
214
- "logps/rejected": -830.0895385742188,
215
- "loss": 0.3015,
216
- "rewards/accuracies": 0.8125,
217
- "rewards/chosen": -2.4020705223083496,
218
- "rewards/margins": 2.877993106842041,
219
- "rewards/rejected": -5.280063629150391,
220
  "step": 120
221
  },
222
  {
223
- "epoch": 0.3326935380678183,
224
- "grad_norm": 28.437529952019855,
225
- "learning_rate": 4.2156040946718343e-07,
226
- "logits/chosen": -2.862247943878174,
227
- "logits/rejected": -2.8785834312438965,
228
- "logps/chosen": -636.6513061523438,
229
- "logps/rejected": -1044.999755859375,
230
- "loss": 0.2841,
231
- "rewards/accuracies": 0.824999988079071,
232
- "rewards/chosen": -3.4623122215270996,
233
- "rewards/margins": 3.8583245277404785,
234
- "rewards/rejected": -7.320636749267578,
235
  "step": 130
236
  },
237
  {
238
- "epoch": 0.3582853486884197,
239
- "grad_norm": 19.315717522096396,
240
- "learning_rate": 4.046196825665637e-07,
241
- "logits/chosen": -2.832946538925171,
242
- "logits/rejected": -2.8259646892547607,
243
- "logps/chosen": -593.8186645507812,
244
- "logps/rejected": -979.2440185546875,
245
- "loss": 0.2622,
246
- "rewards/accuracies": 0.8374999761581421,
247
- "rewards/chosen": -2.943942070007324,
248
- "rewards/margins": 3.7794156074523926,
249
- "rewards/rejected": -6.723358154296875,
250
  "step": 140
251
  },
252
  {
253
- "epoch": 0.3838771593090211,
254
- "grad_norm": 22.976543958848772,
255
- "learning_rate": 3.864411275486261e-07,
256
- "logits/chosen": -2.760894536972046,
257
- "logits/rejected": -2.7563464641571045,
258
- "logps/chosen": -634.1366577148438,
259
- "logps/rejected": -1078.597412109375,
260
- "loss": 0.2794,
261
- "rewards/accuracies": 0.8218749761581421,
262
- "rewards/chosen": -3.5211944580078125,
263
- "rewards/margins": 4.23276424407959,
264
- "rewards/rejected": -7.753958702087402,
265
  "step": 150
266
  },
267
  {
268
- "epoch": 0.4094689699296225,
269
- "grad_norm": 21.849223996398678,
270
- "learning_rate": 3.671702752161759e-07,
271
- "logits/chosen": -2.6357340812683105,
272
- "logits/rejected": -2.5901741981506348,
273
- "logps/chosen": -729.1080322265625,
274
- "logps/rejected": -1168.1109619140625,
275
- "loss": 0.2781,
276
- "rewards/accuracies": 0.8374999761581421,
277
- "rewards/chosen": -4.447979927062988,
278
- "rewards/margins": 4.25943660736084,
279
- "rewards/rejected": -8.707415580749512,
280
  "step": 160
281
  },
282
  {
283
- "epoch": 0.4350607805502239,
284
- "grad_norm": 34.268492856409395,
285
- "learning_rate": 3.4696140090121375e-07,
286
- "logits/chosen": -2.5432353019714355,
287
- "logits/rejected": -2.4383697509765625,
288
- "logps/chosen": -768.44775390625,
289
- "logps/rejected": -1241.2236328125,
290
- "loss": 0.2592,
291
- "rewards/accuracies": 0.8531249761581421,
292
- "rewards/chosen": -4.733465194702148,
293
- "rewards/margins": 4.677088737487793,
294
- "rewards/rejected": -9.410554885864258,
295
  "step": 170
296
  },
297
  {
298
- "epoch": 0.46065259117082535,
299
- "grad_norm": 24.22372375688885,
300
- "learning_rate": 3.259762893935617e-07,
301
- "logits/chosen": -2.5379650592803955,
302
- "logits/rejected": -2.4227848052978516,
303
- "logps/chosen": -643.3690185546875,
304
- "logps/rejected": -1086.7647705078125,
305
- "loss": 0.2982,
306
- "rewards/accuracies": 0.8218749761581421,
307
- "rewards/chosen": -3.5777480602264404,
308
- "rewards/margins": 4.262465476989746,
309
- "rewards/rejected": -7.840213775634766,
310
  "step": 180
311
  },
312
  {
313
- "epoch": 0.48624440179142675,
314
- "grad_norm": 22.276548976639525,
315
- "learning_rate": 3.0438293975154184e-07,
316
- "logits/chosen": -2.4349989891052246,
317
- "logits/rejected": -2.2799932956695557,
318
- "logps/chosen": -682.0303955078125,
319
- "logps/rejected": -1134.5205078125,
320
- "loss": 0.2551,
321
- "rewards/accuracies": 0.8531249761581421,
322
- "rewards/chosen": -3.8549671173095703,
323
- "rewards/margins": 4.42364501953125,
324
- "rewards/rejected": -8.27861213684082,
325
  "step": 190
326
  },
327
  {
328
- "epoch": 0.5118362124120281,
329
- "grad_norm": 23.143027388197456,
330
- "learning_rate": 2.823542203635138e-07,
331
- "logits/chosen": -2.3481929302215576,
332
- "logits/rejected": -2.147021770477295,
333
- "logps/chosen": -698.4183959960938,
334
- "logps/rejected": -1117.36962890625,
335
- "loss": 0.2557,
336
- "rewards/accuracies": 0.8031250238418579,
337
- "rewards/chosen": -3.897473096847534,
338
- "rewards/margins": 4.184942722320557,
339
- "rewards/rejected": -8.082415580749512,
340
  "step": 200
341
  },
342
  {
343
- "epoch": 0.5118362124120281,
344
- "eval_logits/chosen": -2.2748405933380127,
345
- "eval_logits/rejected": -2.0707473754882812,
346
- "eval_logps/chosen": -709.49755859375,
347
- "eval_logps/rejected": -1222.880859375,
348
- "eval_loss": 0.2607395350933075,
349
- "eval_rewards/accuracies": 0.8470497131347656,
350
- "eval_rewards/chosen": -4.1804046630859375,
351
- "eval_rewards/margins": 5.013367652893066,
352
- "eval_rewards/rejected": -9.193772315979004,
353
- "eval_runtime": 467.0944,
354
- "eval_samples_per_second": 10.974,
355
- "eval_steps_per_second": 0.345,
356
  "step": 200
357
  },
358
  {
359
- "epoch": 0.5374280230326296,
360
- "grad_norm": 29.714729651434116,
361
- "learning_rate": 2.600664850273538e-07,
362
- "logits/chosen": -2.220996379852295,
363
- "logits/rejected": -2.0096168518066406,
364
- "logps/chosen": -736.2384033203125,
365
- "logps/rejected": -1215.465576171875,
366
- "loss": 0.265,
367
- "rewards/accuracies": 0.8187500238418579,
368
- "rewards/chosen": -4.440661907196045,
369
- "rewards/margins": 4.793159484863281,
370
- "rewards/rejected": -9.2338228225708,
371
  "step": 210
372
  },
373
  {
374
- "epoch": 0.5630198336532309,
375
- "grad_norm": 30.279152935247957,
376
- "learning_rate": 2.3769816112703045e-07,
377
- "logits/chosen": -2.0213561058044434,
378
- "logits/rejected": -1.708433747291565,
379
- "logps/chosen": -803.655029296875,
380
- "logps/rejected": -1369.3001708984375,
381
- "loss": 0.253,
382
- "rewards/accuracies": 0.828125,
383
- "rewards/chosen": -5.206329345703125,
384
- "rewards/margins": 5.589818000793457,
385
- "rewards/rejected": -10.796146392822266,
386
  "step": 220
387
  },
388
  {
389
- "epoch": 0.5886116442738324,
390
- "grad_norm": 25.614713397243474,
391
- "learning_rate": 2.1542832120881677e-07,
392
- "logits/chosen": -1.8582950830459595,
393
- "logits/rejected": -1.4825233221054077,
394
- "logps/chosen": -796.53857421875,
395
- "logps/rejected": -1305.88818359375,
396
- "loss": 0.2601,
397
- "rewards/accuracies": 0.8187500238418579,
398
- "rewards/chosen": -5.0762786865234375,
399
- "rewards/margins": 5.091577053070068,
400
- "rewards/rejected": -10.167856216430664,
401
  "step": 230
402
  },
403
  {
404
- "epoch": 0.6142034548944337,
405
- "grad_norm": 24.31506504955288,
406
- "learning_rate": 1.934352493925695e-07,
407
- "logits/chosen": -1.9886703491210938,
408
- "logits/rejected": -1.6142040491104126,
409
- "logps/chosen": -769.7489013671875,
410
- "logps/rejected": -1311.3548583984375,
411
- "loss": 0.2748,
412
- "rewards/accuracies": 0.8062499761581421,
413
- "rewards/chosen": -4.7199506759643555,
414
- "rewards/margins": 5.420409202575684,
415
- "rewards/rejected": -10.140359878540039,
416
  "step": 240
417
  },
418
  {
419
- "epoch": 0.6397952655150352,
420
- "grad_norm": 31.574156427087846,
421
- "learning_rate": 1.7189501409486059e-07,
422
- "logits/chosen": -2.0121378898620605,
423
- "logits/rejected": -1.6347030401229858,
424
- "logps/chosen": -716.2213134765625,
425
- "logps/rejected": -1249.190185546875,
426
- "loss": 0.2809,
427
- "rewards/accuracies": 0.840624988079071,
428
- "rewards/chosen": -4.3439483642578125,
429
- "rewards/margins": 5.274473667144775,
430
- "rewards/rejected": -9.61842155456543,
431
  "step": 250
432
  },
433
  {
434
- "epoch": 0.6653870761356366,
435
- "grad_norm": 19.51529401796244,
436
- "learning_rate": 1.5098005849021078e-07,
437
- "logits/chosen": -2.051848888397217,
438
- "logits/rejected": -1.7610851526260376,
439
- "logps/chosen": -730.6099853515625,
440
- "logps/rejected": -1209.3929443359375,
441
- "loss": 0.2457,
442
- "rewards/accuracies": 0.800000011920929,
443
- "rewards/chosen": -4.3370680809021,
444
- "rewards/margins": 4.67025089263916,
445
- "rewards/rejected": -9.007319450378418,
446
  "step": 260
447
  },
448
  {
449
- "epoch": 0.690978886756238,
450
- "grad_norm": 34.445303465962446,
451
- "learning_rate": 1.30857819994673e-07,
452
- "logits/chosen": -1.923056960105896,
453
- "logits/rejected": -1.6418602466583252,
454
- "logps/chosen": -726.4392700195312,
455
- "logps/rejected": -1264.95458984375,
456
- "loss": 0.256,
457
- "rewards/accuracies": 0.831250011920929,
458
- "rewards/chosen": -4.332821369171143,
459
- "rewards/margins": 5.251183032989502,
460
- "rewards/rejected": -9.584003448486328,
461
  "step": 270
462
  },
463
  {
464
- "epoch": 0.7165706973768394,
465
- "grad_norm": 36.88942757740681,
466
- "learning_rate": 1.116893898236716e-07,
467
- "logits/chosen": -1.9537960290908813,
468
- "logits/rejected": -1.6011472940444946,
469
- "logps/chosen": -746.1478271484375,
470
- "logps/rejected": -1309.915283203125,
471
- "loss": 0.2386,
472
- "rewards/accuracies": 0.856249988079071,
473
- "rewards/chosen": -4.4233832359313965,
474
- "rewards/margins": 5.588069438934326,
475
- "rewards/rejected": -10.011453628540039,
476
  "step": 280
477
  },
478
  {
479
- "epoch": 0.7421625079974408,
480
- "grad_norm": 49.4114473741805,
481
- "learning_rate": 9.362822335518062e-08,
482
- "logits/chosen": -1.8809627294540405,
483
- "logits/rejected": -1.427119493484497,
484
- "logps/chosen": -769.44140625,
485
- "logps/rejected": -1329.4346923828125,
486
- "loss": 0.2622,
487
- "rewards/accuracies": 0.8843749761581421,
488
- "rewards/chosen": -4.582036972045898,
489
- "rewards/margins": 5.65748929977417,
490
- "rewards/rejected": -10.239526748657227,
491
  "step": 290
492
  },
493
  {
494
- "epoch": 0.7677543186180422,
495
- "grad_norm": 19.047743120052225,
496
- "learning_rate": 7.681891162260015e-08,
497
- "logits/chosen": -1.828704833984375,
498
- "logits/rejected": -1.5141593217849731,
499
- "logps/chosen": -717.1990966796875,
500
- "logps/rejected": -1231.6229248046875,
501
- "loss": 0.2515,
502
- "rewards/accuracies": 0.8187500238418579,
503
- "rewards/chosen": -4.390562057495117,
504
- "rewards/margins": 4.981083869934082,
505
- "rewards/rejected": -9.3716459274292,
506
  "step": 300
507
  },
508
  {
509
- "epoch": 0.7677543186180422,
510
- "eval_logits/chosen": -1.830853819847107,
511
- "eval_logits/rejected": -1.4321902990341187,
512
- "eval_logps/chosen": -726.7409057617188,
513
- "eval_logps/rejected": -1277.9102783203125,
514
- "eval_loss": 0.24932526051998138,
515
- "eval_rewards/accuracies": 0.850931704044342,
516
- "eval_rewards/chosen": -4.352837562561035,
517
- "eval_rewards/margins": 5.391228675842285,
518
- "eval_rewards/rejected": -9.74406623840332,
519
- "eval_runtime": 468.6767,
520
- "eval_samples_per_second": 10.937,
521
- "eval_steps_per_second": 0.344,
522
  "step": 300
523
  },
524
  {
525
- "epoch": 0.7933461292386437,
526
- "grad_norm": 27.67097540824916,
527
- "learning_rate": 6.139602377230247e-08,
528
- "logits/chosen": -1.7593371868133545,
529
- "logits/rejected": -1.3604390621185303,
530
- "logps/chosen": -744.55078125,
531
- "logps/rejected": -1289.401611328125,
532
- "loss": 0.2523,
533
- "rewards/accuracies": 0.8500000238418579,
534
- "rewards/chosen": -4.4812517166137695,
535
- "rewards/margins": 5.370087623596191,
536
- "rewards/rejected": -9.851339340209961,
537
  "step": 310
538
  },
539
  {
540
- "epoch": 0.818937939859245,
541
- "grad_norm": 27.140457734231973,
542
- "learning_rate": 4.748302975270837e-08,
543
- "logits/chosen": -1.7739003896713257,
544
- "logits/rejected": -1.296608805656433,
545
- "logps/chosen": -752.3242797851562,
546
- "logps/rejected": -1264.2474365234375,
547
- "loss": 0.2396,
548
- "rewards/accuracies": 0.8343750238418579,
549
- "rewards/chosen": -4.523016452789307,
550
- "rewards/margins": 5.08230447769165,
551
- "rewards/rejected": -9.605320930480957,
552
  "step": 320
553
  },
554
  {
555
- "epoch": 0.8445297504798465,
556
- "grad_norm": 19.26284094768001,
557
- "learning_rate": 3.5191311859445795e-08,
558
- "logits/chosen": -1.762459397315979,
559
- "logits/rejected": -1.3729654550552368,
560
- "logps/chosen": -762.8904418945312,
561
- "logps/rejected": -1324.27197265625,
562
- "loss": 0.2321,
563
- "rewards/accuracies": 0.828125,
564
- "rewards/chosen": -4.7744550704956055,
565
- "rewards/margins": 5.492222785949707,
566
- "rewards/rejected": -10.266677856445312,
567
  "step": 330
568
  },
569
  {
570
- "epoch": 0.8701215611004478,
571
- "grad_norm": 24.386284614263385,
572
- "learning_rate": 2.4619273049795996e-08,
573
- "logits/chosen": -1.7161592245101929,
574
- "logits/rejected": -1.374194860458374,
575
- "logps/chosen": -761.0625,
576
- "logps/rejected": -1358.326416015625,
577
- "loss": 0.2605,
578
- "rewards/accuracies": 0.8656250238418579,
579
- "rewards/chosen": -4.776429653167725,
580
- "rewards/margins": 5.739912986755371,
581
- "rewards/rejected": -10.516342163085938,
582
  "step": 340
583
  },
584
  {
585
- "epoch": 0.8957133717210493,
586
- "grad_norm": 25.575727288966945,
587
- "learning_rate": 1.5851549164932115e-08,
588
- "logits/chosen": -1.698293924331665,
589
- "logits/rejected": -1.249987006187439,
590
- "logps/chosen": -781.341552734375,
591
- "logps/rejected": -1365.3270263671875,
592
- "loss": 0.2447,
593
- "rewards/accuracies": 0.8218749761581421,
594
- "rewards/chosen": -4.80244255065918,
595
- "rewards/margins": 5.769272804260254,
596
- "rewards/rejected": -10.571714401245117,
597
  "step": 350
598
  },
599
  {
600
- "epoch": 0.9213051823416507,
601
- "grad_norm": 28.833379096024903,
602
- "learning_rate": 8.958331366609423e-09,
603
- "logits/chosen": -1.625765085220337,
604
- "logits/rejected": -1.215453863143921,
605
- "logps/chosen": -729.2706298828125,
606
- "logps/rejected": -1337.571044921875,
607
- "loss": 0.2386,
608
- "rewards/accuracies": 0.84375,
609
- "rewards/chosen": -4.483765602111816,
610
- "rewards/margins": 5.966723442077637,
611
- "rewards/rejected": -10.450489044189453,
612
  "step": 360
613
  },
614
  {
615
- "epoch": 0.946896992962252,
616
- "grad_norm": 32.57067435027107,
617
- "learning_rate": 3.994804212627461e-09,
618
- "logits/chosen": -1.7305755615234375,
619
- "logits/rejected": -1.2781140804290771,
620
- "logps/chosen": -772.3482666015625,
621
- "logps/rejected": -1352.6353759765625,
622
- "loss": 0.2442,
623
- "rewards/accuracies": 0.8374999761581421,
624
- "rewards/chosen": -4.808593273162842,
625
- "rewards/margins": 5.681990623474121,
626
- "rewards/rejected": -10.490584373474121,
627
  "step": 370
628
  },
629
  {
630
- "epoch": 0.9724888035828535,
631
- "grad_norm": 52.329250984527555,
632
- "learning_rate": 1.0007038696262516e-09,
633
- "logits/chosen": -1.7279059886932373,
634
- "logits/rejected": -1.3428099155426025,
635
- "logps/chosen": -765.5159301757812,
636
- "logps/rejected": -1355.879150390625,
637
- "loss": 0.2353,
638
- "rewards/accuracies": 0.84375,
639
- "rewards/chosen": -4.687448024749756,
640
- "rewards/margins": 5.837033271789551,
641
- "rewards/rejected": -10.524479866027832,
642
  "step": 380
643
  },
644
  {
645
- "epoch": 0.9980806142034548,
646
- "grad_norm": 22.836707059503702,
647
- "learning_rate": 0.0,
648
- "logits/chosen": -1.6817991733551025,
649
- "logits/rejected": -1.2504949569702148,
650
- "logps/chosen": -785.3814086914062,
651
- "logps/rejected": -1332.812744140625,
652
- "loss": 0.2478,
653
- "rewards/accuracies": 0.856249988079071,
654
- "rewards/chosen": -4.990485191345215,
655
- "rewards/margins": 5.404683589935303,
656
- "rewards/rejected": -10.395169258117676,
657
  "step": 390
658
  },
659
  {
660
- "epoch": 0.9980806142034548,
661
- "step": 390,
662
  "total_flos": 0.0,
663
  "train_loss": 0.0,
664
- "train_runtime": 0.0175,
665
- "train_samples_per_second": 2864609.543,
666
- "train_steps_per_second": 22343.954
667
  }
668
  ],
669
  "logging_steps": 10,
670
- "max_steps": 390,
671
  "num_input_tokens_seen": 0,
672
  "num_train_epochs": 1,
673
  "save_steps": 100,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9997120644975526,
5
  "eval_steps": 100,
6
+ "global_step": 868,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 1.0,
13
+ "grad_norm": 13.015832712288159,
14
+ "learning_rate": 5e-07,
15
+ "logits/chosen": -2.605381965637207,
16
+ "logits/rejected": -2.5362534523010254,
17
+ "logps/chosen": -197.4033660888672,
18
+ "logps/rejected": -176.15130615234375,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.01151742009789807,
28
+ "grad_norm": 8.230031374538095,
29
+ "learning_rate": 5.747126436781609e-08,
30
+ "logits/chosen": -2.797184705734253,
31
+ "logits/rejected": -2.768812417984009,
32
+ "logps/chosen": -266.24053955078125,
33
+ "logps/rejected": -265.971923828125,
34
+ "loss": 0.6928,
35
+ "rewards/accuracies": 0.4305555522441864,
36
+ "rewards/chosen": -0.00021778659720439464,
37
+ "rewards/margins": -0.00010571091843303293,
38
+ "rewards/rejected": -0.00011207569332327694,
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.02303484019579614,
43
+ "grad_norm": 8.31009452460146,
44
+ "learning_rate": 1.1494252873563217e-07,
45
+ "logits/chosen": -2.802431583404541,
46
+ "logits/rejected": -2.773219347000122,
47
+ "logps/chosen": -287.32781982421875,
48
+ "logps/rejected": -273.28900146484375,
49
+ "loss": 0.6923,
50
+ "rewards/accuracies": 0.5406249761581421,
51
+ "rewards/chosen": 0.0006046505295671523,
52
+ "rewards/margins": 0.0014849099097773433,
53
+ "rewards/rejected": -0.0008802594384178519,
54
  "step": 20
55
  },
56
  {
57
+ "epoch": 0.03455226029369421,
58
+ "grad_norm": 8.066889291282722,
59
+ "learning_rate": 1.7241379310344828e-07,
60
+ "logits/chosen": -2.804356098175049,
61
+ "logits/rejected": -2.7821590900421143,
62
+ "logps/chosen": -278.156494140625,
63
+ "logps/rejected": -270.8301086425781,
64
+ "loss": 0.6905,
65
+ "rewards/accuracies": 0.684374988079071,
66
+ "rewards/chosen": 0.0025812473613768816,
67
+ "rewards/margins": 0.005251543130725622,
68
+ "rewards/rejected": -0.0026702960021793842,
69
  "step": 30
70
  },
71
  {
72
+ "epoch": 0.04606968039159228,
73
+ "grad_norm": 8.939044393747595,
74
+ "learning_rate": 2.2988505747126435e-07,
75
+ "logits/chosen": -2.8080034255981445,
76
+ "logits/rejected": -2.7811412811279297,
77
+ "logps/chosen": -272.1091003417969,
78
+ "logps/rejected": -268.6837158203125,
79
+ "loss": 0.6858,
80
+ "rewards/accuracies": 0.703125,
81
+ "rewards/chosen": 0.007119017653167248,
82
+ "rewards/margins": 0.016155635938048363,
83
+ "rewards/rejected": -0.009036618284881115,
84
  "step": 40
85
  },
86
  {
87
+ "epoch": 0.05758710048949035,
88
+ "grad_norm": 9.805284456793881,
89
+ "learning_rate": 2.873563218390804e-07,
90
+ "logits/chosen": -2.834063768386841,
91
+ "logits/rejected": -2.7892394065856934,
92
+ "logps/chosen": -284.08453369140625,
93
+ "logps/rejected": -282.91802978515625,
94
+ "loss": 0.6762,
95
+ "rewards/accuracies": 0.75,
96
+ "rewards/chosen": 0.014029329642653465,
97
+ "rewards/margins": 0.03542623296380043,
98
+ "rewards/rejected": -0.021396907046437263,
99
  "step": 50
100
  },
101
  {
102
+ "epoch": 0.06910452058738842,
103
+ "grad_norm": 8.352607046334498,
104
+ "learning_rate": 3.4482758620689656e-07,
105
+ "logits/chosen": -2.805022716522217,
106
+ "logits/rejected": -2.796321392059326,
107
+ "logps/chosen": -292.1920166015625,
108
+ "logps/rejected": -302.4415588378906,
109
+ "loss": 0.6624,
110
+ "rewards/accuracies": 0.71875,
111
+ "rewards/chosen": 0.006681998260319233,
112
+ "rewards/margins": 0.06977846473455429,
113
+ "rewards/rejected": -0.06309647113084793,
114
  "step": 60
115
  },
116
  {
117
+ "epoch": 0.0806219406852865,
118
+ "grad_norm": 9.823702522936284,
119
+ "learning_rate": 4.0229885057471266e-07,
120
+ "logits/chosen": -2.75339674949646,
121
+ "logits/rejected": -2.751986026763916,
122
+ "logps/chosen": -281.77618408203125,
123
+ "logps/rejected": -300.4095153808594,
124
+ "loss": 0.6322,
125
+ "rewards/accuracies": 0.734375,
126
+ "rewards/chosen": -0.07559685409069061,
127
+ "rewards/margins": 0.150864839553833,
128
+ "rewards/rejected": -0.22646169364452362,
129
  "step": 70
130
  },
131
  {
132
+ "epoch": 0.09213936078318456,
133
+ "grad_norm": 11.550756640744595,
134
+ "learning_rate": 4.597701149425287e-07,
135
+ "logits/chosen": -2.8751022815704346,
136
+ "logits/rejected": -2.8525900840759277,
137
+ "logps/chosen": -316.79888916015625,
138
+ "logps/rejected": -340.1561584472656,
139
+ "loss": 0.5913,
140
+ "rewards/accuracies": 0.765625,
141
+ "rewards/chosen": -0.24550755321979523,
142
+ "rewards/margins": 0.29761967062950134,
143
+ "rewards/rejected": -0.5431272387504578,
144
  "step": 80
145
  },
146
  {
147
+ "epoch": 0.10365678088108264,
148
+ "grad_norm": 18.102002209139584,
149
+ "learning_rate": 4.999817969178237e-07,
150
+ "logits/chosen": -2.8152594566345215,
151
+ "logits/rejected": -2.7724924087524414,
152
+ "logps/chosen": -363.1444396972656,
153
+ "logps/rejected": -401.7603759765625,
154
+ "loss": 0.5547,
155
+ "rewards/accuracies": 0.746874988079071,
156
+ "rewards/chosen": -0.6734243631362915,
157
+ "rewards/margins": 0.5091755986213684,
158
+ "rewards/rejected": -1.1825997829437256,
159
  "step": 90
160
  },
161
  {
162
+ "epoch": 0.1151742009789807,
163
+ "grad_norm": 15.592173368744417,
164
+ "learning_rate": 4.996582603056428e-07,
165
+ "logits/chosen": -2.7807068824768066,
166
+ "logits/rejected": -2.75152325630188,
167
+ "logps/chosen": -403.1298828125,
168
+ "logps/rejected": -451.24072265625,
169
+ "loss": 0.5385,
170
+ "rewards/accuracies": 0.7093750238418579,
171
+ "rewards/chosen": -1.0208370685577393,
172
+ "rewards/margins": 0.6278557181358337,
173
+ "rewards/rejected": -1.6486928462982178,
174
  "step": 100
175
  },
176
  {
177
+ "epoch": 0.1151742009789807,
178
+ "eval_logits/chosen": -2.9011571407318115,
179
+ "eval_logits/rejected": -2.874889373779297,
180
+ "eval_logps/chosen": -433.42706298828125,
181
+ "eval_logps/rejected": -527.4996948242188,
182
+ "eval_loss": 0.45933064818382263,
183
+ "eval_rewards/accuracies": 0.753923773765564,
184
+ "eval_rewards/chosen": -1.424589991569519,
185
+ "eval_rewards/margins": 0.9494837522506714,
186
+ "eval_rewards/rejected": -2.3740737438201904,
187
+ "eval_runtime": 651.6627,
188
+ "eval_samples_per_second": 10.935,
189
+ "eval_steps_per_second": 0.342,
190
  "step": 100
191
  },
192
  {
193
+ "epoch": 0.12669162107687879,
194
+ "grad_norm": 17.802035855151065,
195
+ "learning_rate": 4.989308132738126e-07,
196
+ "logits/chosen": -2.731767416000366,
197
+ "logits/rejected": -2.702854633331299,
198
+ "logps/chosen": -390.03009033203125,
199
+ "logps/rejected": -461.499755859375,
200
+ "loss": 0.4959,
201
+ "rewards/accuracies": 0.6812499761581421,
202
+ "rewards/chosen": -1.160954236984253,
203
+ "rewards/margins": 0.7525253295898438,
204
+ "rewards/rejected": -1.9134795665740967,
205
  "step": 110
206
  },
207
  {
208
+ "epoch": 0.13820904117477684,
209
+ "grad_norm": 26.847609346017396,
210
+ "learning_rate": 4.978006327248536e-07,
211
+ "logits/chosen": -2.6494832038879395,
212
+ "logits/rejected": -2.6402511596679688,
213
+ "logps/chosen": -438.6656799316406,
214
+ "logps/rejected": -550.1033325195312,
215
+ "loss": 0.475,
216
+ "rewards/accuracies": 0.7437499761581421,
217
+ "rewards/chosen": -1.6023308038711548,
218
+ "rewards/margins": 1.078300952911377,
219
+ "rewards/rejected": -2.680631637573242,
220
  "step": 120
221
  },
222
  {
223
+ "epoch": 0.14972646127267492,
224
+ "grad_norm": 25.087856993190254,
225
+ "learning_rate": 4.962695471250032e-07,
226
+ "logits/chosen": -2.4692533016204834,
227
+ "logits/rejected": -2.435044050216675,
228
+ "logps/chosen": -499.8922424316406,
229
+ "logps/rejected": -645.5679931640625,
230
+ "loss": 0.468,
231
+ "rewards/accuracies": 0.746874988079071,
232
+ "rewards/chosen": -2.0733580589294434,
233
+ "rewards/margins": 1.5583977699279785,
234
+ "rewards/rejected": -3.631755828857422,
235
  "step": 130
236
  },
237
  {
238
+ "epoch": 0.161243881370573,
239
+ "grad_norm": 26.974432330966298,
240
+ "learning_rate": 4.94340033546025e-07,
241
+ "logits/chosen": -1.697016716003418,
242
+ "logits/rejected": -1.593400239944458,
243
+ "logps/chosen": -511.65814208984375,
244
+ "logps/rejected": -659.9658813476562,
245
+ "loss": 0.4654,
246
+ "rewards/accuracies": 0.746874988079071,
247
+ "rewards/chosen": -2.305452823638916,
248
+ "rewards/margins": 1.5949369668960571,
249
+ "rewards/rejected": -3.9003894329071045,
250
  "step": 140
251
  },
252
  {
253
+ "epoch": 0.17276130146847107,
254
+ "grad_norm": 21.115401587052915,
255
+ "learning_rate": 4.920152136576705e-07,
256
+ "logits/chosen": -1.4327126741409302,
257
+ "logits/rejected": -1.2659103870391846,
258
+ "logps/chosen": -538.796630859375,
259
+ "logps/rejected": -664.4251098632812,
260
+ "loss": 0.4789,
261
+ "rewards/accuracies": 0.753125011920929,
262
+ "rewards/chosen": -2.33674955368042,
263
+ "rewards/margins": 1.4603914022445679,
264
+ "rewards/rejected": -3.7971413135528564,
265
  "step": 150
266
  },
267
  {
268
+ "epoch": 0.18427872156636912,
269
+ "grad_norm": 24.637364700318916,
270
+ "learning_rate": 4.892988486772756e-07,
271
+ "logits/chosen": -1.4591898918151855,
272
+ "logits/rejected": -1.3274848461151123,
273
+ "logps/chosen": -468.7333068847656,
274
+ "logps/rejected": -612.8162841796875,
275
+ "loss": 0.4462,
276
+ "rewards/accuracies": 0.7749999761581421,
277
+ "rewards/chosen": -1.950823187828064,
278
+ "rewards/margins": 1.431302785873413,
279
+ "rewards/rejected": -3.3821263313293457,
280
  "step": 160
281
  },
282
  {
283
+ "epoch": 0.1957961416642672,
284
+ "grad_norm": 27.13923752480491,
285
+ "learning_rate": 4.861953332846629e-07,
286
+ "logits/chosen": -1.2759544849395752,
287
+ "logits/rejected": -1.0808634757995605,
288
+ "logps/chosen": -469.6282653808594,
289
+ "logps/rejected": -628.2378540039062,
290
+ "loss": 0.444,
291
+ "rewards/accuracies": 0.7593749761581421,
292
+ "rewards/chosen": -2.0090174674987793,
293
+ "rewards/margins": 1.5872033834457397,
294
+ "rewards/rejected": -3.5962207317352295,
295
  "step": 170
296
  },
297
  {
298
+ "epoch": 0.20731356176216528,
299
+ "grad_norm": 22.29941288426432,
300
+ "learning_rate": 4.827096885121953e-07,
301
+ "logits/chosen": -0.8839688301086426,
302
+ "logits/rejected": -0.664128839969635,
303
+ "logps/chosen": -591.6177978515625,
304
+ "logps/rejected": -778.9203491210938,
305
+ "loss": 0.4486,
306
+ "rewards/accuracies": 0.7406250238418579,
307
+ "rewards/chosen": -3.2479281425476074,
308
+ "rewards/margins": 1.83078134059906,
309
+ "rewards/rejected": -5.078709125518799,
310
  "step": 180
311
  },
312
  {
313
+ "epoch": 0.21883098186006333,
314
+ "grad_norm": 20.150152801800882,
315
+ "learning_rate": 4.788475536214821e-07,
316
+ "logits/chosen": -1.1295298337936401,
317
+ "logits/rejected": -0.8731690645217896,
318
+ "logps/chosen": -518.4920654296875,
319
+ "logps/rejected": -677.3343505859375,
320
+ "loss": 0.4248,
321
+ "rewards/accuracies": 0.778124988079071,
322
+ "rewards/chosen": -2.0726380348205566,
323
+ "rewards/margins": 1.7125848531723022,
324
+ "rewards/rejected": -3.7852234840393066,
325
  "step": 190
326
  },
327
  {
328
+ "epoch": 0.2303484019579614,
329
+ "grad_norm": 24.4341951464939,
330
+ "learning_rate": 4.746151769798818e-07,
331
+ "logits/chosen": -0.9307588338851929,
332
+ "logits/rejected": -0.6262258291244507,
333
+ "logps/chosen": -524.0397338867188,
334
+ "logps/rejected": -701.8967895507812,
335
+ "loss": 0.4369,
336
+ "rewards/accuracies": 0.7437499761581421,
337
+ "rewards/chosen": -2.360715389251709,
338
+ "rewards/margins": 1.8277909755706787,
339
+ "rewards/rejected": -4.188506603240967,
340
  "step": 200
341
  },
342
  {
343
+ "epoch": 0.2303484019579614,
344
+ "eval_logits/chosen": -1.5077687501907349,
345
+ "eval_logits/rejected": -1.1797598600387573,
346
+ "eval_logps/chosen": -594.2913818359375,
347
+ "eval_logps/rejected": -823.106201171875,
348
+ "eval_loss": 0.3589639961719513,
349
+ "eval_rewards/accuracies": 0.7914798259735107,
350
+ "eval_rewards/chosen": -3.033234119415283,
351
+ "eval_rewards/margins": 2.2969048023223877,
352
+ "eval_rewards/rejected": -5.330138683319092,
353
+ "eval_runtime": 650.6064,
354
+ "eval_samples_per_second": 10.953,
355
+ "eval_steps_per_second": 0.343,
356
  "step": 200
357
  },
358
  {
359
+ "epoch": 0.2418658220558595,
360
+ "grad_norm": 27.352856519591263,
361
+ "learning_rate": 4.7001940595156055e-07,
362
+ "logits/chosen": -0.7815187573432922,
363
+ "logits/rejected": -0.46700936555862427,
364
+ "logps/chosen": -518.8436279296875,
365
+ "logps/rejected": -683.1966552734375,
366
+ "loss": 0.4274,
367
+ "rewards/accuracies": 0.7250000238418579,
368
+ "rewards/chosen": -2.385855197906494,
369
+ "rewards/margins": 1.621694564819336,
370
+ "rewards/rejected": -4.00754976272583,
371
  "step": 210
372
  },
373
  {
374
+ "epoch": 0.25338324215375757,
375
+ "grad_norm": 29.897947419384028,
376
+ "learning_rate": 4.650676758194623e-07,
377
+ "logits/chosen": -0.5421683192253113,
378
+ "logits/rejected": -0.02623056247830391,
379
+ "logps/chosen": -606.1685791015625,
380
+ "logps/rejected": -831.0916137695312,
381
+ "loss": 0.4012,
382
+ "rewards/accuracies": 0.765625,
383
+ "rewards/chosen": -3.0587515830993652,
384
+ "rewards/margins": 2.499514102935791,
385
+ "rewards/rejected": -5.558266639709473,
386
  "step": 220
387
  },
388
  {
389
+ "epoch": 0.26490066225165565,
390
+ "grad_norm": 28.31850344555953,
391
+ "learning_rate": 4.5976799775611215e-07,
392
+ "logits/chosen": -0.28304657340049744,
393
+ "logits/rejected": 0.2166980504989624,
394
+ "logps/chosen": -565.9539794921875,
395
+ "logps/rejected": -766.6756591796875,
396
+ "loss": 0.4392,
397
+ "rewards/accuracies": 0.796875,
398
+ "rewards/chosen": -2.7024905681610107,
399
+ "rewards/margins": 2.067142963409424,
400
+ "rewards/rejected": -4.7696332931518555,
401
  "step": 230
402
  },
403
  {
404
+ "epoch": 0.2764180823495537,
405
+ "grad_norm": 25.790552553148434,
406
+ "learning_rate": 4.5412894586271543e-07,
407
+ "logits/chosen": -0.3281463384628296,
408
+ "logits/rejected": 0.12199939787387848,
409
+ "logps/chosen": -534.4832763671875,
410
+ "logps/rejected": -700.3882446289062,
411
+ "loss": 0.4403,
412
+ "rewards/accuracies": 0.793749988079071,
413
+ "rewards/chosen": -2.3464341163635254,
414
+ "rewards/margins": 1.9047329425811768,
415
+ "rewards/rejected": -4.251167297363281,
416
  "step": 240
417
  },
418
  {
419
+ "epoch": 0.28793550244745175,
420
+ "grad_norm": 29.425669097369397,
421
+ "learning_rate": 4.481596432975201e-07,
422
+ "logits/chosen": -0.6021678447723389,
423
+ "logits/rejected": -0.20536144077777863,
424
+ "logps/chosen": -615.7349853515625,
425
+ "logps/rejected": -839.0997924804688,
426
+ "loss": 0.4298,
427
+ "rewards/accuracies": 0.7593749761581421,
428
+ "rewards/chosen": -3.1481640338897705,
429
+ "rewards/margins": 2.2502574920654297,
430
+ "rewards/rejected": -5.398421764373779,
431
  "step": 250
432
  },
433
  {
434
+ "epoch": 0.29945292254534983,
435
+ "grad_norm": 23.62933629230091,
436
+ "learning_rate": 4.41869747515886e-07,
437
+ "logits/chosen": -0.2845512330532074,
438
+ "logits/rejected": 0.14756298065185547,
439
+ "logps/chosen": -572.5442504882812,
440
+ "logps/rejected": -812.703125,
441
+ "loss": 0.3968,
442
+ "rewards/accuracies": 0.768750011920929,
443
+ "rewards/chosen": -2.856945514678955,
444
+ "rewards/margins": 2.3578898906707764,
445
+ "rewards/rejected": -5.214835166931152,
446
  "step": 260
447
  },
448
  {
449
+ "epoch": 0.3109703426432479,
450
+ "grad_norm": 36.01630964835951,
451
+ "learning_rate": 4.352694346459396e-07,
452
+ "logits/chosen": -0.057602040469646454,
453
+ "logits/rejected": 0.40555334091186523,
454
+ "logps/chosen": -587.2971801757812,
455
+ "logps/rejected": -866.1613159179688,
456
+ "loss": 0.4006,
457
+ "rewards/accuracies": 0.784375011920929,
458
+ "rewards/chosen": -3.123883008956909,
459
+ "rewards/margins": 2.7192797660827637,
460
+ "rewards/rejected": -5.84316349029541,
461
  "step": 270
462
  },
463
  {
464
+ "epoch": 0.322487762741146,
465
+ "grad_norm": 26.73415377993604,
466
+ "learning_rate": 4.2836938302509256e-07,
467
+ "logits/chosen": -0.25706934928894043,
468
+ "logits/rejected": 0.16837282478809357,
469
+ "logps/chosen": -575.8345947265625,
470
+ "logps/rejected": -808.24267578125,
471
+ "loss": 0.4075,
472
+ "rewards/accuracies": 0.778124988079071,
473
+ "rewards/chosen": -2.9973578453063965,
474
+ "rewards/margins": 2.355498790740967,
475
+ "rewards/rejected": -5.352856636047363,
476
  "step": 280
477
  },
478
  {
479
+ "epoch": 0.33400518283904407,
480
+ "grad_norm": 29.332592595497015,
481
+ "learning_rate": 4.2118075592405874e-07,
482
+ "logits/chosen": -0.3039420247077942,
483
+ "logits/rejected": 0.07993211597204208,
484
+ "logps/chosen": -582.0941162109375,
485
+ "logps/rejected": -830.5714111328125,
486
+ "loss": 0.3976,
487
+ "rewards/accuracies": 0.793749988079071,
488
+ "rewards/chosen": -2.9472875595092773,
489
+ "rewards/margins": 2.5015506744384766,
490
+ "rewards/rejected": -5.448838233947754,
491
  "step": 290
492
  },
493
  {
494
+ "epoch": 0.34552260293694215,
495
+ "grad_norm": 30.91612291215343,
496
+ "learning_rate": 4.137151834863213e-07,
497
+ "logits/chosen": -0.10641048848628998,
498
+ "logits/rejected": 0.6166712641716003,
499
+ "logps/chosen": -632.7642822265625,
500
+ "logps/rejected": -849.4898681640625,
501
+ "loss": 0.4119,
502
+ "rewards/accuracies": 0.778124988079071,
503
+ "rewards/chosen": -3.399906873703003,
504
+ "rewards/margins": 2.507375478744507,
505
+ "rewards/rejected": -5.90728235244751,
506
  "step": 300
507
  },
508
  {
509
+ "epoch": 0.34552260293694215,
510
+ "eval_logits/chosen": -0.6165890693664551,
511
+ "eval_logits/rejected": -0.11399216204881668,
512
+ "eval_logps/chosen": -677.2001953125,
513
+ "eval_logps/rejected": -996.9340209960938,
514
+ "eval_loss": 0.336904913187027,
515
+ "eval_rewards/accuracies": 0.8155829310417175,
516
+ "eval_rewards/chosen": -3.862321615219116,
517
+ "eval_rewards/margins": 3.206094741821289,
518
+ "eval_rewards/rejected": -7.068417072296143,
519
+ "eval_runtime": 656.6921,
520
+ "eval_samples_per_second": 10.851,
521
+ "eval_steps_per_second": 0.34,
522
  "step": 300
523
  },
524
  {
525
+ "epoch": 0.35704002303484017,
526
+ "grad_norm": 22.38837991601497,
527
+ "learning_rate": 4.059847439122671e-07,
528
+ "logits/chosen": -0.46659454703330994,
529
+ "logits/rejected": 0.0826030969619751,
530
+ "logps/chosen": -515.8815307617188,
531
+ "logps/rejected": -717.310302734375,
532
+ "loss": 0.4112,
533
+ "rewards/accuracies": 0.765625,
534
+ "rewards/chosen": -2.256371021270752,
535
+ "rewards/margins": 2.008225679397583,
536
+ "rewards/rejected": -4.264596462249756,
537
  "step": 310
538
  },
539
  {
540
+ "epoch": 0.36855744313273825,
541
+ "grad_norm": 21.515754430109986,
542
+ "learning_rate": 3.98001943918432e-07,
543
+ "logits/chosen": -0.8846302032470703,
544
+ "logits/rejected": -0.03813103586435318,
545
+ "logps/chosen": -544.3895263671875,
546
+ "logps/rejected": -746.3841552734375,
547
+ "loss": 0.3939,
548
+ "rewards/accuracies": 0.809374988079071,
549
+ "rewards/chosen": -2.379772663116455,
550
+ "rewards/margins": 2.3421151638031006,
551
+ "rewards/rejected": -4.721888542175293,
552
  "step": 320
553
  },
554
  {
555
+ "epoch": 0.38007486323063633,
556
+ "grad_norm": 33.71230207361674,
557
+ "learning_rate": 3.8977969850346866e-07,
558
+ "logits/chosen": 0.13661722838878632,
559
+ "logits/rejected": 0.7041386365890503,
560
+ "logps/chosen": -666.94482421875,
561
+ "logps/rejected": -926.0341796875,
562
+ "loss": 0.3873,
563
+ "rewards/accuracies": 0.75,
564
+ "rewards/chosen": -3.78490948677063,
565
+ "rewards/margins": 2.646435022354126,
566
+ "rewards/rejected": -6.431344509124756,
567
  "step": 330
568
  },
569
  {
570
+ "epoch": 0.3915922833285344,
571
+ "grad_norm": 28.524858622055092,
572
+ "learning_rate": 3.8133131005357465e-07,
573
+ "logits/chosen": -0.015070567838847637,
574
+ "logits/rejected": 0.6914359927177429,
575
+ "logps/chosen": -646.4139404296875,
576
+ "logps/rejected": -965.0103759765625,
577
+ "loss": 0.3971,
578
+ "rewards/accuracies": 0.78125,
579
+ "rewards/chosen": -3.5984835624694824,
580
+ "rewards/margins": 3.210897922515869,
581
+ "rewards/rejected": -6.809381008148193,
582
  "step": 340
583
  },
584
  {
585
+ "epoch": 0.4031097034264325,
586
+ "grad_norm": 32.078697347416266,
587
+ "learning_rate": 3.7267044682118435e-07,
588
+ "logits/chosen": -0.002132108900696039,
589
+ "logits/rejected": 0.7953078150749207,
590
+ "logps/chosen": -604.9791259765625,
591
+ "logps/rejected": -838.1949462890625,
592
+ "loss": 0.4191,
593
+ "rewards/accuracies": 0.768750011920929,
594
+ "rewards/chosen": -3.1062846183776855,
595
+ "rewards/margins": 2.339332342147827,
596
+ "rewards/rejected": -5.445616722106934,
597
  "step": 350
598
  },
599
  {
600
+ "epoch": 0.41462712352433057,
601
+ "grad_norm": 28.020517011807925,
602
+ "learning_rate": 3.638111208117425e-07,
603
+ "logits/chosen": -0.1473531574010849,
604
+ "logits/rejected": 0.490295946598053,
605
+ "logps/chosen": -583.7153930664062,
606
+ "logps/rejected": -761.9363403320312,
607
+ "loss": 0.4035,
608
+ "rewards/accuracies": 0.765625,
609
+ "rewards/chosen": -3.0424270629882812,
610
+ "rewards/margins": 1.7358585596084595,
611
+ "rewards/rejected": -4.778285026550293,
612
  "step": 360
613
  },
614
  {
615
+ "epoch": 0.42614454362222864,
616
+ "grad_norm": 25.853288738352997,
617
+ "learning_rate": 3.5476766511433605e-07,
618
+ "logits/chosen": -0.25570568442344666,
619
+ "logits/rejected": 0.6842668652534485,
620
+ "logps/chosen": -590.0350341796875,
621
+ "logps/rejected": -811.4537353515625,
622
+ "loss": 0.3968,
623
+ "rewards/accuracies": 0.793749988079071,
624
+ "rewards/chosen": -3.002671480178833,
625
+ "rewards/margins": 2.369654655456543,
626
+ "rewards/rejected": -5.372325897216797,
627
  "step": 370
628
  },
629
  {
630
+ "epoch": 0.43766196372012667,
631
+ "grad_norm": 21.591809702398923,
632
+ "learning_rate": 3.455547107128602e-07,
633
+ "logits/chosen": -0.12841393053531647,
634
+ "logits/rejected": 0.6481091380119324,
635
+ "logps/chosen": -580.2199096679688,
636
+ "logps/rejected": -826.1383666992188,
637
+ "loss": 0.3958,
638
+ "rewards/accuracies": 0.800000011920929,
639
+ "rewards/chosen": -3.195159435272217,
640
+ "rewards/margins": 2.441926956176758,
641
+ "rewards/rejected": -5.637085914611816,
642
  "step": 380
643
  },
644
  {
645
+ "epoch": 0.44917938381802475,
646
+ "grad_norm": 39.83795352564531,
647
+ "learning_rate": 3.361871628152338e-07,
648
+ "logits/chosen": -0.23047828674316406,
649
+ "logits/rejected": 0.7577739953994751,
650
+ "logps/chosen": -605.4849853515625,
651
+ "logps/rejected": -883.64501953125,
652
+ "loss": 0.4085,
653
+ "rewards/accuracies": 0.809374988079071,
654
+ "rewards/chosen": -3.1104187965393066,
655
+ "rewards/margins": 3.0135536193847656,
656
+ "rewards/rejected": -6.123971939086914,
657
  "step": 390
658
  },
659
  {
660
+ "epoch": 0.4606968039159228,
661
+ "grad_norm": 22.463302227367222,
662
+ "learning_rate": 3.2668017673896077e-07,
663
+ "logits/chosen": -0.22118684649467468,
664
+ "logits/rejected": 0.6193957924842834,
665
+ "logps/chosen": -640.8189697265625,
666
+ "logps/rejected": -955.4924926757812,
667
+ "loss": 0.3964,
668
+ "rewards/accuracies": 0.809374988079071,
669
+ "rewards/chosen": -3.495349884033203,
670
+ "rewards/margins": 3.084470748901367,
671
+ "rewards/rejected": -6.579820156097412,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.4606968039159228,
676
+ "eval_logits/chosen": -0.6209221482276917,
677
+ "eval_logits/rejected": 0.23131267726421356,
678
+ "eval_logps/chosen": -753.418701171875,
679
+ "eval_logps/rejected": -1128.0946044921875,
680
+ "eval_loss": 0.33106523752212524,
681
+ "eval_rewards/accuracies": 0.8178251385688782,
682
+ "eval_rewards/chosen": -4.624506950378418,
683
+ "eval_rewards/margins": 3.7555172443389893,
684
+ "eval_rewards/rejected": -8.380023956298828,
685
+ "eval_runtime": 655.865,
686
+ "eval_samples_per_second": 10.865,
687
+ "eval_steps_per_second": 0.34,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.4722142240138209,
692
+ "grad_norm": 27.33004967085911,
693
+ "learning_rate": 3.1704913339205103e-07,
694
+ "logits/chosen": 0.38320040702819824,
695
+ "logits/rejected": 1.2441421747207642,
696
+ "logps/chosen": -592.7208862304688,
697
+ "logps/rejected": -816.4508666992188,
698
+ "loss": 0.407,
699
+ "rewards/accuracies": 0.7875000238418579,
700
+ "rewards/chosen": -3.1608848571777344,
701
+ "rewards/margins": 2.369687080383301,
702
+ "rewards/rejected": -5.530571937561035,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.483731644111719,
707
+ "grad_norm": 29.22769569320565,
708
+ "learning_rate": 3.0730961438896885e-07,
709
+ "logits/chosen": -0.32711368799209595,
710
+ "logits/rejected": 0.6167188882827759,
711
+ "logps/chosen": -647.5065307617188,
712
+ "logps/rejected": -920.5850830078125,
713
+ "loss": 0.3864,
714
+ "rewards/accuracies": 0.815625011920929,
715
+ "rewards/chosen": -3.577653408050537,
716
+ "rewards/margins": 2.697723865509033,
717
+ "rewards/rejected": -6.27537727355957,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.49524906420961706,
722
+ "grad_norm": 29.1628367265211,
723
+ "learning_rate": 2.9747737684186795e-07,
724
+ "logits/chosen": -0.8004047274589539,
725
+ "logits/rejected": 0.0654061958193779,
726
+ "logps/chosen": -586.2633056640625,
727
+ "logps/rejected": -828.8479614257812,
728
+ "loss": 0.4008,
729
+ "rewards/accuracies": 0.800000011920929,
730
+ "rewards/chosen": -3.018512487411499,
731
+ "rewards/margins": 2.515615701675415,
732
+ "rewards/rejected": -5.534128189086914,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.5067664843075151,
737
+ "grad_norm": 43.05788588925481,
738
+ "learning_rate": 2.8756832786789663e-07,
739
+ "logits/chosen": -0.7165388464927673,
740
+ "logits/rejected": 0.3907933533191681,
741
+ "logps/chosen": -558.6912231445312,
742
+ "logps/rejected": -839.3739013671875,
743
+ "loss": 0.3988,
744
+ "rewards/accuracies": 0.831250011920929,
745
+ "rewards/chosen": -2.8957457542419434,
746
+ "rewards/margins": 2.8470349311828613,
747
+ "rewards/rejected": -5.742780685424805,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.5182839044054132,
752
+ "grad_norm": 26.95003512302597,
753
+ "learning_rate": 2.7759849885381747e-07,
754
+ "logits/chosen": -0.43579286336898804,
755
+ "logits/rejected": 0.7088162302970886,
756
+ "logps/chosen": -564.5299072265625,
757
+ "logps/rejected": -807.0545043945312,
758
+ "loss": 0.3965,
759
+ "rewards/accuracies": 0.778124988079071,
760
+ "rewards/chosen": -2.7139556407928467,
761
+ "rewards/margins": 2.583310127258301,
762
+ "rewards/rejected": -5.297266483306885,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.5298013245033113,
767
+ "grad_norm": 37.40829093424466,
768
+ "learning_rate": 2.675840195195762e-07,
769
+ "logits/chosen": -0.4753951132297516,
770
+ "logits/rejected": 0.5207837224006653,
771
+ "logps/chosen": -559.075927734375,
772
+ "logps/rejected": -858.9351806640625,
773
+ "loss": 0.3858,
774
+ "rewards/accuracies": 0.809374988079071,
775
+ "rewards/chosen": -2.8625900745391846,
776
+ "rewards/margins": 2.9560627937316895,
777
+ "rewards/rejected": -5.818652153015137,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.5413187446012093,
782
+ "grad_norm": 28.860389068235733,
783
+ "learning_rate": 2.575410918227829e-07,
784
+ "logits/chosen": -0.4289991855621338,
785
+ "logits/rejected": 0.41408976912498474,
786
+ "logps/chosen": -583.07763671875,
787
+ "logps/rejected": -848.7003784179688,
788
+ "loss": 0.3851,
789
+ "rewards/accuracies": 0.75,
790
+ "rewards/chosen": -2.932926654815674,
791
+ "rewards/margins": 2.7647881507873535,
792
+ "rewards/rejected": -5.697714805603027,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.5528361646991073,
797
+ "grad_norm": 25.478968182398468,
798
+ "learning_rate": 2.474859637463226e-07,
799
+ "logits/chosen": 0.019112158566713333,
800
+ "logits/rejected": 0.9573495984077454,
801
+ "logps/chosen": -578.31005859375,
802
+ "logps/rejected": -817.9622192382812,
803
+ "loss": 0.4001,
804
+ "rewards/accuracies": 0.753125011920929,
805
+ "rewards/chosen": -3.071147918701172,
806
+ "rewards/margins": 2.528298854827881,
807
+ "rewards/rejected": -5.599446773529053,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.5643535847970055,
812
+ "grad_norm": 22.69267875960799,
813
+ "learning_rate": 2.3743490301150355e-07,
814
+ "logits/chosen": 0.03456907719373703,
815
+ "logits/rejected": 0.9821624755859375,
816
+ "logps/chosen": -616.0007934570312,
817
+ "logps/rejected": -855.6959228515625,
818
+ "loss": 0.395,
819
+ "rewards/accuracies": 0.78125,
820
+ "rewards/chosen": -3.1994495391845703,
821
+ "rewards/margins": 2.4747273921966553,
822
+ "rewards/rejected": -5.6741766929626465,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.5758710048949035,
827
+ "grad_norm": 26.70832967985792,
828
+ "learning_rate": 2.274041707592724e-07,
829
+ "logits/chosen": -0.4122609496116638,
830
+ "logits/rejected": 0.6060948371887207,
831
+ "logps/chosen": -594.303466796875,
832
+ "logps/rejected": -892.1234130859375,
833
+ "loss": 0.3858,
834
+ "rewards/accuracies": 0.8125,
835
+ "rewards/chosen": -3.213183879852295,
836
+ "rewards/margins": 2.860560894012451,
837
+ "rewards/rejected": -6.0737457275390625,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.5758710048949035,
842
+ "eval_logits/chosen": -0.7776147127151489,
843
+ "eval_logits/rejected": 0.18928049504756927,
844
+ "eval_logps/chosen": -694.4180908203125,
845
+ "eval_logps/rejected": -1049.8428955078125,
846
+ "eval_loss": 0.3246955871582031,
847
+ "eval_rewards/accuracies": 0.8167040348052979,
848
+ "eval_rewards/chosen": -4.034500598907471,
849
+ "eval_rewards/margins": 3.563004732131958,
850
+ "eval_rewards/rejected": -7.59750509262085,
851
+ "eval_runtime": 874.6942,
852
+ "eval_samples_per_second": 8.147,
853
+ "eval_steps_per_second": 0.255,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.5873884249928016,
858
+ "grad_norm": 21.857166040982808,
859
+ "learning_rate": 2.17409995242075e-07,
860
+ "logits/chosen": -0.3013337552547455,
861
+ "logits/rejected": 0.687148928642273,
862
+ "logps/chosen": -590.9053955078125,
863
+ "logps/rejected": -848.279296875,
864
+ "loss": 0.3623,
865
+ "rewards/accuracies": 0.800000011920929,
866
+ "rewards/chosen": -3.1760306358337402,
867
+ "rewards/margins": 2.6910133361816406,
868
+ "rewards/rejected": -5.867043972015381,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.5989058450906997,
873
+ "grad_norm": 32.93018464240502,
874
+ "learning_rate": 2.0746854556892544e-07,
875
+ "logits/chosen": -0.28416475653648376,
876
+ "logits/rejected": 0.760982871055603,
877
+ "logps/chosen": -584.7510986328125,
878
+ "logps/rejected": -825.0016479492188,
879
+ "loss": 0.3654,
880
+ "rewards/accuracies": 0.8187500238418579,
881
+ "rewards/chosen": -3.0093677043914795,
882
+ "rewards/margins": 2.5775859355926514,
883
+ "rewards/rejected": -5.586953639984131,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.6104232651885978,
888
+ "grad_norm": 31.84439684571111,
889
+ "learning_rate": 1.9759590554616173e-07,
890
+ "logits/chosen": -0.21416716277599335,
891
+ "logits/rejected": 0.8462156057357788,
892
+ "logps/chosen": -591.3154296875,
893
+ "logps/rejected": -826.24853515625,
894
+ "loss": 0.39,
895
+ "rewards/accuracies": 0.7749999761581421,
896
+ "rewards/chosen": -3.1136324405670166,
897
+ "rewards/margins": 2.453207015991211,
898
+ "rewards/rejected": -5.56683874130249,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.6219406852864958,
903
+ "grad_norm": 28.506645648712848,
904
+ "learning_rate": 1.8780804765620746e-07,
905
+ "logits/chosen": -0.06838655471801758,
906
+ "logits/rejected": 1.0294172763824463,
907
+ "logps/chosen": -577.9981689453125,
908
+ "logps/rejected": -835.3642578125,
909
+ "loss": 0.3793,
910
+ "rewards/accuracies": 0.8062499761581421,
911
+ "rewards/chosen": -2.834435224533081,
912
+ "rewards/margins": 2.704789876937866,
913
+ "rewards/rejected": -5.539225101470947,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.6334581053843938,
918
+ "grad_norm": 30.179970032375,
919
+ "learning_rate": 1.7812080721643973e-07,
920
+ "logits/chosen": -0.30736392736434937,
921
+ "logits/rejected": 0.8852709531784058,
922
+ "logps/chosen": -576.0755615234375,
923
+ "logps/rejected": -836.8414306640625,
924
+ "loss": 0.381,
925
+ "rewards/accuracies": 0.8125,
926
+ "rewards/chosen": -2.8606371879577637,
927
+ "rewards/margins": 2.6888041496276855,
928
+ "rewards/rejected": -5.549441337585449,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.644975525482292,
933
+ "grad_norm": 26.709457513505647,
934
+ "learning_rate": 1.6854985675997063e-07,
935
+ "logits/chosen": -0.26044386625289917,
936
+ "logits/rejected": 0.7742006778717041,
937
+ "logps/chosen": -582.1048583984375,
938
+ "logps/rejected": -819.34033203125,
939
+ "loss": 0.4007,
940
+ "rewards/accuracies": 0.7875000238418579,
941
+ "rewards/chosen": -2.9853100776672363,
942
+ "rewards/margins": 2.4650139808654785,
943
+ "rewards/rejected": -5.450324058532715,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.65649294558019,
948
+ "grad_norm": 27.543745008054035,
949
+ "learning_rate": 1.5911068067978818e-07,
950
+ "logits/chosen": -0.05375183746218681,
951
+ "logits/rejected": 1.1043269634246826,
952
+ "logps/chosen": -581.2166748046875,
953
+ "logps/rejected": -818.9441528320312,
954
+ "loss": 0.3971,
955
+ "rewards/accuracies": 0.8187500238418579,
956
+ "rewards/chosen": -2.8354713916778564,
957
+ "rewards/margins": 2.655651330947876,
958
+ "rewards/rejected": -5.491122245788574,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.6680103656780881,
963
+ "grad_norm": 22.093767953647365,
964
+ "learning_rate": 1.4981855017728197e-07,
965
+ "logits/chosen": 0.0580272376537323,
966
+ "logits/rejected": 0.7513723373413086,
967
+ "logps/chosen": -571.5791625976562,
968
+ "logps/rejected": -858.3342895507812,
969
+ "loss": 0.3701,
970
+ "rewards/accuracies": 0.784375011920929,
971
+ "rewards/chosen": -3.0100650787353516,
972
+ "rewards/margins": 2.6902260780334473,
973
+ "rewards/rejected": -5.700291633605957,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 0.6795277857759862,
978
+ "grad_norm": 36.73163562183304,
979
+ "learning_rate": 1.406884985556804e-07,
980
+ "logits/chosen": -0.005457936320453882,
981
+ "logits/rejected": 1.047271490097046,
982
+ "logps/chosen": -635.6920166015625,
983
+ "logps/rejected": -881.1370849609375,
984
+ "loss": 0.3825,
985
+ "rewards/accuracies": 0.7437499761581421,
986
+ "rewards/chosen": -3.439662456512451,
987
+ "rewards/margins": 2.575695037841797,
988
+ "rewards/rejected": -6.01535701751709,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 0.6910452058738843,
993
+ "grad_norm": 30.080057939243627,
994
+ "learning_rate": 1.3173529689837354e-07,
995
+ "logits/chosen": -0.23538751900196075,
996
+ "logits/rejected": 0.9952915906906128,
997
+ "logps/chosen": -625.2637939453125,
998
+ "logps/rejected": -905.7393798828125,
999
+ "loss": 0.4031,
1000
+ "rewards/accuracies": 0.765625,
1001
+ "rewards/chosen": -3.336890459060669,
1002
+ "rewards/margins": 3.089966058731079,
1003
+ "rewards/rejected": -6.42685604095459,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 0.6910452058738843,
1008
+ "eval_logits/chosen": -0.26048585772514343,
1009
+ "eval_logits/rejected": 0.6162645220756531,
1010
+ "eval_logps/chosen": -748.3095703125,
1011
+ "eval_logps/rejected": -1143.1573486328125,
1012
+ "eval_loss": 0.3190823495388031,
1013
+ "eval_rewards/accuracies": 0.820067286491394,
1014
+ "eval_rewards/chosen": -4.573415279388428,
1015
+ "eval_rewards/margins": 3.9572343826293945,
1016
+ "eval_rewards/rejected": -8.530649185180664,
1017
+ "eval_runtime": 651.1572,
1018
+ "eval_samples_per_second": 10.944,
1019
+ "eval_steps_per_second": 0.342,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 0.7025626259717823,
1024
+ "grad_norm": 28.901245277836818,
1025
+ "learning_rate": 1.2297343017146726e-07,
1026
+ "logits/chosen": 0.07719476521015167,
1027
+ "logits/rejected": 1.148842453956604,
1028
+ "logps/chosen": -615.670166015625,
1029
+ "logps/rejected": -902.8016357421875,
1030
+ "loss": 0.385,
1031
+ "rewards/accuracies": 0.78125,
1032
+ "rewards/chosen": -3.2692692279815674,
1033
+ "rewards/margins": 2.9544837474823,
1034
+ "rewards/rejected": -6.223752975463867,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 0.7140800460696803,
1039
+ "grad_norm": 26.881220630663055,
1040
+ "learning_rate": 1.1441707378923474e-07,
1041
+ "logits/chosen": 0.3414779305458069,
1042
+ "logits/rejected": 1.1920559406280518,
1043
+ "logps/chosen": -611.7634887695312,
1044
+ "logps/rejected": -883.4708862304688,
1045
+ "loss": 0.4032,
1046
+ "rewards/accuracies": 0.7593749761581421,
1047
+ "rewards/chosen": -3.267723798751831,
1048
+ "rewards/margins": 2.798133373260498,
1049
+ "rewards/rejected": -6.065857410430908,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 0.7255974661675785,
1054
+ "grad_norm": 22.92522846442678,
1055
+ "learning_rate": 1.06080070680377e-07,
1056
+ "logits/chosen": 0.059290122240781784,
1057
+ "logits/rejected": 1.0623096227645874,
1058
+ "logps/chosen": -614.2824096679688,
1059
+ "logps/rejected": -868.0498046875,
1060
+ "loss": 0.372,
1061
+ "rewards/accuracies": 0.7718750238418579,
1062
+ "rewards/chosen": -3.2304539680480957,
1063
+ "rewards/margins": 2.7317616939544678,
1064
+ "rewards/rejected": -5.962214946746826,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 0.7371148862654765,
1069
+ "grad_norm": 18.474464704704374,
1070
+ "learning_rate": 9.797590889219587e-08,
1071
+ "logits/chosen": -0.07347230613231659,
1072
+ "logits/rejected": 0.7878081798553467,
1073
+ "logps/chosen": -598.407958984375,
1074
+ "logps/rejected": -922.5660400390625,
1075
+ "loss": 0.3733,
1076
+ "rewards/accuracies": 0.796875,
1077
+ "rewards/chosen": -3.226865768432617,
1078
+ "rewards/margins": 3.285538911819458,
1079
+ "rewards/rejected": -6.5124053955078125,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 0.7486323063633746,
1084
+ "grad_norm": 25.105406106031534,
1085
+ "learning_rate": 9.011769976891367e-08,
1086
+ "logits/chosen": 0.06855427473783493,
1087
+ "logits/rejected": 1.2701406478881836,
1088
+ "logps/chosen": -594.861083984375,
1089
+ "logps/rejected": -820.2781372070312,
1090
+ "loss": 0.3929,
1091
+ "rewards/accuracies": 0.828125,
1092
+ "rewards/chosen": -2.9540488719940186,
1093
+ "rewards/margins": 2.544384241104126,
1094
+ "rewards/rejected": -5.4984331130981445,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 0.7601497264612727,
1099
+ "grad_norm": 25.930812393377074,
1100
+ "learning_rate": 8.251815673944218e-08,
1101
+ "logits/chosen": -0.13862136006355286,
1102
+ "logits/rejected": 0.950897216796875,
1103
+ "logps/chosen": -660.083740234375,
1104
+ "logps/rejected": -984.3076171875,
1105
+ "loss": 0.3798,
1106
+ "rewards/accuracies": 0.815625011920929,
1107
+ "rewards/chosen": -3.4895882606506348,
1108
+ "rewards/margins": 3.4641425609588623,
1109
+ "rewards/rejected": -6.953730583190918,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 0.7716671465591708,
1114
+ "grad_norm": 22.848572550568402,
1115
+ "learning_rate": 7.518957474892148e-08,
1116
+ "logits/chosen": 0.03879556804895401,
1117
+ "logits/rejected": 0.8222616314888,
1118
+ "logps/chosen": -593.1844482421875,
1119
+ "logps/rejected": -868.5235595703125,
1120
+ "loss": 0.3716,
1121
+ "rewards/accuracies": 0.809374988079071,
1122
+ "rewards/chosen": -3.1576333045959473,
1123
+ "rewards/margins": 2.791321277618408,
1124
+ "rewards/rejected": -5.9489545822143555,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 0.7831845666570688,
1129
+ "grad_norm": 37.77871708341422,
1130
+ "learning_rate": 6.814381036730274e-08,
1131
+ "logits/chosen": -0.06809209287166595,
1132
+ "logits/rejected": 0.9388583898544312,
1133
+ "logps/chosen": -602.1769409179688,
1134
+ "logps/rejected": -918.8854370117188,
1135
+ "loss": 0.4027,
1136
+ "rewards/accuracies": 0.768750011920929,
1137
+ "rewards/chosen": -3.232111692428589,
1138
+ "rewards/margins": 3.247992753982544,
1139
+ "rewards/rejected": -6.480103969573975,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 0.7947019867549668,
1144
+ "grad_norm": 25.302552395916596,
1145
+ "learning_rate": 6.139226260715872e-08,
1146
+ "logits/chosen": -0.06320186704397202,
1147
+ "logits/rejected": 0.8823334574699402,
1148
+ "logps/chosen": -625.1529541015625,
1149
+ "logps/rejected": -898.3558349609375,
1150
+ "loss": 0.3655,
1151
+ "rewards/accuracies": 0.778124988079071,
1152
+ "rewards/chosen": -3.5077052116394043,
1153
+ "rewards/margins": 2.779940366744995,
1154
+ "rewards/rejected": -6.2876458168029785,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 0.806219406852865,
1159
+ "grad_norm": 41.487105287447704,
1160
+ "learning_rate": 5.4945854481754734e-08,
1161
+ "logits/chosen": 0.07060976326465607,
1162
+ "logits/rejected": 0.9207429885864258,
1163
+ "logps/chosen": -644.9542236328125,
1164
+ "logps/rejected": -981.1370849609375,
1165
+ "loss": 0.4007,
1166
+ "rewards/accuracies": 0.7749999761581421,
1167
+ "rewards/chosen": -3.7041306495666504,
1168
+ "rewards/margins": 3.448993682861328,
1169
+ "rewards/rejected": -7.1531243324279785,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 0.806219406852865,
1174
+ "eval_logits/chosen": -0.4981551170349121,
1175
+ "eval_logits/rejected": 0.44106799364089966,
1176
+ "eval_logps/chosen": -753.01123046875,
1177
+ "eval_logps/rejected": -1189.425048828125,
1178
+ "eval_loss": 0.31710898876190186,
1179
+ "eval_rewards/accuracies": 0.8178251385688782,
1180
+ "eval_rewards/chosen": -4.620431900024414,
1181
+ "eval_rewards/margins": 4.372895240783691,
1182
+ "eval_rewards/rejected": -8.993328094482422,
1183
+ "eval_runtime": 653.0396,
1184
+ "eval_samples_per_second": 10.912,
1185
+ "eval_steps_per_second": 0.341,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 0.817736826950763,
1190
+ "grad_norm": 26.15798738128027,
1191
+ "learning_rate": 4.881501533321605e-08,
1192
+ "logits/chosen": -0.3350176513195038,
1193
+ "logits/rejected": 0.5944274663925171,
1194
+ "logps/chosen": -611.4078369140625,
1195
+ "logps/rejected": -894.845703125,
1196
+ "loss": 0.3819,
1197
+ "rewards/accuracies": 0.796875,
1198
+ "rewards/chosen": -3.2478299140930176,
1199
+ "rewards/margins": 2.9104466438293457,
1200
+ "rewards/rejected": -6.158276557922363,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 0.8292542470486611,
1205
+ "grad_norm": 28.210401445519196,
1206
+ "learning_rate": 4.300966395938377e-08,
1207
+ "logits/chosen": -0.47553783655166626,
1208
+ "logits/rejected": 0.6052624583244324,
1209
+ "logps/chosen": -642.4817504882812,
1210
+ "logps/rejected": -950.6018676757812,
1211
+ "loss": 0.3724,
1212
+ "rewards/accuracies": 0.8062499761581421,
1213
+ "rewards/chosen": -3.420116901397705,
1214
+ "rewards/margins": 3.2143654823303223,
1215
+ "rewards/rejected": -6.634482383728027,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 0.8407716671465592,
1220
+ "grad_norm": 27.28999144486062,
1221
+ "learning_rate": 3.7539192566655246e-08,
1222
+ "logits/chosen": -0.0816282406449318,
1223
+ "logits/rejected": 0.8518702387809753,
1224
+ "logps/chosen": -626.6390991210938,
1225
+ "logps/rejected": -941.8958129882812,
1226
+ "loss": 0.3713,
1227
+ "rewards/accuracies": 0.8343750238418579,
1228
+ "rewards/chosen": -3.378054141998291,
1229
+ "rewards/margins": 3.2282519340515137,
1230
+ "rewards/rejected": -6.606306552886963,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 0.8522890872444573,
1235
+ "grad_norm": 27.71621255798267,
1236
+ "learning_rate": 3.24124515747731e-08,
1237
+ "logits/chosen": -0.0028346062172204256,
1238
+ "logits/rejected": 1.1290369033813477,
1239
+ "logps/chosen": -672.173828125,
1240
+ "logps/rejected": -975.3580322265625,
1241
+ "loss": 0.374,
1242
+ "rewards/accuracies": 0.796875,
1243
+ "rewards/chosen": -3.6781773567199707,
1244
+ "rewards/margins": 3.3185067176818848,
1245
+ "rewards/rejected": -6.996683597564697,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 0.8638065073423553,
1250
+ "grad_norm": 35.482960030400996,
1251
+ "learning_rate": 2.763773529814506e-08,
1252
+ "logits/chosen": 0.17448297142982483,
1253
+ "logits/rejected": 0.9923737645149231,
1254
+ "logps/chosen": -603.94970703125,
1255
+ "logps/rejected": -925.9880981445312,
1256
+ "loss": 0.3918,
1257
+ "rewards/accuracies": 0.78125,
1258
+ "rewards/chosen": -3.315547466278076,
1259
+ "rewards/margins": 3.1224138736724854,
1260
+ "rewards/rejected": -6.437961578369141,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 0.8753239274402533,
1265
+ "grad_norm": 28.184713620117034,
1266
+ "learning_rate": 2.3222768526860698e-08,
1267
+ "logits/chosen": 0.0647897943854332,
1268
+ "logits/rejected": 0.9855157136917114,
1269
+ "logps/chosen": -613.9244995117188,
1270
+ "logps/rejected": -901.00927734375,
1271
+ "loss": 0.3741,
1272
+ "rewards/accuracies": 0.7749999761581421,
1273
+ "rewards/chosen": -3.254974365234375,
1274
+ "rewards/margins": 2.7708938121795654,
1275
+ "rewards/rejected": -6.0258684158325195,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 0.8868413475381515,
1280
+ "grad_norm": 35.13633269103924,
1281
+ "learning_rate": 1.9174694029115146e-08,
1282
+ "logits/chosen": 0.19886977970600128,
1283
+ "logits/rejected": 1.013934850692749,
1284
+ "logps/chosen": -620.3436279296875,
1285
+ "logps/rejected": -958.3701171875,
1286
+ "loss": 0.3682,
1287
+ "rewards/accuracies": 0.8218749761581421,
1288
+ "rewards/chosen": -3.4876530170440674,
1289
+ "rewards/margins": 3.2890784740448,
1290
+ "rewards/rejected": -6.776731967926025,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 0.8983587676360495,
1295
+ "grad_norm": 29.350577487943855,
1296
+ "learning_rate": 1.5500060995258134e-08,
1297
+ "logits/chosen": 0.12428224086761475,
1298
+ "logits/rejected": 1.2418944835662842,
1299
+ "logps/chosen": -604.94873046875,
1300
+ "logps/rejected": -891.98828125,
1301
+ "loss": 0.37,
1302
+ "rewards/accuracies": 0.778124988079071,
1303
+ "rewards/chosen": -3.2872474193573,
1304
+ "rewards/margins": 2.9589405059814453,
1305
+ "rewards/rejected": -6.24618673324585,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 0.9098761877339476,
1310
+ "grad_norm": 31.77954056090223,
1311
+ "learning_rate": 1.2204814442165812e-08,
1312
+ "logits/chosen": 0.1471497118473053,
1313
+ "logits/rejected": 1.0471051931381226,
1314
+ "logps/chosen": -657.8414306640625,
1315
+ "logps/rejected": -977.1556396484375,
1316
+ "loss": 0.3992,
1317
+ "rewards/accuracies": 0.784375011920929,
1318
+ "rewards/chosen": -3.6439871788024902,
1319
+ "rewards/margins": 3.291074752807617,
1320
+ "rewards/rejected": -6.935061454772949,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 0.9213936078318457,
1325
+ "grad_norm": 35.231363022526715,
1326
+ "learning_rate": 9.294285595075669e-09,
1327
+ "logits/chosen": 0.23517772555351257,
1328
+ "logits/rejected": 1.1635137796401978,
1329
+ "logps/chosen": -621.228515625,
1330
+ "logps/rejected": -941.3455200195312,
1331
+ "loss": 0.3644,
1332
+ "rewards/accuracies": 0.7906249761581421,
1333
+ "rewards/chosen": -3.437223434448242,
1334
+ "rewards/margins": 3.3359901905059814,
1335
+ "rewards/rejected": -6.7732133865356445,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 0.9213936078318457,
1340
+ "eval_logits/chosen": -0.3096068501472473,
1341
+ "eval_logits/rejected": 0.6049354672431946,
1342
+ "eval_logps/chosen": -755.9322509765625,
1343
+ "eval_logps/rejected": -1192.5621337890625,
1344
+ "eval_loss": 0.31517288088798523,
1345
+ "eval_rewards/accuracies": 0.818385660648346,
1346
+ "eval_rewards/chosen": -4.649641990661621,
1347
+ "eval_rewards/margins": 4.37505578994751,
1348
+ "eval_rewards/rejected": -9.024698257446289,
1349
+ "eval_runtime": 652.0187,
1350
+ "eval_samples_per_second": 10.929,
1351
+ "eval_steps_per_second": 0.342,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 0.9329110279297438,
1356
+ "grad_norm": 35.70619984366826,
1357
+ "learning_rate": 6.773183262446914e-09,
1358
+ "logits/chosen": 0.08351641893386841,
1359
+ "logits/rejected": 1.0455710887908936,
1360
+ "logps/chosen": -619.4977416992188,
1361
+ "logps/rejected": -918.3001098632812,
1362
+ "loss": 0.4056,
1363
+ "rewards/accuracies": 0.78125,
1364
+ "rewards/chosen": -3.3363006114959717,
1365
+ "rewards/margins": 3.10974383354187,
1366
+ "rewards/rejected": -6.446043968200684,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 0.9444284480276418,
1371
+ "grad_norm": 32.90474966984876,
1372
+ "learning_rate": 4.645586217799452e-09,
1373
+ "logits/chosen": -0.05233382433652878,
1374
+ "logits/rejected": 0.976836085319519,
1375
+ "logps/chosen": -630.252685546875,
1376
+ "logps/rejected": -968.9739379882812,
1377
+ "loss": 0.3685,
1378
+ "rewards/accuracies": 0.824999988079071,
1379
+ "rewards/chosen": -3.2664294242858887,
1380
+ "rewards/margins": 3.563570022583008,
1381
+ "rewards/rejected": -6.8299994468688965,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 0.9559458681255398,
1386
+ "grad_norm": 36.38359169566316,
1387
+ "learning_rate": 2.9149366008568987e-09,
1388
+ "logits/chosen": 0.14797405898571014,
1389
+ "logits/rejected": 0.9976932406425476,
1390
+ "logps/chosen": -601.341552734375,
1391
+ "logps/rejected": -791.478271484375,
1392
+ "loss": 0.4137,
1393
+ "rewards/accuracies": 0.778124988079071,
1394
+ "rewards/chosen": -3.2392711639404297,
1395
+ "rewards/margins": 2.041738986968994,
1396
+ "rewards/rejected": -5.281010150909424,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 0.967463288223438,
1401
+ "grad_norm": 36.38873535658025,
1402
+ "learning_rate": 1.5840343486700215e-09,
1403
+ "logits/chosen": 0.11059533059597015,
1404
+ "logits/rejected": 1.2648974657058716,
1405
+ "logps/chosen": -640.0452880859375,
1406
+ "logps/rejected": -975.3658447265625,
1407
+ "loss": 0.394,
1408
+ "rewards/accuracies": 0.8125,
1409
+ "rewards/chosen": -3.4839179515838623,
1410
+ "rewards/margins": 3.5232937335968018,
1411
+ "rewards/rejected": -7.007212162017822,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 0.978980708321336,
1416
+ "grad_norm": 40.1741036497201,
1417
+ "learning_rate": 6.550326657293881e-10,
1418
+ "logits/chosen": 0.27081722021102905,
1419
+ "logits/rejected": 1.2972664833068848,
1420
+ "logps/chosen": -605.1519775390625,
1421
+ "logps/rejected": -885.68896484375,
1422
+ "loss": 0.4039,
1423
+ "rewards/accuracies": 0.7906249761581421,
1424
+ "rewards/chosen": -3.3234386444091797,
1425
+ "rewards/margins": 2.970566511154175,
1426
+ "rewards/rejected": -6.294005870819092,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 0.9904981284192341,
1431
+ "grad_norm": 31.693047329975887,
1432
+ "learning_rate": 1.2943454039654467e-10,
1433
+ "logits/chosen": 0.14183056354522705,
1434
+ "logits/rejected": 1.1139782667160034,
1435
+ "logps/chosen": -605.5335693359375,
1436
+ "logps/rejected": -855.60595703125,
1437
+ "loss": 0.3858,
1438
+ "rewards/accuracies": 0.753125011920929,
1439
+ "rewards/chosen": -3.16903018951416,
1440
+ "rewards/margins": 2.619706869125366,
1441
+ "rewards/rejected": -5.7887372970581055,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 0.9997120644975526,
1446
+ "step": 868,
1447
  "total_flos": 0.0,
1448
  "train_loss": 0.0,
1449
+ "train_runtime": 0.0211,
1450
+ "train_samples_per_second": 5273498.215,
1451
+ "train_steps_per_second": 41188.083
1452
  }
1453
  ],
1454
  "logging_steps": 10,
1455
+ "max_steps": 868,
1456
  "num_input_tokens_seen": 0,
1457
  "num_train_epochs": 1,
1458
  "save_steps": 100,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e96390fb7126659bd719d71eb32e05a397e4dac8149cb37e7a0cf86d4b76d018
2
+ oid sha256:0edc6476c7442a09f8597b0f8e2a817170ad0a2428d1d50d67735dcd0a148145
3
  size 7480
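
The evaluation records interleaved in the log above (steps 400, 500, 600, 700, 800) are easier to compare when read out of the JSON directly rather than from the diff view. Below is a minimal sketch, assuming the usual Hugging Face Trainer layout in which these records live in a top-level `log_history` list inside `trainer_state.json`; the key names (`eval_loss`, `eval_rewards/margins`, `eval_rewards/accuracies`, `step`) are taken from the diff itself.

```python
import json

# Load the trainer state saved with this commit
# (assumes the standard Trainer layout with a "log_history" list).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the evaluation records, i.e. entries that carry "eval_loss".
eval_logs = [entry for entry in state["log_history"] if "eval_loss" in entry]

for entry in eval_logs:
    print(
        f'step {entry["step"]:>4}: '
        f'eval_loss={entry["eval_loss"]:.4f}, '
        f'margin={entry["eval_rewards/margins"]:.3f}, '
        f'acc={entry["eval_rewards/accuracies"]:.3f}'
    )
```

On the values logged above, this would show the eval loss falling from about 0.331 at step 400 to 0.315 at step 800, while the reward margin grows from roughly 3.76 to 4.38 and accuracy stays near 0.82.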