yiran-wang3 committed on
Commit
56d50a9
1 Parent(s): 10aa8ec

End of training

Files changed (6)
  1. README.md +64 -0
  2. all_results.json +9 -0
  3. config.json +1 -1
  4. generation_config.json +14 -0
  5. train_results.json +9 -0
  6. trainer_state.json +1134 -0
README.md ADDED
@@ -0,0 +1,64 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: yiran-wang3/qwen2_coder_reflct_adamw_iter2
+ tags:
+ - alignment-handbook
+ - generated_from_trainer
+ - trl
+ - dpo
+ datasets:
+ - self-generate/qwcoder2_reflct_sppo_hard_new_cn_mining_oj_iter2-binarized-reflection-scored
+ model-index:
+ - name: qwen2_coder_reflct_adamw_iter3
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # qwen2_coder_reflct_adamw_iter3
+
+ This model is a fine-tuned version of [yiran-wang3/qwen2_coder_reflct_adamw_iter2](https://huggingface.co/yiran-wang3/qwen2_coder_reflct_adamw_iter2) on the self-generate/qwcoder2_reflct_sppo_hard_new_cn_mining_oj_iter2-binarized-reflection-scored dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 1e-06
+ - train_batch_size: 8
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 8
+ - total_train_batch_size: 64
+ - total_eval_batch_size: 32
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: constant
+ - lr_scheduler_warmup_ratio: 0.1
+ - lr_scheduler_warmup_steps: 100
+ - num_epochs: 1.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - Transformers 4.45.0
+ - Pytorch 2.4.0+cu121
+ - Datasets 2.14.6
+ - Tokenizers 0.20.2
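
The card itself does not yet include a usage snippet. Below is a minimal inference sketch, assuming the checkpoint published under the model-index name above loads with the standard `transformers` APIs and inherits a Qwen2-style chat template from its base model:

```python
# Sketch only: the model id and chat-template behaviour are assumptions based on the
# model-index name and the Qwen2 base model named in this card, not a tested recipe.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "yiran-wang3/qwen2_coder_reflct_adamw_iter3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

messages = [{"role": "user", "content": "Write a Python function that reverses a string."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

# Sampling values mirror the generation_config.json added in this commit.
outputs = model.generate(inputs, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.8, top_k=20)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```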
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 0.0,
+ "train_loss": 0.43149175896094394,
+ "train_runtime": 170.8012,
+ "train_samples": 3311,
+ "train_samples_per_second": 19.385,
+ "train_steps_per_second": 0.304
+ }
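
For reference, these summary figures are mutually consistent: 3311 samples over 170.8 s gives 3311 / 170.8 ≈ 19.4 samples per second, and with the total train batch size of 64 from the card above, one epoch is ceil(3311 / 64) = 52 optimizer steps, i.e. 52 / 170.8 ≈ 0.30 steps per second (52 is also the `global_step` recorded in trainer_state.json below).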
config.json CHANGED
@@ -23,7 +23,7 @@
   "tie_word_embeddings": true,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.45.0",
- "use_cache": false,
+ "use_cache": true,
   "use_sliding_window": false,
   "vocab_size": 151936
   }
generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.1,
+ "temperature": 0.7,
+ "top_k": 20,
+ "top_p": 0.8,
+ "transformers_version": "4.45.0"
+ }
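
These defaults are what `model.generate()` picks up automatically when the model is loaded from the Hub. A quick way to inspect or override them (a sketch, assuming the same repository id as in the model card above):

```python
from transformers import GenerationConfig

# Loads this generation_config.json from the Hub; the repo id is assumed from the model-index above.
gen_cfg = GenerationConfig.from_pretrained("yiran-wang3/qwen2_coder_reflct_adamw_iter3")
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k, gen_cfg.repetition_penalty)

# Any field can still be overridden per call, e.g. model.generate(..., temperature=0.2).
```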
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 1.0,
+ "total_flos": 0.0,
+ "train_loss": 0.43149175896094394,
+ "train_runtime": 170.8012,
+ "train_samples": 3311,
+ "train_samples_per_second": 19.385,
+ "train_steps_per_second": 0.304
+ }
trainer_state.json ADDED
@@ -0,0 +1,1134 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 100,
6
+ "global_step": 52,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "debug/policy_chosen_logits": -2.977583885192871,
13
+ "debug/policy_chosen_logps": -214.68138122558594,
14
+ "debug/policy_rejected_logits": -3.0865864753723145,
15
+ "debug/policy_rejected_logps": -221.65213012695312,
16
+ "debug/reference_chosen_logps": -214.68138122558594,
17
+ "debug/reference_rejected_logps": -221.65213012695312,
18
+ "epoch": 0.019230769230769232,
19
+ "grad_norm": 5.166761446238812,
20
+ "learning_rate": 1e-06,
21
+ "logits/chosen": -2.977583885192871,
22
+ "logits/rejected": -3.0865864753723145,
23
+ "logps/chosen": -214.68138122558594,
24
+ "logps/rejected": -221.65213012695312,
25
+ "loss": 0.5,
26
+ "rewards/accuracies": 0.0,
27
+ "rewards/chosen": 0.0,
28
+ "rewards/margins": 0.0,
29
+ "rewards/rejected": 0.0,
30
+ "step": 1
31
+ },
32
+ {
33
+ "debug/policy_chosen_logits": -3.110696792602539,
34
+ "debug/policy_chosen_logps": -208.40696716308594,
35
+ "debug/policy_rejected_logits": -3.141209602355957,
36
+ "debug/policy_rejected_logps": -220.74032592773438,
37
+ "debug/reference_chosen_logps": -208.67347717285156,
38
+ "debug/reference_rejected_logps": -220.81326293945312,
39
+ "epoch": 0.038461538461538464,
40
+ "grad_norm": 4.748582416027732,
41
+ "learning_rate": 1e-06,
42
+ "logits/chosen": -3.110696792602539,
43
+ "logits/rejected": -3.141209602355957,
44
+ "logps/chosen": -208.40696716308594,
45
+ "logps/rejected": -220.74032592773438,
46
+ "loss": 0.4986,
47
+ "rewards/accuracies": 0.75,
48
+ "rewards/chosen": 0.0026651951484382153,
49
+ "rewards/margins": 0.001935672713443637,
50
+ "rewards/rejected": 0.000729522667825222,
51
+ "step": 2
52
+ },
53
+ {
54
+ "debug/policy_chosen_logits": -3.122938394546509,
55
+ "debug/policy_chosen_logps": -191.87767028808594,
56
+ "debug/policy_rejected_logits": -3.2007791996002197,
57
+ "debug/policy_rejected_logps": -196.9674530029297,
58
+ "debug/reference_chosen_logps": -192.09266662597656,
59
+ "debug/reference_rejected_logps": -196.9289093017578,
60
+ "epoch": 0.057692307692307696,
61
+ "grad_norm": 4.85937471576945,
62
+ "learning_rate": 1e-06,
63
+ "logits/chosen": -3.122938394546509,
64
+ "logits/rejected": -3.2007791996002197,
65
+ "logps/chosen": -191.87767028808594,
66
+ "logps/rejected": -196.9674530029297,
67
+ "loss": 0.4979,
68
+ "rewards/accuracies": 0.875,
69
+ "rewards/chosen": 0.0021500205621123314,
70
+ "rewards/margins": 0.0025354004465043545,
71
+ "rewards/rejected": -0.0003853798261843622,
72
+ "step": 3
73
+ },
74
+ {
75
+ "debug/policy_chosen_logits": -2.9445621967315674,
76
+ "debug/policy_chosen_logps": -202.26788330078125,
77
+ "debug/policy_rejected_logits": -2.958071708679199,
78
+ "debug/policy_rejected_logps": -236.9288330078125,
79
+ "debug/reference_chosen_logps": -202.63815307617188,
80
+ "debug/reference_rejected_logps": -236.94044494628906,
81
+ "epoch": 0.07692307692307693,
82
+ "grad_norm": 4.68477948866532,
83
+ "learning_rate": 1e-06,
84
+ "logits/chosen": -2.9445621967315674,
85
+ "logits/rejected": -2.958071708679199,
86
+ "logps/chosen": -202.26788330078125,
87
+ "logps/rejected": -236.9288330078125,
88
+ "loss": 0.4975,
89
+ "rewards/accuracies": 0.75,
90
+ "rewards/chosen": 0.0037027548532932997,
91
+ "rewards/margins": 0.0035865209065377712,
92
+ "rewards/rejected": 0.00011623383034020662,
93
+ "step": 4
94
+ },
95
+ {
96
+ "debug/policy_chosen_logits": -3.0143117904663086,
97
+ "debug/policy_chosen_logps": -206.9991455078125,
98
+ "debug/policy_rejected_logits": -3.0474984645843506,
99
+ "debug/policy_rejected_logps": -184.9208984375,
100
+ "debug/reference_chosen_logps": -207.888671875,
101
+ "debug/reference_rejected_logps": -185.94973754882812,
102
+ "epoch": 0.09615384615384616,
103
+ "grad_norm": 5.103004709570949,
104
+ "learning_rate": 1e-06,
105
+ "logits/chosen": -3.0143117904663086,
106
+ "logits/rejected": -3.0474984645843506,
107
+ "logps/chosen": -206.9991455078125,
108
+ "logps/rejected": -184.9208984375,
109
+ "loss": 0.4956,
110
+ "rewards/accuracies": 0.375,
111
+ "rewards/chosen": 0.008895359002053738,
112
+ "rewards/margins": -0.0013930893037468195,
113
+ "rewards/rejected": 0.010288448072969913,
114
+ "step": 5
115
+ },
116
+ {
117
+ "debug/policy_chosen_logits": -3.1157450675964355,
118
+ "debug/policy_chosen_logps": -191.06285095214844,
119
+ "debug/policy_rejected_logits": -3.1180226802825928,
120
+ "debug/policy_rejected_logps": -217.71697998046875,
121
+ "debug/reference_chosen_logps": -192.79592895507812,
122
+ "debug/reference_rejected_logps": -218.74090576171875,
123
+ "epoch": 0.11538461538461539,
124
+ "grad_norm": 4.434924732032029,
125
+ "learning_rate": 1e-06,
126
+ "logits/chosen": -3.1157450675964355,
127
+ "logits/rejected": -3.1180226802825928,
128
+ "logps/chosen": -191.06285095214844,
129
+ "logps/rejected": -217.71697998046875,
130
+ "loss": 0.4926,
131
+ "rewards/accuracies": 0.75,
132
+ "rewards/chosen": 0.017330702394247055,
133
+ "rewards/margins": 0.00709140719845891,
134
+ "rewards/rejected": 0.010239295661449432,
135
+ "step": 6
136
+ },
137
+ {
138
+ "debug/policy_chosen_logits": -3.011009454727173,
139
+ "debug/policy_chosen_logps": -214.3972625732422,
140
+ "debug/policy_rejected_logits": -3.1553845405578613,
141
+ "debug/policy_rejected_logps": -205.53399658203125,
142
+ "debug/reference_chosen_logps": -215.999267578125,
143
+ "debug/reference_rejected_logps": -204.93153381347656,
144
+ "epoch": 0.1346153846153846,
145
+ "grad_norm": 5.27755315826341,
146
+ "learning_rate": 1e-06,
147
+ "logits/chosen": -3.011009454727173,
148
+ "logits/rejected": -3.1553845405578613,
149
+ "logps/chosen": -214.3972625732422,
150
+ "logps/rejected": -205.53399658203125,
151
+ "loss": 0.4874,
152
+ "rewards/accuracies": 0.875,
153
+ "rewards/chosen": 0.016020087525248528,
154
+ "rewards/margins": 0.02204471454024315,
155
+ "rewards/rejected": -0.006024627946317196,
156
+ "step": 7
157
+ },
158
+ {
159
+ "debug/policy_chosen_logits": -3.0167155265808105,
160
+ "debug/policy_chosen_logps": -179.38259887695312,
161
+ "debug/policy_rejected_logits": -3.06713604927063,
162
+ "debug/policy_rejected_logps": -190.42263793945312,
163
+ "debug/reference_chosen_logps": -180.68841552734375,
164
+ "debug/reference_rejected_logps": -191.515869140625,
165
+ "epoch": 0.15384615384615385,
166
+ "grad_norm": 4.472209218113578,
167
+ "learning_rate": 1e-06,
168
+ "logits/chosen": -3.0167155265808105,
169
+ "logits/rejected": -3.06713604927063,
170
+ "logps/chosen": -179.38259887695312,
171
+ "logps/rejected": -190.42263793945312,
172
+ "loss": 0.4889,
173
+ "rewards/accuracies": 0.25,
174
+ "rewards/chosen": 0.013058356940746307,
175
+ "rewards/margins": 0.002126025967299938,
176
+ "rewards/rejected": 0.01093233097344637,
177
+ "step": 8
178
+ },
179
+ {
180
+ "debug/policy_chosen_logits": -3.1482882499694824,
181
+ "debug/policy_chosen_logps": -191.95626831054688,
182
+ "debug/policy_rejected_logits": -3.1080360412597656,
183
+ "debug/policy_rejected_logps": -211.66036987304688,
184
+ "debug/reference_chosen_logps": -192.33502197265625,
185
+ "debug/reference_rejected_logps": -211.17388916015625,
186
+ "epoch": 0.17307692307692307,
187
+ "grad_norm": 4.711300291586481,
188
+ "learning_rate": 1e-06,
189
+ "logits/chosen": -3.1482882499694824,
190
+ "logits/rejected": -3.1080360412597656,
191
+ "logps/chosen": -191.95626831054688,
192
+ "logps/rejected": -211.66036987304688,
193
+ "loss": 0.4863,
194
+ "rewards/accuracies": 0.75,
195
+ "rewards/chosen": 0.0037874598056077957,
196
+ "rewards/margins": 0.008652305230498314,
197
+ "rewards/rejected": -0.004864844959229231,
198
+ "step": 9
199
+ },
200
+ {
201
+ "debug/policy_chosen_logits": -3.047020196914673,
202
+ "debug/policy_chosen_logps": -200.92193603515625,
203
+ "debug/policy_rejected_logits": -3.1241281032562256,
204
+ "debug/policy_rejected_logps": -228.44821166992188,
205
+ "debug/reference_chosen_logps": -201.93951416015625,
206
+ "debug/reference_rejected_logps": -226.07154846191406,
207
+ "epoch": 0.19230769230769232,
208
+ "grad_norm": 4.555391857522008,
209
+ "learning_rate": 1e-06,
210
+ "logits/chosen": -3.047020196914673,
211
+ "logits/rejected": -3.1241281032562256,
212
+ "logps/chosen": -200.92193603515625,
213
+ "logps/rejected": -228.44821166992188,
214
+ "loss": 0.4822,
215
+ "rewards/accuracies": 0.875,
216
+ "rewards/chosen": 0.010175762698054314,
217
+ "rewards/margins": 0.033942315727472305,
218
+ "rewards/rejected": -0.02376655675470829,
219
+ "step": 10
220
+ },
221
+ {
222
+ "debug/policy_chosen_logits": -3.047590494155884,
223
+ "debug/policy_chosen_logps": -193.70095825195312,
224
+ "debug/policy_rejected_logits": -2.9977893829345703,
225
+ "debug/policy_rejected_logps": -240.172607421875,
226
+ "debug/reference_chosen_logps": -194.73997497558594,
227
+ "debug/reference_rejected_logps": -239.21884155273438,
228
+ "epoch": 0.21153846153846154,
229
+ "grad_norm": 5.679651909554435,
230
+ "learning_rate": 1e-06,
231
+ "logits/chosen": -3.047590494155884,
232
+ "logits/rejected": -2.9977893829345703,
233
+ "logps/chosen": -193.70095825195312,
234
+ "logps/rejected": -240.172607421875,
235
+ "loss": 0.4719,
236
+ "rewards/accuracies": 0.625,
237
+ "rewards/chosen": 0.010390243493020535,
238
+ "rewards/margins": 0.019927941262722015,
239
+ "rewards/rejected": -0.009537696838378906,
240
+ "step": 11
241
+ },
242
+ {
243
+ "debug/policy_chosen_logits": -3.063314199447632,
244
+ "debug/policy_chosen_logps": -193.97750854492188,
245
+ "debug/policy_rejected_logits": -2.8785159587860107,
246
+ "debug/policy_rejected_logps": -206.611083984375,
247
+ "debug/reference_chosen_logps": -195.43072509765625,
248
+ "debug/reference_rejected_logps": -203.80999755859375,
249
+ "epoch": 0.23076923076923078,
250
+ "grad_norm": 5.02375537373724,
251
+ "learning_rate": 1e-06,
252
+ "logits/chosen": -3.063314199447632,
253
+ "logits/rejected": -2.8785159587860107,
254
+ "logps/chosen": -193.97750854492188,
255
+ "logps/rejected": -206.611083984375,
256
+ "loss": 0.4596,
257
+ "rewards/accuracies": 0.625,
258
+ "rewards/chosen": 0.014532221481204033,
259
+ "rewards/margins": 0.0425429493188858,
260
+ "rewards/rejected": -0.02801072970032692,
261
+ "step": 12
262
+ },
263
+ {
264
+ "debug/policy_chosen_logits": -3.060885190963745,
265
+ "debug/policy_chosen_logps": -198.6640625,
266
+ "debug/policy_rejected_logits": -3.0962471961975098,
267
+ "debug/policy_rejected_logps": -247.57131958007812,
268
+ "debug/reference_chosen_logps": -202.70645141601562,
269
+ "debug/reference_rejected_logps": -239.8416290283203,
270
+ "epoch": 0.25,
271
+ "grad_norm": 4.916536953774363,
272
+ "learning_rate": 1e-06,
273
+ "logits/chosen": -3.060885190963745,
274
+ "logits/rejected": -3.0962471961975098,
275
+ "logps/chosen": -198.6640625,
276
+ "logps/rejected": -247.57131958007812,
277
+ "loss": 0.4602,
278
+ "rewards/accuracies": 1.0,
279
+ "rewards/chosen": 0.04042387008666992,
280
+ "rewards/margins": 0.11772066354751587,
281
+ "rewards/rejected": -0.07729679346084595,
282
+ "step": 13
283
+ },
284
+ {
285
+ "debug/policy_chosen_logits": -3.025531768798828,
286
+ "debug/policy_chosen_logps": -200.4529266357422,
287
+ "debug/policy_rejected_logits": -2.992382287979126,
288
+ "debug/policy_rejected_logps": -220.69435119628906,
289
+ "debug/reference_chosen_logps": -204.46011352539062,
290
+ "debug/reference_rejected_logps": -221.68063354492188,
291
+ "epoch": 0.2692307692307692,
292
+ "grad_norm": 4.5758304591077765,
293
+ "learning_rate": 1e-06,
294
+ "logits/chosen": -3.025531768798828,
295
+ "logits/rejected": -2.992382287979126,
296
+ "logps/chosen": -200.4529266357422,
297
+ "logps/rejected": -220.69435119628906,
298
+ "loss": 0.4713,
299
+ "rewards/accuracies": 0.5,
300
+ "rewards/chosen": 0.040071770548820496,
301
+ "rewards/margins": 0.030209042131900787,
302
+ "rewards/rejected": 0.009862728416919708,
303
+ "step": 14
304
+ },
305
+ {
306
+ "debug/policy_chosen_logits": -3.1965172290802,
307
+ "debug/policy_chosen_logps": -186.35000610351562,
308
+ "debug/policy_rejected_logits": -3.1802818775177,
309
+ "debug/policy_rejected_logps": -210.35556030273438,
310
+ "debug/reference_chosen_logps": -189.31112670898438,
311
+ "debug/reference_rejected_logps": -211.51010131835938,
312
+ "epoch": 0.28846153846153844,
313
+ "grad_norm": 5.776855357402379,
314
+ "learning_rate": 1e-06,
315
+ "logits/chosen": -3.1965172290802,
316
+ "logits/rejected": -3.1802818775177,
317
+ "logps/chosen": -186.35000610351562,
318
+ "logps/rejected": -210.35556030273438,
319
+ "loss": 0.4747,
320
+ "rewards/accuracies": 0.75,
321
+ "rewards/chosen": 0.02961129881441593,
322
+ "rewards/margins": 0.018065985292196274,
323
+ "rewards/rejected": 0.011545314453542233,
324
+ "step": 15
325
+ },
326
+ {
327
+ "debug/policy_chosen_logits": -3.004584550857544,
328
+ "debug/policy_chosen_logps": -219.46209716796875,
329
+ "debug/policy_rejected_logits": -2.997267007827759,
330
+ "debug/policy_rejected_logps": -239.15829467773438,
331
+ "debug/reference_chosen_logps": -221.38467407226562,
332
+ "debug/reference_rejected_logps": -233.982177734375,
333
+ "epoch": 0.3076923076923077,
334
+ "grad_norm": 5.1995499627013,
335
+ "learning_rate": 1e-06,
336
+ "logits/chosen": -3.004584550857544,
337
+ "logits/rejected": -2.997267007827759,
338
+ "logps/chosen": -219.46209716796875,
339
+ "logps/rejected": -239.15829467773438,
340
+ "loss": 0.4659,
341
+ "rewards/accuracies": 0.75,
342
+ "rewards/chosen": 0.019225770607590675,
343
+ "rewards/margins": 0.07098689675331116,
344
+ "rewards/rejected": -0.05176112800836563,
345
+ "step": 16
346
+ },
347
+ {
348
+ "debug/policy_chosen_logits": -3.0550475120544434,
349
+ "debug/policy_chosen_logps": -196.09921264648438,
350
+ "debug/policy_rejected_logits": -3.0315375328063965,
351
+ "debug/policy_rejected_logps": -228.4376983642578,
352
+ "debug/reference_chosen_logps": -196.76771545410156,
353
+ "debug/reference_rejected_logps": -219.703857421875,
354
+ "epoch": 0.3269230769230769,
355
+ "grad_norm": 4.905311259261036,
356
+ "learning_rate": 1e-06,
357
+ "logits/chosen": -3.0550475120544434,
358
+ "logits/rejected": -3.0315375328063965,
359
+ "logps/chosen": -196.09921264648438,
360
+ "logps/rejected": -228.4376983642578,
361
+ "loss": 0.4668,
362
+ "rewards/accuracies": 0.875,
363
+ "rewards/chosen": 0.006685066036880016,
364
+ "rewards/margins": 0.09402336925268173,
365
+ "rewards/rejected": -0.08733831346035004,
366
+ "step": 17
367
+ },
368
+ {
369
+ "debug/policy_chosen_logits": -3.109316110610962,
370
+ "debug/policy_chosen_logps": -192.22500610351562,
371
+ "debug/policy_rejected_logits": -3.070436954498291,
372
+ "debug/policy_rejected_logps": -225.37353515625,
373
+ "debug/reference_chosen_logps": -193.43701171875,
374
+ "debug/reference_rejected_logps": -215.90811157226562,
375
+ "epoch": 0.34615384615384615,
376
+ "grad_norm": 5.438987049949224,
377
+ "learning_rate": 1e-06,
378
+ "logits/chosen": -3.109316110610962,
379
+ "logits/rejected": -3.070436954498291,
380
+ "logps/chosen": -192.22500610351562,
381
+ "logps/rejected": -225.37353515625,
382
+ "loss": 0.4604,
383
+ "rewards/accuracies": 0.5,
384
+ "rewards/chosen": 0.012120017781853676,
385
+ "rewards/margins": 0.10677413642406464,
386
+ "rewards/rejected": -0.09465412050485611,
387
+ "step": 18
388
+ },
389
+ {
390
+ "debug/policy_chosen_logits": -3.156306505203247,
391
+ "debug/policy_chosen_logps": -198.84149169921875,
392
+ "debug/policy_rejected_logits": -3.163816213607788,
393
+ "debug/policy_rejected_logps": -207.2037811279297,
394
+ "debug/reference_chosen_logps": -199.2224884033203,
395
+ "debug/reference_rejected_logps": -204.29345703125,
396
+ "epoch": 0.36538461538461536,
397
+ "grad_norm": 5.3459094259339395,
398
+ "learning_rate": 1e-06,
399
+ "logits/chosen": -3.156306505203247,
400
+ "logits/rejected": -3.163816213607788,
401
+ "logps/chosen": -198.84149169921875,
402
+ "logps/rejected": -207.2037811279297,
403
+ "loss": 0.4623,
404
+ "rewards/accuracies": 0.625,
405
+ "rewards/chosen": 0.003809986636042595,
406
+ "rewards/margins": 0.032913051545619965,
407
+ "rewards/rejected": -0.02910306677222252,
408
+ "step": 19
409
+ },
410
+ {
411
+ "debug/policy_chosen_logits": -3.09970760345459,
412
+ "debug/policy_chosen_logps": -195.97071838378906,
413
+ "debug/policy_rejected_logits": -3.0445327758789062,
414
+ "debug/policy_rejected_logps": -233.5975341796875,
415
+ "debug/reference_chosen_logps": -196.68661499023438,
416
+ "debug/reference_rejected_logps": -226.482666015625,
417
+ "epoch": 0.38461538461538464,
418
+ "grad_norm": 5.574869499523263,
419
+ "learning_rate": 1e-06,
420
+ "logits/chosen": -3.09970760345459,
421
+ "logits/rejected": -3.0445327758789062,
422
+ "logps/chosen": -195.97071838378906,
423
+ "logps/rejected": -233.5975341796875,
424
+ "loss": 0.4437,
425
+ "rewards/accuracies": 0.625,
426
+ "rewards/chosen": 0.007159043103456497,
427
+ "rewards/margins": 0.07830756902694702,
428
+ "rewards/rejected": -0.07114852219820023,
429
+ "step": 20
430
+ },
431
+ {
432
+ "debug/policy_chosen_logits": -3.048816680908203,
433
+ "debug/policy_chosen_logps": -187.65283203125,
434
+ "debug/policy_rejected_logits": -3.0968563556671143,
435
+ "debug/policy_rejected_logps": -222.02886962890625,
436
+ "debug/reference_chosen_logps": -191.67349243164062,
437
+ "debug/reference_rejected_logps": -218.13577270507812,
438
+ "epoch": 0.40384615384615385,
439
+ "grad_norm": 4.472533940520768,
440
+ "learning_rate": 1e-06,
441
+ "logits/chosen": -3.048816680908203,
442
+ "logits/rejected": -3.0968563556671143,
443
+ "logps/chosen": -187.65283203125,
444
+ "logps/rejected": -222.02886962890625,
445
+ "loss": 0.4355,
446
+ "rewards/accuracies": 0.875,
447
+ "rewards/chosen": 0.04020654410123825,
448
+ "rewards/margins": 0.07913752645254135,
449
+ "rewards/rejected": -0.0389309898018837,
450
+ "step": 21
451
+ },
452
+ {
453
+ "debug/policy_chosen_logits": -3.1425256729125977,
454
+ "debug/policy_chosen_logps": -201.33770751953125,
455
+ "debug/policy_rejected_logits": -3.146122932434082,
456
+ "debug/policy_rejected_logps": -221.27914428710938,
457
+ "debug/reference_chosen_logps": -205.18829345703125,
458
+ "debug/reference_rejected_logps": -218.09457397460938,
459
+ "epoch": 0.4230769230769231,
460
+ "grad_norm": 6.018529206237279,
461
+ "learning_rate": 1e-06,
462
+ "logits/chosen": -3.1425256729125977,
463
+ "logits/rejected": -3.146122932434082,
464
+ "logps/chosen": -201.33770751953125,
465
+ "logps/rejected": -221.27914428710938,
466
+ "loss": 0.4552,
467
+ "rewards/accuracies": 0.625,
468
+ "rewards/chosen": 0.038505878299474716,
469
+ "rewards/margins": 0.07035169750452042,
470
+ "rewards/rejected": -0.0318458154797554,
471
+ "step": 22
472
+ },
473
+ {
474
+ "debug/policy_chosen_logits": -3.187493324279785,
475
+ "debug/policy_chosen_logps": -197.76002502441406,
476
+ "debug/policy_rejected_logits": -3.1831955909729004,
477
+ "debug/policy_rejected_logps": -201.7500762939453,
478
+ "debug/reference_chosen_logps": -200.38540649414062,
479
+ "debug/reference_rejected_logps": -205.30821228027344,
480
+ "epoch": 0.4423076923076923,
481
+ "grad_norm": 4.661142809825874,
482
+ "learning_rate": 1e-06,
483
+ "logits/chosen": -3.187493324279785,
484
+ "logits/rejected": -3.1831955909729004,
485
+ "logps/chosen": -197.76002502441406,
486
+ "logps/rejected": -201.7500762939453,
487
+ "loss": 0.4572,
488
+ "rewards/accuracies": 0.625,
489
+ "rewards/chosen": 0.02625381574034691,
490
+ "rewards/margins": -0.009327447973191738,
491
+ "rewards/rejected": 0.03558126464486122,
492
+ "step": 23
493
+ },
494
+ {
495
+ "debug/policy_chosen_logits": -3.20326566696167,
496
+ "debug/policy_chosen_logps": -191.74195861816406,
497
+ "debug/policy_rejected_logits": -3.187241792678833,
498
+ "debug/policy_rejected_logps": -233.16586303710938,
499
+ "debug/reference_chosen_logps": -196.749267578125,
500
+ "debug/reference_rejected_logps": -227.8116455078125,
501
+ "epoch": 0.46153846153846156,
502
+ "grad_norm": 4.44342516621906,
503
+ "learning_rate": 1e-06,
504
+ "logits/chosen": -3.20326566696167,
505
+ "logits/rejected": -3.187241792678833,
506
+ "logps/chosen": -191.74195861816406,
507
+ "logps/rejected": -233.16586303710938,
508
+ "loss": 0.4244,
509
+ "rewards/accuracies": 0.875,
510
+ "rewards/chosen": 0.05007295310497284,
511
+ "rewards/margins": 0.10361497104167938,
512
+ "rewards/rejected": -0.05354202166199684,
513
+ "step": 24
514
+ },
515
+ {
516
+ "debug/policy_chosen_logits": -3.142024517059326,
517
+ "debug/policy_chosen_logps": -203.33738708496094,
518
+ "debug/policy_rejected_logits": -3.017565965652466,
519
+ "debug/policy_rejected_logps": -238.41554260253906,
520
+ "debug/reference_chosen_logps": -205.4508056640625,
521
+ "debug/reference_rejected_logps": -228.70516967773438,
522
+ "epoch": 0.4807692307692308,
523
+ "grad_norm": 4.368252676849126,
524
+ "learning_rate": 1e-06,
525
+ "logits/chosen": -3.142024517059326,
526
+ "logits/rejected": -3.017565965652466,
527
+ "logps/chosen": -203.33738708496094,
528
+ "logps/rejected": -238.41554260253906,
529
+ "loss": 0.4321,
530
+ "rewards/accuracies": 0.875,
531
+ "rewards/chosen": 0.021134089678525925,
532
+ "rewards/margins": 0.11823777854442596,
533
+ "rewards/rejected": -0.09710369259119034,
534
+ "step": 25
535
+ },
536
+ {
537
+ "debug/policy_chosen_logits": -3.0380425453186035,
538
+ "debug/policy_chosen_logps": -200.80526733398438,
539
+ "debug/policy_rejected_logits": -3.081782102584839,
540
+ "debug/policy_rejected_logps": -218.62149047851562,
541
+ "debug/reference_chosen_logps": -203.88671875,
542
+ "debug/reference_rejected_logps": -216.6439208984375,
543
+ "epoch": 0.5,
544
+ "grad_norm": 4.4745876509482025,
545
+ "learning_rate": 1e-06,
546
+ "logits/chosen": -3.0380425453186035,
547
+ "logits/rejected": -3.081782102584839,
548
+ "logps/chosen": -200.80526733398438,
549
+ "logps/rejected": -218.62149047851562,
550
+ "loss": 0.4491,
551
+ "rewards/accuracies": 0.375,
552
+ "rewards/chosen": 0.030814513564109802,
553
+ "rewards/margins": 0.05059013515710831,
554
+ "rewards/rejected": -0.019775621592998505,
555
+ "step": 26
556
+ },
557
+ {
558
+ "debug/policy_chosen_logits": -3.1514594554901123,
559
+ "debug/policy_chosen_logps": -195.95147705078125,
560
+ "debug/policy_rejected_logits": -2.9963762760162354,
561
+ "debug/policy_rejected_logps": -275.3726806640625,
562
+ "debug/reference_chosen_logps": -193.73902893066406,
563
+ "debug/reference_rejected_logps": -263.09857177734375,
564
+ "epoch": 0.5192307692307693,
565
+ "grad_norm": 4.481499913956976,
566
+ "learning_rate": 1e-06,
567
+ "logits/chosen": -3.1514594554901123,
568
+ "logits/rejected": -2.9963762760162354,
569
+ "logps/chosen": -195.95147705078125,
570
+ "logps/rejected": -275.3726806640625,
571
+ "loss": 0.4362,
572
+ "rewards/accuracies": 0.75,
573
+ "rewards/chosen": -0.02212451957166195,
574
+ "rewards/margins": 0.10061690211296082,
575
+ "rewards/rejected": -0.12274143099784851,
576
+ "step": 27
577
+ },
578
+ {
579
+ "debug/policy_chosen_logits": -3.1708903312683105,
580
+ "debug/policy_chosen_logps": -188.44619750976562,
581
+ "debug/policy_rejected_logits": -3.2108025550842285,
582
+ "debug/policy_rejected_logps": -220.68988037109375,
583
+ "debug/reference_chosen_logps": -194.43800354003906,
584
+ "debug/reference_rejected_logps": -220.9051513671875,
585
+ "epoch": 0.5384615384615384,
586
+ "grad_norm": 4.764561262542065,
587
+ "learning_rate": 1e-06,
588
+ "logits/chosen": -3.1708903312683105,
589
+ "logits/rejected": -3.2108025550842285,
590
+ "logps/chosen": -188.44619750976562,
591
+ "logps/rejected": -220.68988037109375,
592
+ "loss": 0.4431,
593
+ "rewards/accuracies": 0.625,
594
+ "rewards/chosen": 0.05991803854703903,
595
+ "rewards/margins": 0.05776527523994446,
596
+ "rewards/rejected": 0.002152767963707447,
597
+ "step": 28
598
+ },
599
+ {
600
+ "debug/policy_chosen_logits": -3.1714835166931152,
601
+ "debug/policy_chosen_logps": -180.14614868164062,
602
+ "debug/policy_rejected_logits": -3.08394718170166,
603
+ "debug/policy_rejected_logps": -239.15164184570312,
604
+ "debug/reference_chosen_logps": -184.656494140625,
605
+ "debug/reference_rejected_logps": -224.29171752929688,
606
+ "epoch": 0.5576923076923077,
607
+ "grad_norm": 4.849507609873306,
608
+ "learning_rate": 1e-06,
609
+ "logits/chosen": -3.1714835166931152,
610
+ "logits/rejected": -3.08394718170166,
611
+ "logps/chosen": -180.14614868164062,
612
+ "logps/rejected": -239.15164184570312,
613
+ "loss": 0.3837,
614
+ "rewards/accuracies": 0.75,
615
+ "rewards/chosen": 0.045103415846824646,
616
+ "rewards/margins": 0.1937025487422943,
617
+ "rewards/rejected": -0.14859913289546967,
618
+ "step": 29
619
+ },
620
+ {
621
+ "debug/policy_chosen_logits": -3.178560972213745,
622
+ "debug/policy_chosen_logps": -210.00115966796875,
623
+ "debug/policy_rejected_logits": -3.1779253482818604,
624
+ "debug/policy_rejected_logps": -235.44644165039062,
625
+ "debug/reference_chosen_logps": -213.55612182617188,
626
+ "debug/reference_rejected_logps": -221.76431274414062,
627
+ "epoch": 0.5769230769230769,
628
+ "grad_norm": 4.895017276343638,
629
+ "learning_rate": 1e-06,
630
+ "logits/chosen": -3.178560972213745,
631
+ "logits/rejected": -3.1779253482818604,
632
+ "logps/chosen": -210.00115966796875,
633
+ "logps/rejected": -235.44644165039062,
634
+ "loss": 0.3956,
635
+ "rewards/accuracies": 0.75,
636
+ "rewards/chosen": 0.03554954379796982,
637
+ "rewards/margins": 0.1723707914352417,
638
+ "rewards/rejected": -0.13682125508785248,
639
+ "step": 30
640
+ },
641
+ {
642
+ "debug/policy_chosen_logits": -3.108640670776367,
643
+ "debug/policy_chosen_logps": -200.23190307617188,
644
+ "debug/policy_rejected_logits": -3.1070878505706787,
645
+ "debug/policy_rejected_logps": -206.5513153076172,
646
+ "debug/reference_chosen_logps": -209.71009826660156,
647
+ "debug/reference_rejected_logps": -205.9014892578125,
648
+ "epoch": 0.5961538461538461,
649
+ "grad_norm": 5.522641702393358,
650
+ "learning_rate": 1e-06,
651
+ "logits/chosen": -3.108640670776367,
652
+ "logits/rejected": -3.1070878505706787,
653
+ "logps/chosen": -200.23190307617188,
654
+ "logps/rejected": -206.5513153076172,
655
+ "loss": 0.4094,
656
+ "rewards/accuracies": 0.75,
657
+ "rewards/chosen": 0.09478198736906052,
658
+ "rewards/margins": 0.10128023475408554,
659
+ "rewards/rejected": -0.0064982399344444275,
660
+ "step": 31
661
+ },
662
+ {
663
+ "debug/policy_chosen_logits": -3.165921211242676,
664
+ "debug/policy_chosen_logps": -176.6615753173828,
665
+ "debug/policy_rejected_logits": -3.197509288787842,
666
+ "debug/policy_rejected_logps": -212.1288299560547,
667
+ "debug/reference_chosen_logps": -191.31610107421875,
668
+ "debug/reference_rejected_logps": -211.03121948242188,
669
+ "epoch": 0.6153846153846154,
670
+ "grad_norm": 5.252195941070867,
671
+ "learning_rate": 1e-06,
672
+ "logits/chosen": -3.165921211242676,
673
+ "logits/rejected": -3.197509288787842,
674
+ "logps/chosen": -176.6615753173828,
675
+ "logps/rejected": -212.1288299560547,
676
+ "loss": 0.3729,
677
+ "rewards/accuracies": 1.0,
678
+ "rewards/chosen": 0.14654535055160522,
679
+ "rewards/margins": 0.15752162039279938,
680
+ "rewards/rejected": -0.010976276360452175,
681
+ "step": 32
682
+ },
683
+ {
684
+ "debug/policy_chosen_logits": -3.020163059234619,
685
+ "debug/policy_chosen_logps": -203.67031860351562,
686
+ "debug/policy_rejected_logits": -3.0926342010498047,
687
+ "debug/policy_rejected_logps": -210.38372802734375,
688
+ "debug/reference_chosen_logps": -209.38458251953125,
689
+ "debug/reference_rejected_logps": -205.83929443359375,
690
+ "epoch": 0.6346153846153846,
691
+ "grad_norm": 4.8112163863395665,
692
+ "learning_rate": 1e-06,
693
+ "logits/chosen": -3.020163059234619,
694
+ "logits/rejected": -3.0926342010498047,
695
+ "logps/chosen": -203.67031860351562,
696
+ "logps/rejected": -210.38372802734375,
697
+ "loss": 0.3845,
698
+ "rewards/accuracies": 0.75,
699
+ "rewards/chosen": 0.05714261904358864,
700
+ "rewards/margins": 0.10258688032627106,
701
+ "rewards/rejected": -0.04544425755739212,
702
+ "step": 33
703
+ },
704
+ {
705
+ "debug/policy_chosen_logits": -3.226917028427124,
706
+ "debug/policy_chosen_logps": -197.40658569335938,
707
+ "debug/policy_rejected_logits": -3.1886653900146484,
708
+ "debug/policy_rejected_logps": -249.4960174560547,
709
+ "debug/reference_chosen_logps": -208.50588989257812,
710
+ "debug/reference_rejected_logps": -234.88482666015625,
711
+ "epoch": 0.6538461538461539,
712
+ "grad_norm": 4.70294242121678,
713
+ "learning_rate": 1e-06,
714
+ "logits/chosen": -3.226917028427124,
715
+ "logits/rejected": -3.1886653900146484,
716
+ "logps/chosen": -197.40658569335938,
717
+ "logps/rejected": -249.4960174560547,
718
+ "loss": 0.3647,
719
+ "rewards/accuracies": 0.75,
720
+ "rewards/chosen": 0.11099302023649216,
721
+ "rewards/margins": 0.25710487365722656,
722
+ "rewards/rejected": -0.146111860871315,
723
+ "step": 34
724
+ },
725
+ {
726
+ "debug/policy_chosen_logits": -3.1596603393554688,
727
+ "debug/policy_chosen_logps": -196.2579345703125,
728
+ "debug/policy_rejected_logits": -3.1261677742004395,
729
+ "debug/policy_rejected_logps": -231.90347290039062,
730
+ "debug/reference_chosen_logps": -205.25323486328125,
731
+ "debug/reference_rejected_logps": -227.40982055664062,
732
+ "epoch": 0.6730769230769231,
733
+ "grad_norm": 7.012486637898167,
734
+ "learning_rate": 1e-06,
735
+ "logits/chosen": -3.1596603393554688,
736
+ "logits/rejected": -3.1261677742004395,
737
+ "logps/chosen": -196.2579345703125,
738
+ "logps/rejected": -231.90347290039062,
739
+ "loss": 0.407,
740
+ "rewards/accuracies": 0.75,
741
+ "rewards/chosen": 0.08995288610458374,
742
+ "rewards/margins": 0.13488951325416565,
743
+ "rewards/rejected": -0.044936634600162506,
744
+ "step": 35
745
+ },
746
+ {
747
+ "debug/policy_chosen_logits": -3.1420416831970215,
748
+ "debug/policy_chosen_logps": -187.47885131835938,
749
+ "debug/policy_rejected_logits": -3.067408800125122,
750
+ "debug/policy_rejected_logps": -227.22024536132812,
751
+ "debug/reference_chosen_logps": -199.94757080078125,
752
+ "debug/reference_rejected_logps": -219.46438598632812,
753
+ "epoch": 0.6923076923076923,
754
+ "grad_norm": 5.480923899428052,
755
+ "learning_rate": 1e-06,
756
+ "logits/chosen": -3.1420416831970215,
757
+ "logits/rejected": -3.067408800125122,
758
+ "logps/chosen": -187.47885131835938,
759
+ "logps/rejected": -227.22024536132812,
760
+ "loss": 0.3983,
761
+ "rewards/accuracies": 1.0,
762
+ "rewards/chosen": 0.12468719482421875,
763
+ "rewards/margins": 0.2022458016872406,
764
+ "rewards/rejected": -0.07755860686302185,
765
+ "step": 36
766
+ },
767
+ {
768
+ "debug/policy_chosen_logits": -3.134657621383667,
769
+ "debug/policy_chosen_logps": -198.46621704101562,
770
+ "debug/policy_rejected_logits": -3.191622495651245,
771
+ "debug/policy_rejected_logps": -238.76580810546875,
772
+ "debug/reference_chosen_logps": -210.65228271484375,
773
+ "debug/reference_rejected_logps": -226.888427734375,
774
+ "epoch": 0.7115384615384616,
775
+ "grad_norm": 6.49950029807871,
776
+ "learning_rate": 1e-06,
777
+ "logits/chosen": -3.134657621383667,
778
+ "logits/rejected": -3.191622495651245,
779
+ "logps/chosen": -198.46621704101562,
780
+ "logps/rejected": -238.76580810546875,
781
+ "loss": 0.4017,
782
+ "rewards/accuracies": 1.0,
783
+ "rewards/chosen": 0.12186044454574585,
784
+ "rewards/margins": 0.24063441157341003,
785
+ "rewards/rejected": -0.11877395212650299,
786
+ "step": 37
787
+ },
788
+ {
789
+ "debug/policy_chosen_logits": -3.2385828495025635,
790
+ "debug/policy_chosen_logps": -170.713623046875,
791
+ "debug/policy_rejected_logits": -3.207604169845581,
792
+ "debug/policy_rejected_logps": -235.9530029296875,
793
+ "debug/reference_chosen_logps": -183.66558837890625,
794
+ "debug/reference_rejected_logps": -223.42767333984375,
795
+ "epoch": 0.7307692307692307,
796
+ "grad_norm": 5.516561491681134,
797
+ "learning_rate": 1e-06,
798
+ "logits/chosen": -3.2385828495025635,
799
+ "logits/rejected": -3.207604169845581,
800
+ "logps/chosen": -170.713623046875,
801
+ "logps/rejected": -235.9530029296875,
802
+ "loss": 0.4081,
803
+ "rewards/accuracies": 0.875,
804
+ "rewards/chosen": 0.12951962649822235,
805
+ "rewards/margins": 0.2547728717327118,
806
+ "rewards/rejected": -0.12525323033332825,
807
+ "step": 38
808
+ },
809
+ {
810
+ "debug/policy_chosen_logits": -3.0979700088500977,
811
+ "debug/policy_chosen_logps": -162.31280517578125,
812
+ "debug/policy_rejected_logits": -3.186298131942749,
813
+ "debug/policy_rejected_logps": -235.3626708984375,
814
+ "debug/reference_chosen_logps": -172.52468872070312,
815
+ "debug/reference_rejected_logps": -221.17556762695312,
816
+ "epoch": 0.75,
817
+ "grad_norm": 6.774526914117382,
818
+ "learning_rate": 1e-06,
819
+ "logits/chosen": -3.0979700088500977,
820
+ "logits/rejected": -3.186298131942749,
821
+ "logps/chosen": -162.31280517578125,
822
+ "logps/rejected": -235.3626708984375,
823
+ "loss": 0.3939,
824
+ "rewards/accuracies": 0.875,
825
+ "rewards/chosen": 0.10211898386478424,
826
+ "rewards/margins": 0.2439899444580078,
827
+ "rewards/rejected": -0.14187094569206238,
828
+ "step": 39
829
+ },
830
+ {
831
+ "debug/policy_chosen_logits": -3.089684009552002,
832
+ "debug/policy_chosen_logps": -196.1321563720703,
833
+ "debug/policy_rejected_logits": -3.04327392578125,
834
+ "debug/policy_rejected_logps": -216.24703979492188,
835
+ "debug/reference_chosen_logps": -200.5460205078125,
836
+ "debug/reference_rejected_logps": -207.2269287109375,
837
+ "epoch": 0.7692307692307693,
838
+ "grad_norm": 5.675881779895136,
839
+ "learning_rate": 1e-06,
840
+ "logits/chosen": -3.089684009552002,
841
+ "logits/rejected": -3.04327392578125,
842
+ "logps/chosen": -196.1321563720703,
843
+ "logps/rejected": -216.24703979492188,
844
+ "loss": 0.4025,
845
+ "rewards/accuracies": 0.875,
846
+ "rewards/chosen": 0.04413875192403793,
847
+ "rewards/margins": 0.1343398243188858,
848
+ "rewards/rejected": -0.09020107239484787,
849
+ "step": 40
850
+ },
851
+ {
852
+ "debug/policy_chosen_logits": -3.1140072345733643,
853
+ "debug/policy_chosen_logps": -187.277099609375,
854
+ "debug/policy_rejected_logits": -3.2128348350524902,
855
+ "debug/policy_rejected_logps": -254.4541015625,
856
+ "debug/reference_chosen_logps": -192.44972229003906,
857
+ "debug/reference_rejected_logps": -245.08372497558594,
858
+ "epoch": 0.7884615384615384,
859
+ "grad_norm": 6.653395887552183,
860
+ "learning_rate": 1e-06,
861
+ "logits/chosen": -3.1140072345733643,
862
+ "logits/rejected": -3.2128348350524902,
863
+ "logps/chosen": -187.277099609375,
864
+ "logps/rejected": -254.4541015625,
865
+ "loss": 0.4269,
866
+ "rewards/accuracies": 0.625,
867
+ "rewards/chosen": 0.051726073026657104,
868
+ "rewards/margins": 0.14542999863624573,
869
+ "rewards/rejected": -0.09370393306016922,
870
+ "step": 41
871
+ },
872
+ {
873
+ "debug/policy_chosen_logits": -3.3676302433013916,
874
+ "debug/policy_chosen_logps": -159.673095703125,
875
+ "debug/policy_rejected_logits": -3.310539484024048,
876
+ "debug/policy_rejected_logps": -204.17922973632812,
877
+ "debug/reference_chosen_logps": -172.7714385986328,
878
+ "debug/reference_rejected_logps": -196.74644470214844,
879
+ "epoch": 0.8076923076923077,
880
+ "grad_norm": 5.688046713439665,
881
+ "learning_rate": 1e-06,
882
+ "logits/chosen": -3.3676302433013916,
883
+ "logits/rejected": -3.310539484024048,
884
+ "logps/chosen": -159.673095703125,
885
+ "logps/rejected": -204.17922973632812,
886
+ "loss": 0.3881,
887
+ "rewards/accuracies": 0.625,
888
+ "rewards/chosen": 0.1309833526611328,
889
+ "rewards/margins": 0.20531128346920013,
890
+ "rewards/rejected": -0.07432794570922852,
891
+ "step": 42
892
+ },
893
+ {
894
+ "debug/policy_chosen_logits": -3.048048496246338,
895
+ "debug/policy_chosen_logps": -195.83262634277344,
896
+ "debug/policy_rejected_logits": -3.1337571144104004,
897
+ "debug/policy_rejected_logps": -220.1021728515625,
898
+ "debug/reference_chosen_logps": -204.2503662109375,
899
+ "debug/reference_rejected_logps": -214.80859375,
900
+ "epoch": 0.8269230769230769,
901
+ "grad_norm": 6.260667799966591,
902
+ "learning_rate": 1e-06,
903
+ "logits/chosen": -3.048048496246338,
904
+ "logits/rejected": -3.1337571144104004,
905
+ "logps/chosen": -195.83262634277344,
906
+ "logps/rejected": -220.1021728515625,
907
+ "loss": 0.4081,
908
+ "rewards/accuracies": 0.625,
909
+ "rewards/chosen": 0.08417723327875137,
910
+ "rewards/margins": 0.13711285591125488,
911
+ "rewards/rejected": -0.05293561890721321,
912
+ "step": 43
913
+ },
914
+ {
915
+ "debug/policy_chosen_logits": -3.2857210636138916,
916
+ "debug/policy_chosen_logps": -190.01461791992188,
917
+ "debug/policy_rejected_logits": -3.210080146789551,
918
+ "debug/policy_rejected_logps": -202.32200622558594,
919
+ "debug/reference_chosen_logps": -192.96429443359375,
920
+ "debug/reference_rejected_logps": -204.96524047851562,
921
+ "epoch": 0.8461538461538461,
922
+ "grad_norm": 5.41978019999578,
923
+ "learning_rate": 1e-06,
924
+ "logits/chosen": -3.2857210636138916,
925
+ "logits/rejected": -3.210080146789551,
926
+ "logps/chosen": -190.01461791992188,
927
+ "logps/rejected": -202.32200622558594,
928
+ "loss": 0.3812,
929
+ "rewards/accuracies": 0.625,
930
+ "rewards/chosen": 0.02949686348438263,
931
+ "rewards/margins": 0.0030643679201602936,
932
+ "rewards/rejected": 0.026432491838932037,
933
+ "step": 44
934
+ },
935
+ {
936
+ "debug/policy_chosen_logits": -3.212117910385132,
937
+ "debug/policy_chosen_logps": -168.555419921875,
938
+ "debug/policy_rejected_logits": -3.191093683242798,
939
+ "debug/policy_rejected_logps": -196.78350830078125,
940
+ "debug/reference_chosen_logps": -184.03561401367188,
941
+ "debug/reference_rejected_logps": -194.875,
942
+ "epoch": 0.8653846153846154,
943
+ "grad_norm": 8.557722940045432,
944
+ "learning_rate": 1e-06,
945
+ "logits/chosen": -3.212117910385132,
946
+ "logits/rejected": -3.191093683242798,
947
+ "logps/chosen": -168.555419921875,
948
+ "logps/rejected": -196.78350830078125,
949
+ "loss": 0.4219,
950
+ "rewards/accuracies": 0.875,
951
+ "rewards/chosen": 0.1548018455505371,
952
+ "rewards/margins": 0.1738869845867157,
953
+ "rewards/rejected": -0.01908515766263008,
954
+ "step": 45
955
+ },
956
+ {
957
+ "debug/policy_chosen_logits": -3.4355483055114746,
958
+ "debug/policy_chosen_logps": -174.82119750976562,
959
+ "debug/policy_rejected_logits": -3.4658639430999756,
960
+ "debug/policy_rejected_logps": -179.5398712158203,
961
+ "debug/reference_chosen_logps": -183.99972534179688,
962
+ "debug/reference_rejected_logps": -182.81251525878906,
963
+ "epoch": 0.8846153846153846,
964
+ "grad_norm": 5.9566531465155,
965
+ "learning_rate": 1e-06,
966
+ "logits/chosen": -3.4355483055114746,
967
+ "logits/rejected": -3.4658639430999756,
968
+ "logps/chosen": -174.82119750976562,
969
+ "logps/rejected": -179.5398712158203,
970
+ "loss": 0.4262,
971
+ "rewards/accuracies": 0.625,
972
+ "rewards/chosen": 0.09178514778614044,
973
+ "rewards/margins": 0.059058789163827896,
974
+ "rewards/rejected": 0.032726362347602844,
975
+ "step": 46
976
+ },
977
+ {
978
+ "debug/policy_chosen_logits": -3.0792293548583984,
979
+ "debug/policy_chosen_logps": -181.77552795410156,
980
+ "debug/policy_rejected_logits": -3.082854986190796,
981
+ "debug/policy_rejected_logps": -220.6517791748047,
982
+ "debug/reference_chosen_logps": -191.10177612304688,
983
+ "debug/reference_rejected_logps": -199.66552734375,
984
+ "epoch": 0.9038461538461539,
985
+ "grad_norm": 9.698103359515986,
986
+ "learning_rate": 1e-06,
987
+ "logits/chosen": -3.0792293548583984,
988
+ "logits/rejected": -3.082854986190796,
989
+ "logps/chosen": -181.77552795410156,
990
+ "logps/rejected": -220.6517791748047,
991
+ "loss": 0.3918,
992
+ "rewards/accuracies": 0.75,
993
+ "rewards/chosen": 0.09326266497373581,
994
+ "rewards/margins": 0.3031250536441803,
995
+ "rewards/rejected": -0.20986239612102509,
996
+ "step": 47
997
+ },
998
+ {
999
+ "debug/policy_chosen_logits": -3.1799800395965576,
1000
+ "debug/policy_chosen_logps": -182.957763671875,
1001
+ "debug/policy_rejected_logits": -3.1335575580596924,
1002
+ "debug/policy_rejected_logps": -217.8682403564453,
1003
+ "debug/reference_chosen_logps": -194.94949340820312,
1004
+ "debug/reference_rejected_logps": -203.79934692382812,
1005
+ "epoch": 0.9230769230769231,
1006
+ "grad_norm": 6.73392677471291,
1007
+ "learning_rate": 1e-06,
1008
+ "logits/chosen": -3.1799800395965576,
1009
+ "logits/rejected": -3.1335575580596924,
1010
+ "logps/chosen": -182.957763671875,
1011
+ "logps/rejected": -217.8682403564453,
1012
+ "loss": 0.34,
1013
+ "rewards/accuracies": 1.0,
1014
+ "rewards/chosen": 0.11991731822490692,
1015
+ "rewards/margins": 0.26060622930526733,
1016
+ "rewards/rejected": -0.1406889110803604,
1017
+ "step": 48
1018
+ },
1019
+ {
1020
+ "debug/policy_chosen_logits": -3.0402865409851074,
1021
+ "debug/policy_chosen_logps": -204.64993286132812,
1022
+ "debug/policy_rejected_logits": -3.1806182861328125,
1023
+ "debug/policy_rejected_logps": -223.6003875732422,
1024
+ "debug/reference_chosen_logps": -214.6299285888672,
1025
+ "debug/reference_rejected_logps": -218.46998596191406,
1026
+ "epoch": 0.9423076923076923,
1027
+ "grad_norm": 5.290743823323874,
1028
+ "learning_rate": 1e-06,
1029
+ "logits/chosen": -3.0402865409851074,
1030
+ "logits/rejected": -3.1806182861328125,
1031
+ "logps/chosen": -204.64993286132812,
1032
+ "logps/rejected": -223.6003875732422,
1033
+ "loss": 0.3762,
1034
+ "rewards/accuracies": 0.625,
1035
+ "rewards/chosen": 0.09979984909296036,
1036
+ "rewards/margins": 0.1511038839817047,
1037
+ "rewards/rejected": -0.05130405351519585,
1038
+ "step": 49
1039
+ },
1040
+ {
1041
+ "debug/policy_chosen_logits": -3.2792398929595947,
1042
+ "debug/policy_chosen_logps": -172.821533203125,
1043
+ "debug/policy_rejected_logits": -3.199223756790161,
1044
+ "debug/policy_rejected_logps": -226.37725830078125,
1045
+ "debug/reference_chosen_logps": -185.41844177246094,
1046
+ "debug/reference_rejected_logps": -212.13818359375,
1047
+ "epoch": 0.9615384615384616,
1048
+ "grad_norm": 12.334225732861476,
1049
+ "learning_rate": 1e-06,
1050
+ "logits/chosen": -3.2792398929595947,
1051
+ "logits/rejected": -3.199223756790161,
1052
+ "logps/chosen": -172.821533203125,
1053
+ "logps/rejected": -226.37725830078125,
1054
+ "loss": 0.3587,
1055
+ "rewards/accuracies": 1.0,
1056
+ "rewards/chosen": 0.12596909701824188,
1057
+ "rewards/margins": 0.268359899520874,
1058
+ "rewards/rejected": -0.14239083230495453,
1059
+ "step": 50
1060
+ },
1061
+ {
1062
+ "debug/policy_chosen_logits": -3.1161139011383057,
1063
+ "debug/policy_chosen_logps": -183.79429626464844,
1064
+ "debug/policy_rejected_logits": -3.1612720489501953,
1065
+ "debug/policy_rejected_logps": -238.53965759277344,
1066
+ "debug/reference_chosen_logps": -192.13421630859375,
1067
+ "debug/reference_rejected_logps": -220.17738342285156,
1068
+ "epoch": 0.9807692307692307,
1069
+ "grad_norm": 6.871974967844878,
1070
+ "learning_rate": 1e-06,
1071
+ "logits/chosen": -3.1161139011383057,
1072
+ "logits/rejected": -3.1612720489501953,
1073
+ "logps/chosen": -183.79429626464844,
1074
+ "logps/rejected": -238.53965759277344,
1075
+ "loss": 0.3778,
1076
+ "rewards/accuracies": 0.875,
1077
+ "rewards/chosen": 0.0833992063999176,
1078
+ "rewards/margins": 0.26702186465263367,
1079
+ "rewards/rejected": -0.18362264335155487,
1080
+ "step": 51
1081
+ },
1082
+ {
1083
+ "debug/policy_chosen_logits": -3.1640007495880127,
1084
+ "debug/policy_chosen_logps": -184.35934448242188,
1085
+ "debug/policy_rejected_logits": -3.1118485927581787,
1086
+ "debug/policy_rejected_logps": -235.7113037109375,
1087
+ "debug/reference_chosen_logps": -200.53074645996094,
1088
+ "debug/reference_rejected_logps": -225.58468627929688,
1089
+ "epoch": 1.0,
1090
+ "grad_norm": 6.270849179735478,
1091
+ "learning_rate": 1e-06,
1092
+ "logits/chosen": -3.1640007495880127,
1093
+ "logits/rejected": -3.1118485927581787,
1094
+ "logps/chosen": -184.35934448242188,
1095
+ "logps/rejected": -235.7113037109375,
1096
+ "loss": 0.3221,
1097
+ "rewards/accuracies": 0.75,
1098
+ "rewards/chosen": 0.1617138683795929,
1099
+ "rewards/margins": 0.2629801034927368,
1100
+ "rewards/rejected": -0.10126623511314392,
1101
+ "step": 52
1102
+ },
1103
+ {
1104
+ "epoch": 1.0,
1105
+ "step": 52,
1106
+ "total_flos": 0.0,
1107
+ "train_loss": 0.43149175896094394,
1108
+ "train_runtime": 170.8012,
1109
+ "train_samples_per_second": 19.385,
1110
+ "train_steps_per_second": 0.304
1111
+ }
1112
+ ],
1113
+ "logging_steps": 1,
1114
+ "max_steps": 52,
1115
+ "num_input_tokens_seen": 0,
1116
+ "num_train_epochs": 1,
1117
+ "save_steps": 500,
1118
+ "stateful_callbacks": {
1119
+ "TrainerControl": {
1120
+ "args": {
1121
+ "should_epoch_stop": false,
1122
+ "should_evaluate": false,
1123
+ "should_log": false,
1124
+ "should_save": true,
1125
+ "should_training_stop": true
1126
+ },
1127
+ "attributes": {}
1128
+ }
1129
+ },
1130
+ "total_flos": 0.0,
1131
+ "train_batch_size": 8,
1132
+ "trial_name": null,
1133
+ "trial_params": null
1134
+ }
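
For readers of the log above: the `rewards/*` fields are the implicit DPO rewards, beta * (policy log-prob − reference log-prob), for the chosen and rejected completions. A small sanity check against step 13 of the log (a sketch; the beta value is inferred from the logged numbers, since it is not recorded in this commit):

```python
# beta is an inference: 0.01 reproduces the logged rewards at step 13; it is not stated in this commit.
beta = 0.01

policy_chosen, ref_chosen = -198.6640625, -202.70645141601562
policy_rejected, ref_rejected = -247.57131958007812, -239.8416290283203

rewards_chosen = beta * (policy_chosen - ref_chosen)        # ≈ 0.0404  (logged rewards/chosen: 0.0404)
rewards_rejected = beta * (policy_rejected - ref_rejected)  # ≈ -0.0773 (logged rewards/rejected: -0.0773)
margin = rewards_chosen - rewards_rejected                  # ≈ 0.1177  (logged rewards/margins: 0.1177)
print(rewards_chosen, rewards_rejected, margin)
```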