wyceee commited on
Commit
87b7688
·
verified ·
1 Parent(s): edcc348

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 1.1608919289756158e-05,
4
- "train_runtime": 2917.0763,
5
- "train_samples": 140,
6
- "train_samples_per_second": 0.548,
7
- "train_steps_per_second": 0.034
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 1.5718676149845123e-05,
4
+ "train_runtime": 4262.7362,
5
+ "train_samples": 48,
6
+ "train_samples_per_second": 0.375,
7
+ "train_steps_per_second": 0.023
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22c31451dc66d5752513620bc1dd4c04c566b0b1a06a85a1a797c79cd0cd56aa
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aa4f6bae2562b59ce5b4c50ed5ba43bb6c7a6490d3bdbfd840200784bb86db8
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27b7048459ff44f8eeaa9ed4b95162ad5a92c389ca99ed4a8f94a4af0012401f
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5d2dcd5763357799ad9114059b0028a01c6683f147409faa0f31aafbd95ece0
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 1.1608919289756158e-05,
4
- "train_runtime": 2917.0763,
5
- "train_samples": 140,
6
- "train_samples_per_second": 0.548,
7
- "train_steps_per_second": 0.034
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 1.5718676149845123e-05,
4
+ "train_runtime": 4262.7362,
5
+ "train_samples": 48,
6
+ "train_samples_per_second": 0.375,
7
+ "train_steps_per_second": 0.023
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.571428571428571,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
@@ -10,969 +10,969 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 286.78125,
14
- "epoch": 0.11428571428571428,
15
- "grad_norm": 2.276369333267212,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
- "loss": 0.0,
19
- "reward": 4.382780752843246,
20
- "reward_std": 2.351492334040813,
21
- "rewards/concensus_correctness_reward_func": 2.5035625000018626,
22
- "rewards/consensus_reward_func": 0.375,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.625,
25
- "rewards/question_recreation_reward_func": 0.5795618183910847,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.0,
28
- "rewards/xmlcount_reward_func": 0.2996562537737191,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 252.75,
33
- "epoch": 0.22857142857142856,
34
- "grad_norm": 2.464397430419922,
35
- "kl": 0.0005955380293016788,
36
  "learning_rate": 5e-07,
37
- "loss": -0.0,
38
- "reward": 3.0856650918722153,
39
- "reward_std": 0.7508293140563183,
40
- "rewards/concensus_correctness_reward_func": 1.2791875004186295,
41
- "rewards/consensus_reward_func": 0.125,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.5,
44
- "rewards/question_recreation_reward_func": 0.5572276492603123,
45
  "rewards/soft_format_reward_func": 0.0,
46
  "rewards/strict_format_reward_func": 0.015625,
47
- "rewards/xmlcount_reward_func": 0.6086250022053719,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 281.625,
52
- "epoch": 0.34285714285714286,
53
- "grad_norm": 2.6032321453094482,
54
- "kl": 0.0007702869843342341,
55
  "learning_rate": 4.994757065594279e-07,
56
  "loss": 0.0,
57
- "reward": 1.5275531343650073,
58
- "reward_std": 0.9591646681074053,
59
- "rewards/concensus_correctness_reward_func": 0.07531250268220901,
60
- "rewards/consensus_reward_func": 0.25,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.1875,
63
- "rewards/question_recreation_reward_func": 0.528365645557642,
64
  "rewards/soft_format_reward_func": 0.0,
65
  "rewards/strict_format_reward_func": 0.03125,
66
- "rewards/xmlcount_reward_func": 0.45512500917539,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 329.8125,
71
- "epoch": 0.45714285714285713,
72
- "grad_norm": 3.317532539367676,
73
- "kl": 0.000785435793659417,
74
  "learning_rate": 4.979050253066063e-07,
75
  "loss": 0.0,
76
- "reward": 5.570158363319933,
77
- "reward_std": 1.5328934689750895,
78
- "rewards/concensus_correctness_reward_func": 3.344875006005168,
79
- "rewards/consensus_reward_func": 0.625,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 0.75,
82
- "rewards/question_recreation_reward_func": 0.6130958534777164,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.046875,
85
- "rewards/xmlcount_reward_func": 0.19031249126419425,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 275.65625,
90
- "epoch": 0.5714285714285714,
91
- "grad_norm": 2.6608853340148926,
92
- "kl": 0.0007642239979759324,
93
  "learning_rate": 4.952945442245597e-07,
94
- "loss": -0.0,
95
- "reward": 4.114271555095911,
96
- "reward_std": 1.842139216605574,
97
- "rewards/concensus_correctness_reward_func": 2.100500001106411,
98
- "rewards/consensus_reward_func": 0.375,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.5,
101
- "rewards/question_recreation_reward_func": 0.471396672539413,
102
  "rewards/soft_format_reward_func": 0.0,
103
  "rewards/strict_format_reward_func": 0.015625,
104
- "rewards/xmlcount_reward_func": 0.6517500090412796,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 322.71875,
109
- "epoch": 0.6857142857142857,
110
- "grad_norm": 2.0212087631225586,
111
- "kl": 0.0009263881365768611,
112
  "learning_rate": 4.916552125781528e-07,
113
  "loss": 0.0,
114
- "reward": 3.2796499207615852,
115
- "reward_std": 2.9198597713839263,
116
- "rewards/concensus_correctness_reward_func": 1.3640624983236194,
117
- "rewards/consensus_reward_func": 0.3125,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.4375,
120
- "rewards/question_recreation_reward_func": 0.63699368853122,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.0,
123
- "rewards/xmlcount_reward_func": 0.5285937548615038,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 273.03125,
128
- "epoch": 0.8,
129
- "grad_norm": 2.2424488067626953,
130
- "kl": 0.0011271409348410089,
131
  "learning_rate": 4.870022949890676e-07,
132
  "loss": 0.0,
133
- "reward": 3.058269999921322,
134
- "reward_std": 2.4546356331557035,
135
- "rewards/concensus_correctness_reward_func": 1.011687501159031,
136
- "rewards/consensus_reward_func": 0.625,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.4375,
139
- "rewards/question_recreation_reward_func": 0.5196137428283691,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.0,
142
- "rewards/xmlcount_reward_func": 0.4644687445834279,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 271.21875,
147
- "epoch": 0.9142857142857143,
148
- "grad_norm": 3.093533515930176,
149
- "kl": 0.0015523226065852214,
150
  "learning_rate": 4.81355307410676e-07,
151
  "loss": 0.0,
152
- "reward": 1.6757539696991444,
153
- "reward_std": 0.8273154485941632,
154
- "rewards/concensus_correctness_reward_func": 0.14224999770522118,
155
- "rewards/consensus_reward_func": 0.0625,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.375,
158
- "rewards/question_recreation_reward_func": 0.562660242896527,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.015625,
161
- "rewards/xmlcount_reward_func": 0.5177187529043294,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 278.0416666666667,
166
- "epoch": 1.0,
167
- "grad_norm": 1.7071589231491089,
168
- "kl": 0.0019262856318770598,
169
  "learning_rate": 4.747379352713488e-07,
170
  "loss": 0.0,
171
- "reward": 2.2802133609851203,
172
- "reward_std": 1.2267065872438252,
173
- "rewards/concensus_correctness_reward_func": 0.23658333408335844,
174
- "rewards/consensus_reward_func": 0.3333333333333333,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.6666666666666666,
177
- "rewards/question_recreation_reward_func": 0.5471299886703491,
178
  "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.020833333333333332,
180
- "rewards/xmlcount_reward_func": 0.4756666754838079,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 283.15625,
185
- "epoch": 1.1142857142857143,
186
- "grad_norm": 2.894879102706909,
187
- "kl": 0.0032310091046383604,
188
  "learning_rate": 4.6717793412953776e-07,
189
  "loss": 0.0,
190
- "reward": 3.1169432625174522,
191
- "reward_std": 2.748181344475597,
192
- "rewards/concensus_correctness_reward_func": 1.3761250004172325,
193
- "rewards/consensus_reward_func": 0.25,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.5,
196
- "rewards/question_recreation_reward_func": 0.4786620208178647,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.0,
199
- "rewards/xmlcount_reward_func": 0.5121562429703772,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 262.40625,
204
- "epoch": 1.2285714285714286,
205
- "grad_norm": 2.271083116531372,
206
- "kl": 0.003482333617284894,
207
  "learning_rate": 4.5870701325731773e-07,
208
  "loss": 0.0,
209
- "reward": 3.1201551258563995,
210
- "reward_std": 1.7736396370455623,
211
- "rewards/concensus_correctness_reward_func": 1.0093125184066594,
212
- "rewards/consensus_reward_func": 0.3125,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.625,
215
- "rewards/question_recreation_reward_func": 0.6677489534486085,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.015625,
218
- "rewards/xmlcount_reward_func": 0.48996875435113907,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 281.71875,
223
- "epoch": 1.342857142857143,
224
- "grad_norm": 2.927028179168701,
225
- "kl": 0.004172757333435584,
226
  "learning_rate": 4.4936070264068016e-07,
227
  "loss": 0.0,
228
- "reward": 1.7949492074549198,
229
- "reward_std": 0.6816417840309441,
230
- "rewards/concensus_correctness_reward_func": 0.09593750163912773,
231
- "rewards/consensus_reward_func": 0.1875,
232
  "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 0.3125,
234
- "rewards/question_recreation_reward_func": 0.573636656627059,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.046875,
237
- "rewards/xmlcount_reward_func": 0.5785000016912818,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 240.0625,
242
- "epoch": 1.457142857142857,
243
- "grad_norm": 2.403028726577759,
244
- "kl": 0.0059904284426011145,
245
  "learning_rate": 4.391782039544238e-07,
246
  "loss": 0.0,
247
- "reward": 2.1940407678484917,
248
- "reward_std": 0.970807102508843,
249
- "rewards/concensus_correctness_reward_func": 0.160500000230968,
250
- "rewards/consensus_reward_func": 0.3125,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 0.375,
253
- "rewards/question_recreation_reward_func": 0.5637594992294908,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.03125,
256
- "rewards/xmlcount_reward_func": 0.7510312646627426,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 291.59375,
261
- "epoch": 1.5714285714285714,
262
- "grad_norm": 2.285961866378784,
263
- "kl": 0.005350364925106987,
264
  "learning_rate": 4.282022261367073e-07,
265
  "loss": 0.0,
266
- "reward": 1.8530698530375957,
267
- "reward_std": 0.817692642332986,
268
- "rewards/concensus_correctness_reward_func": 0.17443750286474824,
269
- "rewards/consensus_reward_func": 0.25,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.25,
272
- "rewards/question_recreation_reward_func": 0.5304135773330927,
273
- "rewards/soft_format_reward_func": 0.015625,
274
- "rewards/strict_format_reward_func": 0.0,
275
- "rewards/xmlcount_reward_func": 0.6325937514193356,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 279.8125,
280
- "epoch": 1.6857142857142857,
281
- "grad_norm": 2.314671754837036,
282
- "kl": 0.007266657623404171,
283
  "learning_rate": 4.1647880625292027e-07,
284
  "loss": 0.0,
285
- "reward": 4.273585867136717,
286
- "reward_std": 2.3483394776121713,
287
- "rewards/concensus_correctness_reward_func": 2.1670625028200448,
288
- "rewards/consensus_reward_func": 0.4375,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 0.6875,
291
- "rewards/question_recreation_reward_func": 0.479585743509233,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.015625,
294
- "rewards/xmlcount_reward_func": 0.4863125025294721,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 284.1875,
299
- "epoch": 1.8,
300
- "grad_norm": 2.3709328174591064,
301
- "kl": 0.0072359980258625,
302
  "learning_rate": 4.040571164002318e-07,
303
  "loss": 0.0,
304
- "reward": 2.3310193195939064,
305
- "reward_std": 1.6677571701875422,
306
- "rewards/concensus_correctness_reward_func": 0.6925625018775463,
307
- "rewards/consensus_reward_func": 0.1875,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 0.3125,
310
- "rewards/question_recreation_reward_func": 0.7243943102657795,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.0,
313
- "rewards/xmlcount_reward_func": 0.41406250291038305,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 304.84375,
318
- "epoch": 1.9142857142857141,
319
- "grad_norm": 2.7485358715057373,
320
- "kl": 0.007408478311845101,
321
  "learning_rate": 3.909892574627266e-07,
322
  "loss": 0.0,
323
- "reward": 2.814433066174388,
324
- "reward_std": 2.1739278100430965,
325
- "rewards/concensus_correctness_reward_func": 1.0306875060778111,
326
- "rewards/consensus_reward_func": 0.375,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.5,
329
- "rewards/question_recreation_reward_func": 0.6048393156379461,
330
  "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.0,
332
- "rewards/xmlcount_reward_func": 0.30390625447034836,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 329.5,
337
- "epoch": 2.0,
338
- "grad_norm": 1.5283843278884888,
339
- "kl": 0.008974243615133068,
340
  "learning_rate": 3.773300405821908e-07,
341
  "loss": 0.0,
342
- "reward": 5.211016138394673,
343
- "reward_std": 2.3310749906425676,
344
- "rewards/concensus_correctness_reward_func": 2.7943333238363266,
345
- "rewards/consensus_reward_func": 0.75,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.5833333333333334,
348
- "rewards/question_recreation_reward_func": 0.4819745300337672,
349
  "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.041666666666666664,
351
- "rewards/xmlcount_reward_func": 0.5597083357473215,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 306.46875,
356
- "epoch": 2.1142857142857143,
357
- "grad_norm": 2.286961555480957,
358
- "kl": 0.01197911988128908,
359
  "learning_rate": 3.6313675726113475e-07,
360
  "loss": 0.0,
361
- "reward": 2.252480274066329,
362
- "reward_std": 1.2174878492951393,
363
- "rewards/concensus_correctness_reward_func": 0.23100000014528632,
364
- "rewards/consensus_reward_func": 0.5625,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.25,
367
- "rewards/question_recreation_reward_func": 0.5501990381162614,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.078125,
370
- "rewards/xmlcount_reward_func": 0.5806562546640635,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 280.3125,
375
- "epoch": 2.2285714285714286,
376
- "grad_norm": 1.9673209190368652,
377
- "kl": 0.010138543642824516,
378
  "learning_rate": 3.484689390623218e-07,
379
  "loss": 0.0,
380
- "reward": 2.7239319160580635,
381
- "reward_std": 0.854443228803575,
382
- "rewards/concensus_correctness_reward_func": 1.2991249989718199,
383
- "rewards/consensus_reward_func": 0.1875,
384
  "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 0.1875,
386
- "rewards/question_recreation_reward_func": 0.6282755881547928,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.015625,
389
- "rewards/xmlcount_reward_func": 0.4059062581509352,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 258.71875,
394
- "epoch": 2.342857142857143,
395
- "grad_norm": 4.803712844848633,
396
- "kl": 0.01296805081074126,
397
  "learning_rate": 3.3338810791270517e-07,
398
  "loss": 0.0,
399
- "reward": 3.26824863627553,
400
- "reward_std": 0.7542198827723041,
401
- "rewards/concensus_correctness_reward_func": 1.3208750002086163,
402
- "rewards/consensus_reward_func": 0.25,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 0.5625,
405
- "rewards/question_recreation_reward_func": 0.4741549016907811,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.015625,
408
- "rewards/xmlcount_reward_func": 0.6450937520712614,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 297.875,
413
- "epoch": 2.4571428571428573,
414
- "grad_norm": 2.1548197269439697,
415
- "kl": 0.010247149417409673,
416
  "learning_rate": 3.179575180590857e-07,
417
  "loss": 0.0,
418
- "reward": 3.1603624299168587,
419
- "reward_std": 3.0717462142929435,
420
- "rewards/concensus_correctness_reward_func": 1.3996250029304065,
421
- "rewards/consensus_reward_func": 0.3125,
422
  "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.625,
424
- "rewards/question_recreation_reward_func": 0.432518573012203,
425
  "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.015625,
427
- "rewards/xmlcount_reward_func": 0.3750937534496188,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 291.1875,
432
- "epoch": 2.571428571428571,
433
- "grad_norm": 2.1110994815826416,
434
- "kl": 0.009947843587724492,
435
  "learning_rate": 3.022418907578188e-07,
436
  "loss": 0.0,
437
- "reward": 2.907536911778152,
438
- "reward_std": 2.789290243992582,
439
- "rewards/concensus_correctness_reward_func": 1.2540624998509884,
440
- "rewards/consensus_reward_func": 0.25,
441
  "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 0.375,
443
- "rewards/question_recreation_reward_func": 0.6130369543097913,
444
  "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.0,
446
- "rewards/xmlcount_reward_func": 0.4154375079087913,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 312.21875,
451
- "epoch": 2.685714285714286,
452
- "grad_norm": 2.2388100624084473,
453
- "kl": 0.012466569722164422,
454
  "learning_rate": 2.863071428113726e-07,
455
  "loss": 0.0,
456
- "reward": 3.700060784816742,
457
- "reward_std": 1.4210271453484893,
458
- "rewards/concensus_correctness_reward_func": 1.5416875053197145,
459
- "rewards/consensus_reward_func": 0.4375,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 0.5,
462
- "rewards/question_recreation_reward_func": 0.5754670230671763,
463
  "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.015625,
465
- "rewards/xmlcount_reward_func": 0.6297812489792705,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 330.78125,
470
- "epoch": 2.8,
471
- "grad_norm": 3.0799405574798584,
472
- "kl": 0.013010555092478171,
473
  "learning_rate": 2.7022011009035107e-07,
474
  "loss": 0.0,
475
- "reward": 2.5463776104152203,
476
- "reward_std": 1.2827392800245434,
477
- "rewards/concensus_correctness_reward_func": 0.39993750234134495,
478
- "rewards/consensus_reward_func": 0.625,
479
  "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 0.375,
481
- "rewards/question_recreation_reward_func": 0.6346588416490704,
482
  "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.0625,
484
- "rewards/xmlcount_reward_func": 0.44928124244324863,
485
  "step": 50
486
  },
487
  {
488
- "completion_length": 258.96875,
489
- "epoch": 2.914285714285714,
490
- "grad_norm": 3.135465383529663,
491
- "kl": 0.017136874987045303,
492
  "learning_rate": 2.540482672006254e-07,
493
  "loss": 0.0,
494
- "reward": 5.302450358867645,
495
- "reward_std": 4.525662333006039,
496
- "rewards/concensus_correctness_reward_func": 2.898500017821789,
497
- "rewards/consensus_reward_func": 0.5,
498
  "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 0.8125,
500
- "rewards/question_recreation_reward_func": 0.49360665678977966,
501
  "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.03125,
503
- "rewards/xmlcount_reward_func": 0.5665937489829957,
504
  "step": 52
505
  },
506
  {
507
- "completion_length": 257.5416666666667,
508
- "epoch": 3.0,
509
- "grad_norm": 1.707671880722046,
510
- "kl": 0.016812068953489263,
511
  "learning_rate": 2.37859444471388e-07,
512
  "loss": 0.0,
513
- "reward": 2.3912815153598785,
514
- "reward_std": 1.0656018268394594,
515
- "rewards/concensus_correctness_reward_func": 0.21558333188295364,
516
- "rewards/consensus_reward_func": 0.25,
517
  "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 0.75,
519
- "rewards/question_recreation_reward_func": 0.5453648641705513,
520
  "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.020833333333333332,
522
- "rewards/xmlcount_reward_func": 0.6095000064621369,
523
  "step": 54
524
  },
525
  {
526
- "completion_length": 333.34375,
527
- "epoch": 3.1142857142857143,
528
- "grad_norm": 1.6752287149429321,
529
- "kl": 0.013997640533489175,
530
  "learning_rate": 2.2172154345117894e-07,
531
  "loss": 0.0,
532
- "reward": 5.073978617787361,
533
- "reward_std": 1.2313855392858386,
534
- "rewards/concensus_correctness_reward_func": 2.745937497355044,
535
- "rewards/consensus_reward_func": 0.4375,
536
  "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 0.625,
538
- "rewards/question_recreation_reward_func": 0.590072276070714,
539
  "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.015625,
541
- "rewards/xmlcount_reward_func": 0.6598437521606684,
542
  "step": 56
543
  },
544
  {
545
- "completion_length": 278.4375,
546
- "epoch": 3.2285714285714286,
547
- "grad_norm": 2.5768773555755615,
548
- "kl": 0.017602251144126058,
549
  "learning_rate": 2.0570225210519433e-07,
550
  "loss": 0.0,
551
- "reward": 2.7841400876641273,
552
- "reward_std": 1.728901147376746,
553
- "rewards/concensus_correctness_reward_func": 0.8728750348091125,
554
- "rewards/consensus_reward_func": 0.125,
555
  "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 0.5,
557
- "rewards/question_recreation_reward_func": 0.5445462940260768,
558
  "rewards/soft_format_reward_func": 0.0,
559
  "rewards/strict_format_reward_func": 0.015625,
560
- "rewards/xmlcount_reward_func": 0.7260937541723251,
561
  "step": 58
562
  },
563
  {
564
- "completion_length": 306.53125,
565
- "epoch": 3.342857142857143,
566
- "grad_norm": 2.259335517883301,
567
- "kl": 0.015203700924757868,
568
  "learning_rate": 1.8986876090843664e-07,
569
  "loss": 0.0,
570
- "reward": 3.736787687987089,
571
- "reward_std": 1.3046259782277048,
572
- "rewards/concensus_correctness_reward_func": 1.5642499991226941,
573
- "rewards/consensus_reward_func": 0.625,
574
  "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 0.5,
576
- "rewards/question_recreation_reward_func": 0.5504127382300794,
577
  "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.0,
579
- "rewards/xmlcount_reward_func": 0.497125007212162,
580
  "step": 60
581
  },
582
  {
583
- "completion_length": 304.9375,
584
- "epoch": 3.4571428571428573,
585
- "grad_norm": 2.193737268447876,
586
- "kl": 0.0159946876228787,
587
  "learning_rate": 1.7428748102551234e-07,
588
  "loss": 0.0,
589
- "reward": 2.9109396040439606,
590
- "reward_std": 2.4176445261691697,
591
- "rewards/concensus_correctness_reward_func": 0.8153750021010637,
592
- "rewards/consensus_reward_func": 0.5,
593
  "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 0.4375,
595
- "rewards/question_recreation_reward_func": 0.6774083143100142,
596
  "rewards/soft_format_reward_func": 0.0,
597
  "rewards/strict_format_reward_func": 0.046875,
598
- "rewards/xmlcount_reward_func": 0.43378125317394733,
599
  "step": 62
600
  },
601
  {
602
- "completion_length": 239.0625,
603
- "epoch": 3.571428571428571,
604
- "grad_norm": 2.8111119270324707,
605
- "kl": 0.019410541572142392,
606
  "learning_rate": 1.5902376575912814e-07,
607
  "loss": 0.0,
608
- "reward": 3.376887336373329,
609
- "reward_std": 2.5654359097243287,
610
- "rewards/concensus_correctness_reward_func": 1.3211875017732382,
611
- "rewards/consensus_reward_func": 0.3125,
612
  "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 0.5,
614
- "rewards/question_recreation_reward_func": 0.5570748917525634,
615
  "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.015625,
617
- "rewards/xmlcount_reward_func": 0.6705000004731119,
618
  "step": 64
619
  },
620
  {
621
- "completion_length": 290.0,
622
- "epoch": 3.685714285714286,
623
- "grad_norm": 2.42885160446167,
624
- "kl": 0.020226228778483346,
625
  "learning_rate": 1.4414163643562753e-07,
626
  "loss": 0.0,
627
- "reward": 2.901755426079035,
628
- "reward_std": 1.9001993554411456,
629
- "rewards/concensus_correctness_reward_func": 0.9014999968931079,
630
- "rewards/consensus_reward_func": 0.5,
631
  "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 0.3125,
633
- "rewards/question_recreation_reward_func": 0.530786651186645,
634
- "rewards/soft_format_reward_func": 0.0,
635
- "rewards/strict_format_reward_func": 0.046875,
636
- "rewards/xmlcount_reward_func": 0.6100937562296167,
637
  "step": 66
638
  },
639
  {
640
- "completion_length": 291.90625,
641
- "epoch": 3.8,
642
- "grad_norm": 2.7818875312805176,
643
- "kl": 0.015350552916061133,
644
  "learning_rate": 1.2970351387729872e-07,
645
  "loss": 0.0,
646
- "reward": 3.8678977601230145,
647
- "reward_std": 3.3767369369015796,
648
- "rewards/concensus_correctness_reward_func": 1.9805624762084335,
649
- "rewards/consensus_reward_func": 0.5625,
650
  "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 0.375,
652
- "rewards/question_recreation_reward_func": 0.5122728096321225,
653
  "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.03125,
655
- "rewards/xmlcount_reward_func": 0.40631250059232116,
656
  "step": 68
657
  },
658
  {
659
- "completion_length": 260.375,
660
- "epoch": 3.914285714285714,
661
- "grad_norm": 2.0774481296539307,
662
- "kl": 0.018419178784824908,
663
  "learning_rate": 1.1576995658775404e-07,
664
  "loss": 0.0,
665
- "reward": 2.9608126636594534,
666
- "reward_std": 2.178357046446763,
667
- "rewards/concensus_correctness_reward_func": 0.8325625006109476,
668
- "rewards/consensus_reward_func": 0.5625,
669
  "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 0.5,
671
- "rewards/question_recreation_reward_func": 0.49425020324997604,
672
- "rewards/soft_format_reward_func": 0.0,
673
- "rewards/strict_format_reward_func": 0.03125,
674
- "rewards/xmlcount_reward_func": 0.5402499944902956,
675
  "step": 70
676
  },
677
  {
678
- "completion_length": 297.1666666666667,
679
- "epoch": 4.0,
680
- "grad_norm": 1.390352487564087,
681
- "kl": 0.015421232635465762,
682
  "learning_rate": 1.0239940674851941e-07,
683
  "loss": 0.0,
684
- "reward": 2.524449576934179,
685
- "reward_std": 1.8518620263785124,
686
- "rewards/concensus_correctness_reward_func": 0.8333333333333334,
687
- "rewards/consensus_reward_func": 0.0,
688
  "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 0.75,
690
- "rewards/question_recreation_reward_func": 0.5952413380146027,
691
  "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.0,
693
- "rewards/xmlcount_reward_func": 0.34587500027070445,
694
  "step": 72
695
  },
696
  {
697
- "completion_length": 251.84375,
698
- "epoch": 4.114285714285714,
699
- "grad_norm": 2.091481924057007,
700
- "kl": 0.01848960021743551,
701
  "learning_rate": 8.964794509221507e-08,
702
  "loss": 0.0,
703
- "reward": 3.833787925541401,
704
- "reward_std": 3.6238025003112853,
705
- "rewards/concensus_correctness_reward_func": 1.9853749983012676,
706
- "rewards/consensus_reward_func": 0.25,
707
  "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 0.5625,
709
- "rewards/question_recreation_reward_func": 0.4506316084880382,
710
  "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.03125,
712
- "rewards/xmlcount_reward_func": 0.5540312556549907,
713
  "step": 74
714
  },
715
  {
716
- "completion_length": 289.90625,
717
- "epoch": 4.228571428571429,
718
- "grad_norm": 2.230874538421631,
719
- "kl": 0.01755688912817277,
720
  "learning_rate": 7.756905568047392e-08,
721
  "loss": 0.0,
722
- "reward": 4.188707336783409,
723
- "reward_std": 3.5413391031324863,
724
- "rewards/concensus_correctness_reward_func": 2.0821875180117786,
725
  "rewards/consensus_reward_func": 0.5,
726
  "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 0.4375,
728
- "rewards/question_recreation_reward_func": 0.5773635879158974,
729
  "rewards/soft_format_reward_func": 0.0,
730
  "rewards/strict_format_reward_func": 0.015625,
731
- "rewards/xmlcount_reward_func": 0.5760312499478459,
732
  "step": 76
733
  },
734
  {
735
- "completion_length": 292.03125,
736
- "epoch": 4.3428571428571425,
737
- "grad_norm": 2.0806541442871094,
738
- "kl": 0.019203898555133492,
739
  "learning_rate": 6.621340157319996e-08,
740
  "loss": 0.0,
741
- "reward": 4.9116186555475,
742
- "reward_std": 2.788472333922982,
743
- "rewards/concensus_correctness_reward_func": 2.791312505491078,
744
- "rewards/consensus_reward_func": 0.4375,
745
  "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 0.5625,
747
- "rewards/question_recreation_reward_func": 0.5585249215364456,
748
  "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.015625,
750
- "rewards/xmlcount_reward_func": 0.5461562548298389,
751
  "step": 78
752
  },
753
  {
754
- "completion_length": 286.4375,
755
- "epoch": 4.457142857142857,
756
- "grad_norm": 2.314354181289673,
757
- "kl": 0.020370854035718367,
758
  "learning_rate": 5.5628612330087724e-08,
759
  "loss": 0.0,
760
- "reward": 3.2957633957266808,
761
- "reward_std": 1.2075208893074887,
762
- "rewards/concensus_correctness_reward_func": 0.5571249965578318,
763
- "rewards/consensus_reward_func": 0.5625,
764
  "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 0.8125,
766
- "rewards/question_recreation_reward_func": 0.46320093097165227,
767
  "rewards/soft_format_reward_func": 0.0,
768
  "rewards/strict_format_reward_func": 0.0625,
769
- "rewards/xmlcount_reward_func": 0.8379375012591481,
770
  "step": 80
771
  },
772
  {
773
- "completion_length": 320.75,
774
- "epoch": 4.571428571428571,
775
- "grad_norm": 1.8077600002288818,
776
- "kl": 0.01631612313212827,
777
  "learning_rate": 4.5859084235697235e-08,
778
  "loss": 0.0,
779
- "reward": 1.9219548534601927,
780
- "reward_std": 0.7796940823318437,
781
- "rewards/concensus_correctness_reward_func": 0.15187500603497028,
782
  "rewards/consensus_reward_func": 0.375,
783
  "rewards/cumulative_reward_2": 0.0,
784
  "rewards/final_correctness_reward_func": 0.3125,
785
- "rewards/question_recreation_reward_func": 0.5343610807321966,
786
  "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.03125,
788
- "rewards/xmlcount_reward_func": 0.5169687578454614,
789
  "step": 82
790
  },
791
  {
792
- "completion_length": 272.25,
793
- "epoch": 4.685714285714286,
794
- "grad_norm": 2.141181707382202,
795
- "kl": 0.017336227814666927,
796
  "learning_rate": 3.6945794086007705e-08,
797
  "loss": 0.0,
798
- "reward": 4.1498072892427444,
799
- "reward_std": 3.9383774576708674,
800
- "rewards/concensus_correctness_reward_func": 2.1485625030472875,
801
- "rewards/consensus_reward_func": 0.6875,
802
  "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 0.5,
804
- "rewards/question_recreation_reward_func": 0.5559322023764253,
805
- "rewards/soft_format_reward_func": 0.0,
806
  "rewards/strict_format_reward_func": 0.015625,
807
- "rewards/xmlcount_reward_func": 0.24218749906867743,
808
  "step": 84
809
  },
810
  {
811
- "completion_length": 263.3125,
812
- "epoch": 4.8,
813
- "grad_norm": 3.1173408031463623,
814
- "kl": 0.022343544085742906,
815
  "learning_rate": 2.892612731749414e-08,
816
  "loss": 0.0,
817
- "reward": 4.092738252133131,
818
- "reward_std": 1.894249181394116,
819
- "rewards/concensus_correctness_reward_func": 2.0336249992251396,
820
- "rewards/consensus_reward_func": 0.375,
821
  "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 0.5,
823
- "rewards/question_recreation_reward_func": 0.5459569627419114,
824
  "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.0625,
826
- "rewards/xmlcount_reward_func": 0.5756562538444996,
827
  "step": 86
828
  },
829
  {
830
- "completion_length": 257.625,
831
- "epoch": 4.914285714285715,
832
- "grad_norm": 2.271671772003174,
833
- "kl": 0.019497435074299574,
834
  "learning_rate": 2.183372119961499e-08,
835
  "loss": 0.0,
836
- "reward": 2.9152057096362114,
837
- "reward_std": 2.1247581483912654,
838
- "rewards/concensus_correctness_reward_func": 0.7478749994188547,
839
- "rewards/consensus_reward_func": 0.3125,
840
  "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 0.5,
842
- "rewards/question_recreation_reward_func": 0.5662681483663619,
843
- "rewards/soft_format_reward_func": 0.0,
844
- "rewards/strict_format_reward_func": 0.015625,
845
- "rewards/xmlcount_reward_func": 0.7729375008493662,
846
  "step": 88
847
  },
848
  {
849
- "completion_length": 295.5833333333333,
850
- "epoch": 5.0,
851
- "grad_norm": 2.0402109622955322,
852
- "kl": 0.023389367774749797,
853
  "learning_rate": 1.5698323748414122e-08,
854
  "loss": 0.0,
855
- "reward": 4.139378140370051,
856
- "reward_std": 2.417986176908016,
857
- "rewards/concensus_correctness_reward_func": 1.5169999947150548,
858
- "rewards/consensus_reward_func": 0.8333333333333334,
859
  "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 0.5,
861
- "rewards/question_recreation_reward_func": 0.628961397955815,
862
  "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.0,
864
- "rewards/xmlcount_reward_func": 0.660083344206214,
865
  "step": 90
866
  },
867
  {
868
- "completion_length": 284.21875,
869
- "epoch": 5.114285714285714,
870
- "grad_norm": 2.9972126483917236,
871
- "kl": 0.020673706487286836,
872
  "learning_rate": 1.054566895300324e-08,
873
  "loss": 0.0,
874
- "reward": 5.346356093883514,
875
- "reward_std": 3.1238869512453675,
876
- "rewards/concensus_correctness_reward_func": 2.7256875003222376,
877
  "rewards/consensus_reward_func": 0.625,
878
  "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 0.8125,
880
- "rewards/question_recreation_reward_func": 0.5663561150431633,
881
  "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.0,
883
- "rewards/xmlcount_reward_func": 0.6168124983087182,
884
  "step": 92
885
  },
886
  {
887
- "completion_length": 320.9375,
888
- "epoch": 5.228571428571429,
889
- "grad_norm": 2.07313871383667,
890
- "kl": 0.015393795300042257,
891
  "learning_rate": 6.397368838268496e-09,
892
  "loss": 0.0,
893
- "reward": 4.471762063913047,
894
- "reward_std": 2.338888173457235,
895
- "rewards/concensus_correctness_reward_func": 2.6311249914579093,
896
- "rewards/consensus_reward_func": 0.3125,
897
  "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 0.4375,
899
- "rewards/question_recreation_reward_func": 0.6448872559703887,
900
  "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.0,
902
- "rewards/xmlcount_reward_func": 0.44575000391341746,
903
  "step": 94
904
  },
905
  {
906
- "completion_length": 299.75,
907
- "epoch": 5.3428571428571425,
908
- "grad_norm": 1.8763506412506104,
909
- "kl": 0.019261252775322646,
910
  "learning_rate": 3.2708228165273244e-09,
911
  "loss": 0.0,
912
- "reward": 4.042540363967419,
913
- "reward_std": 1.5998169761151075,
914
- "rewards/concensus_correctness_reward_func": 2.1019999987911433,
915
- "rewards/consensus_reward_func": 0.4375,
916
  "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 0.4375,
918
- "rewards/question_recreation_reward_func": 0.602071626111865,
919
  "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.015625,
921
- "rewards/xmlcount_reward_func": 0.4478437500074506,
922
  "step": 96
923
  },
924
  {
925
- "completion_length": 261.5625,
926
- "epoch": 5.457142857142857,
927
- "grad_norm": 2.159895420074463,
928
- "kl": 0.023489647486712784,
929
  "learning_rate": 1.1791447083465133e-09,
930
  "loss": 0.0,
931
- "reward": 2.6311225779354572,
932
- "reward_std": 1.2659746335702948,
933
- "rewards/concensus_correctness_reward_func": 0.3871250026859343,
934
- "rewards/consensus_reward_func": 0.4375,
935
  "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 0.5,
937
- "rewards/question_recreation_reward_func": 0.6012788028456271,
938
  "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.015625,
940
- "rewards/xmlcount_reward_func": 0.6895937495864928,
941
  "step": 98
942
  },
943
  {
944
- "completion_length": 256.0,
945
- "epoch": 5.571428571428571,
946
- "grad_norm": 2.200911045074463,
947
- "kl": 0.0240181177505292,
948
  "learning_rate": 1.3110773862126667e-10,
949
  "loss": 0.0,
950
- "reward": 4.581089836545289,
951
- "reward_std": 4.262082199566066,
952
- "rewards/concensus_correctness_reward_func": 2.612312503159046,
953
- "rewards/consensus_reward_func": 0.1875,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 0.625,
956
- "rewards/question_recreation_reward_func": 0.5605585183948278,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.03125,
959
- "rewards/xmlcount_reward_func": 0.5644687572494149,
960
  "step": 100
961
  },
962
  {
963
- "epoch": 5.571428571428571,
964
  "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 1.1608919289756158e-05,
967
- "train_runtime": 2917.0763,
968
- "train_samples_per_second": 0.548,
969
- "train_steps_per_second": 0.034
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
- "num_train_epochs": 6,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 16.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 227.8125,
14
+ "epoch": 0.3333333333333333,
15
+ "grad_norm": 6.339727878570557,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
+ "loss": -0.0,
19
+ "reward": 1.5849424004554749,
20
+ "reward_std": 1.131471099331975,
21
+ "rewards/concensus_correctness_reward_func": 0.12262500077486038,
22
+ "rewards/consensus_reward_func": 0.5625,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.0625,
25
+ "rewards/question_recreation_reward_func": 0.46509863436222076,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.015625,
28
+ "rewards/xmlcount_reward_func": 0.35659374902024865,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 287.125,
33
+ "epoch": 0.6666666666666666,
34
+ "grad_norm": 2.283590793609619,
35
+ "kl": 0.000743766751838848,
36
  "learning_rate": 5e-07,
37
+ "loss": 0.0,
38
+ "reward": 2.0471834875643253,
39
+ "reward_std": 0.9305327897891402,
40
+ "rewards/concensus_correctness_reward_func": 0.0,
41
+ "rewards/consensus_reward_func": 0.625,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.3125,
44
+ "rewards/question_recreation_reward_func": 0.6308709642617032,
45
  "rewards/soft_format_reward_func": 0.0,
46
  "rewards/strict_format_reward_func": 0.015625,
47
+ "rewards/xmlcount_reward_func": 0.46318750735372305,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 286.03125,
52
+ "epoch": 1.0,
53
+ "grad_norm": 2.7924423217773438,
54
+ "kl": 0.0008395667373406468,
55
  "learning_rate": 4.994757065594279e-07,
56
  "loss": 0.0,
57
+ "reward": 1.7317106202244759,
58
+ "reward_std": 1.3122332114144228,
59
+ "rewards/concensus_correctness_reward_func": 0.12025000154972076,
60
+ "rewards/consensus_reward_func": 0.75,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.125,
63
+ "rewards/question_recreation_reward_func": 0.4853668725118041,
64
  "rewards/soft_format_reward_func": 0.0,
65
  "rewards/strict_format_reward_func": 0.03125,
66
+ "rewards/xmlcount_reward_func": 0.21984374802559614,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 289.90625,
71
+ "epoch": 1.3333333333333333,
72
+ "grad_norm": 2.7986364364624023,
73
+ "kl": 0.0008700692887941841,
74
  "learning_rate": 4.979050253066063e-07,
75
  "loss": 0.0,
76
+ "reward": 1.865300026256591,
77
+ "reward_std": 0.8366383262909949,
78
+ "rewards/concensus_correctness_reward_func": 0.06006250157952309,
79
+ "rewards/consensus_reward_func": 0.6875,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.0625,
82
+ "rewards/question_recreation_reward_func": 0.6006125509738922,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.015625,
85
+ "rewards/xmlcount_reward_func": 0.4389999993145466,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 224.3125,
90
+ "epoch": 1.6666666666666665,
91
+ "grad_norm": 3.4794700145721436,
92
+ "kl": 0.0009347902014269494,
93
  "learning_rate": 4.952945442245597e-07,
94
+ "loss": 0.0,
95
+ "reward": 1.8756496086716652,
96
+ "reward_std": 1.0868838177993894,
97
+ "rewards/concensus_correctness_reward_func": 0.08568750135600567,
98
+ "rewards/consensus_reward_func": 0.625,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.1875,
101
+ "rewards/question_recreation_reward_func": 0.4599620746448636,
102
  "rewards/soft_format_reward_func": 0.0,
103
  "rewards/strict_format_reward_func": 0.015625,
104
+ "rewards/xmlcount_reward_func": 0.5018750056624413,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 296.28125,
109
+ "epoch": 2.0,
110
+ "grad_norm": 3.0765440464019775,
111
+ "kl": 0.0012358422500255983,
112
  "learning_rate": 4.916552125781528e-07,
113
  "loss": 0.0,
114
+ "reward": 1.9579493142664433,
115
+ "reward_std": 0.8908882063115016,
116
+ "rewards/concensus_correctness_reward_func": 0.12025000154972076,
117
+ "rewards/consensus_reward_func": 0.6875,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.1875,
120
+ "rewards/question_recreation_reward_func": 0.465324345510453,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.015625,
123
+ "rewards/xmlcount_reward_func": 0.4817500030621886,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 280.90625,
128
+ "epoch": 2.3333333333333335,
129
+ "grad_norm": 2.9266109466552734,
130
+ "kl": 0.0017369047309330199,
131
  "learning_rate": 4.870022949890676e-07,
132
  "loss": 0.0,
133
+ "reward": 1.4219924416393042,
134
+ "reward_std": 0.8281801359262317,
135
+ "rewards/concensus_correctness_reward_func": 0.0,
136
+ "rewards/consensus_reward_func": 0.4375,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.125,
139
+ "rewards/question_recreation_reward_func": 0.450836188509129,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.0625,
142
+ "rewards/xmlcount_reward_func": 0.34615624509751797,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 243.0625,
147
+ "epoch": 2.6666666666666665,
148
+ "grad_norm": 3.0913712978363037,
149
+ "kl": 0.0019520393034326844,
150
  "learning_rate": 4.81355307410676e-07,
151
  "loss": 0.0,
152
+ "reward": 2.2805784731172025,
153
+ "reward_std": 1.2899235817894805,
154
+ "rewards/concensus_correctness_reward_func": 0.24512499943375587,
155
+ "rewards/consensus_reward_func": 0.9375,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 0.125,
158
+ "rewards/question_recreation_reward_func": 0.47473472240380943,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.046875,
161
+ "rewards/xmlcount_reward_func": 0.451343753375113,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 267.40625,
166
+ "epoch": 3.0,
167
+ "grad_norm": 3.0766594409942627,
168
+ "kl": 0.0020293756315368228,
169
  "learning_rate": 4.747379352713488e-07,
170
  "loss": 0.0,
171
+ "reward": 1.9329118076711893,
172
+ "reward_std": 1.2741229943931103,
173
+ "rewards/concensus_correctness_reward_func": 0.12256250157952309,
174
+ "rewards/consensus_reward_func": 0.6875,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.25,
177
+ "rewards/question_recreation_reward_func": 0.619443034986034,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.015625,
180
+ "rewards/xmlcount_reward_func": 0.23778124805539846,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 236.09375,
185
+ "epoch": 3.3333333333333335,
186
+ "grad_norm": 2.809358596801758,
187
+ "kl": 0.0026170795026700944,
188
  "learning_rate": 4.6717793412953776e-07,
189
  "loss": 0.0,
190
+ "reward": 1.8976972922682762,
191
+ "reward_std": 1.0421031441655941,
192
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
193
+ "rewards/consensus_reward_func": 0.625,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.125,
196
+ "rewards/question_recreation_reward_func": 0.5437285574153066,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.03125,
199
+ "rewards/xmlcount_reward_func": 0.512593756429851,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 265.0625,
204
+ "epoch": 3.6666666666666665,
205
+ "grad_norm": 3.7295589447021484,
206
+ "kl": 0.003362223884323612,
207
  "learning_rate": 4.5870701325731773e-07,
208
  "loss": 0.0,
209
+ "reward": 1.9837484806776047,
210
+ "reward_std": 0.753805372864008,
211
+ "rewards/concensus_correctness_reward_func": 0.11993750184774399,
212
+ "rewards/consensus_reward_func": 0.8125,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.125,
215
+ "rewards/question_recreation_reward_func": 0.4247172431787476,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.0,
218
+ "rewards/xmlcount_reward_func": 0.5015937560237944,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 237.6875,
223
+ "epoch": 4.0,
224
+ "grad_norm": 2.7614455223083496,
225
+ "kl": 0.0042717494507087395,
226
  "learning_rate": 4.4936070264068016e-07,
227
  "loss": 0.0,
228
+ "reward": 1.828284303745022,
229
+ "reward_std": 0.7124169690505369,
230
+ "rewards/concensus_correctness_reward_func": 0.05743750184774399,
231
+ "rewards/consensus_reward_func": 0.375,
232
  "rewards/cumulative_reward_2": 0.0,
233
+ "rewards/final_correctness_reward_func": 0.125,
234
+ "rewards/question_recreation_reward_func": 0.7025655592733528,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.0625,
237
+ "rewards/xmlcount_reward_func": 0.5057812501909211,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 296.71875,
242
+ "epoch": 4.333333333333333,
243
+ "grad_norm": 3.3926703929901123,
244
+ "kl": 0.0044053339806851,
245
  "learning_rate": 4.391782039544238e-07,
246
  "loss": 0.0,
247
+ "reward": 1.3961677476763725,
248
+ "reward_std": 1.0368912005797029,
249
+ "rewards/concensus_correctness_reward_func": 0.024687500670552254,
250
+ "rewards/consensus_reward_func": 0.5625,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.0625,
253
+ "rewards/question_recreation_reward_func": 0.46885524597018957,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.046875,
256
+ "rewards/xmlcount_reward_func": 0.230750004760921,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 208.375,
261
+ "epoch": 4.666666666666667,
262
+ "grad_norm": 3.8391273021698,
263
+ "kl": 0.005014390684664249,
264
  "learning_rate": 4.282022261367073e-07,
265
  "loss": 0.0,
266
+ "reward": 1.6386928837746382,
267
+ "reward_std": 0.5791225910652429,
268
+ "rewards/concensus_correctness_reward_func": 0.125,
269
+ "rewards/consensus_reward_func": 0.5,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.0,
272
+ "rewards/question_recreation_reward_func": 0.5378491813316941,
273
+ "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.03125,
275
+ "rewards/xmlcount_reward_func": 0.44459374831058085,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 285.625,
280
+ "epoch": 5.0,
281
+ "grad_norm": 3.2744433879852295,
282
+ "kl": 0.004670602691476233,
283
  "learning_rate": 4.1647880625292027e-07,
284
  "loss": 0.0,
285
+ "reward": 1.5773469675332308,
286
+ "reward_std": 0.8148748113308102,
287
+ "rewards/concensus_correctness_reward_func": 0.0,
288
+ "rewards/consensus_reward_func": 0.5625,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 0.0625,
291
+ "rewards/question_recreation_reward_func": 0.5454094994347543,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.03125,
294
+ "rewards/xmlcount_reward_func": 0.3756875009275973,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 265.375,
299
+ "epoch": 5.333333333333333,
300
+ "grad_norm": 3.7803711891174316,
301
+ "kl": 0.0071189729642355815,
302
  "learning_rate": 4.040571164002318e-07,
303
  "loss": 0.0,
304
+ "reward": 1.915034051053226,
305
+ "reward_std": 1.0130134378559887,
306
+ "rewards/concensus_correctness_reward_func": 0.0,
307
+ "rewards/consensus_reward_func": 0.75,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 0.125,
310
+ "rewards/question_recreation_reward_func": 0.5950653096660972,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.015625,
313
+ "rewards/xmlcount_reward_func": 0.42934375285403803,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 257.96875,
318
+ "epoch": 5.666666666666667,
319
+ "grad_norm": 2.1448609828948975,
320
+ "kl": 0.006875298960949294,
321
  "learning_rate": 3.909892574627266e-07,
322
  "loss": 0.0,
323
+ "reward": 2.0118120573461056,
324
+ "reward_std": 1.1375641755294055,
325
+ "rewards/concensus_correctness_reward_func": 0.0625,
326
+ "rewards/consensus_reward_func": 0.5,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.4375,
329
+ "rewards/question_recreation_reward_func": 0.4547183125978336,
330
  "rewards/soft_format_reward_func": 0.0,
331
+ "rewards/strict_format_reward_func": 0.015625,
332
+ "rewards/xmlcount_reward_func": 0.5414687562733889,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 228.03125,
337
+ "epoch": 6.0,
338
+ "grad_norm": 4.327014923095703,
339
+ "kl": 0.010515401692828164,
340
  "learning_rate": 3.773300405821908e-07,
341
  "loss": 0.0,
342
+ "reward": 2.3596832640469074,
343
+ "reward_std": 0.8677473589777946,
344
+ "rewards/concensus_correctness_reward_func": 0.29768750071525574,
345
+ "rewards/consensus_reward_func": 0.625,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 0.125,
348
+ "rewards/question_recreation_reward_func": 0.5928395111113787,
349
  "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.0625,
351
+ "rewards/xmlcount_reward_func": 0.6566562494263053,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 255.125,
356
+ "epoch": 6.333333333333333,
357
+ "grad_norm": 3.2016897201538086,
358
+ "kl": 0.010390633338829502,
359
  "learning_rate": 3.6313675726113475e-07,
360
  "loss": 0.0,
361
+ "reward": 2.1920023262500763,
362
+ "reward_std": 1.4953037183731794,
363
+ "rewards/concensus_correctness_reward_func": 0.18418750166893005,
364
+ "rewards/consensus_reward_func": 0.75,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.3125,
367
+ "rewards/question_recreation_reward_func": 0.5606585624627769,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.015625,
370
+ "rewards/xmlcount_reward_func": 0.3690312569960952,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 255.21875,
375
+ "epoch": 6.666666666666667,
376
+ "grad_norm": 2.821410894393921,
377
+ "kl": 0.011521625056047924,
378
  "learning_rate": 3.484689390623218e-07,
379
  "loss": 0.0,
380
+ "reward": 2.26532906293869,
381
+ "reward_std": 1.2260184331098571,
382
+ "rewards/concensus_correctness_reward_func": 0.30037499964237213,
383
+ "rewards/consensus_reward_func": 0.8125,
384
  "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 0.0625,
386
+ "rewards/question_recreation_reward_func": 0.6025790590792894,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.046875,
389
+ "rewards/xmlcount_reward_func": 0.4405000088736415,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 296.125,
394
+ "epoch": 7.0,
395
+ "grad_norm": 2.361708641052246,
396
+ "kl": 0.009184279042528942,
397
  "learning_rate": 3.3338810791270517e-07,
398
  "loss": 0.0,
399
+ "reward": 1.8683023676276207,
400
+ "reward_std": 0.931499857455492,
401
+ "rewards/concensus_correctness_reward_func": 0.11750000342726707,
402
+ "rewards/consensus_reward_func": 0.625,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 0.1875,
405
+ "rewards/question_recreation_reward_func": 0.5745836496353149,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.046875,
408
+ "rewards/xmlcount_reward_func": 0.3168437508866191,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 264.34375,
413
+ "epoch": 7.333333333333333,
414
+ "grad_norm": 3.3224129676818848,
415
+ "kl": 0.01854561164509505,
416
  "learning_rate": 3.179575180590857e-07,
417
  "loss": 0.0,
418
+ "reward": 1.4564557429403067,
419
+ "reward_std": 0.8835583210457116,
420
+ "rewards/concensus_correctness_reward_func": 0.0,
421
+ "rewards/consensus_reward_func": 0.375,
422
  "rewards/cumulative_reward_2": 0.0,
423
+ "rewards/final_correctness_reward_func": 0.125,
424
+ "rewards/question_recreation_reward_func": 0.5099557400681078,
425
  "rewards/soft_format_reward_func": 0.0,
426
+ "rewards/strict_format_reward_func": 0.03125,
427
+ "rewards/xmlcount_reward_func": 0.41524998657405376,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 302.34375,
432
+ "epoch": 7.666666666666667,
433
+ "grad_norm": 3.5825905799865723,
434
+ "kl": 0.013279616308864206,
435
  "learning_rate": 3.022418907578188e-07,
436
  "loss": 0.0,
437
+ "reward": 2.223274264484644,
438
+ "reward_std": 0.9707561411778443,
439
+ "rewards/concensus_correctness_reward_func": 0.22681250050663948,
440
+ "rewards/consensus_reward_func": 0.875,
441
  "rewards/cumulative_reward_2": 0.0,
442
+ "rewards/final_correctness_reward_func": 0.0625,
443
+ "rewards/question_recreation_reward_func": 0.5176492994651198,
444
  "rewards/soft_format_reward_func": 0.0,
445
+ "rewards/strict_format_reward_func": 0.015625,
446
+ "rewards/xmlcount_reward_func": 0.5256875129416585,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 284.5,
451
+ "epoch": 8.0,
452
+ "grad_norm": 3.046675682067871,
453
+ "kl": 0.014580188668332994,
454
  "learning_rate": 2.863071428113726e-07,
455
  "loss": 0.0,
456
+ "reward": 2.079888518899679,
457
+ "reward_std": 1.115025261300616,
458
+ "rewards/concensus_correctness_reward_func": 0.125,
459
+ "rewards/consensus_reward_func": 0.6875,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 0.25,
462
+ "rewards/question_recreation_reward_func": 0.6062635215930641,
463
  "rewards/soft_format_reward_func": 0.0,
464
+ "rewards/strict_format_reward_func": 0.03125,
465
+ "rewards/xmlcount_reward_func": 0.37987500336021185,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 283.40625,
470
+ "epoch": 8.333333333333334,
471
+ "grad_norm": 4.013537883758545,
472
+ "kl": 0.01412903075106442,
473
  "learning_rate": 2.7022011009035107e-07,
474
  "loss": 0.0,
475
+ "reward": 1.7887509390711784,
476
+ "reward_std": 1.0781924333423376,
477
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
478
+ "rewards/consensus_reward_func": 0.6875,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 0.0625,
481
+ "rewards/question_recreation_reward_func": 0.4950321572832763,
482
  "rewards/soft_format_reward_func": 0.0,
483
+ "rewards/strict_format_reward_func": 0.015625,
484
+ "rewards/xmlcount_reward_func": 0.46796874795109034,
485
  "step": 50
486
  },
487
  {
488
+ "completion_length": 255.75,
489
+ "epoch": 8.666666666666666,
490
+ "grad_norm": 3.4840548038482666,
491
+ "kl": 0.020143478643149137,
492
  "learning_rate": 2.540482672006254e-07,
493
  "loss": 0.0,
494
+ "reward": 2.2514243982732296,
495
+ "reward_std": 0.9967716456158087,
496
+ "rewards/concensus_correctness_reward_func": 0.14731250144541264,
497
+ "rewards/consensus_reward_func": 0.6875,
498
  "rewards/cumulative_reward_2": 0.0,
499
+ "rewards/final_correctness_reward_func": 0.0625,
500
+ "rewards/question_recreation_reward_func": 0.7038306472823024,
501
  "rewards/soft_format_reward_func": 0.0,
502
+ "rewards/strict_format_reward_func": 0.09375,
503
+ "rewards/xmlcount_reward_func": 0.5565312498365529,
504
  "step": 52
505
  },
506
  {
507
+ "completion_length": 257.25,
508
+ "epoch": 9.0,
509
+ "grad_norm": 3.285977602005005,
510
+ "kl": 0.020005275029689074,
511
  "learning_rate": 2.37859444471388e-07,
512
  "loss": 0.0,
513
+ "reward": 1.8586036376655102,
514
+ "reward_std": 0.8674497168976814,
515
+ "rewards/concensus_correctness_reward_func": 0.23737500235438347,
516
+ "rewards/consensus_reward_func": 0.5625,
517
  "rewards/cumulative_reward_2": 0.0,
518
+ "rewards/final_correctness_reward_func": 0.0625,
519
+ "rewards/question_recreation_reward_func": 0.4530098957475275,
520
  "rewards/soft_format_reward_func": 0.0,
521
+ "rewards/strict_format_reward_func": 0.0625,
522
+ "rewards/xmlcount_reward_func": 0.4807187654078007,
523
  "step": 54
524
  },
525
  {
526
+ "completion_length": 240.28125,
527
+ "epoch": 9.333333333333334,
528
+ "grad_norm": 3.7038044929504395,
529
+ "kl": 0.018931806669570506,
530
  "learning_rate": 2.2172154345117894e-07,
531
  "loss": 0.0,
532
+ "reward": 1.623201709240675,
533
+ "reward_std": 1.2911038948222995,
534
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
535
+ "rewards/consensus_reward_func": 0.625,
536
  "rewards/cumulative_reward_2": 0.0,
537
+ "rewards/final_correctness_reward_func": 0.0625,
538
+ "rewards/question_recreation_reward_func": 0.48360797856003046,
539
  "rewards/soft_format_reward_func": 0.0,
540
+ "rewards/strict_format_reward_func": 0.03125,
541
+ "rewards/xmlcount_reward_func": 0.3607187531888485,
542
  "step": 56
543
  },
544
  {
545
+ "completion_length": 269.84375,
546
+ "epoch": 9.666666666666666,
547
+ "grad_norm": 2.881014585494995,
548
+ "kl": 0.017237395339179784,
549
  "learning_rate": 2.0570225210519433e-07,
550
  "loss": 0.0,
551
+ "reward": 1.9306830489076674,
552
+ "reward_std": 1.3888351377099752,
553
+ "rewards/concensus_correctness_reward_func": 0.24249999597668648,
554
+ "rewards/consensus_reward_func": 0.6875,
555
  "rewards/cumulative_reward_2": 0.0,
556
+ "rewards/final_correctness_reward_func": 0.0,
557
+ "rewards/question_recreation_reward_func": 0.5115580353885889,
558
  "rewards/soft_format_reward_func": 0.0,
559
  "rewards/strict_format_reward_func": 0.015625,
560
+ "rewards/xmlcount_reward_func": 0.4735000031068921,
561
  "step": 58
562
  },
563
  {
564
+ "completion_length": 249.15625,
565
+ "epoch": 10.0,
566
+ "grad_norm": 3.1154470443725586,
567
+ "kl": 0.02515559794846922,
568
  "learning_rate": 1.8986876090843664e-07,
569
  "loss": 0.0,
570
+ "reward": 2.0984230153262615,
571
+ "reward_std": 1.3092318717390299,
572
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
573
+ "rewards/consensus_reward_func": 0.6875,
574
  "rewards/cumulative_reward_2": 0.0,
575
+ "rewards/final_correctness_reward_func": 0.1875,
576
+ "rewards/question_recreation_reward_func": 0.5064230696298182,
577
  "rewards/soft_format_reward_func": 0.0,
578
+ "rewards/strict_format_reward_func": 0.078125,
579
+ "rewards/xmlcount_reward_func": 0.5787499956786633,
580
  "step": 60
581
  },
582
  {
583
+ "completion_length": 253.25,
584
+ "epoch": 10.333333333333334,
585
+ "grad_norm": 3.137545347213745,
586
+ "kl": 0.024528241163352504,
587
  "learning_rate": 1.7428748102551234e-07,
588
  "loss": 0.0,
589
+ "reward": 1.8578170202672482,
590
+ "reward_std": 1.1804295498877764,
591
+ "rewards/concensus_correctness_reward_func": 0.059812501072883606,
592
+ "rewards/consensus_reward_func": 0.3125,
593
  "rewards/cumulative_reward_2": 0.0,
594
+ "rewards/final_correctness_reward_func": 0.25,
595
+ "rewards/question_recreation_reward_func": 0.4470045296475291,
596
  "rewards/soft_format_reward_func": 0.0,
597
  "rewards/strict_format_reward_func": 0.046875,
598
+ "rewards/xmlcount_reward_func": 0.7416250004898757,
599
  "step": 62
600
  },
601
  {
602
+ "completion_length": 223.3125,
603
+ "epoch": 10.666666666666666,
604
+ "grad_norm": 3.1050543785095215,
605
+ "kl": 0.02882619173033163,
606
  "learning_rate": 1.5902376575912814e-07,
607
  "loss": 0.0,
608
+ "reward": 1.743748527020216,
609
+ "reward_std": 0.6933234713651473,
610
+ "rewards/concensus_correctness_reward_func": 0.20481249876320362,
611
+ "rewards/consensus_reward_func": 0.5,
612
  "rewards/cumulative_reward_2": 0.0,
613
+ "rewards/final_correctness_reward_func": 0.0625,
614
+ "rewards/question_recreation_reward_func": 0.4549047634936869,
615
  "rewards/soft_format_reward_func": 0.0,
616
+ "rewards/strict_format_reward_func": 0.03125,
617
+ "rewards/xmlcount_reward_func": 0.4902812475338578,
618
  "step": 64
619
  },
620
  {
621
+ "completion_length": 237.8125,
622
+ "epoch": 11.0,
623
+ "grad_norm": 3.4488277435302734,
624
+ "kl": 0.024283534032292664,
625
  "learning_rate": 1.4414163643562753e-07,
626
  "loss": 0.0,
627
+ "reward": 1.9523196145892143,
628
+ "reward_std": 1.559489093720913,
629
+ "rewards/concensus_correctness_reward_func": 0.14818750135600567,
630
+ "rewards/consensus_reward_func": 0.6875,
631
  "rewards/cumulative_reward_2": 0.0,
632
+ "rewards/final_correctness_reward_func": 0.0625,
633
+ "rewards/question_recreation_reward_func": 0.5911321062594652,
634
+ "rewards/soft_format_reward_func": 0.015625,
635
+ "rewards/strict_format_reward_func": 0.015625,
636
+ "rewards/xmlcount_reward_func": 0.43175000394694507,
637
  "step": 66
638
  },
639
  {
640
+ "completion_length": 285.21875,
641
+ "epoch": 11.333333333333334,
642
+ "grad_norm": 3.1919057369232178,
643
+ "kl": 0.029086316528264433,
644
  "learning_rate": 1.2970351387729872e-07,
645
  "loss": 0.0,
646
+ "reward": 1.9048342034220695,
647
+ "reward_std": 1.2717823022976518,
648
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
649
+ "rewards/consensus_reward_func": 0.75,
650
  "rewards/cumulative_reward_2": 0.0,
651
+ "rewards/final_correctness_reward_func": 0.0625,
652
+ "rewards/question_recreation_reward_func": 0.5372404857771471,
653
  "rewards/soft_format_reward_func": 0.0,
654
+ "rewards/strict_format_reward_func": 0.0625,
655
+ "rewards/xmlcount_reward_func": 0.4324687570333481,
656
  "step": 68
657
  },
658
  {
659
+ "completion_length": 274.03125,
660
+ "epoch": 11.666666666666666,
661
+ "grad_norm": 4.114351749420166,
662
+ "kl": 0.028069639985915273,
663
  "learning_rate": 1.1576995658775404e-07,
664
  "loss": 0.0,
665
+ "reward": 1.8863378204405308,
666
+ "reward_std": 1.3141623558476567,
667
+ "rewards/concensus_correctness_reward_func": 0.12256250157952309,
668
+ "rewards/consensus_reward_func": 0.625,
669
  "rewards/cumulative_reward_2": 0.0,
670
+ "rewards/final_correctness_reward_func": 0.125,
671
+ "rewards/question_recreation_reward_func": 0.5985253136605024,
672
+ "rewards/soft_format_reward_func": 0.015625,
673
+ "rewards/strict_format_reward_func": 0.078125,
674
+ "rewards/xmlcount_reward_func": 0.3215000149793923,
675
  "step": 70
676
  },
677
  {
678
+ "completion_length": 248.75,
679
+ "epoch": 12.0,
680
+ "grad_norm": 2.4701993465423584,
681
+ "kl": 0.020335440058261156,
682
  "learning_rate": 1.0239940674851941e-07,
683
  "loss": 0.0,
684
+ "reward": 2.2941419184207916,
685
+ "reward_std": 0.8937435210682452,
686
+ "rewards/concensus_correctness_reward_func": 0.11487500369548798,
687
+ "rewards/consensus_reward_func": 0.5625,
688
  "rewards/cumulative_reward_2": 0.0,
689
+ "rewards/final_correctness_reward_func": 0.4375,
690
+ "rewards/question_recreation_reward_func": 0.5048919152468443,
691
  "rewards/soft_format_reward_func": 0.0,
692
+ "rewards/strict_format_reward_func": 0.046875,
693
+ "rewards/xmlcount_reward_func": 0.627500012749806,
694
  "step": 72
695
  },
696
  {
697
+ "completion_length": 220.09375,
698
+ "epoch": 12.333333333333334,
699
+ "grad_norm": 2.9127869606018066,
700
+ "kl": 0.022209461370948702,
701
  "learning_rate": 8.964794509221507e-08,
702
  "loss": 0.0,
703
+ "reward": 2.0042398422956467,
704
+ "reward_std": 1.136198466643691,
705
+ "rewards/concensus_correctness_reward_func": 0.24262500181794167,
706
+ "rewards/consensus_reward_func": 0.5625,
707
  "rewards/cumulative_reward_2": 0.0,
708
+ "rewards/final_correctness_reward_func": 0.1875,
709
+ "rewards/question_recreation_reward_func": 0.49439611518755555,
710
  "rewards/soft_format_reward_func": 0.0,
711
+ "rewards/strict_format_reward_func": 0.015625,
712
+ "rewards/xmlcount_reward_func": 0.5015937518328428,
713
  "step": 74
714
  },
715
  {
716
+ "completion_length": 273.3125,
717
+ "epoch": 12.666666666666666,
718
+ "grad_norm": 2.498856782913208,
719
+ "kl": 0.01810361386742443,
720
  "learning_rate": 7.756905568047392e-08,
721
  "loss": 0.0,
722
+ "reward": 1.7342982944101095,
723
+ "reward_std": 1.2671478418633342,
724
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
725
  "rewards/consensus_reward_func": 0.5,
726
  "rewards/cumulative_reward_2": 0.0,
727
+ "rewards/final_correctness_reward_func": 0.1875,
728
+ "rewards/question_recreation_reward_func": 0.575642024166882,
729
  "rewards/soft_format_reward_func": 0.0,
730
  "rewards/strict_format_reward_func": 0.015625,
731
+ "rewards/xmlcount_reward_func": 0.39553124993108213,
732
  "step": 76
733
  },
734
  {
735
+ "completion_length": 239.71875,
736
+ "epoch": 13.0,
737
+ "grad_norm": 2.6426241397857666,
738
+ "kl": 0.027558521192986518,
739
  "learning_rate": 6.621340157319996e-08,
740
  "loss": 0.0,
741
+ "reward": 2.0297871977090836,
742
+ "reward_std": 0.9375762529671192,
743
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
744
+ "rewards/consensus_reward_func": 0.8125,
745
  "rewards/cumulative_reward_2": 0.0,
746
+ "rewards/final_correctness_reward_func": 0.0,
747
+ "rewards/question_recreation_reward_func": 0.643943477421999,
748
  "rewards/soft_format_reward_func": 0.0,
749
+ "rewards/strict_format_reward_func": 0.046875,
750
+ "rewards/xmlcount_reward_func": 0.4663437455892563,
751
  "step": 78
752
  },
753
  {
754
+ "completion_length": 217.0,
755
+ "epoch": 13.333333333333334,
756
+ "grad_norm": 2.7676427364349365,
757
+ "kl": 0.03532541490858421,
758
  "learning_rate": 5.5628612330087724e-08,
759
  "loss": 0.0,
760
+ "reward": 2.5324832424521446,
761
+ "reward_std": 1.1609387751668692,
762
+ "rewards/concensus_correctness_reward_func": 0.1401250008493662,
763
+ "rewards/consensus_reward_func": 1.1875,
764
  "rewards/cumulative_reward_2": 0.0,
765
+ "rewards/final_correctness_reward_func": 0.125,
766
+ "rewards/question_recreation_reward_func": 0.4988894378184341,
767
  "rewards/soft_format_reward_func": 0.0,
768
  "rewards/strict_format_reward_func": 0.0625,
769
+ "rewards/xmlcount_reward_func": 0.5184687459841371,
770
  "step": 80
771
  },
772
  {
773
+ "completion_length": 263.84375,
774
+ "epoch": 13.666666666666666,
775
+ "grad_norm": 3.3539531230926514,
776
+ "kl": 0.02452456980245188,
777
  "learning_rate": 4.5859084235697235e-08,
778
  "loss": 0.0,
779
+ "reward": 1.9322984656319022,
780
+ "reward_std": 1.3757213475182652,
781
+ "rewards/concensus_correctness_reward_func": 0.12262500077486038,
782
  "rewards/consensus_reward_func": 0.375,
783
  "rewards/cumulative_reward_2": 0.0,
784
  "rewards/final_correctness_reward_func": 0.3125,
785
+ "rewards/question_recreation_reward_func": 0.5088922204449773,
786
  "rewards/soft_format_reward_func": 0.0,
787
+ "rewards/strict_format_reward_func": 0.046875,
788
+ "rewards/xmlcount_reward_func": 0.5664062486030161,
789
  "step": 82
790
  },
791
  {
792
+ "completion_length": 258.9375,
793
+ "epoch": 14.0,
794
+ "grad_norm": 3.50706148147583,
795
+ "kl": 0.02300187383661978,
796
  "learning_rate": 3.6945794086007705e-08,
797
  "loss": 0.0,
798
+ "reward": 2.368156984448433,
799
+ "reward_std": 1.27269869716838,
800
+ "rewards/concensus_correctness_reward_func": 0.23487500101327896,
801
+ "rewards/consensus_reward_func": 0.9375,
802
  "rewards/cumulative_reward_2": 0.0,
803
+ "rewards/final_correctness_reward_func": 0.125,
804
+ "rewards/question_recreation_reward_func": 0.6125319767743349,
805
+ "rewards/soft_format_reward_func": 0.015625,
806
  "rewards/strict_format_reward_func": 0.015625,
807
+ "rewards/xmlcount_reward_func": 0.42700000666081905,
808
  "step": 84
809
  },
810
  {
811
+ "completion_length": 238.8125,
812
+ "epoch": 14.333333333333334,
813
+ "grad_norm": 2.6380765438079834,
814
+ "kl": 0.027566006290726364,
815
  "learning_rate": 2.892612731749414e-08,
816
  "loss": 0.0,
817
+ "reward": 2.055258920416236,
818
+ "reward_std": 0.6858638301491737,
819
+ "rewards/concensus_correctness_reward_func": 0.18012499809265137,
820
+ "rewards/consensus_reward_func": 0.75,
821
  "rewards/cumulative_reward_2": 0.0,
822
+ "rewards/final_correctness_reward_func": 0.0,
823
+ "rewards/question_recreation_reward_func": 0.5494776804698631,
824
  "rewards/soft_format_reward_func": 0.0,
825
+ "rewards/strict_format_reward_func": 0.015625,
826
+ "rewards/xmlcount_reward_func": 0.5600312501192093,
827
  "step": 86
828
  },
829
  {
830
+ "completion_length": 283.0,
831
+ "epoch": 14.666666666666666,
832
+ "grad_norm": 2.740476369857788,
833
+ "kl": 0.02960860973689705,
834
  "learning_rate": 2.183372119961499e-08,
835
  "loss": 0.0,
836
+ "reward": 1.8798525519669056,
837
+ "reward_std": 1.1110668628825806,
838
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
839
+ "rewards/consensus_reward_func": 0.6875,
840
  "rewards/cumulative_reward_2": 0.0,
841
+ "rewards/final_correctness_reward_func": 0.125,
842
+ "rewards/question_recreation_reward_func": 0.5739150438457727,
843
+ "rewards/soft_format_reward_func": 0.015625,
844
+ "rewards/strict_format_reward_func": 0.046875,
845
+ "rewards/xmlcount_reward_func": 0.37081251526251435,
846
  "step": 88
847
  },
848
  {
849
+ "completion_length": 242.3125,
850
+ "epoch": 15.0,
851
+ "grad_norm": 2.513399124145508,
852
+ "kl": 0.026559468533378094,
853
  "learning_rate": 1.5698323748414122e-08,
854
  "loss": 0.0,
855
+ "reward": 2.03849926404655,
856
+ "reward_std": 0.964166424702853,
857
+ "rewards/concensus_correctness_reward_func": 0.14824999682605267,
858
+ "rewards/consensus_reward_func": 0.625,
859
  "rewards/cumulative_reward_2": 0.0,
860
+ "rewards/final_correctness_reward_func": 0.1875,
861
+ "rewards/question_recreation_reward_func": 0.5135617647320032,
862
  "rewards/soft_format_reward_func": 0.0,
863
+ "rewards/strict_format_reward_func": 0.0625,
864
+ "rewards/xmlcount_reward_func": 0.5016875043511391,
865
  "step": 90
866
  },
867
  {
868
+ "completion_length": 205.65625,
869
+ "epoch": 15.333333333333334,
870
+ "grad_norm": 3.1790051460266113,
871
+ "kl": 0.036513498693238944,
872
  "learning_rate": 1.054566895300324e-08,
873
  "loss": 0.0,
874
+ "reward": 2.030654199421406,
875
+ "reward_std": 1.0498718353919685,
876
+ "rewards/concensus_correctness_reward_func": 0.08824999816715717,
877
  "rewards/consensus_reward_func": 0.625,
878
  "rewards/cumulative_reward_2": 0.0,
879
+ "rewards/final_correctness_reward_func": 0.0625,
880
+ "rewards/question_recreation_reward_func": 0.6112791877239943,
881
  "rewards/soft_format_reward_func": 0.0,
882
+ "rewards/strict_format_reward_func": 0.046875,
883
+ "rewards/xmlcount_reward_func": 0.596749996766448,
884
  "step": 92
885
  },
886
  {
887
+ "completion_length": 263.0,
888
+ "epoch": 15.666666666666666,
889
+ "grad_norm": 3.719067096710205,
890
+ "kl": 0.021271194782457314,
891
  "learning_rate": 6.397368838268496e-09,
892
  "loss": 0.0,
893
+ "reward": 2.2683150228112936,
894
+ "reward_std": 1.4738470809534192,
895
+ "rewards/concensus_correctness_reward_func": 0.36025000363588333,
896
+ "rewards/consensus_reward_func": 0.5,
897
  "rewards/cumulative_reward_2": 0.0,
898
+ "rewards/final_correctness_reward_func": 0.1875,
899
+ "rewards/question_recreation_reward_func": 0.5795337841846049,
900
  "rewards/soft_format_reward_func": 0.0,
901
+ "rewards/strict_format_reward_func": 0.046875,
902
+ "rewards/xmlcount_reward_func": 0.5941562601365149,
903
  "step": 94
904
  },
905
  {
906
+ "completion_length": 257.71875,
907
+ "epoch": 16.0,
908
+ "grad_norm": 5.480099201202393,
909
+ "kl": 0.028246068861335516,
910
  "learning_rate": 3.2708228165273244e-09,
911
  "loss": 0.0,
912
+ "reward": 2.2440423257648945,
913
+ "reward_std": 0.9810051110107452,
914
+ "rewards/concensus_correctness_reward_func": 0.18512500077486038,
915
+ "rewards/consensus_reward_func": 0.8125,
916
  "rewards/cumulative_reward_2": 0.0,
917
+ "rewards/final_correctness_reward_func": 0.1875,
918
+ "rewards/question_recreation_reward_func": 0.4930110676214099,
919
  "rewards/soft_format_reward_func": 0.0,
920
+ "rewards/strict_format_reward_func": 0.046875,
921
+ "rewards/xmlcount_reward_func": 0.5190312538761646,
922
  "step": 96
923
  },
924
  {
925
+ "completion_length": 239.34375,
926
+ "epoch": 16.333333333333332,
927
+ "grad_norm": 3.926326274871826,
928
+ "kl": 0.03365567361470312,
929
  "learning_rate": 1.1791447083465133e-09,
930
  "loss": 0.0,
931
+ "reward": 1.9388289339840412,
932
+ "reward_std": 0.958217971608974,
933
+ "rewards/concensus_correctness_reward_func": 0.11999999731779099,
934
+ "rewards/consensus_reward_func": 0.625,
935
  "rewards/cumulative_reward_2": 0.0,
936
+ "rewards/final_correctness_reward_func": 0.0625,
937
+ "rewards/question_recreation_reward_func": 0.4502664606552571,
938
  "rewards/soft_format_reward_func": 0.0,
939
+ "rewards/strict_format_reward_func": 0.0625,
940
+ "rewards/xmlcount_reward_func": 0.6185624990612268,
941
  "step": 98
942
  },
943
  {
944
+ "completion_length": 271.6875,
945
+ "epoch": 16.666666666666668,
946
+ "grad_norm": 2.2361977100372314,
947
+ "kl": 0.02408603549702093,
948
  "learning_rate": 1.3110773862126667e-10,
949
  "loss": 0.0,
950
+ "reward": 1.8076389655470848,
951
+ "reward_std": 1.001751037707436,
952
+ "rewards/concensus_correctness_reward_func": 0.2303125038743019,
953
+ "rewards/consensus_reward_func": 0.5,
954
  "rewards/cumulative_reward_2": 0.0,
955
+ "rewards/final_correctness_reward_func": 0.0,
956
+ "rewards/question_recreation_reward_func": 0.5585764544084668,
957
  "rewards/soft_format_reward_func": 0.0,
958
+ "rewards/strict_format_reward_func": 0.046875,
959
+ "rewards/xmlcount_reward_func": 0.4718750088359229,
960
  "step": 100
961
  },
962
  {
963
+ "epoch": 16.666666666666668,
964
  "step": 100,
965
  "total_flos": 0.0,
966
+ "train_loss": 1.5718676149845123e-05,
967
+ "train_runtime": 4262.7362,
968
+ "train_samples_per_second": 0.375,
969
+ "train_steps_per_second": 0.023
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
+ "num_train_epochs": 17,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03d6f864640ff2f7874aaff157b6b0c8d4f48abf640964a2ea7de104211b2b07
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f771f4e41b930eadf03c7a0abf7ec0d72660f696af7340eebd672e2ccbbd1025
3
  size 5944