wyceee commited on
Commit
043f071
·
verified ·
1 Parent(s): 2702ec3

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.018054345073833248,
4
- "train_runtime": 1357.263,
5
- "train_samples": 140,
6
- "train_samples_per_second": 1.179,
7
- "train_steps_per_second": 0.074
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.246203290880657e-06,
4
+ "train_runtime": 3714.3127,
5
+ "train_samples": 48,
6
+ "train_samples_per_second": 0.431,
7
+ "train_steps_per_second": 0.027
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c63c6ce3ce02d94e50a0f1d5d9c98db03b918fde96b79778ff70084beaad495
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da4263e7a35562370a51d0fa373be9cc7d793b22ac7adabe400565ca4c12e9b4
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f05b1466c485e49eb663e61e433c4cecdae40d3c64ac97a04c91578b92c3e43
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:820a3848e15044629f249d7c069f6631abe495b0c8c46d64e65d1d8e91b39678
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.018054345073833248,
4
- "train_runtime": 1357.263,
5
- "train_samples": 140,
6
- "train_samples_per_second": 1.179,
7
- "train_steps_per_second": 0.074
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.246203290880657e-06,
4
+ "train_runtime": 3714.3127,
5
+ "train_samples": 48,
6
+ "train_samples_per_second": 0.431,
7
+ "train_steps_per_second": 0.027
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.571428571428571,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
@@ -10,969 +10,969 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 238.8125,
14
- "epoch": 0.11428571428571428,
15
- "grad_norm": 10.970682144165039,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": -0.0,
19
- "reward": 12.407915402203798,
20
- "reward_std": 5.188350445881952,
21
- "rewards/concensus_correctness_reward_func": 8.488562531769276,
22
- "rewards/consensus_reward_func": 0.9375,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 1.3125,
25
- "rewards/question_recreation_reward_func": 0.6283841012045741,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.1875,
28
- "rewards/xmlcount_reward_func": 0.8534687550272793,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 251.6875,
33
- "epoch": 0.22857142857142856,
34
- "grad_norm": 7.923824310302734,
35
- "kl": 0.042037057399284095,
36
  "learning_rate": 5e-07,
37
  "loss": 0.0,
38
- "reward": 14.51514945924282,
39
- "reward_std": 5.186308401403949,
40
- "rewards/concensus_correctness_reward_func": 9.763374998001382,
41
- "rewards/consensus_reward_func": 1.25,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 1.4375,
44
- "rewards/question_recreation_reward_func": 0.8561495095491409,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.21875,
47
- "rewards/xmlcount_reward_func": 0.989375002682209,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 255.4375,
52
- "epoch": 0.34285714285714286,
53
- "grad_norm": 118.1331558227539,
54
- "kl": 15.88943462620955,
55
  "learning_rate": 4.994757065594279e-07,
56
- "loss": 0.0163,
57
- "reward": 10.717048183083534,
58
- "reward_std": 6.401097939996816,
59
- "rewards/concensus_correctness_reward_func": 6.5562499817460775,
60
- "rewards/consensus_reward_func": 1.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 1.125,
63
- "rewards/question_recreation_reward_func": 0.714673223963473,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.1875,
66
- "rewards/xmlcount_reward_func": 1.1336250007152557,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 266.34375,
71
- "epoch": 0.45714285714285713,
72
- "grad_norm": 10.96125602722168,
73
- "kl": 2.4939099808689207,
74
  "learning_rate": 4.979050253066063e-07,
75
- "loss": 0.0025,
76
- "reward": 13.457202315330505,
77
- "reward_std": 5.08114674501121,
78
- "rewards/concensus_correctness_reward_func": 8.9798124730587,
79
- "rewards/consensus_reward_func": 1.3125,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 1.3125,
82
- "rewards/question_recreation_reward_func": 0.6849835254251957,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.203125,
85
- "rewards/xmlcount_reward_func": 0.9642812530510128,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 252.03125,
90
- "epoch": 0.5714285714285714,
91
- "grad_norm": 18.904144287109375,
92
- "kl": 4.680454459274188,
93
  "learning_rate": 4.952945442245597e-07,
94
- "loss": 0.0047,
95
- "reward": 16.20142674446106,
96
- "reward_std": 8.446806251784437,
97
- "rewards/concensus_correctness_reward_func": 11.267187535762787,
98
- "rewards/consensus_reward_func": 1.125,
99
- "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 1.6875,
101
- "rewards/question_recreation_reward_func": 0.7198330629616976,
102
- "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.3125,
104
- "rewards/xmlcount_reward_func": 1.089406255632639,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 234.09375,
109
- "epoch": 0.6857142857142857,
110
- "grad_norm": 18.74327850341797,
111
- "kl": 6.5579799057450145,
112
  "learning_rate": 4.916552125781528e-07,
113
- "loss": 0.0066,
114
- "reward": 17.783124715089798,
115
- "reward_std": 8.461443929263623,
116
- "rewards/concensus_correctness_reward_func": 12.696437515318394,
117
- "rewards/consensus_reward_func": 1.1875,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 1.8125,
120
- "rewards/question_recreation_reward_func": 0.6896556532010436,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.25,
123
- "rewards/xmlcount_reward_func": 1.1470312476158142,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 269.53125,
128
- "epoch": 0.8,
129
- "grad_norm": 6.917476654052734,
130
- "kl": 4.977367766899988,
131
  "learning_rate": 4.870022949890676e-07,
132
- "loss": 0.005,
133
- "reward": 15.972103998064995,
134
- "reward_std": 2.9697941527701914,
135
- "rewards/concensus_correctness_reward_func": 11.044250033795834,
136
- "rewards/consensus_reward_func": 1.625,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 1.3125,
139
- "rewards/question_recreation_reward_func": 0.8037290628999472,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.234375,
142
- "rewards/xmlcount_reward_func": 0.952250000089407,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 270.6875,
147
- "epoch": 0.9142857142857143,
148
- "grad_norm": 15.063738822937012,
149
- "kl": 2.8824040475301445,
150
  "learning_rate": 4.81355307410676e-07,
151
- "loss": 0.0029,
152
- "reward": 10.288215219974518,
153
- "reward_std": 5.413245497271419,
154
- "rewards/concensus_correctness_reward_func": 6.029687514528632,
155
- "rewards/consensus_reward_func": 1.125,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.9375,
158
- "rewards/question_recreation_reward_func": 0.7746213864884339,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.3125,
161
- "rewards/xmlcount_reward_func": 1.1089062541723251,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 233.95833333333334,
166
- "epoch": 1.0,
167
- "grad_norm": 4.807560443878174,
168
- "kl": 1.358936990921696,
169
  "learning_rate": 4.747379352713488e-07,
170
- "loss": 0.001,
171
- "reward": 16.22363926966985,
172
- "reward_std": 4.749512891595562,
173
- "rewards/concensus_correctness_reward_func": 11.275166720151901,
174
- "rewards/consensus_reward_func": 1.1666666666666667,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 1.5833333333333333,
177
- "rewards/question_recreation_reward_func": 0.6966391950845718,
178
  "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.2916666666666667,
180
- "rewards/xmlcount_reward_func": 1.210166667898496,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 247.25,
185
- "epoch": 1.1142857142857143,
186
- "grad_norm": 75.82022857666016,
187
- "kl": 8.28540600137785,
188
  "learning_rate": 4.6717793412953776e-07,
189
- "loss": 0.0083,
190
- "reward": 15.897803097963333,
191
- "reward_std": 3.59038737998344,
192
- "rewards/concensus_correctness_reward_func": 11.287875011563301,
193
- "rewards/consensus_reward_func": 1.375,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 1.125,
196
- "rewards/question_recreation_reward_func": 0.7908340934664011,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.234375,
199
- "rewards/xmlcount_reward_func": 1.0847187526524067,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 243.03125,
204
- "epoch": 1.2285714285714286,
205
- "grad_norm": 18.962554931640625,
206
- "kl": 5.726443055551499,
207
  "learning_rate": 4.5870701325731773e-07,
208
- "loss": 0.0057,
209
- "reward": 15.181419372558594,
210
- "reward_std": 5.393561449236586,
211
- "rewards/concensus_correctness_reward_func": 10.06800001859665,
212
- "rewards/consensus_reward_func": 1.25,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 1.6875,
215
- "rewards/question_recreation_reward_func": 0.7863881841767579,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.296875,
218
- "rewards/xmlcount_reward_func": 1.0926562510430813,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 246.6875,
223
- "epoch": 1.342857142857143,
224
- "grad_norm": 8.005983352661133,
225
- "kl": 2.3502084868960083,
226
  "learning_rate": 4.4936070264068016e-07,
227
- "loss": 0.0023,
228
- "reward": 15.992204293608665,
229
- "reward_std": 5.12271255068481,
230
- "rewards/concensus_correctness_reward_func": 11.144812501966953,
231
- "rewards/consensus_reward_func": 1.3125,
232
  "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 1.5,
234
- "rewards/question_recreation_reward_func": 0.7768920734524727,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.25,
237
- "rewards/xmlcount_reward_func": 1.0079999975860119,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 221.375,
242
- "epoch": 1.457142857142857,
243
- "grad_norm": 5.74846887588501,
244
- "kl": 11.304446186637506,
245
  "learning_rate": 4.391782039544238e-07,
246
- "loss": 0.0113,
247
- "reward": 13.14157024025917,
248
- "reward_std": 4.419819911709055,
249
- "rewards/concensus_correctness_reward_func": 8.400812551379204,
250
- "rewards/consensus_reward_func": 1.125,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 1.375,
253
- "rewards/question_recreation_reward_func": 0.7510077767074108,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.28125,
256
- "rewards/xmlcount_reward_func": 1.208500012755394,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 239.21875,
261
- "epoch": 1.5714285714285714,
262
- "grad_norm": 13.846333503723145,
263
- "kl": 2.687789380317554,
264
  "learning_rate": 4.282022261367073e-07,
265
- "loss": 0.0027,
266
- "reward": 13.192271679639816,
267
- "reward_std": 8.560021809767932,
268
- "rewards/concensus_correctness_reward_func": 8.766687527298927,
269
- "rewards/consensus_reward_func": 0.9375,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 1.375,
272
- "rewards/question_recreation_reward_func": 0.6983654154464602,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.234375,
275
- "rewards/xmlcount_reward_func": 1.1803437545895576,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 268.40625,
280
- "epoch": 1.6857142857142857,
281
- "grad_norm": 10.3473482131958,
282
- "kl": 2.4997252374887466,
283
  "learning_rate": 4.1647880625292027e-07,
284
- "loss": 0.0025,
285
- "reward": 13.871409982442856,
286
- "reward_std": 7.89060926809907,
287
- "rewards/concensus_correctness_reward_func": 9.331625029444695,
288
- "rewards/consensus_reward_func": 1.1875,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 1.375,
291
- "rewards/question_recreation_reward_func": 0.7168787051923573,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.21875,
294
- "rewards/xmlcount_reward_func": 1.0416562482714653,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 270.625,
299
- "epoch": 1.8,
300
- "grad_norm": 125.35323333740234,
301
- "kl": 15.523216519504786,
302
  "learning_rate": 4.040571164002318e-07,
303
- "loss": 0.0155,
304
- "reward": 14.418130666017532,
305
- "reward_std": 5.059405547566712,
306
- "rewards/concensus_correctness_reward_func": 9.623312469571829,
307
- "rewards/consensus_reward_func": 1.5,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 1.1875,
310
- "rewards/question_recreation_reward_func": 0.7698495015501976,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.28125,
313
- "rewards/xmlcount_reward_func": 1.0562187489122152,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 278.625,
318
- "epoch": 1.9142857142857141,
319
- "grad_norm": 41.78948974609375,
320
- "kl": 22.502449576277286,
321
  "learning_rate": 3.909892574627266e-07,
322
- "loss": 0.0225,
323
- "reward": 14.04726266860962,
324
- "reward_std": 4.064370384789072,
325
- "rewards/concensus_correctness_reward_func": 9.231374958530068,
326
- "rewards/consensus_reward_func": 1.375,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 1.3125,
329
- "rewards/question_recreation_reward_func": 0.7898254841566086,
330
  "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.296875,
332
- "rewards/xmlcount_reward_func": 1.0416875034570694,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 279.5416666666667,
337
- "epoch": 2.0,
338
- "grad_norm": 9.201582908630371,
339
- "kl": 21.56579526141286,
340
  "learning_rate": 3.773300405821908e-07,
341
- "loss": 0.0162,
342
- "reward": 14.157723685105642,
343
- "reward_std": 4.17470802180469,
344
- "rewards/concensus_correctness_reward_func": 9.812166665991148,
345
- "rewards/consensus_reward_func": 1.25,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 1.4166666666666667,
348
- "rewards/question_recreation_reward_func": 0.7103902654101452,
349
  "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.14583333333333334,
351
- "rewards/xmlcount_reward_func": 0.8226666711270809,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 222.8125,
356
- "epoch": 2.1142857142857143,
357
- "grad_norm": 36.22496032714844,
358
- "kl": 16.87672139937058,
359
  "learning_rate": 3.6313675726113475e-07,
360
- "loss": 0.0169,
361
- "reward": 14.74327240884304,
362
- "reward_std": 4.779272536048666,
363
- "rewards/concensus_correctness_reward_func": 10.158499985933304,
364
- "rewards/consensus_reward_func": 0.9375,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 1.5625,
367
- "rewards/question_recreation_reward_func": 0.7058975584805012,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.265625,
370
- "rewards/xmlcount_reward_func": 1.1132500022649765,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 246.65625,
375
- "epoch": 2.2285714285714286,
376
- "grad_norm": 19.6318359375,
377
- "kl": 16.881314451806247,
378
  "learning_rate": 3.484689390623218e-07,
379
- "loss": 0.0169,
380
- "reward": 18.013662189245224,
381
- "reward_std": 7.812059601201327,
382
- "rewards/concensus_correctness_reward_func": 13.051250010728836,
383
- "rewards/consensus_reward_func": 1.375,
384
  "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 1.5625,
386
- "rewards/question_recreation_reward_func": 0.6924745952710509,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.203125,
389
- "rewards/xmlcount_reward_func": 1.1293125040829182,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 272.125,
394
- "epoch": 2.342857142857143,
395
- "grad_norm": 622.0595092773438,
396
- "kl": 40.245274308137596,
397
  "learning_rate": 3.3338810791270517e-07,
398
- "loss": 0.0402,
399
- "reward": 15.975400440394878,
400
- "reward_std": 3.027749645523727,
401
- "rewards/concensus_correctness_reward_func": 11.107500043697655,
402
- "rewards/consensus_reward_func": 1.4375,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 1.375,
405
- "rewards/question_recreation_reward_func": 0.8080566711723804,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.28125,
408
- "rewards/xmlcount_reward_func": 0.9660937523003668,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 230.40625,
413
- "epoch": 2.4571428571428573,
414
- "grad_norm": 12.730220794677734,
415
- "kl": 2.831924519967288,
416
  "learning_rate": 3.179575180590857e-07,
417
- "loss": 0.0028,
418
- "reward": 14.358340233564377,
419
- "reward_std": 7.420220456435345,
420
- "rewards/concensus_correctness_reward_func": 9.464937542332336,
421
- "rewards/consensus_reward_func": 1.1875,
422
  "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 1.5,
424
- "rewards/question_recreation_reward_func": 0.7924965554848313,
425
  "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.28125,
427
- "rewards/xmlcount_reward_func": 1.132156252861023,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 252.6875,
432
- "epoch": 2.571428571428571,
433
- "grad_norm": 91.50186920166016,
434
- "kl": 13.465573271270841,
435
  "learning_rate": 3.022418907578188e-07,
436
- "loss": 0.0135,
437
- "reward": 10.495669901371002,
438
- "reward_std": 3.2124593601038214,
439
- "rewards/concensus_correctness_reward_func": 6.154187520965934,
440
- "rewards/consensus_reward_func": 1.125,
441
  "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 1.1875,
443
- "rewards/question_recreation_reward_func": 0.7502010772004724,
444
  "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.171875,
446
- "rewards/xmlcount_reward_func": 1.1069062501192093,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 267.3125,
451
- "epoch": 2.685714285714286,
452
- "grad_norm": 9.117658615112305,
453
- "kl": 2.676259968895465,
454
  "learning_rate": 2.863071428113726e-07,
455
- "loss": 0.0027,
456
- "reward": 13.41846364736557,
457
- "reward_std": 4.830206839018501,
458
- "rewards/concensus_correctness_reward_func": 9.071687553077936,
459
- "rewards/consensus_reward_func": 1.125,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 1.3125,
462
- "rewards/question_recreation_reward_func": 0.7668074369430542,
463
  "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.203125,
465
- "rewards/xmlcount_reward_func": 0.9393437597900629,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 268.9375,
470
- "epoch": 2.8,
471
- "grad_norm": 6.602773666381836,
472
- "kl": 3.5302634860854596,
473
  "learning_rate": 2.7022011009035107e-07,
474
- "loss": 0.0035,
475
- "reward": 13.124091684818268,
476
- "reward_std": 6.698563320926041,
477
- "rewards/concensus_correctness_reward_func": 8.646312475204468,
478
- "rewards/consensus_reward_func": 1.25,
479
  "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 1.3125,
481
- "rewards/question_recreation_reward_func": 0.7686855588108301,
482
  "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.109375,
484
- "rewards/xmlcount_reward_func": 1.0372187588363886,
485
  "step": 50
486
  },
487
  {
488
- "completion_length": 249.09375,
489
- "epoch": 2.914285714285714,
490
- "grad_norm": 8879.2197265625,
491
- "kl": 443.5015660626814,
492
  "learning_rate": 2.540482672006254e-07,
493
- "loss": 0.4435,
494
- "reward": 14.807004809379578,
495
- "reward_std": 6.6539650214836,
496
- "rewards/concensus_correctness_reward_func": 10.309812501072884,
497
- "rewards/consensus_reward_func": 1.1875,
498
  "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 1.3125,
500
- "rewards/question_recreation_reward_func": 0.7173797897994518,
501
  "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.265625,
503
- "rewards/xmlcount_reward_func": 1.0141874998807907,
504
  "step": 52
505
  },
506
  {
507
- "completion_length": 238.20833333333334,
508
- "epoch": 3.0,
509
- "grad_norm": 5.6875457763671875,
510
- "kl": 43.41115608314673,
511
  "learning_rate": 2.37859444471388e-07,
512
- "loss": 0.0326,
513
- "reward": 12.497497618198395,
514
- "reward_std": 4.959660264973839,
515
- "rewards/concensus_correctness_reward_func": 8.096666658918062,
516
- "rewards/consensus_reward_func": 0.9166666666666666,
517
  "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 1.5,
519
- "rewards/question_recreation_reward_func": 0.6404145161310831,
520
  "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.2708333333333333,
522
- "rewards/xmlcount_reward_func": 1.0729166616996129,
523
  "step": 54
524
  },
525
  {
526
- "completion_length": 226.96875,
527
- "epoch": 3.1142857142857143,
528
- "grad_norm": 13.502339363098145,
529
- "kl": 14.871396910399199,
530
  "learning_rate": 2.2172154345117894e-07,
531
- "loss": 0.0149,
532
- "reward": 14.69212443381548,
533
- "reward_std": 6.670687978621572,
534
- "rewards/concensus_correctness_reward_func": 10.009812474250793,
535
- "rewards/consensus_reward_func": 1.25,
536
  "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 1.5,
538
- "rewards/question_recreation_reward_func": 0.7185621028766036,
539
  "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.234375,
541
- "rewards/xmlcount_reward_func": 0.9793750094249845,
542
  "step": 56
543
  },
544
  {
545
- "completion_length": 251.40625,
546
- "epoch": 3.2285714285714286,
547
- "grad_norm": 12.282088279724121,
548
- "kl": 18.559334562858567,
549
  "learning_rate": 2.0570225210519433e-07,
550
- "loss": 0.0186,
551
- "reward": 15.513990081846714,
552
- "reward_std": 3.7251646753866225,
553
- "rewards/concensus_correctness_reward_func": 10.890124998986721,
554
- "rewards/consensus_reward_func": 1.3125,
555
  "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 1.375,
557
- "rewards/question_recreation_reward_func": 0.6494588730856776,
558
  "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.203125,
560
- "rewards/xmlcount_reward_func": 1.0837812572717667,
561
  "step": 58
562
  },
563
  {
564
- "completion_length": 278.84375,
565
- "epoch": 3.342857142857143,
566
- "grad_norm": 12.170891761779785,
567
- "kl": 4.615750857628882,
568
  "learning_rate": 1.8986876090843664e-07,
569
- "loss": 0.0046,
570
- "reward": 13.320735141634941,
571
- "reward_std": 6.643354088068008,
572
- "rewards/concensus_correctness_reward_func": 8.784562550485134,
573
- "rewards/consensus_reward_func": 1.0625,
574
  "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 1.5625,
576
- "rewards/question_recreation_reward_func": 0.6847666085232049,
577
  "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.171875,
579
- "rewards/xmlcount_reward_func": 1.0545312482863665,
580
  "step": 60
581
  },
582
  {
583
- "completion_length": 246.78125,
584
- "epoch": 3.4571428571428573,
585
- "grad_norm": 9.369564056396484,
586
- "kl": 4.713114297017455,
587
  "learning_rate": 1.7428748102551234e-07,
588
- "loss": 0.0047,
589
- "reward": 13.258399233222008,
590
- "reward_std": 6.423583036288619,
591
- "rewards/concensus_correctness_reward_func": 9.021875010803342,
592
- "rewards/consensus_reward_func": 1.25,
593
  "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 0.9375,
595
- "rewards/question_recreation_reward_func": 0.7111177369952202,
596
  "rewards/soft_format_reward_func": 0.0,
597
- "rewards/strict_format_reward_func": 0.234375,
598
- "rewards/xmlcount_reward_func": 1.103531252592802,
599
  "step": 62
600
  },
601
  {
602
- "completion_length": 263.75,
603
- "epoch": 3.571428571428571,
604
- "grad_norm": 34.83134078979492,
605
- "kl": 8.73400528007187,
606
  "learning_rate": 1.5902376575912814e-07,
607
- "loss": 0.0087,
608
- "reward": 13.156703546643257,
609
- "reward_std": 3.433466176968068,
610
- "rewards/concensus_correctness_reward_func": 8.570312479510903,
611
- "rewards/consensus_reward_func": 1.25,
612
  "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 1.1875,
614
- "rewards/question_recreation_reward_func": 0.7510160449892282,
615
  "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.265625,
617
- "rewards/xmlcount_reward_func": 1.1322499997913837,
618
  "step": 64
619
  },
620
  {
621
- "completion_length": 248.53125,
622
- "epoch": 3.685714285714286,
623
- "grad_norm": 7.867584228515625,
624
- "kl": 2.0124676286941394,
625
  "learning_rate": 1.4414163643562753e-07,
626
- "loss": 0.002,
627
- "reward": 16.860622122883797,
628
- "reward_std": 5.908110194373876,
629
- "rewards/concensus_correctness_reward_func": 11.929625045508146,
630
- "rewards/consensus_reward_func": 1.3125,
631
  "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 1.3125,
633
- "rewards/question_recreation_reward_func": 0.7619347143918276,
634
  "rewards/soft_format_reward_func": 0.0,
635
- "rewards/strict_format_reward_func": 0.375,
636
- "rewards/xmlcount_reward_func": 1.1690624989569187,
637
  "step": 66
638
  },
639
  {
640
- "completion_length": 264.8125,
641
- "epoch": 3.8,
642
- "grad_norm": 6.784933567047119,
643
- "kl": 1.5569924684241414,
644
  "learning_rate": 1.2970351387729872e-07,
645
- "loss": 0.0016,
646
- "reward": 12.743360340595245,
647
- "reward_std": 2.557829722936731,
648
- "rewards/concensus_correctness_reward_func": 8.357437543570995,
649
- "rewards/consensus_reward_func": 1.125,
650
  "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 1.25,
652
- "rewards/question_recreation_reward_func": 0.6942979600280523,
653
  "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.3125,
655
- "rewards/xmlcount_reward_func": 1.0041249990463257,
656
  "step": 68
657
  },
658
  {
659
- "completion_length": 261.5625,
660
- "epoch": 3.914285714285714,
661
- "grad_norm": 10.075136184692383,
662
- "kl": 2.913988582789898,
663
  "learning_rate": 1.1576995658775404e-07,
664
- "loss": 0.0029,
665
- "reward": 15.701346069574356,
666
- "reward_std": 4.226778008276597,
667
- "rewards/concensus_correctness_reward_func": 10.768500030040741,
668
- "rewards/consensus_reward_func": 1.1875,
669
- "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 1.6875,
671
- "rewards/question_recreation_reward_func": 0.7689711172133684,
672
- "rewards/soft_format_reward_func": 0.0,
673
- "rewards/strict_format_reward_func": 0.25,
674
- "rewards/xmlcount_reward_func": 1.0388750080019236,
675
  "step": 70
676
  },
677
  {
678
- "completion_length": 242.375,
679
- "epoch": 4.0,
680
- "grad_norm": 7.251352787017822,
681
- "kl": 2.0374175421893597,
682
  "learning_rate": 1.0239940674851941e-07,
683
- "loss": 0.0015,
684
- "reward": 16.076478640238445,
685
- "reward_std": 6.51126005128026,
686
- "rewards/concensus_correctness_reward_func": 11.290750041604042,
687
- "rewards/consensus_reward_func": 1.3333333333333333,
688
  "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 1.5,
690
- "rewards/question_recreation_reward_func": 0.7505618532498678,
691
  "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.22916666666666666,
693
- "rewards/xmlcount_reward_func": 0.9726666783293089,
694
  "step": 72
695
  },
696
  {
697
- "completion_length": 273.78125,
698
- "epoch": 4.114285714285714,
699
- "grad_norm": 11.66654109954834,
700
- "kl": 5.751777551136911,
701
  "learning_rate": 8.964794509221507e-08,
702
- "loss": 0.0058,
703
- "reward": 12.40931786596775,
704
- "reward_std": 4.601450197398663,
705
- "rewards/concensus_correctness_reward_func": 8.060937531292439,
706
- "rewards/consensus_reward_func": 0.9375,
707
  "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 1.4375,
709
- "rewards/question_recreation_reward_func": 0.6974428757093847,
710
  "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.21875,
712
- "rewards/xmlcount_reward_func": 1.0571874994784594,
713
  "step": 74
714
  },
715
  {
716
- "completion_length": 280.625,
717
- "epoch": 4.228571428571429,
718
- "grad_norm": 8.615840911865234,
719
- "kl": 2.3532106273341924,
720
  "learning_rate": 7.756905568047392e-08,
721
- "loss": 0.0024,
722
- "reward": 12.607302218675613,
723
- "reward_std": 4.357940239366144,
724
- "rewards/concensus_correctness_reward_func": 7.97631249576807,
725
- "rewards/consensus_reward_func": 1.3125,
726
  "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 1.25,
728
- "rewards/question_recreation_reward_func": 0.8087085485458374,
729
  "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.25,
731
- "rewards/xmlcount_reward_func": 1.009781249333173,
732
  "step": 76
733
  },
734
  {
735
- "completion_length": 228.125,
736
- "epoch": 4.3428571428571425,
737
- "grad_norm": 15.023426055908203,
738
- "kl": 4.856777695240453,
739
  "learning_rate": 6.621340157319996e-08,
740
- "loss": 0.0049,
741
- "reward": 15.21293680369854,
742
- "reward_std": 7.808012545632664,
743
- "rewards/concensus_correctness_reward_func": 10.372124960646033,
744
- "rewards/consensus_reward_func": 1.1875,
745
  "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 1.5,
747
- "rewards/question_recreation_reward_func": 0.7565617831423879,
748
  "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.296875,
750
- "rewards/xmlcount_reward_func": 1.0998749984428287,
751
  "step": 78
752
  },
753
  {
754
- "completion_length": 228.3125,
755
- "epoch": 4.457142857142857,
756
- "grad_norm": 14.391371726989746,
757
- "kl": 25.115882573183626,
758
  "learning_rate": 5.5628612330087724e-08,
759
- "loss": 0.0251,
760
- "reward": 10.238258555531502,
761
- "reward_std": 6.636154420673847,
762
- "rewards/concensus_correctness_reward_func": 5.875812470912933,
763
- "rewards/consensus_reward_func": 0.75,
764
  "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 1.625,
766
- "rewards/question_recreation_reward_func": 0.6746960403397679,
767
  "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.25,
769
- "rewards/xmlcount_reward_func": 1.0627500042319298,
770
  "step": 80
771
  },
772
  {
773
- "completion_length": 297.6875,
774
- "epoch": 4.571428571428571,
775
- "grad_norm": 9.444145202636719,
776
- "kl": 2.323721103835851,
777
  "learning_rate": 4.5859084235697235e-08,
778
- "loss": 0.0024,
779
- "reward": 14.212048962712288,
780
- "reward_std": 5.437735598199652,
781
- "rewards/concensus_correctness_reward_func": 9.653124989941716,
782
- "rewards/consensus_reward_func": 1.4375,
783
  "rewards/cumulative_reward_2": 0.0,
784
- "rewards/final_correctness_reward_func": 1.1875,
785
- "rewards/question_recreation_reward_func": 0.7724551558494568,
786
  "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.1875,
788
- "rewards/xmlcount_reward_func": 0.9739687610417604,
789
  "step": 82
790
  },
791
  {
792
- "completion_length": 263.28125,
793
- "epoch": 4.685714285714286,
794
- "grad_norm": 6.39396858215332,
795
- "kl": 7.9060614665504545,
796
  "learning_rate": 3.6945794086007705e-08,
797
- "loss": 0.0079,
798
- "reward": 16.380697011947632,
799
- "reward_std": 3.2155018125195056,
800
- "rewards/concensus_correctness_reward_func": 11.199437521398067,
801
- "rewards/consensus_reward_func": 1.5,
802
  "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 1.375,
804
- "rewards/question_recreation_reward_func": 0.854978347197175,
805
  "rewards/soft_format_reward_func": 0.0,
806
- "rewards/strict_format_reward_func": 0.265625,
807
- "rewards/xmlcount_reward_func": 1.1856562532484531,
808
  "step": 84
809
  },
810
  {
811
- "completion_length": 264.5,
812
- "epoch": 4.8,
813
- "grad_norm": 8.364204406738281,
814
- "kl": 5.299565837252885,
815
  "learning_rate": 2.892612731749414e-08,
816
- "loss": 0.0053,
817
- "reward": 15.947876825928688,
818
- "reward_std": 6.744944950565696,
819
- "rewards/concensus_correctness_reward_func": 11.113375023007393,
820
- "rewards/consensus_reward_func": 1.4375,
821
  "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 1.5,
823
- "rewards/question_recreation_reward_func": 0.697533005848527,
824
  "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.203125,
826
- "rewards/xmlcount_reward_func": 0.9963437505066395,
827
  "step": 86
828
  },
829
  {
830
- "completion_length": 239.28125,
831
- "epoch": 4.914285714285715,
832
- "grad_norm": 20.019237518310547,
833
- "kl": 6.356791608501226,
834
  "learning_rate": 2.183372119961499e-08,
835
- "loss": 0.0064,
836
- "reward": 17.015481919050217,
837
- "reward_std": 3.0891955395927653,
838
- "rewards/concensus_correctness_reward_func": 12.235625021159649,
839
- "rewards/consensus_reward_func": 1.3125,
840
  "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 1.5,
842
- "rewards/question_recreation_reward_func": 0.6681380509398878,
843
  "rewards/soft_format_reward_func": 0.0,
844
- "rewards/strict_format_reward_func": 0.203125,
845
- "rewards/xmlcount_reward_func": 1.0960937510244548,
846
  "step": 88
847
  },
848
  {
849
- "completion_length": 260.2916666666667,
850
- "epoch": 5.0,
851
- "grad_norm": 5.369960784912109,
852
- "kl": 7.677339844405651,
853
  "learning_rate": 1.5698323748414122e-08,
854
- "loss": 0.0058,
855
- "reward": 12.575575411319733,
856
- "reward_std": 9.121821681658426,
857
- "rewards/concensus_correctness_reward_func": 8.391249996920427,
858
- "rewards/consensus_reward_func": 0.9166666666666666,
859
  "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 1.5,
861
- "rewards/question_recreation_reward_func": 0.7138255399962267,
862
  "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.125,
864
- "rewards/xmlcount_reward_func": 0.9288333433990678,
865
  "step": 90
866
  },
867
  {
868
- "completion_length": 239.96875,
869
- "epoch": 5.114285714285714,
870
- "grad_norm": 96.5102310180664,
871
- "kl": 18.3592914538458,
872
  "learning_rate": 1.054566895300324e-08,
873
- "loss": 0.0184,
874
- "reward": 13.847605228424072,
875
- "reward_std": 6.045313289389014,
876
- "rewards/concensus_correctness_reward_func": 9.380875006318092,
877
- "rewards/consensus_reward_func": 1.125,
878
  "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 1.4375,
880
- "rewards/question_recreation_reward_func": 0.6726677799597383,
881
  "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.25,
883
- "rewards/xmlcount_reward_func": 0.981562502682209,
884
  "step": 92
885
  },
886
  {
887
- "completion_length": 285.28125,
888
- "epoch": 5.228571428571429,
889
- "grad_norm": 99.03144073486328,
890
- "kl": 15.879234082996845,
891
  "learning_rate": 6.397368838268496e-09,
892
- "loss": 0.0159,
893
- "reward": 9.960642769932747,
894
- "reward_std": 5.102133866865188,
895
- "rewards/concensus_correctness_reward_func": 6.02468744199723,
896
- "rewards/consensus_reward_func": 1.3125,
897
  "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 0.9375,
899
- "rewards/question_recreation_reward_func": 0.7212677141651511,
900
  "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.15625,
902
- "rewards/xmlcount_reward_func": 0.8084374981699511,
903
  "step": 94
904
  },
905
  {
906
- "completion_length": 240.71875,
907
- "epoch": 5.3428571428571425,
908
- "grad_norm": 52.954837799072266,
909
- "kl": 20.95912204636261,
910
  "learning_rate": 3.2708228165273244e-09,
911
- "loss": 0.021,
912
- "reward": 15.641045704483986,
913
- "reward_std": 3.0330299666675273,
914
- "rewards/concensus_correctness_reward_func": 10.918187517672777,
915
- "rewards/consensus_reward_func": 1.375,
916
  "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 1.4375,
918
- "rewards/question_recreation_reward_func": 0.5993270333856344,
919
  "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.234375,
921
- "rewards/xmlcount_reward_func": 1.076656258199364,
922
  "step": 96
923
  },
924
  {
925
- "completion_length": 234.71875,
926
- "epoch": 5.457142857142857,
927
- "grad_norm": 16.95273208618164,
928
- "kl": 11.928820116794668,
929
  "learning_rate": 1.1791447083465133e-09,
930
- "loss": 0.0119,
931
- "reward": 12.215655967593193,
932
- "reward_std": 8.51226019859314,
933
- "rewards/concensus_correctness_reward_func": 7.525312505662441,
934
- "rewards/consensus_reward_func": 0.875,
935
  "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 1.5625,
937
- "rewards/question_recreation_reward_func": 0.8116247765719891,
938
  "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.25,
940
- "rewards/xmlcount_reward_func": 1.1912187524139881,
941
  "step": 98
942
  },
943
  {
944
- "completion_length": 220.0625,
945
- "epoch": 5.571428571428571,
946
- "grad_norm": 21.187820434570312,
947
- "kl": 11.71281518554315,
948
  "learning_rate": 1.3110773862126667e-10,
949
- "loss": 0.0117,
950
- "reward": 13.504412293434143,
951
- "reward_std": 6.072454041801393,
952
- "rewards/concensus_correctness_reward_func": 9.202812563627958,
953
- "rewards/consensus_reward_func": 1.0,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 1.375,
956
- "rewards/question_recreation_reward_func": 0.6520686270669103,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.234375,
959
- "rewards/xmlcount_reward_func": 1.040156252682209,
960
  "step": 100
961
  },
962
  {
963
- "epoch": 5.571428571428571,
964
  "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 0.018054345073833248,
967
- "train_runtime": 1357.263,
968
- "train_samples_per_second": 1.179,
969
- "train_steps_per_second": 0.074
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
- "num_train_epochs": 6,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 16.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 248.21875,
14
+ "epoch": 0.3333333333333333,
15
+ "grad_norm": 3.4614758491516113,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": -0.0,
19
+ "reward": 1.7851608581840992,
20
+ "reward_std": 1.0083980893250555,
21
+ "rewards/concensus_correctness_reward_func": 0.12012499943375587,
22
+ "rewards/consensus_reward_func": 0.4375,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.25,
25
+ "rewards/question_recreation_reward_func": 0.518067104741931,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.046875,
28
+ "rewards/xmlcount_reward_func": 0.4125937600620091,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 273.46875,
33
+ "epoch": 0.6666666666666666,
34
+ "grad_norm": 2.2614688873291016,
35
+ "kl": 0.000583945713515277,
36
  "learning_rate": 5e-07,
37
  "loss": 0.0,
38
+ "reward": 2.26899578794837,
39
+ "reward_std": 0.8781831180676818,
40
+ "rewards/concensus_correctness_reward_func": 0.18174999952316284,
41
+ "rewards/consensus_reward_func": 0.625,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.125,
44
+ "rewards/question_recreation_reward_func": 0.6493395259603858,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.046875,
47
+ "rewards/xmlcount_reward_func": 0.6410312540829182,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 230.65625,
52
+ "epoch": 1.0,
53
+ "grad_norm": 2.344890594482422,
54
+ "kl": 0.0007161081903177546,
55
  "learning_rate": 4.994757065594279e-07,
56
+ "loss": 0.0,
57
+ "reward": 1.7817438933998346,
58
+ "reward_std": 0.8162978617474437,
59
+ "rewards/concensus_correctness_reward_func": 0.09124999865889549,
60
+ "rewards/consensus_reward_func": 0.6875,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.0,
63
+ "rewards/question_recreation_reward_func": 0.4062751979799941,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.046875,
66
+ "rewards/xmlcount_reward_func": 0.5498437532223761,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 219.8125,
71
+ "epoch": 1.3333333333333333,
72
+ "grad_norm": 26.660634994506836,
73
+ "kl": 0.0012114797100366559,
74
  "learning_rate": 4.979050253066063e-07,
75
+ "loss": -0.0,
76
+ "reward": 1.8910054825246334,
77
+ "reward_std": 1.1408569859668205,
78
+ "rewards/concensus_correctness_reward_func": 0.12249999865889549,
79
+ "rewards/consensus_reward_func": 0.4375,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.125,
82
+ "rewards/question_recreation_reward_func": 0.536755473469384,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.078125,
85
+ "rewards/xmlcount_reward_func": 0.5911250002682209,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 202.46875,
90
+ "epoch": 1.6666666666666665,
91
+ "grad_norm": 7.760759353637695,
92
+ "kl": 0.0012618956752703525,
93
  "learning_rate": 4.952945442245597e-07,
94
+ "loss": 0.0,
95
+ "reward": 1.9051898624747992,
96
+ "reward_std": 0.958464287687093,
97
+ "rewards/concensus_correctness_reward_func": 0.12012499943375587,
98
+ "rewards/consensus_reward_func": 0.4375,
99
+ "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.1875,
101
+ "rewards/question_recreation_reward_func": 0.3846898500341922,
102
+ "rewards/soft_format_reward_func": 0.015625,
103
+ "rewards/strict_format_reward_func": 0.09375,
104
+ "rewards/xmlcount_reward_func": 0.6660000011324883,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 336.28125,
109
+ "epoch": 2.0,
110
+ "grad_norm": 2.4356420040130615,
111
+ "kl": 0.0008406055276282132,
112
  "learning_rate": 4.916552125781528e-07,
113
+ "loss": 0.0,
114
+ "reward": 1.8280094247311354,
115
+ "reward_std": 1.094328472390771,
116
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
117
+ "rewards/consensus_reward_func": 0.5625,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.3125,
120
+ "rewards/question_recreation_reward_func": 0.5948844254016876,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.015625,
123
+ "rewards/xmlcount_reward_func": 0.28237500321120024,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 282.8125,
128
+ "epoch": 2.3333333333333335,
129
+ "grad_norm": 2.0542495250701904,
130
+ "kl": 0.0009237846843461739,
131
  "learning_rate": 4.870022949890676e-07,
132
+ "loss": 0.0,
133
+ "reward": 1.8283372893929482,
134
+ "reward_std": 1.189666131976992,
135
+ "rewards/concensus_correctness_reward_func": 0.23618749901652336,
136
+ "rewards/consensus_reward_func": 0.625,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.125,
139
+ "rewards/question_recreation_reward_func": 0.6880872882902622,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.0,
142
+ "rewards/xmlcount_reward_func": 0.15406249510124326,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 266.84375,
147
+ "epoch": 2.6666666666666665,
148
+ "grad_norm": 3.6135895252227783,
149
+ "kl": 0.0011918679920199793,
150
  "learning_rate": 4.81355307410676e-07,
151
+ "loss": 0.0,
152
+ "reward": 1.9164189714938402,
153
+ "reward_std": 1.160076865926385,
154
+ "rewards/concensus_correctness_reward_func": 0.11756250262260437,
155
+ "rewards/consensus_reward_func": 0.625,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 0.125,
158
+ "rewards/question_recreation_reward_func": 0.48829398211091757,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.046875,
161
+ "rewards/xmlcount_reward_func": 0.513687499333173,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 238.6875,
166
+ "epoch": 3.0,
167
+ "grad_norm": 2.7586710453033447,
168
+ "kl": 0.0012445114989532158,
169
  "learning_rate": 4.747379352713488e-07,
170
+ "loss": 0.0,
171
+ "reward": 1.9129376076161861,
172
+ "reward_std": 0.9141782382503152,
173
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
174
+ "rewards/consensus_reward_func": 0.5625,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.125,
177
+ "rewards/question_recreation_reward_func": 0.5800001341849566,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.015625,
180
+ "rewards/xmlcount_reward_func": 0.5698124971240759,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 243.90625,
185
+ "epoch": 3.3333333333333335,
186
+ "grad_norm": 3.191770553588867,
187
+ "kl": 0.0015616379932907876,
188
  "learning_rate": 4.6717793412953776e-07,
189
+ "loss": 0.0,
190
+ "reward": 1.7438320461660624,
191
+ "reward_std": 1.1812182366847992,
192
+ "rewards/concensus_correctness_reward_func": 0.1197500005364418,
193
+ "rewards/consensus_reward_func": 0.75,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.0625,
196
+ "rewards/question_recreation_reward_func": 0.40089456969872117,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.015625,
199
+ "rewards/xmlcount_reward_func": 0.3950625052675605,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 295.375,
204
+ "epoch": 3.6666666666666665,
205
+ "grad_norm": 2.1044507026672363,
206
+ "kl": 0.002216623950516805,
207
  "learning_rate": 4.5870701325731773e-07,
208
+ "loss": 0.0,
209
+ "reward": 2.3217686265707016,
210
+ "reward_std": 0.9629865860333666,
211
+ "rewards/concensus_correctness_reward_func": 0.12012499943375587,
212
+ "rewards/consensus_reward_func": 0.8125,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.3125,
215
+ "rewards/question_recreation_reward_func": 0.5311123521532863,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.03125,
218
+ "rewards/xmlcount_reward_func": 0.5142812496051192,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 250.9375,
223
+ "epoch": 4.0,
224
+ "grad_norm": 2.842360496520996,
225
+ "kl": 0.001575836147821974,
226
  "learning_rate": 4.4936070264068016e-07,
227
+ "loss": 0.0,
228
+ "reward": 1.8550556376576424,
229
+ "reward_std": 1.1311865192838013,
230
+ "rewards/concensus_correctness_reward_func": 0.11993750184774399,
231
+ "rewards/consensus_reward_func": 0.5,
232
  "rewards/cumulative_reward_2": 0.0,
233
+ "rewards/final_correctness_reward_func": 0.25,
234
+ "rewards/question_recreation_reward_func": 0.6037118844687939,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.015625,
237
+ "rewards/xmlcount_reward_func": 0.36578124947845936,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 236.375,
242
+ "epoch": 4.333333333333333,
243
+ "grad_norm": 3.5260348320007324,
244
+ "kl": 0.0018173266253143083,
245
  "learning_rate": 4.391782039544238e-07,
246
+ "loss": 0.0,
247
+ "reward": 2.022034127265215,
248
+ "reward_std": 0.9448904548771679,
249
+ "rewards/concensus_correctness_reward_func": 0.26981250010430813,
250
+ "rewards/consensus_reward_func": 0.8125,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.1875,
253
+ "rewards/question_recreation_reward_func": 0.410690407268703,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.0,
256
+ "rewards/xmlcount_reward_func": 0.34153125202283263,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 222.1875,
261
+ "epoch": 4.666666666666667,
262
+ "grad_norm": 2.8555054664611816,
263
+ "kl": 0.0025639020168455318,
264
  "learning_rate": 4.282022261367073e-07,
265
+ "loss": 0.0,
266
+ "reward": 1.850969212129712,
267
+ "reward_std": 0.8895461307838559,
268
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
269
+ "rewards/consensus_reward_func": 0.625,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.0625,
272
+ "rewards/question_recreation_reward_func": 0.5744379386305809,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.015625,
275
+ "rewards/xmlcount_reward_func": 0.5134062475990504,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 312.1875,
280
+ "epoch": 5.0,
281
+ "grad_norm": 2.6812455654144287,
282
+ "kl": 0.002034817836829461,
283
  "learning_rate": 4.1647880625292027e-07,
284
+ "loss": 0.0,
285
+ "reward": 2.235242122784257,
286
+ "reward_std": 0.903150615748018,
287
+ "rewards/concensus_correctness_reward_func": 0.302437499165535,
288
+ "rewards/consensus_reward_func": 0.625,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 0.0625,
291
+ "rewards/question_recreation_reward_func": 0.701929610222578,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.0625,
294
+ "rewards/xmlcount_reward_func": 0.48087499663233757,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 222.03125,
299
+ "epoch": 5.333333333333333,
300
+ "grad_norm": 3.4555599689483643,
301
+ "kl": 0.0031993754600989632,
302
  "learning_rate": 4.040571164002318e-07,
303
+ "loss": 0.0,
304
+ "reward": 2.742512159049511,
305
+ "reward_std": 1.1281160314101726,
306
+ "rewards/concensus_correctness_reward_func": 0.42487499490380287,
307
+ "rewards/consensus_reward_func": 0.625,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 0.375,
310
+ "rewards/question_recreation_reward_func": 0.6586684007197618,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.0625,
313
+ "rewards/xmlcount_reward_func": 0.5964687503874302,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 289.875,
318
+ "epoch": 5.666666666666667,
319
+ "grad_norm": 2.337428092956543,
320
+ "kl": 0.0029264501317811664,
321
  "learning_rate": 3.909892574627266e-07,
322
+ "loss": 0.0,
323
+ "reward": 1.9419155293144286,
324
+ "reward_std": 1.168605322483927,
325
+ "rewards/concensus_correctness_reward_func": 0.30256250128149986,
326
+ "rewards/consensus_reward_func": 0.75,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.0625,
329
+ "rewards/question_recreation_reward_func": 0.4774467754177749,
330
  "rewards/soft_format_reward_func": 0.0,
331
+ "rewards/strict_format_reward_func": 0.03125,
332
+ "rewards/xmlcount_reward_func": 0.31815625075250864,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 271.75,
337
+ "epoch": 6.0,
338
+ "grad_norm": 3.06026291847229,
339
+ "kl": 0.0034197392051282804,
340
  "learning_rate": 3.773300405821908e-07,
341
+ "loss": 0.0,
342
+ "reward": 2.5837525203824043,
343
+ "reward_std": 1.1307801827788353,
344
+ "rewards/concensus_correctness_reward_func": 0.29768750071525574,
345
+ "rewards/consensus_reward_func": 1.0,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 0.3125,
348
+ "rewards/question_recreation_reward_func": 0.54522127751261,
349
  "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.03125,
351
+ "rewards/xmlcount_reward_func": 0.3970937414560467,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 273.96875,
356
+ "epoch": 6.333333333333333,
357
+ "grad_norm": 6.918361186981201,
358
+ "kl": 0.0025484783691354096,
359
  "learning_rate": 3.6313675726113475e-07,
360
+ "loss": 0.0,
361
+ "reward": 1.684893824160099,
362
+ "reward_std": 1.0457911072298884,
363
+ "rewards/concensus_correctness_reward_func": 0.1797499991953373,
364
+ "rewards/consensus_reward_func": 0.5625,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.0,
367
+ "rewards/question_recreation_reward_func": 0.5752063244581223,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.0,
370
+ "rewards/xmlcount_reward_func": 0.3674375033006072,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 268.4375,
375
+ "epoch": 6.666666666666667,
376
+ "grad_norm": 2.9002139568328857,
377
+ "kl": 0.002866584218281787,
378
  "learning_rate": 3.484689390623218e-07,
379
+ "loss": 0.0,
380
+ "reward": 1.900494983419776,
381
+ "reward_std": 1.2571525508537889,
382
+ "rewards/concensus_correctness_reward_func": 0.24512499943375587,
383
+ "rewards/consensus_reward_func": 0.625,
384
  "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 0.125,
386
+ "rewards/question_recreation_reward_func": 0.6020261840894818,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.0,
389
+ "rewards/xmlcount_reward_func": 0.3033437514677644,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 292.96875,
394
+ "epoch": 7.0,
395
+ "grad_norm": 2.595947742462158,
396
+ "kl": 0.0034651760361157358,
397
  "learning_rate": 3.3338810791270517e-07,
398
+ "loss": 0.0,
399
+ "reward": 2.582839421927929,
400
+ "reward_std": 0.8570395507849753,
401
+ "rewards/concensus_correctness_reward_func": 0.42537499964237213,
402
+ "rewards/consensus_reward_func": 0.6875,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 0.375,
405
+ "rewards/question_recreation_reward_func": 0.5414331587962806,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.03125,
408
+ "rewards/xmlcount_reward_func": 0.5222812481224537,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 277.78125,
413
+ "epoch": 7.333333333333333,
414
+ "grad_norm": 3.5444931983947754,
415
+ "kl": 0.005151846205990296,
416
  "learning_rate": 3.179575180590857e-07,
417
+ "loss": 0.0,
418
+ "reward": 2.2039641477167606,
419
+ "reward_std": 1.085899718105793,
420
+ "rewards/concensus_correctness_reward_func": 0.1823749952018261,
421
+ "rewards/consensus_reward_func": 0.75,
422
  "rewards/cumulative_reward_2": 0.0,
423
+ "rewards/final_correctness_reward_func": 0.1875,
424
+ "rewards/question_recreation_reward_func": 0.4583704025717452,
425
  "rewards/soft_format_reward_func": 0.0,
426
+ "rewards/strict_format_reward_func": 0.015625,
427
+ "rewards/xmlcount_reward_func": 0.6100937575101852,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 255.9375,
432
+ "epoch": 7.666666666666667,
433
+ "grad_norm": 2.611354112625122,
434
+ "kl": 0.004227373341564089,
435
  "learning_rate": 3.022418907578188e-07,
436
+ "loss": 0.0,
437
+ "reward": 1.8945876602083445,
438
+ "reward_std": 0.9345793006941676,
439
+ "rewards/concensus_correctness_reward_func": 0.18268750235438347,
440
+ "rewards/consensus_reward_func": 0.5,
441
  "rewards/cumulative_reward_2": 0.0,
442
+ "rewards/final_correctness_reward_func": 0.1875,
443
+ "rewards/question_recreation_reward_func": 0.56027518119663,
444
  "rewards/soft_format_reward_func": 0.0,
445
+ "rewards/strict_format_reward_func": 0.015625,
446
+ "rewards/xmlcount_reward_func": 0.4484999952837825,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 211.78125,
451
+ "epoch": 8.0,
452
+ "grad_norm": 7.4239501953125,
453
+ "kl": 0.005186592134123202,
454
  "learning_rate": 2.863071428113726e-07,
455
+ "loss": 0.0,
456
+ "reward": 2.3263955637812614,
457
+ "reward_std": 0.9710670886561275,
458
+ "rewards/concensus_correctness_reward_func": 0.05962499976158142,
459
+ "rewards/consensus_reward_func": 0.75,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 0.25,
462
+ "rewards/question_recreation_reward_func": 0.6223018001765013,
463
  "rewards/soft_format_reward_func": 0.0,
464
+ "rewards/strict_format_reward_func": 0.046875,
465
+ "rewards/xmlcount_reward_func": 0.5975937452167273,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 237.625,
470
+ "epoch": 8.333333333333334,
471
+ "grad_norm": 3.110980272293091,
472
+ "kl": 0.005397260436438955,
473
  "learning_rate": 2.7022011009035107e-07,
474
+ "loss": 0.0,
475
+ "reward": 2.3635387681424618,
476
+ "reward_std": 1.1881022350862622,
477
+ "rewards/concensus_correctness_reward_func": 0.1797499991953373,
478
+ "rewards/consensus_reward_func": 0.875,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 0.1875,
481
+ "rewards/question_recreation_reward_func": 0.4961012676358223,
482
  "rewards/soft_format_reward_func": 0.0,
483
+ "rewards/strict_format_reward_func": 0.015625,
484
+ "rewards/xmlcount_reward_func": 0.6095625087618828,
485
  "step": 50
486
  },
487
  {
488
+ "completion_length": 302.375,
489
+ "epoch": 8.666666666666666,
490
+ "grad_norm": 3.359147787094116,
491
+ "kl": 0.00526040667318739,
492
  "learning_rate": 2.540482672006254e-07,
493
+ "loss": 0.0,
494
+ "reward": 1.951256174594164,
495
+ "reward_std": 1.4674215724226087,
496
+ "rewards/concensus_correctness_reward_func": 0.14731250144541264,
497
+ "rewards/consensus_reward_func": 0.5625,
498
  "rewards/cumulative_reward_2": 0.0,
499
+ "rewards/final_correctness_reward_func": 0.3125,
500
+ "rewards/question_recreation_reward_func": 0.5990999331697822,
501
  "rewards/soft_format_reward_func": 0.0,
502
+ "rewards/strict_format_reward_func": 0.03125,
503
+ "rewards/xmlcount_reward_func": 0.29859375115484,
504
  "step": 52
505
  },
506
  {
507
+ "completion_length": 271.34375,
508
+ "epoch": 9.0,
509
+ "grad_norm": 2.3752264976501465,
510
+ "kl": 0.006538017390994355,
511
  "learning_rate": 2.37859444471388e-07,
512
+ "loss": 0.0,
513
+ "reward": 2.0346078351140022,
514
+ "reward_std": 0.7186543988646008,
515
+ "rewards/concensus_correctness_reward_func": 0.24512499943375587,
516
+ "rewards/consensus_reward_func": 0.5625,
517
  "rewards/cumulative_reward_2": 0.0,
518
+ "rewards/final_correctness_reward_func": 0.0625,
519
+ "rewards/question_recreation_reward_func": 0.5727328350767493,
520
  "rewards/soft_format_reward_func": 0.0,
521
+ "rewards/strict_format_reward_func": 0.015625,
522
+ "rewards/xmlcount_reward_func": 0.5761250024661422,
523
  "step": 54
524
  },
525
  {
526
+ "completion_length": 258.5625,
527
+ "epoch": 9.333333333333334,
528
+ "grad_norm": 2.415621519088745,
529
+ "kl": 0.0068657751398859546,
530
  "learning_rate": 2.2172154345117894e-07,
531
+ "loss": 0.0,
532
+ "reward": 2.7013516426086426,
533
+ "reward_std": 1.1713198693469167,
534
+ "rewards/concensus_correctness_reward_func": 0.17987500131130219,
535
+ "rewards/consensus_reward_func": 1.0625,
536
  "rewards/cumulative_reward_2": 0.0,
537
+ "rewards/final_correctness_reward_func": 0.25,
538
+ "rewards/question_recreation_reward_func": 0.5597578389570117,
539
  "rewards/soft_format_reward_func": 0.0,
540
+ "rewards/strict_format_reward_func": 0.0625,
541
+ "rewards/xmlcount_reward_func": 0.5867187539115548,
542
  "step": 56
543
  },
544
  {
545
+ "completion_length": 250.4375,
546
+ "epoch": 9.666666666666666,
547
+ "grad_norm": 3.3267598152160645,
548
+ "kl": 0.007265938082127832,
549
  "learning_rate": 2.0570225210519433e-07,
550
+ "loss": 0.0,
551
+ "reward": 1.7129965675994754,
552
+ "reward_std": 0.9574196808971465,
553
+ "rewards/concensus_correctness_reward_func": 0.24262499809265137,
554
+ "rewards/consensus_reward_func": 0.3125,
555
  "rewards/cumulative_reward_2": 0.0,
556
+ "rewards/final_correctness_reward_func": 0.125,
557
+ "rewards/question_recreation_reward_func": 0.5204653043183498,
558
  "rewards/soft_format_reward_func": 0.0,
559
+ "rewards/strict_format_reward_func": 0.03125,
560
+ "rewards/xmlcount_reward_func": 0.4811562584945932,
561
  "step": 58
562
  },
563
  {
564
+ "completion_length": 294.5625,
565
+ "epoch": 10.0,
566
+ "grad_norm": 3.0382344722747803,
567
+ "kl": 0.007241527950100135,
568
  "learning_rate": 1.8986876090843664e-07,
569
+ "loss": 0.0,
570
+ "reward": 1.9392794030718505,
571
+ "reward_std": 1.0110920977240312,
572
+ "rewards/concensus_correctness_reward_func": 0.11993750184774399,
573
+ "rewards/consensus_reward_func": 0.8125,
574
  "rewards/cumulative_reward_2": 0.0,
575
+ "rewards/final_correctness_reward_func": 0.0625,
576
+ "rewards/question_recreation_reward_func": 0.500998193398118,
577
  "rewards/soft_format_reward_func": 0.0,
578
+ "rewards/strict_format_reward_func": 0.015625,
579
+ "rewards/xmlcount_reward_func": 0.42771875113248825,
580
  "step": 60
581
  },
582
  {
583
+ "completion_length": 222.125,
584
+ "epoch": 10.333333333333334,
585
+ "grad_norm": 3.4102041721343994,
586
+ "kl": 0.00738686349359341,
587
  "learning_rate": 1.7428748102551234e-07,
588
+ "loss": 0.0,
589
+ "reward": 1.7328537330031395,
590
+ "reward_std": 0.9404821525095031,
591
+ "rewards/concensus_correctness_reward_func": 0.1223749965429306,
592
+ "rewards/consensus_reward_func": 0.5625,
593
  "rewards/cumulative_reward_2": 0.0,
594
+ "rewards/final_correctness_reward_func": 0.0,
595
+ "rewards/question_recreation_reward_func": 0.45538498694077134,
596
  "rewards/soft_format_reward_func": 0.0,
597
+ "rewards/strict_format_reward_func": 0.03125,
598
+ "rewards/xmlcount_reward_func": 0.5613437537103891,
599
  "step": 62
600
  },
601
  {
602
+ "completion_length": 322.46875,
603
+ "epoch": 10.666666666666666,
604
+ "grad_norm": 2.5114853382110596,
605
+ "kl": 0.005600389951723628,
606
  "learning_rate": 1.5902376575912814e-07,
607
+ "loss": 0.0,
608
+ "reward": 2.345261871814728,
609
+ "reward_std": 1.4083554986864328,
610
+ "rewards/concensus_correctness_reward_func": 0.35993750020861626,
611
+ "rewards/consensus_reward_func": 0.6875,
612
  "rewards/cumulative_reward_2": 0.0,
613
+ "rewards/final_correctness_reward_func": 0.25,
614
+ "rewards/question_recreation_reward_func": 0.621855610050261,
615
  "rewards/soft_format_reward_func": 0.0,
616
+ "rewards/strict_format_reward_func": 0.015625,
617
+ "rewards/xmlcount_reward_func": 0.4103437541052699,
618
  "step": 64
619
  },
620
  {
621
+ "completion_length": 238.65625,
622
+ "epoch": 11.0,
623
+ "grad_norm": 2.935357093811035,
624
+ "kl": 0.00783520688128192,
625
  "learning_rate": 1.4414163643562753e-07,
626
+ "loss": 0.0,
627
+ "reward": 2.1366525441408157,
628
+ "reward_std": 1.0672186479205266,
629
+ "rewards/concensus_correctness_reward_func": 0.11743750050663948,
630
+ "rewards/consensus_reward_func": 0.5625,
631
  "rewards/cumulative_reward_2": 0.0,
632
+ "rewards/final_correctness_reward_func": 0.1875,
633
+ "rewards/question_recreation_reward_func": 0.588996303267777,
634
  "rewards/soft_format_reward_func": 0.0,
635
+ "rewards/strict_format_reward_func": 0.0,
636
+ "rewards/xmlcount_reward_func": 0.6802187506109476,
637
  "step": 66
638
  },
639
  {
640
+ "completion_length": 266.90625,
641
+ "epoch": 11.333333333333334,
642
+ "grad_norm": 2.8767664432525635,
643
+ "kl": 0.006433373549953103,
644
  "learning_rate": 1.2970351387729872e-07,
645
+ "loss": 0.0,
646
+ "reward": 1.6148663735948503,
647
+ "reward_std": 1.3225993164815009,
648
+ "rewards/concensus_correctness_reward_func": 0.20812500081956387,
649
+ "rewards/consensus_reward_func": 0.4375,
650
  "rewards/cumulative_reward_2": 0.0,
651
+ "rewards/final_correctness_reward_func": 0.0625,
652
+ "rewards/question_recreation_reward_func": 0.5250538997352123,
653
  "rewards/soft_format_reward_func": 0.0,
654
+ "rewards/strict_format_reward_func": 0.03125,
655
+ "rewards/xmlcount_reward_func": 0.3504374986514449,
656
  "step": 68
657
  },
658
  {
659
+ "completion_length": 250.21875,
660
+ "epoch": 11.666666666666666,
661
+ "grad_norm": 4.7036638259887695,
662
+ "kl": 0.007804472814314067,
663
  "learning_rate": 1.1576995658775404e-07,
664
+ "loss": 0.0,
665
+ "reward": 2.2799468226730824,
666
+ "reward_std": 1.166436342522502,
667
+ "rewards/concensus_correctness_reward_func": 0.30262500047683716,
668
+ "rewards/consensus_reward_func": 0.6875,
669
+ "rewards/cumulative_reward_2": 0.0,
670
+ "rewards/final_correctness_reward_func": 0.1875,
671
+ "rewards/question_recreation_reward_func": 0.5988530963659286,
672
+ "rewards/soft_format_reward_func": 0.015625,
673
+ "rewards/strict_format_reward_func": 0.015625,
674
+ "rewards/xmlcount_reward_func": 0.47221875097602606,
675
  "step": 70
676
  },
677
  {
678
+ "completion_length": 248.9375,
679
+ "epoch": 12.0,
680
+ "grad_norm": 2.7038285732269287,
681
+ "kl": 0.00650077304453589,
682
  "learning_rate": 1.0239940674851941e-07,
683
+ "loss": 0.0,
684
+ "reward": 1.7189936628565192,
685
+ "reward_std": 1.1028610868379474,
686
+ "rewards/concensus_correctness_reward_func": 0.11999999731779099,
687
+ "rewards/consensus_reward_func": 0.5625,
688
  "rewards/cumulative_reward_2": 0.0,
689
+ "rewards/final_correctness_reward_func": 0.125,
690
+ "rewards/question_recreation_reward_func": 0.4415561552159488,
691
  "rewards/soft_format_reward_func": 0.0,
692
+ "rewards/strict_format_reward_func": 0.015625,
693
+ "rewards/xmlcount_reward_func": 0.4543124968186021,
694
  "step": 72
695
  },
696
  {
697
+ "completion_length": 241.9375,
698
+ "epoch": 12.333333333333334,
699
+ "grad_norm": 3.6059765815734863,
700
+ "kl": 0.007852518698200583,
701
  "learning_rate": 8.964794509221507e-08,
702
+ "loss": 0.0,
703
+ "reward": 1.7060329876840115,
704
+ "reward_std": 1.2234434876590967,
705
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
706
+ "rewards/consensus_reward_func": 0.6875,
707
  "rewards/cumulative_reward_2": 0.0,
708
+ "rewards/final_correctness_reward_func": 0.0625,
709
+ "rewards/question_recreation_reward_func": 0.4156580241397023,
710
  "rewards/soft_format_reward_func": 0.0,
711
+ "rewards/strict_format_reward_func": 0.03125,
712
+ "rewards/xmlcount_reward_func": 0.44912500213831663,
713
  "step": 74
714
  },
715
  {
716
+ "completion_length": 286.28125,
717
+ "epoch": 12.666666666666666,
718
+ "grad_norm": 2.444171190261841,
719
+ "kl": 0.00814603897742927,
720
  "learning_rate": 7.756905568047392e-08,
721
+ "loss": 0.0,
722
+ "reward": 2.122521236538887,
723
+ "reward_std": 0.9578379703452811,
724
+ "rewards/concensus_correctness_reward_func": 0.42274999618530273,
725
+ "rewards/consensus_reward_func": 0.3125,
726
  "rewards/cumulative_reward_2": 0.0,
727
+ "rewards/final_correctness_reward_func": 0.25,
728
+ "rewards/question_recreation_reward_func": 0.504114959621802,
729
  "rewards/soft_format_reward_func": 0.0,
730
+ "rewards/strict_format_reward_func": 0.015625,
731
+ "rewards/xmlcount_reward_func": 0.6175312478444539,
732
  "step": 76
733
  },
734
  {
735
+ "completion_length": 234.6875,
736
+ "epoch": 13.0,
737
+ "grad_norm": 3.0294716358184814,
738
+ "kl": 0.008441783153102733,
739
  "learning_rate": 6.621340157319996e-08,
740
+ "loss": 0.0,
741
+ "reward": 2.081810735166073,
742
+ "reward_std": 0.9143577099894173,
743
+ "rewards/concensus_correctness_reward_func": 0.125,
744
+ "rewards/consensus_reward_func": 0.8125,
745
  "rewards/cumulative_reward_2": 0.0,
746
+ "rewards/final_correctness_reward_func": 0.25,
747
+ "rewards/question_recreation_reward_func": 0.5170919904485345,
748
  "rewards/soft_format_reward_func": 0.0,
749
+ "rewards/strict_format_reward_func": 0.0,
750
+ "rewards/xmlcount_reward_func": 0.37721875379793346,
751
  "step": 78
752
  },
753
  {
754
+ "completion_length": 224.40625,
755
+ "epoch": 13.333333333333334,
756
+ "grad_norm": 3.4747705459594727,
757
+ "kl": 0.01025624800240621,
758
  "learning_rate": 5.5628612330087724e-08,
759
+ "loss": 0.0,
760
+ "reward": 1.9557546898722649,
761
+ "reward_std": 0.9802739145234227,
762
+ "rewards/concensus_correctness_reward_func": 0.24450000002980232,
763
+ "rewards/consensus_reward_func": 0.5625,
764
  "rewards/cumulative_reward_2": 0.0,
765
+ "rewards/final_correctness_reward_func": 0.0625,
766
+ "rewards/question_recreation_reward_func": 0.562535947188735,
767
  "rewards/soft_format_reward_func": 0.0,
768
+ "rewards/strict_format_reward_func": 0.03125,
769
+ "rewards/xmlcount_reward_func": 0.4924687468446791,
770
  "step": 80
771
  },
772
  {
773
+ "completion_length": 234.75,
774
+ "epoch": 13.666666666666666,
775
+ "grad_norm": 2.7190427780151367,
776
+ "kl": 0.008365696892724372,
777
  "learning_rate": 4.5859084235697235e-08,
778
+ "loss": 0.0,
779
+ "reward": 1.6996353343129158,
780
+ "reward_std": 0.6642087557120249,
781
+ "rewards/concensus_correctness_reward_func": 0.05999999865889549,
782
+ "rewards/consensus_reward_func": 0.5625,
783
  "rewards/cumulative_reward_2": 0.0,
784
+ "rewards/final_correctness_reward_func": 0.125,
785
+ "rewards/question_recreation_reward_func": 0.395666572265327,
786
  "rewards/soft_format_reward_func": 0.0,
787
+ "rewards/strict_format_reward_func": 0.046875,
788
+ "rewards/xmlcount_reward_func": 0.5095937536098063,
789
  "step": 82
790
  },
791
  {
792
+ "completion_length": 263.96875,
793
+ "epoch": 14.0,
794
+ "grad_norm": 2.663147449493408,
795
+ "kl": 0.010309035773389041,
796
  "learning_rate": 3.6945794086007705e-08,
797
+ "loss": 0.0,
798
+ "reward": 2.0628331936895847,
799
+ "reward_std": 1.0619396911934018,
800
+ "rewards/concensus_correctness_reward_func": 0.17756250128149986,
801
+ "rewards/consensus_reward_func": 0.6875,
802
  "rewards/cumulative_reward_2": 0.0,
803
+ "rewards/final_correctness_reward_func": 0.125,
804
+ "rewards/question_recreation_reward_func": 0.5063332146964967,
805
  "rewards/soft_format_reward_func": 0.0,
806
+ "rewards/strict_format_reward_func": 0.046875,
807
+ "rewards/xmlcount_reward_func": 0.5195625014603138,
808
  "step": 84
809
  },
810
  {
811
+ "completion_length": 231.1875,
812
+ "epoch": 14.333333333333334,
813
+ "grad_norm": 3.154367685317993,
814
+ "kl": 0.01306345833290834,
815
  "learning_rate": 2.892612731749414e-08,
816
+ "loss": 0.0,
817
+ "reward": 2.1949321180582047,
818
+ "reward_std": 1.4137339515145868,
819
+ "rewards/concensus_correctness_reward_func": 0.18025000020861626,
820
+ "rewards/consensus_reward_func": 0.8125,
821
  "rewards/cumulative_reward_2": 0.0,
822
+ "rewards/final_correctness_reward_func": 0.125,
823
+ "rewards/question_recreation_reward_func": 0.49090082419570535,
824
  "rewards/soft_format_reward_func": 0.0,
825
+ "rewards/strict_format_reward_func": 0.03125,
826
+ "rewards/xmlcount_reward_func": 0.5550312586128712,
827
  "step": 86
828
  },
829
  {
830
+ "completion_length": 315.9375,
831
+ "epoch": 14.666666666666666,
832
+ "grad_norm": 2.249851703643799,
833
+ "kl": 0.006411108406609856,
834
  "learning_rate": 2.183372119961499e-08,
835
+ "loss": 0.0,
836
+ "reward": 2.2250705547630787,
837
+ "reward_std": 1.5259972442872822,
838
+ "rewards/concensus_correctness_reward_func": 0.39806249737739563,
839
+ "rewards/consensus_reward_func": 0.625,
840
  "rewards/cumulative_reward_2": 0.0,
841
+ "rewards/final_correctness_reward_func": 0.3125,
842
+ "rewards/question_recreation_reward_func": 0.5338830836117268,
843
  "rewards/soft_format_reward_func": 0.0,
844
+ "rewards/strict_format_reward_func": 0.046875,
845
+ "rewards/xmlcount_reward_func": 0.3087499989196658,
846
  "step": 88
847
  },
848
  {
849
+ "completion_length": 261.09375,
850
+ "epoch": 15.0,
851
+ "grad_norm": 2.743177652359009,
852
+ "kl": 0.00881043096887879,
853
  "learning_rate": 1.5698323748414122e-08,
854
+ "loss": 0.0,
855
+ "reward": 1.895367180928588,
856
+ "reward_std": 1.2597154856775887,
857
+ "rewards/concensus_correctness_reward_func": 0.18012499809265137,
858
+ "rewards/consensus_reward_func": 0.625,
859
  "rewards/cumulative_reward_2": 0.0,
860
+ "rewards/final_correctness_reward_func": 0.125,
861
+ "rewards/question_recreation_reward_func": 0.5660547083243728,
862
  "rewards/soft_format_reward_func": 0.0,
863
+ "rewards/strict_format_reward_func": 0.046875,
864
+ "rewards/xmlcount_reward_func": 0.35231249686330557,
865
  "step": 90
866
  },
867
  {
868
+ "completion_length": 277.90625,
869
+ "epoch": 15.333333333333334,
870
+ "grad_norm": 3.3928024768829346,
871
+ "kl": 0.006757803043001331,
872
  "learning_rate": 1.054566895300324e-08,
873
+ "loss": 0.0,
874
+ "reward": 1.7474502064287663,
875
+ "reward_std": 0.7529050658922642,
876
+ "rewards/concensus_correctness_reward_func": 0.1798749975860119,
877
+ "rewards/consensus_reward_func": 0.5,
878
  "rewards/cumulative_reward_2": 0.0,
879
+ "rewards/final_correctness_reward_func": 0.1875,
880
+ "rewards/question_recreation_reward_func": 0.5421689683571458,
881
  "rewards/soft_format_reward_func": 0.0,
882
+ "rewards/strict_format_reward_func": 0.0,
883
+ "rewards/xmlcount_reward_func": 0.337906246073544,
884
  "step": 92
885
  },
886
  {
887
+ "completion_length": 304.09375,
888
+ "epoch": 15.666666666666666,
889
+ "grad_norm": 2.197697162628174,
890
+ "kl": 0.006906057009473443,
891
  "learning_rate": 6.397368838268496e-09,
892
+ "loss": 0.0,
893
+ "reward": 1.6495681330561638,
894
+ "reward_std": 0.6983778811700176,
895
+ "rewards/concensus_correctness_reward_func": 0.06012500077486038,
896
+ "rewards/consensus_reward_func": 0.4375,
897
  "rewards/cumulative_reward_2": 0.0,
898
+ "rewards/final_correctness_reward_func": 0.125,
899
+ "rewards/question_recreation_reward_func": 0.5284118615090847,
900
  "rewards/soft_format_reward_func": 0.0,
901
+ "rewards/strict_format_reward_func": 0.015625,
902
+ "rewards/xmlcount_reward_func": 0.48290624911896884,
903
  "step": 94
904
  },
905
  {
906
+ "completion_length": 269.59375,
907
+ "epoch": 16.0,
908
+ "grad_norm": 2.6010444164276123,
909
+ "kl": 0.011520398882566951,
910
  "learning_rate": 3.2708228165273244e-09,
911
+ "loss": 0.0,
912
+ "reward": 2.7417864352464676,
913
+ "reward_std": 1.6308739269152284,
914
+ "rewards/concensus_correctness_reward_func": 0.30268750339746475,
915
+ "rewards/consensus_reward_func": 1.0625,
916
  "rewards/cumulative_reward_2": 0.0,
917
+ "rewards/final_correctness_reward_func": 0.3125,
918
+ "rewards/question_recreation_reward_func": 0.5818177200853825,
919
  "rewards/soft_format_reward_func": 0.0,
920
+ "rewards/strict_format_reward_func": 0.046875,
921
+ "rewards/xmlcount_reward_func": 0.4354062539059669,
922
  "step": 96
923
  },
924
  {
925
+ "completion_length": 256.625,
926
+ "epoch": 16.333333333333332,
927
+ "grad_norm": 3.064824104309082,
928
+ "kl": 0.013104435958666727,
929
  "learning_rate": 1.1791447083465133e-09,
930
+ "loss": 0.0,
931
+ "reward": 2.3778811804950237,
932
+ "reward_std": 1.2875230926729273,
933
+ "rewards/concensus_correctness_reward_func": 0.18262499943375587,
934
+ "rewards/consensus_reward_func": 0.8125,
935
  "rewards/cumulative_reward_2": 0.0,
936
+ "rewards/final_correctness_reward_func": 0.1875,
937
+ "rewards/question_recreation_reward_func": 0.44909992604516447,
938
  "rewards/soft_format_reward_func": 0.0,
939
+ "rewards/strict_format_reward_func": 0.0625,
940
+ "rewards/xmlcount_reward_func": 0.6836562557145953,
941
  "step": 98
942
  },
943
  {
944
+ "completion_length": 251.875,
945
+ "epoch": 16.666666666666668,
946
+ "grad_norm": 5.960361957550049,
947
+ "kl": 0.014634610328357667,
948
  "learning_rate": 1.3110773862126667e-10,
949
+ "loss": 0.0,
950
+ "reward": 1.9424380734562874,
951
+ "reward_std": 1.420625472906977,
952
+ "rewards/concensus_correctness_reward_func": 0.24249999597668648,
953
+ "rewards/consensus_reward_func": 0.625,
954
  "rewards/cumulative_reward_2": 0.0,
955
+ "rewards/final_correctness_reward_func": 0.125,
956
+ "rewards/question_recreation_reward_func": 0.38812559354119003,
957
  "rewards/soft_format_reward_func": 0.0,
958
+ "rewards/strict_format_reward_func": 0.046875,
959
+ "rewards/xmlcount_reward_func": 0.5149374911561608,
960
  "step": 100
961
  },
962
  {
963
+ "epoch": 16.666666666666668,
964
  "step": 100,
965
  "total_flos": 0.0,
966
+ "train_loss": 5.246203290880657e-06,
967
+ "train_runtime": 3714.3127,
968
+ "train_samples_per_second": 0.431,
969
+ "train_steps_per_second": 0.027
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
+ "num_train_epochs": 17,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81edfe9f3c2f35464bd2b7f022611d889bd2ecffa44d2c1040267960882f5651
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:929dae0ec6f304f24eac715a1449fdddd810343375eb432e5ee668c04ef01240
3
  size 5944