wyceee commited on
Commit
a95ba00
·
verified ·
1 Parent(s): 87b7688

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 1.5718676149845123e-05,
4
- "train_runtime": 4262.7362,
5
- "train_samples": 48,
6
- "train_samples_per_second": 0.375,
7
- "train_steps_per_second": 0.023
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.454165089759044e-05,
4
+ "train_runtime": 1795.073,
5
+ "train_samples": 140,
6
+ "train_samples_per_second": 0.891,
7
+ "train_steps_per_second": 0.056
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7aa4f6bae2562b59ce5b4c50ed5ba43bb6c7a6490d3bdbfd840200784bb86db8
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:764033450745b9d53b869460352f8bc2d41916cffe68c35c87e6be8403ce7673
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5d2dcd5763357799ad9114059b0028a01c6683f147409faa0f31aafbd95ece0
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e9aae628f32296665d7f661b5bbbdf492ddfa8dde935107560b21db3423e7de
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 1.5718676149845123e-05,
4
- "train_runtime": 4262.7362,
5
- "train_samples": 48,
6
- "train_samples_per_second": 0.375,
7
- "train_steps_per_second": 0.023
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.454165089759044e-05,
4
+ "train_runtime": 1795.073,
5
+ "train_samples": 140,
6
+ "train_samples_per_second": 0.891,
7
+ "train_steps_per_second": 0.056
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 16.666666666666668,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
@@ -10,969 +10,969 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 227.8125,
14
- "epoch": 0.3333333333333333,
15
- "grad_norm": 6.339727878570557,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": -0.0,
19
- "reward": 1.5849424004554749,
20
- "reward_std": 1.131471099331975,
21
- "rewards/concensus_correctness_reward_func": 0.12262500077486038,
22
  "rewards/consensus_reward_func": 0.5625,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.0625,
25
- "rewards/question_recreation_reward_func": 0.46509863436222076,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.015625,
28
- "rewards/xmlcount_reward_func": 0.35659374902024865,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 287.125,
33
- "epoch": 0.6666666666666666,
34
- "grad_norm": 2.283590793609619,
35
- "kl": 0.000743766751838848,
36
  "learning_rate": 5e-07,
37
  "loss": 0.0,
38
- "reward": 2.0471834875643253,
39
- "reward_std": 0.9305327897891402,
40
- "rewards/concensus_correctness_reward_func": 0.0,
41
- "rewards/consensus_reward_func": 0.625,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.3125,
44
- "rewards/question_recreation_reward_func": 0.6308709642617032,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.015625,
47
- "rewards/xmlcount_reward_func": 0.46318750735372305,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 286.03125,
52
- "epoch": 1.0,
53
- "grad_norm": 2.7924423217773438,
54
- "kl": 0.0008395667373406468,
55
  "learning_rate": 4.994757065594279e-07,
56
  "loss": 0.0,
57
- "reward": 1.7317106202244759,
58
- "reward_std": 1.3122332114144228,
59
- "rewards/concensus_correctness_reward_func": 0.12025000154972076,
60
- "rewards/consensus_reward_func": 0.75,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.125,
63
- "rewards/question_recreation_reward_func": 0.4853668725118041,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.03125,
66
- "rewards/xmlcount_reward_func": 0.21984374802559614,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 289.90625,
71
- "epoch": 1.3333333333333333,
72
- "grad_norm": 2.7986364364624023,
73
- "kl": 0.0008700692887941841,
74
  "learning_rate": 4.979050253066063e-07,
75
  "loss": 0.0,
76
- "reward": 1.865300026256591,
77
- "reward_std": 0.8366383262909949,
78
- "rewards/concensus_correctness_reward_func": 0.06006250157952309,
79
- "rewards/consensus_reward_func": 0.6875,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 0.0625,
82
- "rewards/question_recreation_reward_func": 0.6006125509738922,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.015625,
85
- "rewards/xmlcount_reward_func": 0.4389999993145466,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 224.3125,
90
- "epoch": 1.6666666666666665,
91
- "grad_norm": 3.4794700145721436,
92
- "kl": 0.0009347902014269494,
93
  "learning_rate": 4.952945442245597e-07,
94
  "loss": 0.0,
95
- "reward": 1.8756496086716652,
96
- "reward_std": 1.0868838177993894,
97
- "rewards/concensus_correctness_reward_func": 0.08568750135600567,
98
- "rewards/consensus_reward_func": 0.625,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.1875,
101
- "rewards/question_recreation_reward_func": 0.4599620746448636,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.015625,
104
- "rewards/xmlcount_reward_func": 0.5018750056624413,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 296.28125,
109
- "epoch": 2.0,
110
- "grad_norm": 3.0765440464019775,
111
- "kl": 0.0012358422500255983,
112
  "learning_rate": 4.916552125781528e-07,
113
  "loss": 0.0,
114
- "reward": 1.9579493142664433,
115
- "reward_std": 0.8908882063115016,
116
- "rewards/concensus_correctness_reward_func": 0.12025000154972076,
117
- "rewards/consensus_reward_func": 0.6875,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.1875,
120
- "rewards/question_recreation_reward_func": 0.465324345510453,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.015625,
123
- "rewards/xmlcount_reward_func": 0.4817500030621886,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 280.90625,
128
- "epoch": 2.3333333333333335,
129
- "grad_norm": 2.9266109466552734,
130
- "kl": 0.0017369047309330199,
131
  "learning_rate": 4.870022949890676e-07,
132
  "loss": 0.0,
133
- "reward": 1.4219924416393042,
134
- "reward_std": 0.8281801359262317,
135
- "rewards/concensus_correctness_reward_func": 0.0,
136
- "rewards/consensus_reward_func": 0.4375,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.125,
139
- "rewards/question_recreation_reward_func": 0.450836188509129,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.0625,
142
- "rewards/xmlcount_reward_func": 0.34615624509751797,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 243.0625,
147
- "epoch": 2.6666666666666665,
148
- "grad_norm": 3.0913712978363037,
149
- "kl": 0.0019520393034326844,
150
  "learning_rate": 4.81355307410676e-07,
151
  "loss": 0.0,
152
- "reward": 2.2805784731172025,
153
- "reward_std": 1.2899235817894805,
154
- "rewards/concensus_correctness_reward_func": 0.24512499943375587,
155
- "rewards/consensus_reward_func": 0.9375,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.125,
158
- "rewards/question_recreation_reward_func": 0.47473472240380943,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.046875,
161
- "rewards/xmlcount_reward_func": 0.451343753375113,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 267.40625,
166
- "epoch": 3.0,
167
- "grad_norm": 3.0766594409942627,
168
- "kl": 0.0020293756315368228,
169
  "learning_rate": 4.747379352713488e-07,
170
  "loss": 0.0,
171
- "reward": 1.9329118076711893,
172
- "reward_std": 1.2741229943931103,
173
- "rewards/concensus_correctness_reward_func": 0.12256250157952309,
174
- "rewards/consensus_reward_func": 0.6875,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.25,
177
- "rewards/question_recreation_reward_func": 0.619443034986034,
178
  "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.015625,
180
- "rewards/xmlcount_reward_func": 0.23778124805539846,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 236.09375,
185
- "epoch": 3.3333333333333335,
186
- "grad_norm": 2.809358596801758,
187
- "kl": 0.0026170795026700944,
188
  "learning_rate": 4.6717793412953776e-07,
189
  "loss": 0.0,
190
- "reward": 1.8976972922682762,
191
- "reward_std": 1.0421031441655941,
192
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
193
- "rewards/consensus_reward_func": 0.625,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.125,
196
- "rewards/question_recreation_reward_func": 0.5437285574153066,
197
  "rewards/soft_format_reward_func": 0.0,
198
  "rewards/strict_format_reward_func": 0.03125,
199
- "rewards/xmlcount_reward_func": 0.512593756429851,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 265.0625,
204
- "epoch": 3.6666666666666665,
205
- "grad_norm": 3.7295589447021484,
206
- "kl": 0.003362223884323612,
207
  "learning_rate": 4.5870701325731773e-07,
208
  "loss": 0.0,
209
- "reward": 1.9837484806776047,
210
- "reward_std": 0.753805372864008,
211
- "rewards/concensus_correctness_reward_func": 0.11993750184774399,
212
- "rewards/consensus_reward_func": 0.8125,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.125,
215
- "rewards/question_recreation_reward_func": 0.4247172431787476,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.0,
218
- "rewards/xmlcount_reward_func": 0.5015937560237944,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 237.6875,
223
- "epoch": 4.0,
224
- "grad_norm": 2.7614455223083496,
225
- "kl": 0.0042717494507087395,
226
  "learning_rate": 4.4936070264068016e-07,
227
  "loss": 0.0,
228
- "reward": 1.828284303745022,
229
- "reward_std": 0.7124169690505369,
230
- "rewards/concensus_correctness_reward_func": 0.05743750184774399,
231
- "rewards/consensus_reward_func": 0.375,
232
  "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 0.125,
234
- "rewards/question_recreation_reward_func": 0.7025655592733528,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.0625,
237
- "rewards/xmlcount_reward_func": 0.5057812501909211,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 296.71875,
242
- "epoch": 4.333333333333333,
243
- "grad_norm": 3.3926703929901123,
244
- "kl": 0.0044053339806851,
245
  "learning_rate": 4.391782039544238e-07,
246
  "loss": 0.0,
247
- "reward": 1.3961677476763725,
248
- "reward_std": 1.0368912005797029,
249
- "rewards/concensus_correctness_reward_func": 0.024687500670552254,
250
  "rewards/consensus_reward_func": 0.5625,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 0.0625,
253
- "rewards/question_recreation_reward_func": 0.46885524597018957,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.046875,
256
- "rewards/xmlcount_reward_func": 0.230750004760921,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 208.375,
261
- "epoch": 4.666666666666667,
262
- "grad_norm": 3.8391273021698,
263
- "kl": 0.005014390684664249,
264
  "learning_rate": 4.282022261367073e-07,
265
  "loss": 0.0,
266
- "reward": 1.6386928837746382,
267
- "reward_std": 0.5791225910652429,
268
- "rewards/concensus_correctness_reward_func": 0.125,
269
- "rewards/consensus_reward_func": 0.5,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.0,
272
- "rewards/question_recreation_reward_func": 0.5378491813316941,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.03125,
275
- "rewards/xmlcount_reward_func": 0.44459374831058085,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 285.625,
280
- "epoch": 5.0,
281
- "grad_norm": 3.2744433879852295,
282
- "kl": 0.004670602691476233,
283
  "learning_rate": 4.1647880625292027e-07,
284
  "loss": 0.0,
285
- "reward": 1.5773469675332308,
286
- "reward_std": 0.8148748113308102,
287
- "rewards/concensus_correctness_reward_func": 0.0,
288
- "rewards/consensus_reward_func": 0.5625,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 0.0625,
291
- "rewards/question_recreation_reward_func": 0.5454094994347543,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.03125,
294
- "rewards/xmlcount_reward_func": 0.3756875009275973,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 265.375,
299
- "epoch": 5.333333333333333,
300
- "grad_norm": 3.7803711891174316,
301
- "kl": 0.0071189729642355815,
302
  "learning_rate": 4.040571164002318e-07,
303
  "loss": 0.0,
304
- "reward": 1.915034051053226,
305
- "reward_std": 1.0130134378559887,
306
- "rewards/concensus_correctness_reward_func": 0.0,
307
- "rewards/consensus_reward_func": 0.75,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 0.125,
310
- "rewards/question_recreation_reward_func": 0.5950653096660972,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.015625,
313
- "rewards/xmlcount_reward_func": 0.42934375285403803,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 257.96875,
318
- "epoch": 5.666666666666667,
319
- "grad_norm": 2.1448609828948975,
320
- "kl": 0.006875298960949294,
321
  "learning_rate": 3.909892574627266e-07,
322
  "loss": 0.0,
323
- "reward": 2.0118120573461056,
324
- "reward_std": 1.1375641755294055,
325
- "rewards/concensus_correctness_reward_func": 0.0625,
326
- "rewards/consensus_reward_func": 0.5,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.4375,
329
- "rewards/question_recreation_reward_func": 0.4547183125978336,
330
  "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.015625,
332
- "rewards/xmlcount_reward_func": 0.5414687562733889,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 228.03125,
337
- "epoch": 6.0,
338
- "grad_norm": 4.327014923095703,
339
- "kl": 0.010515401692828164,
340
  "learning_rate": 3.773300405821908e-07,
341
  "loss": 0.0,
342
- "reward": 2.3596832640469074,
343
- "reward_std": 0.8677473589777946,
344
- "rewards/concensus_correctness_reward_func": 0.29768750071525574,
345
- "rewards/consensus_reward_func": 0.625,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.125,
348
- "rewards/question_recreation_reward_func": 0.5928395111113787,
349
  "rewards/soft_format_reward_func": 0.0,
350
  "rewards/strict_format_reward_func": 0.0625,
351
- "rewards/xmlcount_reward_func": 0.6566562494263053,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 255.125,
356
- "epoch": 6.333333333333333,
357
- "grad_norm": 3.2016897201538086,
358
- "kl": 0.010390633338829502,
359
  "learning_rate": 3.6313675726113475e-07,
360
  "loss": 0.0,
361
- "reward": 2.1920023262500763,
362
- "reward_std": 1.4953037183731794,
363
- "rewards/concensus_correctness_reward_func": 0.18418750166893005,
364
- "rewards/consensus_reward_func": 0.75,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.3125,
367
- "rewards/question_recreation_reward_func": 0.5606585624627769,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.015625,
370
- "rewards/xmlcount_reward_func": 0.3690312569960952,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 255.21875,
375
- "epoch": 6.666666666666667,
376
- "grad_norm": 2.821410894393921,
377
- "kl": 0.011521625056047924,
378
  "learning_rate": 3.484689390623218e-07,
379
  "loss": 0.0,
380
- "reward": 2.26532906293869,
381
- "reward_std": 1.2260184331098571,
382
- "rewards/concensus_correctness_reward_func": 0.30037499964237213,
383
- "rewards/consensus_reward_func": 0.8125,
384
  "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 0.0625,
386
- "rewards/question_recreation_reward_func": 0.6025790590792894,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.046875,
389
- "rewards/xmlcount_reward_func": 0.4405000088736415,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 296.125,
394
- "epoch": 7.0,
395
- "grad_norm": 2.361708641052246,
396
- "kl": 0.009184279042528942,
397
  "learning_rate": 3.3338810791270517e-07,
398
  "loss": 0.0,
399
- "reward": 1.8683023676276207,
400
- "reward_std": 0.931499857455492,
401
- "rewards/concensus_correctness_reward_func": 0.11750000342726707,
402
- "rewards/consensus_reward_func": 0.625,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 0.1875,
405
- "rewards/question_recreation_reward_func": 0.5745836496353149,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.046875,
408
- "rewards/xmlcount_reward_func": 0.3168437508866191,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 264.34375,
413
- "epoch": 7.333333333333333,
414
- "grad_norm": 3.3224129676818848,
415
- "kl": 0.01854561164509505,
416
  "learning_rate": 3.179575180590857e-07,
417
  "loss": 0.0,
418
- "reward": 1.4564557429403067,
419
- "reward_std": 0.8835583210457116,
420
- "rewards/concensus_correctness_reward_func": 0.0,
421
- "rewards/consensus_reward_func": 0.375,
422
  "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.125,
424
- "rewards/question_recreation_reward_func": 0.5099557400681078,
425
  "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.03125,
427
- "rewards/xmlcount_reward_func": 0.41524998657405376,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 302.34375,
432
- "epoch": 7.666666666666667,
433
- "grad_norm": 3.5825905799865723,
434
- "kl": 0.013279616308864206,
435
  "learning_rate": 3.022418907578188e-07,
436
  "loss": 0.0,
437
- "reward": 2.223274264484644,
438
- "reward_std": 0.9707561411778443,
439
- "rewards/concensus_correctness_reward_func": 0.22681250050663948,
440
- "rewards/consensus_reward_func": 0.875,
441
  "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 0.0625,
443
- "rewards/question_recreation_reward_func": 0.5176492994651198,
444
  "rewards/soft_format_reward_func": 0.0,
445
  "rewards/strict_format_reward_func": 0.015625,
446
- "rewards/xmlcount_reward_func": 0.5256875129416585,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 284.5,
451
- "epoch": 8.0,
452
- "grad_norm": 3.046675682067871,
453
- "kl": 0.014580188668332994,
454
  "learning_rate": 2.863071428113726e-07,
455
  "loss": 0.0,
456
- "reward": 2.079888518899679,
457
- "reward_std": 1.115025261300616,
458
- "rewards/concensus_correctness_reward_func": 0.125,
459
- "rewards/consensus_reward_func": 0.6875,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 0.25,
462
- "rewards/question_recreation_reward_func": 0.6062635215930641,
463
  "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.03125,
465
- "rewards/xmlcount_reward_func": 0.37987500336021185,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 283.40625,
470
- "epoch": 8.333333333333334,
471
- "grad_norm": 4.013537883758545,
472
- "kl": 0.01412903075106442,
473
  "learning_rate": 2.7022011009035107e-07,
474
  "loss": 0.0,
475
- "reward": 1.7887509390711784,
476
- "reward_std": 1.0781924333423376,
477
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
478
- "rewards/consensus_reward_func": 0.6875,
479
  "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 0.0625,
481
- "rewards/question_recreation_reward_func": 0.4950321572832763,
482
  "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.015625,
484
- "rewards/xmlcount_reward_func": 0.46796874795109034,
485
  "step": 50
486
  },
487
  {
488
- "completion_length": 255.75,
489
- "epoch": 8.666666666666666,
490
- "grad_norm": 3.4840548038482666,
491
- "kl": 0.020143478643149137,
492
  "learning_rate": 2.540482672006254e-07,
493
- "loss": 0.0,
494
- "reward": 2.2514243982732296,
495
- "reward_std": 0.9967716456158087,
496
- "rewards/concensus_correctness_reward_func": 0.14731250144541264,
497
- "rewards/consensus_reward_func": 0.6875,
498
  "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 0.0625,
500
- "rewards/question_recreation_reward_func": 0.7038306472823024,
501
  "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.09375,
503
- "rewards/xmlcount_reward_func": 0.5565312498365529,
504
  "step": 52
505
  },
506
  {
507
- "completion_length": 257.25,
508
- "epoch": 9.0,
509
- "grad_norm": 3.285977602005005,
510
- "kl": 0.020005275029689074,
511
  "learning_rate": 2.37859444471388e-07,
512
  "loss": 0.0,
513
- "reward": 1.8586036376655102,
514
- "reward_std": 0.8674497168976814,
515
- "rewards/concensus_correctness_reward_func": 0.23737500235438347,
516
- "rewards/consensus_reward_func": 0.5625,
517
  "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 0.0625,
519
- "rewards/question_recreation_reward_func": 0.4530098957475275,
520
  "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.0625,
522
- "rewards/xmlcount_reward_func": 0.4807187654078007,
523
  "step": 54
524
  },
525
  {
526
- "completion_length": 240.28125,
527
- "epoch": 9.333333333333334,
528
- "grad_norm": 3.7038044929504395,
529
- "kl": 0.018931806669570506,
530
  "learning_rate": 2.2172154345117894e-07,
531
- "loss": 0.0,
532
- "reward": 1.623201709240675,
533
- "reward_std": 1.2911038948222995,
534
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
535
- "rewards/consensus_reward_func": 0.625,
536
  "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 0.0625,
538
- "rewards/question_recreation_reward_func": 0.48360797856003046,
539
  "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.03125,
541
- "rewards/xmlcount_reward_func": 0.3607187531888485,
542
  "step": 56
543
  },
544
  {
545
- "completion_length": 269.84375,
546
- "epoch": 9.666666666666666,
547
- "grad_norm": 2.881014585494995,
548
- "kl": 0.017237395339179784,
549
  "learning_rate": 2.0570225210519433e-07,
550
- "loss": 0.0,
551
- "reward": 1.9306830489076674,
552
- "reward_std": 1.3888351377099752,
553
- "rewards/concensus_correctness_reward_func": 0.24249999597668648,
554
- "rewards/consensus_reward_func": 0.6875,
555
  "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 0.0,
557
- "rewards/question_recreation_reward_func": 0.5115580353885889,
558
  "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.015625,
560
- "rewards/xmlcount_reward_func": 0.4735000031068921,
561
  "step": 58
562
  },
563
  {
564
- "completion_length": 249.15625,
565
- "epoch": 10.0,
566
- "grad_norm": 3.1154470443725586,
567
- "kl": 0.02515559794846922,
568
  "learning_rate": 1.8986876090843664e-07,
569
  "loss": 0.0,
570
- "reward": 2.0984230153262615,
571
- "reward_std": 1.3092318717390299,
572
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
573
- "rewards/consensus_reward_func": 0.6875,
574
  "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 0.1875,
576
- "rewards/question_recreation_reward_func": 0.5064230696298182,
577
  "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.078125,
579
- "rewards/xmlcount_reward_func": 0.5787499956786633,
580
  "step": 60
581
  },
582
  {
583
- "completion_length": 253.25,
584
- "epoch": 10.333333333333334,
585
- "grad_norm": 3.137545347213745,
586
- "kl": 0.024528241163352504,
587
  "learning_rate": 1.7428748102551234e-07,
588
- "loss": 0.0,
589
- "reward": 1.8578170202672482,
590
- "reward_std": 1.1804295498877764,
591
- "rewards/concensus_correctness_reward_func": 0.059812501072883606,
592
- "rewards/consensus_reward_func": 0.3125,
593
  "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 0.25,
595
- "rewards/question_recreation_reward_func": 0.4470045296475291,
596
  "rewards/soft_format_reward_func": 0.0,
597
  "rewards/strict_format_reward_func": 0.046875,
598
- "rewards/xmlcount_reward_func": 0.7416250004898757,
599
  "step": 62
600
  },
601
  {
602
- "completion_length": 223.3125,
603
- "epoch": 10.666666666666666,
604
- "grad_norm": 3.1050543785095215,
605
- "kl": 0.02882619173033163,
606
  "learning_rate": 1.5902376575912814e-07,
607
- "loss": 0.0,
608
- "reward": 1.743748527020216,
609
- "reward_std": 0.6933234713651473,
610
- "rewards/concensus_correctness_reward_func": 0.20481249876320362,
611
- "rewards/consensus_reward_func": 0.5,
612
  "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 0.0625,
614
- "rewards/question_recreation_reward_func": 0.4549047634936869,
615
  "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.03125,
617
- "rewards/xmlcount_reward_func": 0.4902812475338578,
618
  "step": 64
619
  },
620
  {
621
- "completion_length": 237.8125,
622
- "epoch": 11.0,
623
- "grad_norm": 3.4488277435302734,
624
- "kl": 0.024283534032292664,
625
  "learning_rate": 1.4414163643562753e-07,
626
- "loss": 0.0,
627
- "reward": 1.9523196145892143,
628
- "reward_std": 1.559489093720913,
629
- "rewards/concensus_correctness_reward_func": 0.14818750135600567,
630
- "rewards/consensus_reward_func": 0.6875,
631
  "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 0.0625,
633
- "rewards/question_recreation_reward_func": 0.5911321062594652,
634
- "rewards/soft_format_reward_func": 0.015625,
635
  "rewards/strict_format_reward_func": 0.015625,
636
- "rewards/xmlcount_reward_func": 0.43175000394694507,
637
  "step": 66
638
  },
639
  {
640
- "completion_length": 285.21875,
641
- "epoch": 11.333333333333334,
642
- "grad_norm": 3.1919057369232178,
643
- "kl": 0.029086316528264433,
644
  "learning_rate": 1.2970351387729872e-07,
645
- "loss": 0.0,
646
- "reward": 1.9048342034220695,
647
- "reward_std": 1.2717823022976518,
648
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
649
  "rewards/consensus_reward_func": 0.75,
650
  "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 0.0625,
652
- "rewards/question_recreation_reward_func": 0.5372404857771471,
653
  "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.0625,
655
- "rewards/xmlcount_reward_func": 0.4324687570333481,
656
  "step": 68
657
  },
658
  {
659
- "completion_length": 274.03125,
660
- "epoch": 11.666666666666666,
661
- "grad_norm": 4.114351749420166,
662
- "kl": 0.028069639985915273,
663
  "learning_rate": 1.1576995658775404e-07,
664
- "loss": 0.0,
665
- "reward": 1.8863378204405308,
666
- "reward_std": 1.3141623558476567,
667
- "rewards/concensus_correctness_reward_func": 0.12256250157952309,
668
- "rewards/consensus_reward_func": 0.625,
669
  "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 0.125,
671
- "rewards/question_recreation_reward_func": 0.5985253136605024,
672
- "rewards/soft_format_reward_func": 0.015625,
673
- "rewards/strict_format_reward_func": 0.078125,
674
- "rewards/xmlcount_reward_func": 0.3215000149793923,
675
  "step": 70
676
  },
677
  {
678
- "completion_length": 248.75,
679
- "epoch": 12.0,
680
- "grad_norm": 2.4701993465423584,
681
- "kl": 0.020335440058261156,
682
  "learning_rate": 1.0239940674851941e-07,
683
- "loss": 0.0,
684
- "reward": 2.2941419184207916,
685
- "reward_std": 0.8937435210682452,
686
- "rewards/concensus_correctness_reward_func": 0.11487500369548798,
687
- "rewards/consensus_reward_func": 0.5625,
688
  "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 0.4375,
690
- "rewards/question_recreation_reward_func": 0.5048919152468443,
691
  "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.046875,
693
- "rewards/xmlcount_reward_func": 0.627500012749806,
694
  "step": 72
695
  },
696
  {
697
- "completion_length": 220.09375,
698
- "epoch": 12.333333333333334,
699
- "grad_norm": 2.9127869606018066,
700
- "kl": 0.022209461370948702,
701
  "learning_rate": 8.964794509221507e-08,
702
- "loss": 0.0,
703
- "reward": 2.0042398422956467,
704
- "reward_std": 1.136198466643691,
705
- "rewards/concensus_correctness_reward_func": 0.24262500181794167,
706
- "rewards/consensus_reward_func": 0.5625,
707
  "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 0.1875,
709
- "rewards/question_recreation_reward_func": 0.49439611518755555,
710
  "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.015625,
712
- "rewards/xmlcount_reward_func": 0.5015937518328428,
713
  "step": 74
714
  },
715
  {
716
- "completion_length": 273.3125,
717
- "epoch": 12.666666666666666,
718
- "grad_norm": 2.498856782913208,
719
- "kl": 0.01810361386742443,
720
  "learning_rate": 7.756905568047392e-08,
721
- "loss": 0.0,
722
- "reward": 1.7342982944101095,
723
- "reward_std": 1.2671478418633342,
724
- "rewards/concensus_correctness_reward_func": 0.05999999865889549,
725
- "rewards/consensus_reward_func": 0.5,
726
  "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 0.1875,
728
- "rewards/question_recreation_reward_func": 0.575642024166882,
729
  "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.015625,
731
- "rewards/xmlcount_reward_func": 0.39553124993108213,
732
  "step": 76
733
  },
734
  {
735
- "completion_length": 239.71875,
736
- "epoch": 13.0,
737
- "grad_norm": 2.6426241397857666,
738
- "kl": 0.027558521192986518,
739
  "learning_rate": 6.621340157319996e-08,
740
- "loss": 0.0,
741
- "reward": 2.0297871977090836,
742
- "reward_std": 0.9375762529671192,
743
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
744
- "rewards/consensus_reward_func": 0.8125,
745
  "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 0.0,
747
- "rewards/question_recreation_reward_func": 0.643943477421999,
748
  "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.046875,
750
- "rewards/xmlcount_reward_func": 0.4663437455892563,
751
  "step": 78
752
  },
753
  {
754
- "completion_length": 217.0,
755
- "epoch": 13.333333333333334,
756
- "grad_norm": 2.7676427364349365,
757
- "kl": 0.03532541490858421,
758
  "learning_rate": 5.5628612330087724e-08,
759
- "loss": 0.0,
760
- "reward": 2.5324832424521446,
761
- "reward_std": 1.1609387751668692,
762
- "rewards/concensus_correctness_reward_func": 0.1401250008493662,
763
- "rewards/consensus_reward_func": 1.1875,
764
  "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 0.125,
766
- "rewards/question_recreation_reward_func": 0.4988894378184341,
767
  "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.0625,
769
- "rewards/xmlcount_reward_func": 0.5184687459841371,
770
  "step": 80
771
  },
772
  {
773
- "completion_length": 263.84375,
774
- "epoch": 13.666666666666666,
775
- "grad_norm": 3.3539531230926514,
776
- "kl": 0.02452456980245188,
777
  "learning_rate": 4.5859084235697235e-08,
778
- "loss": 0.0,
779
- "reward": 1.9322984656319022,
780
- "reward_std": 1.3757213475182652,
781
- "rewards/concensus_correctness_reward_func": 0.12262500077486038,
782
- "rewards/consensus_reward_func": 0.375,
783
  "rewards/cumulative_reward_2": 0.0,
784
- "rewards/final_correctness_reward_func": 0.3125,
785
- "rewards/question_recreation_reward_func": 0.5088922204449773,
786
  "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.046875,
788
- "rewards/xmlcount_reward_func": 0.5664062486030161,
789
  "step": 82
790
  },
791
  {
792
- "completion_length": 258.9375,
793
- "epoch": 14.0,
794
- "grad_norm": 3.50706148147583,
795
- "kl": 0.02300187383661978,
796
  "learning_rate": 3.6945794086007705e-08,
797
- "loss": 0.0,
798
- "reward": 2.368156984448433,
799
- "reward_std": 1.27269869716838,
800
- "rewards/concensus_correctness_reward_func": 0.23487500101327896,
801
- "rewards/consensus_reward_func": 0.9375,
802
  "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 0.125,
804
- "rewards/question_recreation_reward_func": 0.6125319767743349,
805
- "rewards/soft_format_reward_func": 0.015625,
806
- "rewards/strict_format_reward_func": 0.015625,
807
- "rewards/xmlcount_reward_func": 0.42700000666081905,
808
  "step": 84
809
  },
810
  {
811
- "completion_length": 238.8125,
812
- "epoch": 14.333333333333334,
813
- "grad_norm": 2.6380765438079834,
814
- "kl": 0.027566006290726364,
815
  "learning_rate": 2.892612731749414e-08,
816
- "loss": 0.0,
817
- "reward": 2.055258920416236,
818
- "reward_std": 0.6858638301491737,
819
- "rewards/concensus_correctness_reward_func": 0.18012499809265137,
820
- "rewards/consensus_reward_func": 0.75,
821
  "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 0.0,
823
- "rewards/question_recreation_reward_func": 0.5494776804698631,
824
  "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.015625,
826
- "rewards/xmlcount_reward_func": 0.5600312501192093,
827
  "step": 86
828
  },
829
  {
830
- "completion_length": 283.0,
831
- "epoch": 14.666666666666666,
832
- "grad_norm": 2.740476369857788,
833
- "kl": 0.02960860973689705,
834
  "learning_rate": 2.183372119961499e-08,
835
- "loss": 0.0,
836
- "reward": 1.8798525519669056,
837
- "reward_std": 1.1110668628825806,
838
- "rewards/concensus_correctness_reward_func": 0.06012500077486038,
839
- "rewards/consensus_reward_func": 0.6875,
840
  "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 0.125,
842
- "rewards/question_recreation_reward_func": 0.5739150438457727,
843
- "rewards/soft_format_reward_func": 0.015625,
844
- "rewards/strict_format_reward_func": 0.046875,
845
- "rewards/xmlcount_reward_func": 0.37081251526251435,
846
  "step": 88
847
  },
848
  {
849
- "completion_length": 242.3125,
850
- "epoch": 15.0,
851
- "grad_norm": 2.513399124145508,
852
- "kl": 0.026559468533378094,
853
  "learning_rate": 1.5698323748414122e-08,
854
- "loss": 0.0,
855
- "reward": 2.03849926404655,
856
- "reward_std": 0.964166424702853,
857
- "rewards/concensus_correctness_reward_func": 0.14824999682605267,
858
- "rewards/consensus_reward_func": 0.625,
859
  "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 0.1875,
861
- "rewards/question_recreation_reward_func": 0.5135617647320032,
862
  "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.0625,
864
- "rewards/xmlcount_reward_func": 0.5016875043511391,
865
  "step": 90
866
  },
867
  {
868
- "completion_length": 205.65625,
869
- "epoch": 15.333333333333334,
870
- "grad_norm": 3.1790051460266113,
871
- "kl": 0.036513498693238944,
872
  "learning_rate": 1.054566895300324e-08,
873
- "loss": 0.0,
874
- "reward": 2.030654199421406,
875
- "reward_std": 1.0498718353919685,
876
- "rewards/concensus_correctness_reward_func": 0.08824999816715717,
877
- "rewards/consensus_reward_func": 0.625,
878
  "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 0.0625,
880
- "rewards/question_recreation_reward_func": 0.6112791877239943,
881
  "rewards/soft_format_reward_func": 0.0,
882
  "rewards/strict_format_reward_func": 0.046875,
883
- "rewards/xmlcount_reward_func": 0.596749996766448,
884
  "step": 92
885
  },
886
  {
887
- "completion_length": 263.0,
888
- "epoch": 15.666666666666666,
889
- "grad_norm": 3.719067096710205,
890
- "kl": 0.021271194782457314,
891
  "learning_rate": 6.397368838268496e-09,
892
- "loss": 0.0,
893
- "reward": 2.2683150228112936,
894
- "reward_std": 1.4738470809534192,
895
- "rewards/concensus_correctness_reward_func": 0.36025000363588333,
896
- "rewards/consensus_reward_func": 0.5,
897
  "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 0.1875,
899
- "rewards/question_recreation_reward_func": 0.5795337841846049,
900
  "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.046875,
902
- "rewards/xmlcount_reward_func": 0.5941562601365149,
903
  "step": 94
904
  },
905
  {
906
- "completion_length": 257.71875,
907
- "epoch": 16.0,
908
- "grad_norm": 5.480099201202393,
909
- "kl": 0.028246068861335516,
910
  "learning_rate": 3.2708228165273244e-09,
911
- "loss": 0.0,
912
- "reward": 2.2440423257648945,
913
- "reward_std": 0.9810051110107452,
914
- "rewards/concensus_correctness_reward_func": 0.18512500077486038,
915
- "rewards/consensus_reward_func": 0.8125,
916
  "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 0.1875,
918
- "rewards/question_recreation_reward_func": 0.4930110676214099,
919
  "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.046875,
921
- "rewards/xmlcount_reward_func": 0.5190312538761646,
922
  "step": 96
923
  },
924
  {
925
- "completion_length": 239.34375,
926
- "epoch": 16.333333333333332,
927
- "grad_norm": 3.926326274871826,
928
- "kl": 0.03365567361470312,
929
  "learning_rate": 1.1791447083465133e-09,
930
- "loss": 0.0,
931
- "reward": 1.9388289339840412,
932
- "reward_std": 0.958217971608974,
933
- "rewards/concensus_correctness_reward_func": 0.11999999731779099,
934
  "rewards/consensus_reward_func": 0.625,
935
  "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 0.0625,
937
- "rewards/question_recreation_reward_func": 0.4502664606552571,
938
  "rewards/soft_format_reward_func": 0.0,
939
  "rewards/strict_format_reward_func": 0.0625,
940
- "rewards/xmlcount_reward_func": 0.6185624990612268,
941
  "step": 98
942
  },
943
  {
944
- "completion_length": 271.6875,
945
- "epoch": 16.666666666666668,
946
- "grad_norm": 2.2361977100372314,
947
- "kl": 0.02408603549702093,
948
  "learning_rate": 1.3110773862126667e-10,
949
- "loss": 0.0,
950
- "reward": 1.8076389655470848,
951
- "reward_std": 1.001751037707436,
952
- "rewards/concensus_correctness_reward_func": 0.2303125038743019,
953
  "rewards/consensus_reward_func": 0.5,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 0.0,
956
- "rewards/question_recreation_reward_func": 0.5585764544084668,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.046875,
959
- "rewards/xmlcount_reward_func": 0.4718750088359229,
960
  "step": 100
961
  },
962
  {
963
- "epoch": 16.666666666666668,
964
  "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 1.5718676149845123e-05,
967
- "train_runtime": 4262.7362,
968
- "train_samples_per_second": 0.375,
969
- "train_steps_per_second": 0.023
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
- "num_train_epochs": 17,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.571428571428571,
6
  "eval_steps": 500,
7
  "global_step": 100,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 326.21875,
14
+ "epoch": 0.11428571428571428,
15
+ "grad_norm": 2.7555673122406006,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": -0.0,
19
+ "reward": 3.2572884149849415,
20
+ "reward_std": 2.711772508919239,
21
+ "rewards/concensus_correctness_reward_func": 1.3658749970927602,
22
  "rewards/consensus_reward_func": 0.5625,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.125,
25
+ "rewards/question_recreation_reward_func": 0.676913361530751,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.046875,
28
+ "rewards/xmlcount_reward_func": 0.48012501280754805,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 223.25,
33
+ "epoch": 0.22857142857142856,
34
+ "grad_norm": 2.6882662773132324,
35
+ "kl": 0.0006823752573836828,
36
  "learning_rate": 5e-07,
37
  "loss": 0.0,
38
+ "reward": 6.5029780976474285,
39
+ "reward_std": 3.8371810587123036,
40
+ "rewards/concensus_correctness_reward_func": 3.9408124699257314,
41
+ "rewards/consensus_reward_func": 0.5625,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.6875,
44
+ "rewards/question_recreation_reward_func": 0.4921030206605792,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.0625,
47
+ "rewards/xmlcount_reward_func": 0.7575625074096024,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 327.5625,
52
+ "epoch": 0.34285714285714286,
53
+ "grad_norm": 2.6698217391967773,
54
+ "kl": 0.0007466921615559841,
55
  "learning_rate": 4.994757065594279e-07,
56
  "loss": 0.0,
57
+ "reward": 3.1852196622639894,
58
+ "reward_std": 1.617547769099474,
59
+ "rewards/concensus_correctness_reward_func": 0.972874996252358,
60
+ "rewards/consensus_reward_func": 0.4375,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.375,
63
+ "rewards/question_recreation_reward_func": 0.6452508568763733,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.046875,
66
+ "rewards/xmlcount_reward_func": 0.7077187523245811,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 300.34375,
71
+ "epoch": 0.45714285714285713,
72
+ "grad_norm": 2.2753820419311523,
73
+ "kl": 0.0009206273971358314,
74
  "learning_rate": 4.979050253066063e-07,
75
  "loss": 0.0,
76
+ "reward": 4.944724701344967,
77
+ "reward_std": 4.053568044560961,
78
+ "rewards/concensus_correctness_reward_func": 2.3233749866485596,
79
+ "rewards/consensus_reward_func": 0.4375,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.875,
82
+ "rewards/question_recreation_reward_func": 0.6202871967107058,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.046875,
85
+ "rewards/xmlcount_reward_func": 0.6416875068098307,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 251.34375,
90
+ "epoch": 0.5714285714285714,
91
+ "grad_norm": 3.361114263534546,
92
+ "kl": 0.001550516844872618,
93
  "learning_rate": 4.952945442245597e-07,
94
  "loss": 0.0,
95
+ "reward": 4.728278212249279,
96
+ "reward_std": 4.151839345460758,
97
+ "rewards/concensus_correctness_reward_func": 2.2346875024959445,
98
+ "rewards/consensus_reward_func": 0.75,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.5,
101
+ "rewards/question_recreation_reward_func": 0.5102781374007463,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.046875,
104
+ "rewards/xmlcount_reward_func": 0.6864375146105886,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 283.0625,
109
+ "epoch": 0.6857142857142857,
110
+ "grad_norm": 2.796189308166504,
111
+ "kl": 0.0018549648048065137,
112
  "learning_rate": 4.916552125781528e-07,
113
  "loss": 0.0,
114
+ "reward": 5.428475089371204,
115
+ "reward_std": 2.1235571010038257,
116
+ "rewards/concensus_correctness_reward_func": 2.752250012010336,
117
+ "rewards/consensus_reward_func": 0.625,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.75,
120
+ "rewards/question_recreation_reward_func": 0.5671625286340714,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.046875,
123
+ "rewards/xmlcount_reward_func": 0.6871875002980232,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 292.625,
128
+ "epoch": 0.8,
129
+ "grad_norm": 2.12504243850708,
130
+ "kl": 0.002416994764644187,
131
  "learning_rate": 4.870022949890676e-07,
132
  "loss": 0.0,
133
+ "reward": 3.0031582564115524,
134
+ "reward_std": 1.936399682686897,
135
+ "rewards/concensus_correctness_reward_func": 0.9825624963268638,
136
+ "rewards/consensus_reward_func": 0.3125,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.375,
139
+ "rewards/question_recreation_reward_func": 0.52943952428177,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.015625,
142
+ "rewards/xmlcount_reward_func": 0.7880312651395798,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 236.5,
147
+ "epoch": 0.9142857142857143,
148
+ "grad_norm": 3.788289785385132,
149
+ "kl": 0.004197390335320961,
150
  "learning_rate": 4.81355307410676e-07,
151
  "loss": 0.0,
152
+ "reward": 4.460998922586441,
153
+ "reward_std": 3.673946577589959,
154
+ "rewards/concensus_correctness_reward_func": 2.212312502786517,
155
+ "rewards/consensus_reward_func": 0.5,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 0.4375,
158
+ "rewards/question_recreation_reward_func": 0.5744363954290748,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.0625,
161
+ "rewards/xmlcount_reward_func": 0.6742500034160912,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 223.5,
166
+ "epoch": 1.0,
167
+ "grad_norm": 2.340423345565796,
168
+ "kl": 0.00462426839900824,
169
  "learning_rate": 4.747379352713488e-07,
170
  "loss": 0.0,
171
+ "reward": 5.577042788267136,
172
+ "reward_std": 2.891117551790861,
173
+ "rewards/concensus_correctness_reward_func": 3.5999166841308274,
174
+ "rewards/consensus_reward_func": 0.3333333333333333,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.3333333333333333,
177
+ "rewards/question_recreation_reward_func": 0.5223344924549261,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.10416666666666667,
180
+ "rewards/xmlcount_reward_func": 0.6839583379526933,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 285.375,
185
+ "epoch": 1.1142857142857143,
186
+ "grad_norm": 2.9781136512756348,
187
+ "kl": 0.005876571987755597,
188
  "learning_rate": 4.6717793412953776e-07,
189
  "loss": 0.0,
190
+ "reward": 3.856148846447468,
191
+ "reward_std": 3.056236045435071,
192
+ "rewards/concensus_correctness_reward_func": 1.7554374812170863,
193
+ "rewards/consensus_reward_func": 0.4375,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.4375,
196
+ "rewards/question_recreation_reward_func": 0.5816801311448216,
197
  "rewards/soft_format_reward_func": 0.0,
198
  "rewards/strict_format_reward_func": 0.03125,
199
+ "rewards/xmlcount_reward_func": 0.6127812552731484,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 258.78125,
204
+ "epoch": 1.2285714285714286,
205
+ "grad_norm": 2.790374279022217,
206
+ "kl": 0.008101415849523619,
207
  "learning_rate": 4.5870701325731773e-07,
208
  "loss": 0.0,
209
+ "reward": 5.01688564568758,
210
+ "reward_std": 2.427376964595169,
211
+ "rewards/concensus_correctness_reward_func": 2.506874994840473,
212
+ "rewards/consensus_reward_func": 0.4375,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.875,
215
+ "rewards/question_recreation_reward_func": 0.5136668155901134,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.015625,
218
+ "rewards/xmlcount_reward_func": 0.6682187579572201,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 240.25,
223
+ "epoch": 1.342857142857143,
224
+ "grad_norm": 2.5987563133239746,
225
+ "kl": 0.012570352992042899,
226
  "learning_rate": 4.4936070264068016e-07,
227
  "loss": 0.0,
228
+ "reward": 4.539519101381302,
229
+ "reward_std": 2.6822728496044874,
230
+ "rewards/concensus_correctness_reward_func": 1.7519375048577785,
231
+ "rewards/consensus_reward_func": 0.9375,
232
  "rewards/cumulative_reward_2": 0.0,
233
+ "rewards/final_correctness_reward_func": 0.5625,
234
+ "rewards/question_recreation_reward_func": 0.5148315682308748,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.046875,
237
+ "rewards/xmlcount_reward_func": 0.7258749920874834,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 297.03125,
242
+ "epoch": 1.457142857142857,
243
+ "grad_norm": 2.034766912460327,
244
+ "kl": 0.010542554169660434,
245
  "learning_rate": 4.391782039544238e-07,
246
  "loss": 0.0,
247
+ "reward": 5.3825334794819355,
248
+ "reward_std": 4.827194595243782,
249
+ "rewards/concensus_correctness_reward_func": 2.6841250059515005,
250
  "rewards/consensus_reward_func": 0.5625,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.6875,
253
+ "rewards/question_recreation_reward_func": 0.7439396986737847,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.0625,
256
+ "rewards/xmlcount_reward_func": 0.6419687559828162,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 282.1875,
261
+ "epoch": 1.5714285714285714,
262
+ "grad_norm": 4.419146537780762,
263
+ "kl": 0.010264099051710218,
264
  "learning_rate": 4.282022261367073e-07,
265
  "loss": 0.0,
266
+ "reward": 2.5538329035043716,
267
+ "reward_std": 0.86597695434466,
268
+ "rewards/concensus_correctness_reward_func": 0.33731249440461397,
269
+ "rewards/consensus_reward_func": 0.6875,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.25,
272
+ "rewards/question_recreation_reward_func": 0.6221455032937229,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.046875,
275
+ "rewards/xmlcount_reward_func": 0.6099999930593185,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 285.875,
280
+ "epoch": 1.6857142857142857,
281
+ "grad_norm": 2.3088910579681396,
282
+ "kl": 0.014536559290718287,
283
  "learning_rate": 4.1647880625292027e-07,
284
  "loss": 0.0,
285
+ "reward": 7.595606815069914,
286
+ "reward_std": 2.5936438450589776,
287
+ "rewards/concensus_correctness_reward_func": 4.180062495172024,
288
+ "rewards/consensus_reward_func": 1.125,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 0.8125,
291
+ "rewards/question_recreation_reward_func": 0.7352005220018327,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.046875,
294
+ "rewards/xmlcount_reward_func": 0.6959687564522028,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 260.65625,
299
+ "epoch": 1.8,
300
+ "grad_norm": 2.6501407623291016,
301
+ "kl": 0.018609989958349615,
302
  "learning_rate": 4.040571164002318e-07,
303
  "loss": 0.0,
304
+ "reward": 4.827333331108093,
305
+ "reward_std": 2.085965577978641,
306
+ "rewards/concensus_correctness_reward_func": 2.4736250173300505,
307
+ "rewards/consensus_reward_func": 0.4375,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 0.6875,
310
+ "rewards/question_recreation_reward_func": 0.5945833660662174,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.03125,
313
+ "rewards/xmlcount_reward_func": 0.6028750017285347,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 262.71875,
318
+ "epoch": 1.9142857142857141,
319
+ "grad_norm": 3.4099960327148438,
320
+ "kl": 0.018288226914592087,
321
  "learning_rate": 3.909892574627266e-07,
322
  "loss": 0.0,
323
+ "reward": 5.539210200309753,
324
+ "reward_std": 2.2049794927006587,
325
+ "rewards/concensus_correctness_reward_func": 2.6087499796412885,
326
+ "rewards/consensus_reward_func": 0.8125,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.5625,
329
+ "rewards/question_recreation_reward_func": 0.6833352446556091,
330
  "rewards/soft_format_reward_func": 0.0,
331
+ "rewards/strict_format_reward_func": 0.0625,
332
+ "rewards/xmlcount_reward_func": 0.8096250100061297,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 315.9166666666667,
337
+ "epoch": 2.0,
338
+ "grad_norm": 1.4127824306488037,
339
+ "kl": 0.0216289390809834,
340
  "learning_rate": 3.773300405821908e-07,
341
  "loss": 0.0,
342
+ "reward": 3.2096741100152335,
343
+ "reward_std": 1.8589469492435455,
344
+ "rewards/concensus_correctness_reward_func": 0.7585000023245811,
345
+ "rewards/consensus_reward_func": 0.4166666666666667,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 0.6666666666666666,
348
+ "rewards/question_recreation_reward_func": 0.6264241177899142,
349
  "rewards/soft_format_reward_func": 0.0,
350
  "rewards/strict_format_reward_func": 0.0625,
351
+ "rewards/xmlcount_reward_func": 0.6789166710029045,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 235.25,
356
+ "epoch": 2.1142857142857143,
357
+ "grad_norm": 2.856210470199585,
358
+ "kl": 0.03234067652374506,
359
  "learning_rate": 3.6313675726113475e-07,
360
  "loss": 0.0,
361
+ "reward": 4.8823426477611065,
362
+ "reward_std": 4.0293696410954,
363
+ "rewards/concensus_correctness_reward_func": 2.1174999997019768,
364
+ "rewards/consensus_reward_func": 0.625,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.625,
367
+ "rewards/question_recreation_reward_func": 0.6155301326652989,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.09375,
370
+ "rewards/xmlcount_reward_func": 0.8055625003762543,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 289.0,
375
+ "epoch": 2.2285714285714286,
376
+ "grad_norm": 2.498208999633789,
377
+ "kl": 0.032537119346670806,
378
  "learning_rate": 3.484689390623218e-07,
379
  "loss": 0.0,
380
+ "reward": 3.257258500903845,
381
+ "reward_std": 1.8479195050895214,
382
+ "rewards/concensus_correctness_reward_func": 0.9986874996393453,
383
+ "rewards/consensus_reward_func": 0.75,
384
  "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 0.25,
386
+ "rewards/question_recreation_reward_func": 0.6424147803336382,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.03125,
389
+ "rewards/xmlcount_reward_func": 0.5849062511697412,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 286.0,
394
+ "epoch": 2.342857142857143,
395
+ "grad_norm": 3.0837841033935547,
396
+ "kl": 0.03241805831203237,
397
  "learning_rate": 3.3338810791270517e-07,
398
  "loss": 0.0,
399
+ "reward": 6.6657252591103315,
400
+ "reward_std": 5.738411407917738,
401
+ "rewards/concensus_correctness_reward_func": 4.001499989069998,
402
+ "rewards/consensus_reward_func": 0.5625,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 0.75,
405
+ "rewards/question_recreation_reward_func": 0.6340690106153488,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.0625,
408
+ "rewards/xmlcount_reward_func": 0.6551562692038715,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 240.5,
413
+ "epoch": 2.4571428571428573,
414
+ "grad_norm": 2.2751593589782715,
415
+ "kl": 0.027716133918147534,
416
  "learning_rate": 3.179575180590857e-07,
417
  "loss": 0.0,
418
+ "reward": 3.8581665493547916,
419
+ "reward_std": 1.8522115424275398,
420
+ "rewards/concensus_correctness_reward_func": 1.1526875039562583,
421
+ "rewards/consensus_reward_func": 0.5,
422
  "rewards/cumulative_reward_2": 0.0,
423
+ "rewards/final_correctness_reward_func": 0.5625,
424
+ "rewards/question_recreation_reward_func": 0.5496665136888623,
425
  "rewards/soft_format_reward_func": 0.0,
426
+ "rewards/strict_format_reward_func": 0.109375,
427
+ "rewards/xmlcount_reward_func": 0.9839375028386712,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 287.8125,
432
+ "epoch": 2.571428571428571,
433
+ "grad_norm": 2.325424909591675,
434
+ "kl": 0.03136032959446311,
435
  "learning_rate": 3.022418907578188e-07,
436
  "loss": 0.0,
437
+ "reward": 5.811520978808403,
438
+ "reward_std": 2.1761377695947886,
439
+ "rewards/concensus_correctness_reward_func": 3.5759375113993883,
440
+ "rewards/consensus_reward_func": 0.5625,
441
  "rewards/cumulative_reward_2": 0.0,
442
+ "rewards/final_correctness_reward_func": 0.375,
443
+ "rewards/question_recreation_reward_func": 0.5898647699505091,
444
  "rewards/soft_format_reward_func": 0.0,
445
  "rewards/strict_format_reward_func": 0.015625,
446
+ "rewards/xmlcount_reward_func": 0.6925937533378601,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 264.09375,
451
+ "epoch": 2.685714285714286,
452
+ "grad_norm": 2.799055814743042,
453
+ "kl": 0.035045830823946744,
454
  "learning_rate": 2.863071428113726e-07,
455
  "loss": 0.0,
456
+ "reward": 5.103824369609356,
457
+ "reward_std": 3.5301670129410923,
458
+ "rewards/concensus_correctness_reward_func": 2.3825000133365393,
459
+ "rewards/consensus_reward_func": 0.8125,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 0.625,
462
+ "rewards/question_recreation_reward_func": 0.5504180546849966,
463
  "rewards/soft_format_reward_func": 0.0,
464
+ "rewards/strict_format_reward_func": 0.046875,
465
+ "rewards/xmlcount_reward_func": 0.6865312550216913,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 280.375,
470
+ "epoch": 2.8,
471
+ "grad_norm": 2.319396495819092,
472
+ "kl": 0.033160059072542936,
473
  "learning_rate": 2.7022011009035107e-07,
474
  "loss": 0.0,
475
+ "reward": 4.136773347854614,
476
+ "reward_std": 1.7412771796807647,
477
+ "rewards/concensus_correctness_reward_func": 1.2328750090673566,
478
+ "rewards/consensus_reward_func": 0.75,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 0.625,
481
+ "rewards/question_recreation_reward_func": 0.7363045308738947,
482
  "rewards/soft_format_reward_func": 0.0,
483
+ "rewards/strict_format_reward_func": 0.09375,
484
+ "rewards/xmlcount_reward_func": 0.6988437669351697,
485
  "step": 50
486
  },
487
  {
488
+ "completion_length": 252.5,
489
+ "epoch": 2.914285714285714,
490
+ "grad_norm": 2.2659144401550293,
491
+ "kl": 0.08391272573499009,
492
  "learning_rate": 2.540482672006254e-07,
493
+ "loss": 0.0001,
494
+ "reward": 5.248130708932877,
495
+ "reward_std": 2.9013717267662287,
496
+ "rewards/concensus_correctness_reward_func": 2.086687508970499,
497
+ "rewards/consensus_reward_func": 0.875,
498
  "rewards/cumulative_reward_2": 0.0,
499
+ "rewards/final_correctness_reward_func": 0.75,
500
+ "rewards/question_recreation_reward_func": 0.6195994764566422,
501
  "rewards/soft_format_reward_func": 0.0,
502
+ "rewards/strict_format_reward_func": 0.125,
503
+ "rewards/xmlcount_reward_func": 0.7918437521439046,
504
  "step": 52
505
  },
506
  {
507
+ "completion_length": 336.3333333333333,
508
+ "epoch": 3.0,
509
+ "grad_norm": 1.9638556241989136,
510
+ "kl": 0.0329820365489771,
511
  "learning_rate": 2.37859444471388e-07,
512
  "loss": 0.0,
513
+ "reward": 2.897741069396337,
514
+ "reward_std": 1.3238216874500115,
515
+ "rewards/concensus_correctness_reward_func": 0.4622500070060293,
516
+ "rewards/consensus_reward_func": 0.5,
517
  "rewards/cumulative_reward_2": 0.0,
518
+ "rewards/final_correctness_reward_func": 0.5833333333333334,
519
+ "rewards/question_recreation_reward_func": 0.7555743406216303,
520
  "rewards/soft_format_reward_func": 0.0,
521
+ "rewards/strict_format_reward_func": 0.0,
522
+ "rewards/xmlcount_reward_func": 0.5965833238636454,
523
  "step": 54
524
  },
525
  {
526
+ "completion_length": 297.90625,
527
+ "epoch": 3.1142857142857143,
528
+ "grad_norm": 2.512716770172119,
529
+ "kl": 0.07957816089037806,
530
  "learning_rate": 2.2172154345117894e-07,
531
+ "loss": 0.0001,
532
+ "reward": 5.407194800674915,
533
+ "reward_std": 2.5112848294666037,
534
+ "rewards/concensus_correctness_reward_func": 2.6179999876767397,
535
+ "rewards/consensus_reward_func": 1.0625,
536
  "rewards/cumulative_reward_2": 0.0,
537
+ "rewards/final_correctness_reward_func": 0.625,
538
+ "rewards/question_recreation_reward_func": 0.6985698798671365,
539
  "rewards/soft_format_reward_func": 0.0,
540
+ "rewards/strict_format_reward_func": 0.046875,
541
+ "rewards/xmlcount_reward_func": 0.3562500039115548,
542
  "step": 56
543
  },
544
  {
545
+ "completion_length": 261.375,
546
+ "epoch": 3.2285714285714286,
547
+ "grad_norm": 2.727494716644287,
548
+ "kl": 0.051797536259982735,
549
  "learning_rate": 2.0570225210519433e-07,
550
+ "loss": 0.0001,
551
+ "reward": 4.303291346877813,
552
+ "reward_std": 3.120794242247939,
553
+ "rewards/concensus_correctness_reward_func": 1.5453749848529696,
554
+ "rewards/consensus_reward_func": 0.625,
555
  "rewards/cumulative_reward_2": 0.0,
556
+ "rewards/final_correctness_reward_func": 0.8125,
557
+ "rewards/question_recreation_reward_func": 0.5874788034707308,
558
  "rewards/soft_format_reward_func": 0.0,
559
+ "rewards/strict_format_reward_func": 0.078125,
560
+ "rewards/xmlcount_reward_func": 0.6548125031404197,
561
  "step": 58
562
  },
563
  {
564
+ "completion_length": 312.5,
565
+ "epoch": 3.342857142857143,
566
+ "grad_norm": 2.5867483615875244,
567
+ "kl": 0.045177310064900666,
568
  "learning_rate": 1.8986876090843664e-07,
569
  "loss": 0.0,
570
+ "reward": 6.679476020857692,
571
+ "reward_std": 6.590652715298347,
572
+ "rewards/concensus_correctness_reward_func": 3.911812473088503,
573
+ "rewards/consensus_reward_func": 0.625,
574
  "rewards/cumulative_reward_2": 0.0,
575
+ "rewards/final_correctness_reward_func": 0.8125,
576
+ "rewards/question_recreation_reward_func": 0.6978510078042746,
577
  "rewards/soft_format_reward_func": 0.0,
578
+ "rewards/strict_format_reward_func": 0.0625,
579
+ "rewards/xmlcount_reward_func": 0.5698124994523823,
580
  "step": 60
581
  },
582
  {
583
+ "completion_length": 275.46875,
584
+ "epoch": 3.4571428571428573,
585
+ "grad_norm": 2.2337958812713623,
586
+ "kl": 0.05546386865898967,
587
  "learning_rate": 1.7428748102551234e-07,
588
+ "loss": 0.0001,
589
+ "reward": 4.224702462553978,
590
+ "reward_std": 2.568043567443965,
591
+ "rewards/concensus_correctness_reward_func": 1.6509375016321428,
592
+ "rewards/consensus_reward_func": 0.5625,
593
  "rewards/cumulative_reward_2": 0.0,
594
+ "rewards/final_correctness_reward_func": 0.5,
595
+ "rewards/question_recreation_reward_func": 0.5963900072965771,
596
  "rewards/soft_format_reward_func": 0.0,
597
  "rewards/strict_format_reward_func": 0.046875,
598
+ "rewards/xmlcount_reward_func": 0.8680000007152557,
599
  "step": 62
600
  },
601
  {
602
+ "completion_length": 254.375,
603
+ "epoch": 3.571428571428571,
604
+ "grad_norm": 2.5836949348449707,
605
+ "kl": 0.06771399604622275,
606
  "learning_rate": 1.5902376575912814e-07,
607
+ "loss": 0.0001,
608
+ "reward": 5.309353556483984,
609
+ "reward_std": 2.0958344470709562,
610
+ "rewards/concensus_correctness_reward_func": 2.4754375047050416,
611
+ "rewards/consensus_reward_func": 0.9375,
612
  "rewards/cumulative_reward_2": 0.0,
613
+ "rewards/final_correctness_reward_func": 0.5625,
614
+ "rewards/question_recreation_reward_func": 0.5066348570398986,
615
  "rewards/soft_format_reward_func": 0.0,
616
+ "rewards/strict_format_reward_func": 0.09375,
617
+ "rewards/xmlcount_reward_func": 0.7335312599316239,
618
  "step": 64
619
  },
620
  {
621
+ "completion_length": 294.90625,
622
+ "epoch": 3.685714285714286,
623
+ "grad_norm": 2.2413110733032227,
624
+ "kl": 0.058190350187942386,
625
  "learning_rate": 1.4414163643562753e-07,
626
+ "loss": 0.0001,
627
+ "reward": 4.059148486703634,
628
+ "reward_std": 2.2921111752657453,
629
+ "rewards/concensus_correctness_reward_func": 1.5227500088512897,
630
+ "rewards/consensus_reward_func": 0.4375,
631
  "rewards/cumulative_reward_2": 0.0,
632
+ "rewards/final_correctness_reward_func": 0.5625,
633
+ "rewards/question_recreation_reward_func": 0.7638359684497118,
634
+ "rewards/soft_format_reward_func": 0.0,
635
  "rewards/strict_format_reward_func": 0.015625,
636
+ "rewards/xmlcount_reward_func": 0.7569375038146973,
637
  "step": 66
638
  },
639
  {
640
+ "completion_length": 277.53125,
641
+ "epoch": 3.8,
642
+ "grad_norm": 2.7358996868133545,
643
+ "kl": 0.07721107231918722,
644
  "learning_rate": 1.2970351387729872e-07,
645
+ "loss": 0.0001,
646
+ "reward": 3.4795289039611816,
647
+ "reward_std": 2.1741816513240337,
648
+ "rewards/concensus_correctness_reward_func": 0.8386874985590111,
649
  "rewards/consensus_reward_func": 0.75,
650
  "rewards/cumulative_reward_2": 0.0,
651
+ "rewards/final_correctness_reward_func": 0.25,
652
+ "rewards/question_recreation_reward_func": 0.6809976994991302,
653
  "rewards/soft_format_reward_func": 0.0,
654
+ "rewards/strict_format_reward_func": 0.078125,
655
+ "rewards/xmlcount_reward_func": 0.8817187771201134,
656
  "step": 68
657
  },
658
  {
659
+ "completion_length": 256.0625,
660
+ "epoch": 3.914285714285714,
661
+ "grad_norm": 4.087591648101807,
662
+ "kl": 0.0788359681610018,
663
  "learning_rate": 1.1576995658775404e-07,
664
+ "loss": 0.0001,
665
+ "reward": 6.083272695541382,
666
+ "reward_std": 4.112470694584772,
667
+ "rewards/concensus_correctness_reward_func": 3.4234374810475856,
668
+ "rewards/consensus_reward_func": 0.875,
669
  "rewards/cumulative_reward_2": 0.0,
670
+ "rewards/final_correctness_reward_func": 0.3125,
671
+ "rewards/question_recreation_reward_func": 0.5136476922780275,
672
+ "rewards/soft_format_reward_func": 0.0,
673
+ "rewards/strict_format_reward_func": 0.125,
674
+ "rewards/xmlcount_reward_func": 0.8336875168606639,
675
  "step": 70
676
  },
677
  {
678
+ "completion_length": 276.5,
679
+ "epoch": 4.0,
680
+ "grad_norm": 1.627131700515747,
681
+ "kl": 0.11795352476959427,
682
  "learning_rate": 1.0239940674851941e-07,
683
+ "loss": 0.0001,
684
+ "reward": 5.3608784476916,
685
+ "reward_std": 2.300887676576773,
686
+ "rewards/concensus_correctness_reward_func": 1.9943333491683006,
687
+ "rewards/consensus_reward_func": 1.0833333333333333,
688
  "rewards/cumulative_reward_2": 0.0,
689
+ "rewards/final_correctness_reward_func": 0.75,
690
+ "rewards/question_recreation_reward_func": 0.6572117364654938,
691
  "rewards/soft_format_reward_func": 0.0,
692
+ "rewards/strict_format_reward_func": 0.08333333333333333,
693
+ "rewards/xmlcount_reward_func": 0.7926666811108589,
694
  "step": 72
695
  },
696
  {
697
+ "completion_length": 353.4375,
698
+ "epoch": 4.114285714285714,
699
+ "grad_norm": 4.412367820739746,
700
+ "kl": 0.08782886900007725,
701
  "learning_rate": 8.964794509221507e-08,
702
+ "loss": 0.0001,
703
+ "reward": 4.319345578551292,
704
+ "reward_std": 1.4791212249547243,
705
+ "rewards/concensus_correctness_reward_func": 1.8143750003073364,
706
+ "rewards/consensus_reward_func": 0.625,
707
  "rewards/cumulative_reward_2": 0.0,
708
+ "rewards/final_correctness_reward_func": 0.4375,
709
+ "rewards/question_recreation_reward_func": 0.6734706219285727,
710
  "rewards/soft_format_reward_func": 0.0,
711
+ "rewards/strict_format_reward_func": 0.0625,
712
+ "rewards/xmlcount_reward_func": 0.706500010099262,
713
  "step": 74
714
  },
715
  {
716
+ "completion_length": 298.5,
717
+ "epoch": 4.228571428571429,
718
+ "grad_norm": 43.921119689941406,
719
+ "kl": 0.09398894105106592,
720
  "learning_rate": 7.756905568047392e-08,
721
+ "loss": 0.0001,
722
+ "reward": 5.068341612815857,
723
+ "reward_std": 2.5960501823574305,
724
+ "rewards/concensus_correctness_reward_func": 2.2858750016748672,
725
+ "rewards/consensus_reward_func": 0.8125,
726
  "rewards/cumulative_reward_2": 0.0,
727
+ "rewards/final_correctness_reward_func": 0.6875,
728
+ "rewards/question_recreation_reward_func": 0.6213102764450014,
729
  "rewards/soft_format_reward_func": 0.0,
730
+ "rewards/strict_format_reward_func": 0.046875,
731
+ "rewards/xmlcount_reward_func": 0.6142812587786466,
732
  "step": 76
733
  },
734
  {
735
+ "completion_length": 279.4375,
736
+ "epoch": 4.3428571428571425,
737
+ "grad_norm": 3.3324248790740967,
738
+ "kl": 0.12617466738447547,
739
  "learning_rate": 6.621340157319996e-08,
740
+ "loss": 0.0001,
741
+ "reward": 6.855641521513462,
742
+ "reward_std": 5.589063869789243,
743
+ "rewards/concensus_correctness_reward_func": 3.5954999728128314,
744
+ "rewards/consensus_reward_func": 1.125,
745
  "rewards/cumulative_reward_2": 0.0,
746
+ "rewards/final_correctness_reward_func": 0.6875,
747
+ "rewards/question_recreation_reward_func": 0.6044852556660771,
748
  "rewards/soft_format_reward_func": 0.0,
749
+ "rewards/strict_format_reward_func": 0.078125,
750
+ "rewards/xmlcount_reward_func": 0.7650312539190054,
751
  "step": 78
752
  },
753
  {
754
+ "completion_length": 282.25,
755
+ "epoch": 4.457142857142857,
756
+ "grad_norm": 2.391388416290283,
757
+ "kl": 0.0781217070762068,
758
  "learning_rate": 5.5628612330087724e-08,
759
+ "loss": 0.0001,
760
+ "reward": 5.143940486013889,
761
+ "reward_std": 3.1008094910066575,
762
+ "rewards/concensus_correctness_reward_func": 2.1685625007376075,
763
+ "rewards/consensus_reward_func": 0.6875,
764
  "rewards/cumulative_reward_2": 0.0,
765
+ "rewards/final_correctness_reward_func": 0.8125,
766
+ "rewards/question_recreation_reward_func": 0.5825029462575912,
767
  "rewards/soft_format_reward_func": 0.0,
768
+ "rewards/strict_format_reward_func": 0.109375,
769
+ "rewards/xmlcount_reward_func": 0.7835000064224005,
770
  "step": 80
771
  },
772
  {
773
+ "completion_length": 250.875,
774
+ "epoch": 4.571428571428571,
775
+ "grad_norm": 3.275106906890869,
776
+ "kl": 0.08860545780044049,
777
  "learning_rate": 4.5859084235697235e-08,
778
+ "loss": 0.0001,
779
+ "reward": 3.176556244492531,
780
+ "reward_std": 1.5254050176981764,
781
+ "rewards/concensus_correctness_reward_func": 0.8501874984940514,
782
+ "rewards/consensus_reward_func": 0.5625,
783
  "rewards/cumulative_reward_2": 0.0,
784
+ "rewards/final_correctness_reward_func": 0.125,
785
+ "rewards/question_recreation_reward_func": 0.5377749832696281,
786
  "rewards/soft_format_reward_func": 0.0,
787
+ "rewards/strict_format_reward_func": 0.109375,
788
+ "rewards/xmlcount_reward_func": 0.9917187504470348,
789
  "step": 82
790
  },
791
  {
792
+ "completion_length": 252.65625,
793
+ "epoch": 4.685714285714286,
794
+ "grad_norm": 5.577023029327393,
795
+ "kl": 0.14010742434766144,
796
  "learning_rate": 3.6945794086007705e-08,
797
+ "loss": 0.0001,
798
+ "reward": 4.1389394868165255,
799
+ "reward_std": 2.314908188767731,
800
+ "rewards/concensus_correctness_reward_func": 1.5294374911900377,
801
+ "rewards/consensus_reward_func": 0.6875,
802
  "rewards/cumulative_reward_2": 0.0,
803
+ "rewards/final_correctness_reward_func": 0.625,
804
+ "rewards/question_recreation_reward_func": 0.560220692306757,
805
+ "rewards/soft_format_reward_func": 0.0,
806
+ "rewards/strict_format_reward_func": 0.078125,
807
+ "rewards/xmlcount_reward_func": 0.6586562575539574,
808
  "step": 84
809
  },
810
  {
811
+ "completion_length": 257.34375,
812
+ "epoch": 4.8,
813
+ "grad_norm": 2.5406899452209473,
814
+ "kl": 0.09263441083021462,
815
  "learning_rate": 2.892612731749414e-08,
816
+ "loss": 0.0001,
817
+ "reward": 3.948366153985262,
818
+ "reward_std": 1.6628333161497721,
819
+ "rewards/concensus_correctness_reward_func": 1.1678125290200114,
820
+ "rewards/consensus_reward_func": 0.5625,
821
  "rewards/cumulative_reward_2": 0.0,
822
+ "rewards/final_correctness_reward_func": 0.6875,
823
+ "rewards/question_recreation_reward_func": 0.6303973635658622,
824
  "rewards/soft_format_reward_func": 0.0,
825
+ "rewards/strict_format_reward_func": 0.09375,
826
+ "rewards/xmlcount_reward_func": 0.8064062613993883,
827
  "step": 86
828
  },
829
  {
830
+ "completion_length": 266.59375,
831
+ "epoch": 4.914285714285715,
832
+ "grad_norm": 2.87119197845459,
833
+ "kl": 0.11624281201511621,
834
  "learning_rate": 2.183372119961499e-08,
835
+ "loss": 0.0001,
836
+ "reward": 4.197016902267933,
837
+ "reward_std": 2.4855765970423818,
838
+ "rewards/concensus_correctness_reward_func": 1.5803749952465296,
839
+ "rewards/consensus_reward_func": 0.4375,
840
  "rewards/cumulative_reward_2": 0.0,
841
+ "rewards/final_correctness_reward_func": 0.5,
842
+ "rewards/question_recreation_reward_func": 0.6657043690793216,
843
+ "rewards/soft_format_reward_func": 0.0,
844
+ "rewards/strict_format_reward_func": 0.078125,
845
+ "rewards/xmlcount_reward_func": 0.9353125058114529,
846
  "step": 88
847
  },
848
  {
849
+ "completion_length": 259.5833333333333,
850
+ "epoch": 5.0,
851
+ "grad_norm": 3.3491158485412598,
852
+ "kl": 0.16860986345758042,
853
  "learning_rate": 1.5698323748414122e-08,
854
+ "loss": 0.0001,
855
+ "reward": 7.5917567908763885,
856
+ "reward_std": 5.551452632372578,
857
+ "rewards/concensus_correctness_reward_func": 4.24174995906651,
858
+ "rewards/consensus_reward_func": 0.8333333333333334,
859
  "rewards/cumulative_reward_2": 0.0,
860
+ "rewards/final_correctness_reward_func": 0.8333333333333334,
861
+ "rewards/question_recreation_reward_func": 0.7116317736605803,
862
  "rewards/soft_format_reward_func": 0.0,
863
+ "rewards/strict_format_reward_func": 0.041666666666666664,
864
+ "rewards/xmlcount_reward_func": 0.9300416857004166,
865
  "step": 90
866
  },
867
  {
868
+ "completion_length": 259.78125,
869
+ "epoch": 5.114285714285714,
870
+ "grad_norm": 4.982123851776123,
871
+ "kl": 0.2259263969026506,
872
  "learning_rate": 1.054566895300324e-08,
873
+ "loss": 0.0002,
874
+ "reward": 4.731869850307703,
875
+ "reward_std": 3.6210765979485586,
876
+ "rewards/concensus_correctness_reward_func": 1.8394375070929527,
877
+ "rewards/consensus_reward_func": 0.75,
878
  "rewards/cumulative_reward_2": 0.0,
879
+ "rewards/final_correctness_reward_func": 0.625,
880
+ "rewards/question_recreation_reward_func": 0.6873698411509395,
881
  "rewards/soft_format_reward_func": 0.0,
882
  "rewards/strict_format_reward_func": 0.046875,
883
+ "rewards/xmlcount_reward_func": 0.7831875099800527,
884
  "step": 92
885
  },
886
  {
887
+ "completion_length": 309.03125,
888
+ "epoch": 5.228571428571429,
889
+ "grad_norm": 2.2618565559387207,
890
+ "kl": 0.2819003712502308,
891
  "learning_rate": 6.397368838268496e-09,
892
+ "loss": 0.0003,
893
+ "reward": 3.303976181894541,
894
+ "reward_std": 2.2868103915825486,
895
+ "rewards/concensus_correctness_reward_func": 0.8359999973326921,
896
+ "rewards/consensus_reward_func": 0.8125,
897
  "rewards/cumulative_reward_2": 0.0,
898
+ "rewards/final_correctness_reward_func": 0.25,
899
+ "rewards/question_recreation_reward_func": 0.6375387134030461,
900
  "rewards/soft_format_reward_func": 0.0,
901
+ "rewards/strict_format_reward_func": 0.09375,
902
+ "rewards/xmlcount_reward_func": 0.6741875065490603,
903
  "step": 94
904
  },
905
  {
906
+ "completion_length": 263.03125,
907
+ "epoch": 5.3428571428571425,
908
+ "grad_norm": 2.844130039215088,
909
+ "kl": 0.059587346273474395,
910
  "learning_rate": 3.2708228165273244e-09,
911
+ "loss": 0.0001,
912
+ "reward": 5.61201386898756,
913
+ "reward_std": 4.463006908656098,
914
+ "rewards/concensus_correctness_reward_func": 2.957062483765185,
915
+ "rewards/consensus_reward_func": 0.625,
916
  "rewards/cumulative_reward_2": 0.0,
917
+ "rewards/final_correctness_reward_func": 0.625,
918
+ "rewards/question_recreation_reward_func": 0.5966701377183199,
919
  "rewards/soft_format_reward_func": 0.0,
920
+ "rewards/strict_format_reward_func": 0.09375,
921
+ "rewards/xmlcount_reward_func": 0.7145312689244747,
922
  "step": 96
923
  },
924
  {
925
+ "completion_length": 250.0625,
926
+ "epoch": 5.457142857142857,
927
+ "grad_norm": 2.9318857192993164,
928
+ "kl": 0.06650862575042993,
929
  "learning_rate": 1.1791447083465133e-09,
930
+ "loss": 0.0001,
931
+ "reward": 5.33496169000864,
932
+ "reward_std": 1.7955414667958394,
933
+ "rewards/concensus_correctness_reward_func": 2.302812503403402,
934
  "rewards/consensus_reward_func": 0.625,
935
  "rewards/cumulative_reward_2": 0.0,
936
+ "rewards/final_correctness_reward_func": 0.875,
937
+ "rewards/question_recreation_reward_func": 0.6075867600739002,
938
  "rewards/soft_format_reward_func": 0.0,
939
  "rewards/strict_format_reward_func": 0.0625,
940
+ "rewards/xmlcount_reward_func": 0.8620625035837293,
941
  "step": 98
942
  },
943
  {
944
+ "completion_length": 270.4375,
945
+ "epoch": 5.571428571428571,
946
+ "grad_norm": 2.9673755168914795,
947
+ "kl": 0.08620016509667039,
948
  "learning_rate": 1.3110773862126667e-10,
949
+ "loss": 0.0001,
950
+ "reward": 4.476045485585928,
951
+ "reward_std": 3.1387285026721656,
952
+ "rewards/concensus_correctness_reward_func": 1.7854374921880662,
953
  "rewards/consensus_reward_func": 0.5,
954
  "rewards/cumulative_reward_2": 0.0,
955
+ "rewards/final_correctness_reward_func": 0.5625,
956
+ "rewards/question_recreation_reward_func": 0.606795561965555,
957
  "rewards/soft_format_reward_func": 0.0,
958
+ "rewards/strict_format_reward_func": 0.109375,
959
+ "rewards/xmlcount_reward_func": 0.9119375087320805,
960
  "step": 100
961
  },
962
  {
963
+ "epoch": 5.571428571428571,
964
  "step": 100,
965
  "total_flos": 0.0,
966
+ "train_loss": 5.454165089759044e-05,
967
+ "train_runtime": 1795.073,
968
+ "train_samples_per_second": 0.891,
969
+ "train_steps_per_second": 0.056
970
  }
971
  ],
972
  "logging_steps": 2,
973
  "max_steps": 100,
974
  "num_input_tokens_seen": 0,
975
+ "num_train_epochs": 6,
976
  "save_steps": 25,
977
  "stateful_callbacks": {
978
  "TrainerControl": {