wyceee commited on
Commit
6301e79
·
verified ·
1 Parent(s): ed39b5e

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 69070.94929174794,
4
- "train_runtime": 1492.4283,
5
  "train_samples": 140,
6
- "train_samples_per_second": 1.072,
7
  "train_steps_per_second": 0.067
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 296549.3844699778,
4
+ "train_runtime": 1501.8809,
5
  "train_samples": 140,
6
+ "train_samples_per_second": 1.065,
7
  "train_steps_per_second": 0.067
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60f95d9dca8dab5a3bbe962bd31abceb4e646ea910d56401b65c9e5bbb65a2f5
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aaa9065df806024fdc8b6235eb388fe611870af778efd05d97eff23f4eb2a52
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2812ae866c6020426eb8b6dd447cce7d4d1747315cf87bbcf91b987b63db1433
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3124cc9c290006d42f9e8ff285c36d3a31712f9601e7c939cc305a738cb9539c
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 69070.94929174794,
4
- "train_runtime": 1492.4283,
5
  "train_samples": 140,
6
- "train_samples_per_second": 1.072,
7
  "train_steps_per_second": 0.067
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 296549.3844699778,
4
+ "train_runtime": 1501.8809,
5
  "train_samples": 140,
6
+ "train_samples_per_second": 1.065,
7
  "train_steps_per_second": 0.067
8
  }
trainer_state.json CHANGED
@@ -10,962 +10,962 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 296.65625,
14
  "epoch": 0.11428571428571428,
15
- "grad_norm": 6.421307563781738,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": 0.0,
19
- "reward": 16.115779522806406,
20
- "reward_std": 1.6200225981592666,
21
- "rewards/concensus_correctness_reward_func": 11.875,
22
- "rewards/consensus_reward_func": 1.1875,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 1.3125,
25
- "rewards/question_recreation_reward_func": 0.8294045682996511,
26
  "rewards/soft_format_reward_func": 0.0,
27
  "rewards/strict_format_reward_func": 0.25,
28
- "rewards/xmlcount_reward_func": 0.6613750024698675,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 287.375,
33
  "epoch": 0.22857142857142856,
34
- "grad_norm": 83.90865325927734,
35
- "kl": 0.060066412363084964,
36
  "learning_rate": 5e-07,
37
- "loss": 0.0001,
38
- "reward": 22.926603138446808,
39
- "reward_std": 1.1492585969390348,
40
- "rewards/concensus_correctness_reward_func": 16.875,
41
- "rewards/consensus_reward_func": 1.9375,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 1.4375,
44
- "rewards/question_recreation_reward_func": 0.9539468586444855,
45
  "rewards/soft_format_reward_func": 0.0,
46
  "rewards/strict_format_reward_func": 0.484375,
47
- "rewards/xmlcount_reward_func": 1.23828125,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 303.34375,
52
  "epoch": 0.34285714285714286,
53
- "grad_norm": 8.03852367401123,
54
- "kl": 0.08650782657787204,
55
  "learning_rate": 4.994757065594279e-07,
56
- "loss": 0.0001,
57
- "reward": 19.984008461236954,
58
- "reward_std": 0.1418806642468553,
59
- "rewards/concensus_correctness_reward_func": 13.75,
60
- "rewards/consensus_reward_func": 2.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 1.5625,
63
- "rewards/question_recreation_reward_func": 0.9683833792805672,
64
  "rewards/soft_format_reward_func": 0.0,
65
  "rewards/strict_format_reward_func": 0.453125,
66
- "rewards/xmlcount_reward_func": 1.25,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 287.90625,
71
  "epoch": 0.45714285714285713,
72
- "grad_norm": 5.897157669067383,
73
- "kl": 0.2046783430268988,
74
  "learning_rate": 4.979050253066063e-07,
75
- "loss": 0.0002,
76
- "reward": 21.05784034729004,
77
- "reward_std": 0.1875998378091026,
78
- "rewards/concensus_correctness_reward_func": 15.0,
79
  "rewards/consensus_reward_func": 2.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 1.375,
82
- "rewards/question_recreation_reward_func": 0.9328403398394585,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.5,
85
  "rewards/xmlcount_reward_func": 1.25,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 307.34375,
90
  "epoch": 0.5714285714285714,
91
- "grad_norm": 10.724071502685547,
92
- "kl": 0.43954612920060754,
93
  "learning_rate": 4.952945442245597e-07,
94
- "loss": 0.0004,
95
- "reward": 24.9216285943985,
96
- "reward_std": 0.23980266088619828,
97
  "rewards/concensus_correctness_reward_func": 18.75,
98
  "rewards/consensus_reward_func": 2.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 1.5,
101
- "rewards/question_recreation_reward_func": 0.9216288048774004,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.5,
104
- "rewards/xmlcount_reward_func": 1.25,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 288.09375,
109
  "epoch": 0.6857142857142857,
110
- "grad_norm": 78.01028442382812,
111
- "kl": 3.3898711418733,
112
  "learning_rate": 4.916552125781528e-07,
113
- "loss": 0.0034,
114
- "reward": 25.49212944507599,
115
- "reward_std": 0.004952086288540158,
116
- "rewards/concensus_correctness_reward_func": 18.75,
117
  "rewards/consensus_reward_func": 2.0,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 2.0,
120
- "rewards/question_recreation_reward_func": 0.9921293407678604,
121
  "rewards/soft_format_reward_func": 0.0,
122
  "rewards/strict_format_reward_func": 0.5,
123
  "rewards/xmlcount_reward_func": 1.25,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 288.09375,
128
  "epoch": 0.8,
129
- "grad_norm": 5.335823059082031,
130
- "kl": 0.6103032417595387,
131
  "learning_rate": 4.870022949890676e-07,
132
- "loss": 0.0005,
133
- "reward": 22.442598521709442,
134
- "reward_std": 0.2463319645394222,
135
- "rewards/concensus_correctness_reward_func": 16.25,
136
  "rewards/consensus_reward_func": 2.0,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 1.5,
139
- "rewards/question_recreation_reward_func": 0.9621299877762794,
140
  "rewards/soft_format_reward_func": 0.0,
141
  "rewards/strict_format_reward_func": 0.484375,
142
- "rewards/xmlcount_reward_func": 1.24609375,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 302.65625,
147
  "epoch": 0.9142857142857143,
148
- "grad_norm": 555077.625,
149
- "kl": 19965.11735479813,
150
  "learning_rate": 4.81355307410676e-07,
151
- "loss": 19.9651,
152
- "reward": 20.95334878563881,
153
- "reward_std": 0.4493027157150209,
154
- "rewards/concensus_correctness_reward_func": 15.0,
155
- "rewards/consensus_reward_func": 2.0,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 1.375,
158
- "rewards/question_recreation_reward_func": 0.9466612227261066,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.40625,
161
- "rewards/xmlcount_reward_func": 1.225437507033348,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 301.5,
166
  "epoch": 1.0,
167
- "grad_norm": 0.38694965839385986,
168
- "kl": 0.24521661549806595,
169
  "learning_rate": 4.747379352713488e-07,
170
- "loss": 0.0002,
171
- "reward": 21.118529717127483,
172
- "reward_std": 0.12700148997828364,
173
- "rewards/concensus_correctness_reward_func": 15.0,
174
- "rewards/consensus_reward_func": 2.0,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 1.4166666666666667,
177
- "rewards/question_recreation_reward_func": 0.9935297220945358,
178
  "rewards/soft_format_reward_func": 0.0,
179
  "rewards/strict_format_reward_func": 0.4583333333333333,
180
- "rewards/xmlcount_reward_func": 1.25,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 296.09375,
185
  "epoch": 1.1142857142857143,
186
- "grad_norm": 5.091805458068848,
187
- "kl": 0.8085636631585658,
188
  "learning_rate": 4.6717793412953776e-07,
189
- "loss": 0.0009,
190
- "reward": 21.177861392498016,
191
- "reward_std": 0.09898363525644527,
192
- "rewards/concensus_correctness_reward_func": 15.0,
193
  "rewards/consensus_reward_func": 2.0,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 1.5,
196
- "rewards/question_recreation_reward_func": 0.9473926387727261,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.484375,
199
- "rewards/xmlcount_reward_func": 1.24609375,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 299.8125,
204
  "epoch": 1.2285714285714286,
205
- "grad_norm": 35.016902923583984,
206
- "kl": 5.345846755430102,
207
  "learning_rate": 4.5870701325731773e-07,
208
- "loss": 0.0053,
209
- "reward": 24.928703397512436,
210
- "reward_std": 0.32053298623941373,
211
  "rewards/concensus_correctness_reward_func": 18.75,
212
  "rewards/consensus_reward_func": 2.0,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 1.5625,
215
- "rewards/question_recreation_reward_func": 0.9355159923434258,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.453125,
218
- "rewards/xmlcount_reward_func": 1.227562502026558,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 299.625,
223
  "epoch": 1.342857142857143,
224
- "grad_norm": 3.4500892162323,
225
- "kl": 0.48288927460089326,
226
  "learning_rate": 4.4936070264068016e-07,
227
- "loss": 0.0005,
228
- "reward": 26.40441143512726,
229
- "reward_std": 0.20266110051306896,
230
- "rewards/concensus_correctness_reward_func": 20.0,
231
  "rewards/consensus_reward_func": 2.0,
232
  "rewards/cumulative_reward_2": 0.0,
233
  "rewards/final_correctness_reward_func": 1.75,
234
- "rewards/question_recreation_reward_func": 0.974724005907774,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.46875,
237
- "rewards/xmlcount_reward_func": 1.2109375,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 285.65625,
242
  "epoch": 1.457142857142857,
243
- "grad_norm": 61298.04296875,
244
- "kl": 15362.55208538007,
245
  "learning_rate": 4.391782039544238e-07,
246
- "loss": 15.3626,
247
- "reward": 22.46716809272766,
248
- "reward_std": 0.24589635042502778,
249
- "rewards/concensus_correctness_reward_func": 16.25,
250
- "rewards/consensus_reward_func": 2.0,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 1.625,
253
- "rewards/question_recreation_reward_func": 0.919355308637023,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.453125,
256
- "rewards/xmlcount_reward_func": 1.2196875028312206,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 284.6875,
261
  "epoch": 1.5714285714285714,
262
- "grad_norm": 10.184200286865234,
263
- "kl": 287324.80912376195,
264
  "learning_rate": 4.282022261367073e-07,
265
- "loss": 287.3248,
266
- "reward": 21.207891523838043,
267
- "reward_std": 0.3488070629828144,
268
  "rewards/concensus_correctness_reward_func": 15.0,
269
- "rewards/consensus_reward_func": 2.0,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 1.5625,
272
- "rewards/question_recreation_reward_func": 0.9461727738380432,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.453125,
275
- "rewards/xmlcount_reward_func": 1.24609375,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 292.75,
280
  "epoch": 1.6857142857142857,
281
- "grad_norm": 8.854732513427734,
282
- "kl": 15780.277490709908,
283
  "learning_rate": 4.1647880625292027e-07,
284
- "loss": 15.7803,
285
- "reward": 22.25498914718628,
286
- "reward_std": 0.4585948936874047,
287
- "rewards/concensus_correctness_reward_func": 16.25,
288
  "rewards/consensus_reward_func": 2.0,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 1.375,
291
- "rewards/question_recreation_reward_func": 0.8799891024827957,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.5,
294
- "rewards/xmlcount_reward_func": 1.25,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 301.5,
299
  "epoch": 1.8,
300
- "grad_norm": 5.105992794036865,
301
- "kl": 0.41865785513073206,
302
  "learning_rate": 4.040571164002318e-07,
303
- "loss": 0.0004,
304
- "reward": 21.198615819215775,
305
- "reward_std": 0.18745470189242042,
306
- "rewards/concensus_correctness_reward_func": 15.0,
307
- "rewards/consensus_reward_func": 2.0,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 1.5,
310
- "rewards/question_recreation_reward_func": 0.9798658639192581,
311
  "rewards/soft_format_reward_func": 0.0,
312
  "rewards/strict_format_reward_func": 0.46875,
313
- "rewards/xmlcount_reward_func": 1.25,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 291.5625,
318
  "epoch": 1.9142857142857141,
319
- "grad_norm": 70842.7109375,
320
- "kl": 7595.819510911126,
321
  "learning_rate": 3.909892574627266e-07,
322
- "loss": 7.5958,
323
- "reward": 21.041768729686737,
324
- "reward_std": 0.25126288196770474,
325
- "rewards/concensus_correctness_reward_func": 15.0,
326
  "rewards/consensus_reward_func": 2.0,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 1.3125,
329
- "rewards/question_recreation_reward_func": 0.9792686924338341,
330
  "rewards/soft_format_reward_func": 0.0,
331
  "rewards/strict_format_reward_func": 0.5,
332
  "rewards/xmlcount_reward_func": 1.25,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 298.25,
337
  "epoch": 2.0,
338
- "grad_norm": 329.4862365722656,
339
- "kl": 316224.1397752762,
340
  "learning_rate": 3.773300405821908e-07,
341
- "loss": 237.1681,
342
- "reward": 22.495349248250324,
343
- "reward_std": 0.5391187722949932,
344
  "rewards/concensus_correctness_reward_func": 16.666666666666668,
345
  "rewards/consensus_reward_func": 2.0,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 1.1666666666666667,
348
- "rewards/question_recreation_reward_func": 0.9120159248510996,
349
  "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.5,
351
- "rewards/xmlcount_reward_func": 1.25,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 278.0625,
356
  "epoch": 2.1142857142857143,
357
- "grad_norm": 83.9132308959961,
358
- "kl": 274299.0737755578,
359
  "learning_rate": 3.6313675726113475e-07,
360
- "loss": 274.2991,
361
- "reward": 26.602285027503967,
362
- "reward_std": 0.20196314249187708,
363
- "rewards/concensus_correctness_reward_func": 20.0,
364
  "rewards/consensus_reward_func": 2.0,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 1.875,
367
- "rewards/question_recreation_reward_func": 0.9772850759327412,
368
  "rewards/soft_format_reward_func": 0.0,
369
  "rewards/strict_format_reward_func": 0.5,
370
  "rewards/xmlcount_reward_func": 1.25,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 290.9375,
375
  "epoch": 2.2285714285714286,
376
- "grad_norm": 13.199821472167969,
377
- "kl": 16.826391340699047,
378
  "learning_rate": 3.484689390623218e-07,
379
- "loss": 0.0168,
380
- "reward": 23.80302059650421,
381
- "reward_std": 0.4077005465514958,
382
- "rewards/concensus_correctness_reward_func": 17.5,
383
- "rewards/consensus_reward_func": 2.0,
384
  "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 1.625,
386
- "rewards/question_recreation_reward_func": 0.9601456187665462,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.46875,
389
- "rewards/xmlcount_reward_func": 1.2491250038146973,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 305.0625,
394
  "epoch": 2.342857142857143,
395
- "grad_norm": 8.125014305114746,
396
- "kl": 1.4574856441468,
397
  "learning_rate": 3.3338810791270517e-07,
398
- "loss": 0.0014,
399
- "reward": 19.75320541858673,
400
- "reward_std": 0.3490199828884215,
401
  "rewards/concensus_correctness_reward_func": 13.75,
402
  "rewards/consensus_reward_func": 2.0,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 1.3125,
405
- "rewards/question_recreation_reward_func": 0.9407055526971817,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.5,
408
- "rewards/xmlcount_reward_func": 1.25,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 296.46875,
413
  "epoch": 2.4571428571428573,
414
- "grad_norm": 0.7579712867736816,
415
- "kl": 96.8114567277953,
416
  "learning_rate": 3.179575180590857e-07,
417
- "loss": 0.0968,
418
- "reward": 22.325815856456757,
419
- "reward_std": 0.2327170339121949,
420
- "rewards/concensus_correctness_reward_func": 16.25,
421
- "rewards/consensus_reward_func": 2.0,
422
  "rewards/cumulative_reward_2": 0.0,
423
  "rewards/final_correctness_reward_func": 1.375,
424
- "rewards/question_recreation_reward_func": 0.9508159533143044,
425
  "rewards/soft_format_reward_func": 0.0,
426
  "rewards/strict_format_reward_func": 0.5,
427
  "rewards/xmlcount_reward_func": 1.25,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 301.59375,
432
  "epoch": 2.571428571428571,
433
- "grad_norm": 5.712331771850586,
434
- "kl": 6.341034076642245,
435
  "learning_rate": 3.022418907578188e-07,
436
- "loss": 0.0063,
437
- "reward": 22.558997809886932,
438
- "reward_std": 0.0432420757424552,
439
  "rewards/concensus_correctness_reward_func": 16.25,
440
- "rewards/consensus_reward_func": 2.0,
441
  "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 1.625,
443
- "rewards/question_recreation_reward_func": 0.9808727987110615,
444
  "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.453125,
446
  "rewards/xmlcount_reward_func": 1.25,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 302.1875,
451
  "epoch": 2.685714285714286,
452
- "grad_norm": 44952662016.0,
453
- "kl": 3445322040.5881023,
454
  "learning_rate": 2.863071428113726e-07,
455
- "loss": 3445322.0,
456
- "reward": 21.52135854959488,
457
- "reward_std": 1.3011465523632069,
458
  "rewards/concensus_correctness_reward_func": 15.625,
459
  "rewards/consensus_reward_func": 1.9375,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 1.3125,
462
- "rewards/question_recreation_reward_func": 0.8964210189878941,
463
  "rewards/soft_format_reward_func": 0.0,
464
  "rewards/strict_format_reward_func": 0.5,
465
- "rewards/xmlcount_reward_func": 1.249937504529953,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 288.5,
470
  "epoch": 2.8,
471
- "grad_norm": 4259.3798828125,
472
- "kl": 223.73643614712637,
473
  "learning_rate": 2.7022011009035107e-07,
474
- "loss": 0.2237,
475
- "reward": 23.679662585258484,
476
- "reward_std": 0.1834919499233365,
477
  "rewards/concensus_correctness_reward_func": 17.5,
478
- "rewards/consensus_reward_func": 2.0,
479
  "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 1.5,
481
- "rewards/question_recreation_reward_func": 0.960912674665451,
482
  "rewards/soft_format_reward_func": 0.0,
483
  "rewards/strict_format_reward_func": 0.46875,
484
  "rewards/xmlcount_reward_func": 1.25,
485
  "step": 50
486
  },
487
  {
488
- "completion_length": 306.4375,
489
  "epoch": 2.914285714285714,
490
- "grad_norm": 204.8453826904297,
491
- "kl": 32238.16137995734,
492
  "learning_rate": 2.540482672006254e-07,
493
- "loss": 32.2382,
494
- "reward": 20.900875002145767,
495
- "reward_std": 0.08672392391599715,
496
- "rewards/concensus_correctness_reward_func": 15.0,
497
  "rewards/consensus_reward_func": 2.0,
498
  "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 1.25,
500
- "rewards/question_recreation_reward_func": 0.9594687074422836,
501
  "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.453125,
503
- "rewards/xmlcount_reward_func": 1.23828125,
504
  "step": 52
505
  },
506
  {
507
- "completion_length": 306.5416666666667,
508
  "epoch": 3.0,
509
- "grad_norm": 671869.25,
510
- "kl": 71707.22513961916,
511
  "learning_rate": 2.37859444471388e-07,
512
- "loss": 53.7804,
513
- "reward": 21.561778783798218,
514
- "reward_std": 0.2654109305391709,
515
- "rewards/concensus_correctness_reward_func": 15.0,
516
- "rewards/consensus_reward_func": 2.0,
517
  "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 1.8333333333333333,
519
- "rewards/question_recreation_reward_func": 0.978445569674174,
520
  "rewards/soft_format_reward_func": 0.0,
521
  "rewards/strict_format_reward_func": 0.5,
522
  "rewards/xmlcount_reward_func": 1.25,
523
  "step": 54
524
  },
525
  {
526
- "completion_length": 284.25,
527
  "epoch": 3.1142857142857143,
528
- "grad_norm": 75.00627899169922,
529
- "kl": 7.327355385874398,
530
  "learning_rate": 2.2172154345117894e-07,
531
- "loss": 0.0073,
532
- "reward": 23.936051309108734,
533
- "reward_std": 0.0883883461356163,
534
  "rewards/concensus_correctness_reward_func": 17.5,
535
  "rewards/consensus_reward_func": 2.0,
536
  "rewards/cumulative_reward_2": 0.0,
537
  "rewards/final_correctness_reward_func": 1.6875,
538
- "rewards/question_recreation_reward_func": 0.9985513240098953,
539
  "rewards/soft_format_reward_func": 0.0,
540
  "rewards/strict_format_reward_func": 0.5,
541
  "rewards/xmlcount_reward_func": 1.25,
542
  "step": 56
543
  },
544
  {
545
- "completion_length": 285.03125,
546
  "epoch": 3.2285714285714286,
547
- "grad_norm": 6.350292205810547,
548
- "kl": 0.6016469232272357,
549
  "learning_rate": 2.0570225210519433e-07,
550
- "loss": 0.0006,
551
- "reward": 21.1969113945961,
552
- "reward_std": 0.05649599526077509,
553
  "rewards/concensus_correctness_reward_func": 15.0,
554
  "rewards/consensus_reward_func": 2.0,
555
  "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 1.5,
557
- "rewards/question_recreation_reward_func": 0.982067696750164,
558
  "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.46875,
560
- "rewards/xmlcount_reward_func": 1.24609375,
561
  "step": 58
562
  },
563
  {
564
- "completion_length": 297.875,
565
  "epoch": 3.342857142857143,
566
- "grad_norm": 4.946191787719727,
567
- "kl": 2931.4463935733074,
568
  "learning_rate": 1.8986876090843664e-07,
569
- "loss": 2.9315,
570
- "reward": 21.175804615020752,
571
- "reward_std": 0.1754202425290714,
572
- "rewards/concensus_correctness_reward_func": 15.0,
573
- "rewards/consensus_reward_func": 2.0,
574
  "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 1.5,
576
- "rewards/question_recreation_reward_func": 0.9570545256137848,
577
  "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.46875,
579
  "rewards/xmlcount_reward_func": 1.25,
580
  "step": 60
581
  },
582
  {
583
- "completion_length": 296.6875,
584
  "epoch": 3.4571428571428573,
585
- "grad_norm": 4.3400092124938965,
586
- "kl": 647.1872735361103,
587
  "learning_rate": 1.7428748102551234e-07,
588
- "loss": 0.6472,
589
- "reward": 21.385672003030777,
590
- "reward_std": 0.11072899930877611,
591
  "rewards/concensus_correctness_reward_func": 15.0,
592
- "rewards/consensus_reward_func": 2.0,
593
  "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 1.6875,
595
- "rewards/question_recreation_reward_func": 0.9950470179319382,
596
  "rewards/soft_format_reward_func": 0.0,
597
- "rewards/strict_format_reward_func": 0.453125,
598
  "rewards/xmlcount_reward_func": 1.25,
599
  "step": 62
600
  },
601
  {
602
- "completion_length": 292.375,
603
  "epoch": 3.571428571428571,
604
- "grad_norm": 4.24850606918335,
605
- "kl": 0.27754142810590565,
606
  "learning_rate": 1.5902376575912814e-07,
607
- "loss": 0.0003,
608
- "reward": 22.218331456184387,
609
- "reward_std": 0.17698911238403525,
610
  "rewards/concensus_correctness_reward_func": 16.25,
611
  "rewards/consensus_reward_func": 2.0,
612
  "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 1.25,
614
- "rewards/question_recreation_reward_func": 0.9683314673602581,
615
  "rewards/soft_format_reward_func": 0.0,
616
  "rewards/strict_format_reward_func": 0.5,
617
  "rewards/xmlcount_reward_func": 1.25,
618
  "step": 64
619
  },
620
  {
621
- "completion_length": 300.125,
622
  "epoch": 3.685714285714286,
623
- "grad_norm": 5.008735179901123,
624
- "kl": 0.1533073673490435,
625
  "learning_rate": 1.4414163643562753e-07,
626
- "loss": 0.0002,
627
- "reward": 22.805782973766327,
628
- "reward_std": 0.09155728336190805,
629
  "rewards/concensus_correctness_reward_func": 16.25,
630
  "rewards/consensus_reward_func": 2.0,
631
  "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 1.8125,
633
- "rewards/question_recreation_reward_func": 0.993282962590456,
634
  "rewards/soft_format_reward_func": 0.0,
635
  "rewards/strict_format_reward_func": 0.5,
636
  "rewards/xmlcount_reward_func": 1.25,
637
  "step": 66
638
  },
639
  {
640
- "completion_length": 324.21875,
641
  "epoch": 3.8,
642
- "grad_norm": 4.079814434051514,
643
- "kl": 240703.86267980258,
644
  "learning_rate": 1.2970351387729872e-07,
645
- "loss": 240.7039,
646
- "reward": 22.386139184236526,
647
- "reward_std": 0.144500538790453,
648
- "rewards/concensus_correctness_reward_func": 16.25,
649
  "rewards/consensus_reward_func": 2.0,
650
  "rewards/cumulative_reward_2": 0.0,
651
  "rewards/final_correctness_reward_func": 1.4375,
652
- "rewards/question_recreation_reward_func": 0.9681704863905907,
653
  "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.484375,
655
- "rewards/xmlcount_reward_func": 1.24609375,
656
  "step": 68
657
  },
658
  {
659
- "completion_length": 291.84375,
660
  "epoch": 3.914285714285714,
661
- "grad_norm": 5.8456339836120605,
662
- "kl": 9508.670825830894,
663
  "learning_rate": 1.1576995658775404e-07,
664
- "loss": 9.5088,
665
- "reward": 24.96782159805298,
666
- "reward_std": 0.35711817086848896,
667
- "rewards/concensus_correctness_reward_func": 18.75,
668
- "rewards/consensus_reward_func": 2.0,
669
  "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 1.5625,
671
- "rewards/question_recreation_reward_func": 0.9571027122437954,
672
  "rewards/soft_format_reward_func": 0.0,
673
  "rewards/strict_format_reward_func": 0.46875,
674
- "rewards/xmlcount_reward_func": 1.229468747973442,
675
  "step": 70
676
  },
677
  {
678
- "completion_length": 309.75,
679
  "epoch": 4.0,
680
- "grad_norm": 4.866423606872559,
681
- "kl": 4863924.029457786,
682
  "learning_rate": 1.0239940674851941e-07,
683
- "loss": 3647.9429,
684
- "reward": 24.461654980977375,
685
- "reward_std": 0.09552274364978075,
686
  "rewards/concensus_correctness_reward_func": 18.333333333333332,
687
  "rewards/consensus_reward_func": 2.0,
688
  "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 1.5,
690
- "rewards/question_recreation_reward_func": 0.9409046818812689,
691
  "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.4375,
693
- "rewards/xmlcount_reward_func": 1.249916672706604,
694
  "step": 72
695
  },
696
  {
697
- "completion_length": 313.125,
698
  "epoch": 4.114285714285714,
699
- "grad_norm": 7.353360652923584,
700
- "kl": 21.889807341387495,
701
  "learning_rate": 8.964794509221507e-08,
702
- "loss": 0.0218,
703
- "reward": 23.785234093666077,
704
- "reward_std": 0.10294873413658934,
705
- "rewards/concensus_correctness_reward_func": 17.5,
706
- "rewards/consensus_reward_func": 2.0,
707
  "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 1.5625,
709
- "rewards/question_recreation_reward_func": 0.9727341197431087,
710
  "rewards/soft_format_reward_func": 0.0,
711
  "rewards/strict_format_reward_func": 0.5,
712
  "rewards/xmlcount_reward_func": 1.25,
713
  "step": 74
714
  },
715
  {
716
- "completion_length": 312.0625,
717
  "epoch": 4.228571428571429,
718
- "grad_norm": 3.4897100925445557,
719
- "kl": 3379667.7199420603,
720
  "learning_rate": 7.756905568047392e-08,
721
- "loss": 3379.668,
722
- "reward": 24.401052832603455,
723
- "reward_std": 1.1398758271243423,
724
- "rewards/concensus_correctness_reward_func": 18.249875009059906,
725
- "rewards/consensus_reward_func": 1.9375,
726
  "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 1.5625,
728
- "rewards/question_recreation_reward_func": 0.9491151906549931,
729
  "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.484375,
731
- "rewards/xmlcount_reward_func": 1.2176874987781048,
732
  "step": 76
733
  },
734
  {
735
- "completion_length": 277.21875,
736
  "epoch": 4.3428571428571425,
737
- "grad_norm": 5.596811294555664,
738
- "kl": 2.9425320012960583,
739
  "learning_rate": 6.621340157319996e-08,
740
- "loss": 0.0029,
741
- "reward": 20.854586482048035,
742
- "reward_std": 0.05484232149319723,
743
  "rewards/concensus_correctness_reward_func": 15.0,
744
  "rewards/consensus_reward_func": 2.0,
745
  "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 1.25,
747
- "rewards/question_recreation_reward_func": 0.9405239969491959,
748
  "rewards/soft_format_reward_func": 0.0,
749
  "rewards/strict_format_reward_func": 0.453125,
750
- "rewards/xmlcount_reward_func": 1.2109375,
751
  "step": 78
752
  },
753
  {
754
- "completion_length": 270.59375,
755
  "epoch": 4.457142857142857,
756
- "grad_norm": 92.10065460205078,
757
- "kl": 3.8375826531555504,
758
  "learning_rate": 5.5628612330087724e-08,
759
- "loss": 0.0038,
760
- "reward": 23.8216050863266,
761
- "reward_std": 0.2481758954236284,
762
- "rewards/concensus_correctness_reward_func": 17.5,
763
  "rewards/consensus_reward_func": 2.0,
764
  "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 1.625,
766
- "rewards/question_recreation_reward_func": 0.9661363810300827,
767
  "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.484375,
769
- "rewards/xmlcount_reward_func": 1.24609375,
770
  "step": 80
771
  },
772
  {
773
- "completion_length": 311.625,
774
  "epoch": 4.571428571428571,
775
- "grad_norm": 3.580958843231201,
776
- "kl": 0.21251056378241628,
777
  "learning_rate": 4.5859084235697235e-08,
778
- "loss": 0.0002,
779
- "reward": 17.167713046073914,
780
- "reward_std": 0.24630744382739067,
781
- "rewards/concensus_correctness_reward_func": 11.25,
782
- "rewards/consensus_reward_func": 2.0,
783
  "rewards/cumulative_reward_2": 0.0,
784
  "rewards/final_correctness_reward_func": 1.25,
785
- "rewards/question_recreation_reward_func": 0.9645880535244942,
786
  "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.453125,
788
  "rewards/xmlcount_reward_func": 1.25,
789
  "step": 82
790
  },
791
  {
792
- "completion_length": 281.8125,
793
  "epoch": 4.685714285714286,
794
- "grad_norm": 36.02040481567383,
795
- "kl": 3.439565767766908,
796
  "learning_rate": 3.6945794086007705e-08,
797
- "loss": 0.0034,
798
- "reward": 22.68697714805603,
799
- "reward_std": 0.19833910652960185,
800
  "rewards/concensus_correctness_reward_func": 16.25,
801
- "rewards/consensus_reward_func": 2.0,
802
  "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 1.75,
804
- "rewards/question_recreation_reward_func": 0.9860397130250931,
805
  "rewards/soft_format_reward_func": 0.0,
806
- "rewards/strict_format_reward_func": 0.453125,
807
- "rewards/xmlcount_reward_func": 1.2478125020861626,
808
  "step": 84
809
  },
810
  {
811
- "completion_length": 299.8125,
812
  "epoch": 4.8,
813
- "grad_norm": 5.582298755645752,
814
- "kl": 0.19541561254300177,
815
  "learning_rate": 2.892612731749414e-08,
816
- "loss": 0.0002,
817
- "reward": 23.891813337802887,
818
- "reward_std": 0.14955170825123787,
819
- "rewards/concensus_correctness_reward_func": 17.5,
820
  "rewards/consensus_reward_func": 2.0,
821
  "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 1.6875,
823
- "rewards/question_recreation_reward_func": 0.954313363879919,
824
  "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.5,
826
- "rewards/xmlcount_reward_func": 1.25,
827
  "step": 86
828
  },
829
  {
830
- "completion_length": 298.875,
831
  "epoch": 4.914285714285715,
832
- "grad_norm": 194.55441284179688,
833
- "kl": 16.51644215453416,
834
  "learning_rate": 2.183372119961499e-08,
835
- "loss": 0.0165,
836
- "reward": 23.978663206100464,
837
- "reward_std": 0.18283734598662704,
838
  "rewards/concensus_correctness_reward_func": 17.5,
839
  "rewards/consensus_reward_func": 2.0,
840
  "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 1.75,
842
- "rewards/question_recreation_reward_func": 0.978663295507431,
843
  "rewards/soft_format_reward_func": 0.0,
844
  "rewards/strict_format_reward_func": 0.5,
845
  "rewards/xmlcount_reward_func": 1.25,
846
  "step": 88
847
  },
848
  {
849
- "completion_length": 281.1666666666667,
850
  "epoch": 5.0,
851
- "grad_norm": 2.8502633571624756,
852
- "kl": 1.1907670572400093,
853
  "learning_rate": 1.5698323748414122e-08,
854
- "loss": 0.0009,
855
- "reward": 22.561877250671387,
856
- "reward_std": 0.26481426507234573,
857
- "rewards/concensus_correctness_reward_func": 16.666666666666668,
858
  "rewards/consensus_reward_func": 2.0,
859
  "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 1.1666666666666667,
861
- "rewards/question_recreation_reward_func": 0.999377171198527,
862
  "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.4791666666666667,
864
  "rewards/xmlcount_reward_func": 1.25,
865
  "step": 90
866
  },
867
  {
868
- "completion_length": 305.125,
869
  "epoch": 5.114285714285714,
870
- "grad_norm": 228.8664093017578,
871
- "kl": 25.48564671073109,
872
  "learning_rate": 1.054566895300324e-08,
873
- "loss": 0.0255,
874
- "reward": 25.125020027160645,
875
- "reward_std": 0.11629757171976962,
876
- "rewards/concensus_correctness_reward_func": 18.75,
877
- "rewards/consensus_reward_func": 2.0,
878
  "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 1.6875,
880
- "rewards/question_recreation_reward_func": 0.9687698371708393,
881
  "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.46875,
883
- "rewards/xmlcount_reward_func": 1.25,
884
  "step": 92
885
  },
886
  {
887
- "completion_length": 310.375,
888
  "epoch": 5.228571428571429,
889
- "grad_norm": 1273.266357421875,
890
- "kl": 99.00182915199548,
891
  "learning_rate": 6.397368838268496e-09,
892
- "loss": 0.099,
893
- "reward": 15.815190017223358,
894
- "reward_std": 0.18272748742310796,
895
- "rewards/concensus_correctness_reward_func": 10.0,
896
- "rewards/consensus_reward_func": 2.0,
897
  "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 1.125,
899
- "rewards/question_recreation_reward_func": 0.9714402034878731,
900
  "rewards/soft_format_reward_func": 0.0,
901
  "rewards/strict_format_reward_func": 0.46875,
902
- "rewards/xmlcount_reward_func": 1.25,
903
  "step": 94
904
  },
905
  {
906
- "completion_length": 294.28125,
907
  "epoch": 5.3428571428571425,
908
- "grad_norm": 25.452686309814453,
909
- "kl": 1.9639836233109236,
910
  "learning_rate": 3.2708228165273244e-09,
911
- "loss": 0.002,
912
- "reward": 23.699071168899536,
913
- "reward_std": 0.41938550118356943,
914
- "rewards/concensus_correctness_reward_func": 17.5,
915
  "rewards/consensus_reward_func": 2.0,
916
  "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 1.5,
918
- "rewards/question_recreation_reward_func": 0.964696180075407,
919
  "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.484375,
921
  "rewards/xmlcount_reward_func": 1.25,
922
  "step": 96
923
  },
924
  {
925
- "completion_length": 259.5625,
926
  "epoch": 5.457142857142857,
927
- "grad_norm": 2.797135591506958,
928
- "kl": 6.031112845055759,
929
  "learning_rate": 1.1791447083465133e-09,
930
- "loss": 0.006,
931
- "reward": 21.275031089782715,
932
- "reward_std": 0.09077301478828304,
933
  "rewards/concensus_correctness_reward_func": 15.0,
934
- "rewards/consensus_reward_func": 2.0,
935
  "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 1.5625,
937
- "rewards/question_recreation_reward_func": 0.9937809370458126,
938
  "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.46875,
940
- "rewards/xmlcount_reward_func": 1.25,
941
  "step": 98
942
  },
943
  {
944
- "completion_length": 289.28125,
945
  "epoch": 5.571428571428571,
946
- "grad_norm": 4.234410762786865,
947
- "kl": 0.2854203868191689,
948
  "learning_rate": 1.3110773862126667e-10,
949
- "loss": 0.0003,
950
- "reward": 26.528146028518677,
951
- "reward_std": 0.21690074453363195,
952
  "rewards/concensus_correctness_reward_func": 20.0,
953
  "rewards/consensus_reward_func": 2.0,
954
  "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 1.875,
956
- "rewards/question_recreation_reward_func": 0.9520210325717926,
957
  "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.453125,
959
- "rewards/xmlcount_reward_func": 1.2480000033974648,
960
  "step": 100
961
  },
962
  {
963
  "epoch": 5.571428571428571,
964
  "step": 100,
965
  "total_flos": 0.0,
966
- "train_loss": 69070.94929174794,
967
- "train_runtime": 1492.4283,
968
- "train_samples_per_second": 1.072,
969
  "train_steps_per_second": 0.067
970
  }
971
  ],
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 258.78125,
14
  "epoch": 0.11428571428571428,
15
+ "grad_norm": 2.054772138595581,
16
  "kl": 0.0,
17
  "learning_rate": 1.6666666666666665e-07,
18
  "loss": 0.0,
19
+ "reward": 17.02327122539282,
20
+ "reward_std": 4.40558909252286,
21
+ "rewards/concensus_correctness_reward_func": 12.5,
22
+ "rewards/consensus_reward_func": 1.25,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 1.4375,
25
+ "rewards/question_recreation_reward_func": 0.7546773618087173,
26
  "rewards/soft_format_reward_func": 0.0,
27
  "rewards/strict_format_reward_func": 0.25,
28
+ "rewards/xmlcount_reward_func": 0.8310937508940697,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 279.90625,
33
  "epoch": 0.22857142857142856,
34
+ "grad_norm": 6.912466049194336,
35
+ "kl": 0.01094070820545312,
36
  "learning_rate": 5e-07,
37
+ "loss": 0.0,
38
+ "reward": 23.899269431829453,
39
+ "reward_std": 0.04921933158766478,
40
+ "rewards/concensus_correctness_reward_func": 17.5,
41
+ "rewards/consensus_reward_func": 2.0,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 1.75,
44
+ "rewards/question_recreation_reward_func": 0.9344256119802594,
45
  "rewards/soft_format_reward_func": 0.0,
46
  "rewards/strict_format_reward_func": 0.484375,
47
+ "rewards/xmlcount_reward_func": 1.23046875,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 318.5625,
52
  "epoch": 0.34285714285714286,
53
+ "grad_norm": 5.637369632720947,
54
+ "kl": 83747384.09356707,
55
  "learning_rate": 4.994757065594279e-07,
56
+ "loss": 83747.3828,
57
+ "reward": 18.383561819791794,
58
+ "reward_std": 0.45949314400786534,
59
+ "rewards/concensus_correctness_reward_func": 12.5,
60
+ "rewards/consensus_reward_func": 1.8125,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 1.4375,
63
+ "rewards/question_recreation_reward_func": 0.9421556144952774,
64
  "rewards/soft_format_reward_func": 0.0,
65
  "rewards/strict_format_reward_func": 0.453125,
66
+ "rewards/xmlcount_reward_func": 1.23828125,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 293.25,
71
  "epoch": 0.45714285714285713,
72
+ "grad_norm": 6.48016881942749,
73
+ "kl": 0.10146056092344224,
74
  "learning_rate": 4.979050253066063e-07,
75
+ "loss": 0.0001,
76
+ "reward": 24.90006184577942,
77
+ "reward_std": 0.250804515555501,
78
+ "rewards/concensus_correctness_reward_func": 18.75,
79
  "rewards/consensus_reward_func": 2.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 1.5,
82
+ "rewards/question_recreation_reward_func": 0.9156866073608398,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.484375,
85
  "rewards/xmlcount_reward_func": 1.25,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 294.90625,
90
  "epoch": 0.5714285714285714,
91
+ "grad_norm": 16.164697647094727,
92
+ "kl": 0.09648153139278293,
93
  "learning_rate": 4.952945442245597e-07,
94
+ "loss": 0.0001,
95
+ "reward": 25.115739941596985,
96
+ "reward_std": 0.14409029902890325,
97
  "rewards/concensus_correctness_reward_func": 18.75,
98
  "rewards/consensus_reward_func": 2.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 1.6875,
101
+ "rewards/question_recreation_reward_func": 0.9477710761129856,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.484375,
104
+ "rewards/xmlcount_reward_func": 1.24609375,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 285.53125,
109
  "epoch": 0.6857142857142857,
110
+ "grad_norm": 0.182708278298378,
111
+ "kl": 0.16675707220565528,
112
  "learning_rate": 4.916552125781528e-07,
113
+ "loss": 0.0002,
114
+ "reward": 26.597843527793884,
115
+ "reward_std": 0.002685512910829857,
116
+ "rewards/concensus_correctness_reward_func": 20.0,
117
  "rewards/consensus_reward_func": 2.0,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 1.875,
120
+ "rewards/question_recreation_reward_func": 0.9728433899581432,
121
  "rewards/soft_format_reward_func": 0.0,
122
  "rewards/strict_format_reward_func": 0.5,
123
  "rewards/xmlcount_reward_func": 1.25,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 308.71875,
128
  "epoch": 0.8,
129
+ "grad_norm": 6.084612846374512,
130
+ "kl": 0.21913069766014814,
131
  "learning_rate": 4.870022949890676e-07,
132
+ "loss": 0.0002,
133
+ "reward": 21.113146513700485,
134
+ "reward_std": 0.13240044617487,
135
+ "rewards/concensus_correctness_reward_func": 15.0,
136
  "rewards/consensus_reward_func": 2.0,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 1.4375,
139
+ "rewards/question_recreation_reward_func": 0.9442716278135777,
140
  "rewards/soft_format_reward_func": 0.0,
141
  "rewards/strict_format_reward_func": 0.484375,
142
+ "rewards/xmlcount_reward_func": 1.2470000013709068,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 333.25,
147
  "epoch": 0.9142857142857143,
148
+ "grad_norm": 6157868.5,
149
+ "kl": 288729.23049705196,
150
  "learning_rate": 4.81355307410676e-07,
151
+ "loss": 288.7293,
152
+ "reward": 19.089906871318817,
153
+ "reward_std": 1.0611610219348222,
154
+ "rewards/concensus_correctness_reward_func": 13.125,
155
+ "rewards/consensus_reward_func": 1.8125,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 1.4375,
158
+ "rewards/question_recreation_reward_func": 0.9974069446325302,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.46875,
161
+ "rewards/xmlcount_reward_func": 1.248750001192093,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 285.5,
166
  "epoch": 1.0,
167
+ "grad_norm": 78.33558654785156,
168
+ "kl": 4402.47263197725,
169
  "learning_rate": 4.747379352713488e-07,
170
+ "loss": 3.3019,
171
+ "reward": 20.285069465637207,
172
+ "reward_std": 1.364617843956997,
173
+ "rewards/concensus_correctness_reward_func": 14.166666666666666,
174
+ "rewards/consensus_reward_func": 1.9166666666666667,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 1.5,
177
+ "rewards/question_recreation_reward_func": 0.9986111124356588,
178
  "rewards/soft_format_reward_func": 0.0,
179
  "rewards/strict_format_reward_func": 0.4583333333333333,
180
+ "rewards/xmlcount_reward_func": 1.2447916666666667,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 302.59375,
185
  "epoch": 1.1142857142857143,
186
+ "grad_norm": 3.6053833961486816,
187
+ "kl": 0.13355864235199988,
188
  "learning_rate": 4.6717793412953776e-07,
189
+ "loss": 0.0001,
190
+ "reward": 19.868769019842148,
191
+ "reward_std": 0.008435817871941254,
192
+ "rewards/concensus_correctness_reward_func": 13.75,
193
  "rewards/consensus_reward_func": 2.0,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 1.375,
196
+ "rewards/question_recreation_reward_func": 0.9937690384685993,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.5,
199
+ "rewards/xmlcount_reward_func": 1.25,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 298.9375,
204
  "epoch": 1.2285714285714286,
205
+ "grad_norm": 4.034364700317383,
206
+ "kl": 0.284994178917259,
207
  "learning_rate": 4.5870701325731773e-07,
208
+ "loss": 0.0003,
209
+ "reward": 25.217251539230347,
210
+ "reward_std": 0.1768817222182406,
211
  "rewards/concensus_correctness_reward_func": 18.75,
212
  "rewards/consensus_reward_func": 2.0,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 1.75,
215
+ "rewards/question_recreation_reward_func": 0.9985014833509922,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.46875,
218
+ "rewards/xmlcount_reward_func": 1.25,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 295.875,
223
  "epoch": 1.342857142857143,
224
+ "grad_norm": 19.670671463012695,
225
+ "kl": 93.65684759290889,
226
  "learning_rate": 4.4936070264068016e-07,
227
+ "loss": 0.0937,
228
+ "reward": 25.219618678092957,
229
+ "reward_std": 0.2108936388976872,
230
+ "rewards/concensus_correctness_reward_func": 18.75,
231
  "rewards/consensus_reward_func": 2.0,
232
  "rewards/cumulative_reward_2": 0.0,
233
  "rewards/final_correctness_reward_func": 1.75,
234
+ "rewards/question_recreation_reward_func": 0.9891498424112797,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.484375,
237
+ "rewards/xmlcount_reward_func": 1.24609375,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 289.5,
242
  "epoch": 1.457142857142857,
243
+ "grad_norm": 118071795712.0,
244
+ "kl": 14642561024.712172,
245
  "learning_rate": 4.391782039544238e-07,
246
+ "loss": 14642560.0,
247
+ "reward": 23.393190026283264,
248
+ "reward_std": 1.1594447159441188,
249
+ "rewards/concensus_correctness_reward_func": 16.875,
250
+ "rewards/consensus_reward_func": 1.9375,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 1.875,
253
+ "rewards/question_recreation_reward_func": 0.987252376973629,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.46875,
256
+ "rewards/xmlcount_reward_func": 1.2496875077486038,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 290.4375,
261
  "epoch": 1.5714285714285714,
262
+ "grad_norm": 985.9644165039062,
263
+ "kl": 47.3938269498758,
264
  "learning_rate": 4.282022261367073e-07,
265
+ "loss": 0.0474,
266
+ "reward": 20.982531785964966,
267
+ "reward_std": 0.09451750945299864,
268
  "rewards/concensus_correctness_reward_func": 15.0,
269
+ "rewards/consensus_reward_func": 1.875,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 1.4375,
272
+ "rewards/question_recreation_reward_func": 0.9512817114591599,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.46875,
275
+ "rewards/xmlcount_reward_func": 1.25,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 291.75,
280
  "epoch": 1.6857142857142857,
281
+ "grad_norm": 14.219393730163574,
282
+ "kl": 5697.919686453417,
283
  "learning_rate": 4.1647880625292027e-07,
284
+ "loss": 5.6979,
285
+ "reward": 24.938138127326965,
286
+ "reward_std": 0.5503595303162001,
287
+ "rewards/concensus_correctness_reward_func": 18.75,
288
  "rewards/consensus_reward_func": 2.0,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 1.5625,
291
+ "rewards/question_recreation_reward_func": 0.9167943075299263,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.484375,
294
+ "rewards/xmlcount_reward_func": 1.2244687490165234,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 320.78125,
299
  "epoch": 1.8,
300
+ "grad_norm": 111691.4140625,
301
+ "kl": 9744.625750856474,
302
  "learning_rate": 4.040571164002318e-07,
303
+ "loss": 9.7446,
304
+ "reward": 23.627057909965515,
305
+ "reward_std": 0.3214102545171045,
306
+ "rewards/concensus_correctness_reward_func": 17.5,
307
+ "rewards/consensus_reward_func": 1.875,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 1.6875,
310
+ "rewards/question_recreation_reward_func": 0.9628704935312271,
311
  "rewards/soft_format_reward_func": 0.0,
312
  "rewards/strict_format_reward_func": 0.46875,
313
+ "rewards/xmlcount_reward_func": 1.1329374983906746,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 288.90625,
318
  "epoch": 1.9142857142857141,
319
+ "grad_norm": 5.710204124450684,
320
+ "kl": 35.09870433434844,
321
  "learning_rate": 3.909892574627266e-07,
322
+ "loss": 0.0351,
323
+ "reward": 19.93251895904541,
324
+ "reward_std": 0.09295251258299686,
325
+ "rewards/concensus_correctness_reward_func": 13.75,
326
  "rewards/consensus_reward_func": 2.0,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 1.4375,
329
+ "rewards/question_recreation_reward_func": 0.9950189106166363,
330
  "rewards/soft_format_reward_func": 0.0,
331
  "rewards/strict_format_reward_func": 0.5,
332
  "rewards/xmlcount_reward_func": 1.25,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 312.9583333333333,
337
  "epoch": 2.0,
338
+ "grad_norm": 186.2655029296875,
339
+ "kl": 8339.249363450954,
340
  "learning_rate": 3.773300405821908e-07,
341
+ "loss": 6.2544,
342
+ "reward": 22.710465768973034,
343
+ "reward_std": 0.40588310376430553,
344
  "rewards/concensus_correctness_reward_func": 16.666666666666668,
345
  "rewards/consensus_reward_func": 2.0,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 1.5,
348
+ "rewards/question_recreation_reward_func": 0.8562991668780645,
349
  "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.4583333333333333,
351
+ "rewards/xmlcount_reward_func": 1.2291666666666667,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 286.53125,
356
  "epoch": 2.1142857142857143,
357
+ "grad_norm": 10.10280704498291,
358
+ "kl": 1.4235679027624428,
359
  "learning_rate": 3.6313675726113475e-07,
360
+ "loss": 0.0014,
361
+ "reward": 25.085916996002197,
362
+ "reward_std": 0.01389117946382612,
363
+ "rewards/concensus_correctness_reward_func": 18.75,
364
  "rewards/consensus_reward_func": 2.0,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 1.625,
367
+ "rewards/question_recreation_reward_func": 0.9609169252216816,
368
  "rewards/soft_format_reward_func": 0.0,
369
  "rewards/strict_format_reward_func": 0.5,
370
  "rewards/xmlcount_reward_func": 1.25,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 307.84375,
375
  "epoch": 2.2285714285714286,
376
+ "grad_norm": 301.5379333496094,
377
+ "kl": 31.047572038602084,
378
  "learning_rate": 3.484689390623218e-07,
379
+ "loss": 0.031,
380
+ "reward": 24.409271508455276,
381
+ "reward_std": 1.3443972589448094,
382
+ "rewards/concensus_correctness_reward_func": 18.125,
383
+ "rewards/consensus_reward_func": 1.9375,
384
  "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 1.75,
386
+ "rewards/question_recreation_reward_func": 0.9250839278101921,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.453125,
389
+ "rewards/xmlcount_reward_func": 1.2185625061392784,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 321.125,
394
  "epoch": 2.342857142857143,
395
+ "grad_norm": 9.44746208190918,
396
+ "kl": 99998651.09828581,
397
  "learning_rate": 3.3338810791270517e-07,
398
+ "loss": 99998.6562,
399
+ "reward": 19.91300740838051,
400
+ "reward_std": 0.06640663610232878,
401
  "rewards/concensus_correctness_reward_func": 13.75,
402
  "rewards/consensus_reward_func": 2.0,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 1.4375,
405
+ "rewards/question_recreation_reward_func": 0.9950386732816696,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.484375,
408
+ "rewards/xmlcount_reward_func": 1.24609375,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 297.0,
413
  "epoch": 2.4571428571428573,
414
+ "grad_norm": 4.234458923339844,
415
+ "kl": 26.058129234821536,
416
  "learning_rate": 3.179575180590857e-07,
417
+ "loss": 0.0261,
418
+ "reward": 20.36888739466667,
419
+ "reward_std": 1.2362263132818043,
420
+ "rewards/concensus_correctness_reward_func": 14.375,
421
+ "rewards/consensus_reward_func": 1.9375,
422
  "rewards/cumulative_reward_2": 0.0,
423
  "rewards/final_correctness_reward_func": 1.375,
424
+ "rewards/question_recreation_reward_func": 0.9313873276114464,
425
  "rewards/soft_format_reward_func": 0.0,
426
  "rewards/strict_format_reward_func": 0.5,
427
  "rewards/xmlcount_reward_func": 1.25,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 293.375,
432
  "epoch": 2.571428571428571,
433
+ "grad_norm": 3.131242513656616,
434
+ "kl": 0.3580109770409763,
435
  "learning_rate": 3.022418907578188e-07,
436
+ "loss": 0.0004,
437
+ "reward": 22.51879781484604,
438
+ "reward_std": 0.1030593290925026,
439
  "rewards/concensus_correctness_reward_func": 16.25,
440
+ "rewards/consensus_reward_func": 1.875,
441
  "rewards/cumulative_reward_2": 0.0,
442
+ "rewards/final_correctness_reward_func": 1.6875,
443
+ "rewards/question_recreation_reward_func": 0.9875477105379105,
444
  "rewards/soft_format_reward_func": 0.0,
445
+ "rewards/strict_format_reward_func": 0.46875,
446
  "rewards/xmlcount_reward_func": 1.25,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 303.6875,
451
  "epoch": 2.685714285714286,
452
+ "grad_norm": 7157400.5,
453
+ "kl": 723253.104146285,
454
  "learning_rate": 2.863071428113726e-07,
455
+ "loss": 723.2531,
456
+ "reward": 21.64236208796501,
457
+ "reward_std": 1.3292767723178258,
458
  "rewards/concensus_correctness_reward_func": 15.625,
459
  "rewards/consensus_reward_func": 1.9375,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 1.375,
462
+ "rewards/question_recreation_reward_func": 0.9548620991408825,
463
  "rewards/soft_format_reward_func": 0.0,
464
  "rewards/strict_format_reward_func": 0.5,
465
+ "rewards/xmlcount_reward_func": 1.25,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 288.4375,
470
  "epoch": 2.8,
471
+ "grad_norm": 3.812501907348633,
472
+ "kl": 0.1941389513667673,
473
  "learning_rate": 2.7022011009035107e-07,
474
+ "loss": 0.0002,
475
+ "reward": 23.754722118377686,
476
+ "reward_std": 0.13451411115238443,
477
  "rewards/concensus_correctness_reward_func": 17.5,
478
+ "rewards/consensus_reward_func": 1.875,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 1.6875,
481
+ "rewards/question_recreation_reward_func": 0.9734721444547176,
482
  "rewards/soft_format_reward_func": 0.0,
483
  "rewards/strict_format_reward_func": 0.46875,
484
  "rewards/xmlcount_reward_func": 1.25,
485
  "step": 50
486
  },
487
  {
488
+ "completion_length": 273.25,
489
  "epoch": 2.914285714285714,
490
+ "grad_norm": 64386.68359375,
491
+ "kl": 4703.566962761339,
492
  "learning_rate": 2.540482672006254e-07,
493
+ "loss": 4.7036,
494
+ "reward": 23.86061406135559,
495
+ "reward_std": 0.11497652903926792,
496
+ "rewards/concensus_correctness_reward_func": 17.5,
497
  "rewards/consensus_reward_func": 2.0,
498
  "rewards/cumulative_reward_2": 0.0,
499
+ "rewards/final_correctness_reward_func": 1.6875,
500
+ "rewards/question_recreation_reward_func": 0.9387390580959618,
501
  "rewards/soft_format_reward_func": 0.0,
502
+ "rewards/strict_format_reward_func": 0.484375,
503
+ "rewards/xmlcount_reward_func": 1.25,
504
  "step": 52
505
  },
506
  {
507
+ "completion_length": 283.375,
508
  "epoch": 3.0,
509
+ "grad_norm": 4.72637939453125,
510
+ "kl": 0.41162373684346676,
511
  "learning_rate": 2.37859444471388e-07,
512
+ "loss": 0.0003,
513
+ "reward": 22.2163348197937,
514
+ "reward_std": 1.4251218068723877,
515
+ "rewards/concensus_correctness_reward_func": 15.833333333333334,
516
+ "rewards/consensus_reward_func": 1.9166666666666667,
517
  "rewards/cumulative_reward_2": 0.0,
518
+ "rewards/final_correctness_reward_func": 1.75,
519
+ "rewards/question_recreation_reward_func": 0.9663348992665609,
520
  "rewards/soft_format_reward_func": 0.0,
521
  "rewards/strict_format_reward_func": 0.5,
522
  "rewards/xmlcount_reward_func": 1.25,
523
  "step": 54
524
  },
525
  {
526
+ "completion_length": 288.875,
527
  "epoch": 3.1142857142857143,
528
+ "grad_norm": 8.240327835083008,
529
+ "kl": 0.11788215674459934,
530
  "learning_rate": 2.2172154345117894e-07,
531
+ "loss": 0.0001,
532
+ "reward": 23.904327988624573,
533
+ "reward_std": 0.30620702836313285,
534
  "rewards/concensus_correctness_reward_func": 17.5,
535
  "rewards/consensus_reward_func": 2.0,
536
  "rewards/cumulative_reward_2": 0.0,
537
  "rewards/final_correctness_reward_func": 1.6875,
538
+ "rewards/question_recreation_reward_func": 0.9668280705809593,
539
  "rewards/soft_format_reward_func": 0.0,
540
  "rewards/strict_format_reward_func": 0.5,
541
  "rewards/xmlcount_reward_func": 1.25,
542
  "step": 56
543
  },
544
  {
545
+ "completion_length": 286.1875,
546
  "epoch": 3.2285714285714286,
547
+ "grad_norm": 5.542027950286865,
548
+ "kl": 0.11034615035168827,
549
  "learning_rate": 2.0570225210519433e-07,
550
+ "loss": 0.0001,
551
+ "reward": 21.113164007663727,
552
+ "reward_std": 0.19153568047477165,
553
  "rewards/concensus_correctness_reward_func": 15.0,
554
  "rewards/consensus_reward_func": 2.0,
555
  "rewards/cumulative_reward_2": 0.0,
556
+ "rewards/final_correctness_reward_func": 1.4375,
557
+ "rewards/question_recreation_reward_func": 0.941288948059082,
558
  "rewards/soft_format_reward_func": 0.0,
559
+ "rewards/strict_format_reward_func": 0.484375,
560
+ "rewards/xmlcount_reward_func": 1.25,
561
  "step": 58
562
  },
563
  {
564
+ "completion_length": 293.96875,
565
  "epoch": 3.342857142857143,
566
+ "grad_norm": 89873.234375,
567
+ "kl": 15506.7046823774,
568
  "learning_rate": 1.8986876090843664e-07,
569
+ "loss": 15.5067,
570
+ "reward": 24.321931093931198,
571
+ "reward_std": 0.9067758475139271,
572
+ "rewards/concensus_correctness_reward_func": 18.238062500953674,
573
+ "rewards/consensus_reward_func": 1.9375,
574
  "rewards/cumulative_reward_2": 0.0,
575
+ "rewards/final_correctness_reward_func": 1.4375,
576
+ "rewards/question_recreation_reward_func": 0.9588685538619757,
577
  "rewards/soft_format_reward_func": 0.0,
578
+ "rewards/strict_format_reward_func": 0.5,
579
  "rewards/xmlcount_reward_func": 1.25,
580
  "step": 60
581
  },
582
  {
583
+ "completion_length": 292.65625,
584
  "epoch": 3.4571428571428573,
585
+ "grad_norm": 1.29377019405365,
586
+ "kl": 2.4666087054647505,
587
  "learning_rate": 1.7428748102551234e-07,
588
+ "loss": 0.0025,
589
+ "reward": 21.342330873012543,
590
+ "reward_std": 0.17704137448163237,
591
  "rewards/concensus_correctness_reward_func": 15.0,
592
+ "rewards/consensus_reward_func": 1.875,
593
  "rewards/cumulative_reward_2": 0.0,
594
+ "rewards/final_correctness_reward_func": 1.75,
595
+ "rewards/question_recreation_reward_func": 0.9985808059573174,
596
  "rewards/soft_format_reward_func": 0.0,
597
+ "rewards/strict_format_reward_func": 0.46875,
598
  "rewards/xmlcount_reward_func": 1.25,
599
  "step": 62
600
  },
601
  {
602
+ "completion_length": 296.125,
603
  "epoch": 3.571428571428571,
604
+ "grad_norm": 11.036781311035156,
605
+ "kl": 1.3272000113502145,
606
  "learning_rate": 1.5902376575912814e-07,
607
+ "loss": 0.0013,
608
+ "reward": 22.402726590633392,
609
+ "reward_std": 0.0906209279637551,
610
  "rewards/concensus_correctness_reward_func": 16.25,
611
  "rewards/consensus_reward_func": 2.0,
612
  "rewards/cumulative_reward_2": 0.0,
613
+ "rewards/final_correctness_reward_func": 1.4375,
614
+ "rewards/question_recreation_reward_func": 0.9652266465127468,
615
  "rewards/soft_format_reward_func": 0.0,
616
  "rewards/strict_format_reward_func": 0.5,
617
  "rewards/xmlcount_reward_func": 1.25,
618
  "step": 64
619
  },
620
  {
621
+ "completion_length": 311.25,
622
  "epoch": 3.685714285714286,
623
+ "grad_norm": 3.4472224712371826,
624
+ "kl": 0.8945652563124895,
625
  "learning_rate": 1.4414163643562753e-07,
626
+ "loss": 0.0009,
627
+ "reward": 22.61855262517929,
628
+ "reward_std": 0.18265954998787493,
629
  "rewards/concensus_correctness_reward_func": 16.25,
630
  "rewards/consensus_reward_func": 2.0,
631
  "rewards/cumulative_reward_2": 0.0,
632
+ "rewards/final_correctness_reward_func": 1.625,
633
+ "rewards/question_recreation_reward_func": 0.9935526624321938,
634
  "rewards/soft_format_reward_func": 0.0,
635
  "rewards/strict_format_reward_func": 0.5,
636
  "rewards/xmlcount_reward_func": 1.25,
637
  "step": 66
638
  },
639
  {
640
+ "completion_length": 322.75,
641
  "epoch": 3.8,
642
+ "grad_norm": 6.844115734100342,
643
+ "kl": 3.179185423068702,
644
  "learning_rate": 1.2970351387729872e-07,
645
+ "loss": 0.0032,
646
+ "reward": 21.179388463497162,
647
+ "reward_std": 0.09523822067421861,
648
+ "rewards/concensus_correctness_reward_func": 15.0,
649
  "rewards/consensus_reward_func": 2.0,
650
  "rewards/cumulative_reward_2": 0.0,
651
  "rewards/final_correctness_reward_func": 1.4375,
652
+ "rewards/question_recreation_reward_func": 0.9918884485960007,
653
  "rewards/soft_format_reward_func": 0.0,
654
+ "rewards/strict_format_reward_func": 0.5,
655
+ "rewards/xmlcount_reward_func": 1.25,
656
  "step": 68
657
  },
658
  {
659
+ "completion_length": 280.5,
660
  "epoch": 3.914285714285714,
661
+ "grad_norm": 5.8618083000183105,
662
+ "kl": 7725.971832116949,
663
  "learning_rate": 1.1576995658775404e-07,
664
+ "loss": 7.726,
665
+ "reward": 24.525205433368683,
666
+ "reward_std": 1.1531662662309827,
667
+ "rewards/concensus_correctness_reward_func": 18.125,
668
+ "rewards/consensus_reward_func": 1.9375,
669
  "rewards/cumulative_reward_2": 0.0,
670
+ "rewards/final_correctness_reward_func": 1.8125,
671
+ "rewards/question_recreation_reward_func": 0.935486514121294,
672
  "rewards/soft_format_reward_func": 0.0,
673
  "rewards/strict_format_reward_func": 0.46875,
674
+ "rewards/xmlcount_reward_func": 1.2459687516093254,
675
  "step": 70
676
  },
677
  {
678
+ "completion_length": 295.375,
679
  "epoch": 4.0,
680
+ "grad_norm": 4.205502986907959,
681
+ "kl": 5170.305763981615,
682
  "learning_rate": 1.0239940674851941e-07,
683
+ "loss": 3.8777,
684
+ "reward": 24.69401526451111,
685
+ "reward_std": 0.29310186601166305,
686
  "rewards/concensus_correctness_reward_func": 18.333333333333332,
687
  "rewards/consensus_reward_func": 2.0,
688
  "rewards/cumulative_reward_2": 0.0,
689
+ "rewards/final_correctness_reward_func": 1.6666666666666667,
690
+ "rewards/question_recreation_reward_func": 0.9440152446428934,
691
  "rewards/soft_format_reward_func": 0.0,
692
+ "rewards/strict_format_reward_func": 0.5,
693
+ "rewards/xmlcount_reward_func": 1.25,
694
  "step": 72
695
  },
696
  {
697
+ "completion_length": 322.75,
698
  "epoch": 4.114285714285714,
699
+ "grad_norm": 853897.4375,
700
+ "kl": 93995.61682448094,
701
  "learning_rate": 8.964794509221507e-08,
702
+ "loss": 93.9956,
703
+ "reward": 21.796031087636948,
704
+ "reward_std": 1.1461751838796772,
705
+ "rewards/concensus_correctness_reward_func": 15.625,
706
+ "rewards/consensus_reward_func": 1.9375,
707
  "rewards/cumulative_reward_2": 0.0,
708
+ "rewards/final_correctness_reward_func": 1.5,
709
+ "rewards/question_recreation_reward_func": 0.9835311211645603,
710
  "rewards/soft_format_reward_func": 0.0,
711
  "rewards/strict_format_reward_func": 0.5,
712
  "rewards/xmlcount_reward_func": 1.25,
713
  "step": 74
714
  },
715
  {
716
+ "completion_length": 310.625,
717
  "epoch": 4.228571428571429,
718
+ "grad_norm": 6.561783790588379,
719
+ "kl": 0.6868871822953224,
720
  "learning_rate": 7.756905568047392e-08,
721
+ "loss": 0.0007,
722
+ "reward": 23.608940601348877,
723
+ "reward_std": 0.1481841884815367,
724
+ "rewards/concensus_correctness_reward_func": 17.5,
725
+ "rewards/consensus_reward_func": 2.0,
726
  "rewards/cumulative_reward_2": 0.0,
727
+ "rewards/final_correctness_reward_func": 1.4375,
728
+ "rewards/question_recreation_reward_func": 0.9526904746890068,
729
  "rewards/soft_format_reward_func": 0.0,
730
+ "rewards/strict_format_reward_func": 0.46875,
731
+ "rewards/xmlcount_reward_func": 1.25,
732
  "step": 76
733
  },
734
  {
735
+ "completion_length": 300.6875,
736
  "epoch": 4.3428571428571425,
737
+ "grad_norm": 5.61808967590332,
738
+ "kl": 0.17497208586428314,
739
  "learning_rate": 6.621340157319996e-08,
740
+ "loss": 0.0002,
741
+ "reward": 20.960274010896683,
742
+ "reward_std": 0.12753743107896298,
743
  "rewards/concensus_correctness_reward_func": 15.0,
744
  "rewards/consensus_reward_func": 2.0,
745
  "rewards/cumulative_reward_2": 0.0,
746
+ "rewards/final_correctness_reward_func": 1.375,
747
+ "rewards/question_recreation_reward_func": 0.9174302890896797,
748
  "rewards/soft_format_reward_func": 0.0,
749
  "rewards/strict_format_reward_func": 0.453125,
750
+ "rewards/xmlcount_reward_func": 1.2147187516093254,
751
  "step": 78
752
  },
753
  {
754
+ "completion_length": 269.59375,
755
  "epoch": 4.457142857142857,
756
+ "grad_norm": 58.665802001953125,
757
+ "kl": 5.802749904803932,
758
  "learning_rate": 5.5628612330087724e-08,
759
+ "loss": 0.0058,
760
+ "reward": 26.747634530067444,
761
+ "reward_std": 0.0016583941760472953,
762
+ "rewards/concensus_correctness_reward_func": 20.0,
763
  "rewards/consensus_reward_func": 2.0,
764
  "rewards/cumulative_reward_2": 0.0,
765
+ "rewards/final_correctness_reward_func": 2.0,
766
+ "rewards/question_recreation_reward_func": 0.9976345337927341,
767
  "rewards/soft_format_reward_func": 0.0,
768
+ "rewards/strict_format_reward_func": 0.5,
769
+ "rewards/xmlcount_reward_func": 1.25,
770
  "step": 80
771
  },
772
  {
773
+ "completion_length": 315.21875,
774
  "epoch": 4.571428571428571,
775
+ "grad_norm": 0.19868668913841248,
776
+ "kl": 131.63193395012058,
777
  "learning_rate": 4.5859084235697235e-08,
778
+ "loss": 0.1316,
779
+ "reward": 16.382025599479675,
780
+ "reward_std": 1.1490484923124313,
781
+ "rewards/concensus_correctness_reward_func": 10.625,
782
+ "rewards/consensus_reward_func": 1.8125,
783
  "rewards/cumulative_reward_2": 0.0,
784
  "rewards/final_correctness_reward_func": 1.25,
785
+ "rewards/question_recreation_reward_func": 0.9757755696773529,
786
  "rewards/soft_format_reward_func": 0.0,
787
+ "rewards/strict_format_reward_func": 0.46875,
788
  "rewards/xmlcount_reward_func": 1.25,
789
  "step": 82
790
  },
791
  {
792
+ "completion_length": 291.96875,
793
  "epoch": 4.685714285714286,
794
+ "grad_norm": 1.1480889320373535,
795
+ "kl": 0.19505003839731216,
796
  "learning_rate": 3.6945794086007705e-08,
797
+ "loss": 0.0002,
798
+ "reward": 22.737855315208435,
799
+ "reward_std": 0.0,
800
  "rewards/concensus_correctness_reward_func": 16.25,
801
+ "rewards/consensus_reward_func": 1.875,
802
  "rewards/cumulative_reward_2": 0.0,
803
+ "rewards/final_correctness_reward_func": 1.875,
804
+ "rewards/question_recreation_reward_func": 0.9878552854061127,
805
  "rewards/soft_format_reward_func": 0.0,
806
+ "rewards/strict_format_reward_func": 0.5,
807
+ "rewards/xmlcount_reward_func": 1.25,
808
  "step": 84
809
  },
810
  {
811
+ "completion_length": 284.625,
812
  "epoch": 4.8,
813
+ "grad_norm": 9.016759872436523,
814
+ "kl": 0.27029623091220856,
815
  "learning_rate": 2.892612731749414e-08,
816
+ "loss": 0.0003,
817
+ "reward": 25.095996618270874,
818
+ "reward_std": 0.38329675246495754,
819
+ "rewards/concensus_correctness_reward_func": 18.75,
820
  "rewards/consensus_reward_func": 2.0,
821
  "rewards/cumulative_reward_2": 0.0,
822
+ "rewards/final_correctness_reward_func": 1.625,
823
+ "rewards/question_recreation_reward_func": 0.9905278831720352,
824
  "rewards/soft_format_reward_func": 0.0,
825
+ "rewards/strict_format_reward_func": 0.484375,
826
+ "rewards/xmlcount_reward_func": 1.24609375,
827
  "step": 86
828
  },
829
  {
830
+ "completion_length": 282.96875,
831
  "epoch": 4.914285714285715,
832
+ "grad_norm": 10.288511276245117,
833
+ "kl": 1.007630041684024,
834
  "learning_rate": 2.183372119961499e-08,
835
+ "loss": 0.001,
836
+ "reward": 23.806805968284607,
837
+ "reward_std": 0.08782566487207077,
838
  "rewards/concensus_correctness_reward_func": 17.5,
839
  "rewards/consensus_reward_func": 2.0,
840
  "rewards/cumulative_reward_2": 0.0,
841
+ "rewards/final_correctness_reward_func": 1.5625,
842
+ "rewards/question_recreation_reward_func": 0.9943058528006077,
843
  "rewards/soft_format_reward_func": 0.0,
844
  "rewards/strict_format_reward_func": 0.5,
845
  "rewards/xmlcount_reward_func": 1.25,
846
  "step": 88
847
  },
848
  {
849
+ "completion_length": 288.9166666666667,
850
  "epoch": 5.0,
851
+ "grad_norm": 5.103769779205322,
852
+ "kl": 0.5060152485966682,
853
  "learning_rate": 1.5698323748414122e-08,
854
+ "loss": 0.0004,
855
+ "reward": 24.75821030139923,
856
+ "reward_std": 0.1651654613087885,
857
+ "rewards/concensus_correctness_reward_func": 18.333333333333332,
858
  "rewards/consensus_reward_func": 2.0,
859
  "rewards/cumulative_reward_2": 0.0,
860
+ "rewards/final_correctness_reward_func": 1.75,
861
+ "rewards/question_recreation_reward_func": 0.9873770674069723,
862
  "rewards/soft_format_reward_func": 0.0,
863
+ "rewards/strict_format_reward_func": 0.4375,
864
  "rewards/xmlcount_reward_func": 1.25,
865
  "step": 90
866
  },
867
  {
868
+ "completion_length": 312.25,
869
  "epoch": 5.114285714285714,
870
+ "grad_norm": 4.315695762634277,
871
+ "kl": 0.2543930097017437,
872
  "learning_rate": 1.054566895300324e-08,
873
+ "loss": 0.0003,
874
+ "reward": 24.557986617088318,
875
+ "reward_std": 1.1104227881878614,
876
+ "rewards/concensus_correctness_reward_func": 18.125,
877
+ "rewards/consensus_reward_func": 1.9375,
878
  "rewards/cumulative_reward_2": 0.0,
879
+ "rewards/final_correctness_reward_func": 1.8125,
880
+ "rewards/question_recreation_reward_func": 0.9994240589439869,
881
  "rewards/soft_format_reward_func": 0.0,
882
+ "rewards/strict_format_reward_func": 0.453125,
883
+ "rewards/xmlcount_reward_func": 1.2304375022649765,
884
  "step": 92
885
  },
886
  {
887
+ "completion_length": 297.375,
888
  "epoch": 5.228571428571429,
889
+ "grad_norm": 4.21738338470459,
890
+ "kl": 0.24928702949546278,
891
  "learning_rate": 6.397368838268496e-09,
892
+ "loss": 0.0002,
893
+ "reward": 19.507496863603592,
894
+ "reward_std": 0.33748881984502077,
895
+ "rewards/concensus_correctness_reward_func": 13.75,
896
+ "rewards/consensus_reward_func": 1.875,
897
  "rewards/cumulative_reward_2": 0.0,
898
+ "rewards/final_correctness_reward_func": 1.25,
899
+ "rewards/question_recreation_reward_func": 0.9494031816720963,
900
  "rewards/soft_format_reward_func": 0.0,
901
  "rewards/strict_format_reward_func": 0.46875,
902
+ "rewards/xmlcount_reward_func": 1.214343748986721,
903
  "step": 94
904
  },
905
  {
906
+ "completion_length": 286.0,
907
  "epoch": 5.3428571428571425,
908
+ "grad_norm": 4.452499866485596,
909
+ "kl": 7.8268896921072155,
910
  "learning_rate": 3.2708228165273244e-09,
911
+ "loss": 0.0078,
912
+ "reward": 22.384023070335388,
913
+ "reward_std": 0.30442836828297004,
914
+ "rewards/concensus_correctness_reward_func": 16.25,
915
  "rewards/consensus_reward_func": 2.0,
916
  "rewards/cumulative_reward_2": 0.0,
917
+ "rewards/final_correctness_reward_func": 1.4375,
918
+ "rewards/question_recreation_reward_func": 0.9465231820940971,
919
  "rewards/soft_format_reward_func": 0.0,
920
+ "rewards/strict_format_reward_func": 0.5,
921
  "rewards/xmlcount_reward_func": 1.25,
922
  "step": 96
923
  },
924
  {
925
+ "completion_length": 264.4375,
926
  "epoch": 5.457142857142857,
927
+ "grad_norm": 2.327564001083374,
928
+ "kl": 0.21026335144415498,
929
  "learning_rate": 1.1791447083465133e-09,
930
+ "loss": 0.0002,
931
+ "reward": 21.16586148738861,
932
+ "reward_std": 0.29173441627062857,
933
  "rewards/concensus_correctness_reward_func": 15.0,
934
+ "rewards/consensus_reward_func": 1.875,
935
  "rewards/cumulative_reward_2": 0.0,
936
+ "rewards/final_correctness_reward_func": 1.625,
937
+ "rewards/question_recreation_reward_func": 0.968142680823803,
938
  "rewards/soft_format_reward_func": 0.0,
939
+ "rewards/strict_format_reward_func": 0.484375,
940
+ "rewards/xmlcount_reward_func": 1.2133437506854534,
941
  "step": 98
942
  },
943
  {
944
+ "completion_length": 284.6875,
945
  "epoch": 5.571428571428571,
946
+ "grad_norm": 7.0255889892578125,
947
+ "kl": 0.2286427365615964,
948
  "learning_rate": 1.3110773862126667e-10,
949
+ "loss": 0.0002,
950
+ "reward": 26.42949390411377,
951
+ "reward_std": 0.40112065349239856,
952
  "rewards/concensus_correctness_reward_func": 20.0,
953
  "rewards/consensus_reward_func": 2.0,
954
  "rewards/cumulative_reward_2": 0.0,
955
+ "rewards/final_correctness_reward_func": 1.75,
956
+ "rewards/question_recreation_reward_func": 0.9490251876413822,
957
  "rewards/soft_format_reward_func": 0.0,
958
+ "rewards/strict_format_reward_func": 0.484375,
959
+ "rewards/xmlcount_reward_func": 1.24609375,
960
  "step": 100
961
  },
962
  {
963
  "epoch": 5.571428571428571,
964
  "step": 100,
965
  "total_flos": 0.0,
966
+ "train_loss": 296549.3844699778,
967
+ "train_runtime": 1501.8809,
968
+ "train_samples_per_second": 1.065,
969
  "train_steps_per_second": 0.067
970
  }
971
  ],