wyceee commited on
Commit
901c90c
·
verified ·
1 Parent(s): 05ccd9e

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 276553.1687706073,
4
- "train_runtime": 2525.6374,
5
- "train_samples": 206,
6
- "train_samples_per_second": 0.95,
7
- "train_steps_per_second": 0.059
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0006611158326268196,
4
+ "train_runtime": 713.3917,
5
+ "train_samples": 72,
6
+ "train_samples_per_second": 1.121,
7
+ "train_steps_per_second": 0.07
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6102ca7929a258ad58ec988fa39abf56b4c3b27e24c4c27bdb4f33590c759e89
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88d23a82979b36e975ac9b94d46aafdf2bd039961e27af4946cdf04da546bbf5
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70da21c9a2202c68e19c97f09b05110ce47570d0b9a4f89234cf025ca6c96a0a
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d061050aed06a1540a4286a6d5ab25594b16297dce0822939459f8c5da0f1b7
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 276553.1687706073,
4
- "train_runtime": 2525.6374,
5
- "train_samples": 206,
6
- "train_samples_per_second": 0.95,
7
- "train_steps_per_second": 0.059
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0006611158326268196,
4
+ "train_runtime": 713.3917,
5
+ "train_samples": 72,
6
+ "train_samples_per_second": 1.121,
7
+ "train_steps_per_second": 0.07
8
  }
trainer_state.json CHANGED
@@ -2,1450 +2,500 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.776699029126213,
6
  "eval_steps": 500,
7
- "global_step": 150,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 326.65625,
14
- "epoch": 0.07766990291262135,
15
- "grad_norm": 5.087812423706055,
16
  "kl": 0.0,
17
- "learning_rate": 1e-07,
18
  "loss": 0.0,
19
- "reward": 12.541546734049916,
20
- "reward_std": 2.0986733530589845,
21
- "rewards/concensus_correctness_reward_func": 8.84543752670288,
22
- "rewards/consensus_reward_func": 1.1875,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.9375,
25
- "rewards/question_recreation_reward_func": 0.7532341619953513,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.21875,
28
- "rewards/xmlcount_reward_func": 0.5991250043734908,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 418.21875,
33
- "epoch": 0.1553398058252427,
34
- "grad_norm": 5.183842658996582,
35
- "kl": 0.00497966196599009,
36
- "learning_rate": 3e-07,
37
  "loss": 0.0,
38
- "reward": 24.215519309043884,
39
- "reward_std": 0.6347227603837382,
40
- "rewards/concensus_correctness_reward_func": 18.75,
41
- "rewards/consensus_reward_func": 2.0,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 1.125,
44
- "rewards/question_recreation_reward_func": 0.6920818646904081,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.453125,
47
- "rewards/xmlcount_reward_func": 1.1953125,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 442.4375,
52
- "epoch": 0.23300970873786409,
53
- "grad_norm": 15.643959045410156,
54
- "kl": 0.02986301330020069,
55
- "learning_rate": 5e-07,
56
- "loss": 0.0001,
57
- "reward": 22.029146760702133,
58
- "reward_std": 0.9156581156421453,
59
- "rewards/concensus_correctness_reward_func": 17.5,
60
- "rewards/consensus_reward_func": 2.0,
61
  "rewards/cumulative_reward_2": 0.0,
62
  "rewards/final_correctness_reward_func": 0.625,
63
- "rewards/question_recreation_reward_func": 0.5350532572483644,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.421875,
66
- "rewards/xmlcount_reward_func": 0.9472187459468842,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 461.59375,
71
- "epoch": 0.3106796116504854,
72
- "grad_norm": 838.2263793945312,
73
- "kl": 5747.859675845146,
74
- "learning_rate": 4.997653255609941e-07,
75
- "loss": 5.7478,
76
- "reward": 21.297580152750015,
77
- "reward_std": 0.9976767862717679,
78
- "rewards/concensus_correctness_reward_func": 16.25,
79
- "rewards/consensus_reward_func": 2.0,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 1.0,
82
- "rewards/question_recreation_reward_func": 0.7561427439504769,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.390625,
85
- "rewards/xmlcount_reward_func": 0.9008125066757202,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 398.4375,
90
- "epoch": 0.3883495145631068,
91
- "grad_norm": 0.9904800653457642,
92
- "kl": 5.776627257338987,
93
- "learning_rate": 4.990617428207153e-07,
94
- "loss": 0.0058,
95
- "reward": 24.642052471637726,
96
- "reward_std": 0.7652387537236791,
97
- "rewards/concensus_correctness_reward_func": 18.75,
98
- "rewards/consensus_reward_func": 2.0,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 1.5625,
101
- "rewards/question_recreation_reward_func": 0.8629898414947093,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.453125,
104
- "rewards/xmlcount_reward_func": 1.013437494635582,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 359.5625,
109
- "epoch": 0.46601941747572817,
110
- "grad_norm": 4.318275451660156,
111
- "kl": 0.058380404247145634,
112
- "learning_rate": 4.978905726822423e-07,
113
- "loss": 0.0001,
114
- "reward": 23.885154008865356,
115
- "reward_std": 0.9347413125506137,
116
- "rewards/concensus_correctness_reward_func": 18.75,
117
- "rewards/consensus_reward_func": 2.0,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 1.0625,
120
- "rewards/question_recreation_reward_func": 0.7434666138142347,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.453125,
123
- "rewards/xmlcount_reward_func": 0.8760625123977661,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 464.5625,
128
- "epoch": 0.5436893203883495,
129
- "grad_norm": 751.3751831054688,
130
- "kl": 150.33370856080728,
131
- "learning_rate": 4.962540138951371e-07,
132
- "loss": 0.1503,
133
- "reward": 24.065202921628952,
134
- "reward_std": 1.3048145293723792,
135
- "rewards/concensus_correctness_reward_func": 18.75,
136
- "rewards/consensus_reward_func": 2.0,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 1.3125,
139
- "rewards/question_recreation_reward_func": 0.7590467594563961,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.390625,
142
- "rewards/xmlcount_reward_func": 0.8530312478542328,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 531.28125,
147
- "epoch": 0.6213592233009708,
148
- "grad_norm": 7.301548004150391,
149
- "kl": 80.93185214634286,
150
- "learning_rate": 4.941551389275217e-07,
151
- "loss": 0.0809,
152
- "reward": 22.246170967817307,
153
- "reward_std": 0.6592393836472183,
154
- "rewards/concensus_correctness_reward_func": 17.5,
155
- "rewards/consensus_reward_func": 2.0,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.8125,
158
- "rewards/question_recreation_reward_func": 0.5000774906948209,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.34375,
161
- "rewards/xmlcount_reward_func": 1.08984375,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 452.4375,
166
- "epoch": 0.6990291262135923,
167
- "grad_norm": 9.507166862487793,
168
- "kl": 2.0539360598195344,
169
- "learning_rate": 4.915978881978406e-07,
170
- "loss": 0.0021,
171
- "reward": 23.97859239578247,
172
- "reward_std": 1.5465943743183743,
173
- "rewards/concensus_correctness_reward_func": 18.75,
174
- "rewards/consensus_reward_func": 2.0,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 1.5625,
177
- "rewards/question_recreation_reward_func": 0.8017172659747303,
178
  "rewards/soft_format_reward_func": 0.0,
179
- "rewards/strict_format_reward_func": 0.390625,
180
- "rewards/xmlcount_reward_func": 0.4737500101327896,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 362.65625,
185
- "epoch": 0.7766990291262136,
186
- "grad_norm": 89.55291748046875,
187
- "kl": 11.09997240814846,
188
- "learning_rate": 4.88587062677137e-07,
189
- "loss": 0.0111,
190
- "reward": 23.28926581144333,
191
- "reward_std": 0.2580796589318197,
192
- "rewards/concensus_correctness_reward_func": 17.5,
193
- "rewards/consensus_reward_func": 2.0,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 1.3125,
196
- "rewards/question_recreation_reward_func": 0.8048908673226833,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.453125,
199
- "rewards/xmlcount_reward_func": 1.21875,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 355.15625,
204
- "epoch": 0.8543689320388349,
205
- "grad_norm": 152.47579956054688,
206
- "kl": 39.7650267755962,
207
- "learning_rate": 4.85128314875731e-07,
208
- "loss": 0.0398,
209
- "reward": 24.909537255764008,
210
- "reward_std": 0.4422591715119779,
211
- "rewards/concensus_correctness_reward_func": 18.75,
212
- "rewards/consensus_reward_func": 2.0,
213
- "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 1.5625,
215
- "rewards/question_recreation_reward_func": 0.8821935635060072,
216
- "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.484375,
218
- "rewards/xmlcount_reward_func": 1.23046875,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 343.1875,
223
- "epoch": 0.9320388349514563,
224
- "grad_norm": 2.846195697784424,
225
- "kl": 0.054342186282156035,
226
- "learning_rate": 4.812281382312223e-07,
227
- "loss": 0.0,
228
- "reward": 26.43182897567749,
229
- "reward_std": 0.19202635630063014,
230
- "rewards/concensus_correctness_reward_func": 20.0,
231
- "rewards/consensus_reward_func": 2.0,
232
  "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 1.75,
234
- "rewards/question_recreation_reward_func": 0.9318290073424578,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.5,
237
- "rewards/xmlcount_reward_func": 1.25,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 385.60714285714283,
242
- "epoch": 1.0,
243
- "grad_norm": 115.14498901367188,
244
- "kl": 6.028850141380515,
245
- "learning_rate": 4.768938549177392e-07,
246
- "loss": 0.0053,
247
- "reward": 24.90876007080078,
248
- "reward_std": 0.12485182759285506,
249
- "rewards/concensus_correctness_reward_func": 18.571428571428573,
250
- "rewards/consensus_reward_func": 2.0,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 1.7142857142857142,
253
- "rewards/question_recreation_reward_func": 0.9132242266620908,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.48214285714285715,
256
- "rewards/xmlcount_reward_func": 1.2276785714285714,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 402.8125,
261
- "epoch": 1.0776699029126213,
262
- "grad_norm": 8.83275032043457,
263
- "kl": 0.36135920707602054,
264
- "learning_rate": 4.721336020993228e-07,
265
- "loss": 0.0003,
266
- "reward": 21.92051473259926,
267
- "reward_std": 0.560565251740627,
268
- "rewards/concensus_correctness_reward_func": 16.25,
269
- "rewards/consensus_reward_func": 2.0,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 1.25,
272
- "rewards/question_recreation_reward_func": 0.7525460473261774,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.4375,
275
- "rewards/xmlcount_reward_func": 1.23046875,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 443.625,
280
- "epoch": 1.1553398058252426,
281
- "grad_norm": 31.22798728942871,
282
- "kl": 4.156043950817548,
283
- "learning_rate": 4.669563166532503e-07,
284
- "loss": 0.0042,
285
- "reward": 22.479478359222412,
286
- "reward_std": 1.144610105198808,
287
- "rewards/concensus_correctness_reward_func": 17.5,
288
- "rewards/consensus_reward_func": 2.0,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 1.0625,
291
- "rewards/question_recreation_reward_func": 0.6516035442473367,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.421875,
294
- "rewards/xmlcount_reward_func": 0.8434999883174896,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 343.5,
299
- "epoch": 1.233009708737864,
300
- "grad_norm": 48.63099670410156,
301
- "kl": 7.887416427605785,
302
- "learning_rate": 4.6137171839198297e-07,
303
- "loss": 0.0079,
304
- "reward": 21.779454231262207,
305
- "reward_std": 0.41974417671735864,
306
- "rewards/concensus_correctness_reward_func": 16.25,
307
- "rewards/consensus_reward_func": 2.0,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 1.125,
310
- "rewards/question_recreation_reward_func": 0.7599229542538524,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.421875,
313
- "rewards/xmlcount_reward_func": 1.22265625,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 321.28125,
318
- "epoch": 1.3106796116504853,
319
- "grad_norm": 2.55145263671875,
320
- "kl": 0.2716539311222732,
321
- "learning_rate": 4.5539029181523284e-07,
322
- "loss": 0.0003,
323
- "reward": 25.20154845714569,
324
- "reward_std": 0.015626534332113806,
325
- "rewards/concensus_correctness_reward_func": 18.75,
326
- "rewards/consensus_reward_func": 2.0,
327
- "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 1.75,
329
- "rewards/question_recreation_reward_func": 0.951548483222723,
330
- "rewards/soft_format_reward_func": 0.0,
331
- "rewards/strict_format_reward_func": 0.5,
332
- "rewards/xmlcount_reward_func": 1.25,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 400.1875,
337
- "epoch": 1.3883495145631068,
338
- "grad_norm": 41.0902214050293,
339
- "kl": 179.8097121512983,
340
- "learning_rate": 4.490232664264109e-07,
341
- "loss": 0.1798,
342
- "reward": 24.11970481276512,
343
- "reward_std": 1.2140166640747339,
344
- "rewards/concensus_correctness_reward_func": 18.125,
345
- "rewards/consensus_reward_func": 1.9375,
346
- "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 1.5625,
348
- "rewards/question_recreation_reward_func": 0.8697048253379762,
349
- "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.4375,
351
- "rewards/xmlcount_reward_func": 1.1875,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 476.65625,
356
- "epoch": 1.4660194174757282,
357
- "grad_norm": 11.256326675415039,
358
- "kl": 42.19154109992087,
359
- "learning_rate": 4.422825956504072e-07,
360
- "loss": 0.0422,
361
- "reward": 24.907432556152344,
362
- "reward_std": 1.979055570642231,
363
- "rewards/concensus_correctness_reward_func": 20.0,
364
- "rewards/consensus_reward_func": 2.0,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 1.375,
367
- "rewards/question_recreation_reward_func": 0.7545888773165643,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.375,
370
- "rewards/xmlcount_reward_func": 0.40284372866153717,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 383.59375,
375
- "epoch": 1.5436893203883495,
376
- "grad_norm": 90.17584991455078,
377
- "kl": 5.2121097205381375,
378
- "learning_rate": 4.3518093439228474e-07,
379
- "loss": 0.0053,
380
- "reward": 26.081862449645996,
381
- "reward_std": 0.27292351297001005,
382
- "rewards/concensus_correctness_reward_func": 20.0,
383
- "rewards/consensus_reward_func": 2.0,
384
- "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 1.5625,
386
- "rewards/question_recreation_reward_func": 0.8240499030798674,
387
- "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.46875,
389
- "rewards/xmlcount_reward_func": 1.2265625,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 412.90625,
394
- "epoch": 1.6213592233009708,
395
- "grad_norm": 107.38036346435547,
396
- "kl": 9.203814042499289,
397
- "learning_rate": 4.277316152790177e-07,
398
- "loss": 0.0092,
399
- "reward": 23.489959120750427,
400
- "reward_std": 0.6975525947163987,
401
- "rewards/concensus_correctness_reward_func": 17.5,
402
- "rewards/consensus_reward_func": 2.0,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 1.5,
405
- "rewards/question_recreation_reward_func": 0.861052667722106,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.4375,
408
- "rewards/xmlcount_reward_func": 1.19140625,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 469.0625,
413
- "epoch": 1.6990291262135924,
414
- "grad_norm": 17690.62109375,
415
- "kl": 2305.546810301603,
416
- "learning_rate": 4.1994862362887694e-07,
417
- "loss": 2.3055,
418
- "reward": 22.30199635028839,
419
- "reward_std": 1.4130048673432611,
420
- "rewards/concensus_correctness_reward_func": 17.5,
421
- "rewards/consensus_reward_func": 2.0,
422
  "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.8125,
424
- "rewards/question_recreation_reward_func": 0.6429024401586503,
425
  "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.390625,
427
- "rewards/xmlcount_reward_func": 0.9559687525033951,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 365.46875,
432
- "epoch": 1.7766990291262137,
433
- "grad_norm": 14.231730461120605,
434
- "kl": 217.54472852719482,
435
- "learning_rate": 4.118465711954569e-07,
436
- "loss": 0.2175,
437
- "reward": 24.774591758847237,
438
- "reward_std": 0.6327212975957082,
439
- "rewards/concensus_correctness_reward_func": 18.75,
440
- "rewards/consensus_reward_func": 2.0,
441
  "rewards/cumulative_reward_2": 0.0,
442
- "rewards/final_correctness_reward_func": 1.5625,
443
- "rewards/question_recreation_reward_func": 0.7980292174033821,
444
  "rewards/soft_format_reward_func": 0.0,
445
- "rewards/strict_format_reward_func": 0.453125,
446
- "rewards/xmlcount_reward_func": 1.2109375,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 463.15625,
451
- "epoch": 1.854368932038835,
452
- "grad_norm": 143.4844970703125,
453
- "kl": 52.87269651985844,
454
- "learning_rate": 4.0344066873563436e-07,
455
- "loss": 0.0529,
456
- "reward": 22.579327821731567,
457
- "reward_std": 0.8465873862733133,
458
- "rewards/concensus_correctness_reward_func": 17.5,
459
- "rewards/consensus_reward_func": 2.0,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 1.0,
462
- "rewards/question_recreation_reward_func": 0.6733904737047851,
463
  "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.40625,
465
- "rewards/xmlcount_reward_func": 0.9996874928474426,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 377.71875,
470
- "epoch": 1.9320388349514563,
471
- "grad_norm": 31.81964874267578,
472
- "kl": 7.2554787584231235,
473
- "learning_rate": 3.947466974529622e-07,
474
- "loss": 0.0073,
475
- "reward": 23.998561143875122,
476
- "reward_std": 0.48244747589342296,
477
- "rewards/concensus_correctness_reward_func": 18.75,
478
- "rewards/consensus_reward_func": 2.0,
479
- "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 1.0625,
481
- "rewards/question_recreation_reward_func": 0.644623419851996,
482
- "rewards/soft_format_reward_func": 0.0,
483
- "rewards/strict_format_reward_func": 0.40625,
484
- "rewards/xmlcount_reward_func": 1.1351874992251396,
485
- "step": 50
486
- },
487
- {
488
- "completion_length": 447.2857142857143,
489
- "epoch": 2.0,
490
- "grad_norm": 22.75885772705078,
491
- "kl": 15.86337446052182,
492
- "learning_rate": 3.857809793701082e-07,
493
- "loss": 0.0139,
494
- "reward": 23.066213062831334,
495
- "reward_std": 0.24750209852520907,
496
- "rewards/concensus_correctness_reward_func": 17.142857142857142,
497
- "rewards/consensus_reward_func": 2.0,
498
- "rewards/cumulative_reward_2": 0.0,
499
- "rewards/final_correctness_reward_func": 1.4285714285714286,
500
- "rewards/question_recreation_reward_func": 0.8563917829056403,
501
- "rewards/soft_format_reward_func": 0.0,
502
- "rewards/strict_format_reward_func": 0.44642857142857145,
503
- "rewards/xmlcount_reward_func": 1.1919642857142858,
504
- "step": 52
505
- },
506
- {
507
- "completion_length": 394.125,
508
- "epoch": 2.0776699029126213,
509
- "grad_norm": 39.635250091552734,
510
- "kl": 14.09071085829055,
511
- "learning_rate": 3.765603466859635e-07,
512
- "loss": 0.0141,
513
- "reward": 26.076446533203125,
514
- "reward_std": 0.586104866641108,
515
- "rewards/concensus_correctness_reward_func": 20.0,
516
- "rewards/consensus_reward_func": 2.0,
517
- "rewards/cumulative_reward_2": 0.0,
518
- "rewards/final_correctness_reward_func": 1.875,
519
- "rewards/question_recreation_reward_func": 0.9359152019023895,
520
- "rewards/soft_format_reward_func": 0.0,
521
- "rewards/strict_format_reward_func": 0.453125,
522
- "rewards/xmlcount_reward_func": 0.8124062418937683,
523
- "step": 54
524
- },
525
- {
526
- "completion_length": 326.4375,
527
- "epoch": 2.1553398058252426,
528
- "grad_norm": 5.913288593292236,
529
- "kl": 0.21376457961741835,
530
- "learning_rate": 3.6710211017494754e-07,
531
- "loss": 0.0002,
532
- "reward": 23.754507184028625,
533
- "reward_std": 0.09555516406544484,
534
- "rewards/concensus_correctness_reward_func": 17.5,
535
- "rewards/consensus_reward_func": 2.0,
536
- "rewards/cumulative_reward_2": 0.0,
537
- "rewards/final_correctness_reward_func": 1.625,
538
- "rewards/question_recreation_reward_func": 0.8990384005010128,
539
- "rewards/soft_format_reward_func": 0.0,
540
- "rewards/strict_format_reward_func": 0.484375,
541
- "rewards/xmlcount_reward_func": 1.24609375,
542
- "step": 56
543
- },
544
- {
545
- "completion_length": 407.8125,
546
- "epoch": 2.233009708737864,
547
- "grad_norm": 528601.3125,
548
- "kl": 29976.82579972106,
549
- "learning_rate": 3.5742402668783795e-07,
550
- "loss": 29.9768,
551
- "reward": 22.809705063700676,
552
- "reward_std": 0.45170957152731717,
553
- "rewards/concensus_correctness_reward_func": 17.5,
554
- "rewards/consensus_reward_func": 2.0,
555
- "rewards/cumulative_reward_2": 0.0,
556
- "rewards/final_correctness_reward_func": 1.0,
557
- "rewards/question_recreation_reward_func": 0.6847049514763057,
558
- "rewards/soft_format_reward_func": 0.0,
559
- "rewards/strict_format_reward_func": 0.4375,
560
- "rewards/xmlcount_reward_func": 1.1875,
561
- "step": 58
562
- },
563
- {
564
- "completion_length": 437.8125,
565
- "epoch": 2.3106796116504853,
566
- "grad_norm": 2.3213562965393066,
567
- "kl": 4.420506345661124,
568
- "learning_rate": 3.475442658151386e-07,
569
- "loss": 0.0044,
570
- "reward": 23.30189959704876,
571
- "reward_std": 0.14962102323806903,
572
- "rewards/concensus_correctness_reward_func": 17.5,
573
- "rewards/consensus_reward_func": 2.0,
574
- "rewards/cumulative_reward_2": 0.0,
575
- "rewards/final_correctness_reward_func": 1.3125,
576
- "rewards/question_recreation_reward_func": 0.8448684550821781,
577
- "rewards/soft_format_reward_func": 0.0,
578
- "rewards/strict_format_reward_func": 0.453125,
579
- "rewards/xmlcount_reward_func": 1.19140625,
580
- "step": 60
581
- },
582
- {
583
- "completion_length": 396.1875,
584
- "epoch": 2.3883495145631066,
585
- "grad_norm": 23.692230224609375,
586
- "kl": 4.907707513310015,
587
- "learning_rate": 3.374813757755721e-07,
588
- "loss": 0.0049,
589
- "reward": 24.607102632522583,
590
- "reward_std": 0.7705591155681759,
591
- "rewards/concensus_correctness_reward_func": 18.75,
592
- "rewards/consensus_reward_func": 2.0,
593
- "rewards/cumulative_reward_2": 0.0,
594
- "rewards/final_correctness_reward_func": 1.5,
595
- "rewards/question_recreation_reward_func": 0.7985086906701326,
596
- "rewards/soft_format_reward_func": 0.0,
597
- "rewards/strict_format_reward_func": 0.390625,
598
- "rewards/xmlcount_reward_func": 1.16796875,
599
- "step": 62
600
- },
601
- {
602
- "completion_length": 526.25,
603
- "epoch": 2.466019417475728,
604
- "grad_norm": 238.9390106201172,
605
- "kl": 65.44736079673748,
606
- "learning_rate": 3.272542485937368e-07,
607
- "loss": 0.0654,
608
- "reward": 22.10479310154915,
609
- "reward_std": 1.4251302876218688,
610
- "rewards/concensus_correctness_reward_func": 17.5,
611
- "rewards/consensus_reward_func": 2.0,
612
- "rewards/cumulative_reward_2": 0.0,
613
- "rewards/final_correctness_reward_func": 0.875,
614
- "rewards/question_recreation_reward_func": 0.5487930907984264,
615
- "rewards/soft_format_reward_func": 0.0,
616
- "rewards/strict_format_reward_func": 0.328125,
617
- "rewards/xmlcount_reward_func": 0.8528750017285347,
618
- "step": 64
619
- },
620
- {
621
- "completion_length": 419.59375,
622
- "epoch": 2.5436893203883493,
623
- "grad_norm": 28.01909065246582,
624
- "kl": 78.19776605040533,
625
- "learning_rate": 3.168820846323053e-07,
626
- "loss": 0.0782,
627
- "reward": 23.52632273733616,
628
- "reward_std": 0.41156103121466003,
629
- "rewards/concensus_correctness_reward_func": 17.5,
630
- "rewards/consensus_reward_func": 2.0,
631
- "rewards/cumulative_reward_2": 0.0,
632
- "rewards/final_correctness_reward_func": 1.625,
633
- "rewards/question_recreation_reward_func": 0.8583538136444986,
634
- "rewards/soft_format_reward_func": 0.0,
635
- "rewards/strict_format_reward_func": 0.375,
636
- "rewards/xmlcount_reward_func": 1.16796875,
637
- "step": 66
638
- },
639
- {
640
- "completion_length": 368.3125,
641
- "epoch": 2.6213592233009706,
642
- "grad_norm": 4372794368.0,
643
- "kl": 72907616.02178724,
644
- "learning_rate": 3.0638435654534855e-07,
645
- "loss": 72907.6172,
646
- "reward": 25.846083283424377,
647
- "reward_std": 0.12156938298721798,
648
- "rewards/concensus_correctness_reward_func": 20.0,
649
- "rewards/consensus_reward_func": 2.0,
650
- "rewards/cumulative_reward_2": 0.0,
651
- "rewards/final_correctness_reward_func": 1.4375,
652
- "rewards/question_recreation_reward_func": 0.7288958071731031,
653
- "rewards/soft_format_reward_func": 0.0,
654
- "rewards/strict_format_reward_func": 0.46875,
655
- "rewards/xmlcount_reward_func": 1.2109375,
656
- "step": 68
657
- },
658
- {
659
- "completion_length": 384.46875,
660
- "epoch": 2.6990291262135924,
661
- "grad_norm": 18.700071334838867,
662
- "kl": 18.835724933887832,
663
- "learning_rate": 2.9578077272046406e-07,
664
- "loss": 0.0188,
665
- "reward": 23.173890352249146,
666
- "reward_std": 0.6314981132745743,
667
- "rewards/concensus_correctness_reward_func": 17.5,
668
- "rewards/consensus_reward_func": 2.0,
669
- "rewards/cumulative_reward_2": 0.0,
670
- "rewards/final_correctness_reward_func": 1.25,
671
- "rewards/question_recreation_reward_func": 0.7949840980581939,
672
- "rewards/soft_format_reward_func": 0.0,
673
- "rewards/strict_format_reward_func": 0.4375,
674
- "rewards/xmlcount_reward_func": 1.19140625,
675
- "step": 70
676
- },
677
- {
678
- "completion_length": 470.59375,
679
- "epoch": 2.7766990291262137,
680
- "grad_norm": 153.18116760253906,
681
- "kl": 145.82655545510352,
682
- "learning_rate": 2.850912402783361e-07,
683
- "loss": 0.1458,
684
- "reward": 21.002964735031128,
685
- "reward_std": 1.2343942740408238,
686
- "rewards/concensus_correctness_reward_func": 16.25,
687
- "rewards/consensus_reward_func": 2.0,
688
- "rewards/cumulative_reward_2": 0.0,
689
- "rewards/final_correctness_reward_func": 0.8125,
690
- "rewards/question_recreation_reward_func": 0.6601520623080432,
691
- "rewards/soft_format_reward_func": 0.0,
692
- "rewards/strict_format_reward_func": 0.390625,
693
- "rewards/xmlcount_reward_func": 0.8896874934434891,
694
- "step": 72
695
- },
696
- {
697
- "completion_length": 401.875,
698
- "epoch": 2.854368932038835,
699
- "grad_norm": 5.5080952644348145,
700
- "kl": 28.601856674649753,
701
- "learning_rate": 2.743358276991975e-07,
702
- "loss": 0.0287,
703
- "reward": 24.490681916475296,
704
- "reward_std": 0.6077816126016842,
705
- "rewards/concensus_correctness_reward_func": 18.75,
706
- "rewards/consensus_reward_func": 2.0,
707
- "rewards/cumulative_reward_2": 0.0,
708
- "rewards/final_correctness_reward_func": 1.5,
709
- "rewards/question_recreation_reward_func": 0.8336818036623299,
710
- "rewards/soft_format_reward_func": 0.0,
711
- "rewards/strict_format_reward_func": 0.453125,
712
- "rewards/xmlcount_reward_func": 0.9538750052452087,
713
- "step": 74
714
- },
715
- {
716
- "completion_length": 472.5,
717
- "epoch": 2.9320388349514563,
718
- "grad_norm": 23.572187423706055,
719
- "kl": 151.66161472682143,
720
- "learning_rate": 2.635347271463544e-07,
721
- "loss": 0.1517,
722
- "reward": 22.536366999149323,
723
- "reward_std": 1.158596018794924,
724
- "rewards/concensus_correctness_reward_func": 17.5,
725
- "rewards/consensus_reward_func": 2.0,
726
- "rewards/cumulative_reward_2": 0.0,
727
- "rewards/final_correctness_reward_func": 0.875,
728
- "rewards/question_recreation_reward_func": 0.7228355249390006,
729
- "rewards/soft_format_reward_func": 0.0,
730
- "rewards/strict_format_reward_func": 0.4375,
731
- "rewards/xmlcount_reward_func": 1.0010312497615814,
732
- "step": 76
733
- },
734
- {
735
- "completion_length": 367.4642857142857,
736
- "epoch": 3.0,
737
- "grad_norm": 0.6478450298309326,
738
- "kl": 10.578315089689568,
739
- "learning_rate": 2.5270821655750997e-07,
740
- "loss": 0.0093,
741
- "reward": 24.1918785572052,
742
- "reward_std": 0.7971390634109932,
743
- "rewards/concensus_correctness_reward_func": 18.571428571428573,
744
- "rewards/consensus_reward_func": 2.0,
745
- "rewards/cumulative_reward_2": 0.0,
746
- "rewards/final_correctness_reward_func": 1.4285714285714286,
747
- "rewards/question_recreation_reward_func": 0.8539499055062022,
748
- "rewards/soft_format_reward_func": 0.0,
749
- "rewards/strict_format_reward_func": 0.48214285714285715,
750
- "rewards/xmlcount_reward_func": 0.855785710471017,
751
- "step": 78
752
- },
753
- {
754
- "completion_length": 424.09375,
755
- "epoch": 3.0776699029126213,
756
- "grad_norm": 42.824214935302734,
757
- "kl": 6.965218568395358,
758
- "learning_rate": 2.418766215750549e-07,
759
- "loss": 0.0069,
760
- "reward": 23.107779026031494,
761
- "reward_std": 0.4121289917820832,
762
- "rewards/concensus_correctness_reward_func": 17.5,
763
- "rewards/consensus_reward_func": 2.0,
764
- "rewards/cumulative_reward_2": 0.0,
765
- "rewards/final_correctness_reward_func": 1.25,
766
- "rewards/question_recreation_reward_func": 0.7991853792918846,
767
- "rewards/soft_format_reward_func": 0.0,
768
- "rewards/strict_format_reward_func": 0.40625,
769
- "rewards/xmlcount_reward_func": 1.15234375,
770
- "step": 80
771
- },
772
- {
773
- "completion_length": 405.71875,
774
- "epoch": 3.1553398058252426,
775
- "grad_norm": 302975.21875,
776
- "kl": 29946.866098867613,
777
- "learning_rate": 2.310602773867974e-07,
778
- "loss": 29.9469,
779
- "reward": 23.7015261054039,
780
- "reward_std": 1.7029972076416016,
781
- "rewards/concensus_correctness_reward_func": 18.75,
782
- "rewards/consensus_reward_func": 2.0,
783
- "rewards/cumulative_reward_2": 0.0,
784
- "rewards/final_correctness_reward_func": 1.125,
785
- "rewards/question_recreation_reward_func": 0.7799010239541531,
786
- "rewards/soft_format_reward_func": 0.0,
787
- "rewards/strict_format_reward_func": 0.4375,
788
- "rewards/xmlcount_reward_func": 0.6091249883174896,
789
- "step": 82
790
- },
791
- {
792
- "completion_length": 455.96875,
793
- "epoch": 3.233009708737864,
794
- "grad_norm": 875.0205688476562,
795
- "kl": 54.195998407143634,
796
- "learning_rate": 2.202794905487734e-07,
797
- "loss": 0.0542,
798
- "reward": 22.65313957631588,
799
- "reward_std": 0.5022053974098526,
800
- "rewards/concensus_correctness_reward_func": 17.5,
801
- "rewards/consensus_reward_func": 2.0,
802
- "rewards/cumulative_reward_2": 0.0,
803
- "rewards/final_correctness_reward_func": 1.0,
804
- "rewards/question_recreation_reward_func": 0.6375148533843458,
805
- "rewards/soft_format_reward_func": 0.0,
806
- "rewards/strict_format_reward_func": 0.375,
807
- "rewards/xmlcount_reward_func": 1.140625,
808
- "step": 84
809
- },
810
- {
811
- "completion_length": 370.8125,
812
- "epoch": 3.3106796116504853,
813
- "grad_norm": 30.859088897705078,
814
- "kl": 2.3182793061132543,
815
- "learning_rate": 2.0955450086180881e-07,
816
- "loss": 0.0023,
817
- "reward": 23.097095608711243,
818
- "reward_std": 0.5322014247940388,
819
- "rewards/concensus_correctness_reward_func": 17.5,
820
- "rewards/consensus_reward_func": 2.0,
821
- "rewards/cumulative_reward_2": 0.0,
822
- "rewards/final_correctness_reward_func": 1.1875,
823
- "rewards/question_recreation_reward_func": 0.7650644350796938,
824
- "rewards/soft_format_reward_func": 0.0,
825
- "rewards/strict_format_reward_func": 0.4375,
826
- "rewards/xmlcount_reward_func": 1.20703125,
827
- "step": 86
828
- },
829
- {
830
- "completion_length": 354.25,
831
- "epoch": 3.3883495145631066,
832
- "grad_norm": 5.160157203674316,
833
- "kl": 0.05770259181736037,
834
- "learning_rate": 1.9890544337340882e-07,
835
- "loss": 0.0001,
836
- "reward": 22.168138414621353,
837
- "reward_std": 0.37399009661749005,
838
- "rewards/concensus_correctness_reward_func": 16.25,
839
- "rewards/consensus_reward_func": 2.0,
840
- "rewards/cumulative_reward_2": 0.0,
841
- "rewards/final_correctness_reward_func": 1.3125,
842
- "rewards/question_recreation_reward_func": 0.8556385338306427,
843
- "rewards/soft_format_reward_func": 0.0,
844
- "rewards/strict_format_reward_func": 0.5,
845
- "rewards/xmlcount_reward_func": 1.25,
846
- "step": 88
847
- },
848
- {
849
- "completion_length": 428.53125,
850
- "epoch": 3.466019417475728,
851
- "grad_norm": 1738.85546875,
852
- "kl": 241.5327543563326,
853
- "learning_rate": 1.8835231057630952e-07,
854
- "loss": 0.2415,
855
- "reward": 21.881572127342224,
856
- "reward_std": 0.36119075283568236,
857
- "rewards/concensus_correctness_reward_func": 16.25,
858
- "rewards/consensus_reward_func": 2.0,
859
- "rewards/cumulative_reward_2": 0.0,
860
- "rewards/final_correctness_reward_func": 1.1875,
861
- "rewards/question_recreation_reward_func": 0.7995408938731998,
862
- "rewards/soft_format_reward_func": 0.0,
863
- "rewards/strict_format_reward_func": 0.453125,
864
- "rewards/xmlcount_reward_func": 1.19140625,
865
- "step": 90
866
- },
867
- {
868
- "completion_length": 497.125,
869
- "epoch": 3.5436893203883493,
870
- "grad_norm": 753.3305053710938,
871
- "kl": 286.44772909092717,
872
- "learning_rate": 1.779149148746623e-07,
873
- "loss": 0.2864,
874
- "reward": 23.124612510204315,
875
- "reward_std": 0.8461131641233806,
876
- "rewards/concensus_correctness_reward_func": 17.5,
877
- "rewards/consensus_reward_func": 2.0,
878
- "rewards/cumulative_reward_2": 0.0,
879
- "rewards/final_correctness_reward_func": 1.375,
880
- "rewards/question_recreation_reward_func": 0.7261750160250813,
881
- "rewards/soft_format_reward_func": 0.0,
882
- "rewards/strict_format_reward_func": 0.390625,
883
- "rewards/xmlcount_reward_func": 1.1328125,
884
- "step": 92
885
- },
886
- {
887
- "completion_length": 390.3125,
888
- "epoch": 3.6213592233009706,
889
- "grad_norm": 12.107856750488281,
890
- "kl": 67.06170634337468,
891
- "learning_rate": 1.6761285138831492e-07,
892
- "loss": 0.0671,
893
- "reward": 24.467315137386322,
894
- "reward_std": 0.5394242729380494,
895
- "rewards/concensus_correctness_reward_func": 18.75,
896
- "rewards/consensus_reward_func": 2.0,
897
- "rewards/cumulative_reward_2": 0.0,
898
- "rewards/final_correctness_reward_func": 1.4375,
899
- "rewards/question_recreation_reward_func": 0.81672128662467,
900
- "rewards/soft_format_reward_func": 0.0,
901
- "rewards/strict_format_reward_func": 0.46875,
902
- "rewards/xmlcount_reward_func": 0.9943437427282333,
903
- "step": 94
904
- },
905
- {
906
- "completion_length": 448.21875,
907
- "epoch": 3.6990291262135924,
908
- "grad_norm": 160.2933349609375,
909
- "kl": 164.18609845184255,
910
- "learning_rate": 1.5746546116502139e-07,
911
- "loss": 0.1641,
912
- "reward": 25.518889784812927,
913
- "reward_std": 0.4152768708518124,
914
- "rewards/concensus_correctness_reward_func": 20.0,
915
- "rewards/consensus_reward_func": 2.0,
916
- "rewards/cumulative_reward_2": 0.0,
917
- "rewards/final_correctness_reward_func": 1.25,
918
- "rewards/question_recreation_reward_func": 0.7102962075732648,
919
- "rewards/soft_format_reward_func": 0.0,
920
- "rewards/strict_format_reward_func": 0.40625,
921
- "rewards/xmlcount_reward_func": 1.15234375,
922
- "step": 96
923
- },
924
- {
925
- "completion_length": 359.625,
926
- "epoch": 3.7766990291262137,
927
- "grad_norm": 1.4972585439682007,
928
- "kl": 68.93537064210977,
929
- "learning_rate": 1.4749179486964598e-07,
930
- "loss": 0.0689,
931
- "reward": 25.106204017996788,
932
- "reward_std": 0.3019349011592567,
933
- "rewards/concensus_correctness_reward_func": 18.75,
934
- "rewards/consensus_reward_func": 2.0,
935
- "rewards/cumulative_reward_2": 0.0,
936
- "rewards/final_correctness_reward_func": 1.75,
937
- "rewards/question_recreation_reward_func": 0.9108914663083851,
938
- "rewards/soft_format_reward_func": 0.0,
939
- "rewards/strict_format_reward_func": 0.46875,
940
- "rewards/xmlcount_reward_func": 1.2265625,
941
- "step": 98
942
- },
943
- {
944
- "completion_length": 399.75,
945
- "epoch": 3.854368932038835,
946
- "grad_norm": 74.90406036376953,
947
- "kl": 66.00170491752215,
948
- "learning_rate": 1.377105770185303e-07,
949
- "loss": 0.066,
950
- "reward": 25.909737586975098,
951
- "reward_std": 0.3808807341847569,
952
- "rewards/concensus_correctness_reward_func": 20.0,
953
- "rewards/consensus_reward_func": 2.0,
954
- "rewards/cumulative_reward_2": 0.0,
955
- "rewards/final_correctness_reward_func": 1.5,
956
- "rewards/question_recreation_reward_func": 0.7925501388963312,
957
- "rewards/soft_format_reward_func": 0.0,
958
- "rewards/strict_format_reward_func": 0.40625,
959
- "rewards/xmlcount_reward_func": 1.2109375,
960
- "step": 100
961
- },
962
- {
963
- "completion_length": 336.40625,
964
- "epoch": 3.9320388349514563,
965
- "grad_norm": 1850.0972900390625,
966
- "kl": 279.5790399988764,
967
- "learning_rate": 1.2814017082617022e-07,
968
- "loss": 0.2796,
969
- "reward": 23.730770647525787,
970
- "reward_std": 0.5459704388704267,
971
- "rewards/concensus_correctness_reward_func": 17.5,
972
- "rewards/consensus_reward_func": 2.0,
973
- "rewards/cumulative_reward_2": 0.0,
974
- "rewards/final_correctness_reward_func": 1.625,
975
- "rewards/question_recreation_reward_func": 0.8909267354756594,
976
- "rewards/soft_format_reward_func": 0.0,
977
- "rewards/strict_format_reward_func": 0.484375,
978
- "rewards/xmlcount_reward_func": 1.23046875,
979
- "step": 102
980
- },
981
- {
982
- "completion_length": 311.14285714285717,
983
- "epoch": 4.0,
984
- "grad_norm": 2.6803574562072754,
985
- "kl": 0.10023432803739395,
986
- "learning_rate": 1.1879854373019988e-07,
987
- "loss": 0.0001,
988
- "reward": 24.816970143999374,
989
- "reward_std": 0.5062780659978411,
990
- "rewards/concensus_correctness_reward_func": 18.571428571428573,
991
- "rewards/consensus_reward_func": 2.0,
992
- "rewards/cumulative_reward_2": 0.0,
993
- "rewards/final_correctness_reward_func": 1.6428571428571428,
994
- "rewards/question_recreation_reward_func": 0.9086843686444419,
995
- "rewards/soft_format_reward_func": 0.0,
996
- "rewards/strict_format_reward_func": 0.48214285714285715,
997
- "rewards/xmlcount_reward_func": 1.211857144321714,
998
- "step": 104
999
- },
1000
- {
1001
- "completion_length": 419.3125,
1002
- "epoch": 4.077669902912621,
1003
- "grad_norm": 17364.595703125,
1004
- "kl": 1653.4706951549742,
1005
- "learning_rate": 1.0970323365940443e-07,
1006
- "loss": 1.6535,
1007
- "reward": 23.04638957977295,
1008
- "reward_std": 0.7551776195468847,
1009
- "rewards/concensus_correctness_reward_func": 17.5,
1010
- "rewards/consensus_reward_func": 2.0,
1011
- "rewards/cumulative_reward_2": 0.0,
1012
- "rewards/final_correctness_reward_func": 1.4375,
1013
- "rewards/question_recreation_reward_func": 0.8437021533027291,
1014
- "rewards/soft_format_reward_func": 0.0,
1015
- "rewards/strict_format_reward_func": 0.390625,
1016
- "rewards/xmlcount_reward_func": 0.8745625019073486,
1017
- "step": 106
1018
- },
1019
- {
1020
- "completion_length": 379.40625,
1021
- "epoch": 4.155339805825243,
1022
- "grad_norm": 5.585306167602539,
1023
- "kl": 0.2358675587165635,
1024
- "learning_rate": 1.0087131610809151e-07,
1025
- "loss": 0.0002,
1026
- "reward": 21.933794915676117,
1027
- "reward_std": 0.26604328020403045,
1028
- "rewards/concensus_correctness_reward_func": 16.25,
1029
- "rewards/consensus_reward_func": 2.0,
1030
- "rewards/cumulative_reward_2": 0.0,
1031
- "rewards/final_correctness_reward_func": 1.125,
1032
- "rewards/question_recreation_reward_func": 0.8478576112538576,
1033
- "rewards/soft_format_reward_func": 0.0,
1034
- "rewards/strict_format_reward_func": 0.46875,
1035
- "rewards/xmlcount_reward_func": 1.2421875,
1036
- "step": 108
1037
- },
1038
- {
1039
- "completion_length": 388.90625,
1040
- "epoch": 4.233009708737864,
1041
- "grad_norm": 29.13212776184082,
1042
- "kl": 27.422925863123965,
1043
- "learning_rate": 9.231937207863458e-08,
1044
- "loss": 0.0274,
1045
- "reward": 24.267252385616302,
1046
- "reward_std": 0.6281342142028734,
1047
- "rewards/concensus_correctness_reward_func": 18.75,
1048
- "rewards/consensus_reward_func": 2.0,
1049
- "rewards/cumulative_reward_2": 0.0,
1050
- "rewards/final_correctness_reward_func": 1.375,
1051
- "rewards/question_recreation_reward_func": 0.7860024385154247,
1052
- "rewards/soft_format_reward_func": 0.0,
1053
- "rewards/strict_format_reward_func": 0.46875,
1054
- "rewards/xmlcount_reward_func": 0.887499988079071,
1055
- "step": 110
1056
- },
1057
- {
1058
- "completion_length": 468.5625,
1059
- "epoch": 4.310679611650485,
1060
- "grad_norm": 22.03756332397461,
1061
- "kl": 124.18902574002277,
1062
- "learning_rate": 8.406345695237394e-08,
1063
- "loss": 0.1242,
1064
- "reward": 23.003397941589355,
1065
- "reward_std": 0.33238032677036244,
1066
- "rewards/concensus_correctness_reward_func": 17.5,
1067
- "rewards/consensus_reward_func": 2.0,
1068
- "rewards/cumulative_reward_2": 0.0,
1069
- "rewards/final_correctness_reward_func": 1.1875,
1070
- "rewards/question_recreation_reward_func": 0.7573043735465035,
1071
- "rewards/soft_format_reward_func": 0.0,
1072
- "rewards/strict_format_reward_func": 0.40625,
1073
- "rewards/xmlcount_reward_func": 1.15234375,
1074
- "step": 112
1075
- },
1076
- {
1077
- "completion_length": 321.15625,
1078
- "epoch": 4.388349514563107,
1079
- "grad_norm": 4.012566089630127,
1080
- "kl": 4.633670265262481,
1081
- "learning_rate": 7.611907034731538e-08,
1082
- "loss": 0.0046,
1083
- "reward": 23.765257954597473,
1084
- "reward_std": 0.04982626732089557,
1085
- "rewards/concensus_correctness_reward_func": 17.5,
1086
- "rewards/consensus_reward_func": 2.0,
1087
- "rewards/cumulative_reward_2": 0.0,
1088
- "rewards/final_correctness_reward_func": 1.625,
1089
- "rewards/question_recreation_reward_func": 0.909789162222296,
1090
- "rewards/soft_format_reward_func": 0.0,
1091
- "rewards/strict_format_reward_func": 0.484375,
1092
- "rewards/xmlcount_reward_func": 1.24609375,
1093
- "step": 114
1094
- },
1095
- {
1096
- "completion_length": 395.0,
1097
- "epoch": 4.466019417475728,
1098
- "grad_norm": 26.409461975097656,
1099
- "kl": 4.807898882776499,
1100
- "learning_rate": 6.850112701921735e-08,
1101
- "loss": 0.0048,
1102
- "reward": 24.434596300125122,
1103
- "reward_std": 0.2017319258520729,
1104
- "rewards/concensus_correctness_reward_func": 18.75,
1105
- "rewards/consensus_reward_func": 2.0,
1106
- "rewards/cumulative_reward_2": 0.0,
1107
- "rewards/final_correctness_reward_func": 1.3125,
1108
- "rewards/question_recreation_reward_func": 0.7431900606607087,
1109
- "rewards/soft_format_reward_func": 0.0,
1110
- "rewards/strict_format_reward_func": 0.4375,
1111
- "rewards/xmlcount_reward_func": 1.19140625,
1112
- "step": 116
1113
- },
1114
- {
1115
- "completion_length": 368.34375,
1116
- "epoch": 4.543689320388349,
1117
- "grad_norm": 321.2220458984375,
1118
- "kl": 24.05298131686868,
1119
- "learning_rate": 6.122392886069486e-08,
1120
- "loss": 0.0241,
1121
- "reward": 22.82888913154602,
1122
- "reward_std": 0.4426635252311826,
1123
- "rewards/concensus_correctness_reward_func": 17.5,
1124
- "rewards/consensus_reward_func": 2.0,
1125
- "rewards/cumulative_reward_2": 0.0,
1126
- "rewards/final_correctness_reward_func": 1.0,
1127
- "rewards/question_recreation_reward_func": 0.6882640882395208,
1128
- "rewards/soft_format_reward_func": 0.0,
1129
- "rewards/strict_format_reward_func": 0.4375,
1130
- "rewards/xmlcount_reward_func": 1.203125,
1131
- "step": 118
1132
- },
1133
- {
1134
- "completion_length": 383.34375,
1135
- "epoch": 4.621359223300971,
1136
- "grad_norm": 18.415369033813477,
1137
- "kl": 2.3699891800642945,
1138
- "learning_rate": 5.43011380509111e-08,
1139
- "loss": 0.0024,
1140
- "reward": 25.932987451553345,
1141
- "reward_std": 0.43629120306286495,
1142
- "rewards/concensus_correctness_reward_func": 20.0,
1143
- "rewards/consensus_reward_func": 2.0,
1144
- "rewards/cumulative_reward_2": 0.0,
1145
- "rewards/final_correctness_reward_func": 1.4375,
1146
- "rewards/question_recreation_reward_func": 0.7806434081867337,
1147
- "rewards/soft_format_reward_func": 0.0,
1148
- "rewards/strict_format_reward_func": 0.484375,
1149
- "rewards/xmlcount_reward_func": 1.23046875,
1150
- "step": 120
1151
- },
1152
- {
1153
- "completion_length": 395.59375,
1154
- "epoch": 4.699029126213592,
1155
- "grad_norm": 1103.49755859375,
1156
- "kl": 16051478.45370954,
1157
- "learning_rate": 4.774575140626316e-08,
1158
- "loss": 16051.4766,
1159
- "reward": 24.530160903930664,
1160
- "reward_std": 2.334480008463288,
1161
- "rewards/concensus_correctness_reward_func": 18.75,
1162
- "rewards/consensus_reward_func": 1.875,
1163
- "rewards/cumulative_reward_2": 0.0,
1164
- "rewards/final_correctness_reward_func": 1.5,
1165
- "rewards/question_recreation_reward_func": 0.7879733214504085,
1166
- "rewards/soft_format_reward_func": 0.0,
1167
- "rewards/strict_format_reward_func": 0.4375,
1168
- "rewards/xmlcount_reward_func": 1.1796875,
1169
- "step": 122
1170
- },
1171
- {
1172
- "completion_length": 365.28125,
1173
- "epoch": 4.776699029126213,
1174
- "grad_norm": 0.8362126350402832,
1175
- "kl": 5.141667291987687,
1176
- "learning_rate": 4.15700759802175e-08,
1177
- "loss": 0.0051,
1178
- "reward": 22.40603679418564,
1179
- "reward_std": 0.305447471793741,
1180
- "rewards/concensus_correctness_reward_func": 16.25,
1181
- "rewards/consensus_reward_func": 2.0,
1182
- "rewards/cumulative_reward_2": 0.0,
1183
- "rewards/final_correctness_reward_func": 1.5,
1184
- "rewards/question_recreation_reward_func": 0.9411930441856384,
1185
- "rewards/soft_format_reward_func": 0.0,
1186
- "rewards/strict_format_reward_func": 0.484375,
1187
- "rewards/xmlcount_reward_func": 1.23046875,
1188
- "step": 124
1189
- },
1190
- {
1191
- "completion_length": 403.65625,
1192
- "epoch": 4.854368932038835,
1193
- "grad_norm": 131.3492431640625,
1194
- "kl": 11.855081085057463,
1195
- "learning_rate": 3.578570595810274e-08,
1196
- "loss": 0.0118,
1197
- "reward": 25.011828303337097,
1198
- "reward_std": 1.4557088574583759,
1199
- "rewards/concensus_correctness_reward_func": 19.469312489032745,
1200
- "rewards/consensus_reward_func": 1.9375,
1201
- "rewards/cumulative_reward_2": 0.0,
1202
- "rewards/final_correctness_reward_func": 1.5,
1203
- "rewards/question_recreation_reward_func": 0.8393594082444906,
1204
- "rewards/soft_format_reward_func": 0.0,
1205
- "rewards/strict_format_reward_func": 0.453125,
1206
- "rewards/xmlcount_reward_func": 0.8125312626361847,
1207
- "step": 126
1208
- },
1209
- {
1210
- "completion_length": 392.84375,
1211
- "epoch": 4.932038834951456,
1212
- "grad_norm": 237.50135803222656,
1213
- "kl": 25.763554503879277,
1214
- "learning_rate": 3.0403500890238435e-08,
1215
- "loss": 0.0258,
1216
- "reward": 23.001006707549095,
1217
- "reward_std": 0.8864295431994833,
1218
- "rewards/concensus_correctness_reward_func": 17.5,
1219
- "rewards/consensus_reward_func": 2.0,
1220
- "rewards/cumulative_reward_2": 0.0,
1221
- "rewards/final_correctness_reward_func": 1.3125,
1222
- "rewards/question_recreation_reward_func": 0.761944386176765,
1223
- "rewards/soft_format_reward_func": 0.0,
1224
- "rewards/strict_format_reward_func": 0.421875,
1225
- "rewards/xmlcount_reward_func": 1.0046875029802322,
1226
- "step": 128
1227
- },
1228
- {
1229
- "completion_length": 351.0,
1230
- "epoch": 5.0,
1231
- "grad_norm": 29.503049850463867,
1232
- "kl": 12.375416018867067,
1233
- "learning_rate": 2.5433565304263937e-08,
1234
- "loss": 0.0108,
1235
- "reward": 22.869648592812673,
1236
- "reward_std": 0.6192083631509117,
1237
- "rewards/concensus_correctness_reward_func": 17.142857142857142,
1238
- "rewards/consensus_reward_func": 2.0,
1239
- "rewards/cumulative_reward_2": 0.0,
1240
- "rewards/final_correctness_reward_func": 1.5714285714285714,
1241
- "rewards/question_recreation_reward_func": 0.8947199302326355,
1242
- "rewards/soft_format_reward_func": 0.0,
1243
- "rewards/strict_format_reward_func": 0.4642857142857143,
1244
- "rewards/xmlcount_reward_func": 0.7963571548461914,
1245
- "step": 130
1246
- },
1247
- {
1248
- "completion_length": 440.71875,
1249
- "epoch": 5.077669902912621,
1250
- "grad_norm": 22.593021392822266,
1251
- "kl": 9.92510396009311,
1252
- "learning_rate": 2.08852297349435e-08,
1253
- "loss": 0.0099,
1254
- "reward": 23.77554178237915,
1255
- "reward_std": 1.3542704163264716,
1256
- "rewards/concensus_correctness_reward_func": 18.125,
1257
- "rewards/consensus_reward_func": 1.9375,
1258
- "rewards/cumulative_reward_2": 0.0,
1259
- "rewards/final_correctness_reward_func": 1.375,
1260
- "rewards/question_recreation_reward_func": 0.7833543182350695,
1261
- "rewards/soft_format_reward_func": 0.0,
1262
- "rewards/strict_format_reward_func": 0.390625,
1263
- "rewards/xmlcount_reward_func": 1.1640625,
1264
- "step": 132
1265
- },
1266
- {
1267
- "completion_length": 355.375,
1268
- "epoch": 5.155339805825243,
1269
- "grad_norm": 2.0897696018218994,
1270
- "kl": 0.19044405315071344,
1271
- "learning_rate": 1.6767033207062297e-08,
1272
- "loss": 0.0002,
1273
- "reward": 20.781523525714874,
1274
- "reward_std": 0.247977533377707,
1275
- "rewards/concensus_correctness_reward_func": 15.0,
1276
- "rewards/consensus_reward_func": 2.0,
1277
- "rewards/cumulative_reward_2": 0.0,
1278
- "rewards/final_correctness_reward_func": 1.125,
1279
- "rewards/question_recreation_reward_func": 0.9065235331654549,
1280
- "rewards/soft_format_reward_func": 0.0,
1281
- "rewards/strict_format_reward_func": 0.5,
1282
- "rewards/xmlcount_reward_func": 1.25,
1283
- "step": 134
1284
- },
1285
- {
1286
- "completion_length": 385.9375,
1287
- "epoch": 5.233009708737864,
1288
- "grad_norm": 6.2668375968933105,
1289
- "kl": 6.430564053385751,
1290
- "learning_rate": 1.3086707204299413e-08,
1291
- "loss": 0.0064,
1292
- "reward": 23.092693090438843,
1293
- "reward_std": 0.3644739533774555,
1294
- "rewards/concensus_correctness_reward_func": 17.5,
1295
- "rewards/consensus_reward_func": 2.0,
1296
- "rewards/cumulative_reward_2": 0.0,
1297
- "rewards/final_correctness_reward_func": 1.125,
1298
- "rewards/question_recreation_reward_func": 0.7880056530702859,
1299
- "rewards/soft_format_reward_func": 0.0,
1300
- "rewards/strict_format_reward_func": 0.46875,
1301
- "rewards/xmlcount_reward_func": 1.2109375,
1302
- "step": 136
1303
- },
1304
- {
1305
- "completion_length": 363.5,
1306
- "epoch": 5.310679611650485,
1307
- "grad_norm": 3.9685120582580566,
1308
- "kl": 14.413090521993581,
1309
- "learning_rate": 9.851161154175336e-09,
1310
- "loss": 0.0144,
1311
- "reward": 21.950038075447083,
1312
- "reward_std": 0.565456024807645,
1313
- "rewards/concensus_correctness_reward_func": 16.25,
1314
- "rewards/consensus_reward_func": 2.0,
1315
- "rewards/cumulative_reward_2": 0.0,
1316
- "rewards/final_correctness_reward_func": 1.25,
1317
- "rewards/question_recreation_reward_func": 0.770350604550913,
1318
- "rewards/soft_format_reward_func": 0.0,
1319
- "rewards/strict_format_reward_func": 0.46875,
1320
- "rewards/xmlcount_reward_func": 1.2109375,
1321
- "step": 138
1322
- },
1323
- {
1324
- "completion_length": 413.125,
1325
- "epoch": 5.388349514563107,
1326
- "grad_norm": 262.2953186035156,
1327
- "kl": 59.283854203880765,
1328
- "learning_rate": 7.066469456323609e-09,
1329
- "loss": 0.0593,
1330
- "reward": 23.07690404355526,
1331
- "reward_std": 0.9015675941482186,
1332
- "rewards/concensus_correctness_reward_func": 17.5,
1333
- "rewards/consensus_reward_func": 2.0,
1334
- "rewards/cumulative_reward_2": 0.0,
1335
- "rewards/final_correctness_reward_func": 1.25,
1336
- "rewards/question_recreation_reward_func": 0.7487789960578084,
1337
- "rewards/soft_format_reward_func": 0.0,
1338
- "rewards/strict_format_reward_func": 0.40625,
1339
- "rewards/xmlcount_reward_func": 1.171875,
1340
- "step": 140
1341
- },
1342
- {
1343
- "completion_length": 310.875,
1344
- "epoch": 5.466019417475728,
1345
- "grad_norm": 2.542823076248169,
1346
- "kl": 0.949069471040275,
1347
- "learning_rate": 4.737860078440209e-09,
1348
- "loss": 0.0009,
1349
- "reward": 25.07814311981201,
1350
- "reward_std": 0.33153435809072107,
1351
- "rewards/concensus_correctness_reward_func": 18.75,
1352
- "rewards/consensus_reward_func": 2.0,
1353
- "rewards/cumulative_reward_2": 0.0,
1354
- "rewards/final_correctness_reward_func": 1.75,
1355
- "rewards/question_recreation_reward_func": 0.8828306125942618,
1356
- "rewards/soft_format_reward_func": 0.0,
1357
- "rewards/strict_format_reward_func": 0.46875,
1358
- "rewards/xmlcount_reward_func": 1.2265625,
1359
- "step": 142
1360
- },
1361
- {
1362
- "completion_length": 399.125,
1363
- "epoch": 5.543689320388349,
1364
- "grad_norm": 4.695295810699463,
1365
- "kl": 20652455942.321938,
1366
- "learning_rate": 2.8697047413204778e-09,
1367
- "loss": 20652456.0,
1368
- "reward": 24.869468212127686,
1369
- "reward_std": 1.555299612984527,
1370
- "rewards/concensus_correctness_reward_func": 19.375,
1371
- "rewards/consensus_reward_func": 1.9375,
1372
- "rewards/cumulative_reward_2": 0.0,
1373
- "rewards/final_correctness_reward_func": 1.25,
1374
- "rewards/question_recreation_reward_func": 0.6858745750214439,
1375
- "rewards/soft_format_reward_func": 0.0,
1376
- "rewards/strict_format_reward_func": 0.4375,
1377
- "rewards/xmlcount_reward_func": 1.18359375,
1378
- "step": 144
1379
- },
1380
- {
1381
- "completion_length": 411.0,
1382
- "epoch": 5.621359223300971,
1383
- "grad_norm": 19.52279281616211,
1384
- "kl": 4.609899569710251,
1385
- "learning_rate": 1.4655107114101007e-09,
1386
- "loss": 0.0046,
1387
- "reward": 23.454522281885147,
1388
- "reward_std": 0.18195666404790245,
1389
- "rewards/concensus_correctness_reward_func": 17.5,
1390
- "rewards/consensus_reward_func": 2.0,
1391
- "rewards/cumulative_reward_2": 0.0,
1392
- "rewards/final_correctness_reward_func": 1.4375,
1393
- "rewards/question_recreation_reward_func": 0.872491292655468,
1394
- "rewards/soft_format_reward_func": 0.0,
1395
- "rewards/strict_format_reward_func": 0.453125,
1396
- "rewards/xmlcount_reward_func": 1.19140625,
1397
- "step": 146
1398
- },
1399
- {
1400
- "completion_length": 353.1875,
1401
- "epoch": 5.699029126213592,
1402
- "grad_norm": 1.7859758138656616,
1403
- "kl": 0.07984746794681996,
1404
- "learning_rate": 5.279142162789018e-10,
1405
- "loss": 0.0001,
1406
- "reward": 26.47150719165802,
1407
- "reward_std": 0.18575278946082108,
1408
- "rewards/concensus_correctness_reward_func": 20.0,
1409
- "rewards/consensus_reward_func": 2.0,
1410
- "rewards/cumulative_reward_2": 0.0,
1411
- "rewards/final_correctness_reward_func": 1.8125,
1412
- "rewards/question_recreation_reward_func": 0.9285384453833103,
1413
- "rewards/soft_format_reward_func": 0.0,
1414
- "rewards/strict_format_reward_func": 0.484375,
1415
- "rewards/xmlcount_reward_func": 1.24609375,
1416
- "step": 148
1417
- },
1418
- {
1419
- "completion_length": 371.84375,
1420
- "epoch": 5.776699029126213,
1421
- "grad_norm": 3.739283561706543,
1422
- "kl": 1.505985322932247,
1423
- "learning_rate": 5.86754953789681e-11,
1424
  "loss": 0.0015,
1425
- "reward": 24.783316731452942,
1426
- "reward_std": 0.4831498049898073,
1427
- "rewards/concensus_correctness_reward_func": 18.75,
1428
- "rewards/consensus_reward_func": 2.0,
1429
  "rewards/cumulative_reward_2": 0.0,
1430
- "rewards/final_correctness_reward_func": 1.5,
1431
- "rewards/question_recreation_reward_func": 0.8497229460626841,
1432
  "rewards/soft_format_reward_func": 0.0,
1433
- "rewards/strict_format_reward_func": 0.453125,
1434
- "rewards/xmlcount_reward_func": 1.23046875,
1435
- "step": 150
1436
  },
1437
  {
1438
- "epoch": 5.776699029126213,
1439
- "step": 150,
1440
  "total_flos": 0.0,
1441
- "train_loss": 276553.1687706073,
1442
- "train_runtime": 2525.6374,
1443
- "train_samples_per_second": 0.95,
1444
- "train_steps_per_second": 0.059
1445
  }
1446
  ],
1447
  "logging_steps": 2,
1448
- "max_steps": 150,
1449
  "num_input_tokens_seen": 0,
1450
  "num_train_epochs": 6,
1451
  "save_steps": 25,
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.555555555555555,
6
  "eval_steps": 500,
7
+ "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 272.34375,
14
+ "epoch": 0.2222222222222222,
15
+ "grad_norm": 1.9709233045578003,
16
  "kl": 0.0,
17
+ "learning_rate": 2e-07,
18
  "loss": 0.0,
19
+ "reward": 2.2598760463297367,
20
+ "reward_std": 1.378811091184616,
21
+ "rewards/concensus_correctness_reward_func": 0.2803749965969473,
22
+ "rewards/consensus_reward_func": 0.5,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.5,
25
+ "rewards/question_recreation_reward_func": 0.6086260285228491,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.015625,
28
+ "rewards/xmlcount_reward_func": 0.3552500042133033,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 278.09375,
33
+ "epoch": 0.4444444444444444,
34
+ "grad_norm": 2.1986608505249023,
35
+ "kl": 0.00067875010950047,
36
+ "learning_rate": 6e-07,
37
  "loss": 0.0,
38
+ "reward": 2.499070023186505,
39
+ "reward_std": 1.6266908054240048,
40
+ "rewards/concensus_correctness_reward_func": 0.671875,
41
+ "rewards/consensus_reward_func": 0.1875,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.4375,
44
+ "rewards/question_recreation_reward_func": 0.6149450317025185,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.046875,
47
+ "rewards/xmlcount_reward_func": 0.540374998934567,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 277.5625,
52
+ "epoch": 0.6666666666666666,
53
+ "grad_norm": 1.906295895576477,
54
+ "kl": 0.004192030290141702,
55
+ "learning_rate": 1e-06,
56
+ "loss": 0.0,
57
+ "reward": 5.647615324705839,
58
+ "reward_std": 3.8777514427220012,
59
+ "rewards/concensus_correctness_reward_func": 3.2930625271983445,
60
+ "rewards/consensus_reward_func": 0.625,
61
  "rewards/cumulative_reward_2": 0.0,
62
  "rewards/final_correctness_reward_func": 0.625,
63
+ "rewards/question_recreation_reward_func": 0.6198026705533266,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.03125,
66
+ "rewards/xmlcount_reward_func": 0.4535000016912818,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 272.96875,
71
+ "epoch": 0.8888888888888888,
72
+ "grad_norm": 2.2941324710845947,
73
+ "kl": 0.01219035469694063,
74
+ "learning_rate": 9.95134034370785e-07,
75
+ "loss": 0.0,
76
+ "reward": 2.4888051878660917,
77
+ "reward_std": 2.122196763404645,
78
+ "rewards/concensus_correctness_reward_func": 0.820124997291714,
79
+ "rewards/consensus_reward_func": 0.5625,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.1875,
82
+ "rewards/question_recreation_reward_func": 0.5332739478908479,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.015625,
85
+ "rewards/xmlcount_reward_func": 0.36978125176392496,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 319.375,
90
+ "epoch": 1.1111111111111112,
91
+ "grad_norm": 34.59638214111328,
92
+ "kl": 0.02727395889814943,
93
+ "learning_rate": 9.806308479691594e-07,
94
+ "loss": 0.0,
95
+ "reward": 2.30086531303823,
96
+ "reward_std": 2.4632392609491944,
97
+ "rewards/concensus_correctness_reward_func": 0.8314999975264072,
98
+ "rewards/consensus_reward_func": 0.3125,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.4375,
101
+ "rewards/question_recreation_reward_func": 0.5690215453505516,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.0,
104
+ "rewards/xmlcount_reward_func": 0.15034375991672277,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 288.53125,
109
+ "epoch": 1.3333333333333333,
110
+ "grad_norm": 2.2190043926239014,
111
+ "kl": 0.0441530579701066,
112
+ "learning_rate": 9.567727288213004e-07,
113
+ "loss": 0.0,
114
+ "reward": 3.0847667902708054,
115
+ "reward_std": 2.7779684653505683,
116
+ "rewards/concensus_correctness_reward_func": 1.3179375007748604,
117
+ "rewards/consensus_reward_func": 0.375,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.4375,
120
+ "rewards/question_recreation_reward_func": 0.5960479695349932,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.0,
123
+ "rewards/xmlcount_reward_func": 0.35828124824911356,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 261.46875,
128
+ "epoch": 1.5555555555555556,
129
+ "grad_norm": 1.9829434156417847,
130
+ "kl": 0.06888556969352067,
131
+ "learning_rate": 9.240240480782129e-07,
132
+ "loss": 0.0001,
133
+ "reward": 3.713436372578144,
134
+ "reward_std": 3.275791682302952,
135
+ "rewards/concensus_correctness_reward_func": 2.0718125123530626,
136
+ "rewards/consensus_reward_func": 0.375,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.1875,
139
+ "rewards/question_recreation_reward_func": 0.5886238208040595,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.015625,
142
+ "rewards/xmlcount_reward_func": 0.4748750049620867,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 318.3125,
147
+ "epoch": 1.7777777777777777,
148
+ "grad_norm": 2.1747398376464844,
149
+ "kl": 0.05949262389913201,
150
+ "learning_rate": 8.83022221559489e-07,
151
+ "loss": 0.0001,
152
+ "reward": 4.004723660647869,
153
+ "reward_std": 1.3993592984625138,
154
+ "rewards/concensus_correctness_reward_func": 2.198250020388514,
155
+ "rewards/consensus_reward_func": 0.6875,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 0.3125,
158
+ "rewards/question_recreation_reward_func": 0.5488486010581255,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.0,
161
+ "rewards/xmlcount_reward_func": 0.25762500520795584,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 281.65625,
166
+ "epoch": 2.0,
167
+ "grad_norm": 1.9465759992599487,
168
+ "kl": 0.10193903697654605,
169
+ "learning_rate": 8.34565303179429e-07,
170
+ "loss": 0.0001,
171
+ "reward": 3.8964588195085526,
172
+ "reward_std": 3.7396021018503234,
173
+ "rewards/concensus_correctness_reward_func": 1.9127500001341105,
174
+ "rewards/consensus_reward_func": 0.375,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.5625,
177
+ "rewards/question_recreation_reward_func": 0.5738963634939864,
178
  "rewards/soft_format_reward_func": 0.0,
179
+ "rewards/strict_format_reward_func": 0.03125,
180
+ "rewards/xmlcount_reward_func": 0.44106249837204814,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 297.53125,
185
+ "epoch": 2.2222222222222223,
186
+ "grad_norm": 2.368654727935791,
187
+ "kl": 0.10433738771826029,
188
+ "learning_rate": 7.795964517353733e-07,
189
+ "loss": 0.0001,
190
+ "reward": 5.208045449107885,
191
+ "reward_std": 3.6840367396362126,
192
+ "rewards/concensus_correctness_reward_func": 3.2463124967180192,
193
+ "rewards/consensus_reward_func": 0.6875,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.5,
196
+ "rewards/question_recreation_reward_func": 0.5021704637911171,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.0,
199
+ "rewards/xmlcount_reward_func": 0.2720625028014183,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 287.375,
204
+ "epoch": 2.4444444444444446,
205
+ "grad_norm": 4.305432319641113,
206
+ "kl": 8.904280132148415,
207
+ "learning_rate": 7.191855733945386e-07,
208
+ "loss": 0.0089,
209
+ "reward": 2.7408705558627844,
210
+ "reward_std": 1.748457981273532,
211
+ "rewards/concensus_correctness_reward_func": 0.8576874881982803,
212
+ "rewards/consensus_reward_func": 0.4375,
213
+ "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.4375,
215
+ "rewards/question_recreation_reward_func": 0.5240580616518855,
216
+ "rewards/soft_format_reward_func": 0.03125,
217
+ "rewards/strict_format_reward_func": 0.015625,
218
+ "rewards/xmlcount_reward_func": 0.4372500046156347,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 298.375,
223
+ "epoch": 2.6666666666666665,
224
+ "grad_norm": 2.7135236263275146,
225
+ "kl": 0.15368254156783223,
226
+ "learning_rate": 6.545084971874736e-07,
227
+ "loss": 0.0002,
228
+ "reward": 2.9278519060462713,
229
+ "reward_std": 2.033405718393624,
230
+ "rewards/concensus_correctness_reward_func": 0.7936874981969595,
231
+ "rewards/consensus_reward_func": 0.625,
232
  "rewards/cumulative_reward_2": 0.0,
233
+ "rewards/final_correctness_reward_func": 0.375,
234
+ "rewards/question_recreation_reward_func": 0.5647581112571061,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.046875,
237
+ "rewards/xmlcount_reward_func": 0.5225312472321093,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 284.75,
242
+ "epoch": 2.888888888888889,
243
+ "grad_norm": 3.2465226650238037,
244
+ "kl": 0.36339320987463,
245
+ "learning_rate": 5.868240888334652e-07,
246
+ "loss": 0.0004,
247
+ "reward": 3.065687384456396,
248
+ "reward_std": 1.8235639781632926,
249
+ "rewards/concensus_correctness_reward_func": 1.0326249934732914,
250
+ "rewards/consensus_reward_func": 0.5,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.5,
253
+ "rewards/question_recreation_reward_func": 0.6777498843148351,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.0,
256
+ "rewards/xmlcount_reward_func": 0.35531249456107616,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 253.46875,
261
+ "epoch": 3.111111111111111,
262
+ "grad_norm": 2.922649383544922,
263
+ "kl": 0.44744345219805837,
264
+ "learning_rate": 5.174497483512505e-07,
265
+ "loss": 0.0004,
266
+ "reward": 3.2730717603117228,
267
+ "reward_std": 1.1045972863212228,
268
+ "rewards/concensus_correctness_reward_func": 1.3550624987110496,
269
+ "rewards/consensus_reward_func": 0.5,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.25,
272
+ "rewards/question_recreation_reward_func": 0.5656342646107078,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.015625,
275
+ "rewards/xmlcount_reward_func": 0.5867500050953822,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 259.84375,
280
+ "epoch": 3.3333333333333335,
281
+ "grad_norm": 2.482933521270752,
282
+ "kl": 0.2743630134500563,
283
+ "learning_rate": 4.477357683661733e-07,
284
+ "loss": 0.0003,
285
+ "reward": 3.6187379471957684,
286
+ "reward_std": 3.6063565080985427,
287
+ "rewards/concensus_correctness_reward_func": 1.444624996976927,
288
+ "rewards/consensus_reward_func": 0.8125,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 0.375,
291
+ "rewards/question_recreation_reward_func": 0.5534566845744848,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.0,
294
+ "rewards/xmlcount_reward_func": 0.4331562491133809,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 271.5,
299
+ "epoch": 3.5555555555555554,
300
+ "grad_norm": 5.067188262939453,
301
+ "kl": 0.30074576614424586,
302
+ "learning_rate": 3.790390522001662e-07,
303
+ "loss": 0.0003,
304
+ "reward": 4.151941349729896,
305
+ "reward_std": 3.3430008568102494,
306
+ "rewards/concensus_correctness_reward_func": 2.1432499843649566,
307
+ "rewards/consensus_reward_func": 0.375,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 0.5625,
310
+ "rewards/question_recreation_reward_func": 0.5586601588875055,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.046875,
313
+ "rewards/xmlcount_reward_func": 0.4656562495511025,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 240.78125,
318
+ "epoch": 3.7777777777777777,
319
+ "grad_norm": 2.204274892807007,
320
+ "kl": 0.5477567701600492,
321
+ "learning_rate": 3.1269670329204393e-07,
322
+ "loss": 0.0005,
323
+ "reward": 2.8508229684084654,
324
+ "reward_std": 1.868494457739871,
325
+ "rewards/concensus_correctness_reward_func": 0.7966250022873282,
326
+ "rewards/consensus_reward_func": 0.4375,
327
+ "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.4375,
329
+ "rewards/question_recreation_reward_func": 0.48391672410070896,
330
+ "rewards/soft_format_reward_func": 0.015625,
331
+ "rewards/strict_format_reward_func": 0.046875,
332
+ "rewards/xmlcount_reward_func": 0.6327812653034925,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 246.59375,
337
+ "epoch": 4.0,
338
+ "grad_norm": 3.4755871295928955,
339
+ "kl": 0.3700036955997348,
340
+ "learning_rate": 2.500000000000001e-07,
341
+ "loss": 0.0004,
342
+ "reward": 3.609894147142768,
343
+ "reward_std": 2.596530891722068,
344
+ "rewards/concensus_correctness_reward_func": 1.3913749977946281,
345
+ "rewards/consensus_reward_func": 0.375,
346
+ "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 0.75,
348
+ "rewards/question_recreation_reward_func": 0.6090504107996821,
349
+ "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.03125,
351
+ "rewards/xmlcount_reward_func": 0.45321875205263495,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 250.5,
356
+ "epoch": 4.222222222222222,
357
+ "grad_norm": 2.5430102348327637,
358
+ "kl": 1.0490361750125885,
359
+ "learning_rate": 1.9216926233717084e-07,
360
+ "loss": 0.001,
361
+ "reward": 4.269449989311397,
362
+ "reward_std": 1.6662210901267827,
363
+ "rewards/concensus_correctness_reward_func": 2.0434374917458626,
364
+ "rewards/consensus_reward_func": 1.0,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.25,
367
+ "rewards/question_recreation_reward_func": 0.5368563425727189,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.015625,
370
+ "rewards/xmlcount_reward_func": 0.4235312477685511,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 271.875,
375
+ "epoch": 4.444444444444445,
376
+ "grad_norm": 2.8382978439331055,
377
+ "kl": 0.34369932441040874,
378
+ "learning_rate": 1.4033009983067452e-07,
379
+ "loss": 0.0003,
380
+ "reward": 4.158377468585968,
381
+ "reward_std": 4.038643125444651,
382
+ "rewards/concensus_correctness_reward_func": 1.9884374965913594,
383
+ "rewards/consensus_reward_func": 0.6875,
384
+ "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 0.5,
386
+ "rewards/question_recreation_reward_func": 0.5553461285308003,
387
+ "rewards/soft_format_reward_func": 0.03125,
388
+ "rewards/strict_format_reward_func": 0.015625,
389
+ "rewards/xmlcount_reward_func": 0.38021874986588955,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 289.84375,
394
+ "epoch": 4.666666666666667,
395
+ "grad_norm": 2.3925092220306396,
396
+ "kl": 0.5022453009150922,
397
+ "learning_rate": 9.549150281252632e-08,
398
+ "loss": 0.0005,
399
+ "reward": 3.199108015745878,
400
+ "reward_std": 1.863160651177168,
401
+ "rewards/concensus_correctness_reward_func": 0.8601875007152557,
402
+ "rewards/consensus_reward_func": 0.375,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 0.5625,
405
+ "rewards/question_recreation_reward_func": 0.6550767524167895,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.015625,
408
+ "rewards/xmlcount_reward_func": 0.7307187486439943,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 334.78125,
413
+ "epoch": 4.888888888888889,
414
+ "grad_norm": 17.802112579345703,
415
+ "kl": 0.46959283016622066,
416
+ "learning_rate": 5.8526203570536504e-08,
417
+ "loss": 0.0005,
418
+ "reward": 3.619211522862315,
419
+ "reward_std": 1.9751139997970313,
420
+ "rewards/concensus_correctness_reward_func": 1.9755624998360872,
421
+ "rewards/consensus_reward_func": 0.3125,
422
  "rewards/cumulative_reward_2": 0.0,
423
+ "rewards/final_correctness_reward_func": 0.375,
424
+ "rewards/question_recreation_reward_func": 0.5751178320497274,
425
  "rewards/soft_format_reward_func": 0.0,
426
+ "rewards/strict_format_reward_func": 0.015625,
427
+ "rewards/xmlcount_reward_func": 0.3654062505811453,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 273.25,
432
+ "epoch": 5.111111111111111,
433
+ "grad_norm": 2.5441696643829346,
434
+ "kl": 0.4962086542509496,
435
+ "learning_rate": 3.015368960704584e-08,
436
+ "loss": 0.0005,
437
+ "reward": 3.7636695094406605,
438
+ "reward_std": 2.9820365188643336,
439
+ "rewards/concensus_correctness_reward_func": 1.4986874964088202,
440
+ "rewards/consensus_reward_func": 0.625,
441
  "rewards/cumulative_reward_2": 0.0,
442
+ "rewards/final_correctness_reward_func": 0.375,
443
+ "rewards/question_recreation_reward_func": 0.6830132203176618,
444
  "rewards/soft_format_reward_func": 0.0,
445
+ "rewards/strict_format_reward_func": 0.015625,
446
+ "rewards/xmlcount_reward_func": 0.566343754529953,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 297.9375,
451
+ "epoch": 5.333333333333333,
452
+ "grad_norm": 3.9671196937561035,
453
+ "kl": 0.35691973846405745,
454
+ "learning_rate": 1.0926199633097154e-08,
455
+ "loss": 0.0004,
456
+ "reward": 3.2234934605658054,
457
+ "reward_std": 2.496055698953569,
458
+ "rewards/concensus_correctness_reward_func": 1.2527499999850988,
459
+ "rewards/consensus_reward_func": 0.5,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 0.5,
462
+ "rewards/question_recreation_reward_func": 0.498899728176184,
463
  "rewards/soft_format_reward_func": 0.0,
464
+ "rewards/strict_format_reward_func": 0.03125,
465
+ "rewards/xmlcount_reward_func": 0.44059374555945396,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 301.875,
470
+ "epoch": 5.555555555555555,
471
+ "grad_norm": 3.0020861625671387,
472
+ "kl": 1.5168349244631827,
473
+ "learning_rate": 1.217974870087901e-09,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
  "loss": 0.0015,
475
+ "reward": 4.553922591730952,
476
+ "reward_std": 4.596473885292653,
477
+ "rewards/concensus_correctness_reward_func": 2.5792499780654907,
478
+ "rewards/consensus_reward_func": 0.5,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 0.625,
481
+ "rewards/question_recreation_reward_func": 0.47670389572158456,
482
  "rewards/soft_format_reward_func": 0.0,
483
+ "rewards/strict_format_reward_func": 0.015625,
484
+ "rewards/xmlcount_reward_func": 0.3573437510058284,
485
+ "step": 50
486
  },
487
  {
488
+ "epoch": 5.555555555555555,
489
+ "step": 50,
490
  "total_flos": 0.0,
491
+ "train_loss": 0.0006611158326268196,
492
+ "train_runtime": 713.3917,
493
+ "train_samples_per_second": 1.121,
494
+ "train_steps_per_second": 0.07
495
  }
496
  ],
497
  "logging_steps": 2,
498
+ "max_steps": 50,
499
  "num_input_tokens_seen": 0,
500
  "num_train_epochs": 6,
501
  "save_steps": 25,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03808a2d232459ed0195b0b6efec857d3701df79459ebc0390917213c224296e
3
  size 6008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b84f71c6d07548f3c7af027cb5447f74c5e899980ce9f92bbd681e01bf335f1
3
  size 6008