wyceee commited on
Commit
f570c30
·
verified ·
1 Parent(s): 25b19df

End of training

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.005916564127692254,
4
- "train_runtime": 674.0869,
5
  "train_samples": 72,
6
- "train_samples_per_second": 1.187,
7
  "train_steps_per_second": 0.074
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.491919757317737,
4
+ "train_runtime": 671.7848,
5
  "train_samples": 72,
6
+ "train_samples_per_second": 1.191,
7
  "train_steps_per_second": 0.074
8
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593ee440b09725d9acb23b2622980636c66fd339f28aa1e1bda5ad9b564de4eb
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9591b7f4c6657c764961e248f7d66b28b4009b8d30657471519d7a283035b1b
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20935d86192d99dbc3c37a29f99ae92ed2c2e534cec5f1396cb3497338f93080
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b495ec090ba0f8d34eafe59eefa10a2bf7b82a591e1a355c539e74dc16503a6f
3
  size 1178224960
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.005916564127692254,
4
- "train_runtime": 674.0869,
5
  "train_samples": 72,
6
- "train_samples_per_second": 1.187,
7
  "train_steps_per_second": 0.074
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 5.491919757317737,
4
+ "train_runtime": 671.7848,
5
  "train_samples": 72,
6
+ "train_samples_per_second": 1.191,
7
  "train_steps_per_second": 0.074
8
  }
trainer_state.json CHANGED
@@ -10,487 +10,487 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 292.375,
14
  "epoch": 0.2222222222222222,
15
- "grad_norm": 3.8107857704162598,
16
  "kl": 0.0,
17
  "learning_rate": 2e-07,
18
  "loss": -0.0,
19
- "reward": 4.110599463805556,
20
- "reward_std": 3.714366073545534,
21
- "rewards/concensus_correctness_reward_func": 2.0961249992251396,
22
  "rewards/consensus_reward_func": 0.4375,
23
  "rewards/cumulative_reward_2": 0.0,
24
- "rewards/final_correctness_reward_func": 0.5,
25
- "rewards/question_recreation_reward_func": 0.5882245441898704,
26
  "rewards/soft_format_reward_func": 0.0,
27
- "rewards/strict_format_reward_func": 0.015625,
28
- "rewards/xmlcount_reward_func": 0.47312500420957804,
29
  "step": 2
30
  },
31
  {
32
- "completion_length": 242.84375,
33
  "epoch": 0.4444444444444444,
34
- "grad_norm": 4.162234306335449,
35
- "kl": 0.002089262896333821,
36
  "learning_rate": 6e-07,
37
  "loss": 0.0,
38
- "reward": 6.153819195926189,
39
- "reward_std": 3.73072009999305,
40
- "rewards/concensus_correctness_reward_func": 3.3734999974258244,
41
- "rewards/consensus_reward_func": 0.5625,
42
  "rewards/cumulative_reward_2": 0.0,
43
- "rewards/final_correctness_reward_func": 0.6875,
44
- "rewards/question_recreation_reward_func": 0.7112567536532879,
45
  "rewards/soft_format_reward_func": 0.0,
46
- "rewards/strict_format_reward_func": 0.03125,
47
- "rewards/xmlcount_reward_func": 0.7878125039860606,
48
  "step": 4
49
  },
50
  {
51
- "completion_length": 238.96875,
52
  "epoch": 0.6666666666666666,
53
- "grad_norm": 3.1884305477142334,
54
- "kl": 0.03603306884178892,
55
  "learning_rate": 1e-06,
56
- "loss": 0.0001,
57
- "reward": 6.223309241235256,
58
- "reward_std": 4.281767612181284,
59
- "rewards/concensus_correctness_reward_func": 3.5337499850429595,
60
- "rewards/consensus_reward_func": 0.625,
61
  "rewards/cumulative_reward_2": 0.0,
62
- "rewards/final_correctness_reward_func": 0.5,
63
- "rewards/question_recreation_reward_func": 0.6908091912046075,
64
  "rewards/soft_format_reward_func": 0.0,
65
- "rewards/strict_format_reward_func": 0.015625,
66
- "rewards/xmlcount_reward_func": 0.8581249974668026,
67
  "step": 6
68
  },
69
  {
70
- "completion_length": 268.625,
71
  "epoch": 0.8888888888888888,
72
- "grad_norm": 3.539409875869751,
73
- "kl": 0.4308171756565571,
74
  "learning_rate": 9.95134034370785e-07,
75
- "loss": 0.0004,
76
- "reward": 2.3415392953902483,
77
- "reward_std": 1.1331623401492834,
78
- "rewards/concensus_correctness_reward_func": 0.2723749913275242,
79
- "rewards/consensus_reward_func": 0.5,
80
  "rewards/cumulative_reward_2": 0.0,
81
- "rewards/final_correctness_reward_func": 0.4375,
82
- "rewards/question_recreation_reward_func": 0.5665392577648163,
83
  "rewards/soft_format_reward_func": 0.0,
84
- "rewards/strict_format_reward_func": 0.015625,
85
- "rewards/xmlcount_reward_func": 0.5494999946095049,
86
  "step": 8
87
  },
88
  {
89
- "completion_length": 258.5625,
90
  "epoch": 1.1111111111111112,
91
- "grad_norm": 54.83183288574219,
92
- "kl": 15.701219761976972,
93
  "learning_rate": 9.806308479691594e-07,
94
- "loss": 0.0157,
95
- "reward": 4.375889036804438,
96
- "reward_std": 3.6191295210737735,
97
- "rewards/concensus_correctness_reward_func": 2.0978124998509884,
98
- "rewards/consensus_reward_func": 0.5625,
99
  "rewards/cumulative_reward_2": 0.0,
100
- "rewards/final_correctness_reward_func": 0.5625,
101
- "rewards/question_recreation_reward_func": 0.5807640934363008,
102
  "rewards/soft_format_reward_func": 0.0,
103
- "rewards/strict_format_reward_func": 0.0,
104
- "rewards/xmlcount_reward_func": 0.5723125000949949,
105
  "step": 10
106
  },
107
  {
108
- "completion_length": 224.5,
109
  "epoch": 1.3333333333333333,
110
- "grad_norm": 5.148858547210693,
111
- "kl": 2.3633982008323073,
112
  "learning_rate": 9.567727288213004e-07,
113
- "loss": 0.0024,
114
- "reward": 5.041572377085686,
115
- "reward_std": 3.1810707906261086,
116
- "rewards/concensus_correctness_reward_func": 2.3565000272355974,
117
- "rewards/consensus_reward_func": 0.625,
118
  "rewards/cumulative_reward_2": 0.0,
119
- "rewards/final_correctness_reward_func": 0.625,
120
- "rewards/question_recreation_reward_func": 0.4960412373766303,
121
  "rewards/soft_format_reward_func": 0.0,
122
- "rewards/strict_format_reward_func": 0.015625,
123
- "rewards/xmlcount_reward_func": 0.923406271263957,
124
  "step": 12
125
  },
126
  {
127
- "completion_length": 244.71875,
128
  "epoch": 1.5555555555555556,
129
- "grad_norm": 4.224116802215576,
130
- "kl": 1.5940971969394013,
131
  "learning_rate": 9.240240480782129e-07,
132
- "loss": 0.0016,
133
- "reward": 3.845541685819626,
134
- "reward_std": 0.7413416516501456,
135
- "rewards/concensus_correctness_reward_func": 1.4198125060647726,
136
- "rewards/consensus_reward_func": 0.6875,
137
  "rewards/cumulative_reward_2": 0.0,
138
- "rewards/final_correctness_reward_func": 0.375,
139
- "rewards/question_recreation_reward_func": 0.5501666869968176,
140
  "rewards/soft_format_reward_func": 0.0,
141
- "rewards/strict_format_reward_func": 0.015625,
142
- "rewards/xmlcount_reward_func": 0.7974375048652291,
143
  "step": 14
144
  },
145
  {
146
- "completion_length": 288.03125,
147
  "epoch": 1.7777777777777777,
148
- "grad_norm": 3.2477917671203613,
149
- "kl": 1.24803132773377,
150
  "learning_rate": 8.83022221559489e-07,
151
- "loss": 0.0012,
152
- "reward": 3.5894551295787096,
153
- "reward_std": 1.4415434753755108,
154
- "rewards/concensus_correctness_reward_func": 1.0913125132210553,
155
- "rewards/consensus_reward_func": 0.75,
156
  "rewards/cumulative_reward_2": 0.0,
157
- "rewards/final_correctness_reward_func": 0.3125,
158
- "rewards/question_recreation_reward_func": 0.5858613271266222,
159
  "rewards/soft_format_reward_func": 0.0,
160
- "rewards/strict_format_reward_func": 0.015625,
161
- "rewards/xmlcount_reward_func": 0.8341562564019114,
162
  "step": 16
163
  },
164
  {
165
- "completion_length": 277.90625,
166
  "epoch": 2.0,
167
- "grad_norm": 3.8915557861328125,
168
- "kl": 1.672196735162288,
169
  "learning_rate": 8.34565303179429e-07,
170
- "loss": 0.0017,
171
- "reward": 5.043356254696846,
172
- "reward_std": 3.8196976413019,
173
- "rewards/concensus_correctness_reward_func": 2.2441875115036964,
174
- "rewards/consensus_reward_func": 0.75,
175
  "rewards/cumulative_reward_2": 0.0,
176
- "rewards/final_correctness_reward_func": 0.75,
177
- "rewards/question_recreation_reward_func": 0.6289812638424337,
178
  "rewards/soft_format_reward_func": 0.0,
179
  "rewards/strict_format_reward_func": 0.0,
180
- "rewards/xmlcount_reward_func": 0.6701875114813447,
181
  "step": 18
182
  },
183
  {
184
- "completion_length": 252.84375,
185
  "epoch": 2.2222222222222223,
186
- "grad_norm": 4.681396484375,
187
- "kl": 0.7993139801546931,
188
  "learning_rate": 7.795964517353733e-07,
189
- "loss": 0.0008,
190
- "reward": 4.347612947225571,
191
- "reward_std": 3.5975783905014396,
192
- "rewards/concensus_correctness_reward_func": 1.8726249998435378,
193
- "rewards/consensus_reward_func": 0.4375,
194
  "rewards/cumulative_reward_2": 0.0,
195
- "rewards/final_correctness_reward_func": 0.625,
196
- "rewards/question_recreation_reward_func": 0.4997691195458174,
197
  "rewards/soft_format_reward_func": 0.0,
198
- "rewards/strict_format_reward_func": 0.046875,
199
- "rewards/xmlcount_reward_func": 0.865843765437603,
200
  "step": 20
201
  },
202
  {
203
- "completion_length": 230.5625,
204
  "epoch": 2.4444444444444446,
205
- "grad_norm": 14.132287979125977,
206
- "kl": 3.0374708212912083,
207
  "learning_rate": 7.191855733945386e-07,
208
- "loss": 0.003,
209
- "reward": 6.949155829846859,
210
- "reward_std": 3.3791947152931243,
211
- "rewards/concensus_correctness_reward_func": 3.738687491742894,
212
- "rewards/consensus_reward_func": 1.0,
213
  "rewards/cumulative_reward_2": 0.0,
214
- "rewards/final_correctness_reward_func": 0.8125,
215
- "rewards/question_recreation_reward_func": 0.611030868254602,
216
  "rewards/soft_format_reward_func": 0.0,
217
- "rewards/strict_format_reward_func": 0.03125,
218
- "rewards/xmlcount_reward_func": 0.7556875105947256,
219
  "step": 22
220
  },
221
  {
222
- "completion_length": 265.5625,
223
  "epoch": 2.6666666666666665,
224
- "grad_norm": 4.178548336029053,
225
- "kl": 1.368599567329511,
226
  "learning_rate": 6.545084971874736e-07,
227
- "loss": 0.0014,
228
- "reward": 3.9503697529435158,
229
- "reward_std": 2.410870131352567,
230
- "rewards/concensus_correctness_reward_func": 1.6398749821819365,
231
- "rewards/consensus_reward_func": 0.6875,
232
  "rewards/cumulative_reward_2": 0.0,
233
- "rewards/final_correctness_reward_func": 0.375,
234
- "rewards/question_recreation_reward_func": 0.576807199511677,
235
  "rewards/soft_format_reward_func": 0.0,
236
- "rewards/strict_format_reward_func": 0.015625,
237
- "rewards/xmlcount_reward_func": 0.6555625096370932,
238
  "step": 24
239
  },
240
  {
241
- "completion_length": 255.40625,
242
  "epoch": 2.888888888888889,
243
- "grad_norm": 3.538485050201416,
244
- "kl": 1.5496129877865314,
245
  "learning_rate": 5.868240888334652e-07,
246
- "loss": 0.0015,
247
- "reward": 4.10240344889462,
248
- "reward_std": 2.6716376403346658,
249
- "rewards/concensus_correctness_reward_func": 1.5082499934360385,
250
- "rewards/consensus_reward_func": 0.4375,
251
  "rewards/cumulative_reward_2": 0.0,
252
- "rewards/final_correctness_reward_func": 0.625,
253
- "rewards/question_recreation_reward_func": 0.6146223489195108,
254
  "rewards/soft_format_reward_func": 0.0,
255
- "rewards/strict_format_reward_func": 0.046875,
256
- "rewards/xmlcount_reward_func": 0.8701562639325857,
257
  "step": 26
258
  },
259
  {
260
- "completion_length": 241.5625,
261
  "epoch": 3.111111111111111,
262
- "grad_norm": 3.752775192260742,
263
- "kl": 3.886347927618772,
264
  "learning_rate": 5.174497483512505e-07,
265
- "loss": 0.0039,
266
- "reward": 3.3237821646034718,
267
- "reward_std": 2.1524549201130867,
268
- "rewards/concensus_correctness_reward_func": 0.8466250016354024,
269
- "rewards/consensus_reward_func": 0.5625,
270
  "rewards/cumulative_reward_2": 0.0,
271
- "rewards/final_correctness_reward_func": 0.5625,
272
- "rewards/question_recreation_reward_func": 0.5080321566201746,
273
  "rewards/soft_format_reward_func": 0.0,
274
- "rewards/strict_format_reward_func": 0.046875,
275
- "rewards/xmlcount_reward_func": 0.7972500007599592,
276
  "step": 28
277
  },
278
  {
279
- "completion_length": 261.8125,
280
  "epoch": 3.3333333333333335,
281
- "grad_norm": 5.829784870147705,
282
- "kl": 3.451839253306389,
283
  "learning_rate": 4.477357683661733e-07,
284
- "loss": 0.0035,
285
- "reward": 2.6074002254754305,
286
- "reward_std": 1.677693044708576,
287
- "rewards/concensus_correctness_reward_func": 0.9816249692812562,
288
- "rewards/consensus_reward_func": 0.1875,
289
  "rewards/cumulative_reward_2": 0.0,
290
- "rewards/final_correctness_reward_func": 0.25,
291
- "rewards/question_recreation_reward_func": 0.5892127025872469,
292
  "rewards/soft_format_reward_func": 0.0,
293
- "rewards/strict_format_reward_func": 0.03125,
294
- "rewards/xmlcount_reward_func": 0.5678125042468309,
295
  "step": 30
296
  },
297
  {
298
- "completion_length": 267.4375,
299
  "epoch": 3.5555555555555554,
300
- "grad_norm": 22.223487854003906,
301
- "kl": 83.22811474930495,
302
  "learning_rate": 3.790390522001662e-07,
303
- "loss": 0.0832,
304
- "reward": 5.953872742131352,
305
- "reward_std": 1.9447148024337366,
306
- "rewards/concensus_correctness_reward_func": 3.6568749928846955,
307
- "rewards/consensus_reward_func": 0.5625,
308
  "rewards/cumulative_reward_2": 0.0,
309
- "rewards/final_correctness_reward_func": 0.6875,
310
- "rewards/question_recreation_reward_func": 0.614747672341764,
311
  "rewards/soft_format_reward_func": 0.0,
312
- "rewards/strict_format_reward_func": 0.015625,
313
- "rewards/xmlcount_reward_func": 0.4166250051930547,
314
  "step": 32
315
  },
316
  {
317
- "completion_length": 252.8125,
318
  "epoch": 3.7777777777777777,
319
- "grad_norm": 3.262629508972168,
320
- "kl": 3.2397216241806746,
321
  "learning_rate": 3.1269670329204393e-07,
322
- "loss": 0.0032,
323
- "reward": 5.505353234708309,
324
- "reward_std": 2.5832796641625464,
325
- "rewards/concensus_correctness_reward_func": 3.007625007769093,
326
- "rewards/consensus_reward_func": 0.625,
327
  "rewards/cumulative_reward_2": 0.0,
328
- "rewards/final_correctness_reward_func": 0.5,
329
- "rewards/question_recreation_reward_func": 0.6153219498228282,
330
  "rewards/soft_format_reward_func": 0.0,
331
  "rewards/strict_format_reward_func": 0.03125,
332
- "rewards/xmlcount_reward_func": 0.7261562403291464,
333
  "step": 34
334
  },
335
  {
336
- "completion_length": 263.75,
337
  "epoch": 4.0,
338
- "grad_norm": 3.4170873165130615,
339
- "kl": 5.119970758911222,
340
  "learning_rate": 2.500000000000001e-07,
341
- "loss": 0.0051,
342
- "reward": 5.552348531782627,
343
- "reward_std": 3.542175215203315,
344
- "rewards/concensus_correctness_reward_func": 2.4716250132769346,
345
- "rewards/consensus_reward_func": 0.875,
346
  "rewards/cumulative_reward_2": 0.0,
347
- "rewards/final_correctness_reward_func": 0.75,
348
- "rewards/question_recreation_reward_func": 0.7155673063825816,
349
  "rewards/soft_format_reward_func": 0.0,
350
- "rewards/strict_format_reward_func": 0.0,
351
- "rewards/xmlcount_reward_func": 0.7401562570594251,
352
  "step": 36
353
  },
354
  {
355
- "completion_length": 236.34375,
356
  "epoch": 4.222222222222222,
357
- "grad_norm": 3.966975212097168,
358
- "kl": 2.2727794479578733,
359
  "learning_rate": 1.9216926233717084e-07,
360
- "loss": 0.0023,
361
- "reward": 2.9746797680854797,
362
- "reward_std": 1.6573905128752813,
363
- "rewards/concensus_correctness_reward_func": 0.8823749981820583,
364
- "rewards/consensus_reward_func": 0.375,
365
  "rewards/cumulative_reward_2": 0.0,
366
- "rewards/final_correctness_reward_func": 0.3125,
367
- "rewards/question_recreation_reward_func": 0.5670548444613814,
368
  "rewards/soft_format_reward_func": 0.0,
369
- "rewards/strict_format_reward_func": 0.03125,
370
- "rewards/xmlcount_reward_func": 0.8065000018104911,
371
  "step": 38
372
  },
373
  {
374
- "completion_length": 282.34375,
375
  "epoch": 4.444444444444445,
376
- "grad_norm": 3.61830735206604,
377
- "kl": 0.557787942700088,
378
  "learning_rate": 1.4033009983067452e-07,
379
- "loss": 0.0006,
380
- "reward": 4.611605238169432,
381
- "reward_std": 3.382820954779163,
382
- "rewards/concensus_correctness_reward_func": 2.163499998860061,
383
- "rewards/consensus_reward_func": 0.75,
384
  "rewards/cumulative_reward_2": 0.0,
385
- "rewards/final_correctness_reward_func": 0.4375,
386
- "rewards/question_recreation_reward_func": 0.531542734708637,
387
  "rewards/soft_format_reward_func": 0.0,
388
- "rewards/strict_format_reward_func": 0.03125,
389
- "rewards/xmlcount_reward_func": 0.697812516707927,
390
  "step": 40
391
  },
392
  {
393
- "completion_length": 249.75,
394
  "epoch": 4.666666666666667,
395
- "grad_norm": 5.680083274841309,
396
- "kl": 1.7294750094879419,
397
  "learning_rate": 9.549150281252632e-08,
398
- "loss": 0.0017,
399
- "reward": 4.764165852218866,
400
- "reward_std": 4.05824239552021,
401
- "rewards/concensus_correctness_reward_func": 2.169937494210899,
402
- "rewards/consensus_reward_func": 0.375,
403
  "rewards/cumulative_reward_2": 0.0,
404
- "rewards/final_correctness_reward_func": 0.8125,
405
- "rewards/question_recreation_reward_func": 0.5612284038215876,
406
  "rewards/soft_format_reward_func": 0.0,
407
- "rewards/strict_format_reward_func": 0.046875,
408
- "rewards/xmlcount_reward_func": 0.7986249960958958,
409
  "step": 42
410
  },
411
  {
412
- "completion_length": 325.9375,
413
  "epoch": 4.888888888888889,
414
- "grad_norm": 27.005922317504883,
415
- "kl": 2.3825969519093633,
416
  "learning_rate": 5.8526203570536504e-08,
417
- "loss": 0.0024,
418
- "reward": 3.219888724386692,
419
- "reward_std": 2.2487800465896726,
420
- "rewards/concensus_correctness_reward_func": 0.9441250010859221,
421
- "rewards/consensus_reward_func": 0.5,
422
  "rewards/cumulative_reward_2": 0.0,
423
- "rewards/final_correctness_reward_func": 0.25,
424
- "rewards/question_recreation_reward_func": 0.7558574602007866,
425
  "rewards/soft_format_reward_func": 0.0,
426
- "rewards/strict_format_reward_func": 0.015625,
427
- "rewards/xmlcount_reward_func": 0.7542812507599592,
428
  "step": 44
429
  },
430
  {
431
- "completion_length": 271.21875,
432
  "epoch": 5.111111111111111,
433
- "grad_norm": 3.743121385574341,
434
- "kl": 0.6370164407417178,
435
  "learning_rate": 3.015368960704584e-08,
436
- "loss": 0.0006,
437
- "reward": 5.291439572349191,
438
- "reward_std": 3.2558085079072043,
439
- "rewards/concensus_correctness_reward_func": 2.495437517296523,
440
- "rewards/consensus_reward_func": 0.5,
441
  "rewards/cumulative_reward_2": 0.0,
442
  "rewards/final_correctness_reward_func": 0.75,
443
- "rewards/question_recreation_reward_func": 0.7084708036854863,
444
  "rewards/soft_format_reward_func": 0.0,
445
  "rewards/strict_format_reward_func": 0.03125,
446
- "rewards/xmlcount_reward_func": 0.8062812462449074,
447
  "step": 46
448
  },
449
  {
450
- "completion_length": 255.59375,
451
  "epoch": 5.333333333333333,
452
- "grad_norm": 4.239270210266113,
453
- "kl": 9.081267803441733,
454
  "learning_rate": 1.0926199633097154e-08,
455
- "loss": 0.0091,
456
- "reward": 3.829737439751625,
457
- "reward_std": 2.9596789788920432,
458
- "rewards/concensus_correctness_reward_func": 1.5678749959915876,
459
  "rewards/consensus_reward_func": 0.5,
460
  "rewards/cumulative_reward_2": 0.0,
461
- "rewards/final_correctness_reward_func": 0.4375,
462
- "rewards/question_recreation_reward_func": 0.4738624934107065,
463
  "rewards/soft_format_reward_func": 0.0,
464
- "rewards/strict_format_reward_func": 0.0,
465
- "rewards/xmlcount_reward_func": 0.8505000225268304,
466
  "step": 48
467
  },
468
  {
469
- "completion_length": 243.875,
470
  "epoch": 5.555555555555555,
471
- "grad_norm": 11.775144577026367,
472
- "kl": 2.5097127612680197,
473
  "learning_rate": 1.217974870087901e-09,
474
- "loss": 0.0025,
475
- "reward": 3.7787333577871323,
476
- "reward_std": 1.0139204831793904,
477
- "rewards/concensus_correctness_reward_func": 1.442687500268221,
478
- "rewards/consensus_reward_func": 0.375,
479
  "rewards/cumulative_reward_2": 0.0,
480
- "rewards/final_correctness_reward_func": 0.5625,
481
- "rewards/question_recreation_reward_func": 0.5674208542332053,
482
- "rewards/soft_format_reward_func": 0.015625,
483
- "rewards/strict_format_reward_func": 0.015625,
484
- "rewards/xmlcount_reward_func": 0.7998750088736415,
485
  "step": 50
486
  },
487
  {
488
  "epoch": 5.555555555555555,
489
  "step": 50,
490
  "total_flos": 0.0,
491
- "train_loss": 0.005916564127692254,
492
- "train_runtime": 674.0869,
493
- "train_samples_per_second": 1.187,
494
  "train_steps_per_second": 0.074
495
  }
496
  ],
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 251.09375,
14
  "epoch": 0.2222222222222222,
15
+ "grad_norm": 3.406686544418335,
16
  "kl": 0.0,
17
  "learning_rate": 2e-07,
18
  "loss": -0.0,
19
+ "reward": 5.090359630994499,
20
+ "reward_std": 4.374027599813417,
21
+ "rewards/concensus_correctness_reward_func": 2.586999997496605,
22
  "rewards/consensus_reward_func": 0.4375,
23
  "rewards/cumulative_reward_2": 0.0,
24
+ "rewards/final_correctness_reward_func": 0.8125,
25
+ "rewards/question_recreation_reward_func": 0.5857971757650375,
26
  "rewards/soft_format_reward_func": 0.0,
27
+ "rewards/strict_format_reward_func": 0.03125,
28
+ "rewards/xmlcount_reward_func": 0.6363125052303076,
29
  "step": 2
30
  },
31
  {
32
+ "completion_length": 238.53125,
33
  "epoch": 0.4444444444444444,
34
+ "grad_norm": 6.297623157501221,
35
+ "kl": 0.002095213483698899,
36
  "learning_rate": 6e-07,
37
  "loss": 0.0,
38
+ "reward": 6.276864215731621,
39
+ "reward_std": 4.376051411032677,
40
+ "rewards/concensus_correctness_reward_func": 3.2183125205338,
41
+ "rewards/consensus_reward_func": 0.5,
42
  "rewards/cumulative_reward_2": 0.0,
43
+ "rewards/final_correctness_reward_func": 0.8125,
44
+ "rewards/question_recreation_reward_func": 0.6349266925826669,
45
  "rewards/soft_format_reward_func": 0.0,
46
+ "rewards/strict_format_reward_func": 0.046875,
47
+ "rewards/xmlcount_reward_func": 1.0642500249668956,
48
  "step": 4
49
  },
50
  {
51
+ "completion_length": 247.1875,
52
  "epoch": 0.6666666666666666,
53
+ "grad_norm": 5.020749568939209,
54
+ "kl": 0.01812579674879089,
55
  "learning_rate": 1e-06,
56
+ "loss": 0.0,
57
+ "reward": 8.028940327465534,
58
+ "reward_std": 7.026336292154156,
59
+ "rewards/concensus_correctness_reward_func": 4.982750025577843,
60
+ "rewards/consensus_reward_func": 0.5625,
61
  "rewards/cumulative_reward_2": 0.0,
62
+ "rewards/final_correctness_reward_func": 0.875,
63
+ "rewards/question_recreation_reward_func": 0.6356277531012893,
64
  "rewards/soft_format_reward_func": 0.0,
65
+ "rewards/strict_format_reward_func": 0.03125,
66
+ "rewards/xmlcount_reward_func": 0.9418125078082085,
67
  "step": 6
68
  },
69
  {
70
+ "completion_length": 264.5625,
71
  "epoch": 0.8888888888888888,
72
+ "grad_norm": 6.246143341064453,
73
+ "kl": 1.4362198570743203,
74
  "learning_rate": 9.95134034370785e-07,
75
+ "loss": 0.0014,
76
+ "reward": 5.521899528801441,
77
+ "reward_std": 3.0373885043663904,
78
+ "rewards/concensus_correctness_reward_func": 2.975437503308058,
79
+ "rewards/consensus_reward_func": 0.625,
80
  "rewards/cumulative_reward_2": 0.0,
81
+ "rewards/final_correctness_reward_func": 0.3125,
82
+ "rewards/question_recreation_reward_func": 0.5394933708012104,
83
  "rewards/soft_format_reward_func": 0.0,
84
+ "rewards/strict_format_reward_func": 0.046875,
85
+ "rewards/xmlcount_reward_func": 1.0225937701761723,
86
  "step": 8
87
  },
88
  {
89
+ "completion_length": 253.09375,
90
  "epoch": 1.1111111111111112,
91
+ "grad_norm": 379351.5,
92
+ "kl": 121253.2733656764,
93
  "learning_rate": 9.806308479691594e-07,
94
+ "loss": 121.2533,
95
+ "reward": 6.7504779398441315,
96
+ "reward_std": 3.2276243461528793,
97
+ "rewards/concensus_correctness_reward_func": 3.5170000011567026,
98
+ "rewards/consensus_reward_func": 0.875,
99
  "rewards/cumulative_reward_2": 0.0,
100
+ "rewards/final_correctness_reward_func": 0.75,
101
+ "rewards/question_recreation_reward_func": 0.6782278986647725,
102
  "rewards/soft_format_reward_func": 0.0,
103
+ "rewards/strict_format_reward_func": 0.03125,
104
+ "rewards/xmlcount_reward_func": 0.8990000244230032,
105
  "step": 10
106
  },
107
  {
108
+ "completion_length": 246.3125,
109
  "epoch": 1.3333333333333333,
110
+ "grad_norm": 11.476463317871094,
111
+ "kl": 53.28857711702585,
112
  "learning_rate": 9.567727288213004e-07,
113
+ "loss": 0.0533,
114
+ "reward": 7.5567396730184555,
115
+ "reward_std": 6.025143433362246,
116
+ "rewards/concensus_correctness_reward_func": 4.465624971315265,
117
+ "rewards/consensus_reward_func": 0.8125,
118
  "rewards/cumulative_reward_2": 0.0,
119
+ "rewards/final_correctness_reward_func": 0.8125,
120
+ "rewards/question_recreation_reward_func": 0.43192729796282947,
121
  "rewards/soft_format_reward_func": 0.0,
122
+ "rewards/strict_format_reward_func": 0.03125,
123
+ "rewards/xmlcount_reward_func": 1.0029375031590462,
124
  "step": 12
125
  },
126
  {
127
+ "completion_length": 262.9375,
128
  "epoch": 1.5555555555555556,
129
+ "grad_norm": 108.78337860107422,
130
+ "kl": 15830.661798972636,
131
  "learning_rate": 9.240240480782129e-07,
132
+ "loss": 15.8307,
133
+ "reward": 4.460910737514496,
134
+ "reward_std": 2.0786191248334944,
135
+ "rewards/concensus_correctness_reward_func": 1.170062493532896,
136
+ "rewards/consensus_reward_func": 0.5625,
137
  "rewards/cumulative_reward_2": 0.0,
138
+ "rewards/final_correctness_reward_func": 0.9375,
139
+ "rewards/question_recreation_reward_func": 0.6721919253468513,
140
  "rewards/soft_format_reward_func": 0.0,
141
+ "rewards/strict_format_reward_func": 0.03125,
142
+ "rewards/xmlcount_reward_func": 1.087406262755394,
143
  "step": 14
144
  },
145
  {
146
+ "completion_length": 228.65625,
147
  "epoch": 1.7777777777777777,
148
+ "grad_norm": 17.862974166870117,
149
+ "kl": 24.956144016236067,
150
  "learning_rate": 8.83022221559489e-07,
151
+ "loss": 0.025,
152
+ "reward": 6.531906917691231,
153
+ "reward_std": 5.046906062096241,
154
+ "rewards/concensus_correctness_reward_func": 3.738874990493059,
155
+ "rewards/consensus_reward_func": 0.5,
156
  "rewards/cumulative_reward_2": 0.0,
157
+ "rewards/final_correctness_reward_func": 0.625,
158
+ "rewards/question_recreation_reward_func": 0.6465007420629263,
159
  "rewards/soft_format_reward_func": 0.0,
160
+ "rewards/strict_format_reward_func": 0.03125,
161
+ "rewards/xmlcount_reward_func": 0.9902812633663416,
162
  "step": 16
163
  },
164
  {
165
+ "completion_length": 251.90625,
166
  "epoch": 2.0,
167
+ "grad_norm": 4.168570041656494,
168
+ "kl": 5.209025803487748,
169
  "learning_rate": 8.34565303179429e-07,
170
+ "loss": 0.0052,
171
+ "reward": 7.48027828335762,
172
+ "reward_std": 3.668259933590889,
173
+ "rewards/concensus_correctness_reward_func": 4.085375004447997,
174
+ "rewards/consensus_reward_func": 0.875,
175
  "rewards/cumulative_reward_2": 0.0,
176
+ "rewards/final_correctness_reward_func": 0.8125,
177
+ "rewards/question_recreation_reward_func": 0.7168719079345465,
178
  "rewards/soft_format_reward_func": 0.0,
179
  "rewards/strict_format_reward_func": 0.0,
180
+ "rewards/xmlcount_reward_func": 0.9905312536284328,
181
  "step": 18
182
  },
183
  {
184
+ "completion_length": 253.46875,
185
  "epoch": 2.2222222222222223,
186
+ "grad_norm": 4.598588943481445,
187
+ "kl": 0.6110749203944579,
188
  "learning_rate": 7.795964517353733e-07,
189
+ "loss": 0.0006,
190
+ "reward": 6.335133410990238,
191
+ "reward_std": 4.133349603740498,
192
+ "rewards/concensus_correctness_reward_func": 3.3612499944865704,
193
+ "rewards/consensus_reward_func": 0.5625,
194
  "rewards/cumulative_reward_2": 0.0,
195
+ "rewards/final_correctness_reward_func": 0.875,
196
+ "rewards/question_recreation_reward_func": 0.6452896627597511,
197
  "rewards/soft_format_reward_func": 0.0,
198
+ "rewards/strict_format_reward_func": 0.015625,
199
+ "rewards/xmlcount_reward_func": 0.8754687653854489,
200
  "step": 20
201
  },
202
  {
203
+ "completion_length": 246.40625,
204
  "epoch": 2.4444444444444446,
205
+ "grad_norm": 4.547008514404297,
206
+ "kl": 0.8581339719239622,
207
  "learning_rate": 7.191855733945386e-07,
208
+ "loss": 0.0009,
209
+ "reward": 3.972074016928673,
210
+ "reward_std": 2.218664333457127,
211
+ "rewards/concensus_correctness_reward_func": 1.2331875103991479,
212
+ "rewards/consensus_reward_func": 0.5,
213
  "rewards/cumulative_reward_2": 0.0,
214
+ "rewards/final_correctness_reward_func": 0.5625,
215
+ "rewards/question_recreation_reward_func": 0.6385114891454577,
216
  "rewards/soft_format_reward_func": 0.0,
217
+ "rewards/strict_format_reward_func": 0.015625,
218
+ "rewards/xmlcount_reward_func": 1.0222500003874302,
219
  "step": 22
220
  },
221
  {
222
+ "completion_length": 314.75,
223
  "epoch": 2.6666666666666665,
224
+ "grad_norm": 3.5269718170166016,
225
+ "kl": 1.1453959059435874,
226
  "learning_rate": 6.545084971874736e-07,
227
+ "loss": 0.0011,
228
+ "reward": 5.517209440469742,
229
+ "reward_std": 2.4003249094821513,
230
+ "rewards/concensus_correctness_reward_func": 2.6363124921917915,
231
+ "rewards/consensus_reward_func": 0.5625,
232
  "rewards/cumulative_reward_2": 0.0,
233
+ "rewards/final_correctness_reward_func": 0.8125,
234
+ "rewards/question_recreation_reward_func": 0.5380531689152122,
235
  "rewards/soft_format_reward_func": 0.0,
236
+ "rewards/strict_format_reward_func": 0.0,
237
+ "rewards/xmlcount_reward_func": 0.9678437616676092,
238
  "step": 24
239
  },
240
  {
241
+ "completion_length": 242.59375,
242
  "epoch": 2.888888888888889,
243
+ "grad_norm": 3.7082345485687256,
244
+ "kl": 1.661696835188195,
245
  "learning_rate": 5.868240888334652e-07,
246
+ "loss": 0.0017,
247
+ "reward": 7.0335357040166855,
248
+ "reward_std": 5.964661547914147,
249
+ "rewards/concensus_correctness_reward_func": 4.0771874990314245,
250
+ "rewards/consensus_reward_func": 0.5625,
251
  "rewards/cumulative_reward_2": 0.0,
252
+ "rewards/final_correctness_reward_func": 0.75,
253
+ "rewards/question_recreation_reward_func": 0.6281919404864311,
254
  "rewards/soft_format_reward_func": 0.0,
255
+ "rewards/strict_format_reward_func": 0.015625,
256
+ "rewards/xmlcount_reward_func": 1.0000312514603138,
257
  "step": 26
258
  },
259
  {
260
+ "completion_length": 233.21875,
261
  "epoch": 3.111111111111111,
262
+ "grad_norm": 25.148460388183594,
263
+ "kl": 5.4034789321012795,
264
  "learning_rate": 5.174497483512505e-07,
265
+ "loss": 0.0054,
266
+ "reward": 5.528537306934595,
267
+ "reward_std": 2.2288794559426606,
268
+ "rewards/concensus_correctness_reward_func": 2.6711875112960115,
269
+ "rewards/consensus_reward_func": 0.6875,
270
  "rewards/cumulative_reward_2": 0.0,
271
+ "rewards/final_correctness_reward_func": 0.625,
272
+ "rewards/question_recreation_reward_func": 0.5974122565239668,
273
  "rewards/soft_format_reward_func": 0.0,
274
+ "rewards/strict_format_reward_func": 0.03125,
275
+ "rewards/xmlcount_reward_func": 0.9161875313147902,
276
  "step": 28
277
  },
278
  {
279
+ "completion_length": 257.6875,
280
  "epoch": 3.3333333333333335,
281
+ "grad_norm": 15.775626182556152,
282
+ "kl": 4.37631535762921,
283
  "learning_rate": 4.477357683661733e-07,
284
+ "loss": 0.0044,
285
+ "reward": 7.878351733088493,
286
+ "reward_std": 3.3786554981488734,
287
+ "rewards/concensus_correctness_reward_func": 5.094187501817942,
288
+ "rewards/consensus_reward_func": 0.8125,
289
  "rewards/cumulative_reward_2": 0.0,
290
+ "rewards/final_correctness_reward_func": 0.4375,
291
+ "rewards/question_recreation_reward_func": 0.5230704261921346,
292
  "rewards/soft_format_reward_func": 0.0,
293
+ "rewards/strict_format_reward_func": 0.046875,
294
+ "rewards/xmlcount_reward_func": 0.9642187729477882,
295
  "step": 30
296
  },
297
  {
298
+ "completion_length": 227.34375,
299
  "epoch": 3.5555555555555554,
300
+ "grad_norm": 3.913177967071533,
301
+ "kl": 1.274317068979144,
302
  "learning_rate": 3.790390522001662e-07,
303
+ "loss": 0.0013,
304
+ "reward": 8.299735330045223,
305
+ "reward_std": 3.323359102010727,
306
+ "rewards/concensus_correctness_reward_func": 5.080312505364418,
307
+ "rewards/consensus_reward_func": 0.625,
308
  "rewards/cumulative_reward_2": 0.0,
309
+ "rewards/final_correctness_reward_func": 0.875,
310
+ "rewards/question_recreation_reward_func": 0.6301415413618088,
311
  "rewards/soft_format_reward_func": 0.0,
312
+ "rewards/strict_format_reward_func": 0.0625,
313
+ "rewards/xmlcount_reward_func": 1.0267812612000853,
314
  "step": 32
315
  },
316
  {
317
+ "completion_length": 292.15625,
318
  "epoch": 3.7777777777777777,
319
+ "grad_norm": 11.773321151733398,
320
+ "kl": 7.366791445761919,
321
  "learning_rate": 3.1269670329204393e-07,
322
+ "loss": 0.0074,
323
+ "reward": 7.669991716742516,
324
+ "reward_std": 4.413828293792903,
325
+ "rewards/concensus_correctness_reward_func": 4.837812501937151,
326
+ "rewards/consensus_reward_func": 0.5625,
327
  "rewards/cumulative_reward_2": 0.0,
328
+ "rewards/final_correctness_reward_func": 0.875,
329
+ "rewards/question_recreation_reward_func": 0.5424916381016374,
330
  "rewards/soft_format_reward_func": 0.0,
331
  "rewards/strict_format_reward_func": 0.03125,
332
+ "rewards/xmlcount_reward_func": 0.8209374994039536,
333
  "step": 34
334
  },
335
  {
336
+ "completion_length": 251.59375,
337
  "epoch": 4.0,
338
+ "grad_norm": 9.20654582977295,
339
+ "kl": 3.2344849393703043,
340
  "learning_rate": 2.500000000000001e-07,
341
+ "loss": 0.0032,
342
+ "reward": 6.5783121436834335,
343
+ "reward_std": 4.831312101276126,
344
+ "rewards/concensus_correctness_reward_func": 3.0734374783933163,
345
+ "rewards/consensus_reward_func": 0.6875,
346
  "rewards/cumulative_reward_2": 0.0,
347
+ "rewards/final_correctness_reward_func": 1.125,
348
+ "rewards/question_recreation_reward_func": 0.6645933054387569,
349
  "rewards/soft_format_reward_func": 0.0,
350
+ "rewards/strict_format_reward_func": 0.015625,
351
+ "rewards/xmlcount_reward_func": 1.0121562704443932,
352
  "step": 36
353
  },
354
  {
355
+ "completion_length": 296.84375,
356
  "epoch": 4.222222222222222,
357
+ "grad_norm": 6.073974609375,
358
+ "kl": 80.8798265401274,
359
  "learning_rate": 1.9216926233717084e-07,
360
+ "loss": 0.0809,
361
+ "reward": 4.182305257767439,
362
+ "reward_std": 3.0763283036649227,
363
+ "rewards/concensus_correctness_reward_func": 1.8618750125169754,
364
+ "rewards/consensus_reward_func": 0.5,
365
  "rewards/cumulative_reward_2": 0.0,
366
+ "rewards/final_correctness_reward_func": 0.375,
367
+ "rewards/question_recreation_reward_func": 0.6596803162246943,
368
  "rewards/soft_format_reward_func": 0.0,
369
+ "rewards/strict_format_reward_func": 0.0625,
370
+ "rewards/xmlcount_reward_func": 0.7232500202953815,
371
  "step": 38
372
  },
373
  {
374
+ "completion_length": 250.09375,
375
  "epoch": 4.444444444444445,
376
+ "grad_norm": 8.074630737304688,
377
+ "kl": 2.947148013394326,
378
  "learning_rate": 1.4033009983067452e-07,
379
+ "loss": 0.0029,
380
+ "reward": 7.208126068115234,
381
+ "reward_std": 4.182072307477938,
382
+ "rewards/concensus_correctness_reward_func": 3.5688749849796295,
383
+ "rewards/consensus_reward_func": 1.0,
384
  "rewards/cumulative_reward_2": 0.0,
385
+ "rewards/final_correctness_reward_func": 0.9375,
386
+ "rewards/question_recreation_reward_func": 0.6139386314898729,
387
  "rewards/soft_format_reward_func": 0.0,
388
+ "rewards/strict_format_reward_func": 0.015625,
389
+ "rewards/xmlcount_reward_func": 1.0721875205636024,
390
  "step": 40
391
  },
392
  {
393
+ "completion_length": 223.28125,
394
  "epoch": 4.666666666666667,
395
+ "grad_norm": 4.93925666809082,
396
+ "kl": 2.2123069388326257,
397
  "learning_rate": 9.549150281252632e-08,
398
+ "loss": 0.0022,
399
+ "reward": 7.967550210654736,
400
+ "reward_std": 6.516119306630571,
401
+ "rewards/concensus_correctness_reward_func": 4.553937453776598,
402
+ "rewards/consensus_reward_func": 0.6875,
403
  "rewards/cumulative_reward_2": 0.0,
404
+ "rewards/final_correctness_reward_func": 1.0,
405
+ "rewards/question_recreation_reward_func": 0.6051440089941025,
406
  "rewards/soft_format_reward_func": 0.0,
407
+ "rewards/strict_format_reward_func": 0.015625,
408
+ "rewards/xmlcount_reward_func": 1.105343785136938,
409
  "step": 42
410
  },
411
  {
412
+ "completion_length": 261.0,
413
  "epoch": 4.888888888888889,
414
+ "grad_norm": 3.927772045135498,
415
+ "kl": 1.7613378865644336,
416
  "learning_rate": 5.8526203570536504e-08,
417
+ "loss": 0.0018,
418
+ "reward": 7.037954144179821,
419
+ "reward_std": 4.669055305188522,
420
+ "rewards/concensus_correctness_reward_func": 3.980187527835369,
421
+ "rewards/consensus_reward_func": 0.4375,
422
  "rewards/cumulative_reward_2": 0.0,
423
+ "rewards/final_correctness_reward_func": 1.125,
424
+ "rewards/question_recreation_reward_func": 0.6027041245251894,
425
  "rewards/soft_format_reward_func": 0.0,
426
+ "rewards/strict_format_reward_func": 0.0,
427
+ "rewards/xmlcount_reward_func": 0.892562497407198,
428
  "step": 44
429
  },
430
  {
431
+ "completion_length": 234.75,
432
  "epoch": 5.111111111111111,
433
+ "grad_norm": 5.521337985992432,
434
+ "kl": 6.4816615069285035,
435
  "learning_rate": 3.015368960704584e-08,
436
+ "loss": 0.0065,
437
+ "reward": 8.22635594010353,
438
+ "reward_std": 3.674573255237192,
439
+ "rewards/concensus_correctness_reward_func": 5.032687483006157,
440
+ "rewards/consensus_reward_func": 0.875,
441
  "rewards/cumulative_reward_2": 0.0,
442
  "rewards/final_correctness_reward_func": 0.75,
443
+ "rewards/question_recreation_reward_func": 0.5293248100206256,
444
  "rewards/soft_format_reward_func": 0.0,
445
  "rewards/strict_format_reward_func": 0.03125,
446
+ "rewards/xmlcount_reward_func": 1.0080937594175339,
447
  "step": 46
448
  },
449
  {
450
+ "completion_length": 294.0,
451
  "epoch": 5.333333333333333,
452
+ "grad_norm": 3.950500249862671,
453
+ "kl": 3.034157019108534,
454
  "learning_rate": 1.0926199633097154e-08,
455
+ "loss": 0.003,
456
+ "reward": 7.7403726652264595,
457
+ "reward_std": 5.27158590964973,
458
+ "rewards/concensus_correctness_reward_func": 4.709000015282072,
459
  "rewards/consensus_reward_func": 0.5,
460
  "rewards/cumulative_reward_2": 0.0,
461
+ "rewards/final_correctness_reward_func": 0.8125,
462
+ "rewards/question_recreation_reward_func": 0.7043102057650685,
463
  "rewards/soft_format_reward_func": 0.0,
464
+ "rewards/strict_format_reward_func": 0.015625,
465
+ "rewards/xmlcount_reward_func": 0.998937513679266,
466
  "step": 48
467
  },
468
  {
469
+ "completion_length": 237.59375,
470
  "epoch": 5.555555555555555,
471
+ "grad_norm": 8.28918743133545,
472
+ "kl": 5.900157635100186,
473
  "learning_rate": 1.217974870087901e-09,
474
+ "loss": 0.0059,
475
+ "reward": 8.886989444494247,
476
+ "reward_std": 5.726772859226912,
477
+ "rewards/concensus_correctness_reward_func": 5.724687514826655,
478
+ "rewards/consensus_reward_func": 0.6875,
479
  "rewards/cumulative_reward_2": 0.0,
480
+ "rewards/final_correctness_reward_func": 1.0,
481
+ "rewards/question_recreation_reward_func": 0.5425831722095609,
482
+ "rewards/soft_format_reward_func": 0.0,
483
+ "rewards/strict_format_reward_func": 0.078125,
484
+ "rewards/xmlcount_reward_func": 0.8540937826037407,
485
  "step": 50
486
  },
487
  {
488
  "epoch": 5.555555555555555,
489
  "step": 50,
490
  "total_flos": 0.0,
491
+ "train_loss": 5.491919757317737,
492
+ "train_runtime": 671.7848,
493
+ "train_samples_per_second": 1.191,
494
  "train_steps_per_second": 0.074
495
  }
496
  ],