chenggong1995 commited on
Commit
df1c2bc
·
verified ·
1 Parent(s): 99cd0b0

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B
3
+ library_name: transformers
4
+ model_name: Qwen-2.5-Base-7B-gen8-math3to5_olympiads_aime-ghpo-cold0-hint50-prompt1-redonum-test
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - ghpo
9
+ license: license
10
+ ---
11
+
12
+ # Model Card for Qwen-2.5-Base-7B-gen8-math3to5_olympiads_aime-ghpo-cold0-hint50-prompt1-redonum-test
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="chenggong1995/Qwen-2.5-Base-7B-gen8-math3to5_olympiads_aime-ghpo-cold0-hint50-prompt1-redonum-test", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/ogasbug2)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.16.0
38
+ - Transformers: 4.50.0
39
+ - Pytorch: 2.5.1
40
+ - Datasets: 3.5.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.019440959097726123,
4
+ "train_runtime": 57675.0727,
5
+ "train_samples": 18328,
6
+ "train_samples_per_second": 0.318,
7
+ "train_steps_per_second": 0.003
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.50.0"
6
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.019440959097726123,
4
+ "train_runtime": 57675.0727,
5
+ "train_samples": 18328,
6
+ "train_samples_per_second": 0.318,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,512 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9954198473282443,
6
+ "eval_steps": 200000,
7
+ "global_step": 163,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio": 0.0,
14
+ "completion_length": 567.619441986084,
15
+ "epoch": 0.0061068702290076335,
16
+ "grad_norm": 0.2135867174294605,
17
+ "learning_rate": 5.88235294117647e-08,
18
+ "loss": 0.0031,
19
+ "num_tokens": 743987.0,
20
+ "reward": 0.07700893143191934,
21
+ "reward_std": 0.1299287434667349,
22
+ "rewards/accuracy_reward": 0.06919642793945968,
23
+ "rewards/format_reward": 0.01562500058207661,
24
+ "step": 1
25
+ },
26
+ {
27
+ "clip_ratio": 0.0,
28
+ "completion_length": 588.1579504013062,
29
+ "epoch": 0.030534351145038167,
30
+ "grad_norm": 0.3343295200773406,
31
+ "learning_rate": 2.941176470588235e-07,
32
+ "loss": -0.0019,
33
+ "num_tokens": 3769305.0,
34
+ "reward": 0.0671037980064284,
35
+ "reward_std": 0.12649992539081722,
36
+ "rewards/accuracy_reward": 0.05803571452270262,
37
+ "rewards/format_reward": 0.018136161408619955,
38
+ "step": 5
39
+ },
40
+ {
41
+ "clip_ratio": 0.0,
42
+ "completion_length": 588.1192253112793,
43
+ "epoch": 0.061068702290076333,
44
+ "grad_norm": 3.974587027805743,
45
+ "learning_rate": 5.88235294117647e-07,
46
+ "loss": 0.004,
47
+ "num_tokens": 7571663.0,
48
+ "reward": 0.07087053959257901,
49
+ "reward_std": 0.12776878643780948,
50
+ "rewards/accuracy_reward": 0.05870535783469677,
51
+ "rewards/format_reward": 0.02433035774156451,
52
+ "step": 10
53
+ },
54
+ {
55
+ "clip_ratio": 0.0,
56
+ "completion_length": 606.0297157287598,
57
+ "epoch": 0.0916030534351145,
58
+ "grad_norm": 0.495391151906653,
59
+ "learning_rate": 8.823529411764705e-07,
60
+ "loss": 0.0015,
61
+ "num_tokens": 11421588.0,
62
+ "reward": 0.09140625372529029,
63
+ "reward_std": 0.1608576664701104,
64
+ "rewards/accuracy_reward": 0.06540178635623306,
65
+ "rewards/format_reward": 0.05200892963912338,
66
+ "step": 15
67
+ },
68
+ {
69
+ "clip_ratio": 0.0,
70
+ "completion_length": 603.9748046875,
71
+ "epoch": 0.12213740458015267,
72
+ "grad_norm": 0.40463318371067986,
73
+ "learning_rate": 9.989585804326962e-07,
74
+ "loss": -0.0012,
75
+ "num_tokens": 15290931.0,
76
+ "reward": 0.20602679532021284,
77
+ "reward_std": 0.26069728564471006,
78
+ "rewards/accuracy_reward": 0.08125000051222742,
79
+ "rewards/format_reward": 0.24955357071012257,
80
+ "step": 20
81
+ },
82
+ {
83
+ "clip_ratio": 0.0,
84
+ "completion_length": 637.4477951049805,
85
+ "epoch": 0.15267175572519084,
86
+ "grad_norm": 0.2621022081219828,
87
+ "learning_rate": 9.926100533780304e-07,
88
+ "loss": 0.0092,
89
+ "num_tokens": 19308993.0,
90
+ "reward": 0.4613839514553547,
91
+ "reward_std": 0.29145158380270003,
92
+ "rewards/accuracy_reward": 0.09285714323632419,
93
+ "rewards/format_reward": 0.737053570151329,
94
+ "step": 25
95
+ },
96
+ {
97
+ "clip_ratio": 0.0,
98
+ "completion_length": 643.634407043457,
99
+ "epoch": 0.183206106870229,
100
+ "grad_norm": 0.6637031031614584,
101
+ "learning_rate": 9.805648919361503e-07,
102
+ "loss": 0.0201,
103
+ "num_tokens": 23260675.0,
104
+ "reward": 0.607031274586916,
105
+ "reward_std": 0.22586645130068064,
106
+ "rewards/accuracy_reward": 0.1455357144586742,
107
+ "rewards/format_reward": 0.9229910746216774,
108
+ "step": 30
109
+ },
110
+ {
111
+ "clip_ratio": 0.0,
112
+ "completion_length": 719.7797180175781,
113
+ "epoch": 0.21374045801526717,
114
+ "grad_norm": 0.12398978901271052,
115
+ "learning_rate": 9.62962388596925e-07,
116
+ "loss": 0.0393,
117
+ "num_tokens": 27603904.0,
118
+ "reward": 0.6875000298023224,
119
+ "reward_std": 0.2122076230123639,
120
+ "rewards/accuracy_reward": 0.2053571424447,
121
+ "rewards/format_reward": 0.9642857119441033,
122
+ "step": 35
123
+ },
124
+ {
125
+ "clip_ratio": 0.0,
126
+ "completion_length": 754.8406555175782,
127
+ "epoch": 0.24427480916030533,
128
+ "grad_norm": 0.11716820534931413,
129
+ "learning_rate": 9.400061019867678e-07,
130
+ "loss": 0.0414,
131
+ "num_tokens": 32059078.0,
132
+ "reward": 0.6993303894996643,
133
+ "reward_std": 0.18653424717485906,
134
+ "rewards/accuracy_reward": 0.21361607229337096,
135
+ "rewards/format_reward": 0.9714285716414451,
136
+ "step": 40
137
+ },
138
+ {
139
+ "clip_ratio": 0.0,
140
+ "completion_length": 752.6609725952148,
141
+ "epoch": 0.2748091603053435,
142
+ "grad_norm": 0.1366862152370541,
143
+ "learning_rate": 9.11961502878777e-07,
144
+ "loss": 0.0295,
145
+ "num_tokens": 36473463.0,
146
+ "reward": 0.7407366394996643,
147
+ "reward_std": 0.16629975400865077,
148
+ "rewards/accuracy_reward": 0.24955357071012257,
149
+ "rewards/format_reward": 0.9823660656809807,
150
+ "step": 45
151
+ },
152
+ {
153
+ "clip_ratio": 0.0,
154
+ "completion_length": 762.0527130126953,
155
+ "epoch": 0.3053435114503817,
156
+ "grad_norm": 0.11417326322107703,
157
+ "learning_rate": 8.791529042392812e-07,
158
+ "loss": 0.0308,
159
+ "num_tokens": 40938643.0,
160
+ "reward": 0.7233259305357933,
161
+ "reward_std": 0.16576984385028481,
162
+ "rewards/accuracy_reward": 0.23281249962747097,
163
+ "rewards/format_reward": 0.9810267806053161,
164
+ "step": 50
165
+ },
166
+ {
167
+ "clip_ratio": 0.0,
168
+ "completion_length": 764.9165496826172,
169
+ "epoch": 0.33587786259541985,
170
+ "grad_norm": 0.10399028120391927,
171
+ "learning_rate": 8.419597108123053e-07,
172
+ "loss": 0.0269,
173
+ "num_tokens": 45439493.0,
174
+ "reward": 0.729910746216774,
175
+ "reward_std": 0.16336991311982274,
176
+ "rewards/accuracy_reward": 0.2368303578812629,
177
+ "rewards/format_reward": 0.9861607074737548,
178
+ "step": 55
179
+ },
180
+ {
181
+ "clip_ratio": 0.0,
182
+ "completion_length": 750.571240234375,
183
+ "epoch": 0.366412213740458,
184
+ "grad_norm": 0.10084790893922295,
185
+ "learning_rate": 8.008120316124611e-07,
186
+ "loss": 0.0243,
187
+ "num_tokens": 49869428.0,
188
+ "reward": 0.7180803894996644,
189
+ "reward_std": 0.1501687964424491,
190
+ "rewards/accuracy_reward": 0.22611607052385807,
191
+ "rewards/format_reward": 0.9839285641908646,
192
+ "step": 60
193
+ },
194
+ {
195
+ "clip_ratio": 0.0,
196
+ "completion_length": 788.064323425293,
197
+ "epoch": 0.3969465648854962,
198
+ "grad_norm": 0.0777401486625089,
199
+ "learning_rate": 7.561857060642119e-07,
200
+ "loss": 0.0195,
201
+ "num_tokens": 54497556.0,
202
+ "reward": 0.7145089581608772,
203
+ "reward_std": 0.16678392123430968,
204
+ "rewards/accuracy_reward": 0.22120535522699356,
205
+ "rewards/format_reward": 0.9866071388125419,
206
+ "step": 65
207
+ },
208
+ {
209
+ "clip_ratio": 0.0,
210
+ "completion_length": 759.6870880126953,
211
+ "epoch": 0.42748091603053434,
212
+ "grad_norm": 0.08469355022666704,
213
+ "learning_rate": 7.085968013061584e-07,
214
+ "loss": 0.0277,
215
+ "num_tokens": 58963650.0,
216
+ "reward": 0.7053571730852127,
217
+ "reward_std": 0.13858543820679187,
218
+ "rewards/accuracy_reward": 0.21272321385331452,
219
+ "rewards/format_reward": 0.9852678492665291,
220
+ "step": 70
221
+ },
222
+ {
223
+ "clip_ratio": 0.0,
224
+ "completion_length": 784.3569534301757,
225
+ "epoch": 0.4580152671755725,
226
+ "grad_norm": 0.09089115066810906,
227
+ "learning_rate": 6.585956442945531e-07,
228
+ "loss": 0.022,
229
+ "num_tokens": 63555393.0,
230
+ "reward": 0.7090402141213417,
231
+ "reward_std": 0.15061827255412935,
232
+ "rewards/accuracy_reward": 0.21651785587891936,
233
+ "rewards/format_reward": 0.9850446343421936,
234
+ "step": 75
235
+ },
236
+ {
237
+ "clip_ratio": 0.0,
238
+ "completion_length": 797.2703521728515,
239
+ "epoch": 0.48854961832061067,
240
+ "grad_norm": 0.09002554108722542,
241
+ "learning_rate": 6.06760457719898e-07,
242
+ "loss": 0.0238,
243
+ "num_tokens": 68216500.0,
244
+ "reward": 0.7666295006871223,
245
+ "reward_std": 0.16813623085618018,
246
+ "rewards/accuracy_reward": 0.27165178642608223,
247
+ "rewards/format_reward": 0.9899553492665291,
248
+ "step": 80
249
+ },
250
+ {
251
+ "clip_ratio": 0.0,
252
+ "completion_length": 761.4013763427735,
253
+ "epoch": 0.5190839694656488,
254
+ "grad_norm": 0.08498851686715876,
255
+ "learning_rate": 5.536906733320815e-07,
256
+ "loss": 0.0158,
257
+ "num_tokens": 72721850.0,
258
+ "reward": 0.7462053880095482,
259
+ "reward_std": 0.1457503356039524,
260
+ "rewards/accuracy_reward": 0.2502232126891613,
261
+ "rewards/format_reward": 0.9919642806053162,
262
+ "step": 85
263
+ },
264
+ {
265
+ "clip_ratio": 0.0,
266
+ "completion_length": 766.9694580078125,
267
+ "epoch": 0.549618320610687,
268
+ "grad_norm": 0.07561608030311687,
269
+ "learning_rate": 5e-07,
270
+ "loss": 0.024,
271
+ "num_tokens": 77220857.0,
272
+ "reward": 0.739285746216774,
273
+ "reward_std": 0.14479873944073915,
274
+ "rewards/accuracy_reward": 0.2448660720139742,
275
+ "rewards/format_reward": 0.9888392820954323,
276
+ "step": 90
277
+ },
278
+ {
279
+ "clip_ratio": 0.0,
280
+ "completion_length": 760.7444534301758,
281
+ "epoch": 0.5801526717557252,
282
+ "grad_norm": 0.08175108965516903,
283
+ "learning_rate": 4.463093266679185e-07,
284
+ "loss": 0.0241,
285
+ "num_tokens": 81689720.0,
286
+ "reward": 0.7234375298023223,
287
+ "reward_std": 0.1551271199248731,
288
+ "rewards/accuracy_reward": 0.22790178465656935,
289
+ "rewards/format_reward": 0.9910714194178581,
290
+ "step": 95
291
+ },
292
+ {
293
+ "clip_ratio": 0.0,
294
+ "completion_length": 759.4685607910156,
295
+ "epoch": 0.6106870229007634,
296
+ "grad_norm": 0.09009365686861091,
297
+ "learning_rate": 3.932395422801019e-07,
298
+ "loss": 0.0283,
299
+ "num_tokens": 86136451.0,
300
+ "reward": 0.7452009245753288,
301
+ "reward_std": 0.1618154514580965,
302
+ "rewards/accuracy_reward": 0.2524553562514484,
303
+ "rewards/format_reward": 0.9854910641908645,
304
+ "step": 100
305
+ },
306
+ {
307
+ "clip_ratio": 0.0,
308
+ "completion_length": 739.8957916259766,
309
+ "epoch": 0.6412213740458015,
310
+ "grad_norm": 0.09858205620896988,
311
+ "learning_rate": 3.41404355705447e-07,
312
+ "loss": 0.0175,
313
+ "num_tokens": 90456928.0,
314
+ "reward": 0.7717634305357933,
315
+ "reward_std": 0.1629006579518318,
316
+ "rewards/accuracy_reward": 0.2754464283585548,
317
+ "rewards/format_reward": 0.9926339209079742,
318
+ "step": 105
319
+ },
320
+ {
321
+ "clip_ratio": 0.0,
322
+ "completion_length": 779.8815078735352,
323
+ "epoch": 0.6717557251908397,
324
+ "grad_norm": 0.08118289189251325,
325
+ "learning_rate": 2.914031986938417e-07,
326
+ "loss": 0.019,
327
+ "num_tokens": 95039765.0,
328
+ "reward": 0.7184152096509934,
329
+ "reward_std": 0.15354990400373936,
330
+ "rewards/accuracy_reward": 0.2238839288474992,
331
+ "rewards/format_reward": 0.9890624955296516,
332
+ "step": 110
333
+ },
334
+ {
335
+ "clip_ratio": 0.0,
336
+ "completion_length": 743.4281570434571,
337
+ "epoch": 0.7022900763358778,
338
+ "grad_norm": 0.08942937354761228,
339
+ "learning_rate": 2.4381429393578815e-07,
340
+ "loss": 0.0134,
341
+ "num_tokens": 99422555.0,
342
+ "reward": 0.7406250327825546,
343
+ "reward_std": 0.15929818488657474,
344
+ "rewards/accuracy_reward": 0.24397321371361613,
345
+ "rewards/format_reward": 0.9933035641908645,
346
+ "step": 115
347
+ },
348
+ {
349
+ "clip_ratio": 0.0,
350
+ "completion_length": 755.4859725952149,
351
+ "epoch": 0.732824427480916,
352
+ "grad_norm": 0.08614513315269835,
353
+ "learning_rate": 1.991879683875386e-07,
354
+ "loss": 0.0216,
355
+ "num_tokens": 103884636.0,
356
+ "reward": 0.7185268118977547,
357
+ "reward_std": 0.14079238111153244,
358
+ "rewards/accuracy_reward": 0.22388392696157097,
359
+ "rewards/format_reward": 0.9892857059836387,
360
+ "step": 120
361
+ },
362
+ {
363
+ "clip_ratio": 0.0,
364
+ "completion_length": 738.5589630126954,
365
+ "epoch": 0.7633587786259542,
366
+ "grad_norm": 0.0987941233877778,
367
+ "learning_rate": 1.5804028918769485e-07,
368
+ "loss": 0.0177,
369
+ "num_tokens": 108269772.0,
370
+ "reward": 0.7502232447266579,
371
+ "reward_std": 0.1488088957965374,
372
+ "rewards/accuracy_reward": 0.2546874986961484,
373
+ "rewards/format_reward": 0.9910714223980903,
374
+ "step": 125
375
+ },
376
+ {
377
+ "clip_ratio": 0.0,
378
+ "completion_length": 718.1944564819336,
379
+ "epoch": 0.7938931297709924,
380
+ "grad_norm": 0.08957435241917934,
381
+ "learning_rate": 1.2084709576071883e-07,
382
+ "loss": 0.0164,
383
+ "num_tokens": 112486547.0,
384
+ "reward": 0.7435268193483353,
385
+ "reward_std": 0.15864104311913252,
386
+ "rewards/accuracy_reward": 0.24799107126891612,
387
+ "rewards/format_reward": 0.9910714194178581,
388
+ "step": 130
389
+ },
390
+ {
391
+ "clip_ratio": 0.0,
392
+ "completion_length": 771.4886535644531,
393
+ "epoch": 0.8244274809160306,
394
+ "grad_norm": 0.10033485818985768,
395
+ "learning_rate": 8.803849712122291e-08,
396
+ "loss": 0.0191,
397
+ "num_tokens": 117017984.0,
398
+ "reward": 0.7385044932365418,
399
+ "reward_std": 0.15429062955081463,
400
+ "rewards/accuracy_reward": 0.24508928610011935,
401
+ "rewards/format_reward": 0.9868303492665291,
402
+ "step": 135
403
+ },
404
+ {
405
+ "clip_ratio": 0.0,
406
+ "completion_length": 753.3828475952148,
407
+ "epoch": 0.8549618320610687,
408
+ "grad_norm": 0.08591161076983043,
409
+ "learning_rate": 5.999389801323218e-08,
410
+ "loss": 0.017,
411
+ "num_tokens": 121422939.0,
412
+ "reward": 0.7706473574042321,
413
+ "reward_std": 0.16626517940312624,
414
+ "rewards/accuracy_reward": 0.2743303582072258,
415
+ "rewards/format_reward": 0.9926339253783226,
416
+ "step": 140
417
+ },
418
+ {
419
+ "clip_ratio": 0.0,
420
+ "completion_length": 737.1861953735352,
421
+ "epoch": 0.8854961832061069,
422
+ "grad_norm": 0.08133023369347758,
423
+ "learning_rate": 3.7037611403075096e-08,
424
+ "loss": 0.0211,
425
+ "num_tokens": 125792325.0,
426
+ "reward": 0.7737723588943481,
427
+ "reward_std": 0.14874637452885509,
428
+ "rewards/accuracy_reward": 0.2774553569033742,
429
+ "rewards/format_reward": 0.9926339238882065,
430
+ "step": 145
431
+ },
432
+ {
433
+ "clip_ratio": 0.0,
434
+ "completion_length": 719.9216812133789,
435
+ "epoch": 0.916030534351145,
436
+ "grad_norm": 0.09062176860429237,
437
+ "learning_rate": 1.943510806384968e-08,
438
+ "loss": 0.0223,
439
+ "num_tokens": 130100454.0,
440
+ "reward": 0.7459821745753288,
441
+ "reward_std": 0.1444489900022745,
442
+ "rewards/accuracy_reward": 0.24977678433060646,
443
+ "rewards/format_reward": 0.9924107134342194,
444
+ "step": 150
445
+ },
446
+ {
447
+ "clip_ratio": 0.0,
448
+ "completion_length": 725.9875335693359,
449
+ "epoch": 0.9465648854961832,
450
+ "grad_norm": 0.0910938816922007,
451
+ "learning_rate": 7.389946621969678e-09,
452
+ "loss": 0.0111,
453
+ "num_tokens": 134418750.0,
454
+ "reward": 0.7150669932365418,
455
+ "reward_std": 0.145760334469378,
456
+ "rewards/accuracy_reward": 0.21897321371361614,
457
+ "rewards/format_reward": 0.9921874910593033,
458
+ "step": 155
459
+ },
460
+ {
461
+ "clip_ratio": 0.0,
462
+ "completion_length": 729.7480224609375,
463
+ "epoch": 0.9770992366412213,
464
+ "grad_norm": 0.0976254681849543,
465
+ "learning_rate": 1.0414195673039138e-09,
466
+ "loss": 0.0201,
467
+ "num_tokens": 138738941.0,
468
+ "reward": 0.7399553954601288,
469
+ "reward_std": 0.13362135970965028,
470
+ "rewards/accuracy_reward": 0.24397321399301292,
471
+ "rewards/format_reward": 0.9919642806053162,
472
+ "step": 160
473
+ },
474
+ {
475
+ "clip_ratio": 0.0,
476
+ "completion_length": 757.2314198811849,
477
+ "epoch": 0.9954198473282443,
478
+ "num_tokens": 141385215.0,
479
+ "reward": 0.7395833656191826,
480
+ "reward_std": 0.1369063208500544,
481
+ "rewards/accuracy_reward": 0.2447916651920726,
482
+ "rewards/format_reward": 0.9895833333333334,
483
+ "step": 163,
484
+ "total_flos": 0.0,
485
+ "train_loss": 0.019440959097726123,
486
+ "train_runtime": 57675.0727,
487
+ "train_samples_per_second": 0.318,
488
+ "train_steps_per_second": 0.003
489
+ }
490
+ ],
491
+ "logging_steps": 5,
492
+ "max_steps": 163,
493
+ "num_input_tokens_seen": 0,
494
+ "num_train_epochs": 1,
495
+ "save_steps": 500,
496
+ "stateful_callbacks": {
497
+ "TrainerControl": {
498
+ "args": {
499
+ "should_epoch_stop": false,
500
+ "should_evaluate": false,
501
+ "should_log": false,
502
+ "should_save": true,
503
+ "should_training_stop": true
504
+ },
505
+ "attributes": {}
506
+ }
507
+ },
508
+ "total_flos": 0.0,
509
+ "train_batch_size": 16,
510
+ "trial_name": null,
511
+ "trial_params": null
512
+ }