chenggong1995 committed
Commit d84ddac · verified · 1 Parent(s): 3526338

Model save
README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ base_model: Qwen/Qwen2.5-Math-7B
+ library_name: transformers
+ model_name: Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1
+ tags:
+ - generated_from_trainer
+ - trl
+ - ghpo
+ licence: license
+ ---
+ 
+ # Model Card for Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1
+ 
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+ 
+ ## Quick start
+ 
+ ```python
+ from transformers import pipeline
+ 
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
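+ 
+ If you prefer explicit control over generation, the snippet below is a minimal sketch using `AutoModelForCausalLM` directly; it assumes the tokenizer ships a chat template (as Qwen2.5-Math tokenizers do) and that a CUDA device is available.
+ 
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ 
+ model_id = "chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1"
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+ 
+ # Build the prompt with the tokenizer's chat template, then generate.
+ messages = [{"role": "user", "content": "Solve for x: 2x + 3 = 11."}]
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
+ outputs = model.generate(inputs, max_new_tokens=512)
+ print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
+ ```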
+ 
+ ## Training procedure
+ 
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/j1gylbgn)
+ 
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+ 
+ ### Framework versions
+ 
+ - TRL: 0.16.0
+ - Transformers: 4.50.0
+ - Pytorch: 2.5.1
+ - Datasets: 3.5.0
+ - Tokenizers: 0.21.1
+ 
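+ A plausible way to pin this environment (assumption: a CUDA build of PyTorch 2.5.1 is installed separately):
+ 
+ ```shell
+ pip install trl==0.16.0 transformers==4.50.0 datasets==3.5.0 tokenizers==0.21.1
+ ```
+ 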
+ ## Citations
+ 
+ Cite GRPO as:
+ 
+ ```bibtex
+ @article{zhihong2024deepseekmath,
+     title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+     author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+     year = 2024,
+     eprint = {arXiv:2402.03300},
+ }
+ ```
+ 
+ Cite TRL as:
+ 
+ ```bibtex
+ @misc{vonwerra2022trl,
+     title = {{TRL: Transformer Reinforcement Learning}},
+     author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+     year = 2020,
+     journal = {GitHub repository},
+     publisher = {GitHub},
+     howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.12618592532375192,
+   "train_runtime": 35348.2691,
+   "train_samples": 8888,
+   "train_samples_per_second": 0.251,
+   "train_steps_per_second": 0.002
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token_id": 151643,
+   "eos_token_id": 151643,
+   "max_new_tokens": 2048,
+   "transformers_version": "4.50.0"
+ }
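
These defaults are picked up automatically by `generate()`. A minimal sketch of reading them back from the Hub (the repo id is the model published by this commit):

```python
from transformers import GenerationConfig

# Fetch the committed generation defaults; max_new_tokens should print 2048.
gen_config = GenerationConfig.from_pretrained(
    "chenggong1995/Qwen2.5-Math-7B-gen8-math3to5-ghpo-cold0-3Dhint-prompt1-epoch1"
)
print(gen_config.eos_token_id, gen_config.max_new_tokens)
```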
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "total_flos": 0.0,
+   "train_loss": 0.12618592532375192,
+   "train_runtime": 35348.2691,
+   "train_samples": 8888,
+   "train_samples_per_second": 0.251,
+   "train_steps_per_second": 0.002
+ }
trainer_state.json ADDED
@@ -0,0 +1,274 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9952755905511811,
+   "eval_steps": 10000000000,
+   "global_step": 79,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 797.146240234375,
+       "epoch": 0.012598425196850394,
+       "grad_norm": 0.22265445878992737,
+       "learning_rate": 1.25e-07,
+       "loss": 0.201,
+       "num_tokens": 871723.0,
+       "reward": 0.5814732387661934,
+       "reward_std": 0.4103100262582302,
+       "rewards/accuracy_reward": 0.5747767761349678,
+       "rewards/format_reward": 0.01339285762514919,
+       "step": 1
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 804.7539443969727,
+       "epoch": 0.06299212598425197,
+       "grad_norm": 0.2417152167110338,
+       "learning_rate": 6.249999999999999e-07,
+       "loss": 0.1642,
+       "num_tokens": 4403713.0,
+       "reward": 0.5676618544384837,
+       "reward_std": 0.4300461960956454,
+       "rewards/accuracy_reward": 0.563895090483129,
+       "rewards/format_reward": 0.007533482450526208,
+       "step": 5
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 785.3382049560547,
+       "epoch": 0.12598425196850394,
+       "grad_norm": 0.3011761966028881,
+       "learning_rate": 9.980434110374724e-07,
+       "loss": 0.1628,
+       "num_tokens": 8734420.0,
+       "reward": 0.575000024586916,
+       "reward_std": 0.40654933378100394,
+       "rewards/accuracy_reward": 0.5716517880558968,
+       "rewards/format_reward": 0.006696428824216128,
+       "step": 10
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 807.4542770385742,
+       "epoch": 0.1889763779527559,
+       "grad_norm": 0.1895048136482896,
+       "learning_rate": 9.762072666790656e-07,
+       "loss": 0.1844,
+       "num_tokens": 13144543.0,
+       "reward": 0.5949777036905288,
+       "reward_std": 0.3915623873472214,
+       "rewards/accuracy_reward": 0.5912946447730064,
+       "rewards/format_reward": 0.007366071711294353,
+       "step": 15
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 804.9518280029297,
+       "epoch": 0.25196850393700787,
+       "grad_norm": 0.17095132727584114,
+       "learning_rate": 9.311572862600138e-07,
+       "loss": 0.1833,
+       "num_tokens": 17551799.0,
+       "reward": 0.6332589581608772,
+       "reward_std": 0.37318109199404714,
+       "rewards/accuracy_reward": 0.6314732164144516,
+       "rewards/format_reward": 0.00357142873108387,
+       "step": 20
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 797.7531616210938,
+       "epoch": 0.31496062992125984,
+       "grad_norm": 0.18012116491915073,
+       "learning_rate": 8.650895363529172e-07,
+       "loss": 0.1815,
+       "num_tokens": 21890173.0,
+       "reward": 0.6541294917464257,
+       "reward_std": 0.3551797144114971,
+       "rewards/accuracy_reward": 0.6547733508050442,
+       "rewards/format_reward": 0.0015625000698491931,
+       "step": 25
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 759.0582946777344,
+       "epoch": 0.3779527559055118,
+       "grad_norm": 0.22639137855339267,
+       "learning_rate": 7.812246438203903e-07,
+       "loss": 0.1692,
+       "num_tokens": 26076066.0,
+       "reward": 0.708482176065445,
+       "reward_std": 0.30925857946276664,
+       "rewards/accuracy_reward": 0.7082589268684387,
+       "rewards/format_reward": 0.00044642859138548373,
+       "step": 30
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 739.7466842651368,
+       "epoch": 0.4409448818897638,
+       "grad_norm": 0.37314514493140266,
+       "learning_rate": 6.836507988323784e-07,
+       "loss": 0.1384,
+       "num_tokens": 30202131.0,
+       "reward": 0.7066964626312255,
+       "reward_std": 0.2901428207755089,
+       "rewards/accuracy_reward": 0.7066964276134968,
+       "rewards/format_reward": 0.0,
+       "step": 35
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 772.0323989868164,
+       "epoch": 0.5039370078740157,
+       "grad_norm": 0.2614993046463775,
+       "learning_rate": 5.771244664826511e-07,
+       "loss": 0.1437,
+       "num_tokens": 34466708.0,
+       "reward": 0.6880580708384514,
+       "reward_std": 0.3077801916748285,
+       "rewards/accuracy_reward": 0.6879464246332645,
+       "rewards/format_reward": 0.00022321429569274187,
+       "step": 40
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 713.0567276000977,
+       "epoch": 0.5669291338582677,
+       "grad_norm": 0.1454705351797755,
+       "learning_rate": 4.6683852178244817e-07,
+       "loss": 0.0997,
+       "num_tokens": 38522522.0,
+       "reward": 0.7095982447266579,
+       "reward_std": 0.2782834365963936,
+       "rewards/accuracy_reward": 0.7095982141792774,
+       "rewards/format_reward": 0.0,
+       "step": 45
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 742.1245864868164,
+       "epoch": 0.6299212598425197,
+       "grad_norm": 0.14001943959476107,
+       "learning_rate": 3.5816911083285164e-07,
+       "loss": 0.0861,
+       "num_tokens": 42626344.0,
+       "reward": 0.7125000342726707,
+       "reward_std": 0.26790192127227785,
+       "rewards/accuracy_reward": 0.7125,
+       "rewards/format_reward": 0.0,
+       "step": 50
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 725.4094085693359,
+       "epoch": 0.6929133858267716,
+       "grad_norm": 0.11596792832277024,
+       "learning_rate": 2.5641357801960184e-07,
+       "loss": 0.0765,
+       "num_tokens": 46657290.0,
+       "reward": 0.712500037252903,
+       "reward_std": 0.2700365446507931,
+       "rewards/accuracy_reward": 0.7125000022351742,
+       "rewards/format_reward": 0.0,
+       "step": 55
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 716.4462371826172,
+       "epoch": 0.7559055118110236,
+       "grad_norm": 0.22234439564713598,
+       "learning_rate": 1.665322345816746e-07,
+       "loss": 0.0849,
+       "num_tokens": 50648721.0,
+       "reward": 0.7366071820259095,
+       "reward_std": 0.24836960211396217,
+       "rewards/accuracy_reward": 0.7366071425378322,
+       "rewards/format_reward": 0.0,
+       "step": 60
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 717.4730224609375,
+       "epoch": 0.8188976377952756,
+       "grad_norm": 0.182600444186133,
+       "learning_rate": 9.290655664821296e-08,
+       "loss": 0.0909,
+       "num_tokens": 54634528.0,
+       "reward": 0.7285714581608772,
+       "reward_std": 0.25840977653861047,
+       "rewards/accuracy_reward": 0.7285714313387871,
+       "rewards/format_reward": 0.0,
+       "step": 65
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 712.1507034301758,
+       "epoch": 0.8818897637795275,
+       "grad_norm": 0.17072451326668445,
+       "learning_rate": 3.912559994556086e-08,
+       "loss": 0.0883,
+       "num_tokens": 58570507.0,
+       "reward": 0.7503348544239998,
+       "reward_std": 0.25088600218296053,
+       "rewards/accuracy_reward": 0.7515796698629856,
+       "rewards/format_reward": 0.00022321429569274187,
+       "step": 70
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 719.5852981567383,
+       "epoch": 0.9448818897637795,
+       "grad_norm": 0.2933895566333781,
+       "learning_rate": 7.811042888637209e-09,
+       "loss": 0.0763,
+       "num_tokens": 62600049.0,
+       "reward": 0.7198661029338836,
+       "reward_std": 0.2631711885333061,
+       "rewards/accuracy_reward": 0.7198660731315613,
+       "rewards/format_reward": 0.0,
+       "step": 75
+     },
+     {
+       "clip_ratio": 0.0,
+       "completion_length": 755.7718505859375,
+       "epoch": 0.9952755905511811,
+       "num_tokens": 65965046.0,
+       "reward": 0.7045201249420643,
+       "reward_std": 0.28055303543806076,
+       "rewards/accuracy_reward": 0.7042410708963871,
+       "rewards/format_reward": 0.0005580357392318547,
+       "step": 79,
+       "total_flos": 0.0,
+       "train_loss": 0.12618592532375192,
+       "train_runtime": 35348.2691,
+       "train_samples_per_second": 0.251,
+       "train_steps_per_second": 0.002
+     }
+   ],
+   "logging_steps": 5,
+   "max_steps": 79,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 500,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 0.0,
+   "train_batch_size": 16,
+   "trial_name": null,
+   "trial_params": null
+ }
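
For a quick look at the optimization trace above, the log can be read directly; a minimal sketch (assumes a local copy of trainer_state.json):

```python
import json

# Print mean reward per logged optimizer step from the trainer state.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "reward" in entry:
        print(entry["step"], round(entry["reward"], 4))
```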