vcabeli committed on
Commit
7fd2b90
·
verified ·
1 Parent(s): b3f884b

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-8B
3
+ library_name: transformers
4
+ model_name: Qwen3-8B-Open-R1-GRPO-spatial-dea
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen3-8B-Open-R1-GRPO-spatial-dea
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="vcabeli/Qwen3-8B-Open-R1-GRPO-spatial-dea", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/vincent-cabeli-owkin/huggingface/runs/zolvk2vf)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.18.0
38
+ - Transformers: 4.52.3
39
+ - PyTorch: 2.6.0
40
+ - Datasets: 3.6.0
41
+ - Tokenizers: 0.21.1
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.04093834859403697,
4
+ "train_runtime": 1244.7225,
5
+ "train_samples": 1092,
6
+ "train_samples_per_second": 0.877,
7
+ "train_steps_per_second": 0.018
8
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": 151645,
5
+ "pad_token_id": 151643,
6
+ "temperature": 0.6,
7
+ "top_k": 20,
8
+ "top_p": 0.95,
9
+ "transformers_version": "4.52.3"
10
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.04093834859403697,
4
+ "train_runtime": 1244.7225,
5
+ "train_samples": 1092,
6
+ "train_samples_per_second": 0.877,
7
+ "train_steps_per_second": 0.018
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.967032967032967,
6
+ "eval_steps": 50,
7
+ "global_step": 22,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": -2.6625,
19
+ "completions/max_length": 1024.0,
20
+ "completions/max_terminated_length": 1022.0,
21
+ "completions/mean_length": 848.7000732421875,
22
+ "completions/mean_terminated_length": 736.819091796875,
23
+ "completions/min_length": 247.0,
24
+ "completions/min_terminated_length": 247.0,
25
+ "epoch": 0.04395604395604396,
26
+ "frac_reward_zero_std": 0.0,
27
+ "grad_norm": 0.5238365408949885,
28
+ "kl": 0.0005369186401367188,
29
+ "learning_rate": 0.0,
30
+ "loss": 0.112,
31
+ "num_tokens": 533936.0,
32
+ "reward": 1.4911459684371948,
33
+ "reward_std": 1.1085364818572998,
34
+ "rewards/format_reward/mean": 0.0,
35
+ "rewards/format_reward/std": 0.0,
36
+ "rewards/tag_count_reward/mean": 0.6682291626930237,
37
+ "rewards/tag_count_reward/std": 0.32341793179512024,
38
+ "rewards/tcga_signature_exact_answer_match/mean": 0.3270833194255829,
39
+ "rewards/tcga_signature_exact_answer_match/std": 0.4696374535560608,
40
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.4958333373069763,
41
+ "rewards/tcga_signature_valid_option_chosen/std": 0.5005043148994446,
42
+ "step": 1
43
+ },
44
+ {
45
+ "clip_ratio/high_max": 0.0,
46
+ "clip_ratio/high_mean": 0.0,
47
+ "clip_ratio/low_mean": 0.0,
48
+ "clip_ratio/low_min": 0.0,
49
+ "clip_ratio/region_mean": 0.0,
50
+ "completions/clipped_ratio": -3.05,
51
+ "completions/max_length": 1024.0,
52
+ "completions/max_terminated_length": 1024.0,
53
+ "completions/mean_length": 804.4771118164062,
54
+ "completions/mean_terminated_length": 698.7808837890625,
55
+ "completions/min_length": 303.0,
56
+ "completions/min_terminated_length": 303.0,
57
+ "epoch": 0.08791208791208792,
58
+ "frac_reward_zero_std": 0.0,
59
+ "grad_norm": 0.5679897102025524,
60
+ "kl": 0.0005602836608886719,
61
+ "learning_rate": 3.333333333333333e-07,
62
+ "loss": 0.1357,
63
+ "num_tokens": 1046665.0,
64
+ "reward": 1.6322917938232422,
65
+ "reward_std": 1.0769468545913696,
66
+ "rewards/format_reward/mean": 0.0,
67
+ "rewards/format_reward/std": 0.0,
68
+ "rewards/tag_count_reward/mean": 0.721875011920929,
69
+ "rewards/tag_count_reward/std": 0.30987676978111267,
70
+ "rewards/tcga_signature_exact_answer_match/mean": 0.34375,
71
+ "rewards/tcga_signature_exact_answer_match/std": 0.4754543900489807,
72
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.5666666626930237,
73
+ "rewards/tcga_signature_valid_option_chosen/std": 0.496052622795105,
74
+ "step": 2
75
+ },
76
+ {
77
+ "clip_ratio/high_max": 0.0,
78
+ "clip_ratio/high_mean": 0.0,
79
+ "clip_ratio/low_mean": 0.0,
80
+ "clip_ratio/low_min": 0.0,
81
+ "clip_ratio/region_mean": 0.0,
82
+ "completions/clipped_ratio": -2.1375,
83
+ "completions/max_length": 1024.0,
84
+ "completions/max_terminated_length": 1020.0,
85
+ "completions/mean_length": 856.67919921875,
86
+ "completions/mean_terminated_length": 704.02392578125,
87
+ "completions/min_length": 42.0,
88
+ "completions/min_terminated_length": 42.0,
89
+ "epoch": 0.13186813186813187,
90
+ "frac_reward_zero_std": 0.0,
91
+ "grad_norm": 0.657415026599291,
92
+ "kl": 0.0005474090576171875,
93
+ "learning_rate": 6.666666666666666e-07,
94
+ "loss": 0.1174,
95
+ "num_tokens": 1584311.0,
96
+ "reward": 1.3697917461395264,
97
+ "reward_std": 1.0497500896453857,
98
+ "rewards/format_reward/mean": 0.0,
99
+ "rewards/format_reward/std": 0.0,
100
+ "rewards/tag_count_reward/mean": 0.6197916865348816,
101
+ "rewards/tag_count_reward/std": 0.327057421207428,
102
+ "rewards/tcga_signature_exact_answer_match/mean": 0.2979166805744171,
103
+ "rewards/tcga_signature_exact_answer_match/std": 0.4578198194503784,
104
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.4520833194255829,
105
+ "rewards/tcga_signature_valid_option_chosen/std": 0.49821794033050537,
106
+ "step": 3
107
+ },
108
+ {
109
+ "clip_ratio/high_max": 0.0,
110
+ "clip_ratio/high_mean": 0.0,
111
+ "clip_ratio/low_mean": 0.0,
112
+ "clip_ratio/low_min": 0.0,
113
+ "clip_ratio/region_mean": 0.0,
114
+ "completions/clipped_ratio": -3.375,
115
+ "completions/max_length": 1024.0,
116
+ "completions/max_terminated_length": 1023.0,
117
+ "completions/mean_length": 758.464599609375,
118
+ "completions/mean_terminated_length": 659.837158203125,
119
+ "completions/min_length": 257.0,
120
+ "completions/min_terminated_length": 257.0,
121
+ "epoch": 0.17582417582417584,
122
+ "frac_reward_zero_std": 0.0,
123
+ "grad_norm": 0.6226173582925104,
124
+ "kl": 0.0006909370422363281,
125
+ "learning_rate": 1e-06,
126
+ "loss": 0.1216,
127
+ "num_tokens": 2075024.0,
128
+ "reward": 1.870833396911621,
129
+ "reward_std": 1.1165072917938232,
130
+ "rewards/format_reward/mean": 0.0,
131
+ "rewards/format_reward/std": 0.0,
132
+ "rewards/tag_count_reward/mean": 0.7270833253860474,
133
+ "rewards/tag_count_reward/std": 0.3073488175868988,
134
+ "rewards/tcga_signature_exact_answer_match/mean": 0.5291666388511658,
135
+ "rewards/tcga_signature_exact_answer_match/std": 0.4996693730354309,
136
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.6145833134651184,
137
+ "rewards/tcga_signature_valid_option_chosen/std": 0.48720136284828186,
138
+ "step": 4
139
+ },
140
+ {
141
+ "clip_ratio/high_max": 0.0,
142
+ "clip_ratio/high_mean": 0.0,
143
+ "clip_ratio/low_mean": 0.0,
144
+ "clip_ratio/low_min": 0.0,
145
+ "clip_ratio/region_mean": 0.0,
146
+ "completions/clipped_ratio": -4.75,
147
+ "completions/max_length": 1024.0,
148
+ "completions/max_terminated_length": 1014.0,
149
+ "completions/mean_length": 609.1812744140625,
150
+ "completions/mean_terminated_length": 591.1456298828125,
151
+ "completions/min_length": 296.0,
152
+ "completions/min_terminated_length": 296.0,
153
+ "epoch": 0.21978021978021978,
154
+ "frac_reward_zero_std": 0.0,
155
+ "grad_norm": 0.5021585269149018,
156
+ "kl": 0.0020532608032226562,
157
+ "learning_rate": 9.938441702975689e-07,
158
+ "loss": 0.0573,
159
+ "num_tokens": 2493761.0,
160
+ "reward": 2.1901042461395264,
161
+ "reward_std": 0.8987299203872681,
162
+ "rewards/format_reward/mean": 0.0,
163
+ "rewards/format_reward/std": 0.0,
164
+ "rewards/tag_count_reward/mean": 0.8401041626930237,
165
+ "rewards/tag_count_reward/std": 0.22692228853702545,
166
+ "rewards/tcga_signature_exact_answer_match/mean": 0.5354166626930237,
167
+ "rewards/tcga_signature_exact_answer_match/std": 0.4992644190788269,
168
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.8145833611488342,
169
+ "rewards/tcga_signature_valid_option_chosen/std": 0.38904064893722534,
170
+ "step": 5
171
+ },
172
+ {
173
+ "clip_ratio/high_max": 0.0,
174
+ "clip_ratio/high_mean": 0.0,
175
+ "clip_ratio/low_mean": 0.0,
176
+ "clip_ratio/low_min": 0.0,
177
+ "clip_ratio/region_mean": 0.0,
178
+ "completions/clipped_ratio": -4.9125,
179
+ "completions/max_length": 1024.0,
180
+ "completions/max_terminated_length": 981.0,
181
+ "completions/mean_length": 575.1917114257812,
182
+ "completions/mean_terminated_length": 568.5496826171875,
183
+ "completions/min_length": 253.0,
184
+ "completions/min_terminated_length": 253.0,
185
+ "epoch": 0.26373626373626374,
186
+ "frac_reward_zero_std": 0.0,
187
+ "grad_norm": 0.4390970519818153,
188
+ "kl": 0.0027971267700195312,
189
+ "learning_rate": 9.755282581475767e-07,
190
+ "loss": 0.0435,
191
+ "num_tokens": 2896323.0,
192
+ "reward": 2.367187738418579,
193
+ "reward_std": 0.7412388324737549,
194
+ "rewards/format_reward/mean": 0.0,
195
+ "rewards/format_reward/std": 0.0,
196
+ "rewards/tag_count_reward/mean": 0.8734375238418579,
197
+ "rewards/tag_count_reward/std": 0.1904495805501938,
198
+ "rewards/tcga_signature_exact_answer_match/mean": 0.612500011920929,
199
+ "rewards/tcga_signature_exact_answer_match/std": 0.4876876473426819,
200
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.8812500238418579,
201
+ "rewards/tcga_signature_valid_option_chosen/std": 0.32383161783218384,
202
+ "step": 6
203
+ },
204
+ {
205
+ "clip_ratio/high_max": 0.0,
206
+ "clip_ratio/high_mean": 0.0,
207
+ "clip_ratio/low_mean": 0.0,
208
+ "clip_ratio/low_min": 0.0,
209
+ "clip_ratio/region_mean": 0.0,
210
+ "completions/clipped_ratio": -4.9375,
211
+ "completions/max_length": 1024.0,
212
+ "completions/max_terminated_length": 971.0,
213
+ "completions/mean_length": 558.78125,
214
+ "completions/mean_terminated_length": 553.8841552734375,
215
+ "completions/min_length": 307.0,
216
+ "completions/min_terminated_length": 307.0,
217
+ "epoch": 0.3076923076923077,
218
+ "frac_reward_zero_std": 0.0,
219
+ "grad_norm": 0.3987193318925216,
220
+ "kl": 0.0035257339477539062,
221
+ "learning_rate": 9.455032620941839e-07,
222
+ "loss": 0.0355,
223
+ "num_tokens": 3290878.0,
224
+ "reward": 2.4875001907348633,
225
+ "reward_std": 0.7044431567192078,
226
+ "rewards/format_reward/mean": 0.0,
227
+ "rewards/format_reward/std": 0.0,
228
+ "rewards/tag_count_reward/mean": 0.887499988079071,
229
+ "rewards/tag_count_reward/std": 0.17945055663585663,
230
+ "rewards/tcga_signature_exact_answer_match/mean": 0.6916666626930237,
231
+ "rewards/tcga_signature_exact_answer_match/std": 0.4622868597507477,
232
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9083333611488342,
233
+ "rewards/tcga_signature_valid_option_chosen/std": 0.2888558804988861,
234
+ "step": 7
235
+ },
236
+ {
237
+ "clip_ratio/high_max": 0.0,
238
+ "clip_ratio/high_mean": 0.0,
239
+ "clip_ratio/low_mean": 0.0,
240
+ "clip_ratio/low_min": 0.0,
241
+ "clip_ratio/region_mean": 0.0,
242
+ "completions/clipped_ratio": -4.975,
243
+ "completions/max_length": 1024.0,
244
+ "completions/max_terminated_length": 941.0,
245
+ "completions/mean_length": 540.3146362304688,
246
+ "completions/mean_terminated_length": 538.290771484375,
247
+ "completions/min_length": 275.0,
248
+ "completions/min_terminated_length": 275.0,
249
+ "epoch": 0.3516483516483517,
250
+ "frac_reward_zero_std": 0.0,
251
+ "grad_norm": 0.41592371093547503,
252
+ "kl": 0.0023975372314453125,
253
+ "learning_rate": 9.045084971874737e-07,
254
+ "loss": 0.0333,
255
+ "num_tokens": 3676869.0,
256
+ "reward": 2.5500001907348633,
257
+ "reward_std": 0.5402681231498718,
258
+ "rewards/format_reward/mean": 0.0,
259
+ "rewards/format_reward/std": 0.0,
260
+ "rewards/tag_count_reward/mean": 0.9166666865348816,
261
+ "rewards/tag_count_reward/std": 0.14658012986183167,
262
+ "rewards/tcga_signature_exact_answer_match/mean": 0.6791666746139526,
263
+ "rewards/tcga_signature_exact_answer_match/std": 0.4672839045524597,
264
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9541666507720947,
265
+ "rewards/tcga_signature_valid_option_chosen/std": 0.20934168994426727,
266
+ "step": 8
267
+ },
268
+ {
269
+ "clip_ratio/high_max": 0.0,
270
+ "clip_ratio/high_mean": 0.0,
271
+ "clip_ratio/low_mean": 0.0,
272
+ "clip_ratio/low_min": 0.0,
273
+ "clip_ratio/region_mean": 0.0,
274
+ "completions/clipped_ratio": -5.0,
275
+ "completions/max_length": 916.0,
276
+ "completions/max_terminated_length": 916.0,
277
+ "completions/mean_length": 533.5396118164062,
278
+ "completions/mean_terminated_length": 533.5396118164062,
279
+ "completions/min_length": 245.0,
280
+ "completions/min_terminated_length": 245.0,
281
+ "epoch": 0.3956043956043956,
282
+ "frac_reward_zero_std": 0.0,
283
+ "grad_norm": 0.38857156013829486,
284
+ "kl": 0.002452850341796875,
285
+ "learning_rate": 8.535533905932737e-07,
286
+ "loss": 0.0216,
287
+ "num_tokens": 4059548.0,
288
+ "reward": 2.6036460399627686,
289
+ "reward_std": 0.43764111399650574,
290
+ "rewards/format_reward/mean": 0.0,
291
+ "rewards/format_reward/std": 0.0,
292
+ "rewards/tag_count_reward/mean": 0.9307291507720947,
293
+ "rewards/tag_count_reward/std": 0.1293087899684906,
294
+ "rewards/tcga_signature_exact_answer_match/mean": 0.699999988079071,
295
+ "rewards/tcga_signature_exact_answer_match/std": 0.4587356746196747,
296
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9729166626930237,
297
+ "rewards/tcga_signature_valid_option_chosen/std": 0.1624956578016281,
298
+ "step": 9
299
+ },
300
+ {
301
+ "clip_ratio/high_max": 0.0,
302
+ "clip_ratio/high_mean": 0.0,
303
+ "clip_ratio/low_mean": 0.0,
304
+ "clip_ratio/low_min": 0.0,
305
+ "clip_ratio/region_mean": 0.0,
306
+ "completions/clipped_ratio": -5.0,
307
+ "completions/max_length": 969.0,
308
+ "completions/max_terminated_length": 969.0,
309
+ "completions/mean_length": 530.3687744140625,
310
+ "completions/mean_terminated_length": 530.3687744140625,
311
+ "completions/min_length": 287.0,
312
+ "completions/min_terminated_length": 287.0,
313
+ "epoch": 0.43956043956043955,
314
+ "frac_reward_zero_std": 0.0416666679084301,
315
+ "grad_norm": 0.4087564887728117,
316
+ "kl": 0.0025196075439453125,
317
+ "learning_rate": 7.938926261462365e-07,
318
+ "loss": 0.0217,
319
+ "num_tokens": 4440775.0,
320
+ "reward": 2.577604293823242,
321
+ "reward_std": 0.4898071587085724,
322
+ "rewards/format_reward/mean": 0.0,
323
+ "rewards/format_reward/std": 0.0,
324
+ "rewards/tag_count_reward/mean": 0.9151041507720947,
325
+ "rewards/tag_count_reward/std": 0.14250953495502472,
326
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7124999761581421,
327
+ "rewards/tcga_signature_exact_answer_match/std": 0.4530688524246216,
328
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.949999988079071,
329
+ "rewards/tcga_signature_valid_option_chosen/std": 0.2181723266839981,
330
+ "step": 10
331
+ },
332
+ {
333
+ "clip_ratio/high_max": 0.0,
334
+ "clip_ratio/high_mean": 0.0,
335
+ "clip_ratio/low_mean": 0.0,
336
+ "clip_ratio/low_min": 0.0,
337
+ "clip_ratio/region_mean": 0.0,
338
+ "completions/clipped_ratio": -5.0,
339
+ "completions/max_length": 913.0,
340
+ "completions/max_terminated_length": 913.0,
341
+ "completions/mean_length": 532.0667114257812,
342
+ "completions/mean_terminated_length": 532.0667114257812,
343
+ "completions/min_length": 132.0,
344
+ "completions/min_terminated_length": 132.0,
345
+ "epoch": 0.4835164835164835,
346
+ "frac_reward_zero_std": 0.02083333395421505,
347
+ "grad_norm": 0.3899590355782979,
348
+ "kl": 0.0027141571044921875,
349
+ "learning_rate": 7.269952498697734e-07,
350
+ "loss": 0.0213,
351
+ "num_tokens": 4822787.0,
352
+ "reward": 2.632812738418579,
353
+ "reward_std": 0.42536818981170654,
354
+ "rewards/format_reward/mean": 0.0,
355
+ "rewards/format_reward/std": 0.0,
356
+ "rewards/tag_count_reward/mean": 0.9348958134651184,
357
+ "rewards/tag_count_reward/std": 0.13046404719352722,
358
+ "rewards/tcga_signature_exact_answer_match/mean": 0.71875,
359
+ "rewards/tcga_signature_exact_answer_match/std": 0.4500782787799835,
360
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9791666865348816,
361
+ "rewards/tcga_signature_valid_option_chosen/std": 0.14297515153884888,
362
+ "step": 11
363
+ },
364
+ {
365
+ "clip_ratio/high_max": 0.0,
366
+ "clip_ratio/high_mean": 0.0,
367
+ "clip_ratio/low_mean": 0.0,
368
+ "clip_ratio/low_min": 0.0,
369
+ "clip_ratio/region_mean": 0.0,
370
+ "completions/clipped_ratio": -4.9875,
371
+ "completions/max_length": 1024.0,
372
+ "completions/max_terminated_length": 975.0,
373
+ "completions/mean_length": 518.9229736328125,
374
+ "completions/mean_terminated_length": 517.8684692382812,
375
+ "completions/min_length": 252.0,
376
+ "completions/min_terminated_length": 252.0,
377
+ "epoch": 0.5274725274725275,
378
+ "frac_reward_zero_std": 0.0416666679084301,
379
+ "grad_norm": 0.38564823683826566,
380
+ "kl": 0.0028743743896484375,
381
+ "learning_rate": 6.545084971874736e-07,
382
+ "loss": 0.0193,
383
+ "num_tokens": 5198460.0,
384
+ "reward": 2.6432292461395264,
385
+ "reward_std": 0.4345623850822449,
386
+ "rewards/format_reward/mean": 0.0,
387
+ "rewards/format_reward/std": 0.0,
388
+ "rewards/tag_count_reward/mean": 0.9536458253860474,
389
+ "rewards/tag_count_reward/std": 0.11451195180416107,
390
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7104166746139526,
391
+ "rewards/tcga_signature_exact_answer_match/std": 0.4540421664714813,
392
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9791666865348816,
393
+ "rewards/tcga_signature_valid_option_chosen/std": 0.14297515153884888,
394
+ "step": 12
395
+ },
396
+ {
397
+ "clip_ratio/high_max": 0.0,
398
+ "clip_ratio/high_mean": 0.0,
399
+ "clip_ratio/low_mean": 0.0,
400
+ "clip_ratio/low_min": 0.0,
401
+ "clip_ratio/region_mean": 0.0,
402
+ "completions/clipped_ratio": -5.0,
403
+ "completions/max_length": 1000.0,
404
+ "completions/max_terminated_length": 1000.0,
405
+ "completions/mean_length": 512.902099609375,
406
+ "completions/mean_terminated_length": 512.902099609375,
407
+ "completions/min_length": 270.0,
408
+ "completions/min_terminated_length": 270.0,
409
+ "epoch": 0.5714285714285714,
410
+ "frac_reward_zero_std": 0.0833333358168602,
411
+ "grad_norm": 0.3731024809399284,
412
+ "kl": 0.0052433013916015625,
413
+ "learning_rate": 5.782172325201155e-07,
414
+ "loss": 0.0242,
415
+ "num_tokens": 5571373.0,
416
+ "reward": 2.7375001907348633,
417
+ "reward_std": 0.29626527428627014,
418
+ "rewards/format_reward/mean": 0.0,
419
+ "rewards/format_reward/std": 0.0,
420
+ "rewards/tag_count_reward/mean": 0.9666666388511658,
421
+ "rewards/tag_count_reward/std": 0.08955546468496323,
422
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7770833373069763,
423
+ "rewards/tcga_signature_exact_answer_match/std": 0.4166370928287506,
424
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9937499761581421,
425
+ "rewards/tcga_signature_valid_option_chosen/std": 0.07889172434806824,
426
+ "step": 13
427
+ },
428
+ {
429
+ "clip_ratio/high_max": 0.0,
430
+ "clip_ratio/high_mean": 0.0,
431
+ "clip_ratio/low_mean": 0.0,
432
+ "clip_ratio/low_min": 0.0,
433
+ "clip_ratio/region_mean": 0.0,
434
+ "completions/clipped_ratio": -5.0,
435
+ "completions/max_length": 891.0,
436
+ "completions/max_terminated_length": 891.0,
437
+ "completions/mean_length": 502.7437744140625,
438
+ "completions/mean_terminated_length": 502.7437744140625,
439
+ "completions/min_length": 234.0,
440
+ "completions/min_terminated_length": 234.0,
441
+ "epoch": 0.6153846153846154,
442
+ "frac_reward_zero_std": 0.1875,
443
+ "grad_norm": 0.3562351970946724,
444
+ "kl": 0.0042247772216796875,
445
+ "learning_rate": 5e-07,
446
+ "loss": 0.0107,
447
+ "num_tokens": 5939420.0,
448
+ "reward": 2.711979389190674,
449
+ "reward_std": 0.3224673867225647,
450
+ "rewards/format_reward/mean": 0.0,
451
+ "rewards/format_reward/std": 0.0,
452
+ "rewards/tag_count_reward/mean": 0.9786458611488342,
453
+ "rewards/tag_count_reward/std": 0.07178923487663269,
454
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7354166507720947,
455
+ "rewards/tcga_signature_exact_answer_match/std": 0.44157129526138306,
456
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9979166388511658,
457
+ "rewards/tcga_signature_valid_option_chosen/std": 0.04564354941248894,
458
+ "step": 14
459
+ },
460
+ {
461
+ "clip_ratio/high_max": 0.0,
462
+ "clip_ratio/high_mean": 0.0,
463
+ "clip_ratio/low_mean": 0.0,
464
+ "clip_ratio/low_min": 0.0,
465
+ "clip_ratio/region_mean": 0.0,
466
+ "completions/clipped_ratio": -4.9875,
467
+ "completions/max_length": 1024.0,
468
+ "completions/max_terminated_length": 855.0,
469
+ "completions/mean_length": 492.2833557128906,
470
+ "completions/mean_terminated_length": 491.17327880859375,
471
+ "completions/min_length": 266.0,
472
+ "completions/min_terminated_length": 266.0,
473
+ "epoch": 0.6593406593406593,
474
+ "frac_reward_zero_std": 0.2291666716337204,
475
+ "grad_norm": 0.3311246905201182,
476
+ "kl": 0.004230499267578125,
477
+ "learning_rate": 4.2178276747988444e-07,
478
+ "loss": 0.0218,
479
+ "num_tokens": 6302216.0,
480
+ "reward": 2.7687501907348633,
481
+ "reward_std": 0.24975202977657318,
482
+ "rewards/format_reward/mean": 0.0,
483
+ "rewards/format_reward/std": 0.0,
484
+ "rewards/tag_count_reward/mean": 0.9854166507720947,
485
+ "rewards/tag_count_reward/std": 0.0649862140417099,
486
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7854166626930237,
487
+ "rewards/tcga_signature_exact_answer_match/std": 0.4109613001346588,
488
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9979166388511658,
489
+ "rewards/tcga_signature_valid_option_chosen/std": 0.04564354941248894,
490
+ "step": 15
491
+ },
492
+ {
493
+ "clip_ratio/high_max": 0.0,
494
+ "clip_ratio/high_mean": 0.0,
495
+ "clip_ratio/low_mean": 0.0,
496
+ "clip_ratio/low_min": 0.0,
497
+ "clip_ratio/region_mean": 0.0,
498
+ "completions/clipped_ratio": -5.0,
499
+ "completions/max_length": 1005.0,
500
+ "completions/max_terminated_length": 1005.0,
501
+ "completions/mean_length": 485.6562805175781,
502
+ "completions/mean_terminated_length": 485.6562805175781,
503
+ "completions/min_length": 242.0,
504
+ "completions/min_terminated_length": 242.0,
505
+ "epoch": 0.7032967032967034,
506
+ "frac_reward_zero_std": 0.2708333432674408,
507
+ "grad_norm": 11.132264796660943,
508
+ "kl": 0.009204864501953125,
509
+ "learning_rate": 3.454915028125263e-07,
510
+ "loss": 0.021,
511
+ "num_tokens": 6661731.0,
512
+ "reward": 2.772916793823242,
513
+ "reward_std": 0.23537729680538177,
514
+ "rewards/format_reward/mean": 0.0,
515
+ "rewards/format_reward/std": 0.0,
516
+ "rewards/tag_count_reward/mean": 0.9833333492279053,
517
+ "rewards/tag_count_reward/std": 0.06840971112251282,
518
+ "rewards/tcga_signature_exact_answer_match/mean": 0.793749988079071,
519
+ "rewards/tcga_signature_exact_answer_match/std": 0.40503421425819397,
520
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9958333373069763,
521
+ "rewards/tcga_signature_valid_option_chosen/std": 0.06448230892419815,
522
+ "step": 16
523
+ },
524
+ {
525
+ "clip_ratio/high_max": 0.0,
526
+ "clip_ratio/high_mean": 0.0,
527
+ "clip_ratio/low_mean": 0.0,
528
+ "clip_ratio/low_min": 0.0,
529
+ "clip_ratio/region_mean": 0.0,
530
+ "completions/clipped_ratio": -4.9875,
531
+ "completions/max_length": 1024.0,
532
+ "completions/max_terminated_length": 792.0,
533
+ "completions/mean_length": 487.0604553222656,
534
+ "completions/mean_terminated_length": 485.939453125,
535
+ "completions/min_length": 55.0,
536
+ "completions/min_terminated_length": 55.0,
537
+ "epoch": 0.7472527472527473,
538
+ "frac_reward_zero_std": 0.3125,
539
+ "grad_norm": 0.30658081196607834,
540
+ "kl": 0.00519561767578125,
541
+ "learning_rate": 2.730047501302266e-07,
542
+ "loss": 0.0184,
543
+ "num_tokens": 7022260.0,
544
+ "reward": 2.8072917461395264,
545
+ "reward_std": 0.26266491413116455,
546
+ "rewards/format_reward/mean": 0.0,
547
+ "rewards/format_reward/std": 0.0,
548
+ "rewards/tag_count_reward/mean": 0.9864583611488342,
549
+ "rewards/tag_count_reward/std": 0.0652117058634758,
550
+ "rewards/tcga_signature_exact_answer_match/mean": 0.824999988079071,
551
+ "rewards/tcga_signature_exact_answer_match/std": 0.3803635537624359,
552
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9958333373069763,
553
+ "rewards/tcga_signature_valid_option_chosen/std": 0.06448230892419815,
554
+ "step": 17
555
+ },
556
+ {
557
+ "clip_ratio/high_max": 0.0,
558
+ "clip_ratio/high_mean": 0.0,
559
+ "clip_ratio/low_mean": 0.0,
560
+ "clip_ratio/low_min": 0.0,
561
+ "clip_ratio/region_mean": 0.0,
562
+ "completions/clipped_ratio": -4.9875,
563
+ "completions/max_length": 1024.0,
564
+ "completions/max_terminated_length": 794.0,
565
+ "completions/mean_length": 482.32293701171875,
566
+ "completions/mean_terminated_length": 481.19207763671875,
567
+ "completions/min_length": 242.0,
568
+ "completions/min_terminated_length": 242.0,
569
+ "epoch": 0.7912087912087912,
570
+ "frac_reward_zero_std": 0.3333333432674408,
571
+ "grad_norm": 0.32854884329247835,
572
+ "kl": 0.0054168701171875,
573
+ "learning_rate": 2.0610737385376348e-07,
574
+ "loss": 0.0117,
575
+ "num_tokens": 7380055.0,
576
+ "reward": 2.7911460399627686,
577
+ "reward_std": 0.2634448707103729,
578
+ "rewards/format_reward/mean": 0.0,
579
+ "rewards/format_reward/std": 0.0,
580
+ "rewards/tag_count_reward/mean": 0.9911458492279053,
581
+ "rewards/tag_count_reward/std": 0.05406000465154648,
582
+ "rewards/tcga_signature_exact_answer_match/mean": 0.8041666746139526,
583
+ "rewards/tcga_signature_exact_answer_match/std": 0.3972548544406891,
584
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9958333373069763,
585
+ "rewards/tcga_signature_valid_option_chosen/std": 0.06448230892419815,
586
+ "step": 18
587
+ },
588
+ {
589
+ "clip_ratio/high_max": 0.0,
590
+ "clip_ratio/high_mean": 0.0,
591
+ "clip_ratio/low_mean": 0.0,
592
+ "clip_ratio/low_min": 0.0,
593
+ "clip_ratio/region_mean": 0.0,
594
+ "completions/clipped_ratio": -4.9875,
595
+ "completions/max_length": 1024.0,
596
+ "completions/max_terminated_length": 895.0,
597
+ "completions/mean_length": 478.82086181640625,
598
+ "completions/mean_terminated_length": 477.68267822265625,
599
+ "completions/min_length": 249.0,
600
+ "completions/min_terminated_length": 249.0,
601
+ "epoch": 0.8351648351648352,
602
+ "frac_reward_zero_std": 0.2083333432674408,
603
+ "grad_norm": 0.3345776464811846,
604
+ "kl": 0.00551605224609375,
605
+ "learning_rate": 1.4644660940672627e-07,
606
+ "loss": 0.0201,
607
+ "num_tokens": 7736379.0,
608
+ "reward": 2.7708334922790527,
609
+ "reward_std": 0.2900172472000122,
610
+ "rewards/format_reward/mean": 0.0,
611
+ "rewards/format_reward/std": 0.0,
612
+ "rewards/tag_count_reward/mean": 0.987500011920929,
613
+ "rewards/tag_count_reward/std": 0.061301134526729584,
614
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7854166626930237,
615
+ "rewards/tcga_signature_exact_answer_match/std": 0.4109613001346588,
616
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9979166388511658,
617
+ "rewards/tcga_signature_valid_option_chosen/std": 0.04564354941248894,
618
+ "step": 19
619
+ },
620
+ {
621
+ "clip_ratio/high_max": 0.0,
622
+ "clip_ratio/high_mean": 0.0,
623
+ "clip_ratio/low_mean": 0.0,
624
+ "clip_ratio/low_min": 0.0,
625
+ "clip_ratio/region_mean": 0.0,
626
+ "completions/clipped_ratio": -4.975,
627
+ "completions/max_length": 1024.0,
628
+ "completions/max_terminated_length": 839.0,
629
+ "completions/mean_length": 475.1896057128906,
630
+ "completions/mean_terminated_length": 472.8932800292969,
631
+ "completions/min_length": 58.0,
632
+ "completions/min_terminated_length": 58.0,
633
+ "epoch": 0.8791208791208791,
634
+ "frac_reward_zero_std": 0.2291666716337204,
635
+ "grad_norm": 0.33048908161588275,
636
+ "kl": 0.00527191162109375,
637
+ "learning_rate": 9.549150281252632e-08,
638
+ "loss": 0.013,
639
+ "num_tokens": 8091100.0,
640
+ "reward": 2.766145944595337,
641
+ "reward_std": 0.28794413805007935,
642
+ "rewards/format_reward/mean": 0.0,
643
+ "rewards/format_reward/std": 0.0,
644
+ "rewards/tag_count_reward/mean": 0.9911458492279053,
645
+ "rewards/tag_count_reward/std": 0.05642201751470566,
646
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7791666388511658,
647
+ "rewards/tcga_signature_exact_answer_match/std": 0.4152411222457886,
648
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9958333373069763,
649
+ "rewards/tcga_signature_valid_option_chosen/std": 0.06448230892419815,
650
+ "step": 20
651
+ },
652
+ {
653
+ "clip_ratio/high_max": 0.0,
654
+ "clip_ratio/high_mean": 0.0,
655
+ "clip_ratio/low_mean": 0.0,
656
+ "clip_ratio/low_min": 0.0,
657
+ "clip_ratio/region_mean": 0.0,
658
+ "completions/clipped_ratio": -5.0,
659
+ "completions/max_length": 885.0,
660
+ "completions/max_terminated_length": 885.0,
661
+ "completions/mean_length": 480.01043701171875,
662
+ "completions/mean_terminated_length": 480.01043701171875,
663
+ "completions/min_length": 266.0,
664
+ "completions/min_terminated_length": 266.0,
665
+ "epoch": 0.9230769230769231,
666
+ "frac_reward_zero_std": 0.2291666716337204,
667
+ "grad_norm": 0.34162729701029476,
668
+ "kl": 0.005542755126953125,
669
+ "learning_rate": 5.44967379058161e-08,
670
+ "loss": 0.0061,
671
+ "num_tokens": 8448065.0,
672
+ "reward": 2.781250238418579,
673
+ "reward_std": 0.3177189528942108,
674
+ "rewards/format_reward/mean": 0.0,
675
+ "rewards/format_reward/std": 0.0,
676
+ "rewards/tag_count_reward/mean": 0.9895833134651184,
677
+ "rewards/tag_count_reward/std": 0.05000869929790497,
678
+ "rewards/tcga_signature_exact_answer_match/mean": 0.7958333492279053,
679
+ "rewards/tcga_signature_exact_answer_match/std": 0.4035119116306305,
680
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9958333373069763,
681
+ "rewards/tcga_signature_valid_option_chosen/std": 0.06448230892419815,
682
+ "step": 21
683
+ },
684
+ {
685
+ "clip_ratio/high_max": 0.0,
686
+ "clip_ratio/high_mean": 0.0,
687
+ "clip_ratio/low_mean": 0.0,
688
+ "clip_ratio/low_min": 0.0,
689
+ "clip_ratio/region_mean": 0.0,
690
+ "completions/clipped_ratio": -5.0,
691
+ "completions/max_length": 803.0,
692
+ "completions/max_terminated_length": 803.0,
693
+ "completions/mean_length": 477.56878662109375,
694
+ "completions/mean_terminated_length": 477.56878662109375,
695
+ "completions/min_length": 247.0,
696
+ "completions/min_terminated_length": 247.0,
697
+ "epoch": 0.967032967032967,
698
+ "frac_reward_zero_std": 0.3541666865348816,
699
+ "grad_norm": 0.3120035431461661,
700
+ "kl": 0.00568389892578125,
701
+ "learning_rate": 2.4471741852423233e-08,
702
+ "loss": 0.0134,
703
+ "num_tokens": 8803698.0,
704
+ "reward": 2.8156254291534424,
705
+ "reward_std": 0.26590150594711304,
706
+ "rewards/format_reward/mean": 0.0,
707
+ "rewards/format_reward/std": 0.0,
708
+ "rewards/tag_count_reward/mean": 0.9947916865348816,
709
+ "rewards/tag_count_reward/std": 0.0453927218914032,
710
+ "rewards/tcga_signature_exact_answer_match/mean": 0.8270833492279053,
711
+ "rewards/tcga_signature_exact_answer_match/std": 0.3785697817802429,
712
+ "rewards/tcga_signature_valid_option_chosen/mean": 0.9937499761581421,
713
+ "rewards/tcga_signature_valid_option_chosen/std": 0.07889172434806824,
714
+ "step": 22
715
+ },
716
+ {
717
+ "epoch": 0.967032967032967,
718
+ "step": 22,
719
+ "total_flos": 0.0,
720
+ "train_loss": 0.04093834859403697,
721
+ "train_runtime": 1244.7225,
722
+ "train_samples_per_second": 0.877,
723
+ "train_steps_per_second": 0.018
724
+ }
725
+ ],
726
+ "logging_steps": 1,
727
+ "max_steps": 23,
728
+ "num_input_tokens_seen": 8803698,
729
+ "num_train_epochs": 1,
730
+ "save_steps": 500,
731
+ "stateful_callbacks": {
732
+ "TrainerControl": {
733
+ "args": {
734
+ "should_epoch_stop": false,
735
+ "should_evaluate": false,
736
+ "should_log": false,
737
+ "should_save": true,
738
+ "should_training_stop": false
739
+ },
740
+ "attributes": {}
741
+ }
742
+ },
743
+ "total_flos": 0.0,
744
+ "train_batch_size": 10,
745
+ "trial_name": null,
746
+ "trial_params": null
747
+ }