rulixiang committed
Commit d7f59f0 · 1 Parent(s): 6111f96
.gitattributes CHANGED
@@ -28,6 +28,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.tar filter=lfs diff=lfs merge=lfs -text
  *.tflite filter=lfs diff=lfs merge=lfs -text
  *.tgz filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
  *.wasm filter=lfs diff=lfs merge=lfs -text
  *.xz filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|AUDIO|>": 151646,
+ "<|IMAGE|>": 151655,
+ "<|VIDEO|>": 151656,
+ "<|audio_bos|>": 151647,
+ "<|audio_eos|>": 151648,
+ "<|box_end|>": 151649,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|vision_bos|>": 151652,
+ "<|vision_eos|>": 151653,
+ "<|vision_pad|>": 151654
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
+ {% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+ You are a helpful assistant.<|im_end|>
+ {% endif %}<|im_start|>{{ message['role'] }}
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+ {% endif %}
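The file above is the multimodal chat template the processor applies to a conversation. A minimal rendering sketch (assuming a transformers release that ships Qwen2_5OmniProcessor, that "./" points at this checkpoint directory, and that the file name clip.wav is only an illustration):

```python
from transformers import Qwen2_5OmniProcessor

# Loads chat_template.jinja together with the tokenizer/preprocessor files in this commit.
processor = Qwen2_5OmniProcessor.from_pretrained("./")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "audio": "clip.wav"},                  # rendered as <|audio_bos|><|AUDIO|><|audio_eos|>
            {"type": "text", "text": "What is said in this clip?"},  # rendered verbatim
        ],
    }
]

# tokenize=False returns the prompt string produced by the template, including the
# default system message and the trailing assistant header.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
print(prompt)
```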
config.json ADDED
@@ -0,0 +1,573 @@
+ {
+ "architectures": [
+ "Qwen2_5OmniForConditionalGeneration"
+ ],
+ "enable_audio_output": true,
+ "enable_talker": true,
+ "model_type": "qwen2_5_omni",
+ "talker_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "Qwen2.5-Omni-7B/talker",
+ "architectures": [
+ "Qwen2OmniTalkerForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "audio_end_token_id": 151648,
+ "audio_start_token_id": 151647,
+ "audio_token_index": 151646,
+ "embedding_size": 3584,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 896,
+ "image_token_index": 151655,
+ "init_std": 0.02,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_type": "qwen2_5_omni_talker",
+ "num_attention_heads": 12,
+ "num_hidden_layers": 24,
+ "num_key_value_heads": 4,
+ "position_id_per_seconds": 25,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 1000000.0,
+ "seconds_per_chunk": 2,
+ "sliding_window": 32768,
+ "spatial_merge_size": 2,
+ "torch_dtype": "float32",
+ "tts_codec_end_token_id": 8294,
+ "tts_codec_mask_token_id": 8296,
+ "tts_codec_pad_token_id": 8292,
+ "tts_codec_start_token_id": 8293,
+ "tts_text_end_token_id": 151861,
+ "tts_text_pad_token_id": 151859,
+ "tts_text_start_token_id": 151860,
+ "use_cache": true,
+ "use_sliding_window": false,
+ "video_token_index": 151656,
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vocab_size": 8448
+ },
+ "thinker_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "Qwen2.5-Omni-7B/thinker",
+ "architectures": [
+ "Qwen2OmniNaViTThinkerForConditionalGeneration"
+ ],
+ "audio_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "",
+ "activation_dropout": 0.0,
+ "activation_function": "gelu",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "d_model": 1280,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout": 0.0,
+ "early_stopping": false,
+ "encoder_attention_heads": 20,
+ "encoder_ffn_dim": 5120,
+ "encoder_layerdrop": 0.0,
+ "encoder_layers": 32,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "init_std": 0.02,
+ "initializer_range": 0.02,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_source_positions": 1500,
+ "min_length": 0,
+ "model_type": "qwen2_5_omni_audio_encoder",
+ "n_window": 100,
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": 32,
+ "num_mel_bins": 128,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_dim": 3584,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "scale_embedding": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ },
+ "audio_end_token_id": 151648,
+ "audio_start_token_id": 151647,
+ "audio_token_index": 151646,
+ "bos_token_id": 151644,
+ "eos_token_id": 151645,
+ "ignore_index": -100,
+ "image_token_index": 151655,
+ "init_std": 0.02,
+ "initializer_range": 0.02,
+ "model_type": "qwen2_5_omni_thinker",
+ "pad_token_id": 151643,
+ "position_id_per_seconds": 25,
+ "seconds_per_chunk": 2,
+ "text_config": {
+ "_attn_implementation_autoset": false,
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "init_std": 0.02,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "min_length": 0,
+ "model_type": "qwen2_5_omni_text",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 28,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": {
+ "mrope_section": [
+ 16,
+ 24,
+ 24
+ ],
+ "rope_type": "default",
+ "type": "default"
+ },
+ "rope_theta": 1000000.0,
+ "sep_token_id": null,
+ "sliding_window": 32768,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ },
+ "torch_dtype": "float32",
+ "user_token_id": 872,
+ "video_token_index": 151656,
+ "vision_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "depth": 32,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "embed_dim": 1280,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "fullatt_block_indexes": [
+ 7,
+ 15,
+ 23,
+ 31
+ ],
+ "hidden_act": "silu",
+ "hidden_size": 1280,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "in_channels": 3,
+ "in_chans": 3,
+ "init_std": 0.02,
+ "initializer_range": 0.02,
+ "intermediate_size": 3420,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "qwen2_5_omni_vision_encoder",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_heads": 16,
+ "num_return_sequences": 1,
+ "out_hidden_size": 3584,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "patch_size": 14,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "spatial_merge_size": 2,
+ "spatial_patch_size": 14,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "temporal_patch_size": 2,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "tokens_per_second": 25,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "window_size": 112
+ },
+ "vision_end_token_id": 151653,
+ "vision_start_token_id": 151652,
+ "vision_token_id": 151654
+ },
+ "token2wav_config": {
+ "_attn_implementation_autoset": true,
+ "bigvgan_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "mel_dim": 80,
+ "min_length": 0,
+ "model_type": "qwen2_5_omni_bigvgan",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "resblock_dilation_sizes": [
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ],
+ [
+ 1,
+ 3,
+ 5
+ ]
+ ],
+ "resblock_kernel_sizes": [
+ 3,
+ 7,
+ 11
+ ],
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "upsample_initial_channel": 1536,
+ "upsample_kernel_sizes": [
+ 11,
+ 7,
+ 4,
+ 4,
+ 4,
+ 4
+ ],
+ "upsample_rates": [
+ 5,
+ 3,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "use_bfloat16": false,
+ "use_bias_at_final": false
+ },
+ "dit_config": {
+ "_attn_implementation_autoset": true,
+ "_name_or_path": "",
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "block_size": 24,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "depth": 22,
+ "dim": 1024,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "dropout": 0.1,
+ "early_stopping": false,
+ "emb_dim": 512,
+ "enc_attention_channels": 64,
+ "enc_channels": [
+ 256,
+ 256,
+ 256,
+ 256,
+ 768
+ ],
+ "enc_dilations": [
+ 1,
+ 2,
+ 3,
+ 4,
+ 1
+ ],
+ "enc_dim": 128,
+ "enc_emb_dim": 192,
+ "enc_global_context": true,
+ "enc_kernel_sizes": [
+ 5,
+ 3,
+ 3,
+ 3,
+ 1
+ ],
+ "enc_lin_neurons": 192,
+ "enc_res2net_scale": 2,
+ "enc_se_channels": 64,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "ff_mult": 2,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "head_dim": 64,
+ "heads": 16,
+ "hidden_size": 1024,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "look_ahead_layers": [
+ 10
+ ],
+ "look_backward_layers": [
+ 0,
+ 20
+ ],
+ "max_length": 20,
+ "max_position_embeddings": 32768,
+ "mel_dim": 80,
+ "min_length": 0,
+ "model_type": "qwen2_5_omni_dit",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 16,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_embeds": 8193,
+ "num_hidden_layers": 22,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repeats": 2,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "rope_theta": 10000.0,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": "float32",
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false
+ },
+ "model_type": "qwen2_5_omni_token2wav",
+ "torch_dtype": "float32"
+ },
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.52.0.dev0"
+ }
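The config above declares a Qwen2_5OmniForConditionalGeneration checkpoint with separate thinker, talker and token2wav sub-configs. A minimal, hedged loading sketch (assuming transformers >= 4.52, the version recorded in the config, and assuming "./" is the directory holding this commit's files; device_map="auto" additionally assumes accelerate is installed):

```python
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

# Assumption: "./" contains config.json, the four model-0000*-of-00004.safetensors
# shards and the tokenizer/preprocessor files added in this commit.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "./",
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    device_map="auto",
)
processor = Qwen2_5OmniProcessor.from_pretrained("./")
```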
generation_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "_from_model_config": true,
+ "transformers_version": "4.52.0.dev0"
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa2866bd0f5b18079e4fbf6470659cc975fc73b6025afef81bd863658e130e68
+ size 4985055536
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf3ecae31a5699f2146af5d9cb4c59947a8395a4bc7f8e2b179b53f7567a5212
+ size 4991496832
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7675269df75c670fa6a63f71a091570028a2d969e4f1602f8d12abb0575c68be
+ size 4991496936
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c975b68f6fbb37fdb31f59e5c4bb2415be9a57f02fa5b8e6ec0acf7fc152b1cb
+ size 2895740064
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "chunk_length": 300,
+ "dither": 0.0,
+ "feature_extractor_type": "WhisperFeatureExtractor",
+ "feature_size": 128,
+ "hop_length": 160,
+ "image_mean": [
+ 0.48145466,
+ 0.4578275,
+ 0.40821073
+ ],
+ "image_processor_type": "Qwen2VLImageProcessor",
+ "image_std": [
+ 0.26862954,
+ 0.26130258,
+ 0.27577711
+ ],
+ "max_pixels": 12845056,
+ "merge_size": 2,
+ "min_pixels": 3136,
+ "n_fft": 400,
+ "n_samples": 4800000,
+ "nb_max_frames": 30000,
+ "padding_side": "right",
+ "padding_value": 0.0,
+ "patch_size": 14,
+ "processor_class": "Qwen2_5OmniProcessor",
+ "return_attention_mask": true,
+ "sampling_rate": 16000,
+ "temporal_patch_size": 2
+ }
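Per the file above, the audio side of the processor is a Whisper-style feature extractor (128 mel bins at 16 kHz). A small illustrative sketch (assuming "./" is this checkpoint directory; the one-second silent signal is made up for demonstration):

```python
import numpy as np
from transformers import WhisperFeatureExtractor

# Reads preprocessor_config.json; feature_extractor_type selects WhisperFeatureExtractor.
fe = WhisperFeatureExtractor.from_pretrained("./")

audio = np.zeros(16000, dtype=np.float32)  # 1 second of silence at the 16 kHz sampling rate
features = fe(audio, sampling_rate=16000, return_tensors="pt")
print(features["input_features"].shape)    # (1, 128, n_frames) log-mel features
```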
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|AUDIO|>",
+ "<|audio_bos|>",
+ "<|audio_eos|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_bos|>",
+ "<|vision_eos|>",
+ "<|vision_pad|>",
+ "<|IMAGE|>",
+ "<|VIDEO|>"
+ ],
+ "audio_bos_token": "<|audio_bos|>",
+ "audio_eos_token": "<|audio_eos|>",
+ "audio_token": "<|AUDIO|>",
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "image_token": "<|IMAGE|>",
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "video_token": "<|VIDEO|>",
+ "vision_bos_token": "<|vision_bos|>",
+ "vision_eos_token": "<|vision_eos|>"
+ }
spk_dict.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a05609b28f5d42b7b748f0f07592545c8f1f6885b9ae8fff64baf56e86b2a18
+ size 259544
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f9711e245647e88538786834977dc8afb51172e879ee661352c587cf01efd6b0
+ size 11422037
tokenizer_config.json ADDED
@@ -0,0 +1,226 @@
+ {
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|AUDIO|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|audio_bos|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|audio_eos|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_bos|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_eos|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|IMAGE|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|VIDEO|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|AUDIO|>",
+ "<|audio_bos|>",
+ "<|audio_eos|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_bos|>",
+ "<|vision_eos|>",
+ "<|vision_pad|>",
+ "<|IMAGE|>",
+ "<|VIDEO|>"
+ ],
+ "audio_bos_token": "<|audio_bos|>",
+ "audio_eos_token": "<|audio_eos|>",
+ "audio_token": "<|AUDIO|>",
+ "bos_token": null,
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {
+ "audio_bos_token": "<|audio_bos|>",
+ "audio_eos_token": "<|audio_eos|>",
+ "audio_token": "<|AUDIO|>",
+ "image_token": "<|IMAGE|>",
+ "video_token": "<|VIDEO|>",
+ "vision_bos_token": "<|vision_bos|>",
+ "vision_eos_token": "<|vision_eos|>"
+ },
+ "image_token": "<|IMAGE|>",
+ "max_length": null,
+ "model_max_length": 32768,
+ "pad_to_multiple_of": null,
+ "pad_token": "<|endoftext|>",
+ "pad_token_type_id": 0,
+ "padding_side": "left",
+ "processor_class": "Qwen2_5OmniProcessor",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null,
+ "video_token": "<|VIDEO|>",
+ "vision_bos_token": "<|vision_bos|>",
+ "vision_eos_token": "<|vision_eos|>"
+ }
trainer_state.json ADDED
@@ -0,0 +1,2449 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 161,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "advantages": 0.0,
14
+ "completion_length": 77.421875,
15
+ "epoch": 0.006211180124223602,
16
+ "grad_norm": 4.007043838500977,
17
+ "kl": 0.0,
18
+ "learning_rate": 9.937888198757763e-07,
19
+ "loss": -0.0,
20
+ "reward": 1.46875,
21
+ "reward_mean": 1.46875,
22
+ "reward_std": 0.23356688022613525,
23
+ "rewards/accuracy_reward": 0.46875,
24
+ "rewards/format_reward": 1.0,
25
+ "step": 1
26
+ },
27
+ {
28
+ "advantages": 1.30385160446167e-08,
29
+ "completion_length": 78.921875,
30
+ "epoch": 0.012422360248447204,
31
+ "grad_norm": 7.15122652053833,
32
+ "kl": 0.00041961669921875,
33
+ "learning_rate": 9.875776397515528e-07,
34
+ "loss": 0.0,
35
+ "reward": 1.53125,
36
+ "reward_mean": 1.53125,
37
+ "reward_std": 0.2845909595489502,
38
+ "rewards/accuracy_reward": 0.53125,
39
+ "rewards/format_reward": 1.0,
40
+ "step": 2
41
+ },
42
+ {
43
+ "advantages": 0.0,
44
+ "completion_length": 78.671875,
45
+ "epoch": 0.018633540372670808,
46
+ "grad_norm": 2.9171459674835205,
47
+ "kl": 0.000392913818359375,
48
+ "learning_rate": 9.813664596273291e-07,
49
+ "loss": 0.0,
50
+ "reward": 1.6875,
51
+ "reward_mean": 1.6875,
52
+ "reward_std": 0.1552036553621292,
53
+ "rewards/accuracy_reward": 0.6875,
54
+ "rewards/format_reward": 1.0,
55
+ "step": 3
56
+ },
57
+ {
58
+ "advantages": -1.862645149230957e-09,
59
+ "completion_length": 77.78125,
60
+ "epoch": 0.024844720496894408,
61
+ "grad_norm": 5.474589824676514,
62
+ "kl": 0.000759124755859375,
63
+ "learning_rate": 9.751552795031055e-07,
64
+ "loss": 0.0001,
65
+ "reward": 1.71875,
66
+ "reward_mean": 1.71875,
67
+ "reward_std": 0.213067427277565,
68
+ "rewards/accuracy_reward": 0.71875,
69
+ "rewards/format_reward": 1.0,
70
+ "step": 4
71
+ },
72
+ {
73
+ "advantages": -2.7939677238464355e-09,
74
+ "completion_length": 90.8125,
75
+ "epoch": 0.031055900621118012,
76
+ "grad_norm": 5.2480363845825195,
77
+ "kl": 0.001495361328125,
78
+ "learning_rate": 9.68944099378882e-07,
79
+ "loss": 0.0001,
80
+ "reward": 1.53125,
81
+ "reward_mean": 1.53125,
82
+ "reward_std": 0.17570313811302185,
83
+ "rewards/accuracy_reward": 0.53125,
84
+ "rewards/format_reward": 1.0,
85
+ "step": 5
86
+ },
87
+ {
88
+ "advantages": 6.51925802230835e-09,
89
+ "completion_length": 81.203125,
90
+ "epoch": 0.037267080745341616,
91
+ "grad_norm": 8.329508781433105,
92
+ "kl": 0.006256103515625,
93
+ "learning_rate": 9.627329192546583e-07,
94
+ "loss": 0.0006,
95
+ "reward": 1.375,
96
+ "reward_mean": 1.375,
97
+ "reward_std": 0.26409146189689636,
98
+ "rewards/accuracy_reward": 0.375,
99
+ "rewards/format_reward": 1.0,
100
+ "step": 6
101
+ },
102
+ {
103
+ "advantages": -3.725290298461914e-09,
104
+ "completion_length": 84.265625,
105
+ "epoch": 0.043478260869565216,
106
+ "grad_norm": 9.400680541992188,
107
+ "kl": 0.0106201171875,
108
+ "learning_rate": 9.565217391304349e-07,
109
+ "loss": 0.0011,
110
+ "reward": 1.65625,
111
+ "reward_mean": 1.65625,
112
+ "reward_std": 0.2404065877199173,
113
+ "rewards/accuracy_reward": 0.65625,
114
+ "rewards/format_reward": 1.0,
115
+ "step": 7
116
+ },
117
+ {
118
+ "advantages": -2.7939677238464355e-09,
119
+ "completion_length": 84.109375,
120
+ "epoch": 0.049689440993788817,
121
+ "grad_norm": 6.011223316192627,
122
+ "kl": 0.0057373046875,
123
+ "learning_rate": 9.503105590062112e-07,
124
+ "loss": 0.0006,
125
+ "reward": 1.4375,
126
+ "reward_mean": 1.4375,
127
+ "reward_std": 0.22461533546447754,
128
+ "rewards/accuracy_reward": 0.453125,
129
+ "rewards/format_reward": 0.984375,
130
+ "step": 8
131
+ },
132
+ {
133
+ "advantages": -3.725290298461914e-09,
134
+ "completion_length": 87.046875,
135
+ "epoch": 0.055900621118012424,
136
+ "grad_norm": 4.103212356567383,
137
+ "kl": 0.00244140625,
138
+ "learning_rate": 9.440993788819875e-07,
139
+ "loss": 0.0002,
140
+ "reward": 1.71875,
141
+ "reward_mean": 1.71875,
142
+ "reward_std": 0.0578637570142746,
143
+ "rewards/accuracy_reward": 0.71875,
144
+ "rewards/format_reward": 1.0,
145
+ "step": 9
146
+ },
147
+ {
148
+ "advantages": -7.450580596923828e-09,
149
+ "completion_length": 76.15625,
150
+ "epoch": 0.062111801242236024,
151
+ "grad_norm": 7.4132466316223145,
152
+ "kl": 0.01287841796875,
153
+ "learning_rate": 9.37888198757764e-07,
154
+ "loss": 0.0013,
155
+ "reward": 1.4375,
156
+ "reward_mean": 1.4375,
157
+ "reward_std": 0.3335031569004059,
158
+ "rewards/accuracy_reward": 0.4375,
159
+ "rewards/format_reward": 1.0,
160
+ "step": 10
161
+ },
162
+ {
163
+ "advantages": 3.725290298461914e-09,
164
+ "completion_length": 76.75,
165
+ "epoch": 0.06832298136645963,
166
+ "grad_norm": 4.6751017570495605,
167
+ "kl": 0.0130615234375,
168
+ "learning_rate": 9.316770186335403e-07,
169
+ "loss": 0.0013,
170
+ "reward": 1.28125,
171
+ "reward_mean": 1.28125,
172
+ "reward_std": 0.0578637570142746,
173
+ "rewards/accuracy_reward": 0.28125,
174
+ "rewards/format_reward": 1.0,
175
+ "step": 11
176
+ },
177
+ {
178
+ "advantages": -1.862645149230957e-09,
179
+ "completion_length": 76.390625,
180
+ "epoch": 0.07453416149068323,
181
+ "grad_norm": 7.682182788848877,
182
+ "kl": 0.018310546875,
183
+ "learning_rate": 9.254658385093167e-07,
184
+ "loss": 0.0018,
185
+ "reward": 1.734375,
186
+ "reward_mean": 1.734375,
187
+ "reward_std": 0.15992169082164764,
188
+ "rewards/accuracy_reward": 0.734375,
189
+ "rewards/format_reward": 1.0,
190
+ "step": 12
191
+ },
192
+ {
193
+ "advantages": -2.7939677238464355e-09,
194
+ "completion_length": 85.234375,
195
+ "epoch": 0.08074534161490683,
196
+ "grad_norm": 4.814305305480957,
197
+ "kl": 0.00396728515625,
198
+ "learning_rate": 9.19254658385093e-07,
199
+ "loss": 0.0004,
200
+ "reward": 1.8125,
201
+ "reward_mean": 1.8125,
202
+ "reward_std": 0.22461533546447754,
203
+ "rewards/accuracy_reward": 0.8125,
204
+ "rewards/format_reward": 1.0,
205
+ "step": 13
206
+ },
207
+ {
208
+ "advantages": -1.862645149230957e-09,
209
+ "completion_length": 86.3125,
210
+ "epoch": 0.08695652173913043,
211
+ "grad_norm": 189.3062744140625,
212
+ "kl": 0.033203125,
213
+ "learning_rate": 9.130434782608695e-07,
214
+ "loss": 0.0033,
215
+ "reward": 1.609375,
216
+ "reward_mean": 1.609375,
217
+ "reward_std": 0.19044628739356995,
218
+ "rewards/accuracy_reward": 0.609375,
219
+ "rewards/format_reward": 1.0,
220
+ "step": 14
221
+ },
222
+ {
223
+ "advantages": 2.7939677238464355e-09,
224
+ "completion_length": 69.8125,
225
+ "epoch": 0.09316770186335403,
226
+ "grad_norm": 3.5520412921905518,
227
+ "kl": 0.01129150390625,
228
+ "learning_rate": 9.06832298136646e-07,
229
+ "loss": 0.0011,
230
+ "reward": 1.78125,
231
+ "reward_mean": 1.78125,
232
+ "reward_std": 0.10888782143592834,
233
+ "rewards/accuracy_reward": 0.78125,
234
+ "rewards/format_reward": 1.0,
235
+ "step": 15
236
+ },
237
+ {
238
+ "advantages": -9.313225746154785e-10,
239
+ "completion_length": 76.296875,
240
+ "epoch": 0.09937888198757763,
241
+ "grad_norm": 3.526542901992798,
242
+ "kl": 0.00909423828125,
243
+ "learning_rate": 9.006211180124223e-07,
244
+ "loss": 0.0009,
245
+ "reward": 1.609375,
246
+ "reward_mean": 1.609375,
247
+ "reward_std": 0.12255740165710449,
248
+ "rewards/accuracy_reward": 0.609375,
249
+ "rewards/format_reward": 1.0,
250
+ "step": 16
251
+ },
252
+ {
253
+ "advantages": 1.862645149230957e-09,
254
+ "completion_length": 86.3125,
255
+ "epoch": 0.10559006211180125,
256
+ "grad_norm": 5.98048210144043,
257
+ "kl": 0.0057373046875,
258
+ "learning_rate": 8.944099378881988e-07,
259
+ "loss": 0.0006,
260
+ "reward": 1.71875,
261
+ "reward_mean": 1.71875,
262
+ "reward_std": 0.2041158676147461,
263
+ "rewards/accuracy_reward": 0.71875,
264
+ "rewards/format_reward": 1.0,
265
+ "step": 17
266
+ },
267
+ {
268
+ "advantages": 0.0,
269
+ "completion_length": 82.921875,
270
+ "epoch": 0.11180124223602485,
271
+ "grad_norm": 2.0705599784851074,
272
+ "kl": 0.00592041015625,
273
+ "learning_rate": 8.881987577639751e-07,
274
+ "loss": 0.0006,
275
+ "reward": 1.5625,
276
+ "reward_mean": 1.5625,
277
+ "reward_std": 0.06681530922651291,
278
+ "rewards/accuracy_reward": 0.578125,
279
+ "rewards/format_reward": 0.984375,
280
+ "step": 18
281
+ },
282
+ {
283
+ "advantages": -2.7939677238464355e-09,
284
+ "completion_length": 81.40625,
285
+ "epoch": 0.11801242236024845,
286
+ "grad_norm": 9.266715049743652,
287
+ "kl": 0.0079345703125,
288
+ "learning_rate": 8.819875776397515e-07,
289
+ "loss": 0.0008,
290
+ "reward": 1.546875,
291
+ "reward_mean": 1.546875,
292
+ "reward_std": 0.2109457552433014,
293
+ "rewards/accuracy_reward": 0.5625,
294
+ "rewards/format_reward": 0.984375,
295
+ "step": 19
296
+ },
297
+ {
298
+ "advantages": -3.725290298461914e-09,
299
+ "completion_length": 80.203125,
300
+ "epoch": 0.12422360248447205,
301
+ "grad_norm": 10.367863655090332,
302
+ "kl": 0.0072021484375,
303
+ "learning_rate": 8.757763975155279e-07,
304
+ "loss": 0.0007,
305
+ "reward": 1.40625,
306
+ "reward_mean": 1.40625,
307
+ "reward_std": 0.2404065728187561,
308
+ "rewards/accuracy_reward": 0.40625,
309
+ "rewards/format_reward": 1.0,
310
+ "step": 20
311
+ },
312
+ {
313
+ "advantages": 1.862645149230957e-09,
314
+ "completion_length": 75.734375,
315
+ "epoch": 0.13043478260869565,
316
+ "grad_norm": 2.6553478240966797,
317
+ "kl": 0.00592041015625,
318
+ "learning_rate": 8.695652173913043e-07,
319
+ "loss": 0.0006,
320
+ "reward": 1.578125,
321
+ "reward_mean": 1.578125,
322
+ "reward_std": 0.10205793380737305,
323
+ "rewards/accuracy_reward": 0.578125,
324
+ "rewards/format_reward": 1.0,
325
+ "step": 21
326
+ },
327
+ {
328
+ "advantages": 3.725290298461914e-09,
329
+ "completion_length": 85.8125,
330
+ "epoch": 0.13664596273291926,
331
+ "grad_norm": 3.458266496658325,
332
+ "kl": 0.00604248046875,
333
+ "learning_rate": 8.633540372670807e-07,
334
+ "loss": 0.0006,
335
+ "reward": 1.515625,
336
+ "reward_mean": 1.515625,
337
+ "reward_std": 0.15981829166412354,
338
+ "rewards/accuracy_reward": 0.53125,
339
+ "rewards/format_reward": 0.984375,
340
+ "step": 22
341
+ },
342
+ {
343
+ "advantages": -2.7939677238464355e-09,
344
+ "completion_length": 79.328125,
345
+ "epoch": 0.14285714285714285,
346
+ "grad_norm": 3.2002384662628174,
347
+ "kl": 0.00543212890625,
348
+ "learning_rate": 8.57142857142857e-07,
349
+ "loss": 0.0005,
350
+ "reward": 1.671875,
351
+ "reward_mean": 1.671875,
352
+ "reward_std": 0.2109457552433014,
353
+ "rewards/accuracy_reward": 0.671875,
354
+ "rewards/format_reward": 1.0,
355
+ "step": 23
356
+ },
357
+ {
358
+ "advantages": -1.862645149230957e-09,
359
+ "completion_length": 75.375,
360
+ "epoch": 0.14906832298136646,
361
+ "grad_norm": 5.946903705596924,
362
+ "kl": 0.0087890625,
363
+ "learning_rate": 8.509316770186336e-07,
364
+ "loss": 0.0009,
365
+ "reward": 1.484375,
366
+ "reward_mean": 1.484375,
367
+ "reward_std": 0.19044628739356995,
368
+ "rewards/accuracy_reward": 0.5,
369
+ "rewards/format_reward": 0.984375,
370
+ "step": 24
371
+ },
372
+ {
373
+ "advantages": -9.313225746154785e-10,
374
+ "completion_length": 68.8125,
375
+ "epoch": 0.15527950310559005,
376
+ "grad_norm": 4.977855682373047,
377
+ "kl": 0.008544921875,
378
+ "learning_rate": 8.447204968944099e-07,
379
+ "loss": 0.0009,
380
+ "reward": 1.734375,
381
+ "reward_mean": 1.734375,
382
+ "reward_std": 0.12255740165710449,
383
+ "rewards/accuracy_reward": 0.734375,
384
+ "rewards/format_reward": 1.0,
385
+ "step": 25
386
+ },
387
+ {
388
+ "advantages": -8.381903171539307e-09,
389
+ "completion_length": 83.59375,
390
+ "epoch": 0.16149068322981366,
391
+ "grad_norm": 4.409206390380859,
392
+ "kl": 0.01239013671875,
393
+ "learning_rate": 8.385093167701863e-07,
394
+ "loss": 0.0012,
395
+ "reward": 1.609375,
396
+ "reward_mean": 1.609375,
397
+ "reward_std": 0.2198973000049591,
398
+ "rewards/accuracy_reward": 0.609375,
399
+ "rewards/format_reward": 1.0,
400
+ "step": 26
401
+ },
402
+ {
403
+ "advantages": 1.862645149230957e-09,
404
+ "completion_length": 78.109375,
405
+ "epoch": 0.16770186335403728,
406
+ "grad_norm": 3.1185989379882812,
407
+ "kl": 0.006072998046875,
408
+ "learning_rate": 8.322981366459628e-07,
409
+ "loss": 0.0006,
410
+ "reward": 1.65625,
411
+ "reward_mean": 1.65625,
412
+ "reward_std": 0.10888782143592834,
413
+ "rewards/accuracy_reward": 0.671875,
414
+ "rewards/format_reward": 0.984375,
415
+ "step": 27
416
+ },
417
+ {
418
+ "advantages": 4.6566128730773926e-09,
419
+ "completion_length": 72.0625,
420
+ "epoch": 0.17391304347826086,
421
+ "grad_norm": 4.9565935134887695,
422
+ "kl": 0.010009765625,
423
+ "learning_rate": 8.260869565217391e-07,
424
+ "loss": 0.001,
425
+ "reward": 1.34375,
426
+ "reward_mean": 1.34375,
427
+ "reward_std": 0.16675157845020294,
428
+ "rewards/accuracy_reward": 0.34375,
429
+ "rewards/format_reward": 1.0,
430
+ "step": 28
431
+ },
432
+ {
433
+ "advantages": -4.6566128730773926e-09,
434
+ "completion_length": 84.953125,
435
+ "epoch": 0.18012422360248448,
436
+ "grad_norm": 4.35609769821167,
437
+ "kl": 0.011962890625,
438
+ "learning_rate": 8.198757763975155e-07,
439
+ "loss": 0.0012,
440
+ "reward": 1.46875,
441
+ "reward_mean": 1.46875,
442
+ "reward_std": 0.25513991713523865,
443
+ "rewards/accuracy_reward": 0.484375,
444
+ "rewards/format_reward": 0.984375,
445
+ "step": 29
446
+ },
447
+ {
448
+ "advantages": -9.313225746154785e-10,
449
+ "completion_length": 85.34375,
450
+ "epoch": 0.18633540372670807,
451
+ "grad_norm": 5.767938137054443,
452
+ "kl": 0.009033203125,
453
+ "learning_rate": 8.136645962732918e-07,
454
+ "loss": 0.0009,
455
+ "reward": 1.609375,
456
+ "reward_mean": 1.609375,
457
+ "reward_std": 0.1530819982290268,
458
+ "rewards/accuracy_reward": 0.609375,
459
+ "rewards/format_reward": 1.0,
460
+ "step": 30
461
+ },
462
+ {
463
+ "advantages": -5.587935447692871e-09,
464
+ "completion_length": 83.5625,
465
+ "epoch": 0.19254658385093168,
466
+ "grad_norm": 49.12059783935547,
467
+ "kl": 0.0091552734375,
468
+ "learning_rate": 8.074534161490683e-07,
469
+ "loss": 0.0009,
470
+ "reward": 1.578125,
471
+ "reward_mean": 1.578125,
472
+ "reward_std": 0.10205793380737305,
473
+ "rewards/accuracy_reward": 0.578125,
474
+ "rewards/format_reward": 1.0,
475
+ "step": 31
476
+ },
477
+ {
478
+ "advantages": 4.6566128730773926e-09,
479
+ "completion_length": 77.734375,
480
+ "epoch": 0.19875776397515527,
481
+ "grad_norm": 1.4828208684921265,
482
+ "kl": 0.00970458984375,
483
+ "learning_rate": 8.012422360248446e-07,
484
+ "loss": 0.001,
485
+ "reward": 1.421875,
486
+ "reward_mean": 1.421875,
487
+ "reward_std": 0.0646936446428299,
488
+ "rewards/accuracy_reward": 0.421875,
489
+ "rewards/format_reward": 1.0,
490
+ "step": 32
491
+ },
492
+ {
493
+ "advantages": 1.862645149230957e-09,
494
+ "completion_length": 78.453125,
495
+ "epoch": 0.20496894409937888,
496
+ "grad_norm": 7.876468658447266,
497
+ "kl": 0.020263671875,
498
+ "learning_rate": 7.95031055900621e-07,
499
+ "loss": 0.002,
500
+ "reward": 1.734375,
501
+ "reward_mean": 1.734375,
502
+ "reward_std": 0.2109457552433014,
503
+ "rewards/accuracy_reward": 0.734375,
504
+ "rewards/format_reward": 1.0,
505
+ "step": 33
506
+ },
507
+ {
508
+ "advantages": -3.725290298461914e-09,
509
+ "completion_length": 80.5,
510
+ "epoch": 0.2111801242236025,
511
+ "grad_norm": 3.8213541507720947,
512
+ "kl": 0.01361083984375,
513
+ "learning_rate": 7.888198757763976e-07,
514
+ "loss": 0.0014,
515
+ "reward": 1.46875,
516
+ "reward_mean": 1.46875,
517
+ "reward_std": 0.0578637570142746,
518
+ "rewards/accuracy_reward": 0.46875,
519
+ "rewards/format_reward": 1.0,
520
+ "step": 34
521
+ },
522
+ {
523
+ "advantages": -6.51925802230835e-09,
524
+ "completion_length": 89.5625,
525
+ "epoch": 0.21739130434782608,
526
+ "grad_norm": 3.453101634979248,
527
+ "kl": 0.01470947265625,
528
+ "learning_rate": 7.826086956521739e-07,
529
+ "loss": 0.0015,
530
+ "reward": 1.75,
531
+ "reward_mean": 1.75,
532
+ "reward_std": 0.17570312321186066,
533
+ "rewards/accuracy_reward": 0.75,
534
+ "rewards/format_reward": 1.0,
535
+ "step": 35
536
+ },
537
+ {
538
+ "advantages": 0.0,
539
+ "completion_length": 90.28125,
540
+ "epoch": 0.2236024844720497,
541
+ "grad_norm": 2.642101526260376,
542
+ "kl": 0.0107421875,
543
+ "learning_rate": 7.763975155279503e-07,
544
+ "loss": 0.0011,
545
+ "reward": 1.5625,
546
+ "reward_mean": 1.5625,
547
+ "reward_std": 0.06681530922651291,
548
+ "rewards/accuracy_reward": 0.5625,
549
+ "rewards/format_reward": 1.0,
550
+ "step": 36
551
+ },
552
+ {
553
+ "advantages": -3.725290298461914e-09,
554
+ "completion_length": 80.3125,
555
+ "epoch": 0.22981366459627328,
556
+ "grad_norm": 3.5424673557281494,
557
+ "kl": 0.01239013671875,
558
+ "learning_rate": 7.701863354037266e-07,
559
+ "loss": 0.0012,
560
+ "reward": 1.71875,
561
+ "reward_mean": 1.71875,
562
+ "reward_std": 0.0578637570142746,
563
+ "rewards/accuracy_reward": 0.71875,
564
+ "rewards/format_reward": 1.0,
565
+ "step": 37
566
+ },
567
+ {
568
+ "advantages": 0.0,
569
+ "completion_length": 84.140625,
570
+ "epoch": 0.2360248447204969,
571
+ "grad_norm": 0.38800248503685,
572
+ "kl": 0.01275634765625,
573
+ "learning_rate": 7.639751552795031e-07,
574
+ "loss": 0.0013,
575
+ "reward": 1.375,
576
+ "reward_mean": 1.375,
577
+ "reward_std": 0.0,
578
+ "rewards/accuracy_reward": 0.390625,
579
+ "rewards/format_reward": 0.984375,
580
+ "step": 38
581
+ },
582
+ {
583
+ "advantages": 1.862645149230957e-09,
584
+ "completion_length": 89.109375,
585
+ "epoch": 0.2422360248447205,
586
+ "grad_norm": 2.645474433898926,
587
+ "kl": 0.01397705078125,
588
+ "learning_rate": 7.577639751552795e-07,
589
+ "loss": 0.0014,
590
+ "reward": 1.515625,
591
+ "reward_mean": 1.515625,
592
+ "reward_std": 0.04419417306780815,
593
+ "rewards/accuracy_reward": 0.515625,
594
+ "rewards/format_reward": 1.0,
595
+ "step": 39
596
+ },
597
+ {
598
+ "advantages": -3.725290298461914e-09,
599
+ "completion_length": 72.296875,
600
+ "epoch": 0.2484472049689441,
601
+ "grad_norm": 8.574762344360352,
602
+ "kl": 0.0159912109375,
603
+ "learning_rate": 7.515527950310558e-07,
604
+ "loss": 0.0016,
605
+ "reward": 1.671875,
606
+ "reward_mean": 1.671875,
607
+ "reward_std": 0.23144522309303284,
608
+ "rewards/accuracy_reward": 0.671875,
609
+ "rewards/format_reward": 1.0,
610
+ "step": 40
611
+ },
612
+ {
613
+ "advantages": 7.450580596923828e-09,
614
+ "completion_length": 86.046875,
615
+ "epoch": 0.2546583850931677,
616
+ "grad_norm": 36.26329040527344,
617
+ "kl": 0.0147705078125,
618
+ "learning_rate": 7.453416149068323e-07,
619
+ "loss": 0.0015,
620
+ "reward": 1.65625,
621
+ "reward_mean": 1.65625,
622
+ "reward_std": 0.23356688022613525,
623
+ "rewards/accuracy_reward": 0.65625,
624
+ "rewards/format_reward": 1.0,
625
+ "step": 41
626
+ },
627
+ {
628
+ "advantages": -3.725290298461914e-09,
629
+ "completion_length": 77.0625,
630
+ "epoch": 0.2608695652173913,
631
+ "grad_norm": 10.992830276489258,
632
+ "kl": 0.0113525390625,
633
+ "learning_rate": 7.391304347826086e-07,
634
+ "loss": 0.0011,
635
+ "reward": 1.703125,
636
+ "reward_mean": 1.703125,
637
+ "reward_std": 0.24464011192321777,
638
+ "rewards/accuracy_reward": 0.703125,
639
+ "rewards/format_reward": 1.0,
640
+ "step": 42
641
+ },
642
+ {
643
+ "advantages": -1.862645149230957e-09,
644
+ "completion_length": 86.53125,
645
+ "epoch": 0.2670807453416149,
646
+ "grad_norm": 6.251725673675537,
647
+ "kl": 0.009033203125,
648
+ "learning_rate": 7.329192546583851e-07,
649
+ "loss": 0.0009,
650
+ "reward": 1.609375,
651
+ "reward_mean": 1.609375,
652
+ "reward_std": 0.23144522309303284,
653
+ "rewards/accuracy_reward": 0.609375,
654
+ "rewards/format_reward": 1.0,
655
+ "step": 43
656
+ },
657
+ {
658
+ "advantages": -4.6566128730773926e-09,
659
+ "completion_length": 86.4375,
660
+ "epoch": 0.2732919254658385,
661
+ "grad_norm": 3.8048486709594727,
662
+ "kl": 0.01385498046875,
663
+ "learning_rate": 7.267080745341615e-07,
664
+ "loss": 0.0014,
665
+ "reward": 1.765625,
666
+ "reward_mean": 1.765625,
667
+ "reward_std": 0.17358146607875824,
668
+ "rewards/accuracy_reward": 0.765625,
669
+ "rewards/format_reward": 1.0,
670
+ "step": 44
671
+ },
672
+ {
673
+ "advantages": -1.862645149230957e-09,
674
+ "completion_length": 84.21875,
675
+ "epoch": 0.2795031055900621,
676
+ "grad_norm": 2.5062499046325684,
677
+ "kl": 0.00811767578125,
678
+ "learning_rate": 7.204968944099379e-07,
679
+ "loss": 0.0008,
680
+ "reward": 1.796875,
681
+ "reward_mean": 1.796875,
682
+ "reward_std": 0.11100947856903076,
683
+ "rewards/accuracy_reward": 0.8125,
684
+ "rewards/format_reward": 0.984375,
685
+ "step": 45
686
+ },
687
+ {
688
+ "advantages": 3.725290298461914e-09,
689
+ "completion_length": 77.9375,
690
+ "epoch": 0.2857142857142857,
691
+ "grad_norm": 3.8415560722351074,
692
+ "kl": 0.01165771484375,
693
+ "learning_rate": 7.142857142857143e-07,
694
+ "loss": 0.0012,
695
+ "reward": 1.53125,
696
+ "reward_mean": 1.53125,
697
+ "reward_std": 0.1462521106004715,
698
+ "rewards/accuracy_reward": 0.53125,
699
+ "rewards/format_reward": 1.0,
700
+ "step": 46
701
+ },
702
+ {
703
+ "advantages": 1.862645149230957e-09,
704
+ "completion_length": 82.6875,
705
+ "epoch": 0.2919254658385093,
706
+ "grad_norm": 2.903069496154785,
707
+ "kl": 0.012451171875,
708
+ "learning_rate": 7.080745341614906e-07,
709
+ "loss": 0.0012,
710
+ "reward": 1.578125,
711
+ "reward_mean": 1.578125,
712
+ "reward_std": 0.11100947856903076,
713
+ "rewards/accuracy_reward": 0.59375,
714
+ "rewards/format_reward": 0.984375,
715
+ "step": 47
716
+ },
717
+ {
718
+ "advantages": -2.7939677238464355e-09,
719
+ "completion_length": 75.578125,
720
+ "epoch": 0.2981366459627329,
721
+ "grad_norm": 11.884781837463379,
722
+ "kl": 0.0125732421875,
723
+ "learning_rate": 7.018633540372671e-07,
724
+ "loss": 0.0013,
725
+ "reward": 1.65625,
726
+ "reward_mean": 1.65625,
727
+ "reward_std": 0.17570312321186066,
728
+ "rewards/accuracy_reward": 0.65625,
729
+ "rewards/format_reward": 1.0,
730
+ "step": 48
731
+ },
732
+ {
733
+ "advantages": -1.862645149230957e-09,
734
+ "completion_length": 73.34375,
735
+ "epoch": 0.30434782608695654,
736
+ "grad_norm": 2.234876871109009,
737
+ "kl": 0.0084228515625,
738
+ "learning_rate": 6.956521739130434e-07,
739
+ "loss": 0.0008,
740
+ "reward": 1.484375,
741
+ "reward_mean": 1.484375,
742
+ "reward_std": 0.04419417306780815,
743
+ "rewards/accuracy_reward": 0.484375,
744
+ "rewards/format_reward": 1.0,
745
+ "step": 49
746
+ },
747
+ {
748
+ "advantages": 1.862645149230957e-09,
749
+ "completion_length": 81.8125,
750
+ "epoch": 0.3105590062111801,
751
+ "grad_norm": 4.401739597320557,
752
+ "kl": 0.007232666015625,
753
+ "learning_rate": 6.894409937888198e-07,
754
+ "loss": 0.0007,
755
+ "reward": 1.765625,
756
+ "reward_mean": 1.765625,
757
+ "reward_std": 0.17782479524612427,
758
+ "rewards/accuracy_reward": 0.765625,
759
+ "rewards/format_reward": 1.0,
760
+ "step": 50
761
+ },
762
+ {
763
+ "advantages": 0.0,
764
+ "completion_length": 84.015625,
765
+ "epoch": 0.3167701863354037,
766
+ "grad_norm": 0.28293830156326294,
767
+ "kl": 0.0062255859375,
768
+ "learning_rate": 6.832298136645962e-07,
769
+ "loss": 0.0006,
770
+ "reward": 2.0,
771
+ "reward_mean": 2.0,
772
+ "reward_std": 0.0,
773
+ "rewards/accuracy_reward": 1.0,
774
+ "rewards/format_reward": 1.0,
775
+ "step": 51
776
+ },
777
+ {
778
+ "advantages": 0.0,
779
+ "completion_length": 79.125,
780
+ "epoch": 0.32298136645962733,
781
+ "grad_norm": 2.2039246559143066,
782
+ "kl": 0.0106201171875,
783
+ "learning_rate": 6.770186335403726e-07,
784
+ "loss": 0.0011,
785
+ "reward": 1.75,
786
+ "reward_mean": 1.75,
787
+ "reward_std": 0.0883883461356163,
788
+ "rewards/accuracy_reward": 0.75,
789
+ "rewards/format_reward": 1.0,
790
+ "step": 52
791
+ },
792
+ {
793
+ "advantages": 9.313225746154785e-10,
794
+ "completion_length": 76.0625,
795
+ "epoch": 0.32919254658385094,
796
+ "grad_norm": 4.176709175109863,
797
+ "kl": 0.01123046875,
798
+ "learning_rate": 6.708074534161491e-07,
799
+ "loss": 0.0011,
800
+ "reward": 1.640625,
801
+ "reward_mean": 1.640625,
802
+ "reward_std": 0.1530819982290268,
803
+ "rewards/accuracy_reward": 0.640625,
804
+ "rewards/format_reward": 1.0,
805
+ "step": 53
806
+ },
807
+ {
808
+ "advantages": -3.725290298461914e-09,
809
+ "completion_length": 81.125,
810
+ "epoch": 0.33540372670807456,
811
+ "grad_norm": 30.12848663330078,
812
+ "kl": 0.099609375,
813
+ "learning_rate": 6.645962732919254e-07,
814
+ "loss": 0.01,
815
+ "reward": 1.71875,
816
+ "reward_mean": 1.71875,
817
+ "reward_std": 0.1462520956993103,
818
+ "rewards/accuracy_reward": 0.71875,
819
+ "rewards/format_reward": 1.0,
820
+ "step": 54
821
+ },
822
+ {
823
+ "advantages": -1.862645149230957e-09,
824
+ "completion_length": 80.609375,
825
+ "epoch": 0.3416149068322981,
826
+ "grad_norm": 12.808406829833984,
827
+ "kl": 0.01416015625,
828
+ "learning_rate": 6.583850931677019e-07,
829
+ "loss": 0.0014,
830
+ "reward": 1.6875,
831
+ "reward_mean": 1.6875,
832
+ "reward_std": 0.2238783985376358,
833
+ "rewards/accuracy_reward": 0.703125,
834
+ "rewards/format_reward": 0.984375,
835
+ "step": 55
836
+ },
837
+ {
838
+ "advantages": -5.587935447692871e-09,
839
+ "completion_length": 76.15625,
840
+ "epoch": 0.34782608695652173,
841
+ "grad_norm": 5.750046253204346,
842
+ "kl": 0.01019287109375,
843
+ "learning_rate": 6.521739130434782e-07,
844
+ "loss": 0.001,
845
+ "reward": 1.5,
846
+ "reward_mean": 1.5,
847
+ "reward_std": 0.2041158676147461,
848
+ "rewards/accuracy_reward": 0.515625,
849
+ "rewards/format_reward": 0.984375,
850
+ "step": 56
851
+ },
852
+ {
853
+ "advantages": -3.725290298461914e-09,
854
+ "completion_length": 76.765625,
855
+ "epoch": 0.35403726708074534,
856
+ "grad_norm": 4.7853102684021,
857
+ "kl": 0.010986328125,
858
+ "learning_rate": 6.459627329192546e-07,
859
+ "loss": 0.0011,
860
+ "reward": 1.328125,
861
+ "reward_mean": 1.328125,
862
+ "reward_std": 0.19044628739356995,
863
+ "rewards/accuracy_reward": 0.34375,
864
+ "rewards/format_reward": 0.984375,
865
+ "step": 57
866
+ },
867
+ {
868
+ "advantages": 4.6566128730773926e-09,
869
+ "completion_length": 88.796875,
870
+ "epoch": 0.36024844720496896,
871
+ "grad_norm": 1.7344197034835815,
872
+ "kl": 0.00982666015625,
873
+ "learning_rate": 6.39751552795031e-07,
874
+ "loss": 0.001,
875
+ "reward": 1.671875,
876
+ "reward_mean": 1.671875,
877
+ "reward_std": 0.0646936446428299,
878
+ "rewards/accuracy_reward": 0.671875,
879
+ "rewards/format_reward": 1.0,
880
+ "step": 58
881
+ },
882
+ {
883
+ "advantages": -9.313225746154785e-09,
884
+ "completion_length": 84.28125,
885
+ "epoch": 0.36645962732919257,
886
+ "grad_norm": 3.1260499954223633,
887
+ "kl": 0.01416015625,
888
+ "learning_rate": 6.335403726708074e-07,
889
+ "loss": 0.0014,
890
+ "reward": 1.6875,
891
+ "reward_mean": 1.6875,
892
+ "reward_std": 0.1828794628381729,
893
+ "rewards/accuracy_reward": 0.703125,
894
+ "rewards/format_reward": 0.984375,
895
+ "step": 59
896
+ },
897
+ {
898
+ "advantages": -3.725290298461914e-09,
899
+ "completion_length": 77.21875,
900
+ "epoch": 0.37267080745341613,
901
+ "grad_norm": 1.7963190078735352,
902
+ "kl": 0.0089111328125,
903
+ "learning_rate": 6.273291925465838e-07,
904
+ "loss": 0.0009,
905
+ "reward": 1.84375,
906
+ "reward_mean": 1.84375,
907
+ "reward_std": 0.0578637570142746,
908
+ "rewards/accuracy_reward": 0.84375,
909
+ "rewards/format_reward": 1.0,
910
+ "step": 60
911
+ },
912
+ {
913
+ "advantages": 0.0,
914
+ "completion_length": 82.25,
915
+ "epoch": 0.37888198757763975,
916
+ "grad_norm": 2.5049538612365723,
917
+ "kl": 0.00787353515625,
918
+ "learning_rate": 6.211180124223601e-07,
919
+ "loss": 0.0008,
920
+ "reward": 1.625,
921
+ "reward_mean": 1.625,
922
+ "reward_std": 0.0883883461356163,
923
+ "rewards/accuracy_reward": 0.625,
924
+ "rewards/format_reward": 1.0,
925
+ "step": 61
926
+ },
927
+ {
928
+ "advantages": -1.210719347000122e-08,
929
+ "completion_length": 80.375,
930
+ "epoch": 0.38509316770186336,
931
+ "grad_norm": 5.739541530609131,
932
+ "kl": 0.01312255859375,
933
+ "learning_rate": 6.149068322981367e-07,
934
+ "loss": 0.0013,
935
+ "reward": 1.75,
936
+ "reward_mean": 1.75,
937
+ "reward_std": 0.2177756428718567,
938
+ "rewards/accuracy_reward": 0.75,
939
+ "rewards/format_reward": 1.0,
940
+ "step": 62
941
+ },
942
+ {
943
+ "advantages": 1.862645149230957e-09,
944
+ "completion_length": 84.296875,
945
+ "epoch": 0.391304347826087,
946
+ "grad_norm": 4.335031032562256,
947
+ "kl": 0.01116943359375,
948
+ "learning_rate": 6.08695652173913e-07,
949
+ "loss": 0.0011,
950
+ "reward": 1.90625,
951
+ "reward_mean": 1.90625,
952
+ "reward_std": 0.2041158676147461,
953
+ "rewards/accuracy_reward": 0.90625,
954
+ "rewards/format_reward": 1.0,
955
+ "step": 63
956
+ },
957
+ {
958
+ "advantages": 1.862645149230957e-09,
959
+ "completion_length": 86.828125,
960
+ "epoch": 0.39751552795031053,
961
+ "grad_norm": 4.443232536315918,
962
+ "kl": 0.01336669921875,
963
+ "learning_rate": 6.024844720496894e-07,
964
+ "loss": 0.0013,
965
+ "reward": 1.703125,
966
+ "reward_mean": 1.703125,
967
+ "reward_std": 0.19939783215522766,
968
+ "rewards/accuracy_reward": 0.703125,
969
+ "rewards/format_reward": 1.0,
970
+ "step": 64
971
+ },
972
+ {
973
+ "advantages": 5.587935447692871e-09,
974
+ "completion_length": 75.71875,
975
+ "epoch": 0.40372670807453415,
976
+ "grad_norm": 7.092515468597412,
977
+ "kl": 0.01251220703125,
978
+ "learning_rate": 5.962732919254659e-07,
979
+ "loss": 0.0013,
980
+ "reward": 1.65625,
981
+ "reward_mean": 1.65625,
982
+ "reward_std": 0.23827511072158813,
983
+ "rewards/accuracy_reward": 0.65625,
984
+ "rewards/format_reward": 1.0,
985
+ "step": 65
986
+ },
987
+ {
988
+ "advantages": 4.6566128730773926e-09,
989
+ "completion_length": 82.140625,
990
+ "epoch": 0.40993788819875776,
991
+ "grad_norm": 4.468729496002197,
992
+ "kl": 0.0211181640625,
993
+ "learning_rate": 5.900621118012422e-07,
994
+ "loss": 0.0021,
995
+ "reward": 1.796875,
996
+ "reward_mean": 1.796875,
997
+ "reward_std": 0.0646936446428299,
998
+ "rewards/accuracy_reward": 0.796875,
999
+ "rewards/format_reward": 1.0,
1000
+ "step": 66
1001
+ },
1002
+ {
1003
+ "advantages": 4.6566128730773926e-09,
1004
+ "completion_length": 74.890625,
1005
+ "epoch": 0.4161490683229814,
1006
+ "grad_norm": 9.289567947387695,
1007
+ "kl": 0.01611328125,
1008
+ "learning_rate": 5.838509316770186e-07,
1009
+ "loss": 0.0016,
1010
+ "reward": 1.421875,
1011
+ "reward_mean": 1.421875,
1012
+ "reward_std": 0.1983242630958557,
1013
+ "rewards/accuracy_reward": 0.421875,
1014
+ "rewards/format_reward": 1.0,
1015
+ "step": 67
1016
+ },
1017
+ {
1018
+ "advantages": 0.0,
1019
+ "completion_length": 77.625,
1020
+ "epoch": 0.422360248447205,
1021
+ "grad_norm": 0.4326918125152588,
1022
+ "kl": 0.0140380859375,
1023
+ "learning_rate": 5.77639751552795e-07,
1024
+ "loss": 0.0014,
1025
+ "reward": 1.875,
1026
+ "reward_mean": 1.875,
1027
+ "reward_std": 0.0,
1028
+ "rewards/accuracy_reward": 0.875,
1029
+ "rewards/format_reward": 1.0,
1030
+ "step": 68
1031
+ },
1032
+ {
1033
+ "advantages": -9.313225746154785e-10,
1034
+ "completion_length": 81.71875,
1035
+ "epoch": 0.42857142857142855,
1036
+ "grad_norm": 5.539842128753662,
1037
+ "kl": 0.04296875,
1038
+ "learning_rate": 5.714285714285714e-07,
1039
+ "loss": 0.0043,
1040
+ "reward": 1.4375,
1041
+ "reward_mean": 1.4375,
1042
+ "reward_std": 0.34352827072143555,
1043
+ "rewards/accuracy_reward": 0.453125,
1044
+ "rewards/format_reward": 0.984375,
1045
+ "step": 69
1046
+ },
1047
+ {
1048
+ "advantages": 0.0,
1049
+ "completion_length": 90.703125,
1050
+ "epoch": 0.43478260869565216,
1051
+ "grad_norm": 0.46686819195747375,
1052
+ "kl": 0.0074462890625,
1053
+ "learning_rate": 5.652173913043477e-07,
1054
+ "loss": 0.0007,
1055
+ "reward": 1.875,
1056
+ "reward_mean": 1.875,
1057
+ "reward_std": 0.0,
1058
+ "rewards/accuracy_reward": 0.875,
1059
+ "rewards/format_reward": 1.0,
1060
+ "step": 70
1061
+ },
1062
+ {
1063
+ "advantages": 1.862645149230957e-09,
1064
+ "completion_length": 83.578125,
1065
+ "epoch": 0.4409937888198758,
1066
+ "grad_norm": 8.54028606414795,
1067
+ "kl": 0.00897216796875,
1068
+ "learning_rate": 5.590062111801241e-07,
1069
+ "loss": 0.0009,
1070
+ "reward": 1.765625,
1071
+ "reward_mean": 1.765625,
1072
+ "reward_std": 0.04419417306780815,
1073
+ "rewards/accuracy_reward": 0.765625,
1074
+ "rewards/format_reward": 1.0,
1075
+ "step": 71
1076
+ },
1077
+ {
1078
+ "advantages": 7.450580596923828e-09,
1079
+ "completion_length": 84.25,
1080
+ "epoch": 0.4472049689440994,
1081
+ "grad_norm": 12.895256996154785,
1082
+ "kl": 0.00579833984375,
1083
+ "learning_rate": 5.527950310559007e-07,
1084
+ "loss": 0.0006,
1085
+ "reward": 1.453125,
1086
+ "reward_mean": 1.453125,
1087
+ "reward_std": 0.12255740165710449,
1088
+ "rewards/accuracy_reward": 0.453125,
1089
+ "rewards/format_reward": 1.0,
1090
+ "step": 72
1091
+ },
1092
+ {
1093
+ "advantages": -9.313225746154785e-09,
1094
+ "completion_length": 77.15625,
1095
+ "epoch": 0.453416149068323,
1096
+ "grad_norm": 5.548634052276611,
1097
+ "kl": 0.0123291015625,
1098
+ "learning_rate": 5.46583850931677e-07,
1099
+ "loss": 0.0012,
1100
+ "reward": 1.796875,
1101
+ "reward_mean": 1.796875,
1102
+ "reward_std": 0.31983357667922974,
1103
+ "rewards/accuracy_reward": 0.796875,
1104
+ "rewards/format_reward": 1.0,
1105
+ "step": 73
1106
+ },
1107
+ {
1108
+ "advantages": -1.0244548320770264e-08,
1109
+ "completion_length": 84.75,
1110
+ "epoch": 0.45962732919254656,
1111
+ "grad_norm": 3.4154112339019775,
1112
+ "kl": 0.018798828125,
1113
+ "learning_rate": 5.403726708074534e-07,
1114
+ "loss": 0.0019,
1115
+ "reward": 1.78125,
1116
+ "reward_mean": 1.78125,
1117
+ "reward_std": 0.19727616012096405,
1118
+ "rewards/accuracy_reward": 0.78125,
1119
+ "rewards/format_reward": 1.0,
1120
+ "step": 74
1121
+ },
1122
+ {
1123
+ "advantages": -4.6566128730773926e-09,
1124
+ "completion_length": 83.015625,
1125
+ "epoch": 0.4658385093167702,
1126
+ "grad_norm": 3.4328691959381104,
1127
+ "kl": 0.01275634765625,
1128
+ "learning_rate": 5.341614906832298e-07,
1129
+ "loss": 0.0013,
1130
+ "reward": 1.53125,
1131
+ "reward_mean": 1.53125,
1132
+ "reward_std": 0.23356688022613525,
1133
+ "rewards/accuracy_reward": 0.53125,
1134
+ "rewards/format_reward": 1.0,
1135
+ "step": 75
1136
+ },
1137
+ {
1138
+ "advantages": -1.862645149230957e-09,
1139
+ "completion_length": 78.609375,
1140
+ "epoch": 0.4720496894409938,
1141
+ "grad_norm": 3.627190113067627,
1142
+ "kl": 0.0142822265625,
1143
+ "learning_rate": 5.279503105590062e-07,
1144
+ "loss": 0.0014,
1145
+ "reward": 1.9375,
1146
+ "reward_mean": 1.9375,
1147
+ "reward_std": 0.1462521106004715,
1148
+ "rewards/accuracy_reward": 0.9375,
1149
+ "rewards/format_reward": 1.0,
1150
+ "step": 76
1151
+ },
1152
+ {
1153
+ "advantages": -3.725290298461914e-09,
1154
+ "completion_length": 80.46875,
1155
+ "epoch": 0.4782608695652174,
1156
+ "grad_norm": 10.168981552124023,
1157
+ "kl": 0.01251220703125,
1158
+ "learning_rate": 5.217391304347825e-07,
1159
+ "loss": 0.0013,
1160
+ "reward": 1.515625,
1161
+ "reward_mean": 1.515625,
1162
+ "reward_std": 0.2109457552433014,
1163
+ "rewards/accuracy_reward": 0.515625,
1164
+ "rewards/format_reward": 1.0,
1165
+ "step": 77
1166
+ },
1167
+ {
1168
+ "advantages": 0.0,
1169
+ "completion_length": 83.421875,
1170
+ "epoch": 0.484472049689441,
1171
+ "grad_norm": 20.923242568969727,
1172
+ "kl": 0.0113525390625,
1173
+ "learning_rate": 5.15527950310559e-07,
1174
+ "loss": 0.0011,
1175
+ "reward": 1.828125,
1176
+ "reward_mean": 1.828125,
1177
+ "reward_std": 0.19044628739356995,
1178
+ "rewards/accuracy_reward": 0.828125,
1179
+ "rewards/format_reward": 1.0,
1180
+ "step": 78
1181
+ },
1182
+ {
1183
+ "advantages": 2.7939677238464355e-09,
1184
+ "completion_length": 75.109375,
1185
+ "epoch": 0.4906832298136646,
1186
+ "grad_norm": 3.643770933151245,
1187
+ "kl": 0.00811767578125,
1188
+ "learning_rate": 5.093167701863354e-07,
1189
+ "loss": 0.0008,
1190
+ "reward": 1.78125,
1191
+ "reward_mean": 1.78125,
1192
+ "reward_std": 0.10888782143592834,
1193
+ "rewards/accuracy_reward": 0.78125,
1194
+ "rewards/format_reward": 1.0,
1195
+ "step": 79
1196
+ },
1197
+ {
1198
+ "advantages": 1.862645149230957e-09,
1199
+ "completion_length": 81.65625,
1200
+ "epoch": 0.4968944099378882,
1201
+ "grad_norm": 4.883938312530518,
1202
+ "kl": 0.015625,
1203
+ "learning_rate": 5.031055900621117e-07,
1204
+ "loss": 0.0016,
1205
+ "reward": 1.25,
1206
+ "reward_mean": 1.25,
1207
+ "reward_std": 0.2130674123764038,
1208
+ "rewards/accuracy_reward": 0.25,
1209
+ "rewards/format_reward": 1.0,
1210
+ "step": 80
1211
+ },
1212
+ {
1213
+ "advantages": -3.725290298461914e-09,
1214
+ "completion_length": 82.515625,
1215
+ "epoch": 0.5031055900621118,
1216
+ "grad_norm": 1.3860398530960083,
1217
+ "kl": 0.00799560546875,
1218
+ "learning_rate": 4.968944099378881e-07,
1219
+ "loss": 0.0008,
1220
+ "reward": 1.71875,
1221
+ "reward_mean": 1.71875,
1222
+ "reward_std": 0.0578637570142746,
1223
+ "rewards/accuracy_reward": 0.71875,
1224
+ "rewards/format_reward": 1.0,
1225
+ "step": 81
1226
+ },
1227
+ {
1228
+ "advantages": 9.313225746154785e-10,
1229
+ "completion_length": 77.6875,
1230
+ "epoch": 0.5093167701863354,
1231
+ "grad_norm": 3.7328872680664062,
1232
+ "kl": 0.0181884765625,
1233
+ "learning_rate": 4.906832298136646e-07,
1234
+ "loss": 0.0018,
1235
+ "reward": 1.75,
1236
+ "reward_mean": 1.75,
1237
+ "reward_std": 0.16675157845020294,
1238
+ "rewards/accuracy_reward": 0.75,
1239
+ "rewards/format_reward": 1.0,
1240
+ "step": 82
1241
+ },
1242
+ {
1243
+ "advantages": 3.725290298461914e-09,
1244
+ "completion_length": 76.4375,
1245
+ "epoch": 0.515527950310559,
1246
+ "grad_norm": 3.6228644847869873,
1247
+ "kl": 0.01446533203125,
1248
+ "learning_rate": 4.84472049689441e-07,
1249
+ "loss": 0.0014,
1250
+ "reward": 1.46875,
1251
+ "reward_mean": 1.46875,
1252
+ "reward_std": 0.1246790662407875,
1253
+ "rewards/accuracy_reward": 0.46875,
1254
+ "rewards/format_reward": 1.0,
1255
+ "step": 83
1256
+ },
1257
+ {
1258
+ "advantages": -1.862645149230957e-09,
1259
+ "completion_length": 79.140625,
1260
+ "epoch": 0.5217391304347826,
1261
+ "grad_norm": 5.579171180725098,
1262
+ "kl": 0.0159912109375,
1263
+ "learning_rate": 4.782608695652174e-07,
1264
+ "loss": 0.0016,
1265
+ "reward": 1.65625,
1266
+ "reward_mean": 1.65625,
1267
+ "reward_std": 0.23356688022613525,
1268
+ "rewards/accuracy_reward": 0.671875,
1269
+ "rewards/format_reward": 0.984375,
1270
+ "step": 84
1271
+ },
1272
+ {
1273
+ "advantages": -5.587935447692871e-09,
1274
+ "completion_length": 80.0,
1275
+ "epoch": 0.5279503105590062,
1276
+ "grad_norm": 9.611387252807617,
1277
+ "kl": 0.01080322265625,
1278
+ "learning_rate": 4.7204968944099376e-07,
1279
+ "loss": 0.0011,
1280
+ "reward": 1.828125,
1281
+ "reward_mean": 1.828125,
1282
+ "reward_std": 0.13258251547813416,
1283
+ "rewards/accuracy_reward": 0.828125,
1284
+ "rewards/format_reward": 1.0,
1285
+ "step": 85
1286
+ },
1287
+ {
1288
+ "advantages": -1.862645149230957e-09,
1289
+ "completion_length": 85.5,
1290
+ "epoch": 0.5341614906832298,
1291
+ "grad_norm": 4.1448540687561035,
1292
+ "kl": 0.01007080078125,
1293
+ "learning_rate": 4.6583850931677014e-07,
1294
+ "loss": 0.001,
1295
+ "reward": 1.859375,
1296
+ "reward_mean": 1.859375,
1297
+ "reward_std": 0.17358146607875824,
1298
+ "rewards/accuracy_reward": 0.859375,
1299
+ "rewards/format_reward": 1.0,
1300
+ "step": 86
1301
+ },
1302
+ {
1303
+ "advantages": 1.862645149230957e-09,
1304
+ "completion_length": 75.5,
1305
+ "epoch": 0.5403726708074534,
1306
+ "grad_norm": 5.654483795166016,
1307
+ "kl": 0.01123046875,
1308
+ "learning_rate": 4.596273291925465e-07,
1309
+ "loss": 0.0011,
1310
+ "reward": 1.796875,
1311
+ "reward_mean": 1.796875,
1312
+ "reward_std": 0.1530819982290268,
1313
+ "rewards/accuracy_reward": 0.796875,
1314
+ "rewards/format_reward": 1.0,
1315
+ "step": 87
1316
+ },
1317
+ {
1318
+ "advantages": -3.725290298461914e-09,
1319
+ "completion_length": 72.671875,
1320
+ "epoch": 0.546583850931677,
1321
+ "grad_norm": 2.2370052337646484,
1322
+ "kl": 0.0137939453125,
1323
+ "learning_rate": 4.53416149068323e-07,
1324
+ "loss": 0.0014,
1325
+ "reward": 1.46875,
1326
+ "reward_mean": 1.46875,
1327
+ "reward_std": 0.0883883461356163,
1328
+ "rewards/accuracy_reward": 0.46875,
1329
+ "rewards/format_reward": 1.0,
1330
+ "step": 88
1331
+ },
1332
+ {
1333
+ "advantages": 0.0,
1334
+ "completion_length": 81.84375,
1335
+ "epoch": 0.5527950310559007,
1336
+ "grad_norm": 1.389394760131836,
1337
+ "kl": 0.00836181640625,
1338
+ "learning_rate": 4.472049689440994e-07,
1339
+ "loss": 0.0008,
1340
+ "reward": 1.75,
1341
+ "reward_mean": 1.75,
1342
+ "reward_std": 0.06681530922651291,
1343
+ "rewards/accuracy_reward": 0.765625,
1344
+ "rewards/format_reward": 0.984375,
1345
+ "step": 89
1346
+ },
1347
+ {
1348
+ "advantages": 0.0,
1349
+ "completion_length": 74.59375,
1350
+ "epoch": 0.5590062111801242,
1351
+ "grad_norm": 2.353760242462158,
1352
+ "kl": 0.00811767578125,
1353
+ "learning_rate": 4.4099378881987576e-07,
1354
+ "loss": 0.0008,
1355
+ "reward": 1.6875,
1356
+ "reward_mean": 1.6875,
1357
+ "reward_std": 0.06681530922651291,
1358
+ "rewards/accuracy_reward": 0.6875,
1359
+ "rewards/format_reward": 1.0,
1360
+ "step": 90
1361
+ },
1362
+ {
1363
+ "advantages": 0.0,
1364
+ "completion_length": 85.28125,
1365
+ "epoch": 0.5652173913043478,
1366
+ "grad_norm": 1.5767848491668701,
1367
+ "kl": 0.009765625,
1368
+ "learning_rate": 4.3478260869565214e-07,
1369
+ "loss": 0.001,
1370
+ "reward": 1.75,
1371
+ "reward_mean": 1.75,
1372
+ "reward_std": 0.0,
1373
+ "rewards/accuracy_reward": 0.75,
1374
+ "rewards/format_reward": 1.0,
1375
+ "step": 91
1376
+ },
1377
+ {
1378
+ "advantages": -8.381903171539307e-09,
1379
+ "completion_length": 81.859375,
1380
+ "epoch": 0.5714285714285714,
1381
+ "grad_norm": 3.835320234298706,
1382
+ "kl": 0.0181884765625,
1383
+ "learning_rate": 4.285714285714285e-07,
1384
+ "loss": 0.0018,
1385
+ "reward": 1.671875,
1386
+ "reward_mean": 1.671875,
1387
+ "reward_std": 0.1530819982290268,
1388
+ "rewards/accuracy_reward": 0.6875,
1389
+ "rewards/format_reward": 0.984375,
1390
+ "step": 92
1391
+ },
1392
+ {
1393
+ "advantages": 3.725290298461914e-09,
1394
+ "completion_length": 83.671875,
1395
+ "epoch": 0.577639751552795,
1396
+ "grad_norm": 9.30271053314209,
1397
+ "kl": 0.017822265625,
1398
+ "learning_rate": 4.2236024844720495e-07,
1399
+ "loss": 0.0018,
1400
+ "reward": 1.796875,
1401
+ "reward_mean": 1.796875,
1402
+ "reward_std": 0.23144522309303284,
1403
+ "rewards/accuracy_reward": 0.796875,
1404
+ "rewards/format_reward": 1.0,
1405
+ "step": 93
1406
+ },
1407
+ {
1408
+ "advantages": -4.6566128730773926e-09,
1409
+ "completion_length": 77.53125,
1410
+ "epoch": 0.5838509316770186,
1411
+ "grad_norm": 6.170975685119629,
1412
+ "kl": 0.009521484375,
1413
+ "learning_rate": 4.161490683229814e-07,
1414
+ "loss": 0.001,
1415
+ "reward": 1.65625,
1416
+ "reward_mean": 1.65625,
1417
+ "reward_std": 0.16675157845020294,
1418
+ "rewards/accuracy_reward": 0.65625,
1419
+ "rewards/format_reward": 1.0,
1420
+ "step": 94
1421
+ },
1422
+ {
1423
+ "advantages": -1.862645149230957e-09,
1424
+ "completion_length": 85.640625,
1425
+ "epoch": 0.5900621118012422,
1426
+ "grad_norm": 4.217593669891357,
1427
+ "kl": 0.01409912109375,
1428
+ "learning_rate": 4.0993788819875776e-07,
1429
+ "loss": 0.0014,
1430
+ "reward": 1.734375,
1431
+ "reward_mean": 1.734375,
1432
+ "reward_std": 0.15992169082164764,
1433
+ "rewards/accuracy_reward": 0.75,
1434
+ "rewards/format_reward": 0.984375,
1435
+ "step": 95
1436
+ },
1437
+ {
1438
+ "advantages": 0.0,
1439
+ "completion_length": 77.296875,
1440
+ "epoch": 0.5962732919254659,
1441
+ "grad_norm": 6.138365268707275,
1442
+ "kl": 0.0106201171875,
1443
+ "learning_rate": 4.0372670807453413e-07,
1444
+ "loss": 0.0011,
1445
+ "reward": 1.375,
1446
+ "reward_mean": 1.375,
1447
+ "reward_std": 0.06681530922651291,
1448
+ "rewards/accuracy_reward": 0.390625,
1449
+ "rewards/format_reward": 0.984375,
1450
+ "step": 96
1451
+ },
1452
+ {
1453
+ "advantages": 1.862645149230957e-09,
1454
+ "completion_length": 76.484375,
1455
+ "epoch": 0.6024844720496895,
1456
+ "grad_norm": 1.2896429300308228,
1457
+ "kl": 0.00970458984375,
1458
+ "learning_rate": 3.975155279503105e-07,
1459
+ "loss": 0.001,
1460
+ "reward": 1.71875,
1461
+ "reward_mean": 1.71875,
1462
+ "reward_std": 0.0883883461356163,
1463
+ "rewards/accuracy_reward": 0.734375,
1464
+ "rewards/format_reward": 0.984375,
1465
+ "step": 97
1466
+ },
1467
+ {
1468
+ "advantages": -3.725290298461914e-09,
1469
+ "completion_length": 81.71875,
1470
+ "epoch": 0.6086956521739131,
1471
+ "grad_norm": 6.941093444824219,
1472
+ "kl": 0.01165771484375,
1473
+ "learning_rate": 3.9130434782608694e-07,
1474
+ "loss": 0.0012,
1475
+ "reward": 1.71875,
1476
+ "reward_mean": 1.71875,
1477
+ "reward_std": 0.0578637570142746,
1478
+ "rewards/accuracy_reward": 0.71875,
1479
+ "rewards/format_reward": 1.0,
1480
+ "step": 98
1481
+ },
1482
+ {
1483
+ "advantages": -3.725290298461914e-09,
1484
+ "completion_length": 81.390625,
1485
+ "epoch": 0.6149068322981367,
1486
+ "grad_norm": 3.163457155227661,
1487
+ "kl": 0.00787353515625,
1488
+ "learning_rate": 3.850931677018633e-07,
1489
+ "loss": 0.0008,
1490
+ "reward": 1.96875,
1491
+ "reward_mean": 1.96875,
1492
+ "reward_std": 0.0883883461356163,
1493
+ "rewards/accuracy_reward": 0.96875,
1494
+ "rewards/format_reward": 1.0,
1495
+ "step": 99
1496
+ },
1497
+ {
1498
+ "advantages": 4.6566128730773926e-09,
1499
+ "completion_length": 79.1875,
1500
+ "epoch": 0.6211180124223602,
1501
+ "grad_norm": 4.2669830322265625,
1502
+ "kl": 0.0108642578125,
1503
+ "learning_rate": 3.7888198757763975e-07,
1504
+ "loss": 0.0011,
1505
+ "reward": 1.671875,
1506
+ "reward_mean": 1.671875,
1507
+ "reward_std": 0.0646936446428299,
1508
+ "rewards/accuracy_reward": 0.671875,
1509
+ "rewards/format_reward": 1.0,
1510
+ "step": 100
1511
+ },
1512
+ {
1513
+ "advantages": -1.862645149230957e-09,
1514
+ "completion_length": 77.578125,
1515
+ "epoch": 0.6273291925465838,
1516
+ "grad_norm": 6.153615474700928,
1517
+ "kl": 0.0101318359375,
1518
+ "learning_rate": 3.7267080745341613e-07,
1519
+ "loss": 0.001,
1520
+ "reward": 1.359375,
1521
+ "reward_mean": 1.359375,
1522
+ "reward_std": 0.04419417306780815,
1523
+ "rewards/accuracy_reward": 0.359375,
1524
+ "rewards/format_reward": 1.0,
1525
+ "step": 101
1526
+ },
1527
+ {
1528
+ "advantages": -1.862645149230957e-09,
1529
+ "completion_length": 81.53125,
1530
+ "epoch": 0.6335403726708074,
1531
+ "grad_norm": 4.077609539031982,
1532
+ "kl": 0.0181884765625,
1533
+ "learning_rate": 3.6645962732919256e-07,
1534
+ "loss": 0.0018,
1535
+ "reward": 1.84375,
1536
+ "reward_mean": 1.84375,
1537
+ "reward_std": 0.2177756428718567,
1538
+ "rewards/accuracy_reward": 0.84375,
1539
+ "rewards/format_reward": 1.0,
1540
+ "step": 102
1541
+ },
1542
+ {
1543
+ "advantages": -3.725290298461914e-09,
1544
+ "completion_length": 80.375,
1545
+ "epoch": 0.639751552795031,
1546
+ "grad_norm": 3.084027051925659,
1547
+ "kl": 0.01007080078125,
1548
+ "learning_rate": 3.6024844720496894e-07,
1549
+ "loss": 0.001,
1550
+ "reward": 1.53125,
1551
+ "reward_mean": 1.53125,
1552
+ "reward_std": 0.1462521106004715,
1553
+ "rewards/accuracy_reward": 0.53125,
1554
+ "rewards/format_reward": 1.0,
1555
+ "step": 103
1556
+ },
1557
+ {
1558
+ "advantages": -3.725290298461914e-09,
1559
+ "completion_length": 83.953125,
1560
+ "epoch": 0.6459627329192547,
1561
+ "grad_norm": 2.0512335300445557,
1562
+ "kl": 0.007476806640625,
1563
+ "learning_rate": 3.540372670807453e-07,
1564
+ "loss": 0.0007,
1565
+ "reward": 1.453125,
1566
+ "reward_mean": 1.453125,
1567
+ "reward_std": 0.0646936446428299,
1568
+ "rewards/accuracy_reward": 0.453125,
1569
+ "rewards/format_reward": 1.0,
1570
+ "step": 104
1571
+ },
1572
+ {
1573
+ "advantages": 0.0,
1574
+ "completion_length": 82.5625,
1575
+ "epoch": 0.6521739130434783,
1576
+ "grad_norm": 0.5302374362945557,
1577
+ "kl": 0.00982666015625,
1578
+ "learning_rate": 3.478260869565217e-07,
1579
+ "loss": 0.001,
1580
+ "reward": 1.75,
1581
+ "reward_mean": 1.75,
1582
+ "reward_std": 0.0,
1583
+ "rewards/accuracy_reward": 0.75,
1584
+ "rewards/format_reward": 1.0,
1585
+ "step": 105
1586
+ },
1587
+ {
1588
+ "advantages": -4.6566128730773926e-09,
1589
+ "completion_length": 75.953125,
1590
+ "epoch": 0.6583850931677019,
1591
+ "grad_norm": 6.2678751945495605,
1592
+ "kl": 0.0111083984375,
1593
+ "learning_rate": 3.416149068322981e-07,
1594
+ "loss": 0.0011,
1595
+ "reward": 1.890625,
1596
+ "reward_mean": 1.890625,
1597
+ "reward_std": 0.1315089464187622,
1598
+ "rewards/accuracy_reward": 0.890625,
1599
+ "rewards/format_reward": 1.0,
1600
+ "step": 106
1601
+ },
1602
+ {
1603
+ "advantages": -3.725290298461914e-09,
1604
+ "completion_length": 78.328125,
1605
+ "epoch": 0.6645962732919255,
1606
+ "grad_norm": 1.7859537601470947,
1607
+ "kl": 0.00946044921875,
1608
+ "learning_rate": 3.3540372670807456e-07,
1609
+ "loss": 0.0009,
1610
+ "reward": 1.71875,
1611
+ "reward_mean": 1.71875,
1612
+ "reward_std": 0.0883883461356163,
1613
+ "rewards/accuracy_reward": 0.71875,
1614
+ "rewards/format_reward": 1.0,
1615
+ "step": 107
1616
+ },
1617
+ {
1618
+ "advantages": -3.725290298461914e-09,
1619
+ "completion_length": 91.8125,
1620
+ "epoch": 0.6708074534161491,
1621
+ "grad_norm": 2.7167623043060303,
1622
+ "kl": 0.0081787109375,
1623
+ "learning_rate": 3.2919254658385094e-07,
1624
+ "loss": 0.0008,
1625
+ "reward": 1.65625,
1626
+ "reward_mean": 1.65625,
1627
+ "reward_std": 0.0883883461356163,
1628
+ "rewards/accuracy_reward": 0.671875,
1629
+ "rewards/format_reward": 0.984375,
1630
+ "step": 108
1631
+ },
1632
+ {
1633
+ "advantages": 0.0,
1634
+ "completion_length": 76.984375,
1635
+ "epoch": 0.6770186335403726,
1636
+ "grad_norm": 5.3628058433532715,
1637
+ "kl": 0.009033203125,
1638
+ "learning_rate": 3.229813664596273e-07,
1639
+ "loss": 0.0009,
1640
+ "reward": 1.515625,
1641
+ "reward_mean": 1.515625,
1642
+ "reward_std": 0.19044628739356995,
1643
+ "rewards/accuracy_reward": 0.515625,
1644
+ "rewards/format_reward": 1.0,
1645
+ "step": 109
1646
+ },
1647
+ {
1648
+ "advantages": -1.862645149230957e-09,
1649
+ "completion_length": 73.5,
1650
+ "epoch": 0.6832298136645962,
1651
+ "grad_norm": 3.2727582454681396,
1652
+ "kl": 0.0108642578125,
1653
+ "learning_rate": 3.167701863354037e-07,
1654
+ "loss": 0.0011,
1655
+ "reward": 1.609375,
1656
+ "reward_mean": 1.609375,
1657
+ "reward_std": 0.04419417306780815,
1658
+ "rewards/accuracy_reward": 0.609375,
1659
+ "rewards/format_reward": 1.0,
1660
+ "step": 110
1661
+ },
1662
+ {
1663
+ "advantages": 0.0,
1664
+ "completion_length": 75.75,
1665
+ "epoch": 0.6894409937888198,
1666
+ "grad_norm": 11.552366256713867,
1667
+ "kl": 0.0145263671875,
1668
+ "learning_rate": 3.105590062111801e-07,
1669
+ "loss": 0.0015,
1670
+ "reward": 1.75,
1671
+ "reward_mean": 1.75,
1672
+ "reward_std": 0.1157275140285492,
1673
+ "rewards/accuracy_reward": 0.75,
1674
+ "rewards/format_reward": 1.0,
1675
+ "step": 111
1676
+ },
1677
+ {
1678
+ "advantages": 3.725290298461914e-09,
1679
+ "completion_length": 85.734375,
1680
+ "epoch": 0.6956521739130435,
1681
+ "grad_norm": 6.025736331939697,
1682
+ "kl": 0.01556396484375,
1683
+ "learning_rate": 3.043478260869565e-07,
1684
+ "loss": 0.0016,
1685
+ "reward": 1.59375,
1686
+ "reward_mean": 1.59375,
1687
+ "reward_std": 0.1552036553621292,
1688
+ "rewards/accuracy_reward": 0.59375,
1689
+ "rewards/format_reward": 1.0,
1690
+ "step": 112
1691
+ },
1692
+ {
1693
+ "advantages": -3.725290298461914e-09,
1694
+ "completion_length": 82.734375,
1695
+ "epoch": 0.7018633540372671,
1696
+ "grad_norm": 15.336418151855469,
1697
+ "kl": 0.057373046875,
1698
+ "learning_rate": 2.9813664596273294e-07,
1699
+ "loss": 0.0057,
1700
+ "reward": 1.84375,
1701
+ "reward_mean": 1.84375,
1702
+ "reward_std": 0.0883883461356163,
1703
+ "rewards/accuracy_reward": 0.84375,
1704
+ "rewards/format_reward": 1.0,
1705
+ "step": 113
1706
+ },
1707
+ {
1708
+ "advantages": -3.725290298461914e-09,
1709
+ "completion_length": 78.953125,
1710
+ "epoch": 0.7080745341614907,
1711
+ "grad_norm": 65.76184844970703,
1712
+ "kl": 0.01385498046875,
1713
+ "learning_rate": 2.919254658385093e-07,
1714
+ "loss": 0.0014,
1715
+ "reward": 1.90625,
1716
+ "reward_mean": 1.90625,
1717
+ "reward_std": 0.1552036553621292,
1718
+ "rewards/accuracy_reward": 0.90625,
1719
+ "rewards/format_reward": 1.0,
1720
+ "step": 114
1721
+ },
1722
+ {
1723
+ "advantages": -1.862645149230957e-09,
1724
+ "completion_length": 79.765625,
1725
+ "epoch": 0.7142857142857143,
1726
+ "grad_norm": 3.660456657409668,
1727
+ "kl": 0.0194091796875,
1728
+ "learning_rate": 2.857142857142857e-07,
1729
+ "loss": 0.0019,
1730
+ "reward": 1.59375,
1731
+ "reward_mean": 1.59375,
1732
+ "reward_std": 0.10888782143592834,
1733
+ "rewards/accuracy_reward": 0.59375,
1734
+ "rewards/format_reward": 1.0,
1735
+ "step": 115
1736
+ },
1737
+ {
1738
+ "advantages": -5.587935447692871e-09,
1739
+ "completion_length": 76.34375,
1740
+ "epoch": 0.7204968944099379,
1741
+ "grad_norm": 4.989613056182861,
1742
+ "kl": 0.00787353515625,
1743
+ "learning_rate": 2.7950310559006207e-07,
1744
+ "loss": 0.0008,
1745
+ "reward": 1.828125,
1746
+ "reward_mean": 1.828125,
1747
+ "reward_std": 0.13258251547813416,
1748
+ "rewards/accuracy_reward": 0.84375,
1749
+ "rewards/format_reward": 0.984375,
1750
+ "step": 116
1751
+ },
1752
+ {
1753
+ "advantages": -9.313225746154785e-10,
1754
+ "completion_length": 77.859375,
1755
+ "epoch": 0.7267080745341615,
1756
+ "grad_norm": 2.4932050704956055,
1757
+ "kl": 0.0081787109375,
1758
+ "learning_rate": 2.732919254658385e-07,
1759
+ "loss": 0.0008,
1760
+ "reward": 1.859375,
1761
+ "reward_mean": 1.859375,
1762
+ "reward_std": 0.12255740165710449,
1763
+ "rewards/accuracy_reward": 0.859375,
1764
+ "rewards/format_reward": 1.0,
1765
+ "step": 117
1766
+ },
1767
+ {
1768
+ "advantages": 1.862645149230957e-09,
1769
+ "completion_length": 84.5,
1770
+ "epoch": 0.7329192546583851,
1771
+ "grad_norm": 5.0420732498168945,
1772
+ "kl": 0.01226806640625,
1773
+ "learning_rate": 2.670807453416149e-07,
1774
+ "loss": 0.0012,
1775
+ "reward": 1.640625,
1776
+ "reward_mean": 1.640625,
1777
+ "reward_std": 0.23144522309303284,
1778
+ "rewards/accuracy_reward": 0.640625,
1779
+ "rewards/format_reward": 1.0,
1780
+ "step": 118
1781
+ },
1782
+ {
1783
+ "advantages": -1.862645149230957e-09,
1784
+ "completion_length": 79.703125,
1785
+ "epoch": 0.7391304347826086,
1786
+ "grad_norm": 3.599855899810791,
1787
+ "kl": 0.0086669921875,
1788
+ "learning_rate": 2.6086956521739126e-07,
1789
+ "loss": 0.0009,
1790
+ "reward": 1.484375,
1791
+ "reward_mean": 1.484375,
1792
+ "reward_std": 0.13258251547813416,
1793
+ "rewards/accuracy_reward": 0.484375,
1794
+ "rewards/format_reward": 1.0,
1795
+ "step": 119
1796
+ },
1797
+ {
1798
+ "advantages": -5.587935447692871e-09,
1799
+ "completion_length": 79.1875,
1800
+ "epoch": 0.7453416149068323,
1801
+ "grad_norm": 3.320706605911255,
1802
+ "kl": 0.0133056640625,
1803
+ "learning_rate": 2.546583850931677e-07,
1804
+ "loss": 0.0013,
1805
+ "reward": 1.828125,
1806
+ "reward_mean": 1.828125,
1807
+ "reward_std": 0.10205793380737305,
1808
+ "rewards/accuracy_reward": 0.828125,
1809
+ "rewards/format_reward": 1.0,
1810
+ "step": 120
1811
+ },
1812
+ {
1813
+ "advantages": -8.381903171539307e-09,
1814
+ "completion_length": 95.203125,
1815
+ "epoch": 0.7515527950310559,
1816
+ "grad_norm": 2.8366851806640625,
1817
+ "kl": 0.0064697265625,
1818
+ "learning_rate": 2.4844720496894407e-07,
1819
+ "loss": 0.0006,
1820
+ "reward": 1.671875,
1821
+ "reward_mean": 1.671875,
1822
+ "reward_std": 0.1530819833278656,
1823
+ "rewards/accuracy_reward": 0.671875,
1824
+ "rewards/format_reward": 1.0,
1825
+ "step": 121
1826
+ },
1827
+ {
1828
+ "advantages": 1.862645149230957e-09,
1829
+ "completion_length": 78.765625,
1830
+ "epoch": 0.7577639751552795,
1831
+ "grad_norm": 3.376732587814331,
1832
+ "kl": 0.00860595703125,
1833
+ "learning_rate": 2.422360248447205e-07,
1834
+ "loss": 0.0009,
1835
+ "reward": 1.640625,
1836
+ "reward_mean": 1.640625,
1837
+ "reward_std": 0.10205793380737305,
1838
+ "rewards/accuracy_reward": 0.640625,
1839
+ "rewards/format_reward": 1.0,
1840
+ "step": 122
1841
+ },
1842
+ {
1843
+ "advantages": 1.862645149230957e-09,
1844
+ "completion_length": 79.453125,
1845
+ "epoch": 0.7639751552795031,
1846
+ "grad_norm": 3.5682129859924316,
1847
+ "kl": 0.018798828125,
1848
+ "learning_rate": 2.3602484472049688e-07,
1849
+ "loss": 0.0019,
1850
+ "reward": 1.671875,
1851
+ "reward_mean": 1.671875,
1852
+ "reward_std": 0.1804211586713791,
1853
+ "rewards/accuracy_reward": 0.671875,
1854
+ "rewards/format_reward": 1.0,
1855
+ "step": 123
1856
+ },
1857
+ {
1858
+ "advantages": 4.6566128730773926e-09,
1859
+ "completion_length": 74.96875,
1860
+ "epoch": 0.7701863354037267,
1861
+ "grad_norm": 2.6698434352874756,
1862
+ "kl": 0.006256103515625,
1863
+ "learning_rate": 2.2981366459627326e-07,
1864
+ "loss": 0.0006,
1865
+ "reward": 1.546875,
1866
+ "reward_mean": 1.546875,
1867
+ "reward_std": 0.0646936446428299,
1868
+ "rewards/accuracy_reward": 0.546875,
1869
+ "rewards/format_reward": 1.0,
1870
+ "step": 124
1871
+ },
1872
+ {
1873
+ "advantages": 0.0,
1874
+ "completion_length": 78.421875,
1875
+ "epoch": 0.7763975155279503,
1876
+ "grad_norm": 3.1063811779022217,
1877
+ "kl": 0.01214599609375,
1878
+ "learning_rate": 2.236024844720497e-07,
1879
+ "loss": 0.0012,
1880
+ "reward": 1.765625,
1881
+ "reward_mean": 1.765625,
1882
+ "reward_std": 0.12255740165710449,
1883
+ "rewards/accuracy_reward": 0.765625,
1884
+ "rewards/format_reward": 1.0,
1885
+ "step": 125
1886
+ },
1887
+ {
1888
+ "advantages": 1.862645149230957e-09,
1889
+ "completion_length": 86.703125,
1890
+ "epoch": 0.782608695652174,
1891
+ "grad_norm": 2.7392446994781494,
1892
+ "kl": 0.00634765625,
1893
+ "learning_rate": 2.1739130434782607e-07,
1894
+ "loss": 0.0006,
1895
+ "reward": 1.640625,
1896
+ "reward_mean": 1.640625,
1897
+ "reward_std": 0.04419417306780815,
1898
+ "rewards/accuracy_reward": 0.640625,
1899
+ "rewards/format_reward": 1.0,
1900
+ "step": 126
1901
+ },
1902
+ {
1903
+ "advantages": 7.450580596923828e-09,
1904
+ "completion_length": 83.546875,
1905
+ "epoch": 0.7888198757763976,
1906
+ "grad_norm": 9.345684051513672,
1907
+ "kl": 0.008056640625,
1908
+ "learning_rate": 2.1118012422360247e-07,
1909
+ "loss": 0.0008,
1910
+ "reward": 1.296875,
1911
+ "reward_mean": 1.296875,
1912
+ "reward_std": 0.19044628739356995,
1913
+ "rewards/accuracy_reward": 0.296875,
1914
+ "rewards/format_reward": 1.0,
1915
+ "step": 127
1916
+ },
1917
+ {
1918
+ "advantages": 0.0,
1919
+ "completion_length": 85.84375,
1920
+ "epoch": 0.7950310559006211,
1921
+ "grad_norm": 0.22835175693035126,
1922
+ "kl": 0.0084228515625,
1923
+ "learning_rate": 2.0496894409937888e-07,
1924
+ "loss": 0.0008,
1925
+ "reward": 1.75,
1926
+ "reward_mean": 1.75,
1927
+ "reward_std": 0.0,
1928
+ "rewards/accuracy_reward": 0.75,
1929
+ "rewards/format_reward": 1.0,
1930
+ "step": 128
1931
+ },
1932
+ {
1933
+ "advantages": 0.0,
1934
+ "completion_length": 81.828125,
1935
+ "epoch": 0.8012422360248447,
1936
+ "grad_norm": 2.44989275932312,
1937
+ "kl": 0.007171630859375,
1938
+ "learning_rate": 1.9875776397515526e-07,
1939
+ "loss": 0.0007,
1940
+ "reward": 1.5,
1941
+ "reward_mean": 1.5,
1942
+ "reward_std": 0.0883883461356163,
1943
+ "rewards/accuracy_reward": 0.5,
1944
+ "rewards/format_reward": 1.0,
1945
+ "step": 129
1946
+ },
1947
+ {
1948
+ "advantages": 9.313225746154785e-10,
1949
+ "completion_length": 83.296875,
1950
+ "epoch": 0.8074534161490683,
1951
+ "grad_norm": 26.60379409790039,
1952
+ "kl": 0.01031494140625,
1953
+ "learning_rate": 1.9254658385093166e-07,
1954
+ "loss": 0.001,
1955
+ "reward": 1.640625,
1956
+ "reward_mean": 1.640625,
1957
+ "reward_std": 0.1530819982290268,
1958
+ "rewards/accuracy_reward": 0.640625,
1959
+ "rewards/format_reward": 1.0,
1960
+ "step": 130
1961
+ },
1962
+ {
1963
+ "advantages": -3.725290298461914e-09,
1964
+ "completion_length": 75.296875,
1965
+ "epoch": 0.8136645962732919,
1966
+ "grad_norm": 2.649775981903076,
1967
+ "kl": 0.00787353515625,
1968
+ "learning_rate": 1.8633540372670807e-07,
1969
+ "loss": 0.0008,
1970
+ "reward": 1.71875,
1971
+ "reward_mean": 1.71875,
1972
+ "reward_std": 0.0883883461356163,
1973
+ "rewards/accuracy_reward": 0.71875,
1974
+ "rewards/format_reward": 1.0,
1975
+ "step": 131
1976
+ },
1977
+ {
1978
+ "advantages": -7.450580596923828e-09,
1979
+ "completion_length": 72.921875,
1980
+ "epoch": 0.8198757763975155,
1981
+ "grad_norm": 6.021523952484131,
1982
+ "kl": 0.017333984375,
1983
+ "learning_rate": 1.8012422360248447e-07,
1984
+ "loss": 0.0017,
1985
+ "reward": 1.546875,
1986
+ "reward_mean": 1.546875,
1987
+ "reward_std": 0.17358146607875824,
1988
+ "rewards/accuracy_reward": 0.546875,
1989
+ "rewards/format_reward": 1.0,
1990
+ "step": 132
1991
+ },
1992
+ {
1993
+ "advantages": 1.862645149230957e-09,
1994
+ "completion_length": 80.203125,
1995
+ "epoch": 0.8260869565217391,
1996
+ "grad_norm": 5.850553035736084,
1997
+ "kl": 0.0118408203125,
1998
+ "learning_rate": 1.7391304347826085e-07,
1999
+ "loss": 0.0012,
2000
+ "reward": 1.671875,
2001
+ "reward_mean": 1.671875,
2002
+ "reward_std": 0.25726157426834106,
2003
+ "rewards/accuracy_reward": 0.671875,
2004
+ "rewards/format_reward": 1.0,
2005
+ "step": 133
2006
+ },
2007
+ {
2008
+ "advantages": -5.587935447692871e-09,
2009
+ "completion_length": 90.234375,
2010
+ "epoch": 0.8322981366459627,
2011
+ "grad_norm": 9.700899124145508,
2012
+ "kl": 0.01422119140625,
2013
+ "learning_rate": 1.6770186335403728e-07,
2014
+ "loss": 0.0014,
2015
+ "reward": 1.78125,
2016
+ "reward_mean": 1.78125,
2017
+ "reward_std": 0.2651650309562683,
2018
+ "rewards/accuracy_reward": 0.828125,
2019
+ "rewards/format_reward": 0.953125,
2020
+ "step": 134
2021
+ },
2022
+ {
2023
+ "advantages": 3.725290298461914e-09,
2024
+ "completion_length": 81.90625,
2025
+ "epoch": 0.8385093167701864,
2026
+ "grad_norm": 2.9975473880767822,
2027
+ "kl": 0.00909423828125,
2028
+ "learning_rate": 1.6149068322981366e-07,
2029
+ "loss": 0.0009,
2030
+ "reward": 1.6875,
2031
+ "reward_mean": 1.6875,
2032
+ "reward_std": 0.1552036553621292,
2033
+ "rewards/accuracy_reward": 0.6875,
2034
+ "rewards/format_reward": 1.0,
2035
+ "step": 135
2036
+ },
2037
+ {
2038
+ "advantages": -3.725290298461914e-09,
2039
+ "completion_length": 79.3125,
2040
+ "epoch": 0.84472049689441,
2041
+ "grad_norm": 4.324582099914551,
2042
+ "kl": 0.01165771484375,
2043
+ "learning_rate": 1.5527950310559004e-07,
2044
+ "loss": 0.0012,
2045
+ "reward": 1.84375,
2046
+ "reward_mean": 1.84375,
2047
+ "reward_std": 0.2177756428718567,
2048
+ "rewards/accuracy_reward": 0.859375,
2049
+ "rewards/format_reward": 0.984375,
2050
+ "step": 136
2051
+ },
2052
+ {
2053
+ "advantages": 7.450580596923828e-09,
2054
+ "completion_length": 80.265625,
2055
+ "epoch": 0.8509316770186336,
2056
+ "grad_norm": 3.8911736011505127,
2057
+ "kl": 0.009521484375,
2058
+ "learning_rate": 1.4906832298136647e-07,
2059
+ "loss": 0.001,
2060
+ "reward": 1.5625,
2061
+ "reward_mean": 1.5625,
2062
+ "reward_std": 0.1552036553621292,
2063
+ "rewards/accuracy_reward": 0.5625,
2064
+ "rewards/format_reward": 1.0,
2065
+ "step": 137
2066
+ },
2067
+ {
2068
+ "advantages": 0.0,
2069
+ "completion_length": 78.40625,
2070
+ "epoch": 0.8571428571428571,
2071
+ "grad_norm": 2.864941120147705,
2072
+ "kl": 0.01129150390625,
2073
+ "learning_rate": 1.4285714285714285e-07,
2074
+ "loss": 0.0011,
2075
+ "reward": 1.765625,
2076
+ "reward_mean": 1.765625,
2077
+ "reward_std": 0.10205793380737305,
2078
+ "rewards/accuracy_reward": 0.765625,
2079
+ "rewards/format_reward": 1.0,
2080
+ "step": 138
2081
+ },
2082
+ {
2083
+ "advantages": 3.725290298461914e-09,
2084
+ "completion_length": 80.21875,
2085
+ "epoch": 0.8633540372670807,
2086
+ "grad_norm": 5.788990497589111,
2087
+ "kl": 0.00799560546875,
2088
+ "learning_rate": 1.3664596273291925e-07,
2089
+ "loss": 0.0008,
2090
+ "reward": 1.5625,
2091
+ "reward_mean": 1.5625,
2092
+ "reward_std": 0.1552036553621292,
2093
+ "rewards/accuracy_reward": 0.5625,
2094
+ "rewards/format_reward": 1.0,
2095
+ "step": 139
2096
+ },
2097
+ {
2098
+ "advantages": -1.862645149230957e-09,
2099
+ "completion_length": 92.90625,
2100
+ "epoch": 0.8695652173913043,
2101
+ "grad_norm": 4.130926609039307,
2102
+ "kl": 0.0111083984375,
2103
+ "learning_rate": 1.3043478260869563e-07,
2104
+ "loss": 0.0011,
2105
+ "reward": 1.859375,
2106
+ "reward_mean": 1.859375,
2107
+ "reward_std": 0.2198973000049591,
2108
+ "rewards/accuracy_reward": 0.859375,
2109
+ "rewards/format_reward": 1.0,
2110
+ "step": 140
2111
+ },
2112
+ {
2113
+ "advantages": -3.725290298461914e-09,
2114
+ "completion_length": 79.109375,
2115
+ "epoch": 0.8757763975155279,
2116
+ "grad_norm": 3.025212287902832,
2117
+ "kl": 0.01324462890625,
2118
+ "learning_rate": 1.2422360248447204e-07,
2119
+ "loss": 0.0013,
2120
+ "reward": 1.46875,
2121
+ "reward_mean": 1.46875,
2122
+ "reward_std": 0.1767766922712326,
2123
+ "rewards/accuracy_reward": 0.484375,
2124
+ "rewards/format_reward": 0.984375,
2125
+ "step": 141
2126
+ },
2127
+ {
2128
+ "advantages": 3.725290298461914e-09,
2129
+ "completion_length": 78.25,
2130
+ "epoch": 0.8819875776397516,
2131
+ "grad_norm": 6.828762531280518,
2132
+ "kl": 0.01177978515625,
2133
+ "learning_rate": 1.1801242236024844e-07,
2134
+ "loss": 0.0012,
2135
+ "reward": 1.40625,
2136
+ "reward_mean": 1.40625,
2137
+ "reward_std": 0.1462520956993103,
2138
+ "rewards/accuracy_reward": 0.40625,
2139
+ "rewards/format_reward": 1.0,
2140
+ "step": 142
2141
+ },
2142
+ {
2143
+ "advantages": 0.0,
2144
+ "completion_length": 73.625,
2145
+ "epoch": 0.8881987577639752,
2146
+ "grad_norm": 3.4486515522003174,
2147
+ "kl": 0.00762939453125,
2148
+ "learning_rate": 1.1180124223602484e-07,
2149
+ "loss": 0.0008,
2150
+ "reward": 1.65625,
2151
+ "reward_mean": 1.65625,
2152
+ "reward_std": 0.1552036553621292,
2153
+ "rewards/accuracy_reward": 0.65625,
2154
+ "rewards/format_reward": 1.0,
2155
+ "step": 143
2156
+ },
2157
+ {
2158
+ "advantages": -3.725290298461914e-09,
2159
+ "completion_length": 80.359375,
2160
+ "epoch": 0.8944099378881988,
2161
+ "grad_norm": 8.272978782653809,
2162
+ "kl": 0.00628662109375,
2163
+ "learning_rate": 1.0559006211180124e-07,
2164
+ "loss": 0.0006,
2165
+ "reward": 1.71875,
2166
+ "reward_mean": 1.71875,
2167
+ "reward_std": 0.0883883461356163,
2168
+ "rewards/accuracy_reward": 0.734375,
2169
+ "rewards/format_reward": 0.984375,
2170
+ "step": 144
2171
+ },
2172
+ {
2173
+ "advantages": -7.450580596923828e-09,
2174
+ "completion_length": 81.1875,
2175
+ "epoch": 0.9006211180124224,
2176
+ "grad_norm": 4.848587512969971,
2177
+ "kl": 0.017578125,
2178
+ "learning_rate": 9.937888198757763e-08,
2179
+ "loss": 0.0018,
2180
+ "reward": 1.5625,
2181
+ "reward_mean": 1.5625,
2182
+ "reward_std": 0.2041158676147461,
2183
+ "rewards/accuracy_reward": 0.5625,
2184
+ "rewards/format_reward": 1.0,
2185
+ "step": 145
2186
+ },
2187
+ {
2188
+ "advantages": 0.0,
2189
+ "completion_length": 79.984375,
2190
+ "epoch": 0.906832298136646,
2191
+ "grad_norm": 0.3604845702648163,
2192
+ "kl": 0.00958251953125,
2193
+ "learning_rate": 9.316770186335403e-08,
2194
+ "loss": 0.001,
2195
+ "reward": 1.75,
2196
+ "reward_mean": 1.75,
2197
+ "reward_std": 0.0,
2198
+ "rewards/accuracy_reward": 0.75,
2199
+ "rewards/format_reward": 1.0,
2200
+ "step": 146
2201
+ },
2202
+ {
2203
+ "advantages": 5.587935447692871e-09,
2204
+ "completion_length": 76.5625,
2205
+ "epoch": 0.9130434782608695,
2206
+ "grad_norm": 10.680438995361328,
2207
+ "kl": 0.01116943359375,
2208
+ "learning_rate": 8.695652173913042e-08,
2209
+ "loss": 0.0011,
2210
+ "reward": 1.671875,
2211
+ "reward_mean": 1.671875,
2212
+ "reward_std": 0.19939783215522766,
2213
+ "rewards/accuracy_reward": 0.671875,
2214
+ "rewards/format_reward": 1.0,
2215
+ "step": 147
2216
+ },
2217
+ {
2218
+ "advantages": -3.725290298461914e-09,
2219
+ "completion_length": 76.984375,
2220
+ "epoch": 0.9192546583850931,
2221
+ "grad_norm": 2.091907024383545,
2222
+ "kl": 0.010498046875,
2223
+ "learning_rate": 8.074534161490683e-08,
2224
+ "loss": 0.0011,
2225
+ "reward": 1.640625,
2226
+ "reward_mean": 1.640625,
2227
+ "reward_std": 0.08010874688625336,
2228
+ "rewards/accuracy_reward": 0.65625,
2229
+ "rewards/format_reward": 0.984375,
2230
+ "step": 148
2231
+ },
2232
+ {
2233
+ "advantages": 0.0,
2234
+ "completion_length": 76.203125,
2235
+ "epoch": 0.9254658385093167,
2236
+ "grad_norm": 0.20045147836208344,
2237
+ "kl": 0.0078125,
2238
+ "learning_rate": 7.453416149068323e-08,
2239
+ "loss": 0.0008,
2240
+ "reward": 1.75,
2241
+ "reward_mean": 1.75,
2242
+ "reward_std": 0.0,
2243
+ "rewards/accuracy_reward": 0.75,
2244
+ "rewards/format_reward": 1.0,
2245
+ "step": 149
2246
+ },
2247
+ {
2248
+ "advantages": 0.0,
2249
+ "completion_length": 84.140625,
2250
+ "epoch": 0.9316770186335404,
2251
+ "grad_norm": 3.21720814704895,
2252
+ "kl": 0.008544921875,
2253
+ "learning_rate": 6.832298136645963e-08,
2254
+ "loss": 0.0009,
2255
+ "reward": 1.6875,
2256
+ "reward_mean": 1.6875,
2257
+ "reward_std": 0.06681530922651291,
2258
+ "rewards/accuracy_reward": 0.6875,
2259
+ "rewards/format_reward": 1.0,
2260
+ "step": 150
2261
+ },
2262
+ {
2263
+ "advantages": -1.862645149230957e-09,
2264
+ "completion_length": 82.734375,
2265
+ "epoch": 0.937888198757764,
2266
+ "grad_norm": 7.955801963806152,
2267
+ "kl": 0.01263427734375,
2268
+ "learning_rate": 6.211180124223602e-08,
2269
+ "loss": 0.0013,
2270
+ "reward": 1.734375,
2271
+ "reward_mean": 1.734375,
2272
+ "reward_std": 0.10205793380737305,
2273
+ "rewards/accuracy_reward": 0.734375,
2274
+ "rewards/format_reward": 1.0,
2275
+ "step": 151
2276
+ },
2277
+ {
2278
+ "advantages": 3.725290298461914e-09,
2279
+ "completion_length": 72.96875,
2280
+ "epoch": 0.9440993788819876,
2281
+ "grad_norm": 3.563530921936035,
2282
+ "kl": 0.0093994140625,
2283
+ "learning_rate": 5.590062111801242e-08,
2284
+ "loss": 0.0009,
2285
+ "reward": 1.90625,
2286
+ "reward_mean": 1.90625,
2287
+ "reward_std": 0.0578637570142746,
2288
+ "rewards/accuracy_reward": 0.90625,
2289
+ "rewards/format_reward": 1.0,
2290
+ "step": 152
2291
+ },
2292
+ {
2293
+ "advantages": -9.313225746154785e-09,
2294
+ "completion_length": 83.125,
2295
+ "epoch": 0.9503105590062112,
2296
+ "grad_norm": 9.811988830566406,
2297
+ "kl": 0.0184326171875,
2298
+ "learning_rate": 4.9689440993788814e-08,
2299
+ "loss": 0.0019,
2300
+ "reward": 1.796875,
2301
+ "reward_mean": 1.796875,
2302
+ "reward_std": 0.15992169082164764,
2303
+ "rewards/accuracy_reward": 0.796875,
2304
+ "rewards/format_reward": 1.0,
2305
+ "step": 153
2306
+ },
2307
+ {
2308
+ "advantages": -1.0244548320770264e-08,
2309
+ "completion_length": 83.515625,
2310
+ "epoch": 0.9565217391304348,
2311
+ "grad_norm": 3.8269639015197754,
2312
+ "kl": 0.0133056640625,
2313
+ "learning_rate": 4.347826086956521e-08,
2314
+ "loss": 0.0013,
2315
+ "reward": 1.78125,
2316
+ "reward_mean": 1.78125,
2317
+ "reward_std": 0.16675157845020294,
2318
+ "rewards/accuracy_reward": 0.78125,
2319
+ "rewards/format_reward": 1.0,
2320
+ "step": 154
2321
+ },
2322
+ {
2323
+ "advantages": -7.450580596923828e-09,
2324
+ "completion_length": 85.671875,
2325
+ "epoch": 0.9627329192546584,
2326
+ "grad_norm": 3.470165252685547,
2327
+ "kl": 0.01300048828125,
2328
+ "learning_rate": 3.726708074534162e-08,
2329
+ "loss": 0.0013,
2330
+ "reward": 1.5625,
2331
+ "reward_mean": 1.5625,
2332
+ "reward_std": 0.1462520956993103,
2333
+ "rewards/accuracy_reward": 0.5625,
2334
+ "rewards/format_reward": 1.0,
2335
+ "step": 155
2336
+ },
2337
+ {
2338
+ "advantages": -9.313225746154785e-10,
2339
+ "completion_length": 84.203125,
2340
+ "epoch": 0.968944099378882,
2341
+ "grad_norm": 2.550407648086548,
2342
+ "kl": 0.00946044921875,
2343
+ "learning_rate": 3.105590062111801e-08,
2344
+ "loss": 0.0009,
2345
+ "reward": 1.625,
2346
+ "reward_mean": 1.625,
2347
+ "reward_std": 0.16675157845020294,
2348
+ "rewards/accuracy_reward": 0.640625,
2349
+ "rewards/format_reward": 0.984375,
2350
+ "step": 156
2351
+ },
2352
+ {
2353
+ "advantages": 3.725290298461914e-09,
2354
+ "completion_length": 76.296875,
2355
+ "epoch": 0.9751552795031055,
2356
+ "grad_norm": 3.396425247192383,
2357
+ "kl": 0.00714111328125,
2358
+ "learning_rate": 2.4844720496894407e-08,
2359
+ "loss": 0.0007,
2360
+ "reward": 1.546875,
2361
+ "reward_mean": 1.546875,
2362
+ "reward_std": 0.15992169082164764,
2363
+ "rewards/accuracy_reward": 0.546875,
2364
+ "rewards/format_reward": 1.0,
2365
+ "step": 157
2366
+ },
2367
+ {
2368
+ "advantages": -2.7939677238464355e-09,
2369
+ "completion_length": 73.859375,
2370
+ "epoch": 0.9813664596273292,
2371
+ "grad_norm": 3.776041030883789,
2372
+ "kl": 0.00921630859375,
2373
+ "learning_rate": 1.863354037267081e-08,
2374
+ "loss": 0.0009,
2375
+ "reward": 1.59375,
2376
+ "reward_mean": 1.59375,
2377
+ "reward_std": 0.10888782143592834,
2378
+ "rewards/accuracy_reward": 0.59375,
2379
+ "rewards/format_reward": 1.0,
2380
+ "step": 158
2381
+ },
2382
+ {
2383
+ "advantages": 1.862645149230957e-09,
2384
+ "completion_length": 77.953125,
2385
+ "epoch": 0.9875776397515528,
2386
+ "grad_norm": 3.304471254348755,
2387
+ "kl": 0.01263427734375,
2388
+ "learning_rate": 1.2422360248447204e-08,
2389
+ "loss": 0.0013,
2390
+ "reward": 1.796875,
2391
+ "reward_mean": 1.796875,
2392
+ "reward_std": 0.11100947856903076,
2393
+ "rewards/accuracy_reward": 0.796875,
2394
+ "rewards/format_reward": 1.0,
2395
+ "step": 159
2396
+ },
2397
+ {
2398
+ "advantages": 0.0,
2399
+ "completion_length": 79.25,
2400
+ "epoch": 0.9937888198757764,
2401
+ "grad_norm": 5.823967456817627,
2402
+ "kl": 0.00897216796875,
2403
+ "learning_rate": 6.211180124223602e-09,
2404
+ "loss": 0.0009,
2405
+ "reward": 1.5,
2406
+ "reward_mean": 1.5,
2407
+ "reward_std": 0.0883883461356163,
2408
+ "rewards/accuracy_reward": 0.5,
2409
+ "rewards/format_reward": 1.0,
2410
+ "step": 160
2411
+ },
2412
+ {
2413
+ "advantages": -0.5890890955924988,
2414
+ "completion_length": 89.33333587646484,
2415
+ "epoch": 1.0,
2416
+ "grad_norm": 2.0931286811828613,
2417
+ "kl": 0.00677490234375,
2418
+ "learning_rate": 0.0,
2419
+ "loss": 0.001,
2420
+ "reward": 1.6666667461395264,
2421
+ "reward_mean": 1.875,
2422
+ "reward_std": 0.3535533845424652,
2423
+ "rewards/accuracy_reward": 0.6666666865348816,
2424
+ "rewards/format_reward": 1.0,
2425
+ "step": 161
2426
+ }
2427
+ ],
2428
+ "logging_steps": 1.0,
2429
+ "max_steps": 161,
2430
+ "num_input_tokens_seen": 0,
2431
+ "num_train_epochs": 1,
2432
+ "save_steps": 10,
2433
+ "stateful_callbacks": {
2434
+ "TrainerControl": {
2435
+ "args": {
2436
+ "should_epoch_stop": false,
2437
+ "should_evaluate": false,
2438
+ "should_log": false,
2439
+ "should_save": true,
2440
+ "should_training_stop": true
2441
+ },
2442
+ "attributes": {}
2443
+ }
2444
+ },
2445
+ "total_flos": 0.0,
2446
+ "train_batch_size": 1,
2447
+ "trial_name": null,
2448
+ "trial_params": null
2449
+ }
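The per-step records above are the tail of the trainer state saved alongside the checkpoint. A minimal sketch for inspecting them offline, assuming the added file is the Trainer's trainer_state.json saved locally, that the per-step entries sit under the standard log_history key, and that matplotlib is available:

import json
import matplotlib.pyplot as plt

# Load the trainer state dumped next to the checkpoint.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step entries that carry a reward value.
entries = [e for e in state["log_history"] if "reward" in e]
steps = [e["step"] for e in entries]
rewards = [e["reward"] for e in entries]

plt.plot(steps, rewards)
plt.xlabel("step")
plt.ylabel("reward")
plt.savefig("reward_curve.png")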
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ff22b61060846c443ccf16d8054c34f49711075b591a288d7706ce0f75243ee
3
+ size 8056
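training_args.bin is stored as a Git LFS pointer (oid and size only), so the actual file must be pulled with git-lfs before it can be read. A minimal sketch for loading it, assuming it was written by the Hugging Face Trainer via torch.save and that transformers is installed so the pickled TrainingArguments class can be resolved:

import torch

# The file is a torch pickle of a TrainingArguments object;
# weights_only=False is needed on recent torch versions to allow full unpickling.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.per_device_train_batch_size)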
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,674 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import json
25
+ from tqdm import tqdm
26
+ from collections import OrderedDict
27
+ from dataclasses import dataclass
28
+
29
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
30
+ # DeepSpeed data structures it has to be available in the current python environment.
31
+ from deepspeed.utils import logger
32
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
33
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
34
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
35
+
36
+
37
+ @dataclass
38
+ class zero_model_state:
39
+ buffers: dict()
40
+ param_shapes: dict()
41
+ shared_params: list
42
+ ds_version: int
43
+ frozen_param_shapes: dict()
44
+ frozen_param_fragments: dict()
45
+
46
+
47
+ debug = 0
48
+
49
+ # load to cpu
50
+ device = torch.device('cpu')
51
+
52
+
+ def atoi(text):
+     return int(text) if text.isdigit() else text
+
+
+ def natural_keys(text):
+     '''
+     alist.sort(key=natural_keys) sorts in human order
+     http://nedbatchelder.com/blog/200712/human_sorting.html
+     (See Toothy's implementation in the comments)
+     '''
+     return [atoi(c) for c in re.split(r'(\d+)', text)]
+
+
+ def get_model_state_file(checkpoint_dir, zero_stage):
+     if not os.path.isdir(checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
+
+     # there should be only one file
+     if zero_stage <= 2:
+         file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
+     elif zero_stage == 3:
+         file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
+
+     if not os.path.exists(file):
+         raise FileNotFoundError(f"can't find model states file at '{file}'")
+
+     return file
+
+
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
+     # XXX: need to test that this simple glob rule works for multi-node setup too
+     ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+
+     if len(ckpt_files) == 0:
+         raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+
+     return ckpt_files
+
+
+ def get_optim_files(checkpoint_dir):
+     return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+
+
+ def get_model_state_files(checkpoint_dir):
+     return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+
+
+ def parse_model_states(files):
+     zero_model_states = []
+     for file in files:
+         state_dict = torch.load(file, map_location=device)
+
+         if BUFFER_NAMES not in state_dict:
+             raise ValueError(f"{file} is not a model state checkpoint")
+         buffer_names = state_dict[BUFFER_NAMES]
+         if debug:
+             print("Found buffers:", buffer_names)
+
+         # recover just the buffers while restoring them to fp32 if they were saved in fp16
+         buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+         param_shapes = state_dict[PARAM_SHAPES]
+
+         # collect parameters that are included in param_shapes
+         param_names = []
+         for s in param_shapes:
+             for name in s.keys():
+                 param_names.append(name)
+
+         # update with frozen parameters
+         frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+         if frozen_param_shapes is not None:
+             if debug:
+                 print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+             param_names += list(frozen_param_shapes.keys())
+
+         # handle shared params
+         shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+
+         ds_version = state_dict.get(DS_VERSION, None)
+
+         frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+
+         z_model_state = zero_model_state(buffers=buffers,
+                                          param_shapes=param_shapes,
+                                          shared_params=shared_params,
+                                          ds_version=ds_version,
+                                          frozen_param_shapes=frozen_param_shapes,
+                                          frozen_param_fragments=frozen_param_fragments)
+         zero_model_states.append(z_model_state)
+
+     return zero_model_states
+
+
+ def parse_optim_states(files, ds_checkpoint_dir):
+     total_files = len(files)
+     state_dicts = []
+     for f in files:
+         state_dict = torch.load(f, map_location=device)
+         # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+         # and also handle the case where it was already removed by another helper script
+         state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+         state_dicts.append(state_dict)
+
+     if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+         raise ValueError(f"{files[0]} is not a zero checkpoint")
+     zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+     world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+
+     # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+     # parameters can be different from data parallelism for non-expert parameters. So we can just
+     # use the max of the partition_count to get the dp world_size.
+
+     if type(world_size) is list:
+         world_size = max(world_size)
+
+     if world_size != total_files:
+         raise ValueError(
+             f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+             "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+         )
+
+     # the groups are named differently in each stage
+     if zero_stage <= 2:
+         fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+     elif zero_stage == 3:
+         fp32_groups_key = FP32_FLAT_GROUPS
+     else:
+         raise ValueError(f"unknown zero stage {zero_stage}")
+
+     if zero_stage <= 2:
+         fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+     elif zero_stage == 3:
+         # if there is more than one param group, there will be multiple flattened tensors - one
+         # flattened tensor per group - for simplicity merge them into a single tensor
+         #
+         # XXX: could make the script more memory efficient for when there are multiple groups - it
+         # will require matching the sub-lists of param_shapes for each param group flattened tensor
+
+         fp32_flat_groups = [
+             torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+         ]
+
+     return zero_stage, world_size, fp32_flat_groups
+
+
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
+     """
+     Returns fp32 state_dict reconstructed from ds checkpoint
+
+     Args:
+         - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+
+     """
+     print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+
+     optim_files = get_optim_files(ds_checkpoint_dir)
+     zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+     print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+
+     model_files = get_model_state_files(ds_checkpoint_dir)
+
+     zero_model_states = parse_model_states(model_files)
+     print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+
+     if zero_stage <= 2:
+         return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                           exclude_frozen_parameters)
+     elif zero_stage == 3:
+         return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                           exclude_frozen_parameters)
+
+
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+
+     if debug:
+         num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+         print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         state_dict[name] = frozen_param_fragments[name]
+
+         if debug:
+             print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _has_callable(obj, fn):
+     attr = getattr(obj, fn, None)
+     return callable(attr)
+
+
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+
+     # Reconstruction protocol:
+     #
+     # XXX: document this
+
+     if debug:
+         for i in range(world_size):
+             for j in range(len(fp32_flat_groups[0])):
+                 print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+
+     # XXX: memory usage doubles here (zero2)
+     num_param_groups = len(fp32_flat_groups[0])
+     merged_single_partition_of_fp32_groups = []
+     for i in range(num_param_groups):
+         merged_partitions = [sd[i] for sd in fp32_flat_groups]
+         full_single_fp32_vector = torch.cat(merged_partitions, 0)
+         merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+     avail_numel = sum(
+         [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+
+     if debug:
+         wanted_params = sum([len(shapes) for shapes in param_shapes])
+         wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+         # not asserting if there is a mismatch due to possible padding
+         print(f"Have {avail_numel} numels to process.")
+         print(f"Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     total_numel = 0
+     total_params = 0
+     for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+         offset = 0
+         avail_numel = full_single_fp32_vector.numel()
+         for name, shape in shapes.items():
+
+             unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+             total_numel += unpartitioned_numel
+             total_params += 1
+
+             if debug:
+                 print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+             state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+             offset += unpartitioned_numel
+
+         # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+         # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+         # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+         # live optimizer object, so we are checking that the numbers are within the right range
+         align_to = 2 * world_size
+
+         def zero2_align(x):
+             return align_to * math.ceil(x / align_to)
+
+         if debug:
+             print(f"original offset={offset}, avail_numel={avail_numel}")
+
+         offset = zero2_align(offset)
+         avail_numel = zero2_align(avail_numel)
+
+         if debug:
+             print(f"aligned offset={offset}, avail_numel={avail_numel}")
+
+         # Sanity check
+         if offset != avail_numel:
+             raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                exclude_frozen_parameters):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     if not exclude_frozen_parameters:
+         _zero2_merge_frozen_params(state_dict, zero_model_states)
+
+     _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+     remainder = unpartitioned_numel % world_size
+     padding_numel = (world_size - remainder) if remainder else 0
+     partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+     return partitioned_numel, padding_numel
+
+
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+     if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+         return
+
+     if debug:
+         for i in range(world_size):
+             num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+             print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+
+     frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+     wanted_params = len(frozen_param_shapes)
+     wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+     avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+     print(f'Frozen params: Have {avail_numel} numels to process.')
+     print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+
+     total_params = 0
+     total_numel = 0
+     for name, shape in zero_model_states[0].frozen_param_shapes.items():
+         total_params += 1
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+
+         param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+         state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+     print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+     param_shapes = zero_model_states[0].param_shapes
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+     # param, re-consolidating each param, while dealing with padding if any
+
+     # merge list of dicts, preserving order
+     param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+
+     if debug:
+         for i in range(world_size):
+             print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+
+     wanted_params = len(param_shapes)
+     wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+     # not asserting if there is a mismatch due to possible padding
+     avail_numel = fp32_flat_groups[0].numel() * world_size
+     print(f"Trainable params: Have {avail_numel} numels to process.")
+     print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+
+     # params
+     # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+     # out-of-core computing solution
+     offset = 0
+     total_numel = 0
+     total_params = 0
+     for name, shape in tqdm(param_shapes.items(), desc='Gathering Sharded Weights'):
+         unpartitioned_numel = shape.numel()
+         total_numel += unpartitioned_numel
+         total_params += 1
+         partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+
+         if debug:
+             print(
+                 f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+             )
+
+         # XXX: memory usage doubles here
+         state_dict[name] = torch.cat(
+             tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+             0).narrow(0, 0, unpartitioned_numel).view(shape)
+         offset += partitioned_numel
+
+     offset *= world_size
+
+     # Sanity check
+     if offset != avail_numel:
+         raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+
+     print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+
+
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
+                                                exclude_frozen_parameters):
+     state_dict = OrderedDict()
+
+     # buffers
+     buffers = zero_model_states[0].buffers
+     state_dict.update(buffers)
+     if debug:
+         print(f"added {len(buffers)} buffers")
+
+     if not exclude_frozen_parameters:
+         _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+
+     _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+
+     # recover shared parameters
+     for pair in zero_model_states[0].shared_params:
+         if pair[1] in state_dict:
+             state_dict[pair[0]] = state_dict[pair[1]]
+
+     return state_dict
+
+
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+     ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+     via a model hub.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+         - ``exclude_frozen_parameters``: exclude frozen parameters
+
+     Returns:
+         - pytorch ``state_dict``
+
+     Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+     you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+     the checkpoint.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+         # do the training and checkpoint saving
+         state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+         model = model.cpu() # move to cpu
+         model.load_state_dict(state_dict)
+         # submit to model hub or save the model to share with others
+
+     In this example the ``model`` will no longer be usable in the deepspeed context of the same
+     application. i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+
+     """
+     if tag is None:
+         latest_path = os.path.join(checkpoint_dir, 'latest')
+         if os.path.isfile(latest_path):
+             with open(latest_path, 'r') as fd:
+                 tag = fd.read().strip()
+         else:
+             raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+
+     ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+
+     if not os.path.isdir(ds_checkpoint_dir):
+         raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+
+     return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
+
+
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
+                                                output_dir,
+                                                max_shard_size="5GB",
+                                                safe_serialization=False,
+                                                tag=None,
+                                                exclude_frozen_parameters=False):
+     """
+     Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+     loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+
+     Args:
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``output_dir``: directory to the pytorch fp32 state_dict output files
+         - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
+         - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+         - ``exclude_frozen_parameters``: exclude frozen parameters
+     """
+     # Dependency pre-check
+     if safe_serialization:
+         try:
+             from safetensors.torch import save_file
+         except ImportError:
+             print('If you want to use `safe_serialization`, please `pip install safetensors`')
+             raise
+     if max_shard_size is not None:
+         try:
+             from huggingface_hub import split_torch_state_dict_into_shards
+         except ImportError:
+             print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
+             raise
+
+     # Convert zero checkpoint to state_dict
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters)
+
+     # Shard the model if it is too big.
+     weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
+     if max_shard_size is not None:
+         filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
+         state_dict_split = split_torch_state_dict_into_shards(state_dict,
+                                                               filename_pattern=filename_pattern,
+                                                               max_shard_size=max_shard_size)
+     else:
+         from collections import namedtuple
+         StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
+         state_dict_split = StateDictSplit(is_sharded=False,
+                                           filename_to_tensors={weights_name: list(state_dict.keys())})
+
+     # Save the model
+     filename_to_tensors = state_dict_split.filename_to_tensors.items()
+     for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
+         shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors}
+         output_path = os.path.join(output_dir, shard_file)
+         if safe_serialization:
+             save_file(shard, output_path, metadata={"format": "pt"})
+         else:
+             torch.save(shard, output_path)
+
+     # Save index if sharded
+     if state_dict_split.is_sharded:
+         index = {
+             "metadata": state_dict_split.metadata,
+             "weight_map": state_dict_split.tensor_to_filename,
+         }
+         save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
+         save_index_file = os.path.join(output_dir, save_index_file)
+         with open(save_index_file, "w", encoding="utf-8") as f:
+             content = json.dumps(index, indent=2, sort_keys=True) + "\n"
+             f.write(content)
+
+
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+     """
+     1. Put the provided model to cpu
+     2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+     3. Load it into the provided model
+
+     Args:
+         - ``model``: the model object to update
+         - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+         - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+
+     Returns:
+         - ``model``: modified model
+
+     Make sure you have plenty of CPU memory available before you call this function. If you don't
+     have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+     conveniently placed for you in the checkpoint folder.
+
+     A typical usage might be ::
+
+         from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+         model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+         # submit to model hub or save the model to share with others
+
+     Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
+     of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+     ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+
+     """
+     logger.info(f"Extracting fp32 weights")
+     state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+
+     logger.info(f"Overwriting model with fp32 weights")
+     model = model.cpu()
+     model.load_state_dict(state_dict, strict=False)
+
+     return model
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("checkpoint_dir",
+                         type=str,
+                         help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+     parser.add_argument("output_dir",
+                         type=str,
+                         help="directory to the pytorch fp32 state_dict output files "
+                         "(e.g. path/checkpoint-12-output/)")
+     parser.add_argument(
+         "--max_shard_size",
+         type=str,
+         default="5GB",
+         help="The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size "
+         "lower than this size. If expressed as a string, it needs to be digits followed by a unit (like `5MB`). "
+         "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
+         "without CPU OOM issues.")
+     parser.add_argument(
+         "--safe_serialization",
+         default=False,
+         action='store_true',
+         help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
+     parser.add_argument("-t",
+                         "--tag",
+                         type=str,
+                         default=None,
+                         help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+     parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
+     parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+     args = parser.parse_args()
+
+     debug = args.debug
+
+     convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
+                                                args.output_dir,
+                                                max_shard_size=args.max_shard_size,
+                                                safe_serialization=args.safe_serialization,
+                                                tag=args.tag,
+                                                exclude_frozen_parameters=args.exclude_frozen_parameters)
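For readers who want to sanity-check the ZeRO-3 partition and merge arithmetic used by ``zero3_partitioned_param_info`` and ``_zero3_merge_trainable_params`` above without a real DeepSpeed checkpoint, here is a minimal standalone sketch on toy tensors. It is illustrative only: the world size, parameter shape, and helper name are made up for the example, and it depends on nothing beyond ``torch`` and ``math`` ::

    # Minimal sketch with assumed toy values (not taken from any real checkpoint):
    # split a flat fp32 parameter across ranks the way ZeRO-3 does, then merge it back.
    import math
    import torch

    def partitioned_param_info(unpartitioned_numel, world_size):
        # same arithmetic as zero3_partitioned_param_info() in the script above
        remainder = unpartitioned_numel % world_size
        padding_numel = (world_size - remainder) if remainder else 0
        partitioned_numel = math.ceil(unpartitioned_numel / world_size)
        return partitioned_numel, padding_numel

    world_size = 4                      # pretend 4 data-parallel ranks
    shape = torch.Size([3, 5])          # toy parameter with 15 elements
    full = torch.arange(shape.numel(), dtype=torch.float32)

    # partition: equal per-rank slices, with the tail padded up to a multiple of world_size
    part_numel, pad_numel = partitioned_param_info(shape.numel(), world_size)   # -> (4, 1)
    padded = torch.cat([full, torch.zeros(pad_numel)])
    per_rank = [padded.narrow(0, r * part_numel, part_numel) for r in range(world_size)]

    # merge, as _zero3_merge_trainable_params() does for each parameter:
    # concatenate the per-rank slices, drop the padding, restore the original shape
    merged = torch.cat(per_rank, 0).narrow(0, 0, shape.numel()).view(shape)
    assert torch.equal(merged, full.view(shape))

In the real script the per-rank slices come from the ``fp32_flat_groups`` tensors loaded out of the ``*_optim_states.pt`` files, and the offset advances by ``partitioned_numel`` from one parameter to the next rather than restarting at zero.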