amphora commited on
Commit
cb7aeb2
·
verified ·
1 Parent(s): d5af138

Delete checkpoint-381

Browse files
checkpoint-381/added_tokens.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
3
- "architectures": [
4
- "Qwen2ForCausalLM"
5
- ],
6
- "attention_dropout": 0.0,
7
- "eos_token_id": 151645,
8
- "hidden_act": "silu",
9
- "hidden_size": 1536,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 8960,
12
- "max_position_embeddings": 32768,
13
- "max_window_layers": 21,
14
- "model_type": "qwen2",
15
- "num_attention_heads": 12,
16
- "num_hidden_layers": 28,
17
- "num_key_value_heads": 2,
18
- "rms_norm_eps": 1e-06,
19
- "rope_scaling": null,
20
- "rope_theta": 1000000.0,
21
- "sliding_window": null,
22
- "tie_word_embeddings": true,
23
- "torch_dtype": "bfloat16",
24
- "transformers_version": "4.48.1",
25
- "use_cache": false,
26
- "use_sliding_window": false,
27
- "vocab_size": 151665
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/generation_config.json DELETED
@@ -1,14 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 151645,
6
- 151643
7
- ],
8
- "pad_token_id": 151643,
9
- "repetition_penalty": 1.1,
10
- "temperature": 0.7,
11
- "top_k": 20,
12
- "top_p": 0.8,
13
- "transformers_version": "4.48.1"
14
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/latest DELETED
@@ -1 +0,0 @@
1
- global_step381
 
 
checkpoint-381/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-381/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e894aee6a90801f44c9691e3440b53d94bdf748ea5d51734b11a8228b54f1784
3
- size 3552549728
 
 
 
 
checkpoint-381/rng_state_0.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f3803bff3f596c03b55881de967a825b5734e4a581739164f9cb9e7fd1aee89
3
- size 14512
 
 
 
 
checkpoint-381/rng_state_1.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d768a04b798e2ca42effbe096b8e4481f32a402a9125a2ced390586dab8eb29e
3
- size 14512
 
 
 
 
checkpoint-381/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:410d31e26656fe111807307d758f91b4394aefad48a9d1d7efaa9992c522efa9
3
- size 1064
 
 
 
 
checkpoint-381/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/tokenizer.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
 
 
 
checkpoint-381/tokenizer_config.json DELETED
@@ -1,208 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
- "<|im_start|>",
184
- "<|im_end|>",
185
- "<|object_ref_start|>",
186
- "<|object_ref_end|>",
187
- "<|box_start|>",
188
- "<|box_end|>",
189
- "<|quad_start|>",
190
- "<|quad_end|>",
191
- "<|vision_start|>",
192
- "<|vision_end|>",
193
- "<|vision_pad|>",
194
- "<|image_pad|>",
195
- "<|video_pad|>"
196
- ],
197
- "bos_token": null,
198
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
- "clean_up_tokenization_spaces": false,
200
- "eos_token": "<|im_end|>",
201
- "errors": "replace",
202
- "extra_special_tokens": {},
203
- "model_max_length": 131072,
204
- "pad_token": "<|endoftext|>",
205
- "split_special_tokens": false,
206
- "tokenizer_class": "Qwen2Tokenizer",
207
- "unk_token": null
208
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/trainer_state.json DELETED
@@ -1,2772 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "eval_steps": 43,
6
- "global_step": 381,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.007874015748031496,
13
- "grad_norm": 118.11203002929688,
14
- "learning_rate": 2.0000000000000003e-06,
15
- "loss": 4.6099,
16
- "step": 1
17
- },
18
- {
19
- "epoch": 0.007874015748031496,
20
- "eval_loss": 3.1001100540161133,
21
- "eval_runtime": 5.3966,
22
- "eval_samples_per_second": 30.204,
23
- "eval_steps_per_second": 3.891,
24
- "step": 1
25
- },
26
- {
27
- "epoch": 0.015748031496062992,
28
- "grad_norm": 118.4310302734375,
29
- "learning_rate": 4.000000000000001e-06,
30
- "loss": 4.5857,
31
- "step": 2
32
- },
33
- {
34
- "epoch": 0.023622047244094488,
35
- "grad_norm": 103.37439727783203,
36
- "learning_rate": 6e-06,
37
- "loss": 4.3069,
38
- "step": 3
39
- },
40
- {
41
- "epoch": 0.031496062992125984,
42
- "grad_norm": 75.05075073242188,
43
- "learning_rate": 8.000000000000001e-06,
44
- "loss": 3.8754,
45
- "step": 4
46
- },
47
- {
48
- "epoch": 0.03937007874015748,
49
- "grad_norm": 50.459983825683594,
50
- "learning_rate": 1e-05,
51
- "loss": 3.2841,
52
- "step": 5
53
- },
54
- {
55
- "epoch": 0.047244094488188976,
56
- "grad_norm": 47.4603385925293,
57
- "learning_rate": 1.2e-05,
58
- "loss": 2.4285,
59
- "step": 6
60
- },
61
- {
62
- "epoch": 0.05511811023622047,
63
- "grad_norm": 32.362667083740234,
64
- "learning_rate": 1.4e-05,
65
- "loss": 1.8177,
66
- "step": 7
67
- },
68
- {
69
- "epoch": 0.06299212598425197,
70
- "grad_norm": 22.846933364868164,
71
- "learning_rate": 1.6000000000000003e-05,
72
- "loss": 1.1567,
73
- "step": 8
74
- },
75
- {
76
- "epoch": 0.07086614173228346,
77
- "grad_norm": 17.060213088989258,
78
- "learning_rate": 1.8e-05,
79
- "loss": 0.8257,
80
- "step": 9
81
- },
82
- {
83
- "epoch": 0.07874015748031496,
84
- "grad_norm": 14.415579795837402,
85
- "learning_rate": 2e-05,
86
- "loss": 0.4257,
87
- "step": 10
88
- },
89
- {
90
- "epoch": 0.08661417322834646,
91
- "grad_norm": 7.753712177276611,
92
- "learning_rate": 1.999964147509006e-05,
93
- "loss": 0.2976,
94
- "step": 11
95
- },
96
- {
97
- "epoch": 0.09448818897637795,
98
- "grad_norm": 26.883708953857422,
99
- "learning_rate": 1.9998565926068253e-05,
100
- "loss": 0.3365,
101
- "step": 12
102
- },
103
- {
104
- "epoch": 0.10236220472440945,
105
- "grad_norm": 10.675631523132324,
106
- "learning_rate": 1.9996773430056806e-05,
107
- "loss": 0.2161,
108
- "step": 13
109
- },
110
- {
111
- "epoch": 0.11023622047244094,
112
- "grad_norm": 6.670111179351807,
113
- "learning_rate": 1.999426411558661e-05,
114
- "loss": 0.1816,
115
- "step": 14
116
- },
117
- {
118
- "epoch": 0.11811023622047244,
119
- "grad_norm": 8.878239631652832,
120
- "learning_rate": 1.9991038162588018e-05,
121
- "loss": 0.1567,
122
- "step": 15
123
- },
124
- {
125
- "epoch": 0.12598425196850394,
126
- "grad_norm": 2.9917383193969727,
127
- "learning_rate": 1.9987095802377933e-05,
128
- "loss": 0.0813,
129
- "step": 16
130
- },
131
- {
132
- "epoch": 0.13385826771653545,
133
- "grad_norm": 1.0548763275146484,
134
- "learning_rate": 1.9982437317643218e-05,
135
- "loss": 0.0217,
136
- "step": 17
137
- },
138
- {
139
- "epoch": 0.14173228346456693,
140
- "grad_norm": 2.8778488636016846,
141
- "learning_rate": 1.9977063042420438e-05,
142
- "loss": 0.0618,
143
- "step": 18
144
- },
145
- {
146
- "epoch": 0.14960629921259844,
147
- "grad_norm": 0.9811734557151794,
148
- "learning_rate": 1.99709733620719e-05,
149
- "loss": 0.0175,
150
- "step": 19
151
- },
152
- {
153
- "epoch": 0.15748031496062992,
154
- "grad_norm": 0.7218202948570251,
155
- "learning_rate": 1.996416871325803e-05,
156
- "loss": 0.0302,
157
- "step": 20
158
- },
159
- {
160
- "epoch": 0.16535433070866143,
161
- "grad_norm": 1.2746995687484741,
162
- "learning_rate": 1.995664958390604e-05,
163
- "loss": 0.0453,
164
- "step": 21
165
- },
166
- {
167
- "epoch": 0.1732283464566929,
168
- "grad_norm": 0.9413469433784485,
169
- "learning_rate": 1.9948416513174976e-05,
170
- "loss": 0.0175,
171
- "step": 22
172
- },
173
- {
174
- "epoch": 0.18110236220472442,
175
- "grad_norm": 1.4161137342453003,
176
- "learning_rate": 1.9939470091417012e-05,
177
- "loss": 0.0277,
178
- "step": 23
179
- },
180
- {
181
- "epoch": 0.1889763779527559,
182
- "grad_norm": 2.2721235752105713,
183
- "learning_rate": 1.992981096013517e-05,
184
- "loss": 0.0589,
185
- "step": 24
186
- },
187
- {
188
- "epoch": 0.1968503937007874,
189
- "grad_norm": 1.143970251083374,
190
- "learning_rate": 1.9919439811937283e-05,
191
- "loss": 0.0182,
192
- "step": 25
193
- },
194
- {
195
- "epoch": 0.2047244094488189,
196
- "grad_norm": 0.8054028749465942,
197
- "learning_rate": 1.9908357390486342e-05,
198
- "loss": 0.0211,
199
- "step": 26
200
- },
201
- {
202
- "epoch": 0.2125984251968504,
203
- "grad_norm": 1.4449081420898438,
204
- "learning_rate": 1.989656449044718e-05,
205
- "loss": 0.0244,
206
- "step": 27
207
- },
208
- {
209
- "epoch": 0.2204724409448819,
210
- "grad_norm": 0.49216631054878235,
211
- "learning_rate": 1.988406195742948e-05,
212
- "loss": 0.005,
213
- "step": 28
214
- },
215
- {
216
- "epoch": 0.2283464566929134,
217
- "grad_norm": 0.9945647716522217,
218
- "learning_rate": 1.987085068792715e-05,
219
- "loss": 0.0373,
220
- "step": 29
221
- },
222
- {
223
- "epoch": 0.23622047244094488,
224
- "grad_norm": 1.1753748655319214,
225
- "learning_rate": 1.9856931629254032e-05,
226
- "loss": 0.0217,
227
- "step": 30
228
- },
229
- {
230
- "epoch": 0.2440944881889764,
231
- "grad_norm": 0.5960403680801392,
232
- "learning_rate": 1.984230577947597e-05,
233
- "loss": 0.0157,
234
- "step": 31
235
- },
236
- {
237
- "epoch": 0.25196850393700787,
238
- "grad_norm": 0.3657272160053253,
239
- "learning_rate": 1.9826974187339267e-05,
240
- "loss": 0.0082,
241
- "step": 32
242
- },
243
- {
244
- "epoch": 0.25984251968503935,
245
- "grad_norm": 1.1290266513824463,
246
- "learning_rate": 1.981093795219546e-05,
247
- "loss": 0.0236,
248
- "step": 33
249
- },
250
- {
251
- "epoch": 0.2677165354330709,
252
- "grad_norm": 1.673962116241455,
253
- "learning_rate": 1.9794198223922496e-05,
254
- "loss": 0.0182,
255
- "step": 34
256
- },
257
- {
258
- "epoch": 0.2755905511811024,
259
- "grad_norm": 0.540355384349823,
260
- "learning_rate": 1.9776756202842297e-05,
261
- "loss": 0.011,
262
- "step": 35
263
- },
264
- {
265
- "epoch": 0.28346456692913385,
266
- "grad_norm": 0.3380790054798126,
267
- "learning_rate": 1.9758613139634662e-05,
268
- "loss": 0.0048,
269
- "step": 36
270
- },
271
- {
272
- "epoch": 0.29133858267716534,
273
- "grad_norm": 1.886232852935791,
274
- "learning_rate": 1.9739770335247616e-05,
275
- "loss": 0.0157,
276
- "step": 37
277
- },
278
- {
279
- "epoch": 0.2992125984251969,
280
- "grad_norm": 2.140639305114746,
281
- "learning_rate": 1.972022914080411e-05,
282
- "loss": 0.0393,
283
- "step": 38
284
- },
285
- {
286
- "epoch": 0.30708661417322836,
287
- "grad_norm": 0.35308870673179626,
288
- "learning_rate": 1.9699990957505136e-05,
289
- "loss": 0.0074,
290
- "step": 39
291
- },
292
- {
293
- "epoch": 0.31496062992125984,
294
- "grad_norm": 0.3918301463127136,
295
- "learning_rate": 1.9679057236529266e-05,
296
- "loss": 0.0083,
297
- "step": 40
298
- },
299
- {
300
- "epoch": 0.3228346456692913,
301
- "grad_norm": 0.4406338632106781,
302
- "learning_rate": 1.965742947892858e-05,
303
- "loss": 0.0152,
304
- "step": 41
305
- },
306
- {
307
- "epoch": 0.33070866141732286,
308
- "grad_norm": 0.6819682121276855,
309
- "learning_rate": 1.9635109235521057e-05,
310
- "loss": 0.0091,
311
- "step": 42
312
- },
313
- {
314
- "epoch": 0.33858267716535434,
315
- "grad_norm": 0.6794927716255188,
316
- "learning_rate": 1.961209810677934e-05,
317
- "loss": 0.0071,
318
- "step": 43
319
- },
320
- {
321
- "epoch": 0.33858267716535434,
322
- "eval_loss": 0.3895845115184784,
323
- "eval_runtime": 6.5602,
324
- "eval_samples_per_second": 24.847,
325
- "eval_steps_per_second": 3.201,
326
- "step": 43
327
- },
328
- {
329
- "epoch": 0.3464566929133858,
330
- "grad_norm": 0.3874967694282532,
331
- "learning_rate": 1.9588397742716004e-05,
332
- "loss": 0.0089,
333
- "step": 44
334
- },
335
- {
336
- "epoch": 0.3543307086614173,
337
- "grad_norm": 0.5577577352523804,
338
- "learning_rate": 1.9564009842765225e-05,
339
- "loss": 0.0098,
340
- "step": 45
341
- },
342
- {
343
- "epoch": 0.36220472440944884,
344
- "grad_norm": 0.8152347207069397,
345
- "learning_rate": 1.9538936155660934e-05,
346
- "loss": 0.0118,
347
- "step": 46
348
- },
349
- {
350
- "epoch": 0.3700787401574803,
351
- "grad_norm": 0.2971118688583374,
352
- "learning_rate": 1.951317847931141e-05,
353
- "loss": 0.0084,
354
- "step": 47
355
- },
356
- {
357
- "epoch": 0.3779527559055118,
358
- "grad_norm": 1.0286651849746704,
359
- "learning_rate": 1.9486738660670373e-05,
360
- "loss": 0.0123,
361
- "step": 48
362
- },
363
- {
364
- "epoch": 0.3858267716535433,
365
- "grad_norm": 0.5227222442626953,
366
- "learning_rate": 1.945961859560454e-05,
367
- "loss": 0.0144,
368
- "step": 49
369
- },
370
- {
371
- "epoch": 0.3937007874015748,
372
- "grad_norm": 0.461935818195343,
373
- "learning_rate": 1.943182022875769e-05,
374
- "loss": 0.0119,
375
- "step": 50
376
- },
377
- {
378
- "epoch": 0.4015748031496063,
379
- "grad_norm": 1.2550626993179321,
380
- "learning_rate": 1.940334555341122e-05,
381
- "loss": 0.013,
382
- "step": 51
383
- },
384
- {
385
- "epoch": 0.4094488188976378,
386
- "grad_norm": 0.37549659609794617,
387
- "learning_rate": 1.9374196611341212e-05,
388
- "loss": 0.0181,
389
- "step": 52
390
- },
391
- {
392
- "epoch": 0.41732283464566927,
393
- "grad_norm": 0.3444191515445709,
394
- "learning_rate": 1.9344375492672024e-05,
395
- "loss": 0.0111,
396
- "step": 53
397
- },
398
- {
399
- "epoch": 0.4251968503937008,
400
- "grad_norm": 0.3489387333393097,
401
- "learning_rate": 1.9313884335726443e-05,
402
- "loss": 0.0111,
403
- "step": 54
404
- },
405
- {
406
- "epoch": 0.4330708661417323,
407
- "grad_norm": 0.26080814003944397,
408
- "learning_rate": 1.9282725326872324e-05,
409
- "loss": 0.0091,
410
- "step": 55
411
- },
412
- {
413
- "epoch": 0.4409448818897638,
414
- "grad_norm": 0.1390451341867447,
415
- "learning_rate": 1.9250900700365837e-05,
416
- "loss": 0.0033,
417
- "step": 56
418
- },
419
- {
420
- "epoch": 0.44881889763779526,
421
- "grad_norm": 0.20499111711978912,
422
- "learning_rate": 1.921841273819125e-05,
423
- "loss": 0.0066,
424
- "step": 57
425
- },
426
- {
427
- "epoch": 0.4566929133858268,
428
- "grad_norm": 2.185487747192383,
429
- "learning_rate": 1.918526376989731e-05,
430
- "loss": 0.0095,
431
- "step": 58
432
- },
433
- {
434
- "epoch": 0.4645669291338583,
435
- "grad_norm": 0.23939816653728485,
436
- "learning_rate": 1.9151456172430186e-05,
437
- "loss": 0.0048,
438
- "step": 59
439
- },
440
- {
441
- "epoch": 0.47244094488188976,
442
- "grad_norm": 0.41510018706321716,
443
- "learning_rate": 1.911699236996305e-05,
444
- "loss": 0.0077,
445
- "step": 60
446
- },
447
- {
448
- "epoch": 0.48031496062992124,
449
- "grad_norm": 0.264318585395813,
450
- "learning_rate": 1.9081874833722234e-05,
451
- "loss": 0.0129,
452
- "step": 61
453
- },
454
- {
455
- "epoch": 0.4881889763779528,
456
- "grad_norm": 1.0443968772888184,
457
- "learning_rate": 1.9046106081810047e-05,
458
- "loss": 0.0035,
459
- "step": 62
460
- },
461
- {
462
- "epoch": 0.49606299212598426,
463
- "grad_norm": 0.2800132632255554,
464
- "learning_rate": 1.900968867902419e-05,
465
- "loss": 0.0057,
466
- "step": 63
467
- },
468
- {
469
- "epoch": 0.5039370078740157,
470
- "grad_norm": 1.114960789680481,
471
- "learning_rate": 1.8972625236673887e-05,
472
- "loss": 0.0123,
473
- "step": 64
474
- },
475
- {
476
- "epoch": 0.5118110236220472,
477
- "grad_norm": 0.5027065873146057,
478
- "learning_rate": 1.8934918412392596e-05,
479
- "loss": 0.0052,
480
- "step": 65
481
- },
482
- {
483
- "epoch": 0.5196850393700787,
484
- "grad_norm": 0.5564169883728027,
485
- "learning_rate": 1.8896570909947477e-05,
486
- "loss": 0.0085,
487
- "step": 66
488
- },
489
- {
490
- "epoch": 0.5275590551181102,
491
- "grad_norm": 0.7567198872566223,
492
- "learning_rate": 1.8857585479045493e-05,
493
- "loss": 0.0054,
494
- "step": 67
495
- },
496
- {
497
- "epoch": 0.5354330708661418,
498
- "grad_norm": 0.13573969900608063,
499
- "learning_rate": 1.8817964915136277e-05,
500
- "loss": 0.0008,
501
- "step": 68
502
- },
503
- {
504
- "epoch": 0.5433070866141733,
505
- "grad_norm": 0.2704390287399292,
506
- "learning_rate": 1.8777712059211643e-05,
507
- "loss": 0.0078,
508
- "step": 69
509
- },
510
- {
511
- "epoch": 0.5511811023622047,
512
- "grad_norm": 0.6014392971992493,
513
- "learning_rate": 1.8736829797601903e-05,
514
- "loss": 0.0059,
515
- "step": 70
516
- },
517
- {
518
- "epoch": 0.5590551181102362,
519
- "grad_norm": 0.5487034916877747,
520
- "learning_rate": 1.8695321061768886e-05,
521
- "loss": 0.0097,
522
- "step": 71
523
- },
524
- {
525
- "epoch": 0.5669291338582677,
526
- "grad_norm": 0.6670834422111511,
527
- "learning_rate": 1.8653188828095754e-05,
528
- "loss": 0.011,
529
- "step": 72
530
- },
531
- {
532
- "epoch": 0.5748031496062992,
533
- "grad_norm": 0.1795203685760498,
534
- "learning_rate": 1.8610436117673557e-05,
535
- "loss": 0.0067,
536
- "step": 73
537
- },
538
- {
539
- "epoch": 0.5826771653543307,
540
- "grad_norm": 1.768436074256897,
541
- "learning_rate": 1.8567065996084628e-05,
542
- "loss": 0.0096,
543
- "step": 74
544
- },
545
- {
546
- "epoch": 0.5905511811023622,
547
- "grad_norm": 0.26233312487602234,
548
- "learning_rate": 1.8523081573182754e-05,
549
- "loss": 0.0124,
550
- "step": 75
551
- },
552
- {
553
- "epoch": 0.5984251968503937,
554
- "grad_norm": 0.3775719404220581,
555
- "learning_rate": 1.847848600287019e-05,
556
- "loss": 0.0052,
557
- "step": 76
558
- },
559
- {
560
- "epoch": 0.6062992125984252,
561
- "grad_norm": 1.0016565322875977,
562
- "learning_rate": 1.8433282482871497e-05,
563
- "loss": 0.0058,
564
- "step": 77
565
- },
566
- {
567
- "epoch": 0.6141732283464567,
568
- "grad_norm": 0.20153792202472687,
569
- "learning_rate": 1.8387474254504265e-05,
570
- "loss": 0.0056,
571
- "step": 78
572
- },
573
- {
574
- "epoch": 0.6220472440944882,
575
- "grad_norm": 0.5119822025299072,
576
- "learning_rate": 1.8341064602446686e-05,
577
- "loss": 0.0079,
578
- "step": 79
579
- },
580
- {
581
- "epoch": 0.6299212598425197,
582
- "grad_norm": 1.5781004428863525,
583
- "learning_rate": 1.829405685450202e-05,
584
- "loss": 0.008,
585
- "step": 80
586
- },
587
- {
588
- "epoch": 0.6377952755905512,
589
- "grad_norm": 0.23826757073402405,
590
- "learning_rate": 1.824645438135999e-05,
591
- "loss": 0.0041,
592
- "step": 81
593
- },
594
- {
595
- "epoch": 0.6456692913385826,
596
- "grad_norm": 0.6386727690696716,
597
- "learning_rate": 1.8198260596355077e-05,
598
- "loss": 0.0188,
599
- "step": 82
600
- },
601
- {
602
- "epoch": 0.6535433070866141,
603
- "grad_norm": 0.9503199458122253,
604
- "learning_rate": 1.814947895522176e-05,
605
- "loss": 0.008,
606
- "step": 83
607
- },
608
- {
609
- "epoch": 0.6614173228346457,
610
- "grad_norm": 0.2040701061487198,
611
- "learning_rate": 1.8100112955846746e-05,
612
- "loss": 0.0038,
613
- "step": 84
614
- },
615
- {
616
- "epoch": 0.6692913385826772,
617
- "grad_norm": 0.3660199046134949,
618
- "learning_rate": 1.805016613801813e-05,
619
- "loss": 0.0148,
620
- "step": 85
621
- },
622
- {
623
- "epoch": 0.6771653543307087,
624
- "grad_norm": 1.0502821207046509,
625
- "learning_rate": 1.7999642083171576e-05,
626
- "loss": 0.0098,
627
- "step": 86
628
- },
629
- {
630
- "epoch": 0.6771653543307087,
631
- "eval_loss": 0.3526817262172699,
632
- "eval_runtime": 6.6167,
633
- "eval_samples_per_second": 24.635,
634
- "eval_steps_per_second": 3.174,
635
- "step": 86
636
- },
637
- {
638
- "epoch": 0.6850393700787402,
639
- "grad_norm": 0.13735969364643097,
640
- "learning_rate": 1.7948544414133534e-05,
641
- "loss": 0.0022,
642
- "step": 87
643
- },
644
- {
645
- "epoch": 0.6929133858267716,
646
- "grad_norm": 0.6425012946128845,
647
- "learning_rate": 1.7896876794861443e-05,
648
- "loss": 0.0086,
649
- "step": 88
650
- },
651
- {
652
- "epoch": 0.7007874015748031,
653
- "grad_norm": 0.7540380954742432,
654
- "learning_rate": 1.7844642930181008e-05,
655
- "loss": 0.0062,
656
- "step": 89
657
- },
658
- {
659
- "epoch": 0.7086614173228346,
660
- "grad_norm": 0.6727365255355835,
661
- "learning_rate": 1.779184656552056e-05,
662
- "loss": 0.0027,
663
- "step": 90
664
- },
665
- {
666
- "epoch": 0.7165354330708661,
667
- "grad_norm": 0.14059337973594666,
668
- "learning_rate": 1.773849148664247e-05,
669
- "loss": 0.0056,
670
- "step": 91
671
- },
672
- {
673
- "epoch": 0.7244094488188977,
674
- "grad_norm": 0.33292093873023987,
675
- "learning_rate": 1.7684581519371714e-05,
676
- "loss": 0.0047,
677
- "step": 92
678
- },
679
- {
680
- "epoch": 0.7322834645669292,
681
- "grad_norm": 0.3809877932071686,
682
- "learning_rate": 1.7630120529321518e-05,
683
- "loss": 0.0139,
684
- "step": 93
685
- },
686
- {
687
- "epoch": 0.7401574803149606,
688
- "grad_norm": 1.729589819908142,
689
- "learning_rate": 1.7575112421616203e-05,
690
- "loss": 0.0128,
691
- "step": 94
692
- },
693
- {
694
- "epoch": 0.7480314960629921,
695
- "grad_norm": 0.18192608654499054,
696
- "learning_rate": 1.751956114061113e-05,
697
- "loss": 0.0025,
698
- "step": 95
699
- },
700
- {
701
- "epoch": 0.7559055118110236,
702
- "grad_norm": 1.0333118438720703,
703
- "learning_rate": 1.7463470669609907e-05,
704
- "loss": 0.006,
705
- "step": 96
706
- },
707
- {
708
- "epoch": 0.7637795275590551,
709
- "grad_norm": 0.7247685194015503,
710
- "learning_rate": 1.7406845030578747e-05,
711
- "loss": 0.0073,
712
- "step": 97
713
- },
714
- {
715
- "epoch": 0.7716535433070866,
716
- "grad_norm": 0.06979379802942276,
717
- "learning_rate": 1.734968828385808e-05,
718
- "loss": 0.0005,
719
- "step": 98
720
- },
721
- {
722
- "epoch": 0.7795275590551181,
723
- "grad_norm": 0.5137119293212891,
724
- "learning_rate": 1.729200452787139e-05,
725
- "loss": 0.0082,
726
- "step": 99
727
- },
728
- {
729
- "epoch": 0.7874015748031497,
730
- "grad_norm": 0.4704137146472931,
731
- "learning_rate": 1.7233797898831376e-05,
732
- "loss": 0.005,
733
- "step": 100
734
- },
735
- {
736
- "epoch": 0.7952755905511811,
737
- "grad_norm": 0.28564465045928955,
738
- "learning_rate": 1.717507257044331e-05,
739
- "loss": 0.0052,
740
- "step": 101
741
- },
742
- {
743
- "epoch": 0.8031496062992126,
744
- "grad_norm": 0.17685537040233612,
745
- "learning_rate": 1.711583275360582e-05,
746
- "loss": 0.0024,
747
- "step": 102
748
- },
749
- {
750
- "epoch": 0.8110236220472441,
751
- "grad_norm": 0.45714935660362244,
752
- "learning_rate": 1.7056082696108896e-05,
753
- "loss": 0.0072,
754
- "step": 103
755
- },
756
- {
757
- "epoch": 0.8188976377952756,
758
- "grad_norm": 0.4373086988925934,
759
- "learning_rate": 1.699582668232934e-05,
760
- "loss": 0.0051,
761
- "step": 104
762
- },
763
- {
764
- "epoch": 0.8267716535433071,
765
- "grad_norm": 0.8478983640670776,
766
- "learning_rate": 1.6935069032923525e-05,
767
- "loss": 0.022,
768
- "step": 105
769
- },
770
- {
771
- "epoch": 0.8346456692913385,
772
- "grad_norm": 0.16181086003780365,
773
- "learning_rate": 1.6873814104517617e-05,
774
- "loss": 0.0058,
775
- "step": 106
776
- },
777
- {
778
- "epoch": 0.84251968503937,
779
- "grad_norm": 0.09503592550754547,
780
- "learning_rate": 1.6812066289395157e-05,
781
- "loss": 0.0009,
782
- "step": 107
783
- },
784
- {
785
- "epoch": 0.8503937007874016,
786
- "grad_norm": 0.7462632060050964,
787
- "learning_rate": 1.6749830015182106e-05,
788
- "loss": 0.0044,
789
- "step": 108
790
- },
791
- {
792
- "epoch": 0.8582677165354331,
793
- "grad_norm": 0.07221701741218567,
794
- "learning_rate": 1.6687109744529394e-05,
795
- "loss": 0.0015,
796
- "step": 109
797
- },
798
- {
799
- "epoch": 0.8661417322834646,
800
- "grad_norm": 0.08999036252498627,
801
- "learning_rate": 1.6623909974792888e-05,
802
- "loss": 0.0023,
803
- "step": 110
804
- },
805
- {
806
- "epoch": 0.8740157480314961,
807
- "grad_norm": 0.42536938190460205,
808
- "learning_rate": 1.656023523771095e-05,
809
- "loss": 0.005,
810
- "step": 111
811
- },
812
- {
813
- "epoch": 0.8818897637795275,
814
- "grad_norm": 0.7885191440582275,
815
- "learning_rate": 1.6496090099079452e-05,
816
- "loss": 0.0103,
817
- "step": 112
818
- },
819
- {
820
- "epoch": 0.889763779527559,
821
- "grad_norm": 0.16610018908977509,
822
- "learning_rate": 1.64314791584244e-05,
823
- "loss": 0.006,
824
- "step": 113
825
- },
826
- {
827
- "epoch": 0.8976377952755905,
828
- "grad_norm": 0.32151034474372864,
829
- "learning_rate": 1.6366407048672135e-05,
830
- "loss": 0.0086,
831
- "step": 114
832
- },
833
- {
834
- "epoch": 0.905511811023622,
835
- "grad_norm": 0.557732343673706,
836
- "learning_rate": 1.6300878435817115e-05,
837
- "loss": 0.0064,
838
- "step": 115
839
- },
840
- {
841
- "epoch": 0.9133858267716536,
842
- "grad_norm": 0.2238176167011261,
843
- "learning_rate": 1.6234898018587336e-05,
844
- "loss": 0.0065,
845
- "step": 116
846
- },
847
- {
848
- "epoch": 0.9212598425196851,
849
- "grad_norm": 0.2980042099952698,
850
- "learning_rate": 1.616847052810744e-05,
851
- "loss": 0.0095,
852
- "step": 117
853
- },
854
- {
855
- "epoch": 0.9291338582677166,
856
- "grad_norm": 0.1529705822467804,
857
- "learning_rate": 1.6101600727559423e-05,
858
- "loss": 0.0062,
859
- "step": 118
860
- },
861
- {
862
- "epoch": 0.937007874015748,
863
- "grad_norm": 0.017149658873677254,
864
- "learning_rate": 1.603429341184114e-05,
865
- "loss": 0.0002,
866
- "step": 119
867
- },
868
- {
869
- "epoch": 0.9448818897637795,
870
- "grad_norm": 0.4514746367931366,
871
- "learning_rate": 1.596655340722244e-05,
872
- "loss": 0.0067,
873
- "step": 120
874
- },
875
- {
876
- "epoch": 0.952755905511811,
877
- "grad_norm": 0.11766134947538376,
878
- "learning_rate": 1.5898385570999146e-05,
879
- "loss": 0.0053,
880
- "step": 121
881
- },
882
- {
883
- "epoch": 0.9606299212598425,
884
- "grad_norm": 0.4089784026145935,
885
- "learning_rate": 1.5829794791144723e-05,
886
- "loss": 0.0085,
887
- "step": 122
888
- },
889
- {
890
- "epoch": 0.968503937007874,
891
- "grad_norm": 0.1353057473897934,
892
- "learning_rate": 1.57607859859598e-05,
893
- "loss": 0.0013,
894
- "step": 123
895
- },
896
- {
897
- "epoch": 0.9763779527559056,
898
- "grad_norm": 0.6548481583595276,
899
- "learning_rate": 1.5691364103719515e-05,
900
- "loss": 0.0117,
901
- "step": 124
902
- },
903
- {
904
- "epoch": 0.984251968503937,
905
- "grad_norm": 0.1571267992258072,
906
- "learning_rate": 1.5621534122318682e-05,
907
- "loss": 0.0049,
908
- "step": 125
909
- },
910
- {
911
- "epoch": 0.9921259842519685,
912
- "grad_norm": 1.2177189588546753,
913
- "learning_rate": 1.5551301048914863e-05,
914
- "loss": 0.0161,
915
- "step": 126
916
- },
917
- {
918
- "epoch": 1.0,
919
- "grad_norm": 0.414489209651947,
920
- "learning_rate": 1.5480669919569313e-05,
921
- "loss": 0.0181,
922
- "step": 127
923
- },
924
- {
925
- "epoch": 1.0078740157480315,
926
- "grad_norm": 0.10985995829105377,
927
- "learning_rate": 1.54096457988859e-05,
928
- "loss": 0.0049,
929
- "step": 128
930
- },
931
- {
932
- "epoch": 1.015748031496063,
933
- "grad_norm": 0.12780147790908813,
934
- "learning_rate": 1.533823377964791e-05,
935
- "loss": 0.0026,
936
- "step": 129
937
- },
938
- {
939
- "epoch": 1.015748031496063,
940
- "eval_loss": 0.33064374327659607,
941
- "eval_runtime": 6.9286,
942
- "eval_samples_per_second": 23.526,
943
- "eval_steps_per_second": 3.031,
944
- "step": 129
945
- },
946
- {
947
- "epoch": 1.0236220472440944,
948
- "grad_norm": 0.5142458081245422,
949
- "learning_rate": 1.52664389824529e-05,
950
- "loss": 0.0082,
951
- "step": 130
952
- },
953
- {
954
- "epoch": 1.031496062992126,
955
- "grad_norm": 0.15617145597934723,
956
- "learning_rate": 1.5194266555345505e-05,
957
- "loss": 0.0016,
958
- "step": 131
959
- },
960
- {
961
- "epoch": 1.0393700787401574,
962
- "grad_norm": 0.5782387852668762,
963
- "learning_rate": 1.5121721673448319e-05,
964
- "loss": 0.0117,
965
- "step": 132
966
- },
967
- {
968
- "epoch": 1.047244094488189,
969
- "grad_norm": 0.08414836972951889,
970
- "learning_rate": 1.5048809538590789e-05,
971
- "loss": 0.0021,
972
- "step": 133
973
- },
974
- {
975
- "epoch": 1.0551181102362204,
976
- "grad_norm": 0.28253939747810364,
977
- "learning_rate": 1.4975535378936228e-05,
978
- "loss": 0.0055,
979
- "step": 134
980
- },
981
- {
982
- "epoch": 1.0629921259842519,
983
- "grad_norm": 0.47917842864990234,
984
- "learning_rate": 1.490190444860694e-05,
985
- "loss": 0.0046,
986
- "step": 135
987
- },
988
- {
989
- "epoch": 1.0708661417322836,
990
- "grad_norm": 0.1895662248134613,
991
- "learning_rate": 1.482792202730745e-05,
992
- "loss": 0.006,
993
- "step": 136
994
- },
995
- {
996
- "epoch": 1.078740157480315,
997
- "grad_norm": 0.13722768425941467,
998
- "learning_rate": 1.475359341994595e-05,
999
- "loss": 0.0031,
1000
- "step": 137
1001
- },
1002
- {
1003
- "epoch": 1.0866141732283465,
1004
- "grad_norm": 0.10731153190135956,
1005
- "learning_rate": 1.4678923956253894e-05,
1006
- "loss": 0.0005,
1007
- "step": 138
1008
- },
1009
- {
1010
- "epoch": 1.094488188976378,
1011
- "grad_norm": 0.12261265516281128,
1012
- "learning_rate": 1.460391899040383e-05,
1013
- "loss": 0.0031,
1014
- "step": 139
1015
- },
1016
- {
1017
- "epoch": 1.1023622047244095,
1018
- "grad_norm": 0.0038245893083512783,
1019
- "learning_rate": 1.4528583900625481e-05,
1020
- "loss": 0.0,
1021
- "step": 140
1022
- },
1023
- {
1024
- "epoch": 1.110236220472441,
1025
- "grad_norm": 0.28762558102607727,
1026
- "learning_rate": 1.4452924088820101e-05,
1027
- "loss": 0.004,
1028
- "step": 141
1029
- },
1030
- {
1031
- "epoch": 1.1181102362204725,
1032
- "grad_norm": 0.17267552018165588,
1033
- "learning_rate": 1.4376944980173138e-05,
1034
- "loss": 0.0002,
1035
- "step": 142
1036
- },
1037
- {
1038
- "epoch": 1.125984251968504,
1039
- "grad_norm": 0.12727122008800507,
1040
- "learning_rate": 1.4300652022765207e-05,
1041
- "loss": 0.0029,
1042
- "step": 143
1043
- },
1044
- {
1045
- "epoch": 1.1338582677165354,
1046
- "grad_norm": 0.25049135088920593,
1047
- "learning_rate": 1.4224050687181442e-05,
1048
- "loss": 0.0108,
1049
- "step": 144
1050
- },
1051
- {
1052
- "epoch": 1.141732283464567,
1053
- "grad_norm": 0.16092728078365326,
1054
- "learning_rate": 1.4147146466119235e-05,
1055
- "loss": 0.0024,
1056
- "step": 145
1057
- },
1058
- {
1059
- "epoch": 1.1496062992125984,
1060
- "grad_norm": 0.13642658293247223,
1061
- "learning_rate": 1.406994487399437e-05,
1062
- "loss": 0.0037,
1063
- "step": 146
1064
- },
1065
- {
1066
- "epoch": 1.1574803149606299,
1067
- "grad_norm": 0.9029403328895569,
1068
- "learning_rate": 1.3992451446545624e-05,
1069
- "loss": 0.0034,
1070
- "step": 147
1071
- },
1072
- {
1073
- "epoch": 1.1653543307086613,
1074
- "grad_norm": 0.19518424570560455,
1075
- "learning_rate": 1.3914671740437811e-05,
1076
- "loss": 0.0057,
1077
- "step": 148
1078
- },
1079
- {
1080
- "epoch": 1.1732283464566928,
1081
- "grad_norm": 0.12140502035617828,
1082
- "learning_rate": 1.3836611332863356e-05,
1083
- "loss": 0.0041,
1084
- "step": 149
1085
- },
1086
- {
1087
- "epoch": 1.1811023622047245,
1088
- "grad_norm": 0.5148038864135742,
1089
- "learning_rate": 1.3758275821142382e-05,
1090
- "loss": 0.0026,
1091
- "step": 150
1092
- },
1093
- {
1094
- "epoch": 1.188976377952756,
1095
- "grad_norm": 1.828904390335083,
1096
- "learning_rate": 1.3679670822321347e-05,
1097
- "loss": 0.0024,
1098
- "step": 151
1099
- },
1100
- {
1101
- "epoch": 1.1968503937007875,
1102
- "grad_norm": 0.3571717143058777,
1103
- "learning_rate": 1.3600801972770272e-05,
1104
- "loss": 0.0106,
1105
- "step": 152
1106
- },
1107
- {
1108
- "epoch": 1.204724409448819,
1109
- "grad_norm": 0.051027003675699234,
1110
- "learning_rate": 1.3521674927778594e-05,
1111
- "loss": 0.0003,
1112
- "step": 153
1113
- },
1114
- {
1115
- "epoch": 1.2125984251968505,
1116
- "grad_norm": 0.6490982174873352,
1117
- "learning_rate": 1.3442295361149651e-05,
1118
- "loss": 0.0035,
1119
- "step": 154
1120
- },
1121
- {
1122
- "epoch": 1.220472440944882,
1123
- "grad_norm": 0.08408445864915848,
1124
- "learning_rate": 1.336266896479384e-05,
1125
- "loss": 0.0027,
1126
- "step": 155
1127
- },
1128
- {
1129
- "epoch": 1.2283464566929134,
1130
- "grad_norm": 0.09666562080383301,
1131
- "learning_rate": 1.328280144832047e-05,
1132
- "loss": 0.0019,
1133
- "step": 156
1134
- },
1135
- {
1136
- "epoch": 1.236220472440945,
1137
- "grad_norm": 0.03880690038204193,
1138
- "learning_rate": 1.3202698538628376e-05,
1139
- "loss": 0.0003,
1140
- "step": 157
1141
- },
1142
- {
1143
- "epoch": 1.2440944881889764,
1144
- "grad_norm": 0.11940775066614151,
1145
- "learning_rate": 1.3122365979495259e-05,
1146
- "loss": 0.0024,
1147
- "step": 158
1148
- },
1149
- {
1150
- "epoch": 1.2519685039370079,
1151
- "grad_norm": 0.1442880481481552,
1152
- "learning_rate": 1.3041809531165819e-05,
1153
- "loss": 0.0015,
1154
- "step": 159
1155
- },
1156
- {
1157
- "epoch": 1.2598425196850394,
1158
- "grad_norm": 0.1961939036846161,
1159
- "learning_rate": 1.2961034969938732e-05,
1160
- "loss": 0.0056,
1161
- "step": 160
1162
- },
1163
- {
1164
- "epoch": 1.2677165354330708,
1165
- "grad_norm": 0.26947638392448425,
1166
- "learning_rate": 1.288004808775246e-05,
1167
- "loss": 0.0028,
1168
- "step": 161
1169
- },
1170
- {
1171
- "epoch": 1.2755905511811023,
1172
- "grad_norm": 0.5154056549072266,
1173
- "learning_rate": 1.2798854691769927e-05,
1174
- "loss": 0.0037,
1175
- "step": 162
1176
- },
1177
- {
1178
- "epoch": 1.2834645669291338,
1179
- "grad_norm": 0.4292369782924652,
1180
- "learning_rate": 1.2717460603962132e-05,
1181
- "loss": 0.0029,
1182
- "step": 163
1183
- },
1184
- {
1185
- "epoch": 1.2913385826771653,
1186
- "grad_norm": 0.19139212369918823,
1187
- "learning_rate": 1.2635871660690677e-05,
1188
- "loss": 0.0061,
1189
- "step": 164
1190
- },
1191
- {
1192
- "epoch": 1.2992125984251968,
1193
- "grad_norm": 0.19960306584835052,
1194
- "learning_rate": 1.2554093712289267e-05,
1195
- "loss": 0.005,
1196
- "step": 165
1197
- },
1198
- {
1199
- "epoch": 1.3070866141732282,
1200
- "grad_norm": 0.4523830711841583,
1201
- "learning_rate": 1.2472132622644222e-05,
1202
- "loss": 0.0065,
1203
- "step": 166
1204
- },
1205
- {
1206
- "epoch": 1.3149606299212597,
1207
- "grad_norm": 0.49343299865722656,
1208
- "learning_rate": 1.2389994268773995e-05,
1209
- "loss": 0.0061,
1210
- "step": 167
1211
- },
1212
- {
1213
- "epoch": 1.3228346456692912,
1214
- "grad_norm": 0.01938088797032833,
1215
- "learning_rate": 1.2307684540407775e-05,
1216
- "loss": 0.0001,
1217
- "step": 168
1218
- },
1219
- {
1220
- "epoch": 1.330708661417323,
1221
- "grad_norm": 0.3082112669944763,
1222
- "learning_rate": 1.2225209339563144e-05,
1223
- "loss": 0.0053,
1224
- "step": 169
1225
- },
1226
- {
1227
- "epoch": 1.3385826771653544,
1228
- "grad_norm": 0.01982509344816208,
1229
- "learning_rate": 1.2142574580122903e-05,
1230
- "loss": 0.0001,
1231
- "step": 170
1232
- },
1233
- {
1234
- "epoch": 1.3464566929133859,
1235
- "grad_norm": 0.12388588488101959,
1236
- "learning_rate": 1.2059786187410984e-05,
1237
- "loss": 0.0049,
1238
- "step": 171
1239
- },
1240
- {
1241
- "epoch": 1.3543307086614174,
1242
- "grad_norm": 0.43759095668792725,
1243
- "learning_rate": 1.1976850097767598e-05,
1244
- "loss": 0.0128,
1245
- "step": 172
1246
- },
1247
- {
1248
- "epoch": 1.3543307086614174,
1249
- "eval_loss": 0.3166251480579376,
1250
- "eval_runtime": 6.9515,
1251
- "eval_samples_per_second": 23.448,
1252
- "eval_steps_per_second": 3.021,
1253
- "step": 172
1254
- },
1255
- {
1256
- "epoch": 1.3622047244094488,
1257
- "grad_norm": 0.46561670303344727,
1258
- "learning_rate": 1.1893772258123554e-05,
1259
- "loss": 0.008,
1260
- "step": 173
1261
- },
1262
- {
1263
- "epoch": 1.3700787401574803,
1264
- "grad_norm": 0.16612188518047333,
1265
- "learning_rate": 1.1810558625573856e-05,
1266
- "loss": 0.0024,
1267
- "step": 174
1268
- },
1269
- {
1270
- "epoch": 1.3779527559055118,
1271
- "grad_norm": 0.13628093898296356,
1272
- "learning_rate": 1.1727215166950519e-05,
1273
- "loss": 0.0045,
1274
- "step": 175
1275
- },
1276
- {
1277
- "epoch": 1.3858267716535433,
1278
- "grad_norm": 0.565229058265686,
1279
- "learning_rate": 1.1643747858394743e-05,
1280
- "loss": 0.0103,
1281
- "step": 176
1282
- },
1283
- {
1284
- "epoch": 1.3937007874015748,
1285
- "grad_norm": 0.14550763368606567,
1286
- "learning_rate": 1.156016268492839e-05,
1287
- "loss": 0.0028,
1288
- "step": 177
1289
- },
1290
- {
1291
- "epoch": 1.4015748031496063,
1292
- "grad_norm": 0.12460129708051682,
1293
- "learning_rate": 1.1476465640024814e-05,
1294
- "loss": 0.0031,
1295
- "step": 178
1296
- },
1297
- {
1298
- "epoch": 1.4094488188976377,
1299
- "grad_norm": 0.19089221954345703,
1300
- "learning_rate": 1.1392662725179114e-05,
1301
- "loss": 0.0035,
1302
- "step": 179
1303
- },
1304
- {
1305
- "epoch": 1.4173228346456692,
1306
- "grad_norm": 0.6106573343276978,
1307
- "learning_rate": 1.1308759949477786e-05,
1308
- "loss": 0.0088,
1309
- "step": 180
1310
- },
1311
- {
1312
- "epoch": 1.425196850393701,
1313
- "grad_norm": 0.20053207874298096,
1314
- "learning_rate": 1.1224763329167859e-05,
1315
- "loss": 0.0033,
1316
- "step": 181
1317
- },
1318
- {
1319
- "epoch": 1.4330708661417324,
1320
- "grad_norm": 0.1984691321849823,
1321
- "learning_rate": 1.1140678887225468e-05,
1322
- "loss": 0.0051,
1323
- "step": 182
1324
- },
1325
- {
1326
- "epoch": 1.4409448818897639,
1327
- "grad_norm": 0.19264858961105347,
1328
- "learning_rate": 1.1056512652924014e-05,
1329
- "loss": 0.0046,
1330
- "step": 183
1331
- },
1332
- {
1333
- "epoch": 1.4488188976377954,
1334
- "grad_norm": 0.10979076474905014,
1335
- "learning_rate": 1.0972270661401812e-05,
1336
- "loss": 0.0031,
1337
- "step": 184
1338
- },
1339
- {
1340
- "epoch": 1.4566929133858268,
1341
- "grad_norm": 0.1744084656238556,
1342
- "learning_rate": 1.0887958953229349e-05,
1343
- "loss": 0.0024,
1344
- "step": 185
1345
- },
1346
- {
1347
- "epoch": 1.4645669291338583,
1348
- "grad_norm": 0.20646224915981293,
1349
- "learning_rate": 1.0803583573976137e-05,
1350
- "loss": 0.008,
1351
- "step": 186
1352
- },
1353
- {
1354
- "epoch": 1.4724409448818898,
1355
- "grad_norm": 0.14391584694385529,
1356
- "learning_rate": 1.0719150573777226e-05,
1357
- "loss": 0.004,
1358
- "step": 187
1359
- },
1360
- {
1361
- "epoch": 1.4803149606299213,
1362
- "grad_norm": 0.36887863278388977,
1363
- "learning_rate": 1.0634666006899375e-05,
1364
- "loss": 0.0074,
1365
- "step": 188
1366
- },
1367
- {
1368
- "epoch": 1.4881889763779528,
1369
- "grad_norm": 0.21352627873420715,
1370
- "learning_rate": 1.055013593130693e-05,
1371
- "loss": 0.0082,
1372
- "step": 189
1373
- },
1374
- {
1375
- "epoch": 1.4960629921259843,
1376
- "grad_norm": 0.22443020343780518,
1377
- "learning_rate": 1.046556640822744e-05,
1378
- "loss": 0.0087,
1379
- "step": 190
1380
- },
1381
- {
1382
- "epoch": 1.5039370078740157,
1383
- "grad_norm": 0.4243764281272888,
1384
- "learning_rate": 1.0380963501717034e-05,
1385
- "loss": 0.0068,
1386
- "step": 191
1387
- },
1388
- {
1389
- "epoch": 1.5118110236220472,
1390
- "grad_norm": 0.17558562755584717,
1391
- "learning_rate": 1.0296333278225599e-05,
1392
- "loss": 0.0054,
1393
- "step": 192
1394
- },
1395
- {
1396
- "epoch": 1.5196850393700787,
1397
- "grad_norm": 0.14842620491981506,
1398
- "learning_rate": 1.0211681806161787e-05,
1399
- "loss": 0.0031,
1400
- "step": 193
1401
- },
1402
- {
1403
- "epoch": 1.5275590551181102,
1404
- "grad_norm": 0.09316081553697586,
1405
- "learning_rate": 1.0127015155457875e-05,
1406
- "loss": 0.0013,
1407
- "step": 194
1408
- },
1409
- {
1410
- "epoch": 1.5354330708661417,
1411
- "grad_norm": 0.19795025885105133,
1412
- "learning_rate": 1.0042339397134528e-05,
1413
- "loss": 0.0051,
1414
- "step": 195
1415
- },
1416
- {
1417
- "epoch": 1.5433070866141732,
1418
- "grad_norm": 0.21606990694999695,
1419
- "learning_rate": 9.957660602865477e-06,
1420
- "loss": 0.0041,
1421
- "step": 196
1422
- },
1423
- {
1424
- "epoch": 1.5511811023622046,
1425
- "grad_norm": 0.18036173284053802,
1426
- "learning_rate": 9.872984844542128e-06,
1427
- "loss": 0.0037,
1428
- "step": 197
1429
- },
1430
- {
1431
- "epoch": 1.5590551181102361,
1432
- "grad_norm": 0.18953870236873627,
1433
- "learning_rate": 9.788318193838218e-06,
1434
- "loss": 0.0041,
1435
- "step": 198
1436
- },
1437
- {
1438
- "epoch": 1.5669291338582676,
1439
- "grad_norm": 0.12346503138542175,
1440
- "learning_rate": 9.703666721774403e-06,
1441
- "loss": 0.0035,
1442
- "step": 199
1443
- },
1444
- {
1445
- "epoch": 1.574803149606299,
1446
- "grad_norm": 0.4576225280761719,
1447
- "learning_rate": 9.619036498282968e-06,
1448
- "loss": 0.0041,
1449
- "step": 200
1450
- },
1451
- {
1452
- "epoch": 1.5826771653543306,
1453
- "grad_norm": 0.10333681106567383,
1454
- "learning_rate": 9.534433591772562e-06,
1455
- "loss": 0.0011,
1456
- "step": 201
1457
- },
1458
- {
1459
- "epoch": 1.590551181102362,
1460
- "grad_norm": 0.19167865812778473,
1461
- "learning_rate": 9.449864068693072e-06,
1462
- "loss": 0.0062,
1463
- "step": 202
1464
- },
1465
- {
1466
- "epoch": 1.5984251968503937,
1467
- "grad_norm": 0.2258184254169464,
1468
- "learning_rate": 9.365333993100628e-06,
1469
- "loss": 0.003,
1470
- "step": 203
1471
- },
1472
- {
1473
- "epoch": 1.6062992125984252,
1474
- "grad_norm": 0.07945302873849869,
1475
- "learning_rate": 9.280849426222778e-06,
1476
- "loss": 0.0008,
1477
- "step": 204
1478
- },
1479
- {
1480
- "epoch": 1.6141732283464567,
1481
- "grad_norm": 0.17767398059368134,
1482
- "learning_rate": 9.196416426023868e-06,
1483
- "loss": 0.0053,
1484
- "step": 205
1485
- },
1486
- {
1487
- "epoch": 1.6220472440944882,
1488
- "grad_norm": 0.12704500555992126,
1489
- "learning_rate": 9.112041046770653e-06,
1490
- "loss": 0.0023,
1491
- "step": 206
1492
- },
1493
- {
1494
- "epoch": 1.6299212598425197,
1495
- "grad_norm": 0.4054742753505707,
1496
- "learning_rate": 9.027729338598188e-06,
1497
- "loss": 0.0045,
1498
- "step": 207
1499
- },
1500
- {
1501
- "epoch": 1.6377952755905512,
1502
- "grad_norm": 0.4463757574558258,
1503
- "learning_rate": 8.943487347075988e-06,
1504
- "loss": 0.007,
1505
- "step": 208
1506
- },
1507
- {
1508
- "epoch": 1.6456692913385826,
1509
- "grad_norm": 0.6517045497894287,
1510
- "learning_rate": 8.859321112774535e-06,
1511
- "loss": 0.0052,
1512
- "step": 209
1513
- },
1514
- {
1515
- "epoch": 1.6535433070866141,
1516
- "grad_norm": 0.1542089730501175,
1517
- "learning_rate": 8.775236670832146e-06,
1518
- "loss": 0.0047,
1519
- "step": 210
1520
- },
1521
- {
1522
- "epoch": 1.6614173228346458,
1523
- "grad_norm": 0.14716440439224243,
1524
- "learning_rate": 8.691240050522215e-06,
1525
- "loss": 0.0049,
1526
- "step": 211
1527
- },
1528
- {
1529
- "epoch": 1.6692913385826773,
1530
- "grad_norm": 0.2997347116470337,
1531
- "learning_rate": 8.607337274820888e-06,
1532
- "loss": 0.0076,
1533
- "step": 212
1534
- },
1535
- {
1536
- "epoch": 1.6771653543307088,
1537
- "grad_norm": 0.22548256814479828,
1538
- "learning_rate": 8.52353435997519e-06,
1539
- "loss": 0.0063,
1540
- "step": 213
1541
- },
1542
- {
1543
- "epoch": 1.6850393700787403,
1544
- "grad_norm": 0.7220733165740967,
1545
- "learning_rate": 8.439837315071612e-06,
1546
- "loss": 0.0089,
1547
- "step": 214
1548
- },
1549
- {
1550
- "epoch": 1.6929133858267718,
1551
- "grad_norm": 0.5101618766784668,
1552
- "learning_rate": 8.35625214160526e-06,
1553
- "loss": 0.0042,
1554
- "step": 215
1555
- },
1556
- {
1557
- "epoch": 1.6929133858267718,
1558
- "eval_loss": 0.3484288156032562,
1559
- "eval_runtime": 6.4482,
1560
- "eval_samples_per_second": 25.278,
1561
- "eval_steps_per_second": 3.257,
1562
- "step": 215
1563
- },
1564
- {
1565
- "epoch": 1.7007874015748032,
1566
- "grad_norm": 0.1698393076658249,
1567
- "learning_rate": 8.272784833049485e-06,
1568
- "loss": 0.0028,
1569
- "step": 216
1570
- },
1571
- {
1572
- "epoch": 1.7086614173228347,
1573
- "grad_norm": 0.5772718191146851,
1574
- "learning_rate": 8.18944137442615e-06,
1575
- "loss": 0.0082,
1576
- "step": 217
1577
- },
1578
- {
1579
- "epoch": 1.7165354330708662,
1580
- "grad_norm": 0.09606469422578812,
1581
- "learning_rate": 8.106227741876447e-06,
1582
- "loss": 0.0011,
1583
- "step": 218
1584
- },
1585
- {
1586
- "epoch": 1.7244094488188977,
1587
- "grad_norm": 0.14510361850261688,
1588
- "learning_rate": 8.023149902232404e-06,
1589
- "loss": 0.0015,
1590
- "step": 219
1591
- },
1592
- {
1593
- "epoch": 1.7322834645669292,
1594
- "grad_norm": 0.055804118514060974,
1595
- "learning_rate": 7.940213812589018e-06,
1596
- "loss": 0.0008,
1597
- "step": 220
1598
- },
1599
- {
1600
- "epoch": 1.7401574803149606,
1601
- "grad_norm": 0.13318321108818054,
1602
- "learning_rate": 7.857425419877097e-06,
1603
- "loss": 0.005,
1604
- "step": 221
1605
- },
1606
- {
1607
- "epoch": 1.7480314960629921,
1608
- "grad_norm": 0.23600782454013824,
1609
- "learning_rate": 7.774790660436857e-06,
1610
- "loss": 0.0063,
1611
- "step": 222
1612
- },
1613
- {
1614
- "epoch": 1.7559055118110236,
1615
- "grad_norm": 0.8483791351318359,
1616
- "learning_rate": 7.69231545959223e-06,
1617
- "loss": 0.0027,
1618
- "step": 223
1619
- },
1620
- {
1621
- "epoch": 1.763779527559055,
1622
- "grad_norm": 0.16536197066307068,
1623
- "learning_rate": 7.610005731226009e-06,
1624
- "loss": 0.0039,
1625
- "step": 224
1626
- },
1627
- {
1628
- "epoch": 1.7716535433070866,
1629
- "grad_norm": 0.14446765184402466,
1630
- "learning_rate": 7.52786737735578e-06,
1631
- "loss": 0.0036,
1632
- "step": 225
1633
- },
1634
- {
1635
- "epoch": 1.779527559055118,
1636
- "grad_norm": 0.8880365490913391,
1637
- "learning_rate": 7.445906287710733e-06,
1638
- "loss": 0.0061,
1639
- "step": 226
1640
- },
1641
- {
1642
- "epoch": 1.7874015748031495,
1643
- "grad_norm": 0.151743084192276,
1644
- "learning_rate": 7.364128339309326e-06,
1645
- "loss": 0.0028,
1646
- "step": 227
1647
- },
1648
- {
1649
- "epoch": 1.795275590551181,
1650
- "grad_norm": 0.1224551647901535,
1651
- "learning_rate": 7.282539396037868e-06,
1652
- "loss": 0.002,
1653
- "step": 228
1654
- },
1655
- {
1656
- "epoch": 1.8031496062992125,
1657
- "grad_norm": 0.4868486225605011,
1658
- "learning_rate": 7.201145308230075e-06,
1659
- "loss": 0.0031,
1660
- "step": 229
1661
- },
1662
- {
1663
- "epoch": 1.811023622047244,
1664
- "grad_norm": 0.2875569462776184,
1665
- "learning_rate": 7.119951912247545e-06,
1666
- "loss": 0.0082,
1667
- "step": 230
1668
- },
1669
- {
1670
- "epoch": 1.8188976377952755,
1671
- "grad_norm": 0.43524420261383057,
1672
- "learning_rate": 7.038965030061273e-06,
1673
- "loss": 0.0075,
1674
- "step": 231
1675
- },
1676
- {
1677
- "epoch": 1.826771653543307,
1678
- "grad_norm": 0.39634883403778076,
1679
- "learning_rate": 6.9581904688341854e-06,
1680
- "loss": 0.0032,
1681
- "step": 232
1682
- },
1683
- {
1684
- "epoch": 1.8346456692913384,
1685
- "grad_norm": 0.9809433817863464,
1686
- "learning_rate": 6.8776340205047446e-06,
1687
- "loss": 0.0085,
1688
- "step": 233
1689
- },
1690
- {
1691
- "epoch": 1.84251968503937,
1692
- "grad_norm": 0.20062875747680664,
1693
- "learning_rate": 6.797301461371626e-06,
1694
- "loss": 0.0043,
1695
- "step": 234
1696
- },
1697
- {
1698
- "epoch": 1.8503937007874016,
1699
- "grad_norm": 0.148948073387146,
1700
- "learning_rate": 6.7171985516795315e-06,
1701
- "loss": 0.0036,
1702
- "step": 235
1703
- },
1704
- {
1705
- "epoch": 1.858267716535433,
1706
- "grad_norm": 0.15658679604530334,
1707
- "learning_rate": 6.637331035206166e-06,
1708
- "loss": 0.0046,
1709
- "step": 236
1710
- },
1711
- {
1712
- "epoch": 1.8661417322834646,
1713
- "grad_norm": 0.22365815937519073,
1714
- "learning_rate": 6.557704638850352e-06,
1715
- "loss": 0.0081,
1716
- "step": 237
1717
- },
1718
- {
1719
- "epoch": 1.874015748031496,
1720
- "grad_norm": 0.10596666485071182,
1721
- "learning_rate": 6.4783250722214066e-06,
1722
- "loss": 0.0032,
1723
- "step": 238
1724
- },
1725
- {
1726
- "epoch": 1.8818897637795275,
1727
- "grad_norm": 0.2130754142999649,
1728
- "learning_rate": 6.399198027229732e-06,
1729
- "loss": 0.0056,
1730
- "step": 239
1731
- },
1732
- {
1733
- "epoch": 1.889763779527559,
1734
- "grad_norm": 0.05641167238354683,
1735
- "learning_rate": 6.320329177678656e-06,
1736
- "loss": 0.0008,
1737
- "step": 240
1738
- },
1739
- {
1740
- "epoch": 1.8976377952755905,
1741
- "grad_norm": 0.10349344462156296,
1742
- "learning_rate": 6.241724178857621e-06,
1743
- "loss": 0.0026,
1744
- "step": 241
1745
- },
1746
- {
1747
- "epoch": 1.905511811023622,
1748
- "grad_norm": 0.08451675623655319,
1749
- "learning_rate": 6.163388667136646e-06,
1750
- "loss": 0.0016,
1751
- "step": 242
1752
- },
1753
- {
1754
- "epoch": 1.9133858267716537,
1755
- "grad_norm": 0.13671623170375824,
1756
- "learning_rate": 6.085328259562195e-06,
1757
- "loss": 0.0034,
1758
- "step": 243
1759
- },
1760
- {
1761
- "epoch": 1.9212598425196852,
1762
- "grad_norm": 0.5500523447990417,
1763
- "learning_rate": 6.007548553454379e-06,
1764
- "loss": 0.0028,
1765
- "step": 244
1766
- },
1767
- {
1768
- "epoch": 1.9291338582677167,
1769
- "grad_norm": 0.06702329218387604,
1770
- "learning_rate": 5.93005512600563e-06,
1771
- "loss": 0.0009,
1772
- "step": 245
1773
- },
1774
- {
1775
- "epoch": 1.9370078740157481,
1776
- "grad_norm": 0.15156973898410797,
1777
- "learning_rate": 5.852853533880768e-06,
1778
- "loss": 0.0064,
1779
- "step": 246
1780
- },
1781
- {
1782
- "epoch": 1.9448818897637796,
1783
- "grad_norm": 0.2970314621925354,
1784
- "learning_rate": 5.7759493128185584e-06,
1785
- "loss": 0.0077,
1786
- "step": 247
1787
- },
1788
- {
1789
- "epoch": 1.952755905511811,
1790
- "grad_norm": 0.06406261771917343,
1791
- "learning_rate": 5.699347977234799e-06,
1792
- "loss": 0.0006,
1793
- "step": 248
1794
- },
1795
- {
1796
- "epoch": 1.9606299212598426,
1797
- "grad_norm": 0.2910393178462982,
1798
- "learning_rate": 5.623055019826862e-06,
1799
- "loss": 0.0036,
1800
- "step": 249
1801
- },
1802
- {
1803
- "epoch": 1.968503937007874,
1804
- "grad_norm": 0.6454993486404419,
1805
- "learning_rate": 5.547075911179902e-06,
1806
- "loss": 0.0084,
1807
- "step": 250
1808
- },
1809
- {
1810
- "epoch": 1.9763779527559056,
1811
- "grad_norm": 0.09460143744945526,
1812
- "learning_rate": 5.471416099374525e-06,
1813
- "loss": 0.0021,
1814
- "step": 251
1815
- },
1816
- {
1817
- "epoch": 1.984251968503937,
1818
- "grad_norm": 0.2024363875389099,
1819
- "learning_rate": 5.3960810095961705e-06,
1820
- "loss": 0.0052,
1821
- "step": 252
1822
- },
1823
- {
1824
- "epoch": 1.9921259842519685,
1825
- "grad_norm": 0.09423142671585083,
1826
- "learning_rate": 5.321076043746108e-06,
1827
- "loss": 0.0018,
1828
- "step": 253
1829
- },
1830
- {
1831
- "epoch": 2.0,
1832
- "grad_norm": 0.1085880920290947,
1833
- "learning_rate": 5.246406580054051e-06,
1834
- "loss": 0.0039,
1835
- "step": 254
1836
- },
1837
- {
1838
- "epoch": 2.0078740157480315,
1839
- "grad_norm": 0.20550444722175598,
1840
- "learning_rate": 5.172077972692553e-06,
1841
- "loss": 0.0006,
1842
- "step": 255
1843
- },
1844
- {
1845
- "epoch": 2.015748031496063,
1846
- "grad_norm": 0.0635254830121994,
1847
- "learning_rate": 5.098095551393066e-06,
1848
- "loss": 0.0008,
1849
- "step": 256
1850
- },
1851
- {
1852
- "epoch": 2.0236220472440944,
1853
- "grad_norm": 0.12593789398670197,
1854
- "learning_rate": 5.024464621063773e-06,
1855
- "loss": 0.0016,
1856
- "step": 257
1857
- },
1858
- {
1859
- "epoch": 2.031496062992126,
1860
- "grad_norm": 0.08928010612726212,
1861
- "learning_rate": 4.951190461409214e-06,
1862
- "loss": 0.0019,
1863
- "step": 258
1864
- },
1865
- {
1866
- "epoch": 2.031496062992126,
1867
- "eval_loss": 0.2930968105792999,
1868
- "eval_runtime": 7.0864,
1869
- "eval_samples_per_second": 23.002,
1870
- "eval_steps_per_second": 2.963,
1871
- "step": 258
1872
- },
1873
- {
1874
- "epoch": 2.0393700787401574,
1875
- "grad_norm": 0.11555846035480499,
1876
- "learning_rate": 4.878278326551682e-06,
1877
- "loss": 0.0036,
1878
- "step": 259
1879
- },
1880
- {
1881
- "epoch": 2.047244094488189,
1882
- "grad_norm": 0.11923055350780487,
1883
- "learning_rate": 4.805733444654496e-06,
1884
- "loss": 0.0011,
1885
- "step": 260
1886
- },
1887
- {
1888
- "epoch": 2.0551181102362204,
1889
- "grad_norm": 0.5410908460617065,
1890
- "learning_rate": 4.733561017547104e-06,
1891
- "loss": 0.0065,
1892
- "step": 261
1893
- },
1894
- {
1895
- "epoch": 2.062992125984252,
1896
- "grad_norm": 0.43598446249961853,
1897
- "learning_rate": 4.661766220352098e-06,
1898
- "loss": 0.004,
1899
- "step": 262
1900
- },
1901
- {
1902
- "epoch": 2.0708661417322833,
1903
- "grad_norm": 0.08221737295389175,
1904
- "learning_rate": 4.590354201114103e-06,
1905
- "loss": 0.0018,
1906
- "step": 263
1907
- },
1908
- {
1909
- "epoch": 2.078740157480315,
1910
- "grad_norm": 0.07835202664136887,
1911
- "learning_rate": 4.519330080430687e-06,
1912
- "loss": 0.0011,
1913
- "step": 264
1914
- },
1915
- {
1916
- "epoch": 2.0866141732283463,
1917
- "grad_norm": 0.1391119360923767,
1918
- "learning_rate": 4.448698951085143e-06,
1919
- "loss": 0.0018,
1920
- "step": 265
1921
- },
1922
- {
1923
- "epoch": 2.094488188976378,
1924
- "grad_norm": 0.10286661982536316,
1925
- "learning_rate": 4.378465877681317e-06,
1926
- "loss": 0.0021,
1927
- "step": 266
1928
- },
1929
- {
1930
- "epoch": 2.1023622047244093,
1931
- "grad_norm": 0.16050903499126434,
1932
- "learning_rate": 4.3086358962804885e-06,
1933
- "loss": 0.004,
1934
- "step": 267
1935
- },
1936
- {
1937
- "epoch": 2.1102362204724407,
1938
- "grad_norm": 0.1615462303161621,
1939
- "learning_rate": 4.2392140140401996e-06,
1940
- "loss": 0.0049,
1941
- "step": 268
1942
- },
1943
- {
1944
- "epoch": 2.1181102362204722,
1945
- "grad_norm": 0.12022113800048828,
1946
- "learning_rate": 4.170205208855281e-06,
1947
- "loss": 0.0021,
1948
- "step": 269
1949
- },
1950
- {
1951
- "epoch": 2.1259842519685037,
1952
- "grad_norm": 0.18673180043697357,
1953
- "learning_rate": 4.101614429000857e-06,
1954
- "loss": 0.0026,
1955
- "step": 270
1956
- },
1957
- {
1958
- "epoch": 2.1338582677165356,
1959
- "grad_norm": 0.13400611281394958,
1960
- "learning_rate": 4.033446592777558e-06,
1961
- "loss": 0.0045,
1962
- "step": 271
1963
- },
1964
- {
1965
- "epoch": 2.141732283464567,
1966
- "grad_norm": 0.08963260799646378,
1967
- "learning_rate": 3.965706588158865e-06,
1968
- "loss": 0.002,
1969
- "step": 272
1970
- },
1971
- {
1972
- "epoch": 2.1496062992125986,
1973
- "grad_norm": 0.07362519204616547,
1974
- "learning_rate": 3.89839927244058e-06,
1975
- "loss": 0.0008,
1976
- "step": 273
1977
- },
1978
- {
1979
- "epoch": 2.15748031496063,
1980
- "grad_norm": 0.12438540160655975,
1981
- "learning_rate": 3.8315294718925656e-06,
1982
- "loss": 0.0032,
1983
- "step": 274
1984
- },
1985
- {
1986
- "epoch": 2.1653543307086616,
1987
- "grad_norm": 0.07505560666322708,
1988
- "learning_rate": 3.7651019814126656e-06,
1989
- "loss": 0.0011,
1990
- "step": 275
1991
- },
1992
- {
1993
- "epoch": 2.173228346456693,
1994
- "grad_norm": 0.24100656807422638,
1995
- "learning_rate": 3.6991215641828903e-06,
1996
- "loss": 0.0039,
1997
- "step": 276
1998
- },
1999
- {
2000
- "epoch": 2.1811023622047245,
2001
- "grad_norm": 0.08774268627166748,
2002
- "learning_rate": 3.6335929513278667e-06,
2003
- "loss": 0.0021,
2004
- "step": 277
2005
- },
2006
- {
2007
- "epoch": 2.188976377952756,
2008
- "grad_norm": 0.06761056184768677,
2009
- "learning_rate": 3.568520841575601e-06,
2010
- "loss": 0.0004,
2011
- "step": 278
2012
- },
2013
- {
2014
- "epoch": 2.1968503937007875,
2015
- "grad_norm": 0.514453113079071,
2016
- "learning_rate": 3.5039099009205503e-06,
2017
- "loss": 0.002,
2018
- "step": 279
2019
- },
2020
- {
2021
- "epoch": 2.204724409448819,
2022
- "grad_norm": 0.1681102067232132,
2023
- "learning_rate": 3.439764762289051e-06,
2024
- "loss": 0.0049,
2025
- "step": 280
2026
- },
2027
- {
2028
- "epoch": 2.2125984251968505,
2029
- "grad_norm": 0.46447646617889404,
2030
- "learning_rate": 3.376090025207115e-06,
2031
- "loss": 0.0037,
2032
- "step": 281
2033
- },
2034
- {
2035
- "epoch": 2.220472440944882,
2036
- "grad_norm": 0.09738212823867798,
2037
- "learning_rate": 3.312890255470609e-06,
2038
- "loss": 0.0018,
2039
- "step": 282
2040
- },
2041
- {
2042
- "epoch": 2.2283464566929134,
2043
- "grad_norm": 0.12760388851165771,
2044
- "learning_rate": 3.250169984817897e-06,
2045
- "loss": 0.0022,
2046
- "step": 283
2047
- },
2048
- {
2049
- "epoch": 2.236220472440945,
2050
- "grad_norm": 0.05433168262243271,
2051
- "learning_rate": 3.187933710604847e-06,
2052
- "loss": 0.0005,
2053
- "step": 284
2054
- },
2055
- {
2056
- "epoch": 2.2440944881889764,
2057
- "grad_norm": 0.06812359392642975,
2058
- "learning_rate": 3.1261858954823798e-06,
2059
- "loss": 0.0007,
2060
- "step": 285
2061
- },
2062
- {
2063
- "epoch": 2.251968503937008,
2064
- "grad_norm": 0.44168326258659363,
2065
- "learning_rate": 3.064930967076477e-06,
2066
- "loss": 0.0052,
2067
- "step": 286
2068
- },
2069
- {
2070
- "epoch": 2.2598425196850394,
2071
- "grad_norm": 0.4508403241634369,
2072
- "learning_rate": 3.0041733176706668e-06,
2073
- "loss": 0.0049,
2074
- "step": 287
2075
- },
2076
- {
2077
- "epoch": 2.267716535433071,
2078
- "grad_norm": 0.00029889008146710694,
2079
- "learning_rate": 2.943917303891107e-06,
2080
- "loss": 0.0,
2081
- "step": 288
2082
- },
2083
- {
2084
- "epoch": 2.2755905511811023,
2085
- "grad_norm": 0.16293245553970337,
2086
- "learning_rate": 2.8841672463941827e-06,
2087
- "loss": 0.0052,
2088
- "step": 289
2089
- },
2090
- {
2091
- "epoch": 2.283464566929134,
2092
- "grad_norm": 0.0034355763345956802,
2093
- "learning_rate": 2.8249274295566863e-06,
2094
- "loss": 0.0,
2095
- "step": 290
2096
- },
2097
- {
2098
- "epoch": 2.2913385826771653,
2099
- "grad_norm": 0.41321080923080444,
2100
- "learning_rate": 2.766202101168628e-06,
2101
- "loss": 0.0042,
2102
- "step": 291
2103
- },
2104
- {
2105
- "epoch": 2.2992125984251968,
2106
- "grad_norm": 0.05302264913916588,
2107
- "learning_rate": 2.7079954721286108e-06,
2108
- "loss": 0.0008,
2109
- "step": 292
2110
- },
2111
- {
2112
- "epoch": 2.3070866141732282,
2113
- "grad_norm": 0.16997075080871582,
2114
- "learning_rate": 2.6503117161419246e-06,
2115
- "loss": 0.0049,
2116
- "step": 293
2117
- },
2118
- {
2119
- "epoch": 2.3149606299212597,
2120
- "grad_norm": 0.15489016473293304,
2121
- "learning_rate": 2.5931549694212545e-06,
2122
- "loss": 0.0029,
2123
- "step": 294
2124
- },
2125
- {
2126
- "epoch": 2.322834645669291,
2127
- "grad_norm": 0.040922824293375015,
2128
- "learning_rate": 2.536529330390095e-06,
2129
- "loss": 0.0003,
2130
- "step": 295
2131
- },
2132
- {
2133
- "epoch": 2.3307086614173227,
2134
- "grad_norm": 0.15096415579319,
2135
- "learning_rate": 2.480438859388873e-06,
2136
- "loss": 0.0037,
2137
- "step": 296
2138
- },
2139
- {
2140
- "epoch": 2.338582677165354,
2141
- "grad_norm": 0.05358278378844261,
2142
- "learning_rate": 2.424887578383799e-06,
2143
- "loss": 0.0004,
2144
- "step": 297
2145
- },
2146
- {
2147
- "epoch": 2.3464566929133857,
2148
- "grad_norm": 0.16193096339702606,
2149
- "learning_rate": 2.36987947067848e-06,
2150
- "loss": 0.0025,
2151
- "step": 298
2152
- },
2153
- {
2154
- "epoch": 2.354330708661417,
2155
- "grad_norm": 0.10353274643421173,
2156
- "learning_rate": 2.3154184806282863e-06,
2157
- "loss": 0.0021,
2158
- "step": 299
2159
- },
2160
- {
2161
- "epoch": 2.362204724409449,
2162
- "grad_norm": 0.10735179483890533,
2163
- "learning_rate": 2.261508513357532e-06,
2164
- "loss": 0.0035,
2165
- "step": 300
2166
- },
2167
- {
2168
- "epoch": 2.3700787401574805,
2169
- "grad_norm": 0.18752367794513702,
2170
- "learning_rate": 2.208153434479442e-06,
2171
- "loss": 0.0039,
2172
- "step": 301
2173
- },
2174
- {
2175
- "epoch": 2.3700787401574805,
2176
- "eval_loss": 0.30320534110069275,
2177
- "eval_runtime": 6.5784,
2178
- "eval_samples_per_second": 24.778,
2179
- "eval_steps_per_second": 3.192,
2180
- "step": 301
2181
- },
2182
- {
2183
- "epoch": 2.377952755905512,
2184
- "grad_norm": 0.13881297409534454,
2185
- "learning_rate": 2.155357069818995e-06,
2186
- "loss": 0.0032,
2187
- "step": 302
2188
- },
2189
- {
2190
- "epoch": 2.3858267716535435,
2191
- "grad_norm": 0.09920285642147064,
2192
- "learning_rate": 2.1031232051385606e-06,
2193
- "loss": 0.0021,
2194
- "step": 303
2195
- },
2196
- {
2197
- "epoch": 2.393700787401575,
2198
- "grad_norm": 0.37194201350212097,
2199
- "learning_rate": 2.0514555858664663e-06,
2200
- "loss": 0.0045,
2201
- "step": 304
2202
- },
2203
- {
2204
- "epoch": 2.4015748031496065,
2205
- "grad_norm": 0.10560385882854462,
2206
- "learning_rate": 2.000357916828428e-06,
2207
- "loss": 0.0011,
2208
- "step": 305
2209
- },
2210
- {
2211
- "epoch": 2.409448818897638,
2212
- "grad_norm": 0.33549824357032776,
2213
- "learning_rate": 1.949833861981877e-06,
2214
- "loss": 0.0039,
2215
- "step": 306
2216
- },
2217
- {
2218
- "epoch": 2.4173228346456694,
2219
- "grad_norm": 0.3969619870185852,
2220
- "learning_rate": 1.8998870441532569e-06,
2221
- "loss": 0.0027,
2222
- "step": 307
2223
- },
2224
- {
2225
- "epoch": 2.425196850393701,
2226
- "grad_norm": 0.081158846616745,
2227
- "learning_rate": 1.8505210447782418e-06,
2228
- "loss": 0.0011,
2229
- "step": 308
2230
- },
2231
- {
2232
- "epoch": 2.4330708661417324,
2233
- "grad_norm": 0.28652095794677734,
2234
- "learning_rate": 1.8017394036449276e-06,
2235
- "loss": 0.0038,
2236
- "step": 309
2237
- },
2238
- {
2239
- "epoch": 2.440944881889764,
2240
- "grad_norm": 0.0656951516866684,
2241
- "learning_rate": 1.7535456186400123e-06,
2242
- "loss": 0.001,
2243
- "step": 310
2244
- },
2245
- {
2246
- "epoch": 2.4488188976377954,
2247
- "grad_norm": 0.14871421456336975,
2248
- "learning_rate": 1.7059431454979825e-06,
2249
- "loss": 0.0027,
2250
- "step": 311
2251
- },
2252
- {
2253
- "epoch": 2.456692913385827,
2254
- "grad_norm": 0.25429457426071167,
2255
- "learning_rate": 1.6589353975533174e-06,
2256
- "loss": 0.0012,
2257
- "step": 312
2258
- },
2259
- {
2260
- "epoch": 2.4645669291338583,
2261
- "grad_norm": 0.06939385086297989,
2262
- "learning_rate": 1.6125257454957365e-06,
2263
- "loss": 0.0008,
2264
- "step": 313
2265
- },
2266
- {
2267
- "epoch": 2.47244094488189,
2268
- "grad_norm": 0.15781065821647644,
2269
- "learning_rate": 1.5667175171285054e-06,
2270
- "loss": 0.003,
2271
- "step": 314
2272
- },
2273
- {
2274
- "epoch": 2.4803149606299213,
2275
- "grad_norm": 0.08229056000709534,
2276
- "learning_rate": 1.5215139971298131e-06,
2277
- "loss": 0.0015,
2278
- "step": 315
2279
- },
2280
- {
2281
- "epoch": 2.4881889763779528,
2282
- "grad_norm": 0.16827985644340515,
2283
- "learning_rate": 1.4769184268172465e-06,
2284
- "loss": 0.0032,
2285
- "step": 316
2286
- },
2287
- {
2288
- "epoch": 2.4960629921259843,
2289
- "grad_norm": 0.12261717021465302,
2290
- "learning_rate": 1.4329340039153738e-06,
2291
- "loss": 0.0022,
2292
- "step": 317
2293
- },
2294
- {
2295
- "epoch": 2.5039370078740157,
2296
- "grad_norm": 0.1208304911851883,
2297
- "learning_rate": 1.3895638823264447e-06,
2298
- "loss": 0.002,
2299
- "step": 318
2300
- },
2301
- {
2302
- "epoch": 2.5118110236220472,
2303
- "grad_norm": 0.22991932928562164,
2304
- "learning_rate": 1.3468111719042497e-06,
2305
- "loss": 0.0027,
2306
- "step": 319
2307
- },
2308
- {
2309
- "epoch": 2.5196850393700787,
2310
- "grad_norm": 0.468462198972702,
2311
- "learning_rate": 1.3046789382311132e-06,
2312
- "loss": 0.0042,
2313
- "step": 320
2314
- },
2315
- {
2316
- "epoch": 2.52755905511811,
2317
- "grad_norm": 0.029908303171396255,
2318
- "learning_rate": 1.2631702023980997e-06,
2319
- "loss": 0.0002,
2320
- "step": 321
2321
- },
2322
- {
2323
- "epoch": 2.5354330708661417,
2324
- "grad_norm": 0.07678980380296707,
2325
- "learning_rate": 1.2222879407883592e-06,
2326
- "loss": 0.0014,
2327
- "step": 322
2328
- },
2329
- {
2330
- "epoch": 2.543307086614173,
2331
- "grad_norm": 0.13547496497631073,
2332
- "learning_rate": 1.182035084863724e-06,
2333
- "loss": 0.0017,
2334
- "step": 323
2335
- },
2336
- {
2337
- "epoch": 2.5511811023622046,
2338
- "grad_norm": 0.15075382590293884,
2339
- "learning_rate": 1.1424145209545079e-06,
2340
- "loss": 0.0059,
2341
- "step": 324
2342
- },
2343
- {
2344
- "epoch": 2.559055118110236,
2345
- "grad_norm": 0.1271948516368866,
2346
- "learning_rate": 1.1034290900525279e-06,
2347
- "loss": 0.0021,
2348
- "step": 325
2349
- },
2350
- {
2351
- "epoch": 2.5669291338582676,
2352
- "grad_norm": 0.11441997438669205,
2353
- "learning_rate": 1.065081587607406e-06,
2354
- "loss": 0.0022,
2355
- "step": 326
2356
- },
2357
- {
2358
- "epoch": 2.574803149606299,
2359
- "grad_norm": 0.13326182961463928,
2360
- "learning_rate": 1.0273747633261144e-06,
2361
- "loss": 0.004,
2362
- "step": 327
2363
- },
2364
- {
2365
- "epoch": 2.5826771653543306,
2366
- "grad_norm": 0.07804345339536667,
2367
- "learning_rate": 9.903113209758098e-07,
2368
- "loss": 0.0018,
2369
- "step": 328
2370
- },
2371
- {
2372
- "epoch": 2.590551181102362,
2373
- "grad_norm": 0.0012728713918477297,
2374
- "learning_rate": 9.538939181899565e-07,
2375
- "loss": 0.0,
2376
- "step": 329
2377
- },
2378
- {
2379
- "epoch": 2.5984251968503935,
2380
- "grad_norm": 0.06427028775215149,
2381
- "learning_rate": 9.181251662777668e-07,
2382
- "loss": 0.0007,
2383
- "step": 330
2384
- },
2385
- {
2386
- "epoch": 2.606299212598425,
2387
- "grad_norm": 0.1923428475856781,
2388
- "learning_rate": 8.830076300369517e-07,
2389
- "loss": 0.006,
2390
- "step": 331
2391
- },
2392
- {
2393
- "epoch": 2.6141732283464565,
2394
- "grad_norm": 0.33056169748306274,
2395
- "learning_rate": 8.485438275698154e-07,
2396
- "loss": 0.0024,
2397
- "step": 332
2398
- },
2399
- {
2400
- "epoch": 2.622047244094488,
2401
- "grad_norm": 0.13692541420459747,
2402
- "learning_rate": 8.14736230102694e-07,
2403
- "loss": 0.0019,
2404
- "step": 333
2405
- },
2406
- {
2407
- "epoch": 2.6299212598425195,
2408
- "grad_norm": 0.11543405055999756,
2409
- "learning_rate": 7.815872618087506e-07,
2410
- "loss": 0.003,
2411
- "step": 334
2412
- },
2413
- {
2414
- "epoch": 2.637795275590551,
2415
- "grad_norm": 0.20871274173259735,
2416
- "learning_rate": 7.490992996341662e-07,
2417
- "loss": 0.0022,
2418
- "step": 335
2419
- },
2420
- {
2421
- "epoch": 2.6456692913385824,
2422
- "grad_norm": 0.1506434828042984,
2423
- "learning_rate": 7.17274673127677e-07,
2424
- "loss": 0.0034,
2425
- "step": 336
2426
- },
2427
- {
2428
- "epoch": 2.653543307086614,
2429
- "grad_norm": 0.1000061109662056,
2430
- "learning_rate": 6.861156642735578e-07,
2431
- "loss": 0.0015,
2432
- "step": 337
2433
- },
2434
- {
2435
- "epoch": 2.661417322834646,
2436
- "grad_norm": 0.04730301722884178,
2437
- "learning_rate": 6.556245073279777e-07,
2438
- "loss": 0.0003,
2439
- "step": 338
2440
- },
2441
- {
2442
- "epoch": 2.6692913385826773,
2443
- "grad_norm": 0.07712409645318985,
2444
- "learning_rate": 6.258033886587911e-07,
2445
- "loss": 0.0006,
2446
- "step": 339
2447
- },
2448
- {
2449
- "epoch": 2.677165354330709,
2450
- "grad_norm": 0.12951001524925232,
2451
- "learning_rate": 5.966544465887803e-07,
2452
- "loss": 0.0022,
2453
- "step": 340
2454
- },
2455
- {
2456
- "epoch": 2.6850393700787403,
2457
- "grad_norm": 0.3450707495212555,
2458
- "learning_rate": 5.681797712423099e-07,
2459
- "loss": 0.0031,
2460
- "step": 341
2461
- },
2462
- {
2463
- "epoch": 2.6929133858267718,
2464
- "grad_norm": 0.11356323957443237,
2465
- "learning_rate": 5.403814043954592e-07,
2466
- "loss": 0.0016,
2467
- "step": 342
2468
- },
2469
- {
2470
- "epoch": 2.7007874015748032,
2471
- "grad_norm": 0.40962764620780945,
2472
- "learning_rate": 5.132613393296293e-07,
2473
- "loss": 0.0022,
2474
- "step": 343
2475
- },
2476
- {
2477
- "epoch": 2.7086614173228347,
2478
- "grad_norm": 0.0026160525158047676,
2479
- "learning_rate": 4.868215206885918e-07,
2480
- "loss": 0.0,
2481
- "step": 344
2482
- },
2483
- {
2484
- "epoch": 2.7086614173228347,
2485
- "eval_loss": 0.3102666437625885,
2486
- "eval_runtime": 7.3029,
2487
- "eval_samples_per_second": 22.32,
2488
- "eval_steps_per_second": 2.876,
2489
- "step": 344
2490
- },
2491
- {
2492
- "epoch": 2.716535433070866,
2493
- "grad_norm": 0.2460733950138092,
2494
- "learning_rate": 4.61063844339068e-07,
2495
- "loss": 0.0044,
2496
- "step": 345
2497
- },
2498
- {
2499
- "epoch": 2.7244094488188977,
2500
- "grad_norm": 0.11104279011487961,
2501
- "learning_rate": 4.359901572347758e-07,
2502
- "loss": 0.0031,
2503
- "step": 346
2504
- },
2505
- {
2506
- "epoch": 2.732283464566929,
2507
- "grad_norm": 0.288809210062027,
2508
- "learning_rate": 4.116022572839984e-07,
2509
- "loss": 0.0023,
2510
- "step": 347
2511
- },
2512
- {
2513
- "epoch": 2.7401574803149606,
2514
- "grad_norm": 0.2904239892959595,
2515
- "learning_rate": 3.879018932206624e-07,
2516
- "loss": 0.001,
2517
- "step": 348
2518
- },
2519
- {
2520
- "epoch": 2.748031496062992,
2521
- "grad_norm": 0.5172310471534729,
2522
- "learning_rate": 3.6489076447894456e-07,
2523
- "loss": 0.0023,
2524
- "step": 349
2525
- },
2526
- {
2527
- "epoch": 2.7559055118110236,
2528
- "grad_norm": 0.555241048336029,
2529
- "learning_rate": 3.425705210714192e-07,
2530
- "loss": 0.0026,
2531
- "step": 350
2532
- },
2533
- {
2534
- "epoch": 2.763779527559055,
2535
- "grad_norm": 0.12381427735090256,
2536
- "learning_rate": 3.2094276347073626e-07,
2537
- "loss": 0.002,
2538
- "step": 351
2539
- },
2540
- {
2541
- "epoch": 2.7716535433070866,
2542
- "grad_norm": 0.16744810342788696,
2543
- "learning_rate": 3.000090424948665e-07,
2544
- "loss": 0.0036,
2545
- "step": 352
2546
- },
2547
- {
2548
- "epoch": 2.779527559055118,
2549
- "grad_norm": 0.512416422367096,
2550
- "learning_rate": 2.7977085919589253e-07,
2551
- "loss": 0.0026,
2552
- "step": 353
2553
- },
2554
- {
2555
- "epoch": 2.7874015748031495,
2556
- "grad_norm": 0.18864978849887848,
2557
- "learning_rate": 2.602296647523861e-07,
2558
- "loss": 0.0025,
2559
- "step": 354
2560
- },
2561
- {
2562
- "epoch": 2.795275590551181,
2563
- "grad_norm": 0.012189923785626888,
2564
- "learning_rate": 2.413868603653413e-07,
2565
- "loss": 0.0001,
2566
- "step": 355
2567
- },
2568
- {
2569
- "epoch": 2.8031496062992125,
2570
- "grad_norm": 0.10027037560939789,
2571
- "learning_rate": 2.2324379715770728e-07,
2572
- "loss": 0.0011,
2573
- "step": 356
2574
- },
2575
- {
2576
- "epoch": 2.811023622047244,
2577
- "grad_norm": 0.10117685794830322,
2578
- "learning_rate": 2.0580177607750663e-07,
2579
- "loss": 0.0036,
2580
- "step": 357
2581
- },
2582
- {
2583
- "epoch": 2.8188976377952755,
2584
- "grad_norm": 0.1535252332687378,
2585
- "learning_rate": 1.890620478045435e-07,
2586
- "loss": 0.0044,
2587
- "step": 358
2588
- },
2589
- {
2590
- "epoch": 2.826771653543307,
2591
- "grad_norm": 0.39140409231185913,
2592
- "learning_rate": 1.7302581266073537e-07,
2593
- "loss": 0.0037,
2594
- "step": 359
2595
- },
2596
- {
2597
- "epoch": 2.8346456692913384,
2598
- "grad_norm": 0.18143348395824432,
2599
- "learning_rate": 1.5769422052403172e-07,
2600
- "loss": 0.0033,
2601
- "step": 360
2602
- },
2603
- {
2604
- "epoch": 2.84251968503937,
2605
- "grad_norm": 0.6282801032066345,
2606
- "learning_rate": 1.4306837074597235e-07,
2607
- "loss": 0.0096,
2608
- "step": 361
2609
- },
2610
- {
2611
- "epoch": 2.850393700787402,
2612
- "grad_norm": 0.3672868311405182,
2613
- "learning_rate": 1.2914931207285154e-07,
2614
- "loss": 0.0014,
2615
- "step": 362
2616
- },
2617
- {
2618
- "epoch": 2.8582677165354333,
2619
- "grad_norm": 0.13403712213039398,
2620
- "learning_rate": 1.1593804257052143e-07,
2621
- "loss": 0.0046,
2622
- "step": 363
2623
- },
2624
- {
2625
- "epoch": 2.866141732283465,
2626
- "grad_norm": 0.004047819878906012,
2627
- "learning_rate": 1.0343550955282278e-07,
2628
- "loss": 0.0,
2629
- "step": 364
2630
- },
2631
- {
2632
- "epoch": 2.8740157480314963,
2633
- "grad_norm": 0.3351942002773285,
2634
- "learning_rate": 9.164260951366021e-08,
2635
- "loss": 0.0024,
2636
- "step": 365
2637
- },
2638
- {
2639
- "epoch": 2.8818897637795278,
2640
- "grad_norm": 0.09759978204965591,
2641
- "learning_rate": 8.056018806271937e-08,
2642
- "loss": 0.002,
2643
- "step": 366
2644
- },
2645
- {
2646
- "epoch": 2.8897637795275593,
2647
- "grad_norm": 0.06213594600558281,
2648
- "learning_rate": 7.018903986483083e-08,
2649
- "loss": 0.0009,
2650
- "step": 367
2651
- },
2652
- {
2653
- "epoch": 2.8976377952755907,
2654
- "grad_norm": 0.07074209302663803,
2655
- "learning_rate": 6.052990858298801e-08,
2656
- "loss": 0.0009,
2657
- "step": 368
2658
- },
2659
- {
2660
- "epoch": 2.905511811023622,
2661
- "grad_norm": 0.271335631608963,
2662
- "learning_rate": 5.158348682502756e-08,
2663
- "loss": 0.0037,
2664
- "step": 369
2665
- },
2666
- {
2667
- "epoch": 2.9133858267716537,
2668
- "grad_norm": 0.09063868969678879,
2669
- "learning_rate": 4.335041609396018e-08,
2670
- "loss": 0.0014,
2671
- "step": 370
2672
- },
2673
- {
2674
- "epoch": 2.921259842519685,
2675
- "grad_norm": 0.818594753742218,
2676
- "learning_rate": 3.5831286741973006e-08,
2677
- "loss": 0.0033,
2678
- "step": 371
2679
- },
2680
- {
2681
- "epoch": 2.9291338582677167,
2682
- "grad_norm": 0.09543661028146744,
2683
- "learning_rate": 2.902663792810012e-08,
2684
- "loss": 0.0015,
2685
- "step": 372
2686
- },
2687
- {
2688
- "epoch": 2.937007874015748,
2689
- "grad_norm": 0.13098907470703125,
2690
- "learning_rate": 2.293695757956571e-08,
2691
- "loss": 0.0037,
2692
- "step": 373
2693
- },
2694
- {
2695
- "epoch": 2.9448818897637796,
2696
- "grad_norm": 0.5491423010826111,
2697
- "learning_rate": 1.7562682356786488e-08,
2698
- "loss": 0.004,
2699
- "step": 374
2700
- },
2701
- {
2702
- "epoch": 2.952755905511811,
2703
- "grad_norm": 0.08357255905866623,
2704
- "learning_rate": 1.290419762207007e-08,
2705
- "loss": 0.0015,
2706
- "step": 375
2707
- },
2708
- {
2709
- "epoch": 2.9606299212598426,
2710
- "grad_norm": 0.24269114434719086,
2711
- "learning_rate": 8.961837411982643e-09,
2712
- "loss": 0.0028,
2713
- "step": 376
2714
- },
2715
- {
2716
- "epoch": 2.968503937007874,
2717
- "grad_norm": 0.1084604412317276,
2718
- "learning_rate": 5.735884413391457e-09,
2719
- "loss": 0.0022,
2720
- "step": 377
2721
- },
2722
- {
2723
- "epoch": 2.9763779527559056,
2724
- "grad_norm": 0.09172981232404709,
2725
- "learning_rate": 3.226569943197699e-09,
2726
- "loss": 0.0022,
2727
- "step": 378
2728
- },
2729
- {
2730
- "epoch": 2.984251968503937,
2731
- "grad_norm": 0.1312946230173111,
2732
- "learning_rate": 1.4340739317497688e-09,
2733
- "loss": 0.002,
2734
- "step": 379
2735
- },
2736
- {
2737
- "epoch": 2.9921259842519685,
2738
- "grad_norm": 0.0002817026397679001,
2739
- "learning_rate": 3.585249099435917e-10,
2740
- "loss": 0.0,
2741
- "step": 380
2742
- },
2743
- {
2744
- "epoch": 3.0,
2745
- "grad_norm": 0.2318553477525711,
2746
- "learning_rate": 0.0,
2747
- "loss": 0.0007,
2748
- "step": 381
2749
- }
2750
- ],
2751
- "logging_steps": 1,
2752
- "max_steps": 381,
2753
- "num_input_tokens_seen": 0,
2754
- "num_train_epochs": 3,
2755
- "save_steps": 127,
2756
- "stateful_callbacks": {
2757
- "TrainerControl": {
2758
- "args": {
2759
- "should_epoch_stop": false,
2760
- "should_evaluate": false,
2761
- "should_log": false,
2762
- "should_save": true,
2763
- "should_training_stop": true
2764
- },
2765
- "attributes": {}
2766
- }
2767
- },
2768
- "total_flos": 3.9261813209667994e+17,
2769
- "train_batch_size": 128,
2770
- "trial_name": null,
2771
- "trial_params": null
2772
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-381/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:033fc2cc0303528d4e9ad523b3fd63b75e963b86dba301044379df1d98e6c394
3
- size 10744
 
 
 
 
checkpoint-381/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoint-381/zero_to_fp32.py DELETED
@@ -1,760 +0,0 @@
1
- #!/usr/bin/env python
2
-
3
- # Copyright (c) Microsoft Corporation.
4
- # SPDX-License-Identifier: Apache-2.0
5
-
6
- # DeepSpeed Team
7
-
8
- # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
- # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
- # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
- # application.
12
- #
13
- # example:
14
- # python zero_to_fp32.py . output_dir/
15
- # or
16
- # python zero_to_fp32.py . output_dir/ --safe_serialization
17
-
18
- import argparse
19
- import torch
20
- import glob
21
- import math
22
- import os
23
- import re
24
- import gc
25
- import json
26
- import numpy as np
27
- from tqdm import tqdm
28
- from collections import OrderedDict
29
- from dataclasses import dataclass
30
-
31
- # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
- # DeepSpeed data structures it has to be available in the current python environment.
33
- from deepspeed.utils import logger
34
- from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
- FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
- FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
-
38
-
39
- @dataclass
40
- class zero_model_state:
41
- buffers: dict()
42
- param_shapes: dict()
43
- shared_params: list
44
- ds_version: int
45
- frozen_param_shapes: dict()
46
- frozen_param_fragments: dict()
47
-
48
-
49
- debug = 0
50
-
51
- # load to cpu
52
- device = torch.device('cpu')
53
-
54
-
55
- def atoi(text):
56
- return int(text) if text.isdigit() else text
57
-
58
-
59
- def natural_keys(text):
60
- '''
61
- alist.sort(key=natural_keys) sorts in human order
62
- http://nedbatchelder.com/blog/200712/human_sorting.html
63
- (See Toothy's implementation in the comments)
64
- '''
65
- return [atoi(c) for c in re.split(r'(\d+)', text)]
66
-
67
-
68
- def get_model_state_file(checkpoint_dir, zero_stage):
69
- if not os.path.isdir(checkpoint_dir):
70
- raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
-
72
- # there should be only one file
73
- if zero_stage <= 2:
74
- file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
- elif zero_stage == 3:
76
- file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
-
78
- if not os.path.exists(file):
79
- raise FileNotFoundError(f"can't find model states file at '{file}'")
80
-
81
- return file
82
-
83
-
84
- def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
- # XXX: need to test that this simple glob rule works for multi-node setup too
86
- ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
-
88
- if len(ckpt_files) == 0:
89
- raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
-
91
- return ckpt_files
92
-
93
-
94
- def get_optim_files(checkpoint_dir):
95
- return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
-
97
-
98
- def get_model_state_files(checkpoint_dir):
99
- return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
-
101
-
102
- def parse_model_states(files):
103
- zero_model_states = []
104
- for file in files:
105
- state_dict = torch.load(file, map_location=device, weights_only=False)
106
-
107
- if BUFFER_NAMES not in state_dict:
108
- raise ValueError(f"{file} is not a model state checkpoint")
109
- buffer_names = state_dict[BUFFER_NAMES]
110
- if debug:
111
- print("Found buffers:", buffer_names)
112
-
113
- # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
- buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
- param_shapes = state_dict[PARAM_SHAPES]
116
-
117
- # collect parameters that are included in param_shapes
118
- param_names = []
119
- for s in param_shapes:
120
- for name in s.keys():
121
- param_names.append(name)
122
-
123
- # update with frozen parameters
124
- frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
- if frozen_param_shapes is not None:
126
- if debug:
127
- print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
- param_names += list(frozen_param_shapes.keys())
129
-
130
- # handle shared params
131
- shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
-
133
- ds_version = state_dict.get(DS_VERSION, None)
134
-
135
- frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
-
137
- z_model_state = zero_model_state(buffers=buffers,
138
- param_shapes=param_shapes,
139
- shared_params=shared_params,
140
- ds_version=ds_version,
141
- frozen_param_shapes=frozen_param_shapes,
142
- frozen_param_fragments=frozen_param_fragments)
143
- zero_model_states.append(z_model_state)
144
-
145
- return zero_model_states
146
-
147
-
148
- def parse_optim_states(files, ds_checkpoint_dir):
149
- total_files = len(files)
150
- state_dicts = []
151
- for f in tqdm(files, desc='Loading checkpoint shards'):
152
- state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
- # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
- # and also handle the case where it was already removed by another helper script
155
- state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
- state_dicts.append(state_dict)
157
-
158
- if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
- raise ValueError(f"{files[0]} is not a zero checkpoint")
160
- zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
- world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
-
163
- # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
- # parameters can be different from data parallelism for non-expert parameters. So we can just
165
- # use the max of the partition_count to get the dp world_size.
166
-
167
- if type(world_size) is list:
168
- world_size = max(world_size)
169
-
170
- if world_size != total_files:
171
- raise ValueError(
172
- f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
- "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
- )
175
-
176
- # the groups are named differently in each stage
177
- if zero_stage <= 2:
178
- fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
- elif zero_stage == 3:
180
- fp32_groups_key = FP32_FLAT_GROUPS
181
- else:
182
- raise ValueError(f"unknown zero stage {zero_stage}")
183
-
184
- fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
- return zero_stage, world_size, fp32_flat_groups
186
-
187
-
188
- def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
- """
190
- Returns fp32 state_dict reconstructed from ds checkpoint
191
-
192
- Args:
193
- - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
-
195
- """
196
- print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
-
198
- optim_files = get_optim_files(ds_checkpoint_dir)
199
- zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
- print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
-
202
- model_files = get_model_state_files(ds_checkpoint_dir)
203
-
204
- zero_model_states = parse_model_states(model_files)
205
- print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
-
207
- if zero_stage <= 2:
208
- return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
- exclude_frozen_parameters)
210
- elif zero_stage == 3:
211
- return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
- exclude_frozen_parameters)
213
-
214
-
215
- def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
- if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
- return
218
-
219
- frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
- frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
-
222
- if debug:
223
- num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
- print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
-
226
- wanted_params = len(frozen_param_shapes)
227
- wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
- avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
- print(f'Frozen params: Have {avail_numel} numels to process.')
230
- print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
-
232
- total_params = 0
233
- total_numel = 0
234
- for name, shape in frozen_param_shapes.items():
235
- total_params += 1
236
- unpartitioned_numel = shape.numel()
237
- total_numel += unpartitioned_numel
238
-
239
- state_dict[name] = frozen_param_fragments[name]
240
-
241
- if debug:
242
- print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
-
244
- print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
-
246
-
247
- def _has_callable(obj, fn):
248
- attr = getattr(obj, fn, None)
249
- return callable(attr)
250
-
251
-
252
- def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
- param_shapes = zero_model_states[0].param_shapes
254
-
255
- # Reconstruction protocol:
256
- #
257
- # XXX: document this
258
-
259
- if debug:
260
- for i in range(world_size):
261
- for j in range(len(fp32_flat_groups[0])):
262
- print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
-
264
- # XXX: memory usage doubles here (zero2)
265
- num_param_groups = len(fp32_flat_groups[0])
266
- merged_single_partition_of_fp32_groups = []
267
- for i in range(num_param_groups):
268
- merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
- full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
- merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
- avail_numel = sum(
272
- [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
-
274
- if debug:
275
- wanted_params = sum([len(shapes) for shapes in param_shapes])
276
- wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
- # not asserting if there is a mismatch due to possible padding
278
- print(f"Have {avail_numel} numels to process.")
279
- print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
-
281
- # params
282
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
- # out-of-core computing solution
284
- total_numel = 0
285
- total_params = 0
286
- for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
- offset = 0
288
- avail_numel = full_single_fp32_vector.numel()
289
- for name, shape in shapes.items():
290
-
291
- unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
- total_numel += unpartitioned_numel
293
- total_params += 1
294
-
295
- if debug:
296
- print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
- state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
- offset += unpartitioned_numel
299
-
300
- # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
- # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
- # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
- # live optimizer object, so we are checking that the numbers are within the right range
304
- align_to = 2 * world_size
305
-
306
- def zero2_align(x):
307
- return align_to * math.ceil(x / align_to)
308
-
309
- if debug:
310
- print(f"original offset={offset}, avail_numel={avail_numel}")
311
-
312
- offset = zero2_align(offset)
313
- avail_numel = zero2_align(avail_numel)
314
-
315
- if debug:
316
- print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
-
318
- # Sanity check
319
- if offset != avail_numel:
320
- raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
-
322
- print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
-
324
-
325
- def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
- exclude_frozen_parameters):
327
- state_dict = OrderedDict()
328
-
329
- # buffers
330
- buffers = zero_model_states[0].buffers
331
- state_dict.update(buffers)
332
- if debug:
333
- print(f"added {len(buffers)} buffers")
334
-
335
- if not exclude_frozen_parameters:
336
- _zero2_merge_frozen_params(state_dict, zero_model_states)
337
-
338
- _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
-
340
- # recover shared parameters
341
- for pair in zero_model_states[0].shared_params:
342
- if pair[1] in state_dict:
343
- state_dict[pair[0]] = state_dict[pair[1]]
344
-
345
- return state_dict
346
-
347
-
348
- def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
- remainder = unpartitioned_numel % world_size
350
- padding_numel = (world_size - remainder) if remainder else 0
351
- partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
- return partitioned_numel, padding_numel
353
-
354
-
355
- def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
- if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
- return
358
-
359
- if debug:
360
- for i in range(world_size):
361
- num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
- print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
-
364
- frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
- wanted_params = len(frozen_param_shapes)
366
- wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
- avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
- print(f'Frozen params: Have {avail_numel} numels to process.')
369
- print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
-
371
- total_params = 0
372
- total_numel = 0
373
- for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
- total_params += 1
375
- unpartitioned_numel = shape.numel()
376
- total_numel += unpartitioned_numel
377
-
378
- param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
- state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
-
381
- partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
-
383
- if debug:
384
- print(
385
- f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
- )
387
-
388
- print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
-
390
-
391
- class GatheredTensor:
392
- """
393
- A pseudo tensor that collects partitioned weights.
394
- It is more memory efficient when there are multiple groups.
395
- """
396
-
397
- def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
- self.flat_groups = flat_groups
399
- self.flat_groups_offset = flat_groups_offset
400
- self.offset = offset
401
- self.partitioned_numel = partitioned_numel
402
- self.shape = shape
403
- self.dtype = self.flat_groups[0][0].dtype
404
-
405
- def contiguous(self):
406
- """
407
- Merge partitioned weights from flat_groups into a single tensor.
408
- """
409
- end_idx = self.offset + self.partitioned_numel
410
- world_size = len(self.flat_groups)
411
- pad_flat_param_chunks = []
412
-
413
- for rank_i in range(world_size):
414
- # for each rank, we need to collect weights from related group/groups
415
- flat_groups_at_rank_i = self.flat_groups[rank_i]
416
- start_group_id = None
417
- end_group_id = None
418
- for group_id in range(len(self.flat_groups_offset)):
419
- if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
- start_group_id = group_id
421
- if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
- end_group_id = group_id
423
- break
424
- # collect weights from related group/groups
425
- for group_id in range(start_group_id, end_group_id + 1):
426
- flat_tensor = flat_groups_at_rank_i[group_id]
427
- start_offset = self.offset - self.flat_groups_offset[group_id]
428
- end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
- pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
-
431
- # collect weights from all ranks
432
- pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
- param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
- return param
435
-
436
-
437
- def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
- param_shapes = zero_model_states[0].param_shapes
439
- avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
-
441
- # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
442
- # param, re-consolidating each param, while dealing with padding if any
443
-
444
- # merge list of dicts, preserving order
445
- param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
-
447
- if debug:
448
- for i in range(world_size):
449
- print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
-
451
- wanted_params = len(param_shapes)
452
- wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
- # not asserting if there is a mismatch due to possible padding
454
- avail_numel = fp32_flat_groups[0].numel() * world_size
455
- print(f"Trainable params: Have {avail_numel} numels to process.")
456
- print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
-
458
- # params
459
- # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
- # out-of-core computing solution
461
- offset = 0
462
- total_numel = 0
463
- total_params = 0
464
- flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
- for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
- unpartitioned_numel = shape.numel()
467
- total_numel += unpartitioned_numel
468
- total_params += 1
469
- partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
-
471
- if debug:
472
- print(
473
- f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
- )
475
-
476
- # memory efficient tensor
477
- tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
- state_dict[name] = tensor
479
- offset += partitioned_numel
480
-
481
- offset *= world_size
482
-
483
- # Sanity check
484
- if offset != avail_numel:
485
- raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
-
487
- print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
-
489
-
490
- def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
- exclude_frozen_parameters):
492
- state_dict = OrderedDict()
493
-
494
- # buffers
495
- buffers = zero_model_states[0].buffers
496
- state_dict.update(buffers)
497
- if debug:
498
- print(f"added {len(buffers)} buffers")
499
-
500
- if not exclude_frozen_parameters:
501
- _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
-
503
- _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
-
505
- # recover shared parameters
506
- for pair in zero_model_states[0].shared_params:
507
- if pair[1] in state_dict:
508
- state_dict[pair[0]] = state_dict[pair[1]]
509
-
510
- return state_dict
511
-
512
-
513
- def to_torch_tensor(state_dict, return_empty_tensor=False):
514
- """
515
- Convert state_dict of GatheredTensor to torch tensor
516
- """
517
- torch_state_dict = {}
518
- converted_tensors = {}
519
- for name, tensor in state_dict.items():
520
- tensor_id = id(tensor)
521
- if tensor_id in converted_tensors: # shared tensors
522
- shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
- torch_state_dict[name] = shared_tensor
524
- else:
525
- converted_tensors[tensor_id] = name
526
- if return_empty_tensor:
527
- torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
- else:
529
- torch_state_dict[name] = tensor.contiguous()
530
- return torch_state_dict
531
-
532
-
533
- def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
- tag=None,
535
- exclude_frozen_parameters=False,
536
- lazy_mode=False):
537
- """
538
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
- ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
- via a model hub.
541
-
542
- Args:
543
- - ``checkpoint_dir``: path to the desired checkpoint folder
544
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
545
- - ``exclude_frozen_parameters``: exclude frozen parameters
546
- - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
547
- Convert the pesduo tensor to torch tensor by ``.contiguous()``
548
-
549
- Returns:
550
- - pytorch ``state_dict``
551
-
552
- A typical usage might be ::
553
-
554
- from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
- # do the training and checkpoint saving
556
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
- model = model.cpu() # move to cpu
558
- model.load_state_dict(state_dict)
559
- # submit to model hub or save the model to share with others
560
-
561
- In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
- application. i.e. you will need to re-initialize the deepspeed engine, since
563
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
-
565
- If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
-
567
- Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
- You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
- the checkpoint. Or you can load state_dict in lazy mode ::
570
-
571
- from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
- for name, lazy_tensor in state_dict.item():
574
- tensor = lazy_tensor.contiguous() # to cpu
575
- print(name, tensor)
576
- # del tensor to release memory if it no longer in use
577
- """
578
- if tag is None:
579
- latest_path = os.path.join(checkpoint_dir, 'latest')
580
- if os.path.isfile(latest_path):
581
- with open(latest_path, 'r') as fd:
582
- tag = fd.read().strip()
583
- else:
584
- raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
-
586
- ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
-
588
- if not os.path.isdir(ds_checkpoint_dir):
589
- raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
-
591
- state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
- if lazy_mode:
593
- return state_dict
594
- else:
595
- return to_torch_tensor(state_dict)
596
-
597
-
598
- def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
- output_dir,
600
- max_shard_size="5GB",
601
- safe_serialization=False,
602
- tag=None,
603
- exclude_frozen_parameters=False):
604
- """
605
- Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
- loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
-
608
- Args:
609
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
- - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
- - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
- - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
- - ``exclude_frozen_parameters``: exclude frozen parameters
615
- """
616
-
617
- # Dependency pre-check
618
- if safe_serialization:
619
- try:
620
- from safetensors.torch import save_file
621
- except ImportError:
622
- print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
- raise
624
- if max_shard_size is not None:
625
- try:
626
- from huggingface_hub import split_torch_state_dict_into_shards
627
- except ImportError:
628
- print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
- raise
630
-
631
- # Convert zero checkpoint to state_dict
632
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
- tag,
634
- exclude_frozen_parameters,
635
- lazy_mode=True)
636
-
637
- # Shard the model if it is too big.
638
- weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
- if max_shard_size is not None:
640
- filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
- # an memory-efficient approach for sharding
642
- empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
- state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
- filename_pattern=filename_pattern,
645
- max_shard_size=max_shard_size)
646
- else:
647
- from collections import namedtuple
648
- StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
- state_dict_split = StateDictSplit(is_sharded=False,
650
- filename_to_tensors={weights_name: list(state_dict.keys())})
651
-
652
- # Save the model by shard
653
- os.makedirs(output_dir, exist_ok=True)
654
- filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
- for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
- shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
- shard_state_dict = to_torch_tensor(shard_state_dict)
658
- output_path = os.path.join(output_dir, shard_file)
659
- if safe_serialization:
660
- save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
- else:
662
- torch.save(shard_state_dict, output_path)
663
- # release the memory of current shard
664
- for tensor_name in list(shard_state_dict.keys()):
665
- del state_dict[tensor_name]
666
- del shard_state_dict[tensor_name]
667
- del shard_state_dict
668
- gc.collect()
669
-
670
- # Save index if sharded
671
- if state_dict_split.is_sharded:
672
- index = {
673
- "metadata": state_dict_split.metadata,
674
- "weight_map": state_dict_split.tensor_to_filename,
675
- }
676
- save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
- save_index_file = os.path.join(output_dir, save_index_file)
678
- with open(save_index_file, "w", encoding="utf-8") as f:
679
- content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
- f.write(content)
681
-
682
-
683
- def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
- """
685
- 1. Put the provided model to cpu
686
- 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
- 3. Load it into the provided model
688
-
689
- Args:
690
- - ``model``: the model object to update
691
- - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
- - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
-
694
- Returns:
695
- - ``model`: modified model
696
-
697
- Make sure you have plenty of CPU memory available before you call this function. If you don't
698
- have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
- conveniently placed for you in the checkpoint folder.
700
-
701
- A typical usage might be ::
702
-
703
- from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
- model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
- # submit to model hub or save the model to share with others
706
-
707
- Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
708
- of the same application. i.e. you will need to re-initialize the deepspeed engine, since
709
- ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
-
711
- """
712
- logger.info(f"Extracting fp32 weights")
713
- state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
-
715
- logger.info(f"Overwriting model with fp32 weights")
716
- model = model.cpu()
717
- model.load_state_dict(state_dict, strict=False)
718
-
719
- return model
720
-
721
-
722
- if __name__ == "__main__":
723
- parser = argparse.ArgumentParser()
724
- parser.add_argument("checkpoint_dir",
725
- type=str,
726
- help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
- parser.add_argument("output_dir",
728
- type=str,
729
- help="directory to the pytorch fp32 state_dict output files"
730
- "(e.g. path/checkpoint-12-output/)")
731
- parser.add_argument(
732
- "--max_shard_size",
733
- type=str,
734
- default="5GB",
735
- help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
- "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
- "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
- "without CPU OOM issues.")
739
- parser.add_argument(
740
- "--safe_serialization",
741
- default=False,
742
- action='store_true',
743
- help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
- parser.add_argument("-t",
745
- "--tag",
746
- type=str,
747
- default=None,
748
- help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
- parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
- parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
- args = parser.parse_args()
752
-
753
- debug = args.debug
754
-
755
- convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
- args.output_dir,
757
- max_shard_size=args.max_shard_size,
758
- safe_serialization=args.safe_serialization,
759
- tag=args.tag,
760
- exclude_frozen_parameters=args.exclude_frozen_parameters)