saim1212 committed on
Commit
698c508
·
verified ·
1 Parent(s): f75909c
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: Qwen/Qwen2-VL-2B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: qwen2vl_lora_16lr_7b
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # qwen2vl_lora_16lr_7b
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2-VL-2B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) on the talk2car dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 2e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 2
44
+ - total_train_batch_size: 2
45
+ - total_eval_batch_size: 16
46
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_ratio: 0.1
49
+ - num_epochs: 10.0
50
+ - mixed_precision_training: Native AMP
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - PEFT 0.12.0
59
+ - Transformers 4.49.0
60
+ - Pytorch 2.5.1+cu121
61
+ - Datasets 3.2.0
62
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 64,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "visual.blocks.29.attn.proj",
24
+ "visual.blocks.6.attn.qkv",
25
+ "visual.blocks.11.mlp.fc1",
26
+ "visual.blocks.23.mlp.fc2",
27
+ "visual.blocks.7.attn.proj",
28
+ "visual.blocks.16.attn.qkv",
29
+ "visual.blocks.20.mlp.fc2",
30
+ "visual.blocks.27.mlp.fc2",
31
+ "visual.blocks.4.mlp.fc2",
32
+ "visual.blocks.21.mlp.fc2",
33
+ "visual.blocks.7.attn.qkv",
34
+ "visual.blocks.5.mlp.fc2",
35
+ "visual.blocks.0.mlp.fc1",
36
+ "visual.blocks.3.mlp.fc2",
37
+ "visual.blocks.27.attn.proj",
38
+ "visual.blocks.0.attn.qkv",
39
+ "visual.blocks.13.mlp.fc2",
40
+ "visual.blocks.5.attn.qkv",
41
+ "visual.blocks.15.mlp.fc1",
42
+ "visual.blocks.15.mlp.fc2",
43
+ "visual.blocks.2.attn.qkv",
44
+ "visual.blocks.8.attn.qkv",
45
+ "visual.blocks.5.mlp.fc1",
46
+ "visual.blocks.7.mlp.fc2",
47
+ "visual.blocks.20.attn.proj",
48
+ "visual.blocks.24.mlp.fc2",
49
+ "visual.blocks.2.mlp.fc1",
50
+ "visual.blocks.30.mlp.fc2",
51
+ "visual.blocks.14.mlp.fc2",
52
+ "visual.blocks.3.mlp.fc1",
53
+ "visual.blocks.24.attn.qkv",
54
+ "visual.blocks.17.attn.qkv",
55
+ "visual.blocks.21.attn.proj",
56
+ "visual.blocks.19.attn.qkv",
57
+ "visual.blocks.16.attn.proj",
58
+ "visual.blocks.26.mlp.fc1",
59
+ "visual.blocks.26.mlp.fc2",
60
+ "visual.blocks.27.mlp.fc1",
61
+ "visual.blocks.3.attn.qkv",
62
+ "visual.blocks.22.mlp.fc2",
63
+ "visual.blocks.0.attn.proj",
64
+ "visual.blocks.10.mlp.fc1",
65
+ "visual.blocks.29.attn.qkv",
66
+ "visual.blocks.4.attn.proj",
67
+ "visual.blocks.23.mlp.fc1",
68
+ "visual.blocks.27.attn.qkv",
69
+ "visual.blocks.31.mlp.fc1",
70
+ "visual.blocks.19.mlp.fc2",
71
+ "visual.blocks.14.mlp.fc1",
72
+ "visual.blocks.18.mlp.fc1",
73
+ "visual.blocks.17.mlp.fc2",
74
+ "visual.blocks.23.attn.proj",
75
+ "visual.blocks.12.mlp.fc1",
76
+ "visual.blocks.24.mlp.fc1",
77
+ "visual.blocks.14.attn.qkv",
78
+ "visual.blocks.19.attn.proj",
79
+ "visual.blocks.6.mlp.fc1",
80
+ "visual.blocks.9.mlp.fc2",
81
+ "visual.blocks.8.mlp.fc1",
82
+ "visual.blocks.31.attn.qkv",
83
+ "visual.blocks.11.attn.proj",
84
+ "visual.blocks.0.mlp.fc2",
85
+ "visual.blocks.10.attn.proj",
86
+ "visual.blocks.1.mlp.fc2",
87
+ "visual.blocks.17.attn.proj",
88
+ "visual.blocks.12.mlp.fc2",
89
+ "visual.blocks.25.mlp.fc2",
90
+ "visual.blocks.6.attn.proj",
91
+ "visual.blocks.8.attn.proj",
92
+ "visual.blocks.16.mlp.fc2",
93
+ "visual.blocks.25.mlp.fc1",
94
+ "visual.blocks.22.attn.qkv",
95
+ "visual.blocks.15.attn.proj",
96
+ "visual.blocks.28.mlp.fc1",
97
+ "visual.blocks.21.attn.qkv",
98
+ "visual.blocks.19.mlp.fc1",
99
+ "visual.blocks.14.attn.proj",
100
+ "visual.blocks.26.attn.proj",
101
+ "visual.blocks.2.attn.proj",
102
+ "visual.blocks.29.mlp.fc1",
103
+ "visual.blocks.2.mlp.fc2",
104
+ "visual.blocks.18.attn.proj",
105
+ "visual.blocks.11.attn.qkv",
106
+ "visual.blocks.1.mlp.fc1",
107
+ "visual.blocks.1.attn.qkv",
108
+ "visual.blocks.9.attn.proj",
109
+ "visual.blocks.20.attn.qkv",
110
+ "visual.blocks.25.attn.qkv",
111
+ "visual.blocks.29.mlp.fc2",
112
+ "visual.blocks.9.attn.qkv",
113
+ "visual.blocks.24.attn.proj",
114
+ "visual.blocks.28.attn.qkv",
115
+ "visual.blocks.30.mlp.fc1",
116
+ "visual.blocks.31.mlp.fc2",
117
+ "visual.blocks.7.mlp.fc1",
118
+ "visual.blocks.26.attn.qkv",
119
+ "visual.blocks.18.mlp.fc2",
120
+ "visual.blocks.9.mlp.fc1",
121
+ "visual.blocks.8.mlp.fc2",
122
+ "visual.blocks.5.attn.proj",
123
+ "visual.blocks.30.attn.proj",
124
+ "visual.blocks.10.attn.qkv",
125
+ "visual.blocks.23.attn.qkv",
126
+ "visual.blocks.22.attn.proj",
127
+ "visual.blocks.15.attn.qkv",
128
+ "visual.blocks.28.attn.proj",
129
+ "visual.blocks.1.attn.proj",
130
+ "visual.blocks.25.attn.proj",
131
+ "visual.blocks.13.mlp.fc1",
132
+ "visual.blocks.31.attn.proj",
133
+ "visual.blocks.4.attn.qkv",
134
+ "visual.blocks.4.mlp.fc1",
135
+ "visual.blocks.28.mlp.fc2",
136
+ "visual.blocks.17.mlp.fc1",
137
+ "visual.blocks.30.attn.qkv",
138
+ "visual.blocks.12.attn.proj",
139
+ "visual.blocks.13.attn.proj",
140
+ "visual.blocks.10.mlp.fc2",
141
+ "visual.blocks.13.attn.qkv",
142
+ "visual.blocks.21.mlp.fc1",
143
+ "visual.blocks.3.attn.proj",
144
+ "visual.blocks.11.mlp.fc2",
145
+ "visual.blocks.16.mlp.fc1",
146
+ "visual.blocks.18.attn.qkv",
147
+ "visual.blocks.20.mlp.fc1",
148
+ "visual.blocks.6.mlp.fc2",
149
+ "visual.blocks.22.mlp.fc1",
150
+ "visual.blocks.12.attn.qkv"
151
+ ],
152
+ "task_type": "CAUSAL_LM",
153
+ "use_dora": false,
154
+ "use_rslora": false
155
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e545f8b565a04fdba0c232e26ad3e3c57a57fc3874b9fadfe9cc242f4ce9fc
3
+ size 83918672
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.1899956660640154e+17,
4
+ "train_loss": 0.49240388979911803,
5
+ "train_runtime": 35927.6647,
6
+ "train_samples_per_second": 0.278,
7
+ "train_steps_per_second": 0.139
8
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
checkpoint-5000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2-VL-2B-Instruct
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-5000/adapter_config.json ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 64,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "visual.blocks.29.attn.proj",
24
+ "visual.blocks.6.attn.qkv",
25
+ "visual.blocks.11.mlp.fc1",
26
+ "visual.blocks.23.mlp.fc2",
27
+ "visual.blocks.7.attn.proj",
28
+ "visual.blocks.16.attn.qkv",
29
+ "visual.blocks.20.mlp.fc2",
30
+ "visual.blocks.27.mlp.fc2",
31
+ "visual.blocks.4.mlp.fc2",
32
+ "visual.blocks.21.mlp.fc2",
33
+ "visual.blocks.7.attn.qkv",
34
+ "visual.blocks.5.mlp.fc2",
35
+ "visual.blocks.0.mlp.fc1",
36
+ "visual.blocks.3.mlp.fc2",
37
+ "visual.blocks.27.attn.proj",
38
+ "visual.blocks.0.attn.qkv",
39
+ "visual.blocks.13.mlp.fc2",
40
+ "visual.blocks.5.attn.qkv",
41
+ "visual.blocks.15.mlp.fc1",
42
+ "visual.blocks.15.mlp.fc2",
43
+ "visual.blocks.2.attn.qkv",
44
+ "visual.blocks.8.attn.qkv",
45
+ "visual.blocks.5.mlp.fc1",
46
+ "visual.blocks.7.mlp.fc2",
47
+ "visual.blocks.20.attn.proj",
48
+ "visual.blocks.24.mlp.fc2",
49
+ "visual.blocks.2.mlp.fc1",
50
+ "visual.blocks.30.mlp.fc2",
51
+ "visual.blocks.14.mlp.fc2",
52
+ "visual.blocks.3.mlp.fc1",
53
+ "visual.blocks.24.attn.qkv",
54
+ "visual.blocks.17.attn.qkv",
55
+ "visual.blocks.21.attn.proj",
56
+ "visual.blocks.19.attn.qkv",
57
+ "visual.blocks.16.attn.proj",
58
+ "visual.blocks.26.mlp.fc1",
59
+ "visual.blocks.26.mlp.fc2",
60
+ "visual.blocks.27.mlp.fc1",
61
+ "visual.blocks.3.attn.qkv",
62
+ "visual.blocks.22.mlp.fc2",
63
+ "visual.blocks.0.attn.proj",
64
+ "visual.blocks.10.mlp.fc1",
65
+ "visual.blocks.29.attn.qkv",
66
+ "visual.blocks.4.attn.proj",
67
+ "visual.blocks.23.mlp.fc1",
68
+ "visual.blocks.27.attn.qkv",
69
+ "visual.blocks.31.mlp.fc1",
70
+ "visual.blocks.19.mlp.fc2",
71
+ "visual.blocks.14.mlp.fc1",
72
+ "visual.blocks.18.mlp.fc1",
73
+ "visual.blocks.17.mlp.fc2",
74
+ "visual.blocks.23.attn.proj",
75
+ "visual.blocks.12.mlp.fc1",
76
+ "visual.blocks.24.mlp.fc1",
77
+ "visual.blocks.14.attn.qkv",
78
+ "visual.blocks.19.attn.proj",
79
+ "visual.blocks.6.mlp.fc1",
80
+ "visual.blocks.9.mlp.fc2",
81
+ "visual.blocks.8.mlp.fc1",
82
+ "visual.blocks.31.attn.qkv",
83
+ "visual.blocks.11.attn.proj",
84
+ "visual.blocks.0.mlp.fc2",
85
+ "visual.blocks.10.attn.proj",
86
+ "visual.blocks.1.mlp.fc2",
87
+ "visual.blocks.17.attn.proj",
88
+ "visual.blocks.12.mlp.fc2",
89
+ "visual.blocks.25.mlp.fc2",
90
+ "visual.blocks.6.attn.proj",
91
+ "visual.blocks.8.attn.proj",
92
+ "visual.blocks.16.mlp.fc2",
93
+ "visual.blocks.25.mlp.fc1",
94
+ "visual.blocks.22.attn.qkv",
95
+ "visual.blocks.15.attn.proj",
96
+ "visual.blocks.28.mlp.fc1",
97
+ "visual.blocks.21.attn.qkv",
98
+ "visual.blocks.19.mlp.fc1",
99
+ "visual.blocks.14.attn.proj",
100
+ "visual.blocks.26.attn.proj",
101
+ "visual.blocks.2.attn.proj",
102
+ "visual.blocks.29.mlp.fc1",
103
+ "visual.blocks.2.mlp.fc2",
104
+ "visual.blocks.18.attn.proj",
105
+ "visual.blocks.11.attn.qkv",
106
+ "visual.blocks.1.mlp.fc1",
107
+ "visual.blocks.1.attn.qkv",
108
+ "visual.blocks.9.attn.proj",
109
+ "visual.blocks.20.attn.qkv",
110
+ "visual.blocks.25.attn.qkv",
111
+ "visual.blocks.29.mlp.fc2",
112
+ "visual.blocks.9.attn.qkv",
113
+ "visual.blocks.24.attn.proj",
114
+ "visual.blocks.28.attn.qkv",
115
+ "visual.blocks.30.mlp.fc1",
116
+ "visual.blocks.31.mlp.fc2",
117
+ "visual.blocks.7.mlp.fc1",
118
+ "visual.blocks.26.attn.qkv",
119
+ "visual.blocks.18.mlp.fc2",
120
+ "visual.blocks.9.mlp.fc1",
121
+ "visual.blocks.8.mlp.fc2",
122
+ "visual.blocks.5.attn.proj",
123
+ "visual.blocks.30.attn.proj",
124
+ "visual.blocks.10.attn.qkv",
125
+ "visual.blocks.23.attn.qkv",
126
+ "visual.blocks.22.attn.proj",
127
+ "visual.blocks.15.attn.qkv",
128
+ "visual.blocks.28.attn.proj",
129
+ "visual.blocks.1.attn.proj",
130
+ "visual.blocks.25.attn.proj",
131
+ "visual.blocks.13.mlp.fc1",
132
+ "visual.blocks.31.attn.proj",
133
+ "visual.blocks.4.attn.qkv",
134
+ "visual.blocks.4.mlp.fc1",
135
+ "visual.blocks.28.mlp.fc2",
136
+ "visual.blocks.17.mlp.fc1",
137
+ "visual.blocks.30.attn.qkv",
138
+ "visual.blocks.12.attn.proj",
139
+ "visual.blocks.13.attn.proj",
140
+ "visual.blocks.10.mlp.fc2",
141
+ "visual.blocks.13.attn.qkv",
142
+ "visual.blocks.21.mlp.fc1",
143
+ "visual.blocks.3.attn.proj",
144
+ "visual.blocks.11.mlp.fc2",
145
+ "visual.blocks.16.mlp.fc1",
146
+ "visual.blocks.18.attn.qkv",
147
+ "visual.blocks.20.mlp.fc1",
148
+ "visual.blocks.6.mlp.fc2",
149
+ "visual.blocks.22.mlp.fc1",
150
+ "visual.blocks.12.attn.qkv"
151
+ ],
152
+ "task_type": "CAUSAL_LM",
153
+ "use_dora": false,
154
+ "use_rslora": false
155
+ }
checkpoint-5000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9e545f8b565a04fdba0c232e26ad3e3c57a57fc3874b9fadfe9cc242f4ce9fc
3
+ size 83918672
checkpoint-5000/added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
checkpoint-5000/chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"
3
+ }
checkpoint-5000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acfd2cb152766ae1b47005cc3013f76ead4ee361d6361cbca1241ebbe50e6c54
3
+ size 167987578
checkpoint-5000/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
checkpoint-5000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b074bf97f241c2662caa5ce956b03d1249c3cc0713b6aef7796673362754f98
3
+ size 14512
checkpoint-5000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58aed9e8d78903cb12015375021c729c3f6c5fd1a1e19e7aee6ddde57c3310b9
3
+ size 14512
checkpoint-5000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40aaa2594ab145486b05e674a81d8cb031bfe9f8a7f456bf6c9140449829730a
3
+ size 988
checkpoint-5000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dceffcf59c5cc9da5174022a29c9b8d5a910aa52b2cca3b9795d4da094253039
3
+ size 1064
checkpoint-5000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-5000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
3
+ size 11420371
checkpoint-5000/tokenizer_config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
134
+ "clean_up_tokenization_spaces": false,
135
+ "eos_token": "<|im_end|>",
136
+ "errors": "replace",
137
+ "extra_special_tokens": {},
138
+ "model_max_length": 2048,
139
+ "pad_token": "<|endoftext|>",
140
+ "padding_side": "right",
141
+ "processor_class": "Qwen2VLProcessor",
142
+ "split_special_tokens": false,
143
+ "tokenizer_class": "Qwen2Tokenizer",
144
+ "unk_token": null
145
+ }
checkpoint-5000/trainer_state.json ADDED
@@ -0,0 +1,3533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": NaN,
14
+ "learning_rate": 2.4000000000000003e-07,
15
+ "loss": 1.3992,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.04,
20
+ "grad_norm": 1.1072826385498047,
21
+ "learning_rate": 6.000000000000001e-07,
22
+ "loss": 1.4497,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.06,
27
+ "grad_norm": 3.9411494731903076,
28
+ "learning_rate": 1.0000000000000002e-06,
29
+ "loss": 1.2599,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.08,
34
+ "grad_norm": 3.928187847137451,
35
+ "learning_rate": 1.4000000000000001e-06,
36
+ "loss": 1.4124,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.1,
41
+ "grad_norm": 0.6260775923728943,
42
+ "learning_rate": 1.8000000000000001e-06,
43
+ "loss": 1.3542,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "grad_norm": 1.5522129535675049,
49
+ "learning_rate": 2.2e-06,
50
+ "loss": 1.2123,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.14,
55
+ "grad_norm": 1.6981067657470703,
56
+ "learning_rate": 2.6e-06,
57
+ "loss": 1.2098,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.16,
62
+ "grad_norm": 5.6391496658325195,
63
+ "learning_rate": 3e-06,
64
+ "loss": 1.2226,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.18,
69
+ "grad_norm": 0.5151563286781311,
70
+ "learning_rate": 3.3600000000000004e-06,
71
+ "loss": 1.0624,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.2,
76
+ "grad_norm": 1.4428874254226685,
77
+ "learning_rate": 3.7600000000000004e-06,
78
+ "loss": 1.0969,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.22,
83
+ "grad_norm": 2.130734920501709,
84
+ "learning_rate": 4.16e-06,
85
+ "loss": 1.0879,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.24,
90
+ "grad_norm": 2.011439561843872,
91
+ "learning_rate": 4.56e-06,
92
+ "loss": 1.0195,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.26,
97
+ "grad_norm": 2.217247486114502,
98
+ "learning_rate": 4.960000000000001e-06,
99
+ "loss": 0.9765,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.28,
104
+ "grad_norm": 0.946567952632904,
105
+ "learning_rate": 5.36e-06,
106
+ "loss": 1.0205,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.3,
111
+ "grad_norm": 6.922210693359375,
112
+ "learning_rate": 5.76e-06,
113
+ "loss": 0.9517,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.32,
118
+ "grad_norm": 3.384115219116211,
119
+ "learning_rate": 6.16e-06,
120
+ "loss": 0.9324,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.34,
125
+ "grad_norm": 1.7061117887496948,
126
+ "learning_rate": 6.560000000000001e-06,
127
+ "loss": 0.804,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.36,
132
+ "grad_norm": 1.5616205930709839,
133
+ "learning_rate": 6.96e-06,
134
+ "loss": 0.7821,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.38,
139
+ "grad_norm": 1.8393518924713135,
140
+ "learning_rate": 7.360000000000001e-06,
141
+ "loss": 0.8086,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.4,
146
+ "grad_norm": 1.4879248142242432,
147
+ "learning_rate": 7.76e-06,
148
+ "loss": 0.7655,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.42,
153
+ "grad_norm": 0.6370295882225037,
154
+ "learning_rate": 8.16e-06,
155
+ "loss": 0.7508,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.44,
160
+ "grad_norm": 0.5269752144813538,
161
+ "learning_rate": 8.560000000000001e-06,
162
+ "loss": 0.7429,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.46,
167
+ "grad_norm": 1.2695356607437134,
168
+ "learning_rate": 8.96e-06,
169
+ "loss": 0.7502,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.48,
174
+ "grad_norm": 1.5492205619812012,
175
+ "learning_rate": 9.360000000000002e-06,
176
+ "loss": 0.7029,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.5,
181
+ "grad_norm": 1.7380893230438232,
182
+ "learning_rate": 9.760000000000001e-06,
183
+ "loss": 0.7324,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.52,
188
+ "grad_norm": 1.8911452293395996,
189
+ "learning_rate": 1.0160000000000001e-05,
190
+ "loss": 0.7521,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.54,
195
+ "grad_norm": 2.0408151149749756,
196
+ "learning_rate": 1.056e-05,
197
+ "loss": 0.698,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.56,
202
+ "grad_norm": 1.9015631675720215,
203
+ "learning_rate": 1.0960000000000002e-05,
204
+ "loss": 0.6859,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.58,
209
+ "grad_norm": 1.5284056663513184,
210
+ "learning_rate": 1.136e-05,
211
+ "loss": 0.6716,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.6,
216
+ "grad_norm": 1.3547126054763794,
217
+ "learning_rate": 1.1760000000000001e-05,
218
+ "loss": 0.6978,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.62,
223
+ "grad_norm": 1.4986441135406494,
224
+ "learning_rate": 1.216e-05,
225
+ "loss": 0.6584,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.64,
230
+ "grad_norm": 1.1250969171524048,
231
+ "learning_rate": 1.2560000000000002e-05,
232
+ "loss": 0.7188,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.66,
237
+ "grad_norm": 1.1186408996582031,
238
+ "learning_rate": 1.2960000000000001e-05,
239
+ "loss": 0.6687,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.68,
244
+ "grad_norm": 1.1250578165054321,
245
+ "learning_rate": 1.3360000000000003e-05,
246
+ "loss": 0.623,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.7,
251
+ "grad_norm": 0.8931149840354919,
252
+ "learning_rate": 1.376e-05,
253
+ "loss": 0.6795,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.72,
258
+ "grad_norm": 1.5052251815795898,
259
+ "learning_rate": 1.416e-05,
260
+ "loss": 0.6455,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.74,
265
+ "grad_norm": 1.8043763637542725,
266
+ "learning_rate": 1.4560000000000001e-05,
267
+ "loss": 0.6548,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.76,
272
+ "grad_norm": 1.7357759475708008,
273
+ "learning_rate": 1.496e-05,
274
+ "loss": 0.6508,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.78,
279
+ "grad_norm": 1.4080872535705566,
280
+ "learning_rate": 1.5360000000000002e-05,
281
+ "loss": 0.7105,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.8,
286
+ "grad_norm": 0.7395206689834595,
287
+ "learning_rate": 1.576e-05,
288
+ "loss": 0.6738,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.82,
293
+ "grad_norm": 1.4803540706634521,
294
+ "learning_rate": 1.616e-05,
295
+ "loss": 0.6741,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.84,
300
+ "grad_norm": 0.9231945276260376,
301
+ "learning_rate": 1.656e-05,
302
+ "loss": 0.6385,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.86,
307
+ "grad_norm": 1.0000849962234497,
308
+ "learning_rate": 1.696e-05,
309
+ "loss": 0.6304,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.88,
314
+ "grad_norm": 1.8198318481445312,
315
+ "learning_rate": 1.736e-05,
316
+ "loss": 0.652,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.9,
321
+ "grad_norm": 0.8213591575622559,
322
+ "learning_rate": 1.7760000000000003e-05,
323
+ "loss": 0.6517,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.92,
328
+ "grad_norm": 2.0881271362304688,
329
+ "learning_rate": 1.8160000000000002e-05,
330
+ "loss": 0.7044,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.94,
335
+ "grad_norm": 2.583738088607788,
336
+ "learning_rate": 1.8560000000000002e-05,
337
+ "loss": 0.6801,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.96,
342
+ "grad_norm": 2.3651039600372314,
343
+ "learning_rate": 1.896e-05,
344
+ "loss": 0.6531,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.98,
349
+ "grad_norm": 1.174816608428955,
350
+ "learning_rate": 1.936e-05,
351
+ "loss": 0.6837,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 1.0,
356
+ "grad_norm": 1.93290376663208,
357
+ "learning_rate": 1.976e-05,
358
+ "loss": 0.6659,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 1.02,
363
+ "grad_norm": 1.616199254989624,
364
+ "learning_rate": 1.9999961008995607e-05,
365
+ "loss": 0.6212,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 1.04,
370
+ "grad_norm": 1.4221971035003662,
371
+ "learning_rate": 1.99995223636881e-05,
372
+ "loss": 0.6705,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 1.06,
377
+ "grad_norm": 1.4205855131149292,
378
+ "learning_rate": 1.9998596355767805e-05,
379
+ "loss": 0.6346,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 1.08,
384
+ "grad_norm": 1.5640236139297485,
385
+ "learning_rate": 1.999718303036705e-05,
386
+ "loss": 0.6698,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 1.1,
391
+ "grad_norm": 1.3953174352645874,
392
+ "learning_rate": 1.9995282456369313e-05,
393
+ "loss": 0.5925,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 1.12,
398
+ "grad_norm": 1.3201889991760254,
399
+ "learning_rate": 1.9992894726405894e-05,
400
+ "loss": 0.6795,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 1.1400000000000001,
405
+ "grad_norm": 1.8795799016952515,
406
+ "learning_rate": 1.9990019956851384e-05,
407
+ "loss": 0.6096,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 1.16,
412
+ "grad_norm": 1.9675489664077759,
413
+ "learning_rate": 1.998665828781799e-05,
414
+ "loss": 0.5971,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 1.18,
419
+ "grad_norm": 1.0514065027236938,
420
+ "learning_rate": 1.998280988314872e-05,
421
+ "loss": 0.6055,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 1.2,
426
+ "grad_norm": 0.7312430143356323,
427
+ "learning_rate": 1.9978474930409396e-05,
428
+ "loss": 0.6304,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 1.22,
433
+ "grad_norm": 11.12009048461914,
434
+ "learning_rate": 1.9973653640879486e-05,
435
+ "loss": 0.6812,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 1.24,
440
+ "grad_norm": 2.484487771987915,
441
+ "learning_rate": 1.9968346249541848e-05,
442
+ "loss": 0.5842,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 1.26,
447
+ "grad_norm": 1.4185147285461426,
448
+ "learning_rate": 1.996255301507125e-05,
449
+ "loss": 0.6478,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 1.28,
454
+ "grad_norm": 0.939939022064209,
455
+ "learning_rate": 1.995627421982176e-05,
456
+ "loss": 0.6003,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 1.3,
461
+ "grad_norm": 2.303175687789917,
462
+ "learning_rate": 1.9949510169813006e-05,
463
+ "loss": 0.6087,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 1.32,
468
+ "grad_norm": 1.1472731828689575,
469
+ "learning_rate": 1.9942261194715236e-05,
470
+ "loss": 0.5905,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 1.34,
475
+ "grad_norm": 0.8142107129096985,
476
+ "learning_rate": 1.9934527647833276e-05,
477
+ "loss": 0.593,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 1.3599999999999999,
482
+ "grad_norm": 1.4084548950195312,
483
+ "learning_rate": 1.992630990608929e-05,
484
+ "loss": 0.6253,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 1.38,
489
+ "grad_norm": 1.6054160594940186,
490
+ "learning_rate": 1.9917608370004417e-05,
491
+ "loss": 0.6338,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 1.4,
496
+ "grad_norm": 2.442007541656494,
497
+ "learning_rate": 1.9908423463679246e-05,
498
+ "loss": 0.6148,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 1.42,
503
+ "grad_norm": 0.7885130047798157,
504
+ "learning_rate": 1.989875563477316e-05,
505
+ "loss": 0.6271,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 1.44,
510
+ "grad_norm": 2.8302054405212402,
511
+ "learning_rate": 1.9888605354482494e-05,
512
+ "loss": 0.638,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 1.46,
517
+ "grad_norm": 1.134475827217102,
518
+ "learning_rate": 1.987797311751759e-05,
519
+ "loss": 0.6304,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 1.48,
524
+ "grad_norm": 0.7951676249504089,
525
+ "learning_rate": 1.986685944207868e-05,
526
+ "loss": 0.5877,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 1.5,
531
+ "grad_norm": 1.5261722803115845,
532
+ "learning_rate": 1.985526486983063e-05,
533
+ "loss": 0.5747,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 1.52,
538
+ "grad_norm": 4.633131980895996,
539
+ "learning_rate": 1.9843189965876525e-05,
540
+ "loss": 0.6514,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 1.54,
545
+ "grad_norm": 0.7688568830490112,
546
+ "learning_rate": 1.9830635318730155e-05,
547
+ "loss": 0.5879,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 1.56,
552
+ "grad_norm": 0.859425961971283,
553
+ "learning_rate": 1.981760154028731e-05,
554
+ "loss": 0.6152,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 1.58,
559
+ "grad_norm": 1.939563274383545,
560
+ "learning_rate": 1.980408926579596e-05,
561
+ "loss": 0.6342,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 1.6,
566
+ "grad_norm": 0.7612221837043762,
567
+ "learning_rate": 1.97900991538253e-05,
568
+ "loss": 0.6167,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 1.62,
573
+ "grad_norm": 2.2331180572509766,
574
+ "learning_rate": 1.9775631886233655e-05,
575
+ "loss": 0.5688,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 1.6400000000000001,
580
+ "grad_norm": 3.1707897186279297,
581
+ "learning_rate": 1.9760688168135233e-05,
582
+ "loss": 0.6023,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 1.6600000000000001,
587
+ "grad_norm": 1.5035152435302734,
588
+ "learning_rate": 1.9745268727865774e-05,
589
+ "loss": 0.5957,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 1.6800000000000002,
594
+ "grad_norm": 1.8150962591171265,
595
+ "learning_rate": 1.972937431694704e-05,
596
+ "loss": 0.5409,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 1.7,
601
+ "grad_norm": 1.5338727235794067,
602
+ "learning_rate": 1.9713005710050203e-05,
603
+ "loss": 0.6286,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 1.72,
608
+ "grad_norm": 1.0299500226974487,
609
+ "learning_rate": 1.969616370495806e-05,
610
+ "loss": 0.5936,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 1.74,
615
+ "grad_norm": 3.0667810440063477,
616
+ "learning_rate": 1.967884912252619e-05,
617
+ "loss": 0.6535,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 1.76,
622
+ "grad_norm": 0.9165984988212585,
623
+ "learning_rate": 1.9661062806642903e-05,
624
+ "loss": 0.5864,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 1.78,
629
+ "grad_norm": 1.398553729057312,
630
+ "learning_rate": 1.964280562418815e-05,
631
+ "loss": 0.6181,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 1.8,
636
+ "grad_norm": 1.1232646703720093,
637
+ "learning_rate": 1.962407846499124e-05,
638
+ "loss": 0.5736,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 1.8199999999999998,
643
+ "grad_norm": 1.905674695968628,
644
+ "learning_rate": 1.96048822417875e-05,
645
+ "loss": 0.5769,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 1.8399999999999999,
650
+ "grad_norm": 0.9369404911994934,
651
+ "learning_rate": 1.958521789017376e-05,
652
+ "loss": 0.6056,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 1.8599999999999999,
657
+ "grad_norm": 1.0187280178070068,
658
+ "learning_rate": 1.956508636856278e-05,
659
+ "loss": 0.6632,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 1.88,
664
+ "grad_norm": 1.4954912662506104,
665
+ "learning_rate": 1.9546569379242446e-05,
666
+ "loss": 0.5803,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 1.9,
671
+ "grad_norm": 1.1077364683151245,
672
+ "learning_rate": 1.9525552956573244e-05,
673
+ "loss": 0.6028,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 1.92,
678
+ "grad_norm": 1.777431607246399,
679
+ "learning_rate": 1.9504072271891486e-05,
680
+ "loss": 0.5932,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 1.94,
685
+ "grad_norm": 1.1748569011688232,
686
+ "learning_rate": 1.9482128372135446e-05,
687
+ "loss": 0.581,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 1.96,
692
+ "grad_norm": 1.5400365591049194,
693
+ "learning_rate": 1.945972232681984e-05,
694
+ "loss": 0.6207,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 1.98,
699
+ "grad_norm": 1.3064255714416504,
700
+ "learning_rate": 1.9436855227983695e-05,
701
+ "loss": 0.5576,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 2.0,
706
+ "grad_norm": 1.6377707719802856,
707
+ "learning_rate": 1.9413528190137158e-05,
708
+ "loss": 0.6121,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 2.02,
713
+ "grad_norm": 0.8060258030891418,
714
+ "learning_rate": 1.938974235020714e-05,
715
+ "loss": 0.5909,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 2.04,
720
+ "grad_norm": 2.174511671066284,
721
+ "learning_rate": 1.9365498867481926e-05,
722
+ "loss": 0.5459,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 2.06,
727
+ "grad_norm": 2.0883657932281494,
728
+ "learning_rate": 1.9340798923554657e-05,
729
+ "loss": 0.5781,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 2.08,
734
+ "grad_norm": 1.4199424982070923,
735
+ "learning_rate": 1.931564372226576e-05,
736
+ "loss": 0.5284,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 2.1,
741
+ "grad_norm": 1.2759448289871216,
742
+ "learning_rate": 1.9290034489644247e-05,
743
+ "loss": 0.5476,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 2.12,
748
+ "grad_norm": 0.9871682524681091,
749
+ "learning_rate": 1.9263972473847995e-05,
750
+ "loss": 0.5386,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 2.14,
755
+ "grad_norm": 1.325584053993225,
756
+ "learning_rate": 1.923745894510288e-05,
757
+ "loss": 0.5864,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 2.16,
762
+ "grad_norm": 1.4707293510437012,
763
+ "learning_rate": 1.9210495195640895e-05,
764
+ "loss": 0.5413,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 2.18,
769
+ "grad_norm": 1.222374439239502,
770
+ "learning_rate": 1.918308253963715e-05,
771
+ "loss": 0.5201,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 2.2,
776
+ "grad_norm": 2.6065587997436523,
777
+ "learning_rate": 1.9155222313145817e-05,
778
+ "loss": 0.5658,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 2.22,
783
+ "grad_norm": 1.2004817724227905,
784
+ "learning_rate": 1.912691587403503e-05,
785
+ "loss": 0.5578,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 2.24,
790
+ "grad_norm": 2.0765957832336426,
791
+ "learning_rate": 1.9098164601920702e-05,
792
+ "loss": 0.4792,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 2.26,
797
+ "grad_norm": 1.2682489156723022,
798
+ "learning_rate": 1.906896989809927e-05,
799
+ "loss": 0.6048,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 2.2800000000000002,
804
+ "grad_norm": 3.5352697372436523,
805
+ "learning_rate": 1.903933318547942e-05,
806
+ "loss": 0.567,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 2.3,
811
+ "grad_norm": 1.590265154838562,
812
+ "learning_rate": 1.9009255908512704e-05,
813
+ "loss": 0.5965,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 2.32,
818
+ "grad_norm": 2.2210605144500732,
819
+ "learning_rate": 1.897873953312317e-05,
820
+ "loss": 0.5561,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 2.34,
825
+ "grad_norm": 1.0093090534210205,
826
+ "learning_rate": 1.8947785546635905e-05,
827
+ "loss": 0.5529,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 2.36,
832
+ "grad_norm": 1.523364543914795,
833
+ "learning_rate": 1.8916395457704536e-05,
834
+ "loss": 0.5818,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 2.38,
839
+ "grad_norm": 0.9908381700515747,
840
+ "learning_rate": 1.888457079623772e-05,
841
+ "loss": 0.5558,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 2.4,
846
+ "grad_norm": 1.499123215675354,
847
+ "learning_rate": 1.8852313113324553e-05,
848
+ "loss": 0.5833,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 2.42,
853
+ "grad_norm": 1.700926661491394,
854
+ "learning_rate": 1.8819623981158996e-05,
855
+ "loss": 0.527,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 2.44,
860
+ "grad_norm": 0.7639631032943726,
861
+ "learning_rate": 1.878650499296323e-05,
862
+ "loss": 0.5605,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 2.46,
867
+ "grad_norm": 0.9155722856521606,
868
+ "learning_rate": 1.8752957762910016e-05,
869
+ "loss": 0.5528,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 2.48,
874
+ "grad_norm": 1.4307914972305298,
875
+ "learning_rate": 1.871898392604402e-05,
876
+ "loss": 0.5239,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 2.5,
881
+ "grad_norm": 0.9646509885787964,
882
+ "learning_rate": 1.8684585138202122e-05,
883
+ "loss": 0.5825,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 2.52,
888
+ "grad_norm": 0.823187530040741,
889
+ "learning_rate": 1.864976307593271e-05,
890
+ "loss": 0.5816,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 2.54,
895
+ "grad_norm": 0.9524659514427185,
896
+ "learning_rate": 1.8614519436413968e-05,
897
+ "loss": 0.595,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 2.56,
902
+ "grad_norm": 3.476361036300659,
903
+ "learning_rate": 1.8578855937371176e-05,
904
+ "loss": 0.5828,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 2.58,
909
+ "grad_norm": 0.8790251016616821,
910
+ "learning_rate": 1.8542774316992953e-05,
911
+ "loss": 0.5408,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 2.6,
916
+ "grad_norm": 1.8205924034118652,
917
+ "learning_rate": 1.850627633384658e-05,
918
+ "loss": 0.5683,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 2.62,
923
+ "grad_norm": 0.7987878918647766,
924
+ "learning_rate": 1.8469363766792258e-05,
925
+ "loss": 0.5734,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 2.64,
930
+ "grad_norm": 1.1774803400039673,
931
+ "learning_rate": 1.8432038414896432e-05,
932
+ "loss": 0.5581,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 2.66,
937
+ "grad_norm": 1.4109526872634888,
938
+ "learning_rate": 1.8394302097344103e-05,
939
+ "loss": 0.5781,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 2.68,
944
+ "grad_norm": 2.104931116104126,
945
+ "learning_rate": 1.8356156653350138e-05,
946
+ "loss": 0.5468,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 2.7,
951
+ "grad_norm": 1.7022193670272827,
952
+ "learning_rate": 1.8317603942069665e-05,
953
+ "loss": 0.543,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 2.7199999999999998,
958
+ "grad_norm": 1.9831078052520752,
959
+ "learning_rate": 1.8278645842507448e-05,
960
+ "loss": 0.5416,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 2.74,
965
+ "grad_norm": 1.4346486330032349,
966
+ "learning_rate": 1.8239284253426294e-05,
967
+ "loss": 0.5692,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 2.76,
972
+ "grad_norm": 3.786205768585205,
973
+ "learning_rate": 1.8199521093254524e-05,
974
+ "loss": 0.5372,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 2.7800000000000002,
979
+ "grad_norm": 1.2855556011199951,
980
+ "learning_rate": 1.815935829999247e-05,
981
+ "loss": 0.5205,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 2.8,
986
+ "grad_norm": 1.4858529567718506,
987
+ "learning_rate": 1.811879783111801e-05,
988
+ "loss": 0.5159,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 2.82,
993
+ "grad_norm": 1.3300436735153198,
994
+ "learning_rate": 1.8077841663491174e-05,
995
+ "loss": 0.5405,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 2.84,
1000
+ "grad_norm": 1.2626831531524658,
1001
+ "learning_rate": 1.80364917932578e-05,
1002
+ "loss": 0.5769,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 2.86,
1007
+ "grad_norm": 1.4615288972854614,
1008
+ "learning_rate": 1.799475023575222e-05,
1009
+ "loss": 0.5724,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 2.88,
1014
+ "grad_norm": 1.2775332927703857,
1015
+ "learning_rate": 1.795261902539906e-05,
1016
+ "loss": 0.5603,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 2.9,
1021
+ "grad_norm": 1.9211925268173218,
1022
+ "learning_rate": 1.791010021561407e-05,
1023
+ "loss": 0.5609,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 2.92,
1028
+ "grad_norm": 3.850076675415039,
1029
+ "learning_rate": 1.7867195878704062e-05,
1030
+ "loss": 0.585,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 2.94,
1035
+ "grad_norm": 1.0455843210220337,
1036
+ "learning_rate": 1.7823908105765883e-05,
1037
+ "loss": 0.5818,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 2.96,
1042
+ "grad_norm": 2.406511068344116,
1043
+ "learning_rate": 1.7780239006584515e-05,
1044
+ "loss": 0.5453,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 2.98,
1049
+ "grad_norm": 1.212849736213684,
1050
+ "learning_rate": 1.773619070953025e-05,
1051
+ "loss": 0.5526,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 3.0,
1056
+ "grad_norm": 1.608502984046936,
1057
+ "learning_rate": 1.769176536145494e-05,
1058
+ "loss": 0.5664,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 3.02,
1063
+ "grad_norm": 2.083935499191284,
1064
+ "learning_rate": 1.7646965127587373e-05,
1065
+ "loss": 0.4993,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 3.04,
1070
+ "grad_norm": 3.191681146621704,
1071
+ "learning_rate": 1.760179219142774e-05,
1072
+ "loss": 0.5302,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 3.06,
1077
+ "grad_norm": 2.241614580154419,
1078
+ "learning_rate": 1.7556248754641237e-05,
1079
+ "loss": 0.4995,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 3.08,
1084
+ "grad_norm": 2.2549211978912354,
1085
+ "learning_rate": 1.7510337036950703e-05,
1086
+ "loss": 0.4902,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 3.1,
1091
+ "grad_norm": 2.623849630355835,
1092
+ "learning_rate": 1.7464059276028497e-05,
1093
+ "loss": 0.5571,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 3.12,
1098
+ "grad_norm": 1.099225401878357,
1099
+ "learning_rate": 1.7417417727387392e-05,
1100
+ "loss": 0.5441,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 3.14,
1105
+ "grad_norm": 1.0318654775619507,
1106
+ "learning_rate": 1.7370414664270675e-05,
1107
+ "loss": 0.5498,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 3.16,
1112
+ "grad_norm": 1.4295272827148438,
1113
+ "learning_rate": 1.732305237754132e-05,
1114
+ "loss": 0.4799,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 3.18,
1119
+ "grad_norm": 3.886106014251709,
1120
+ "learning_rate": 1.727533317557037e-05,
1121
+ "loss": 0.5285,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 3.2,
1126
+ "grad_norm": 0.9527933597564697,
1127
+ "learning_rate": 1.7227259384124408e-05,
1128
+ "loss": 0.5328,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 3.22,
1133
+ "grad_norm": 1.0082141160964966,
1134
+ "learning_rate": 1.7178833346252208e-05,
1135
+ "loss": 0.5333,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 3.24,
1140
+ "grad_norm": 0.9779771566390991,
1141
+ "learning_rate": 1.713005742217053e-05,
1142
+ "loss": 0.5163,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 3.26,
1147
+ "grad_norm": 2.230241060256958,
1148
+ "learning_rate": 1.7080933989149112e-05,
1149
+ "loss": 0.5173,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 3.2800000000000002,
1154
+ "grad_norm": 4.50337028503418,
1155
+ "learning_rate": 1.7031465441394766e-05,
1156
+ "loss": 0.5187,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 3.3,
1161
+ "grad_norm": 1.1370171308517456,
1162
+ "learning_rate": 1.698165418993473e-05,
1163
+ "loss": 0.5611,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 3.32,
1168
+ "grad_norm": 0.9167156219482422,
1169
+ "learning_rate": 1.6931502662499116e-05,
1170
+ "loss": 0.5381,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 3.34,
1175
+ "grad_norm": 2.5473690032958984,
1176
+ "learning_rate": 1.688101330340263e-05,
1177
+ "loss": 0.5089,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 3.36,
1182
+ "grad_norm": 0.9971883893013,
1183
+ "learning_rate": 1.683018857342539e-05,
1184
+ "loss": 0.5538,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 3.38,
1189
+ "grad_norm": 1.7401093244552612,
1190
+ "learning_rate": 1.6779030949693044e-05,
1191
+ "loss": 0.5216,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 3.4,
1196
+ "grad_norm": 6.5078935623168945,
1197
+ "learning_rate": 1.6727542925556e-05,
1198
+ "loss": 0.5356,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 3.42,
1203
+ "grad_norm": 0.9460225105285645,
1204
+ "learning_rate": 1.667572701046791e-05,
1205
+ "loss": 0.497,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 3.44,
1210
+ "grad_norm": 1.153794765472412,
1211
+ "learning_rate": 1.662358572986337e-05,
1212
+ "loss": 0.4934,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 3.46,
1217
+ "grad_norm": 1.1257123947143555,
1218
+ "learning_rate": 1.6571121625034847e-05,
1219
+ "loss": 0.5327,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 3.48,
1224
+ "grad_norm": 2.454582691192627,
1225
+ "learning_rate": 1.651833725300879e-05,
1226
+ "loss": 0.4995,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 3.5,
1231
+ "grad_norm": 1.7266925573349,
1232
+ "learning_rate": 1.6465235186421024e-05,
1233
+ "loss": 0.4945,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 3.52,
1238
+ "grad_norm": 2.5186126232147217,
1239
+ "learning_rate": 1.6411818013391357e-05,
1240
+ "loss": 0.4969,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 3.54,
1245
+ "grad_norm": 2.2978458404541016,
1246
+ "learning_rate": 1.6358088337397444e-05,
1247
+ "loss": 0.5133,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 3.56,
1252
+ "grad_norm": 9.083796501159668,
1253
+ "learning_rate": 1.630404877714789e-05,
1254
+ "loss": 0.4598,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 3.58,
1259
+ "grad_norm": 1.687730312347412,
1260
+ "learning_rate": 1.6249701966454626e-05,
1261
+ "loss": 0.5721,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 3.6,
1266
+ "grad_norm": 0.949276328086853,
1267
+ "learning_rate": 1.619505055410453e-05,
1268
+ "loss": 0.5549,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 3.62,
1273
+ "grad_norm": 1.7653878927230835,
1274
+ "learning_rate": 1.614009720373034e-05,
1275
+ "loss": 0.5192,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 3.64,
1280
+ "grad_norm": 0.6442993879318237,
1281
+ "learning_rate": 1.608484459368082e-05,
1282
+ "loss": 0.4927,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 3.66,
1287
+ "grad_norm": 2.4791717529296875,
1288
+ "learning_rate": 1.602929541689025e-05,
1289
+ "loss": 0.5319,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 3.68,
1294
+ "grad_norm": 1.8095901012420654,
1295
+ "learning_rate": 1.5973452380747125e-05,
1296
+ "loss": 0.5025,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 3.7,
1301
+ "grad_norm": 1.8692318201065063,
1302
+ "learning_rate": 1.591731820696224e-05,
1303
+ "loss": 0.497,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 3.7199999999999998,
1308
+ "grad_norm": 1.3001285791397095,
1309
+ "learning_rate": 1.5860895631436044e-05,
1310
+ "loss": 0.556,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 3.74,
1315
+ "grad_norm": 1.0697414875030518,
1316
+ "learning_rate": 1.580418740412526e-05,
1317
+ "loss": 0.506,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 3.76,
1322
+ "grad_norm": 3.5158541202545166,
1323
+ "learning_rate": 1.5747196288908887e-05,
1324
+ "loss": 0.5154,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 3.7800000000000002,
1329
+ "grad_norm": 1.699308156967163,
1330
+ "learning_rate": 1.5689925063453483e-05,
1331
+ "loss": 0.5887,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 3.8,
1336
+ "grad_norm": 0.825036883354187,
1337
+ "learning_rate": 1.563237651907777e-05,
1338
+ "loss": 0.508,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 3.82,
1343
+ "grad_norm": 1.7058846950531006,
1344
+ "learning_rate": 1.5574553460616608e-05,
1345
+ "loss": 0.4954,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 3.84,
1350
+ "grad_norm": 1.7128701210021973,
1351
+ "learning_rate": 1.5516458706284306e-05,
1352
+ "loss": 0.5628,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 3.86,
1357
+ "grad_norm": 0.8009471297264099,
1358
+ "learning_rate": 1.5458095087537216e-05,
1359
+ "loss": 0.4494,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 3.88,
1364
+ "grad_norm": 1.8152306079864502,
1365
+ "learning_rate": 1.5399465448935788e-05,
1366
+ "loss": 0.522,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 3.9,
1371
+ "grad_norm": 2.59840989112854,
1372
+ "learning_rate": 1.5340572648005887e-05,
1373
+ "loss": 0.5225,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 3.92,
1378
+ "grad_norm": 3.8816378116607666,
1379
+ "learning_rate": 1.5281419555099547e-05,
1380
+ "loss": 0.5092,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 3.94,
1385
+ "grad_norm": 1.4815788269042969,
1386
+ "learning_rate": 1.5222009053255061e-05,
1387
+ "loss": 0.5167,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 3.96,
1392
+ "grad_norm": 1.5924495458602905,
1393
+ "learning_rate": 1.5162344038056476e-05,
1394
+ "loss": 0.5198,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 3.98,
1399
+ "grad_norm": 1.44657301902771,
1400
+ "learning_rate": 1.510242741749246e-05,
1401
+ "loss": 0.5723,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 4.0,
1406
+ "grad_norm": 2.8102152347564697,
1407
+ "learning_rate": 1.5042262111814566e-05,
1408
+ "loss": 0.4707,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 4.02,
1413
+ "grad_norm": 1.1032963991165161,
1414
+ "learning_rate": 1.498185105339491e-05,
1415
+ "loss": 0.523,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 4.04,
1420
+ "grad_norm": 1.0523751974105835,
1421
+ "learning_rate": 1.4921197186583256e-05,
1422
+ "loss": 0.4433,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 4.06,
1427
+ "grad_norm": 3.989997148513794,
1428
+ "learning_rate": 1.4860303467563504e-05,
1429
+ "loss": 0.4861,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 4.08,
1434
+ "grad_norm": 2.7621233463287354,
1435
+ "learning_rate": 1.4799172864209607e-05,
1436
+ "loss": 0.4621,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 4.1,
1441
+ "grad_norm": 0.960752010345459,
1442
+ "learning_rate": 1.4737808355940932e-05,
1443
+ "loss": 0.5454,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 4.12,
1448
+ "grad_norm": 1.279720664024353,
1449
+ "learning_rate": 1.467621293357704e-05,
1450
+ "loss": 0.4984,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 4.14,
1455
+ "grad_norm": 2.1226606369018555,
1456
+ "learning_rate": 1.4614389599191917e-05,
1457
+ "loss": 0.5375,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 4.16,
1462
+ "grad_norm": 1.6314127445220947,
1463
+ "learning_rate": 1.455234136596766e-05,
1464
+ "loss": 0.5191,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 4.18,
1469
+ "grad_norm": 2.72589111328125,
1470
+ "learning_rate": 1.4490071258047625e-05,
1471
+ "loss": 0.4872,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 4.2,
1476
+ "grad_norm": 4.3185648918151855,
1477
+ "learning_rate": 1.442758231038902e-05,
1478
+ "loss": 0.4883,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 4.22,
1483
+ "grad_norm": 0.8917685747146606,
1484
+ "learning_rate": 1.436487756861499e-05,
1485
+ "loss": 0.4564,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 4.24,
1490
+ "grad_norm": 1.1269738674163818,
1491
+ "learning_rate": 1.4301960088866187e-05,
1492
+ "loss": 0.4311,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 4.26,
1497
+ "grad_norm": 4.341186046600342,
1498
+ "learning_rate": 1.4238832937651816e-05,
1499
+ "loss": 0.4663,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 4.28,
1504
+ "grad_norm": 0.9497487545013428,
1505
+ "learning_rate": 1.4175499191700169e-05,
1506
+ "loss": 0.5048,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 4.3,
1511
+ "grad_norm": 1.7001278400421143,
1512
+ "learning_rate": 1.4111961937808665e-05,
1513
+ "loss": 0.493,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 4.32,
1518
+ "grad_norm": 1.7037960290908813,
1519
+ "learning_rate": 1.4048224272693426e-05,
1520
+ "loss": 0.4712,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 4.34,
1525
+ "grad_norm": 0.9490267634391785,
1526
+ "learning_rate": 1.3984289302838327e-05,
1527
+ "loss": 0.4641,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 4.36,
1532
+ "grad_norm": 2.2001664638519287,
1533
+ "learning_rate": 1.3920160144343604e-05,
1534
+ "loss": 0.4929,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 4.38,
1539
+ "grad_norm": 1.0687891244888306,
1540
+ "learning_rate": 1.3855839922773968e-05,
1541
+ "loss": 0.5269,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 4.4,
1546
+ "grad_norm": 1.1579177379608154,
1547
+ "learning_rate": 1.3791331773006272e-05,
1548
+ "loss": 0.4857,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 4.42,
1553
+ "grad_norm": 0.9180253744125366,
1554
+ "learning_rate": 1.3726638839076732e-05,
1555
+ "loss": 0.5613,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 4.44,
1560
+ "grad_norm": 1.170154333114624,
1561
+ "learning_rate": 1.3661764274027678e-05,
1562
+ "loss": 0.4884,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 4.46,
1567
+ "grad_norm": 1.8389673233032227,
1568
+ "learning_rate": 1.3596711239753889e-05,
1569
+ "loss": 0.4849,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 4.48,
1574
+ "grad_norm": 1.43435537815094,
1575
+ "learning_rate": 1.3531482906848474e-05,
1576
+ "loss": 0.4752,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 4.5,
1581
+ "grad_norm": 1.2561147212982178,
1582
+ "learning_rate": 1.3466082454448364e-05,
1583
+ "loss": 0.4804,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 4.52,
1588
+ "grad_norm": 1.3098878860473633,
1589
+ "learning_rate": 1.340051307007933e-05,
1590
+ "loss": 0.4719,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 4.54,
1595
+ "grad_norm": 7.966071128845215,
1596
+ "learning_rate": 1.3334777949500673e-05,
1597
+ "loss": 0.4599,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 4.5600000000000005,
1602
+ "grad_norm": 2.28067946434021,
1603
+ "learning_rate": 1.3268880296549424e-05,
1604
+ "loss": 0.4712,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 4.58,
1609
+ "grad_norm": 1.1881040334701538,
1610
+ "learning_rate": 1.3202823322984228e-05,
1611
+ "loss": 0.4772,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 4.6,
1616
+ "grad_norm": 1.482059359550476,
1617
+ "learning_rate": 1.3136610248328779e-05,
1618
+ "loss": 0.453,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 4.62,
1623
+ "grad_norm": 2.1320247650146484,
1624
+ "learning_rate": 1.307024429971492e-05,
1625
+ "loss": 0.4657,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 4.64,
1630
+ "grad_norm": 1.2124568223953247,
1631
+ "learning_rate": 1.3003728711725364e-05,
1632
+ "loss": 0.4791,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 4.66,
1637
+ "grad_norm": 0.7249093055725098,
1638
+ "learning_rate": 1.2937066726236029e-05,
1639
+ "loss": 0.5586,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 4.68,
1644
+ "grad_norm": 1.5038686990737915,
1645
+ "learning_rate": 1.2870261592258038e-05,
1646
+ "loss": 0.4603,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 4.7,
1651
+ "grad_norm": 1.320827603340149,
1652
+ "learning_rate": 1.2803316565779378e-05,
1653
+ "loss": 0.4618,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 4.72,
1658
+ "grad_norm": 1.4801580905914307,
1659
+ "learning_rate": 1.2736234909606186e-05,
1660
+ "loss": 0.4643,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 4.74,
1665
+ "grad_norm": 2.649113655090332,
1666
+ "learning_rate": 1.2669019893203758e-05,
1667
+ "loss": 0.5017,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 4.76,
1672
+ "grad_norm": 1.3265902996063232,
1673
+ "learning_rate": 1.2601674792537157e-05,
1674
+ "loss": 0.451,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 4.78,
1679
+ "grad_norm": 1.6874167919158936,
1680
+ "learning_rate": 1.2534202889911584e-05,
1681
+ "loss": 0.4763,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 4.8,
1686
+ "grad_norm": 0.7948456406593323,
1687
+ "learning_rate": 1.2466607473812386e-05,
1688
+ "loss": 0.4984,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 4.82,
1693
+ "grad_norm": 1.3831831216812134,
1694
+ "learning_rate": 1.2398891838744777e-05,
1695
+ "loss": 0.4594,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 4.84,
1700
+ "grad_norm": 2.660630941390991,
1701
+ "learning_rate": 1.233105928507328e-05,
1702
+ "loss": 0.476,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 4.86,
1707
+ "grad_norm": 1.7315685749053955,
1708
+ "learning_rate": 1.226311311886086e-05,
1709
+ "loss": 0.4599,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 4.88,
1714
+ "grad_norm": 1.0656920671463013,
1715
+ "learning_rate": 1.2195056651707806e-05,
1716
+ "loss": 0.4786,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 4.9,
1721
+ "grad_norm": 1.317185878753662,
1722
+ "learning_rate": 1.2126893200590309e-05,
1723
+ "loss": 0.539,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 4.92,
1728
+ "grad_norm": 2.1588151454925537,
1729
+ "learning_rate": 1.2058626087698814e-05,
1730
+ "loss": 0.442,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 4.9399999999999995,
1735
+ "grad_norm": 1.3337817192077637,
1736
+ "learning_rate": 1.1990258640276094e-05,
1737
+ "loss": 0.4829,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 4.96,
1742
+ "grad_norm": 1.3558602333068848,
1743
+ "learning_rate": 1.1921794190455082e-05,
1744
+ "loss": 0.5055,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 4.98,
1749
+ "grad_norm": 1.18630850315094,
1750
+ "learning_rate": 1.1853236075096474e-05,
1751
+ "loss": 0.4857,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 5.0,
1756
+ "grad_norm": 1.4073606729507446,
1757
+ "learning_rate": 1.1784587635626095e-05,
1758
+ "loss": 0.4962,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 5.02,
1763
+ "grad_norm": 2.1152431964874268,
1764
+ "learning_rate": 1.171585221787203e-05,
1765
+ "loss": 0.4185,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 5.04,
1770
+ "grad_norm": 1.4434751272201538,
1771
+ "learning_rate": 1.1647033171901573e-05,
1772
+ "loss": 0.4545,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 5.06,
1777
+ "grad_norm": 2.575100898742676,
1778
+ "learning_rate": 1.157813385185794e-05,
1779
+ "loss": 0.4162,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 5.08,
1784
+ "grad_norm": 1.808670163154602,
1785
+ "learning_rate": 1.1509157615796775e-05,
1786
+ "loss": 0.425,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 5.1,
1791
+ "grad_norm": 2.0756947994232178,
1792
+ "learning_rate": 1.1440107825522522e-05,
1793
+ "loss": 0.4514,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 5.12,
1798
+ "grad_norm": 1.3622123003005981,
1799
+ "learning_rate": 1.1370987846424547e-05,
1800
+ "loss": 0.4687,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 5.14,
1805
+ "grad_norm": 1.938477873802185,
1806
+ "learning_rate": 1.1301801047313106e-05,
1807
+ "loss": 0.4892,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 5.16,
1812
+ "grad_norm": 1.3552794456481934,
1813
+ "learning_rate": 1.1232550800255188e-05,
1814
+ "loss": 0.4675,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 5.18,
1819
+ "grad_norm": 2.3927013874053955,
1820
+ "learning_rate": 1.1163240480410136e-05,
1821
+ "loss": 0.4336,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 5.2,
1826
+ "grad_norm": 1.3408219814300537,
1827
+ "learning_rate": 1.1093873465865156e-05,
1828
+ "loss": 0.4358,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 5.22,
1833
+ "grad_norm": 2.9869275093078613,
1834
+ "learning_rate": 1.1024453137470677e-05,
1835
+ "loss": 0.4709,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 5.24,
1840
+ "grad_norm": 2.7815663814544678,
1841
+ "learning_rate": 1.0954982878675564e-05,
1842
+ "loss": 0.4349,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 5.26,
1847
+ "grad_norm": 1.9527360200881958,
1848
+ "learning_rate": 1.0885466075362224e-05,
1849
+ "loss": 0.4581,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 5.28,
1854
+ "grad_norm": 1.804969072341919,
1855
+ "learning_rate": 1.0815906115681579e-05,
1856
+ "loss": 0.4482,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 5.3,
1861
+ "grad_norm": 1.6230815649032593,
1862
+ "learning_rate": 1.0746306389887924e-05,
1863
+ "loss": 0.4771,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 5.32,
1868
+ "grad_norm": 2.6288340091705322,
1869
+ "learning_rate": 1.067667029017371e-05,
1870
+ "loss": 0.4893,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 5.34,
1875
+ "grad_norm": 0.7628895044326782,
1876
+ "learning_rate": 1.060700121050419e-05,
1877
+ "loss": 0.4823,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 5.36,
1882
+ "grad_norm": 1.395524501800537,
1883
+ "learning_rate": 1.0537302546452022e-05,
1884
+ "loss": 0.45,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 5.38,
1889
+ "grad_norm": 1.0264369249343872,
1890
+ "learning_rate": 1.0467577695031763e-05,
1891
+ "loss": 0.4817,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 5.4,
1896
+ "grad_norm": 1.3651304244995117,
1897
+ "learning_rate": 1.03978300545343e-05,
1898
+ "loss": 0.4472,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 5.42,
1903
+ "grad_norm": 1.520727276802063,
1904
+ "learning_rate": 1.0328063024361232e-05,
1905
+ "loss": 0.4351,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 5.44,
1910
+ "grad_norm": 2.283327102661133,
1911
+ "learning_rate": 1.0258280004859189e-05,
1912
+ "loss": 0.4052,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 5.46,
1917
+ "grad_norm": 5.243598937988281,
1918
+ "learning_rate": 1.0188484397154083e-05,
1919
+ "loss": 0.51,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 5.48,
1924
+ "grad_norm": 2.3326563835144043,
1925
+ "learning_rate": 1.0118679602985373e-05,
1926
+ "loss": 0.4678,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 5.5,
1931
+ "grad_norm": 1.8756747245788574,
1932
+ "learning_rate": 1.0048869024540247e-05,
1933
+ "loss": 0.4802,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 5.52,
1938
+ "grad_norm": 2.212642192840576,
1939
+ "learning_rate": 9.979056064287807e-06,
1940
+ "loss": 0.4416,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 5.54,
1945
+ "grad_norm": 1.893557071685791,
1946
+ "learning_rate": 9.909244124813246e-06,
1947
+ "loss": 0.4613,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 5.5600000000000005,
1952
+ "grad_norm": 3.211782217025757,
1953
+ "learning_rate": 9.839436608652007e-06,
1954
+ "loss": 0.4163,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 5.58,
1959
+ "grad_norm": 1.4164925813674927,
1960
+ "learning_rate": 9.76963691812394e-06,
1961
+ "loss": 0.4753,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 5.6,
1966
+ "grad_norm": 1.139273762702942,
1967
+ "learning_rate": 9.699848455167489e-06,
1968
+ "loss": 0.4725,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 5.62,
1973
+ "grad_norm": 1.5774643421173096,
1974
+ "learning_rate": 9.630074621173882e-06,
1975
+ "loss": 0.4521,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 5.64,
1980
+ "grad_norm": 2.0061256885528564,
1981
+ "learning_rate": 9.560318816821354e-06,
1982
+ "loss": 0.3838,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 5.66,
1987
+ "grad_norm": 3.7671396732330322,
1988
+ "learning_rate": 9.490584441909392e-06,
1989
+ "loss": 0.4603,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 5.68,
1994
+ "grad_norm": 1.6197257041931152,
1995
+ "learning_rate": 9.420874895193056e-06,
1996
+ "loss": 0.4538,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 5.7,
2001
+ "grad_norm": 3.386794328689575,
2002
+ "learning_rate": 9.351193574217305e-06,
2003
+ "loss": 0.4527,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 5.72,
2008
+ "grad_norm": 1.4062769412994385,
2009
+ "learning_rate": 9.281543875151419e-06,
2010
+ "loss": 0.4915,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 5.74,
2015
+ "grad_norm": 2.367417573928833,
2016
+ "learning_rate": 9.211929192623466e-06,
2017
+ "loss": 0.4338,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 5.76,
2022
+ "grad_norm": 1.7326956987380981,
2023
+ "learning_rate": 9.142352919554862e-06,
2024
+ "loss": 0.4573,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 5.78,
2029
+ "grad_norm": 2.8575878143310547,
2030
+ "learning_rate": 9.072818446995e-06,
2031
+ "loss": 0.4494,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 5.8,
2036
+ "grad_norm": 1.1295793056488037,
2037
+ "learning_rate": 9.003329163955973e-06,
2038
+ "loss": 0.5061,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 5.82,
2043
+ "grad_norm": 1.31191885471344,
2044
+ "learning_rate": 8.933888457247402e-06,
2045
+ "loss": 0.4537,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 5.84,
2050
+ "grad_norm": 1.8330936431884766,
2051
+ "learning_rate": 8.864499711311362e-06,
2052
+ "loss": 0.4764,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 5.86,
2057
+ "grad_norm": 1.8839877843856812,
2058
+ "learning_rate": 8.79516630805745e-06,
2059
+ "loss": 0.4563,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 5.88,
2064
+ "grad_norm": 2.6970791816711426,
2065
+ "learning_rate": 8.725891626697912e-06,
2066
+ "loss": 0.4887,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 5.9,
2071
+ "grad_norm": 3.178072214126587,
2072
+ "learning_rate": 8.656679043582986e-06,
2073
+ "loss": 0.446,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 5.92,
2078
+ "grad_norm": 2.423067569732666,
2079
+ "learning_rate": 8.587531932036334e-06,
2080
+ "loss": 0.4533,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 5.9399999999999995,
2085
+ "grad_norm": 3.043440580368042,
2086
+ "learning_rate": 8.518453662190622e-06,
2087
+ "loss": 0.4451,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 5.96,
2092
+ "grad_norm": 2.4324257373809814,
2093
+ "learning_rate": 8.449447600823262e-06,
2094
+ "loss": 0.393,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 5.98,
2099
+ "grad_norm": 7.399738311767578,
2100
+ "learning_rate": 8.380517111192336e-06,
2101
+ "loss": 0.4406,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 6.0,
2106
+ "grad_norm": 0.8923618197441101,
2107
+ "learning_rate": 8.311665552872662e-06,
2108
+ "loss": 0.474,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 6.02,
2113
+ "grad_norm": 1.7500466108322144,
2114
+ "learning_rate": 8.242896281592057e-06,
2115
+ "loss": 0.3953,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 6.04,
2120
+ "grad_norm": 3.8324530124664307,
2121
+ "learning_rate": 8.174212649067781e-06,
2122
+ "loss": 0.4117,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 6.06,
2127
+ "grad_norm": 2.275822639465332,
2128
+ "learning_rate": 8.10561800284319e-06,
2129
+ "loss": 0.3988,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 6.08,
2134
+ "grad_norm": 1.2562943696975708,
2135
+ "learning_rate": 8.037115686124564e-06,
2136
+ "loss": 0.418,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 6.1,
2141
+ "grad_norm": 1.3214370012283325,
2142
+ "learning_rate": 7.96870903761818e-06,
2143
+ "loss": 0.4084,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 6.12,
2148
+ "grad_norm": 2.595797061920166,
2149
+ "learning_rate": 7.900401391367576e-06,
2150
+ "loss": 0.3739,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 6.14,
2155
+ "grad_norm": 2.055779457092285,
2156
+ "learning_rate": 7.832196076591067e-06,
2157
+ "loss": 0.3763,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 6.16,
2162
+ "grad_norm": 2.5182206630706787,
2163
+ "learning_rate": 7.76409641751947e-06,
2164
+ "loss": 0.4522,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 6.18,
2169
+ "grad_norm": 6.86693000793457,
2170
+ "learning_rate": 7.696105733234099e-06,
2171
+ "loss": 0.4661,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 6.2,
2176
+ "grad_norm": 0.7651225924491882,
2177
+ "learning_rate": 7.628227337504972e-06,
2178
+ "loss": 0.4519,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 6.22,
2183
+ "grad_norm": 1.531447172164917,
2184
+ "learning_rate": 7.560464538629345e-06,
2185
+ "loss": 0.4073,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 6.24,
2190
+ "grad_norm": 2.4921135902404785,
2191
+ "learning_rate": 7.492820639270435e-06,
2192
+ "loss": 0.4458,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 6.26,
2197
+ "grad_norm": 1.5849100351333618,
2198
+ "learning_rate": 7.4252989362964635e-06,
2199
+ "loss": 0.3703,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 6.28,
2204
+ "grad_norm": 1.8190685510635376,
2205
+ "learning_rate": 7.357902720619976e-06,
2206
+ "loss": 0.4393,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 6.3,
2211
+ "grad_norm": 1.195379376411438,
2212
+ "learning_rate": 7.290635277037442e-06,
2213
+ "loss": 0.437,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 6.32,
2218
+ "grad_norm": 1.6389209032058716,
2219
+ "learning_rate": 7.22349988406916e-06,
2220
+ "loss": 0.3979,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 6.34,
2225
+ "grad_norm": 1.7568351030349731,
2226
+ "learning_rate": 7.156499813799477e-06,
2227
+ "loss": 0.4078,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 6.36,
2232
+ "grad_norm": 2.5609893798828125,
2233
+ "learning_rate": 7.0896383317172845e-06,
2234
+ "loss": 0.4182,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 6.38,
2239
+ "grad_norm": 2.070969343185425,
2240
+ "learning_rate": 7.022918696556896e-06,
2241
+ "loss": 0.4239,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 6.4,
2246
+ "grad_norm": 3.3474345207214355,
2247
+ "learning_rate": 6.956344160139201e-06,
2248
+ "loss": 0.4369,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 6.42,
2253
+ "grad_norm": 1.3559445142745972,
2254
+ "learning_rate": 6.889917967213184e-06,
2255
+ "loss": 0.4469,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 6.44,
2260
+ "grad_norm": 1.4630825519561768,
2261
+ "learning_rate": 6.823643355297774e-06,
2262
+ "loss": 0.4312,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 6.46,
2267
+ "grad_norm": 2.0589451789855957,
2268
+ "learning_rate": 6.757523554524056e-06,
2269
+ "loss": 0.4465,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 6.48,
2274
+ "grad_norm": 2.448317766189575,
2275
+ "learning_rate": 6.69156178747784e-06,
2276
+ "loss": 0.4201,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 6.5,
2281
+ "grad_norm": 1.911927580833435,
2282
+ "learning_rate": 6.62576126904259e-06,
2283
+ "loss": 0.3882,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 6.52,
2288
+ "grad_norm": 3.176950216293335,
2289
+ "learning_rate": 6.560125206242746e-06,
2290
+ "loss": 0.4448,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 6.54,
2295
+ "grad_norm": 1.4145492315292358,
2296
+ "learning_rate": 6.494656798087412e-06,
2297
+ "loss": 0.3915,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 6.5600000000000005,
2302
+ "grad_norm": 4.982487201690674,
2303
+ "learning_rate": 6.4293592354144365e-06,
2304
+ "loss": 0.3769,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 6.58,
2309
+ "grad_norm": 3.9301717281341553,
2310
+ "learning_rate": 6.364235700734903e-06,
2311
+ "loss": 0.4503,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 6.6,
2316
+ "grad_norm": 3.643587112426758,
2317
+ "learning_rate": 6.299289368078016e-06,
2318
+ "loss": 0.4398,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 6.62,
2323
+ "grad_norm": 2.1195595264434814,
2324
+ "learning_rate": 6.234523402836408e-06,
2325
+ "loss": 0.4199,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 6.64,
2330
+ "grad_norm": 1.3784760236740112,
2331
+ "learning_rate": 6.169940961611853e-06,
2332
+ "loss": 0.4574,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 6.66,
2337
+ "grad_norm": 4.373683452606201,
2338
+ "learning_rate": 6.1055451920614165e-06,
2339
+ "loss": 0.4252,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 6.68,
2344
+ "grad_norm": 1.9143246412277222,
2345
+ "learning_rate": 6.0413392327440635e-06,
2346
+ "loss": 0.4069,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 6.7,
2351
+ "grad_norm": 1.9608592987060547,
2352
+ "learning_rate": 5.977326212967671e-06,
2353
+ "loss": 0.4173,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 6.72,
2358
+ "grad_norm": 2.7381646633148193,
2359
+ "learning_rate": 5.913509252636511e-06,
2360
+ "loss": 0.3737,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 6.74,
2365
+ "grad_norm": 1.9294111728668213,
2366
+ "learning_rate": 5.849891462099199e-06,
2367
+ "loss": 0.437,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 6.76,
2372
+ "grad_norm": 0.8937060236930847,
2373
+ "learning_rate": 5.786475941997094e-06,
2374
+ "loss": 0.4457,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 6.78,
2379
+ "grad_norm": 1.2962634563446045,
2380
+ "learning_rate": 5.723265783113181e-06,
2381
+ "loss": 0.3989,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 6.8,
2386
+ "grad_norm": 1.0391006469726562,
2387
+ "learning_rate": 5.660264066221426e-06,
2388
+ "loss": 0.4314,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 6.82,
2393
+ "grad_norm": 7.157230377197266,
2394
+ "learning_rate": 5.59747386193663e-06,
2395
+ "loss": 0.3989,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 6.84,
2400
+ "grad_norm": 2.1349549293518066,
2401
+ "learning_rate": 5.534898230564765e-06,
2402
+ "loss": 0.3792,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 6.86,
2407
+ "grad_norm": 1.362468957901001,
2408
+ "learning_rate": 5.472540221953824e-06,
2409
+ "loss": 0.4115,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 6.88,
2414
+ "grad_norm": 3.6166296005249023,
2415
+ "learning_rate": 5.41040287534517e-06,
2416
+ "loss": 0.4067,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 6.9,
2421
+ "grad_norm": 2.424628257751465,
2422
+ "learning_rate": 5.348489219225417e-06,
2423
+ "loss": 0.4424,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 6.92,
2428
+ "grad_norm": 2.7839276790618896,
2429
+ "learning_rate": 5.286802271178815e-06,
2430
+ "loss": 0.4508,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 6.9399999999999995,
2435
+ "grad_norm": 3.2447237968444824,
2436
+ "learning_rate": 5.225345037740186e-06,
2437
+ "loss": 0.3984,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 6.96,
2442
+ "grad_norm": 1.6200125217437744,
2443
+ "learning_rate": 5.16412051424839e-06,
2444
+ "loss": 0.4499,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 6.98,
2449
+ "grad_norm": 1.5760829448699951,
2450
+ "learning_rate": 5.103131684700315e-06,
2451
+ "loss": 0.4154,
2452
+ "step": 3490
2453
+ },
2454
+ {
2455
+ "epoch": 7.0,
2456
+ "grad_norm": 11.93620491027832,
2457
+ "learning_rate": 5.042381521605473e-06,
2458
+ "loss": 0.391,
2459
+ "step": 3500
2460
+ },
2461
+ {
2462
+ "epoch": 7.02,
2463
+ "grad_norm": 4.725837707519531,
2464
+ "learning_rate": 4.981872985841115e-06,
2465
+ "loss": 0.38,
2466
+ "step": 3510
2467
+ },
2468
+ {
2469
+ "epoch": 7.04,
2470
+ "grad_norm": 1.6126409769058228,
2471
+ "learning_rate": 4.921609026507907e-06,
2472
+ "loss": 0.3478,
2473
+ "step": 3520
2474
+ },
2475
+ {
2476
+ "epoch": 7.06,
2477
+ "grad_norm": 2.6732842922210693,
2478
+ "learning_rate": 4.861592580786205e-06,
2479
+ "loss": 0.3712,
2480
+ "step": 3530
2481
+ },
2482
+ {
2483
+ "epoch": 7.08,
2484
+ "grad_norm": 2.0151851177215576,
2485
+ "learning_rate": 4.801826573792905e-06,
2486
+ "loss": 0.3801,
2487
+ "step": 3540
2488
+ },
2489
+ {
2490
+ "epoch": 7.1,
2491
+ "grad_norm": 1.151302456855774,
2492
+ "learning_rate": 4.7423139184388725e-06,
2493
+ "loss": 0.3881,
2494
+ "step": 3550
2495
+ },
2496
+ {
2497
+ "epoch": 7.12,
2498
+ "grad_norm": 3.5267462730407715,
2499
+ "learning_rate": 4.6830575152869615e-06,
2500
+ "loss": 0.3489,
2501
+ "step": 3560
2502
+ },
2503
+ {
2504
+ "epoch": 7.14,
2505
+ "grad_norm": 1.8429639339447021,
2506
+ "learning_rate": 4.62406025241067e-06,
2507
+ "loss": 0.4284,
2508
+ "step": 3570
2509
+ },
2510
+ {
2511
+ "epoch": 7.16,
2512
+ "grad_norm": 4.2320051193237305,
2513
+ "learning_rate": 4.565325005253356e-06,
2514
+ "loss": 0.4055,
2515
+ "step": 3580
2516
+ },
2517
+ {
2518
+ "epoch": 7.18,
2519
+ "grad_norm": 2.368800163269043,
2520
+ "learning_rate": 4.506854636488103e-06,
2521
+ "loss": 0.3627,
2522
+ "step": 3590
2523
+ },
2524
+ {
2525
+ "epoch": 7.2,
2526
+ "grad_norm": 7.869661331176758,
2527
+ "learning_rate": 4.44865199587819e-06,
2528
+ "loss": 0.3866,
2529
+ "step": 3600
2530
+ },
2531
+ {
2532
+ "epoch": 7.22,
2533
+ "grad_norm": 1.2418557405471802,
2534
+ "learning_rate": 4.39071992013822e-06,
2535
+ "loss": 0.3947,
2536
+ "step": 3610
2537
+ },
2538
+ {
2539
+ "epoch": 7.24,
2540
+ "grad_norm": 1.7556277513504028,
2541
+ "learning_rate": 4.3330612327958265e-06,
2542
+ "loss": 0.4266,
2543
+ "step": 3620
2544
+ },
2545
+ {
2546
+ "epoch": 7.26,
2547
+ "grad_norm": 4.239712238311768,
2548
+ "learning_rate": 4.275678744054094e-06,
2549
+ "loss": 0.3495,
2550
+ "step": 3630
2551
+ },
2552
+ {
2553
+ "epoch": 7.28,
2554
+ "grad_norm": 2.917245626449585,
2555
+ "learning_rate": 4.218575250654559e-06,
2556
+ "loss": 0.4153,
2557
+ "step": 3640
2558
+ },
2559
+ {
2560
+ "epoch": 7.3,
2561
+ "grad_norm": 1.490869402885437,
2562
+ "learning_rate": 4.161753535740932e-06,
2563
+ "loss": 0.3819,
2564
+ "step": 3650
2565
+ },
2566
+ {
2567
+ "epoch": 7.32,
2568
+ "grad_norm": 1.5143734216690063,
2569
+ "learning_rate": 4.105216368723437e-06,
2570
+ "loss": 0.4032,
2571
+ "step": 3660
2572
+ },
2573
+ {
2574
+ "epoch": 7.34,
2575
+ "grad_norm": 3.434727907180786,
2576
+ "learning_rate": 4.048966505143831e-06,
2577
+ "loss": 0.358,
2578
+ "step": 3670
2579
+ },
2580
+ {
2581
+ "epoch": 7.36,
2582
+ "grad_norm": 1.666413426399231,
2583
+ "learning_rate": 3.993006686541108e-06,
2584
+ "loss": 0.4101,
2585
+ "step": 3680
2586
+ },
2587
+ {
2588
+ "epoch": 7.38,
2589
+ "grad_norm": 2.142817974090576,
2590
+ "learning_rate": 3.937339640317879e-06,
2591
+ "loss": 0.3803,
2592
+ "step": 3690
2593
+ },
2594
+ {
2595
+ "epoch": 7.4,
2596
+ "grad_norm": 0.9919471740722656,
2597
+ "learning_rate": 3.88196807960744e-06,
2598
+ "loss": 0.3844,
2599
+ "step": 3700
2600
+ },
2601
+ {
2602
+ "epoch": 7.42,
2603
+ "grad_norm": 2.370820999145508,
2604
+ "learning_rate": 3.826894703141552e-06,
2605
+ "loss": 0.3536,
2606
+ "step": 3710
2607
+ },
2608
+ {
2609
+ "epoch": 7.44,
2610
+ "grad_norm": 1.761391520500183,
2611
+ "learning_rate": 3.772122195118877e-06,
2612
+ "loss": 0.3957,
2613
+ "step": 3720
2614
+ },
2615
+ {
2616
+ "epoch": 7.46,
2617
+ "grad_norm": 1.3135240077972412,
2618
+ "learning_rate": 3.7176532250741857e-06,
2619
+ "loss": 0.4308,
2620
+ "step": 3730
2621
+ },
2622
+ {
2623
+ "epoch": 7.48,
2624
+ "grad_norm": 7.84911584854126,
2625
+ "learning_rate": 3.663490447748236e-06,
2626
+ "loss": 0.3988,
2627
+ "step": 3740
2628
+ },
2629
+ {
2630
+ "epoch": 7.5,
2631
+ "grad_norm": 2.369114875793457,
2632
+ "learning_rate": 3.6096365029583803e-06,
2633
+ "loss": 0.3983,
2634
+ "step": 3750
2635
+ },
2636
+ {
2637
+ "epoch": 7.52,
2638
+ "grad_norm": 3.6399729251861572,
2639
+ "learning_rate": 3.5560940154699133e-06,
2640
+ "loss": 0.37,
2641
+ "step": 3760
2642
+ },
2643
+ {
2644
+ "epoch": 7.54,
2645
+ "grad_norm": 3.580399751663208,
2646
+ "learning_rate": 3.502865594868136e-06,
2647
+ "loss": 0.3645,
2648
+ "step": 3770
2649
+ },
2650
+ {
2651
+ "epoch": 7.5600000000000005,
2652
+ "grad_norm": 3.4667141437530518,
2653
+ "learning_rate": 3.4499538354311757e-06,
2654
+ "loss": 0.4179,
2655
+ "step": 3780
2656
+ },
2657
+ {
2658
+ "epoch": 7.58,
2659
+ "grad_norm": 2.440298318862915,
2660
+ "learning_rate": 3.397361316003539e-06,
2661
+ "loss": 0.324,
2662
+ "step": 3790
2663
+ },
2664
+ {
2665
+ "epoch": 7.6,
2666
+ "grad_norm": 2.0638418197631836,
2667
+ "learning_rate": 3.3450905998704274e-06,
2668
+ "loss": 0.3789,
2669
+ "step": 3800
2670
+ },
2671
+ {
2672
+ "epoch": 7.62,
2673
+ "grad_norm": 2.4778010845184326,
2674
+ "learning_rate": 3.2931442346328e-06,
2675
+ "loss": 0.3608,
2676
+ "step": 3810
2677
+ },
2678
+ {
2679
+ "epoch": 7.64,
2680
+ "grad_norm": 2.4514052867889404,
2681
+ "learning_rate": 3.241524752083215e-06,
2682
+ "loss": 0.3985,
2683
+ "step": 3820
2684
+ },
2685
+ {
2686
+ "epoch": 7.66,
2687
+ "grad_norm": 1.3875998258590698,
2688
+ "learning_rate": 3.190234668082427e-06,
2689
+ "loss": 0.3447,
2690
+ "step": 3830
2691
+ },
2692
+ {
2693
+ "epoch": 7.68,
2694
+ "grad_norm": 1.2562263011932373,
2695
+ "learning_rate": 3.1392764824367706e-06,
2696
+ "loss": 0.3426,
2697
+ "step": 3840
2698
+ },
2699
+ {
2700
+ "epoch": 7.7,
2701
+ "grad_norm": 1.41866135597229,
2702
+ "learning_rate": 3.0886526787763237e-06,
2703
+ "loss": 0.3576,
2704
+ "step": 3850
2705
+ },
2706
+ {
2707
+ "epoch": 7.72,
2708
+ "grad_norm": 3.5626509189605713,
2709
+ "learning_rate": 3.038365724433858e-06,
2710
+ "loss": 0.3928,
2711
+ "step": 3860
2712
+ },
2713
+ {
2714
+ "epoch": 7.74,
2715
+ "grad_norm": 5.021074295043945,
2716
+ "learning_rate": 2.988418070324577e-06,
2717
+ "loss": 0.3589,
2718
+ "step": 3870
2719
+ },
2720
+ {
2721
+ "epoch": 7.76,
2722
+ "grad_norm": 1.392564058303833,
2723
+ "learning_rate": 2.938812150826684e-06,
2724
+ "loss": 0.3851,
2725
+ "step": 3880
2726
+ },
2727
+ {
2728
+ "epoch": 7.78,
2729
+ "grad_norm": 3.992396116256714,
2730
+ "learning_rate": 2.8895503836627105e-06,
2731
+ "loss": 0.3688,
2732
+ "step": 3890
2733
+ },
2734
+ {
2735
+ "epoch": 7.8,
2736
+ "grad_norm": 2.653534173965454,
2737
+ "learning_rate": 2.840635169781688e-06,
2738
+ "loss": 0.3585,
2739
+ "step": 3900
2740
+ },
2741
+ {
2742
+ "epoch": 7.82,
2743
+ "grad_norm": 6.615116596221924,
2744
+ "learning_rate": 2.7920688932421337e-06,
2745
+ "loss": 0.3653,
2746
+ "step": 3910
2747
+ },
2748
+ {
2749
+ "epoch": 7.84,
2750
+ "grad_norm": 5.369716644287109,
2751
+ "learning_rate": 2.7438539210958483e-06,
2752
+ "loss": 0.3512,
2753
+ "step": 3920
2754
+ },
2755
+ {
2756
+ "epoch": 7.86,
2757
+ "grad_norm": 1.679790735244751,
2758
+ "learning_rate": 2.6959926032725537e-06,
2759
+ "loss": 0.3717,
2760
+ "step": 3930
2761
+ },
2762
+ {
2763
+ "epoch": 7.88,
2764
+ "grad_norm": 2.233903646469116,
2765
+ "learning_rate": 2.648487272465361e-06,
2766
+ "loss": 0.3806,
2767
+ "step": 3940
2768
+ },
2769
+ {
2770
+ "epoch": 7.9,
2771
+ "grad_norm": 3.0028107166290283,
2772
+ "learning_rate": 2.6013402440170676e-06,
2773
+ "loss": 0.3993,
2774
+ "step": 3950
2775
+ },
2776
+ {
2777
+ "epoch": 7.92,
2778
+ "grad_norm": 3.600489854812622,
2779
+ "learning_rate": 2.5545538158073278e-06,
2780
+ "loss": 0.3387,
2781
+ "step": 3960
2782
+ },
2783
+ {
2784
+ "epoch": 7.9399999999999995,
2785
+ "grad_norm": 7.545295715332031,
2786
+ "learning_rate": 2.512756228659141e-06,
2787
+ "loss": 0.37,
2788
+ "step": 3970
2789
+ },
2790
+ {
2791
+ "epoch": 7.96,
2792
+ "grad_norm": 2.566960573196411,
2793
+ "learning_rate": 2.4666612085261344e-06,
2794
+ "loss": 0.3967,
2795
+ "step": 3980
2796
+ },
2797
+ {
2798
+ "epoch": 7.98,
2799
+ "grad_norm": 2.3997247219085693,
2800
+ "learning_rate": 2.420933352697865e-06,
2801
+ "loss": 0.4029,
2802
+ "step": 3990
2803
+ },
2804
+ {
2805
+ "epoch": 8.0,
2806
+ "grad_norm": 2.916670560836792,
2807
+ "learning_rate": 2.37557488988552e-06,
2808
+ "loss": 0.3713,
2809
+ "step": 4000
2810
+ },
2811
+ {
2812
+ "epoch": 8.02,
2813
+ "grad_norm": 1.7952624559402466,
2814
+ "learning_rate": 2.3305880307965834e-06,
2815
+ "loss": 0.3232,
2816
+ "step": 4010
2817
+ },
2818
+ {
2819
+ "epoch": 8.04,
2820
+ "grad_norm": 1.91434645652771,
2821
+ "learning_rate": 2.2859749680270983e-06,
2822
+ "loss": 0.331,
2823
+ "step": 4020
2824
+ },
2825
+ {
2826
+ "epoch": 8.06,
2827
+ "grad_norm": 3.671706438064575,
2828
+ "learning_rate": 2.241737875954808e-06,
2829
+ "loss": 0.3818,
2830
+ "step": 4030
2831
+ },
2832
+ {
2833
+ "epoch": 8.08,
2834
+ "grad_norm": 1.5308889150619507,
2835
+ "learning_rate": 2.1978789106331666e-06,
2836
+ "loss": 0.3482,
2837
+ "step": 4040
2838
+ },
2839
+ {
2840
+ "epoch": 8.1,
2841
+ "grad_norm": 1.8674166202545166,
2842
+ "learning_rate": 2.154400209686268e-06,
2843
+ "loss": 0.3195,
2844
+ "step": 4050
2845
+ },
2846
+ {
2847
+ "epoch": 8.12,
2848
+ "grad_norm": 1.5842407941818237,
2849
+ "learning_rate": 2.1113038922046603e-06,
2850
+ "loss": 0.3557,
2851
+ "step": 4060
2852
+ },
2853
+ {
2854
+ "epoch": 8.14,
2855
+ "grad_norm": 2.2769813537597656,
2856
+ "learning_rate": 2.0685920586420562e-06,
2857
+ "loss": 0.2853,
2858
+ "step": 4070
2859
+ },
2860
+ {
2861
+ "epoch": 8.16,
2862
+ "grad_norm": 1.7789726257324219,
2863
+ "learning_rate": 2.026266790712965e-06,
2864
+ "loss": 0.316,
2865
+ "step": 4080
2866
+ },
2867
+ {
2868
+ "epoch": 8.18,
2869
+ "grad_norm": 2.544579029083252,
2870
+ "learning_rate": 1.984330151291233e-06,
2871
+ "loss": 0.3328,
2872
+ "step": 4090
2873
+ },
2874
+ {
2875
+ "epoch": 8.2,
2876
+ "grad_norm": 5.644877910614014,
2877
+ "learning_rate": 1.9427841843095063e-06,
2878
+ "loss": 0.3338,
2879
+ "step": 4100
2880
+ },
2881
+ {
2882
+ "epoch": 8.22,
2883
+ "grad_norm": 2.9125707149505615,
2884
+ "learning_rate": 1.9016309146596024e-06,
2885
+ "loss": 0.3226,
2886
+ "step": 4110
2887
+ },
2888
+ {
2889
+ "epoch": 8.24,
2890
+ "grad_norm": 2.9386703968048096,
2891
+ "learning_rate": 1.8608723480938207e-06,
2892
+ "loss": 0.3147,
2893
+ "step": 4120
2894
+ },
2895
+ {
2896
+ "epoch": 8.26,
2897
+ "grad_norm": 5.057535648345947,
2898
+ "learning_rate": 1.820510471127196e-06,
2899
+ "loss": 0.3549,
2900
+ "step": 4130
2901
+ },
2902
+ {
2903
+ "epoch": 8.28,
2904
+ "grad_norm": 1.1568169593811035,
2905
+ "learning_rate": 1.7805472509406695e-06,
2906
+ "loss": 0.3701,
2907
+ "step": 4140
2908
+ },
2909
+ {
2910
+ "epoch": 8.3,
2911
+ "grad_norm": 2.978498697280884,
2912
+ "learning_rate": 1.7409846352852144e-06,
2913
+ "loss": 0.341,
2914
+ "step": 4150
2915
+ },
2916
+ {
2917
+ "epoch": 8.32,
2918
+ "grad_norm": 1.8623732328414917,
2919
+ "learning_rate": 1.7018245523869038e-06,
2920
+ "loss": 0.2754,
2921
+ "step": 4160
2922
+ },
2923
+ {
2924
+ "epoch": 8.34,
2925
+ "grad_norm": 3.46683406829834,
2926
+ "learning_rate": 1.6630689108529286e-06,
2927
+ "loss": 0.3958,
2928
+ "step": 4170
2929
+ },
2930
+ {
2931
+ "epoch": 8.36,
2932
+ "grad_norm": 5.015219211578369,
2933
+ "learning_rate": 1.6247195995785836e-06,
2934
+ "loss": 0.3512,
2935
+ "step": 4180
2936
+ },
2937
+ {
2938
+ "epoch": 8.38,
2939
+ "grad_norm": 1.5242810249328613,
2940
+ "learning_rate": 1.5867784876551973e-06,
2941
+ "loss": 0.3533,
2942
+ "step": 4190
2943
+ },
2944
+ {
2945
+ "epoch": 8.4,
2946
+ "grad_norm": 4.693676948547363,
2947
+ "learning_rate": 1.5492474242790368e-06,
2948
+ "loss": 0.3746,
2949
+ "step": 4200
2950
+ },
2951
+ {
2952
+ "epoch": 8.42,
2953
+ "grad_norm": 2.436262845993042,
2954
+ "learning_rate": 1.5121282386611823e-06,
2955
+ "loss": 0.3274,
2956
+ "step": 4210
2957
+ },
2958
+ {
2959
+ "epoch": 8.44,
2960
+ "grad_norm": 2.2660608291625977,
2961
+ "learning_rate": 1.4754227399383758e-06,
2962
+ "loss": 0.3055,
2963
+ "step": 4220
2964
+ },
2965
+ {
2966
+ "epoch": 8.46,
2967
+ "grad_norm": 2.7948834896087646,
2968
+ "learning_rate": 1.439132717084839e-06,
2969
+ "loss": 0.3078,
2970
+ "step": 4230
2971
+ },
2972
+ {
2973
+ "epoch": 8.48,
2974
+ "grad_norm": 1.3765865564346313,
2975
+ "learning_rate": 1.40325993882509e-06,
2976
+ "loss": 0.3194,
2977
+ "step": 4240
2978
+ },
2979
+ {
2980
+ "epoch": 8.5,
2981
+ "grad_norm": 1.2223212718963623,
2982
+ "learning_rate": 1.3678061535477305e-06,
2983
+ "loss": 0.352,
2984
+ "step": 4250
2985
+ },
2986
+ {
2987
+ "epoch": 8.52,
2988
+ "grad_norm": 2.556001663208008,
2989
+ "learning_rate": 1.3327730892202384e-06,
2990
+ "loss": 0.3061,
2991
+ "step": 4260
2992
+ },
2993
+ {
2994
+ "epoch": 8.54,
2995
+ "grad_norm": 4.0893168449401855,
2996
+ "learning_rate": 1.2981624533047432e-06,
2997
+ "loss": 0.406,
2998
+ "step": 4270
2999
+ },
3000
+ {
3001
+ "epoch": 8.56,
3002
+ "grad_norm": 1.8102929592132568,
3003
+ "learning_rate": 1.2639759326748136e-06,
3004
+ "loss": 0.3335,
3005
+ "step": 4280
3006
+ },
3007
+ {
3008
+ "epoch": 8.58,
3009
+ "grad_norm": 0.6934239268302917,
3010
+ "learning_rate": 1.230215193533233e-06,
3011
+ "loss": 0.4048,
3012
+ "step": 4290
3013
+ },
3014
+ {
3015
+ "epoch": 8.6,
3016
+ "grad_norm": 2.5495901107788086,
3017
+ "learning_rate": 1.196881881330798e-06,
3018
+ "loss": 0.3388,
3019
+ "step": 4300
3020
+ },
3021
+ {
3022
+ "epoch": 8.62,
3023
+ "grad_norm": 2.681366443634033,
3024
+ "learning_rate": 1.1639776206861197e-06,
3025
+ "loss": 0.358,
3026
+ "step": 4310
3027
+ },
3028
+ {
3029
+ "epoch": 8.64,
3030
+ "grad_norm": 1.624990463256836,
3031
+ "learning_rate": 1.1315040153064416e-06,
3032
+ "loss": 0.3628,
3033
+ "step": 4320
3034
+ },
3035
+ {
3036
+ "epoch": 8.66,
3037
+ "grad_norm": 7.331467151641846,
3038
+ "learning_rate": 1.0994626479094749e-06,
3039
+ "loss": 0.3585,
3040
+ "step": 4330
3041
+ },
3042
+ {
3043
+ "epoch": 8.68,
3044
+ "grad_norm": 1.2213658094406128,
3045
+ "learning_rate": 1.0678550801462662e-06,
3046
+ "loss": 0.3583,
3047
+ "step": 4340
3048
+ },
3049
+ {
3050
+ "epoch": 8.7,
3051
+ "grad_norm": 2.539713144302368,
3052
+ "learning_rate": 1.0366828525250728e-06,
3053
+ "loss": 0.2861,
3054
+ "step": 4350
3055
+ },
3056
+ {
3057
+ "epoch": 8.72,
3058
+ "grad_norm": 4.281270980834961,
3059
+ "learning_rate": 1.0059474843362893e-06,
3060
+ "loss": 0.3422,
3061
+ "step": 4360
3062
+ },
3063
+ {
3064
+ "epoch": 8.74,
3065
+ "grad_norm": 1.515568733215332,
3066
+ "learning_rate": 9.756504735784067e-07,
3067
+ "loss": 0.3337,
3068
+ "step": 4370
3069
+ },
3070
+ {
3071
+ "epoch": 8.76,
3072
+ "grad_norm": 1.730093240737915,
3073
+ "learning_rate": 9.457932968849826e-07,
3074
+ "loss": 0.3163,
3075
+ "step": 4380
3076
+ },
3077
+ {
3078
+ "epoch": 8.78,
3079
+ "grad_norm": 4.305525302886963,
3080
+ "learning_rate": 9.16377409452689e-07,
3081
+ "loss": 0.3132,
3082
+ "step": 4390
3083
+ },
3084
+ {
3085
+ "epoch": 8.8,
3086
+ "grad_norm": 1.6857116222381592,
3087
+ "learning_rate": 8.874042449703779e-07,
3088
+ "loss": 0.3108,
3089
+ "step": 4400
3090
+ },
3091
+ {
3092
+ "epoch": 8.82,
3093
+ "grad_norm": 1.7638370990753174,
3094
+ "learning_rate": 8.58875215549212e-07,
3095
+ "loss": 0.3444,
3096
+ "step": 4410
3097
+ },
3098
+ {
3099
+ "epoch": 8.84,
3100
+ "grad_norm": 2.9410483837127686,
3101
+ "learning_rate": 8.307917116538378e-07,
3102
+ "loss": 0.3582,
3103
+ "step": 4420
3104
+ },
3105
+ {
3106
+ "epoch": 8.86,
3107
+ "grad_norm": 1.4133245944976807,
3108
+ "learning_rate": 8.031551020346129e-07,
3109
+ "loss": 0.3014,
3110
+ "step": 4430
3111
+ },
3112
+ {
3113
+ "epoch": 8.88,
3114
+ "grad_norm": 2.466925621032715,
3115
+ "learning_rate": 7.759667336609011e-07,
3116
+ "loss": 0.3578,
3117
+ "step": 4440
3118
+ },
3119
+ {
3120
+ "epoch": 8.9,
3121
+ "grad_norm": 1.979108214378357,
3122
+ "learning_rate": 7.492279316554207e-07,
3123
+ "loss": 0.3253,
3124
+ "step": 4450
3125
+ },
3126
+ {
3127
+ "epoch": 8.92,
3128
+ "grad_norm": 1.6241674423217773,
3129
+ "learning_rate": 7.22939999229657e-07,
3130
+ "loss": 0.3839,
3131
+ "step": 4460
3132
+ },
3133
+ {
3134
+ "epoch": 8.94,
3135
+ "grad_norm": 2.58152174949646,
3136
+ "learning_rate": 6.971042176203535e-07,
3137
+ "loss": 0.268,
3138
+ "step": 4470
3139
+ },
3140
+ {
3141
+ "epoch": 8.96,
3142
+ "grad_norm": 2.5680618286132812,
3143
+ "learning_rate": 6.717218460270536e-07,
3144
+ "loss": 0.332,
3145
+ "step": 4480
3146
+ },
3147
+ {
3148
+ "epoch": 8.98,
3149
+ "grad_norm": 2.1524252891540527,
3150
+ "learning_rate": 6.467941215507434e-07,
3151
+ "loss": 0.361,
3152
+ "step": 4490
3153
+ },
3154
+ {
3155
+ "epoch": 9.0,
3156
+ "grad_norm": 1.6696678400039673,
3157
+ "learning_rate": 6.223222591335409e-07,
3158
+ "loss": 0.3358,
3159
+ "step": 4500
3160
+ },
3161
+ {
3162
+ "epoch": 9.02,
3163
+ "grad_norm": 1.1763229370117188,
3164
+ "learning_rate": 5.98307451499498e-07,
3165
+ "loss": 0.2874,
3166
+ "step": 4510
3167
+ },
3168
+ {
3169
+ "epoch": 9.04,
3170
+ "grad_norm": 1.381579041481018,
3171
+ "learning_rate": 5.747508690964599e-07,
3172
+ "loss": 0.361,
3173
+ "step": 4520
3174
+ },
3175
+ {
3176
+ "epoch": 9.06,
3177
+ "grad_norm": 1.560950756072998,
3178
+ "learning_rate": 5.516536600390188e-07,
3179
+ "loss": 0.2929,
3180
+ "step": 4530
3181
+ },
3182
+ {
3183
+ "epoch": 9.08,
3184
+ "grad_norm": 2.703350782394409,
3185
+ "learning_rate": 5.290169500525577e-07,
3186
+ "loss": 0.2854,
3187
+ "step": 4540
3188
+ },
3189
+ {
3190
+ "epoch": 9.1,
3191
+ "grad_norm": 1.5632970333099365,
3192
+ "learning_rate": 5.068418424183874e-07,
3193
+ "loss": 0.3173,
3194
+ "step": 4550
3195
+ },
3196
+ {
3197
+ "epoch": 9.12,
3198
+ "grad_norm": 1.8101422786712646,
3199
+ "learning_rate": 4.851294179199673e-07,
3200
+ "loss": 0.3683,
3201
+ "step": 4560
3202
+ },
3203
+ {
3204
+ "epoch": 9.14,
3205
+ "grad_norm": 1.0653437376022339,
3206
+ "learning_rate": 4.638807347902408e-07,
3207
+ "loss": 0.3256,
3208
+ "step": 4570
3209
+ },
3210
+ {
3211
+ "epoch": 9.16,
3212
+ "grad_norm": 2.522818088531494,
3213
+ "learning_rate": 4.4309682866004124e-07,
3214
+ "loss": 0.319,
3215
+ "step": 4580
3216
+ },
3217
+ {
3218
+ "epoch": 9.18,
3219
+ "grad_norm": 3.289451837539673,
3220
+ "learning_rate": 4.2277871250763327e-07,
3221
+ "loss": 0.3221,
3222
+ "step": 4590
3223
+ },
3224
+ {
3225
+ "epoch": 9.2,
3226
+ "grad_norm": 2.0382297039031982,
3227
+ "learning_rate": 4.0292737660933335e-07,
3228
+ "loss": 0.2951,
3229
+ "step": 4600
3230
+ },
3231
+ {
3232
+ "epoch": 9.22,
3233
+ "grad_norm": 2.1435165405273438,
3234
+ "learning_rate": 3.835437884912474e-07,
3235
+ "loss": 0.3738,
3236
+ "step": 4610
3237
+ },
3238
+ {
3239
+ "epoch": 9.24,
3240
+ "grad_norm": 1.6461173295974731,
3241
+ "learning_rate": 3.646288928821151e-07,
3242
+ "loss": 0.2898,
3243
+ "step": 4620
3244
+ },
3245
+ {
3246
+ "epoch": 9.26,
3247
+ "grad_norm": 2.4130804538726807,
3248
+ "learning_rate": 3.4618361166726123e-07,
3249
+ "loss": 0.3792,
3250
+ "step": 4630
3251
+ },
3252
+ {
3253
+ "epoch": 9.28,
3254
+ "grad_norm": 2.4446017742156982,
3255
+ "learning_rate": 3.282088438436715e-07,
3256
+ "loss": 0.3424,
3257
+ "step": 4640
3258
+ },
3259
+ {
3260
+ "epoch": 9.3,
3261
+ "grad_norm": 1.3320859670639038,
3262
+ "learning_rate": 3.10705465476171e-07,
3263
+ "loss": 0.358,
3264
+ "step": 4650
3265
+ },
3266
+ {
3267
+ "epoch": 9.32,
3268
+ "grad_norm": 3.8511667251586914,
3269
+ "learning_rate": 2.936743296547273e-07,
3270
+ "loss": 0.32,
3271
+ "step": 4660
3272
+ },
3273
+ {
3274
+ "epoch": 9.34,
3275
+ "grad_norm": 5.630286693572998,
3276
+ "learning_rate": 2.771162664528726e-07,
3277
+ "loss": 0.3079,
3278
+ "step": 4670
3279
+ },
3280
+ {
3281
+ "epoch": 9.36,
3282
+ "grad_norm": 1.600219964981079,
3283
+ "learning_rate": 2.6103208288724815e-07,
3284
+ "loss": 0.2834,
3285
+ "step": 4680
3286
+ },
3287
+ {
3288
+ "epoch": 9.38,
3289
+ "grad_norm": 1.0820380449295044,
3290
+ "learning_rate": 2.4542256287826915e-07,
3291
+ "loss": 0.354,
3292
+ "step": 4690
3293
+ },
3294
+ {
3295
+ "epoch": 9.4,
3296
+ "grad_norm": 1.4870035648345947,
3297
+ "learning_rate": 2.3028846721191878e-07,
3298
+ "loss": 0.3243,
3299
+ "step": 4700
3300
+ },
3301
+ {
3302
+ "epoch": 9.42,
3303
+ "grad_norm": 3.148569107055664,
3304
+ "learning_rate": 2.1563053350266983e-07,
3305
+ "loss": 0.3121,
3306
+ "step": 4710
3307
+ },
3308
+ {
3309
+ "epoch": 9.44,
3310
+ "grad_norm": 1.8829195499420166,
3311
+ "learning_rate": 2.014494761575314e-07,
3312
+ "loss": 0.3142,
3313
+ "step": 4720
3314
+ },
3315
+ {
3316
+ "epoch": 9.46,
3317
+ "grad_norm": 3.1038215160369873,
3318
+ "learning_rate": 1.877459863412323e-07,
3319
+ "loss": 0.3287,
3320
+ "step": 4730
3321
+ },
3322
+ {
3323
+ "epoch": 9.48,
3324
+ "grad_norm": 1.9286001920700073,
3325
+ "learning_rate": 1.7452073194253237e-07,
3326
+ "loss": 0.2989,
3327
+ "step": 4740
3328
+ },
3329
+ {
3330
+ "epoch": 9.5,
3331
+ "grad_norm": 2.0495471954345703,
3332
+ "learning_rate": 1.6177435754167413e-07,
3333
+ "loss": 0.3632,
3334
+ "step": 4750
3335
+ },
3336
+ {
3337
+ "epoch": 9.52,
3338
+ "grad_norm": 2.1833696365356445,
3339
+ "learning_rate": 1.4950748437896235e-07,
3340
+ "loss": 0.265,
3341
+ "step": 4760
3342
+ },
3343
+ {
3344
+ "epoch": 9.54,
3345
+ "grad_norm": 3.15493106842041,
3346
+ "learning_rate": 1.377207103244904e-07,
3347
+ "loss": 0.283,
3348
+ "step": 4770
3349
+ },
3350
+ {
3351
+ "epoch": 9.56,
3352
+ "grad_norm": 1.2273836135864258,
3353
+ "learning_rate": 1.26414609848996e-07,
3354
+ "loss": 0.2264,
3355
+ "step": 4780
3356
+ },
3357
+ {
3358
+ "epoch": 9.58,
3359
+ "grad_norm": 1.6316149234771729,
3360
+ "learning_rate": 1.1558973399586671e-07,
3361
+ "loss": 0.3198,
3362
+ "step": 4790
3363
+ },
3364
+ {
3365
+ "epoch": 9.6,
3366
+ "grad_norm": 2.2700629234313965,
3367
+ "learning_rate": 1.052466103542793e-07,
3368
+ "loss": 0.2258,
3369
+ "step": 4800
3370
+ },
3371
+ {
3372
+ "epoch": 9.62,
3373
+ "grad_norm": 1.132501244544983,
3374
+ "learning_rate": 9.538574303348813e-08,
3375
+ "loss": 0.3053,
3376
+ "step": 4810
3377
+ },
3378
+ {
3379
+ "epoch": 9.64,
3380
+ "grad_norm": 1.9846259355545044,
3381
+ "learning_rate": 8.600761263825475e-08,
3382
+ "loss": 0.278,
3383
+ "step": 4820
3384
+ },
3385
+ {
3386
+ "epoch": 9.66,
3387
+ "grad_norm": 1.3777711391448975,
3388
+ "learning_rate": 7.71126762454233e-08,
3389
+ "loss": 0.3211,
3390
+ "step": 4830
3391
+ },
3392
+ {
3393
+ "epoch": 9.68,
3394
+ "grad_norm": 1.355865240097046,
3395
+ "learning_rate": 6.870136738164612e-08,
3396
+ "loss": 0.3079,
3397
+ "step": 4840
3398
+ },
3399
+ {
3400
+ "epoch": 9.7,
3401
+ "grad_norm": 2.8824238777160645,
3402
+ "learning_rate": 6.07740960022507e-08,
3403
+ "loss": 0.3717,
3404
+ "step": 4850
3405
+ },
3406
+ {
3407
+ "epoch": 9.72,
3408
+ "grad_norm": 2.4788002967834473,
3409
+ "learning_rate": 5.3331248471258926e-08,
3410
+ "loss": 0.3052,
3411
+ "step": 4860
3412
+ },
3413
+ {
3414
+ "epoch": 9.74,
3415
+ "grad_norm": 1.3350954055786133,
3416
+ "learning_rate": 4.6373187542561036e-08,
3417
+ "loss": 0.3018,
3418
+ "step": 4870
3419
+ },
3420
+ {
3421
+ "epoch": 9.76,
3422
+ "grad_norm": 1.485877513885498,
3423
+ "learning_rate": 3.990025234222872e-08,
3424
+ "loss": 0.2694,
3425
+ "step": 4880
3426
+ },
3427
+ {
3428
+ "epoch": 9.78,
3429
+ "grad_norm": 2.1320180892944336,
3430
+ "learning_rate": 3.391275835199159e-08,
3431
+ "loss": 0.323,
3432
+ "step": 4890
3433
+ },
3434
+ {
3435
+ "epoch": 9.8,
3436
+ "grad_norm": 1.2034118175506592,
3437
+ "learning_rate": 2.8410997393860663e-08,
3438
+ "loss": 0.302,
3439
+ "step": 4900
3440
+ },
3441
+ {
3442
+ "epoch": 9.82,
3443
+ "grad_norm": 1.9911288022994995,
3444
+ "learning_rate": 2.339523761590301e-08,
3445
+ "loss": 0.3561,
3446
+ "step": 4910
3447
+ },
3448
+ {
3449
+ "epoch": 9.84,
3450
+ "grad_norm": 4.588063716888428,
3451
+ "learning_rate": 1.886572347917337e-08,
3452
+ "loss": 0.3486,
3453
+ "step": 4920
3454
+ },
3455
+ {
3456
+ "epoch": 9.86,
3457
+ "grad_norm": 1.2661594152450562,
3458
+ "learning_rate": 1.482267574580143e-08,
3459
+ "loss": 0.3651,
3460
+ "step": 4930
3461
+ },
3462
+ {
3463
+ "epoch": 9.88,
3464
+ "grad_norm": 1.7865337133407593,
3465
+ "learning_rate": 1.126629146822933e-08,
3466
+ "loss": 0.2544,
3467
+ "step": 4940
3468
+ },
3469
+ {
3470
+ "epoch": 9.9,
3471
+ "grad_norm": 3.7908692359924316,
3472
+ "learning_rate": 8.196743979610455e-09,
3473
+ "loss": 0.2575,
3474
+ "step": 4950
3475
+ },
3476
+ {
3477
+ "epoch": 9.92,
3478
+ "grad_norm": 2.3560678958892822,
3479
+ "learning_rate": 5.614182885357311e-09,
3480
+ "loss": 0.2833,
3481
+ "step": 4960
3482
+ },
3483
+ {
3484
+ "epoch": 9.94,
3485
+ "grad_norm": 1.629074215888977,
3486
+ "learning_rate": 3.518734055855122e-09,
3487
+ "loss": 0.3345,
3488
+ "step": 4970
3489
+ },
3490
+ {
3491
+ "epoch": 9.96,
3492
+ "grad_norm": 2.738832473754883,
3493
+ "learning_rate": 1.910499620322304e-09,
3494
+ "loss": 0.3034,
3495
+ "step": 4980
3496
+ },
3497
+ {
3498
+ "epoch": 9.98,
3499
+ "grad_norm": 1.9189926385879517,
3500
+ "learning_rate": 7.895579618388827e-10,
3501
+ "loss": 0.2543,
3502
+ "step": 4990
3503
+ },
3504
+ {
3505
+ "epoch": 10.0,
3506
+ "grad_norm": 1.597418189048767,
3507
+ "learning_rate": 1.559637135173375e-10,
3508
+ "loss": 0.3149,
3509
+ "step": 5000
3510
+ }
3511
+ ],
3512
+ "logging_steps": 10,
3513
+ "max_steps": 5000,
3514
+ "num_input_tokens_seen": 0,
3515
+ "num_train_epochs": 10,
3516
+ "save_steps": 100,
3517
+ "stateful_callbacks": {
3518
+ "TrainerControl": {
3519
+ "args": {
3520
+ "should_epoch_stop": false,
3521
+ "should_evaluate": false,
3522
+ "should_log": false,
3523
+ "should_save": true,
3524
+ "should_training_stop": true
3525
+ },
3526
+ "attributes": {}
3527
+ }
3528
+ },
3529
+ "total_flos": 1.1899956660640154e+17,
3530
+ "train_batch_size": 1,
3531
+ "trial_name": null,
3532
+ "trial_params": null
3533
+ }
checkpoint-5000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:692cb1fd6a6230812580978c9375ef6ab236f04422b096db6a0f3b485bb57b52
3
+ size 5688
checkpoint-5000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 12845056,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "longest_edge": 12845056,
26
+ "shortest_edge": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
runs/Mar02_19-17-49_a112c240632f/events.out.tfevents.1740943165.a112c240632f.132.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:807c12bdfd50817f5868aaa721c3b1391699c5c08b3b1b5f4f74256de9c28635
3
+ size 112229
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:091aa7594dc2fcfbfa06b9e3c22a5f0562ac14f30375c13af7309407a0e67b8a
3
+ size 11420371
tokenizer_config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|object_ref_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|object_ref_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|box_start|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_start|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_end|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|image_pad|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|video_pad|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ }
116
+ },
117
+ "additional_special_tokens": [
118
+ "<|im_start|>",
119
+ "<|im_end|>",
120
+ "<|object_ref_start|>",
121
+ "<|object_ref_end|>",
122
+ "<|box_start|>",
123
+ "<|box_end|>",
124
+ "<|quad_start|>",
125
+ "<|quad_end|>",
126
+ "<|vision_start|>",
127
+ "<|vision_end|>",
128
+ "<|vision_pad|>",
129
+ "<|image_pad|>",
130
+ "<|video_pad|>"
131
+ ],
132
+ "bos_token": null,
133
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
134
+ "clean_up_tokenization_spaces": false,
135
+ "eos_token": "<|im_end|>",
136
+ "errors": "replace",
137
+ "extra_special_tokens": {},
138
+ "model_max_length": 2048,
139
+ "pad_token": "<|endoftext|>",
140
+ "padding_side": "right",
141
+ "processor_class": "Qwen2VLProcessor",
142
+ "split_special_tokens": false,
143
+ "tokenizer_class": "Qwen2Tokenizer",
144
+ "unk_token": null
145
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 10.0,
3
+ "total_flos": 1.1899956660640154e+17,
4
+ "train_loss": 0.49240388979911803,
5
+ "train_runtime": 35927.6647,
6
+ "train_samples_per_second": 0.278,
7
+ "train_steps_per_second": 0.139
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 5000, "loss": 1.3992, "lr": 2.4000000000000003e-07, "epoch": 0.02, "percentage": 0.2, "elapsed_time": "0:01:03", "remaining_time": "8:51:52"}
2
+ {"current_steps": 20, "total_steps": 5000, "loss": 1.4497, "lr": 6.000000000000001e-07, "epoch": 0.04, "percentage": 0.4, "elapsed_time": "0:02:15", "remaining_time": "9:21:43"}
3
+ {"current_steps": 30, "total_steps": 5000, "loss": 1.2599, "lr": 1.0000000000000002e-06, "epoch": 0.06, "percentage": 0.6, "elapsed_time": "0:03:26", "remaining_time": "9:30:46"}
4
+ {"current_steps": 40, "total_steps": 5000, "loss": 1.4124, "lr": 1.4000000000000001e-06, "epoch": 0.08, "percentage": 0.8, "elapsed_time": "0:04:37", "remaining_time": "9:33:42"}
5
+ {"current_steps": 50, "total_steps": 5000, "loss": 1.3542, "lr": 1.8000000000000001e-06, "epoch": 0.1, "percentage": 1.0, "elapsed_time": "0:05:48", "remaining_time": "9:35:37"}
6
+ {"current_steps": 60, "total_steps": 5000, "loss": 1.2123, "lr": 2.2e-06, "epoch": 0.12, "percentage": 1.2, "elapsed_time": "0:07:00", "remaining_time": "9:36:27"}
7
+ {"current_steps": 70, "total_steps": 5000, "loss": 1.2098, "lr": 2.6e-06, "epoch": 0.14, "percentage": 1.4, "elapsed_time": "0:08:11", "remaining_time": "9:37:19"}
8
+ {"current_steps": 80, "total_steps": 5000, "loss": 1.2226, "lr": 3e-06, "epoch": 0.16, "percentage": 1.6, "elapsed_time": "0:09:24", "remaining_time": "9:38:53"}
9
+ {"current_steps": 90, "total_steps": 5000, "loss": 1.0624, "lr": 3.3600000000000004e-06, "epoch": 0.18, "percentage": 1.8, "elapsed_time": "0:10:36", "remaining_time": "9:39:02"}
10
+ {"current_steps": 100, "total_steps": 5000, "loss": 1.0969, "lr": 3.7600000000000004e-06, "epoch": 0.2, "percentage": 2.0, "elapsed_time": "0:11:49", "remaining_time": "9:39:23"}
11
+ {"current_steps": 110, "total_steps": 5000, "loss": 1.0879, "lr": 4.16e-06, "epoch": 0.22, "percentage": 2.2, "elapsed_time": "0:13:03", "remaining_time": "9:40:32"}
12
+ {"current_steps": 120, "total_steps": 5000, "loss": 1.0195, "lr": 4.56e-06, "epoch": 0.24, "percentage": 2.4, "elapsed_time": "0:14:15", "remaining_time": "9:39:40"}
13
+ {"current_steps": 130, "total_steps": 5000, "loss": 0.9765, "lr": 4.960000000000001e-06, "epoch": 0.26, "percentage": 2.6, "elapsed_time": "0:15:27", "remaining_time": "9:39:11"}
14
+ {"current_steps": 140, "total_steps": 5000, "loss": 1.0205, "lr": 5.36e-06, "epoch": 0.28, "percentage": 2.8, "elapsed_time": "0:16:40", "remaining_time": "9:38:36"}
15
+ {"current_steps": 150, "total_steps": 5000, "loss": 0.9517, "lr": 5.76e-06, "epoch": 0.3, "percentage": 3.0, "elapsed_time": "0:17:51", "remaining_time": "9:37:25"}
16
+ {"current_steps": 160, "total_steps": 5000, "loss": 0.9324, "lr": 6.16e-06, "epoch": 0.32, "percentage": 3.2, "elapsed_time": "0:19:02", "remaining_time": "9:35:59"}
17
+ {"current_steps": 170, "total_steps": 5000, "loss": 0.804, "lr": 6.560000000000001e-06, "epoch": 0.34, "percentage": 3.4, "elapsed_time": "0:20:13", "remaining_time": "9:34:49"}
18
+ {"current_steps": 180, "total_steps": 5000, "loss": 0.7821, "lr": 6.96e-06, "epoch": 0.36, "percentage": 3.6, "elapsed_time": "0:21:26", "remaining_time": "9:33:57"}
19
+ {"current_steps": 190, "total_steps": 5000, "loss": 0.8086, "lr": 7.360000000000001e-06, "epoch": 0.38, "percentage": 3.8, "elapsed_time": "0:22:38", "remaining_time": "9:32:59"}
20
+ {"current_steps": 200, "total_steps": 5000, "loss": 0.7655, "lr": 7.76e-06, "epoch": 0.4, "percentage": 4.0, "elapsed_time": "0:23:50", "remaining_time": "9:32:02"}
21
+ {"current_steps": 210, "total_steps": 5000, "loss": 0.7508, "lr": 8.16e-06, "epoch": 0.42, "percentage": 4.2, "elapsed_time": "0:25:03", "remaining_time": "9:31:34"}
22
+ {"current_steps": 220, "total_steps": 5000, "loss": 0.7429, "lr": 8.560000000000001e-06, "epoch": 0.44, "percentage": 4.4, "elapsed_time": "0:26:15", "remaining_time": "9:30:29"}
23
+ {"current_steps": 230, "total_steps": 5000, "loss": 0.7502, "lr": 8.96e-06, "epoch": 0.46, "percentage": 4.6, "elapsed_time": "0:27:27", "remaining_time": "9:29:27"}
24
+ {"current_steps": 240, "total_steps": 5000, "loss": 0.7029, "lr": 9.360000000000002e-06, "epoch": 0.48, "percentage": 4.8, "elapsed_time": "0:28:39", "remaining_time": "9:28:22"}
25
+ {"current_steps": 250, "total_steps": 5000, "loss": 0.7324, "lr": 9.760000000000001e-06, "epoch": 0.5, "percentage": 5.0, "elapsed_time": "0:29:50", "remaining_time": "9:27:08"}
26
+ {"current_steps": 260, "total_steps": 5000, "loss": 0.7521, "lr": 1.0160000000000001e-05, "epoch": 0.52, "percentage": 5.2, "elapsed_time": "0:31:02", "remaining_time": "9:25:48"}
27
+ {"current_steps": 270, "total_steps": 5000, "loss": 0.698, "lr": 1.056e-05, "epoch": 0.54, "percentage": 5.4, "elapsed_time": "0:32:13", "remaining_time": "9:24:30"}
28
+ {"current_steps": 280, "total_steps": 5000, "loss": 0.6859, "lr": 1.0960000000000002e-05, "epoch": 0.56, "percentage": 5.6, "elapsed_time": "0:33:24", "remaining_time": "9:23:15"}
29
+ {"current_steps": 290, "total_steps": 5000, "loss": 0.6716, "lr": 1.136e-05, "epoch": 0.58, "percentage": 5.8, "elapsed_time": "0:34:36", "remaining_time": "9:22:04"}
30
+ {"current_steps": 300, "total_steps": 5000, "loss": 0.6978, "lr": 1.1760000000000001e-05, "epoch": 0.6, "percentage": 6.0, "elapsed_time": "0:35:47", "remaining_time": "9:20:48"}
31
+ {"current_steps": 310, "total_steps": 5000, "loss": 0.6584, "lr": 1.216e-05, "epoch": 0.62, "percentage": 6.2, "elapsed_time": "0:37:01", "remaining_time": "9:20:02"}
32
+ {"current_steps": 320, "total_steps": 5000, "loss": 0.7188, "lr": 1.2560000000000002e-05, "epoch": 0.64, "percentage": 6.4, "elapsed_time": "0:38:12", "remaining_time": "9:18:52"}
33
+ {"current_steps": 330, "total_steps": 5000, "loss": 0.6687, "lr": 1.2960000000000001e-05, "epoch": 0.66, "percentage": 6.6, "elapsed_time": "0:39:25", "remaining_time": "9:17:48"}
34
+ {"current_steps": 340, "total_steps": 5000, "loss": 0.623, "lr": 1.3360000000000003e-05, "epoch": 0.68, "percentage": 6.8, "elapsed_time": "0:40:37", "remaining_time": "9:16:48"}
35
+ {"current_steps": 350, "total_steps": 5000, "loss": 0.6795, "lr": 1.376e-05, "epoch": 0.7, "percentage": 7.0, "elapsed_time": "0:41:49", "remaining_time": "9:15:38"}
36
+ {"current_steps": 360, "total_steps": 5000, "loss": 0.6455, "lr": 1.416e-05, "epoch": 0.72, "percentage": 7.2, "elapsed_time": "0:43:00", "remaining_time": "9:14:20"}
37
+ {"current_steps": 370, "total_steps": 5000, "loss": 0.6548, "lr": 1.4560000000000001e-05, "epoch": 0.74, "percentage": 7.4, "elapsed_time": "0:44:13", "remaining_time": "9:13:18"}
38
+ {"current_steps": 380, "total_steps": 5000, "loss": 0.6508, "lr": 1.496e-05, "epoch": 0.76, "percentage": 7.6, "elapsed_time": "0:45:24", "remaining_time": "9:12:03"}
39
+ {"current_steps": 390, "total_steps": 5000, "loss": 0.7105, "lr": 1.5360000000000002e-05, "epoch": 0.78, "percentage": 7.8, "elapsed_time": "0:46:36", "remaining_time": "9:10:53"}
40
+ {"current_steps": 400, "total_steps": 5000, "loss": 0.6738, "lr": 1.576e-05, "epoch": 0.8, "percentage": 8.0, "elapsed_time": "0:47:47", "remaining_time": "9:09:33"}
41
+ {"current_steps": 410, "total_steps": 5000, "loss": 0.6741, "lr": 1.616e-05, "epoch": 0.82, "percentage": 8.2, "elapsed_time": "0:49:00", "remaining_time": "9:08:40"}
42
+ {"current_steps": 420, "total_steps": 5000, "loss": 0.6385, "lr": 1.656e-05, "epoch": 0.84, "percentage": 8.4, "elapsed_time": "0:50:12", "remaining_time": "9:07:28"}
43
+ {"current_steps": 430, "total_steps": 5000, "loss": 0.6304, "lr": 1.696e-05, "epoch": 0.86, "percentage": 8.6, "elapsed_time": "0:51:24", "remaining_time": "9:06:26"}
44
+ {"current_steps": 440, "total_steps": 5000, "loss": 0.652, "lr": 1.736e-05, "epoch": 0.88, "percentage": 8.8, "elapsed_time": "0:52:37", "remaining_time": "9:05:18"}
45
+ {"current_steps": 450, "total_steps": 5000, "loss": 0.6517, "lr": 1.7760000000000003e-05, "epoch": 0.9, "percentage": 9.0, "elapsed_time": "0:53:48", "remaining_time": "9:03:58"}
46
+ {"current_steps": 460, "total_steps": 5000, "loss": 0.7044, "lr": 1.8160000000000002e-05, "epoch": 0.92, "percentage": 9.2, "elapsed_time": "0:54:59", "remaining_time": "9:02:41"}
47
+ {"current_steps": 470, "total_steps": 5000, "loss": 0.6801, "lr": 1.8560000000000002e-05, "epoch": 0.94, "percentage": 9.4, "elapsed_time": "0:56:11", "remaining_time": "9:01:32"}
48
+ {"current_steps": 480, "total_steps": 5000, "loss": 0.6531, "lr": 1.896e-05, "epoch": 0.96, "percentage": 9.6, "elapsed_time": "0:57:21", "remaining_time": "9:00:11"}
49
+ {"current_steps": 490, "total_steps": 5000, "loss": 0.6837, "lr": 1.936e-05, "epoch": 0.98, "percentage": 9.8, "elapsed_time": "0:58:33", "remaining_time": "8:59:03"}
50
+ {"current_steps": 500, "total_steps": 5000, "loss": 0.6659, "lr": 1.976e-05, "epoch": 1.0, "percentage": 10.0, "elapsed_time": "0:59:46", "remaining_time": "8:57:56"}
51
+ {"current_steps": 510, "total_steps": 5000, "loss": 0.6212, "lr": 1.9999961008995607e-05, "epoch": 1.02, "percentage": 10.2, "elapsed_time": "1:00:59", "remaining_time": "8:56:59"}
52
+ {"current_steps": 520, "total_steps": 5000, "loss": 0.6705, "lr": 1.99995223636881e-05, "epoch": 1.04, "percentage": 10.4, "elapsed_time": "1:02:10", "remaining_time": "8:55:42"}
53
+ {"current_steps": 530, "total_steps": 5000, "loss": 0.6346, "lr": 1.9998596355767805e-05, "epoch": 1.06, "percentage": 10.6, "elapsed_time": "1:03:22", "remaining_time": "8:54:29"}
54
+ {"current_steps": 540, "total_steps": 5000, "loss": 0.6698, "lr": 1.999718303036705e-05, "epoch": 1.08, "percentage": 10.8, "elapsed_time": "1:04:34", "remaining_time": "8:53:20"}
55
+ {"current_steps": 550, "total_steps": 5000, "loss": 0.5925, "lr": 1.9995282456369313e-05, "epoch": 1.1, "percentage": 11.0, "elapsed_time": "1:05:46", "remaining_time": "8:52:09"}
56
+ {"current_steps": 560, "total_steps": 5000, "loss": 0.6795, "lr": 1.9992894726405894e-05, "epoch": 1.12, "percentage": 11.2, "elapsed_time": "1:06:58", "remaining_time": "8:50:57"}
57
+ {"current_steps": 570, "total_steps": 5000, "loss": 0.6096, "lr": 1.9990019956851384e-05, "epoch": 1.1400000000000001, "percentage": 11.4, "elapsed_time": "1:08:10", "remaining_time": "8:49:49"}
58
+ {"current_steps": 580, "total_steps": 5000, "loss": 0.5971, "lr": 1.998665828781799e-05, "epoch": 1.16, "percentage": 11.6, "elapsed_time": "1:09:22", "remaining_time": "8:48:40"}
59
+ {"current_steps": 590, "total_steps": 5000, "loss": 0.6055, "lr": 1.998280988314872e-05, "epoch": 1.18, "percentage": 11.8, "elapsed_time": "1:10:34", "remaining_time": "8:47:31"}
60
+ {"current_steps": 600, "total_steps": 5000, "loss": 0.6304, "lr": 1.9978474930409396e-05, "epoch": 1.2, "percentage": 12.0, "elapsed_time": "1:11:46", "remaining_time": "8:46:23"}
61
+ {"current_steps": 610, "total_steps": 5000, "loss": 0.6812, "lr": 1.9973653640879486e-05, "epoch": 1.22, "percentage": 12.2, "elapsed_time": "1:12:59", "remaining_time": "8:45:20"}
62
+ {"current_steps": 620, "total_steps": 5000, "loss": 0.5842, "lr": 1.9968346249541848e-05, "epoch": 1.24, "percentage": 12.4, "elapsed_time": "1:14:11", "remaining_time": "8:44:09"}
63
+ {"current_steps": 630, "total_steps": 5000, "loss": 0.6478, "lr": 1.996255301507125e-05, "epoch": 1.26, "percentage": 12.6, "elapsed_time": "1:15:22", "remaining_time": "8:42:53"}
64
+ {"current_steps": 640, "total_steps": 5000, "loss": 0.6003, "lr": 1.995627421982176e-05, "epoch": 1.28, "percentage": 12.8, "elapsed_time": "1:16:34", "remaining_time": "8:41:42"}
65
+ {"current_steps": 650, "total_steps": 5000, "loss": 0.6087, "lr": 1.9949510169813006e-05, "epoch": 1.3, "percentage": 13.0, "elapsed_time": "1:17:46", "remaining_time": "8:40:27"}
66
+ {"current_steps": 660, "total_steps": 5000, "loss": 0.5905, "lr": 1.9942261194715236e-05, "epoch": 1.32, "percentage": 13.2, "elapsed_time": "1:18:57", "remaining_time": "8:39:12"}
67
+ {"current_steps": 670, "total_steps": 5000, "loss": 0.593, "lr": 1.9934527647833276e-05, "epoch": 1.34, "percentage": 13.4, "elapsed_time": "1:20:10", "remaining_time": "8:38:05"}
68
+ {"current_steps": 680, "total_steps": 5000, "loss": 0.6253, "lr": 1.992630990608929e-05, "epoch": 1.3599999999999999, "percentage": 13.6, "elapsed_time": "1:21:22", "remaining_time": "8:36:55"}
69
+ {"current_steps": 690, "total_steps": 5000, "loss": 0.6338, "lr": 1.9917608370004417e-05, "epoch": 1.38, "percentage": 13.8, "elapsed_time": "1:22:34", "remaining_time": "8:35:47"}
70
+ {"current_steps": 700, "total_steps": 5000, "loss": 0.6148, "lr": 1.9908423463679246e-05, "epoch": 1.4, "percentage": 14.0, "elapsed_time": "1:23:45", "remaining_time": "8:34:31"}
71
+ {"current_steps": 710, "total_steps": 5000, "loss": 0.6271, "lr": 1.989875563477316e-05, "epoch": 1.42, "percentage": 14.2, "elapsed_time": "1:24:58", "remaining_time": "8:33:27"}
72
+ {"current_steps": 720, "total_steps": 5000, "loss": 0.638, "lr": 1.9888605354482494e-05, "epoch": 1.44, "percentage": 14.4, "elapsed_time": "1:26:12", "remaining_time": "8:32:25"}
73
+ {"current_steps": 730, "total_steps": 5000, "loss": 0.6304, "lr": 1.987797311751759e-05, "epoch": 1.46, "percentage": 14.6, "elapsed_time": "1:27:25", "remaining_time": "8:31:20"}
74
+ {"current_steps": 740, "total_steps": 5000, "loss": 0.5877, "lr": 1.986685944207868e-05, "epoch": 1.48, "percentage": 14.8, "elapsed_time": "1:28:37", "remaining_time": "8:30:11"}
75
+ {"current_steps": 750, "total_steps": 5000, "loss": 0.5747, "lr": 1.985526486983063e-05, "epoch": 1.5, "percentage": 15.0, "elapsed_time": "1:29:49", "remaining_time": "8:28:59"}
76
+ {"current_steps": 760, "total_steps": 5000, "loss": 0.6514, "lr": 1.9843189965876525e-05, "epoch": 1.52, "percentage": 15.2, "elapsed_time": "1:31:01", "remaining_time": "8:27:49"}
77
+ {"current_steps": 770, "total_steps": 5000, "loss": 0.5879, "lr": 1.9830635318730155e-05, "epoch": 1.54, "percentage": 15.4, "elapsed_time": "1:32:12", "remaining_time": "8:26:34"}
78
+ {"current_steps": 780, "total_steps": 5000, "loss": 0.6152, "lr": 1.981760154028731e-05, "epoch": 1.56, "percentage": 15.6, "elapsed_time": "1:33:24", "remaining_time": "8:25:19"}
79
+ {"current_steps": 790, "total_steps": 5000, "loss": 0.6342, "lr": 1.980408926579596e-05, "epoch": 1.58, "percentage": 15.8, "elapsed_time": "1:34:35", "remaining_time": "8:24:06"}
80
+ {"current_steps": 800, "total_steps": 5000, "loss": 0.6167, "lr": 1.97900991538253e-05, "epoch": 1.6, "percentage": 16.0, "elapsed_time": "1:35:47", "remaining_time": "8:22:54"}
81
+ {"current_steps": 810, "total_steps": 5000, "loss": 0.5688, "lr": 1.9775631886233655e-05, "epoch": 1.62, "percentage": 16.2, "elapsed_time": "1:37:00", "remaining_time": "8:21:48"}
82
+ {"current_steps": 820, "total_steps": 5000, "loss": 0.6023, "lr": 1.9760688168135233e-05, "epoch": 1.6400000000000001, "percentage": 16.4, "elapsed_time": "1:38:11", "remaining_time": "8:20:34"}
83
+ {"current_steps": 830, "total_steps": 5000, "loss": 0.5957, "lr": 1.9745268727865774e-05, "epoch": 1.6600000000000001, "percentage": 16.6, "elapsed_time": "1:39:23", "remaining_time": "8:19:19"}
84
+ {"current_steps": 840, "total_steps": 5000, "loss": 0.5409, "lr": 1.972937431694704e-05, "epoch": 1.6800000000000002, "percentage": 16.8, "elapsed_time": "1:40:34", "remaining_time": "8:18:06"}
85
+ {"current_steps": 850, "total_steps": 5000, "loss": 0.6286, "lr": 1.9713005710050203e-05, "epoch": 1.7, "percentage": 17.0, "elapsed_time": "1:41:46", "remaining_time": "8:16:52"}
86
+ {"current_steps": 860, "total_steps": 5000, "loss": 0.5936, "lr": 1.969616370495806e-05, "epoch": 1.72, "percentage": 17.2, "elapsed_time": "1:42:57", "remaining_time": "8:15:39"}
87
+ {"current_steps": 870, "total_steps": 5000, "loss": 0.6535, "lr": 1.967884912252619e-05, "epoch": 1.74, "percentage": 17.4, "elapsed_time": "1:44:09", "remaining_time": "8:14:24"}
88
+ {"current_steps": 880, "total_steps": 5000, "loss": 0.5864, "lr": 1.9661062806642903e-05, "epoch": 1.76, "percentage": 17.6, "elapsed_time": "1:45:20", "remaining_time": "8:13:12"}
89
+ {"current_steps": 890, "total_steps": 5000, "loss": 0.6181, "lr": 1.964280562418815e-05, "epoch": 1.78, "percentage": 17.8, "elapsed_time": "1:46:32", "remaining_time": "8:11:59"}
90
+ {"current_steps": 900, "total_steps": 5000, "loss": 0.5736, "lr": 1.962407846499124e-05, "epoch": 1.8, "percentage": 18.0, "elapsed_time": "1:47:43", "remaining_time": "8:10:46"}
91
+ {"current_steps": 910, "total_steps": 5000, "loss": 0.5769, "lr": 1.96048822417875e-05, "epoch": 1.8199999999999998, "percentage": 18.2, "elapsed_time": "1:48:56", "remaining_time": "8:09:40"}
92
+ {"current_steps": 920, "total_steps": 5000, "loss": 0.6056, "lr": 1.958521789017376e-05, "epoch": 1.8399999999999999, "percentage": 18.4, "elapsed_time": "1:50:09", "remaining_time": "8:08:29"}
93
+ {"current_steps": 930, "total_steps": 5000, "loss": 0.6632, "lr": 1.956508636856278e-05, "epoch": 1.8599999999999999, "percentage": 18.6, "elapsed_time": "1:51:20", "remaining_time": "8:07:16"}
94
+ {"current_steps": 940, "total_steps": 5000, "loss": 0.5803, "lr": 1.9546569379242446e-05, "epoch": 1.88, "percentage": 18.8, "elapsed_time": "1:52:31", "remaining_time": "8:06:00"}
95
+ {"current_steps": 950, "total_steps": 5000, "loss": 0.6028, "lr": 1.9525552956573244e-05, "epoch": 1.9, "percentage": 19.0, "elapsed_time": "1:53:42", "remaining_time": "8:04:44"}
96
+ {"current_steps": 960, "total_steps": 5000, "loss": 0.5932, "lr": 1.9504072271891486e-05, "epoch": 1.92, "percentage": 19.2, "elapsed_time": "1:54:54", "remaining_time": "8:03:32"}
97
+ {"current_steps": 970, "total_steps": 5000, "loss": 0.581, "lr": 1.9482128372135446e-05, "epoch": 1.94, "percentage": 19.4, "elapsed_time": "1:56:06", "remaining_time": "8:02:21"}
98
+ {"current_steps": 980, "total_steps": 5000, "loss": 0.6207, "lr": 1.945972232681984e-05, "epoch": 1.96, "percentage": 19.6, "elapsed_time": "1:57:18", "remaining_time": "8:01:12"}
99
+ {"current_steps": 990, "total_steps": 5000, "loss": 0.5576, "lr": 1.9436855227983695e-05, "epoch": 1.98, "percentage": 19.8, "elapsed_time": "1:58:30", "remaining_time": "8:00:00"}
100
+ {"current_steps": 1000, "total_steps": 5000, "loss": 0.6121, "lr": 1.9413528190137158e-05, "epoch": 2.0, "percentage": 20.0, "elapsed_time": "1:59:42", "remaining_time": "7:58:48"}
101
+ {"current_steps": 1010, "total_steps": 5000, "loss": 0.5909, "lr": 1.938974235020714e-05, "epoch": 2.02, "percentage": 20.2, "elapsed_time": "2:00:56", "remaining_time": "7:57:45"}
102
+ {"current_steps": 1020, "total_steps": 5000, "loss": 0.5459, "lr": 1.9365498867481926e-05, "epoch": 2.04, "percentage": 20.4, "elapsed_time": "2:02:07", "remaining_time": "7:56:32"}
103
+ {"current_steps": 1030, "total_steps": 5000, "loss": 0.5781, "lr": 1.9340798923554657e-05, "epoch": 2.06, "percentage": 20.6, "elapsed_time": "2:03:18", "remaining_time": "7:55:16"}
104
+ {"current_steps": 1040, "total_steps": 5000, "loss": 0.5284, "lr": 1.931564372226576e-05, "epoch": 2.08, "percentage": 20.8, "elapsed_time": "2:04:30", "remaining_time": "7:54:04"}
105
+ {"current_steps": 1050, "total_steps": 5000, "loss": 0.5476, "lr": 1.9290034489644247e-05, "epoch": 2.1, "percentage": 21.0, "elapsed_time": "2:05:41", "remaining_time": "7:52:50"}
106
+ {"current_steps": 1060, "total_steps": 5000, "loss": 0.5386, "lr": 1.9263972473847995e-05, "epoch": 2.12, "percentage": 21.2, "elapsed_time": "2:06:52", "remaining_time": "7:51:36"}
107
+ {"current_steps": 1070, "total_steps": 5000, "loss": 0.5864, "lr": 1.923745894510288e-05, "epoch": 2.14, "percentage": 21.4, "elapsed_time": "2:08:04", "remaining_time": "7:50:24"}
108
+ {"current_steps": 1080, "total_steps": 5000, "loss": 0.5413, "lr": 1.9210495195640895e-05, "epoch": 2.16, "percentage": 21.6, "elapsed_time": "2:09:15", "remaining_time": "7:49:10"}
109
+ {"current_steps": 1090, "total_steps": 5000, "loss": 0.5201, "lr": 1.918308253963715e-05, "epoch": 2.18, "percentage": 21.8, "elapsed_time": "2:10:28", "remaining_time": "7:48:00"}
110
+ {"current_steps": 1100, "total_steps": 5000, "loss": 0.5658, "lr": 1.9155222313145817e-05, "epoch": 2.2, "percentage": 22.0, "elapsed_time": "2:11:39", "remaining_time": "7:46:47"}
111
+ {"current_steps": 1110, "total_steps": 5000, "loss": 0.5578, "lr": 1.912691587403503e-05, "epoch": 2.22, "percentage": 22.2, "elapsed_time": "2:12:53", "remaining_time": "7:45:41"}
112
+ {"current_steps": 1120, "total_steps": 5000, "loss": 0.4792, "lr": 1.9098164601920702e-05, "epoch": 2.24, "percentage": 22.4, "elapsed_time": "2:14:04", "remaining_time": "7:44:29"}
113
+ {"current_steps": 1130, "total_steps": 5000, "loss": 0.6048, "lr": 1.906896989809927e-05, "epoch": 2.26, "percentage": 22.6, "elapsed_time": "2:15:16", "remaining_time": "7:43:16"}
114
+ {"current_steps": 1140, "total_steps": 5000, "loss": 0.567, "lr": 1.903933318547942e-05, "epoch": 2.2800000000000002, "percentage": 22.8, "elapsed_time": "2:16:28", "remaining_time": "7:42:06"}
115
+ {"current_steps": 1150, "total_steps": 5000, "loss": 0.5965, "lr": 1.9009255908512704e-05, "epoch": 2.3, "percentage": 23.0, "elapsed_time": "2:17:40", "remaining_time": "7:40:53"}
116
+ {"current_steps": 1160, "total_steps": 5000, "loss": 0.5561, "lr": 1.897873953312317e-05, "epoch": 2.32, "percentage": 23.2, "elapsed_time": "2:18:52", "remaining_time": "7:39:43"}
117
+ {"current_steps": 1170, "total_steps": 5000, "loss": 0.5529, "lr": 1.8947785546635905e-05, "epoch": 2.34, "percentage": 23.4, "elapsed_time": "2:20:04", "remaining_time": "7:38:30"}
118
+ {"current_steps": 1180, "total_steps": 5000, "loss": 0.5818, "lr": 1.8916395457704536e-05, "epoch": 2.36, "percentage": 23.6, "elapsed_time": "2:21:16", "remaining_time": "7:37:19"}
119
+ {"current_steps": 1190, "total_steps": 5000, "loss": 0.5558, "lr": 1.888457079623772e-05, "epoch": 2.38, "percentage": 23.8, "elapsed_time": "2:22:28", "remaining_time": "7:36:08"}
120
+ {"current_steps": 1200, "total_steps": 5000, "loss": 0.5833, "lr": 1.8852313113324553e-05, "epoch": 2.4, "percentage": 24.0, "elapsed_time": "2:23:39", "remaining_time": "7:34:56"}
121
+ {"current_steps": 1210, "total_steps": 5000, "loss": 0.527, "lr": 1.8819623981158996e-05, "epoch": 2.42, "percentage": 24.2, "elapsed_time": "2:24:52", "remaining_time": "7:33:47"}
122
+ {"current_steps": 1220, "total_steps": 5000, "loss": 0.5605, "lr": 1.878650499296323e-05, "epoch": 2.44, "percentage": 24.4, "elapsed_time": "2:26:03", "remaining_time": "7:32:32"}
123
+ {"current_steps": 1230, "total_steps": 5000, "loss": 0.5528, "lr": 1.8752957762910016e-05, "epoch": 2.46, "percentage": 24.6, "elapsed_time": "2:27:14", "remaining_time": "7:31:17"}
124
+ {"current_steps": 1240, "total_steps": 5000, "loss": 0.5239, "lr": 1.871898392604402e-05, "epoch": 2.48, "percentage": 24.8, "elapsed_time": "2:28:25", "remaining_time": "7:30:03"}
125
+ {"current_steps": 1250, "total_steps": 5000, "loss": 0.5825, "lr": 1.8684585138202122e-05, "epoch": 2.5, "percentage": 25.0, "elapsed_time": "2:29:36", "remaining_time": "7:28:49"}
126
+ {"current_steps": 1260, "total_steps": 5000, "loss": 0.5816, "lr": 1.864976307593271e-05, "epoch": 2.52, "percentage": 25.2, "elapsed_time": "2:30:48", "remaining_time": "7:27:38"}
127
+ {"current_steps": 1270, "total_steps": 5000, "loss": 0.595, "lr": 1.8614519436413968e-05, "epoch": 2.54, "percentage": 25.4, "elapsed_time": "2:32:01", "remaining_time": "7:26:28"}
128
+ {"current_steps": 1280, "total_steps": 5000, "loss": 0.5828, "lr": 1.8578855937371176e-05, "epoch": 2.56, "percentage": 25.6, "elapsed_time": "2:33:12", "remaining_time": "7:25:15"}
129
+ {"current_steps": 1290, "total_steps": 5000, "loss": 0.5408, "lr": 1.8542774316992953e-05, "epoch": 2.58, "percentage": 25.8, "elapsed_time": "2:34:24", "remaining_time": "7:24:04"}
130
+ {"current_steps": 1300, "total_steps": 5000, "loss": 0.5683, "lr": 1.850627633384658e-05, "epoch": 2.6, "percentage": 26.0, "elapsed_time": "2:35:35", "remaining_time": "7:22:50"}
131
+ {"current_steps": 1310, "total_steps": 5000, "loss": 0.5734, "lr": 1.8469363766792258e-05, "epoch": 2.62, "percentage": 26.2, "elapsed_time": "2:36:48", "remaining_time": "7:21:42"}
132
+ {"current_steps": 1320, "total_steps": 5000, "loss": 0.5581, "lr": 1.8432038414896432e-05, "epoch": 2.64, "percentage": 26.4, "elapsed_time": "2:38:00", "remaining_time": "7:20:29"}
133
+ {"current_steps": 1330, "total_steps": 5000, "loss": 0.5781, "lr": 1.8394302097344103e-05, "epoch": 2.66, "percentage": 26.6, "elapsed_time": "2:39:11", "remaining_time": "7:19:17"}
134
+ {"current_steps": 1340, "total_steps": 5000, "loss": 0.5468, "lr": 1.8356156653350138e-05, "epoch": 2.68, "percentage": 26.8, "elapsed_time": "2:40:22", "remaining_time": "7:18:03"}
135
+ {"current_steps": 1350, "total_steps": 5000, "loss": 0.543, "lr": 1.8317603942069665e-05, "epoch": 2.7, "percentage": 27.0, "elapsed_time": "2:41:34", "remaining_time": "7:16:51"}
136
+ {"current_steps": 1360, "total_steps": 5000, "loss": 0.5416, "lr": 1.8278645842507448e-05, "epoch": 2.7199999999999998, "percentage": 27.2, "elapsed_time": "2:42:45", "remaining_time": "7:15:38"}
137
+ {"current_steps": 1370, "total_steps": 5000, "loss": 0.5692, "lr": 1.8239284253426294e-05, "epoch": 2.74, "percentage": 27.4, "elapsed_time": "2:43:57", "remaining_time": "7:14:26"}
138
+ {"current_steps": 1380, "total_steps": 5000, "loss": 0.5372, "lr": 1.8199521093254524e-05, "epoch": 2.76, "percentage": 27.6, "elapsed_time": "2:45:08", "remaining_time": "7:13:11"}
139
+ {"current_steps": 1390, "total_steps": 5000, "loss": 0.5205, "lr": 1.815935829999247e-05, "epoch": 2.7800000000000002, "percentage": 27.8, "elapsed_time": "2:46:19", "remaining_time": "7:11:58"}
140
+ {"current_steps": 1400, "total_steps": 5000, "loss": 0.5159, "lr": 1.811879783111801e-05, "epoch": 2.8, "percentage": 28.0, "elapsed_time": "2:47:31", "remaining_time": "7:10:45"}
141
+ {"current_steps": 1410, "total_steps": 5000, "loss": 0.5405, "lr": 1.8077841663491174e-05, "epoch": 2.82, "percentage": 28.2, "elapsed_time": "2:48:44", "remaining_time": "7:09:37"}
142
+ {"current_steps": 1420, "total_steps": 5000, "loss": 0.5769, "lr": 1.80364917932578e-05, "epoch": 2.84, "percentage": 28.4, "elapsed_time": "2:49:56", "remaining_time": "7:08:26"}
143
+ {"current_steps": 1430, "total_steps": 5000, "loss": 0.5724, "lr": 1.799475023575222e-05, "epoch": 2.86, "percentage": 28.6, "elapsed_time": "2:51:07", "remaining_time": "7:07:12"}
144
+ {"current_steps": 1440, "total_steps": 5000, "loss": 0.5603, "lr": 1.795261902539906e-05, "epoch": 2.88, "percentage": 28.8, "elapsed_time": "2:52:19", "remaining_time": "7:06:00"}
145
+ {"current_steps": 1450, "total_steps": 5000, "loss": 0.5609, "lr": 1.791010021561407e-05, "epoch": 2.9, "percentage": 29.0, "elapsed_time": "2:53:30", "remaining_time": "7:04:47"}
146
+ {"current_steps": 1460, "total_steps": 5000, "loss": 0.585, "lr": 1.7867195878704062e-05, "epoch": 2.92, "percentage": 29.2, "elapsed_time": "2:54:42", "remaining_time": "7:03:37"}
147
+ {"current_steps": 1470, "total_steps": 5000, "loss": 0.5818, "lr": 1.7823908105765883e-05, "epoch": 2.94, "percentage": 29.4, "elapsed_time": "2:55:55", "remaining_time": "7:02:26"}
148
+ {"current_steps": 1480, "total_steps": 5000, "loss": 0.5453, "lr": 1.7780239006584515e-05, "epoch": 2.96, "percentage": 29.6, "elapsed_time": "2:57:06", "remaining_time": "7:01:13"}
149
+ {"current_steps": 1490, "total_steps": 5000, "loss": 0.5526, "lr": 1.773619070953025e-05, "epoch": 2.98, "percentage": 29.8, "elapsed_time": "2:58:18", "remaining_time": "7:00:01"}
150
+ {"current_steps": 1500, "total_steps": 5000, "loss": 0.5664, "lr": 1.769176536145494e-05, "epoch": 3.0, "percentage": 30.0, "elapsed_time": "2:59:29", "remaining_time": "6:58:49"}
151
+ {"current_steps": 1510, "total_steps": 5000, "loss": 0.4993, "lr": 1.7646965127587373e-05, "epoch": 3.02, "percentage": 30.2, "elapsed_time": "3:00:42", "remaining_time": "6:57:39"}
152
+ {"current_steps": 1520, "total_steps": 5000, "loss": 0.5302, "lr": 1.760179219142774e-05, "epoch": 3.04, "percentage": 30.4, "elapsed_time": "3:01:53", "remaining_time": "6:56:26"}
153
+ {"current_steps": 1530, "total_steps": 5000, "loss": 0.4995, "lr": 1.7556248754641237e-05, "epoch": 3.06, "percentage": 30.6, "elapsed_time": "3:03:04", "remaining_time": "6:55:12"}
154
+ {"current_steps": 1540, "total_steps": 5000, "loss": 0.4902, "lr": 1.7510337036950703e-05, "epoch": 3.08, "percentage": 30.8, "elapsed_time": "3:04:17", "remaining_time": "6:54:02"}
155
+ {"current_steps": 1550, "total_steps": 5000, "loss": 0.5571, "lr": 1.7464059276028497e-05, "epoch": 3.1, "percentage": 31.0, "elapsed_time": "3:05:27", "remaining_time": "6:52:48"}
156
+ {"current_steps": 1560, "total_steps": 5000, "loss": 0.5441, "lr": 1.7417417727387392e-05, "epoch": 3.12, "percentage": 31.2, "elapsed_time": "3:06:39", "remaining_time": "6:51:36"}
157
+ {"current_steps": 1570, "total_steps": 5000, "loss": 0.5498, "lr": 1.7370414664270675e-05, "epoch": 3.14, "percentage": 31.4, "elapsed_time": "3:07:52", "remaining_time": "6:50:26"}
158
+ {"current_steps": 1580, "total_steps": 5000, "loss": 0.4799, "lr": 1.732305237754132e-05, "epoch": 3.16, "percentage": 31.6, "elapsed_time": "3:09:03", "remaining_time": "6:49:13"}
159
+ {"current_steps": 1590, "total_steps": 5000, "loss": 0.5285, "lr": 1.727533317557037e-05, "epoch": 3.18, "percentage": 31.8, "elapsed_time": "3:10:14", "remaining_time": "6:48:00"}
160
+ {"current_steps": 1600, "total_steps": 5000, "loss": 0.5328, "lr": 1.7227259384124408e-05, "epoch": 3.2, "percentage": 32.0, "elapsed_time": "3:11:26", "remaining_time": "6:46:49"}
161
+ {"current_steps": 1610, "total_steps": 5000, "loss": 0.5333, "lr": 1.7178833346252208e-05, "epoch": 3.22, "percentage": 32.2, "elapsed_time": "3:12:40", "remaining_time": "6:45:41"}
162
+ {"current_steps": 1620, "total_steps": 5000, "loss": 0.5163, "lr": 1.713005742217053e-05, "epoch": 3.24, "percentage": 32.4, "elapsed_time": "3:13:52", "remaining_time": "6:44:30"}
163
+ {"current_steps": 1630, "total_steps": 5000, "loss": 0.5173, "lr": 1.7080933989149112e-05, "epoch": 3.26, "percentage": 32.6, "elapsed_time": "3:15:04", "remaining_time": "6:43:19"}
164
+ {"current_steps": 1640, "total_steps": 5000, "loss": 0.5187, "lr": 1.7031465441394766e-05, "epoch": 3.2800000000000002, "percentage": 32.8, "elapsed_time": "3:16:15", "remaining_time": "6:42:05"}
165
+ {"current_steps": 1650, "total_steps": 5000, "loss": 0.5611, "lr": 1.698165418993473e-05, "epoch": 3.3, "percentage": 33.0, "elapsed_time": "3:17:27", "remaining_time": "6:40:54"}
166
+ {"current_steps": 1660, "total_steps": 5000, "loss": 0.5381, "lr": 1.6931502662499116e-05, "epoch": 3.32, "percentage": 33.2, "elapsed_time": "3:18:40", "remaining_time": "6:39:43"}
167
+ {"current_steps": 1670, "total_steps": 5000, "loss": 0.5089, "lr": 1.688101330340263e-05, "epoch": 3.34, "percentage": 33.4, "elapsed_time": "3:19:52", "remaining_time": "6:38:32"}
168
+ {"current_steps": 1680, "total_steps": 5000, "loss": 0.5538, "lr": 1.683018857342539e-05, "epoch": 3.36, "percentage": 33.6, "elapsed_time": "3:21:04", "remaining_time": "6:37:20"}
169
+ {"current_steps": 1690, "total_steps": 5000, "loss": 0.5216, "lr": 1.6779030949693044e-05, "epoch": 3.38, "percentage": 33.8, "elapsed_time": "3:22:15", "remaining_time": "6:36:08"}
170
+ {"current_steps": 1700, "total_steps": 5000, "loss": 0.5356, "lr": 1.6727542925556e-05, "epoch": 3.4, "percentage": 34.0, "elapsed_time": "3:23:27", "remaining_time": "6:34:56"}
171
+ {"current_steps": 1710, "total_steps": 5000, "loss": 0.497, "lr": 1.667572701046791e-05, "epoch": 3.42, "percentage": 34.2, "elapsed_time": "3:24:40", "remaining_time": "6:33:46"}
172
+ {"current_steps": 1720, "total_steps": 5000, "loss": 0.4934, "lr": 1.662358572986337e-05, "epoch": 3.44, "percentage": 34.4, "elapsed_time": "3:25:51", "remaining_time": "6:32:33"}
173
+ {"current_steps": 1730, "total_steps": 5000, "loss": 0.5327, "lr": 1.6571121625034847e-05, "epoch": 3.46, "percentage": 34.6, "elapsed_time": "3:27:03", "remaining_time": "6:31:22"}
174
+ {"current_steps": 1740, "total_steps": 5000, "loss": 0.4995, "lr": 1.651833725300879e-05, "epoch": 3.48, "percentage": 34.8, "elapsed_time": "3:28:14", "remaining_time": "6:30:10"}
175
+ {"current_steps": 1750, "total_steps": 5000, "loss": 0.4945, "lr": 1.6465235186421024e-05, "epoch": 3.5, "percentage": 35.0, "elapsed_time": "3:29:26", "remaining_time": "6:28:57"}
176
+ {"current_steps": 1760, "total_steps": 5000, "loss": 0.4969, "lr": 1.6411818013391357e-05, "epoch": 3.52, "percentage": 35.2, "elapsed_time": "3:30:38", "remaining_time": "6:27:45"}
177
+ {"current_steps": 1770, "total_steps": 5000, "loss": 0.5133, "lr": 1.6358088337397444e-05, "epoch": 3.54, "percentage": 35.4, "elapsed_time": "3:31:49", "remaining_time": "6:26:32"}
178
+ {"current_steps": 1780, "total_steps": 5000, "loss": 0.4598, "lr": 1.630404877714789e-05, "epoch": 3.56, "percentage": 35.6, "elapsed_time": "3:33:01", "remaining_time": "6:25:21"}
179
+ {"current_steps": 1790, "total_steps": 5000, "loss": 0.5721, "lr": 1.6249701966454626e-05, "epoch": 3.58, "percentage": 35.8, "elapsed_time": "3:34:13", "remaining_time": "6:24:09"}
180
+ {"current_steps": 1800, "total_steps": 5000, "loss": 0.5549, "lr": 1.619505055410453e-05, "epoch": 3.6, "percentage": 36.0, "elapsed_time": "3:35:25", "remaining_time": "6:22:58"}
181
+ {"current_steps": 1810, "total_steps": 5000, "loss": 0.5192, "lr": 1.614009720373034e-05, "epoch": 3.62, "percentage": 36.2, "elapsed_time": "3:36:38", "remaining_time": "6:21:49"}
182
+ {"current_steps": 1820, "total_steps": 5000, "loss": 0.4927, "lr": 1.608484459368082e-05, "epoch": 3.64, "percentage": 36.4, "elapsed_time": "3:37:50", "remaining_time": "6:20:37"}
183
+ {"current_steps": 1830, "total_steps": 5000, "loss": 0.5319, "lr": 1.602929541689025e-05, "epoch": 3.66, "percentage": 36.6, "elapsed_time": "3:39:02", "remaining_time": "6:19:26"}
184
+ {"current_steps": 1840, "total_steps": 5000, "loss": 0.5025, "lr": 1.5973452380747125e-05, "epoch": 3.68, "percentage": 36.8, "elapsed_time": "3:40:14", "remaining_time": "6:18:14"}
185
+ {"current_steps": 1850, "total_steps": 5000, "loss": 0.497, "lr": 1.591731820696224e-05, "epoch": 3.7, "percentage": 37.0, "elapsed_time": "3:41:25", "remaining_time": "6:17:02"}
186
+ {"current_steps": 1860, "total_steps": 5000, "loss": 0.556, "lr": 1.5860895631436044e-05, "epoch": 3.7199999999999998, "percentage": 37.2, "elapsed_time": "3:42:37", "remaining_time": "6:15:49"}
187
+ {"current_steps": 1870, "total_steps": 5000, "loss": 0.506, "lr": 1.580418740412526e-05, "epoch": 3.74, "percentage": 37.4, "elapsed_time": "3:43:49", "remaining_time": "6:14:37"}
188
+ {"current_steps": 1880, "total_steps": 5000, "loss": 0.5154, "lr": 1.5747196288908887e-05, "epoch": 3.76, "percentage": 37.6, "elapsed_time": "3:45:00", "remaining_time": "6:13:25"}
189
+ {"current_steps": 1890, "total_steps": 5000, "loss": 0.5887, "lr": 1.5689925063453483e-05, "epoch": 3.7800000000000002, "percentage": 37.8, "elapsed_time": "3:46:13", "remaining_time": "6:12:14"}
190
+ {"current_steps": 1900, "total_steps": 5000, "loss": 0.508, "lr": 1.563237651907777e-05, "epoch": 3.8, "percentage": 38.0, "elapsed_time": "3:47:24", "remaining_time": "6:11:02"}
191
+ {"current_steps": 1910, "total_steps": 5000, "loss": 0.4954, "lr": 1.5574553460616608e-05, "epoch": 3.82, "percentage": 38.2, "elapsed_time": "3:48:38", "remaining_time": "6:09:53"}
192
+ {"current_steps": 1920, "total_steps": 5000, "loss": 0.5628, "lr": 1.5516458706284306e-05, "epoch": 3.84, "percentage": 38.4, "elapsed_time": "3:49:50", "remaining_time": "6:08:42"}
193
+ {"current_steps": 1930, "total_steps": 5000, "loss": 0.4494, "lr": 1.5458095087537216e-05, "epoch": 3.86, "percentage": 38.6, "elapsed_time": "3:51:02", "remaining_time": "6:07:30"}
194
+ {"current_steps": 1940, "total_steps": 5000, "loss": 0.522, "lr": 1.5399465448935788e-05, "epoch": 3.88, "percentage": 38.8, "elapsed_time": "3:52:14", "remaining_time": "6:06:19"}
195
+ {"current_steps": 1950, "total_steps": 5000, "loss": 0.5225, "lr": 1.5340572648005887e-05, "epoch": 3.9, "percentage": 39.0, "elapsed_time": "3:53:26", "remaining_time": "6:05:07"}
196
+ {"current_steps": 1960, "total_steps": 5000, "loss": 0.5092, "lr": 1.5281419555099547e-05, "epoch": 3.92, "percentage": 39.2, "elapsed_time": "3:54:38", "remaining_time": "6:03:55"}
197
+ {"current_steps": 1970, "total_steps": 5000, "loss": 0.5167, "lr": 1.5222009053255061e-05, "epoch": 3.94, "percentage": 39.4, "elapsed_time": "3:55:49", "remaining_time": "6:02:43"}
198
+ {"current_steps": 1980, "total_steps": 5000, "loss": 0.5198, "lr": 1.5162344038056476e-05, "epoch": 3.96, "percentage": 39.6, "elapsed_time": "3:57:00", "remaining_time": "6:01:30"}
199
+ {"current_steps": 1990, "total_steps": 5000, "loss": 0.5723, "lr": 1.510242741749246e-05, "epoch": 3.98, "percentage": 39.8, "elapsed_time": "3:58:12", "remaining_time": "6:00:18"}
200
+ {"current_steps": 2000, "total_steps": 5000, "loss": 0.4707, "lr": 1.5042262111814566e-05, "epoch": 4.0, "percentage": 40.0, "elapsed_time": "3:59:23", "remaining_time": "5:59:05"}
201
+ {"current_steps": 2010, "total_steps": 5000, "loss": 0.523, "lr": 1.498185105339491e-05, "epoch": 4.02, "percentage": 40.2, "elapsed_time": "4:00:37", "remaining_time": "5:57:56"}
202
+ {"current_steps": 2020, "total_steps": 5000, "loss": 0.4433, "lr": 1.4921197186583256e-05, "epoch": 4.04, "percentage": 40.4, "elapsed_time": "4:01:48", "remaining_time": "5:56:44"}
203
+ {"current_steps": 2030, "total_steps": 5000, "loss": 0.4861, "lr": 1.4860303467563504e-05, "epoch": 4.06, "percentage": 40.6, "elapsed_time": "4:03:01", "remaining_time": "5:55:32"}
204
+ {"current_steps": 2040, "total_steps": 5000, "loss": 0.4621, "lr": 1.4799172864209607e-05, "epoch": 4.08, "percentage": 40.8, "elapsed_time": "4:04:13", "remaining_time": "5:54:21"}
205
+ {"current_steps": 2050, "total_steps": 5000, "loss": 0.5454, "lr": 1.4737808355940932e-05, "epoch": 4.1, "percentage": 41.0, "elapsed_time": "4:05:25", "remaining_time": "5:53:10"}
206
+ {"current_steps": 2060, "total_steps": 5000, "loss": 0.4984, "lr": 1.467621293357704e-05, "epoch": 4.12, "percentage": 41.2, "elapsed_time": "4:06:37", "remaining_time": "5:51:58"}
207
+ {"current_steps": 2070, "total_steps": 5000, "loss": 0.5375, "lr": 1.4614389599191917e-05, "epoch": 4.14, "percentage": 41.4, "elapsed_time": "4:07:49", "remaining_time": "5:50:46"}
208
+ {"current_steps": 2080, "total_steps": 5000, "loss": 0.5191, "lr": 1.455234136596766e-05, "epoch": 4.16, "percentage": 41.6, "elapsed_time": "4:09:01", "remaining_time": "5:49:36"}
209
+ {"current_steps": 2090, "total_steps": 5000, "loss": 0.4872, "lr": 1.4490071258047625e-05, "epoch": 4.18, "percentage": 41.8, "elapsed_time": "4:10:14", "remaining_time": "5:48:25"}
210
+ {"current_steps": 2100, "total_steps": 5000, "loss": 0.4883, "lr": 1.442758231038902e-05, "epoch": 4.2, "percentage": 42.0, "elapsed_time": "4:11:27", "remaining_time": "5:47:14"}
211
+ {"current_steps": 2110, "total_steps": 5000, "loss": 0.4564, "lr": 1.436487756861499e-05, "epoch": 4.22, "percentage": 42.2, "elapsed_time": "4:12:40", "remaining_time": "5:46:05"}
212
+ {"current_steps": 2120, "total_steps": 5000, "loss": 0.4311, "lr": 1.4301960088866187e-05, "epoch": 4.24, "percentage": 42.4, "elapsed_time": "4:13:52", "remaining_time": "5:44:52"}
213
+ {"current_steps": 2130, "total_steps": 5000, "loss": 0.4663, "lr": 1.4238832937651816e-05, "epoch": 4.26, "percentage": 42.6, "elapsed_time": "4:15:02", "remaining_time": "5:43:39"}
214
+ {"current_steps": 2140, "total_steps": 5000, "loss": 0.5048, "lr": 1.4175499191700169e-05, "epoch": 4.28, "percentage": 42.8, "elapsed_time": "4:16:14", "remaining_time": "5:42:27"}
215
+ {"current_steps": 2150, "total_steps": 5000, "loss": 0.493, "lr": 1.4111961937808665e-05, "epoch": 4.3, "percentage": 43.0, "elapsed_time": "4:17:27", "remaining_time": "5:41:16"}
216
+ {"current_steps": 2160, "total_steps": 5000, "loss": 0.4712, "lr": 1.4048224272693426e-05, "epoch": 4.32, "percentage": 43.2, "elapsed_time": "4:18:39", "remaining_time": "5:40:04"}
217
+ {"current_steps": 2170, "total_steps": 5000, "loss": 0.4641, "lr": 1.3984289302838327e-05, "epoch": 4.34, "percentage": 43.4, "elapsed_time": "4:19:50", "remaining_time": "5:38:52"}
218
+ {"current_steps": 2180, "total_steps": 5000, "loss": 0.4929, "lr": 1.3920160144343604e-05, "epoch": 4.36, "percentage": 43.6, "elapsed_time": "4:21:02", "remaining_time": "5:37:40"}
219
+ {"current_steps": 2190, "total_steps": 5000, "loss": 0.5269, "lr": 1.3855839922773968e-05, "epoch": 4.38, "percentage": 43.8, "elapsed_time": "4:22:14", "remaining_time": "5:36:28"}
220
+ {"current_steps": 2200, "total_steps": 5000, "loss": 0.4857, "lr": 1.3791331773006272e-05, "epoch": 4.4, "percentage": 44.0, "elapsed_time": "4:23:25", "remaining_time": "5:35:15"}
221
+ {"current_steps": 2210, "total_steps": 5000, "loss": 0.5613, "lr": 1.3726638839076732e-05, "epoch": 4.42, "percentage": 44.2, "elapsed_time": "4:24:38", "remaining_time": "5:34:06"}
222
+ {"current_steps": 2220, "total_steps": 5000, "loss": 0.4884, "lr": 1.3661764274027678e-05, "epoch": 4.44, "percentage": 44.4, "elapsed_time": "4:25:50", "remaining_time": "5:32:54"}
223
+ {"current_steps": 2230, "total_steps": 5000, "loss": 0.4849, "lr": 1.3596711239753889e-05, "epoch": 4.46, "percentage": 44.6, "elapsed_time": "4:27:02", "remaining_time": "5:31:42"}
224
+ {"current_steps": 2240, "total_steps": 5000, "loss": 0.4752, "lr": 1.3531482906848474e-05, "epoch": 4.48, "percentage": 44.8, "elapsed_time": "4:28:14", "remaining_time": "5:30:30"}
225
+ {"current_steps": 2250, "total_steps": 5000, "loss": 0.4804, "lr": 1.3466082454448364e-05, "epoch": 4.5, "percentage": 45.0, "elapsed_time": "4:29:25", "remaining_time": "5:29:17"}
226
+ {"current_steps": 2260, "total_steps": 5000, "loss": 0.4719, "lr": 1.340051307007933e-05, "epoch": 4.52, "percentage": 45.2, "elapsed_time": "4:30:37", "remaining_time": "5:28:06"}
227
+ {"current_steps": 2270, "total_steps": 5000, "loss": 0.4599, "lr": 1.3334777949500673e-05, "epoch": 4.54, "percentage": 45.4, "elapsed_time": "4:31:48", "remaining_time": "5:26:53"}
228
+ {"current_steps": 2280, "total_steps": 5000, "loss": 0.4712, "lr": 1.3268880296549424e-05, "epoch": 4.5600000000000005, "percentage": 45.6, "elapsed_time": "4:32:59", "remaining_time": "5:25:40"}
229
+ {"current_steps": 2290, "total_steps": 5000, "loss": 0.4772, "lr": 1.3202823322984228e-05, "epoch": 4.58, "percentage": 45.8, "elapsed_time": "4:34:11", "remaining_time": "5:24:28"}
230
+ {"current_steps": 2300, "total_steps": 5000, "loss": 0.453, "lr": 1.3136610248328779e-05, "epoch": 4.6, "percentage": 46.0, "elapsed_time": "4:35:22", "remaining_time": "5:23:16"}
231
+ {"current_steps": 2310, "total_steps": 5000, "loss": 0.4657, "lr": 1.307024429971492e-05, "epoch": 4.62, "percentage": 46.2, "elapsed_time": "4:36:35", "remaining_time": "5:22:05"}
232
+ {"current_steps": 2320, "total_steps": 5000, "loss": 0.4791, "lr": 1.3003728711725364e-05, "epoch": 4.64, "percentage": 46.4, "elapsed_time": "4:37:46", "remaining_time": "5:20:52"}
233
+ {"current_steps": 2330, "total_steps": 5000, "loss": 0.5586, "lr": 1.2937066726236029e-05, "epoch": 4.66, "percentage": 46.6, "elapsed_time": "4:38:58", "remaining_time": "5:19:40"}
234
+ {"current_steps": 2340, "total_steps": 5000, "loss": 0.4603, "lr": 1.2870261592258038e-05, "epoch": 4.68, "percentage": 46.8, "elapsed_time": "4:40:10", "remaining_time": "5:18:28"}
235
+ {"current_steps": 2350, "total_steps": 5000, "loss": 0.4618, "lr": 1.2803316565779378e-05, "epoch": 4.7, "percentage": 47.0, "elapsed_time": "4:41:21", "remaining_time": "5:17:16"}
236
+ {"current_steps": 2360, "total_steps": 5000, "loss": 0.4643, "lr": 1.2736234909606186e-05, "epoch": 4.72, "percentage": 47.2, "elapsed_time": "4:42:33", "remaining_time": "5:16:05"}
237
+ {"current_steps": 2370, "total_steps": 5000, "loss": 0.5017, "lr": 1.2669019893203758e-05, "epoch": 4.74, "percentage": 47.4, "elapsed_time": "4:43:45", "remaining_time": "5:14:53"}
238
+ {"current_steps": 2380, "total_steps": 5000, "loss": 0.451, "lr": 1.2601674792537157e-05, "epoch": 4.76, "percentage": 47.6, "elapsed_time": "4:44:56", "remaining_time": "5:13:40"}
239
+ {"current_steps": 2390, "total_steps": 5000, "loss": 0.4763, "lr": 1.2534202889911584e-05, "epoch": 4.78, "percentage": 47.8, "elapsed_time": "4:46:09", "remaining_time": "5:12:29"}
240
+ {"current_steps": 2400, "total_steps": 5000, "loss": 0.4984, "lr": 1.2466607473812386e-05, "epoch": 4.8, "percentage": 48.0, "elapsed_time": "4:47:21", "remaining_time": "5:11:18"}
241
+ {"current_steps": 2410, "total_steps": 5000, "loss": 0.4594, "lr": 1.2398891838744777e-05, "epoch": 4.82, "percentage": 48.2, "elapsed_time": "4:48:34", "remaining_time": "5:10:07"}
242
+ {"current_steps": 2420, "total_steps": 5000, "loss": 0.476, "lr": 1.233105928507328e-05, "epoch": 4.84, "percentage": 48.4, "elapsed_time": "4:49:46", "remaining_time": "5:08:56"}
243
+ {"current_steps": 2430, "total_steps": 5000, "loss": 0.4599, "lr": 1.226311311886086e-05, "epoch": 4.86, "percentage": 48.6, "elapsed_time": "4:50:58", "remaining_time": "5:07:44"}
244
+ {"current_steps": 2440, "total_steps": 5000, "loss": 0.4786, "lr": 1.2195056651707806e-05, "epoch": 4.88, "percentage": 48.8, "elapsed_time": "4:52:10", "remaining_time": "5:06:32"}
245
+ {"current_steps": 2450, "total_steps": 5000, "loss": 0.539, "lr": 1.2126893200590309e-05, "epoch": 4.9, "percentage": 49.0, "elapsed_time": "4:53:22", "remaining_time": "5:05:20"}
246
+ {"current_steps": 2460, "total_steps": 5000, "loss": 0.442, "lr": 1.2058626087698814e-05, "epoch": 4.92, "percentage": 49.2, "elapsed_time": "4:54:32", "remaining_time": "5:04:07"}
247
+ {"current_steps": 2470, "total_steps": 5000, "loss": 0.4829, "lr": 1.1990258640276094e-05, "epoch": 4.9399999999999995, "percentage": 49.4, "elapsed_time": "4:55:44", "remaining_time": "5:02:55"}
248
+ {"current_steps": 2480, "total_steps": 5000, "loss": 0.5055, "lr": 1.1921794190455082e-05, "epoch": 4.96, "percentage": 49.6, "elapsed_time": "4:56:56", "remaining_time": "5:01:44"}
249
+ {"current_steps": 2490, "total_steps": 5000, "loss": 0.4857, "lr": 1.1853236075096474e-05, "epoch": 4.98, "percentage": 49.8, "elapsed_time": "4:58:08", "remaining_time": "5:00:32"}
250
+ {"current_steps": 2500, "total_steps": 5000, "loss": 0.4962, "lr": 1.1784587635626095e-05, "epoch": 5.0, "percentage": 50.0, "elapsed_time": "4:59:20", "remaining_time": "4:59:20"}
251
+ {"current_steps": 2510, "total_steps": 5000, "loss": 0.4185, "lr": 1.171585221787203e-05, "epoch": 5.02, "percentage": 50.2, "elapsed_time": "5:00:33", "remaining_time": "4:58:09"}
252
+ {"current_steps": 2520, "total_steps": 5000, "loss": 0.4545, "lr": 1.1647033171901573e-05, "epoch": 5.04, "percentage": 50.4, "elapsed_time": "5:01:45", "remaining_time": "4:56:58"}
253
+ {"current_steps": 2530, "total_steps": 5000, "loss": 0.4162, "lr": 1.157813385185794e-05, "epoch": 5.06, "percentage": 50.6, "elapsed_time": "5:02:56", "remaining_time": "4:55:45"}
254
+ {"current_steps": 2540, "total_steps": 5000, "loss": 0.425, "lr": 1.1509157615796775e-05, "epoch": 5.08, "percentage": 50.8, "elapsed_time": "5:04:07", "remaining_time": "4:54:33"}
255
+ {"current_steps": 2550, "total_steps": 5000, "loss": 0.4514, "lr": 1.1440107825522522e-05, "epoch": 5.1, "percentage": 51.0, "elapsed_time": "5:05:19", "remaining_time": "4:53:21"}
256
+ {"current_steps": 2560, "total_steps": 5000, "loss": 0.4687, "lr": 1.1370987846424547e-05, "epoch": 5.12, "percentage": 51.2, "elapsed_time": "5:06:31", "remaining_time": "4:52:09"}
257
+ {"current_steps": 2570, "total_steps": 5000, "loss": 0.4892, "lr": 1.1301801047313106e-05, "epoch": 5.14, "percentage": 51.4, "elapsed_time": "5:07:43", "remaining_time": "4:50:57"}
258
+ {"current_steps": 2580, "total_steps": 5000, "loss": 0.4675, "lr": 1.1232550800255188e-05, "epoch": 5.16, "percentage": 51.6, "elapsed_time": "5:08:55", "remaining_time": "4:49:45"}
259
+ {"current_steps": 2590, "total_steps": 5000, "loss": 0.4336, "lr": 1.1163240480410136e-05, "epoch": 5.18, "percentage": 51.8, "elapsed_time": "5:10:06", "remaining_time": "4:48:33"}
260
+ {"current_steps": 2600, "total_steps": 5000, "loss": 0.4358, "lr": 1.1093873465865156e-05, "epoch": 5.2, "percentage": 52.0, "elapsed_time": "5:11:17", "remaining_time": "4:47:20"}
261
+ {"current_steps": 2610, "total_steps": 5000, "loss": 0.4709, "lr": 1.1024453137470677e-05, "epoch": 5.22, "percentage": 52.2, "elapsed_time": "5:12:30", "remaining_time": "4:46:10"}
262
+ {"current_steps": 2620, "total_steps": 5000, "loss": 0.4349, "lr": 1.0954982878675564e-05, "epoch": 5.24, "percentage": 52.4, "elapsed_time": "5:13:42", "remaining_time": "4:44:58"}
263
+ {"current_steps": 2630, "total_steps": 5000, "loss": 0.4581, "lr": 1.0885466075362224e-05, "epoch": 5.26, "percentage": 52.6, "elapsed_time": "5:14:53", "remaining_time": "4:43:45"}
264
+ {"current_steps": 2640, "total_steps": 5000, "loss": 0.4482, "lr": 1.0815906115681579e-05, "epoch": 5.28, "percentage": 52.8, "elapsed_time": "5:16:05", "remaining_time": "4:42:34"}
265
+ {"current_steps": 2650, "total_steps": 5000, "loss": 0.4771, "lr": 1.0746306389887924e-05, "epoch": 5.3, "percentage": 53.0, "elapsed_time": "5:17:17", "remaining_time": "4:41:22"}
266
+ {"current_steps": 2660, "total_steps": 5000, "loss": 0.4893, "lr": 1.067667029017371e-05, "epoch": 5.32, "percentage": 53.2, "elapsed_time": "5:18:28", "remaining_time": "4:40:09"}
267
+ {"current_steps": 2670, "total_steps": 5000, "loss": 0.4823, "lr": 1.060700121050419e-05, "epoch": 5.34, "percentage": 53.4, "elapsed_time": "5:19:40", "remaining_time": "4:38:57"}
268
+ {"current_steps": 2680, "total_steps": 5000, "loss": 0.45, "lr": 1.0537302546452022e-05, "epoch": 5.36, "percentage": 53.6, "elapsed_time": "5:20:52", "remaining_time": "4:37:46"}
269
+ {"current_steps": 2690, "total_steps": 5000, "loss": 0.4817, "lr": 1.0467577695031763e-05, "epoch": 5.38, "percentage": 53.8, "elapsed_time": "5:22:04", "remaining_time": "4:36:34"}
270
+ {"current_steps": 2700, "total_steps": 5000, "loss": 0.4472, "lr": 1.03978300545343e-05, "epoch": 5.4, "percentage": 54.0, "elapsed_time": "5:23:15", "remaining_time": "4:35:22"}
271
+ {"current_steps": 2710, "total_steps": 5000, "loss": 0.4351, "lr": 1.0328063024361232e-05, "epoch": 5.42, "percentage": 54.2, "elapsed_time": "5:24:28", "remaining_time": "4:34:11"}
272
+ {"current_steps": 2720, "total_steps": 5000, "loss": 0.4052, "lr": 1.0258280004859189e-05, "epoch": 5.44, "percentage": 54.4, "elapsed_time": "5:25:40", "remaining_time": "4:32:59"}
273
+ {"current_steps": 2730, "total_steps": 5000, "loss": 0.51, "lr": 1.0188484397154083e-05, "epoch": 5.46, "percentage": 54.6, "elapsed_time": "5:26:52", "remaining_time": "4:31:48"}
274
+ {"current_steps": 2740, "total_steps": 5000, "loss": 0.4678, "lr": 1.0118679602985373e-05, "epoch": 5.48, "percentage": 54.8, "elapsed_time": "5:28:04", "remaining_time": "4:30:35"}
275
+ {"current_steps": 2750, "total_steps": 5000, "loss": 0.4802, "lr": 1.0048869024540247e-05, "epoch": 5.5, "percentage": 55.0, "elapsed_time": "5:29:15", "remaining_time": "4:29:23"}
276
+ {"current_steps": 2760, "total_steps": 5000, "loss": 0.4416, "lr": 9.979056064287807e-06, "epoch": 5.52, "percentage": 55.2, "elapsed_time": "5:30:26", "remaining_time": "4:28:11"}
277
+ {"current_steps": 2770, "total_steps": 5000, "loss": 0.4613, "lr": 9.909244124813246e-06, "epoch": 5.54, "percentage": 55.4, "elapsed_time": "5:31:38", "remaining_time": "4:26:59"}
278
+ {"current_steps": 2780, "total_steps": 5000, "loss": 0.4163, "lr": 9.839436608652007e-06, "epoch": 5.5600000000000005, "percentage": 55.6, "elapsed_time": "5:32:50", "remaining_time": "4:25:47"}
279
+ {"current_steps": 2790, "total_steps": 5000, "loss": 0.4753, "lr": 9.76963691812394e-06, "epoch": 5.58, "percentage": 55.8, "elapsed_time": "5:34:03", "remaining_time": "4:24:36"}
280
+ {"current_steps": 2800, "total_steps": 5000, "loss": 0.4725, "lr": 9.699848455167489e-06, "epoch": 5.6, "percentage": 56.0, "elapsed_time": "5:35:15", "remaining_time": "4:23:24"}
281
+ {"current_steps": 2810, "total_steps": 5000, "loss": 0.4521, "lr": 9.630074621173882e-06, "epoch": 5.62, "percentage": 56.2, "elapsed_time": "5:36:28", "remaining_time": "4:22:14"}
282
+ {"current_steps": 2820, "total_steps": 5000, "loss": 0.3838, "lr": 9.560318816821354e-06, "epoch": 5.64, "percentage": 56.4, "elapsed_time": "5:37:39", "remaining_time": "4:21:01"}
283
+ {"current_steps": 2830, "total_steps": 5000, "loss": 0.4603, "lr": 9.490584441909392e-06, "epoch": 5.66, "percentage": 56.6, "elapsed_time": "5:38:50", "remaining_time": "4:19:49"}
284
+ {"current_steps": 2840, "total_steps": 5000, "loss": 0.4538, "lr": 9.420874895193056e-06, "epoch": 5.68, "percentage": 56.8, "elapsed_time": "5:40:03", "remaining_time": "4:18:37"}
285
+ {"current_steps": 2850, "total_steps": 5000, "loss": 0.4527, "lr": 9.351193574217305e-06, "epoch": 5.7, "percentage": 57.0, "elapsed_time": "5:41:14", "remaining_time": "4:17:25"}
286
+ {"current_steps": 2860, "total_steps": 5000, "loss": 0.4915, "lr": 9.281543875151419e-06, "epoch": 5.72, "percentage": 57.2, "elapsed_time": "5:42:27", "remaining_time": "4:16:14"}
287
+ {"current_steps": 2870, "total_steps": 5000, "loss": 0.4338, "lr": 9.211929192623466e-06, "epoch": 5.74, "percentage": 57.4, "elapsed_time": "5:43:38", "remaining_time": "4:15:02"}
288
+ {"current_steps": 2880, "total_steps": 5000, "loss": 0.4573, "lr": 9.142352919554862e-06, "epoch": 5.76, "percentage": 57.6, "elapsed_time": "5:44:49", "remaining_time": "4:13:49"}
289
+ {"current_steps": 2890, "total_steps": 5000, "loss": 0.4494, "lr": 9.072818446995e-06, "epoch": 5.78, "percentage": 57.8, "elapsed_time": "5:46:01", "remaining_time": "4:12:38"}
290
+ {"current_steps": 2900, "total_steps": 5000, "loss": 0.5061, "lr": 9.003329163955973e-06, "epoch": 5.8, "percentage": 58.0, "elapsed_time": "5:47:13", "remaining_time": "4:11:26"}
291
+ {"current_steps": 2910, "total_steps": 5000, "loss": 0.4537, "lr": 8.933888457247402e-06, "epoch": 5.82, "percentage": 58.2, "elapsed_time": "5:48:27", "remaining_time": "4:10:15"}
292
+ {"current_steps": 2920, "total_steps": 5000, "loss": 0.4764, "lr": 8.864499711311362e-06, "epoch": 5.84, "percentage": 58.4, "elapsed_time": "5:49:38", "remaining_time": "4:09:03"}
293
+ {"current_steps": 2930, "total_steps": 5000, "loss": 0.4563, "lr": 8.79516630805745e-06, "epoch": 5.86, "percentage": 58.6, "elapsed_time": "5:50:50", "remaining_time": "4:07:51"}
294
+ {"current_steps": 2940, "total_steps": 5000, "loss": 0.4887, "lr": 8.725891626697912e-06, "epoch": 5.88, "percentage": 58.8, "elapsed_time": "5:52:03", "remaining_time": "4:06:40"}
295
+ {"current_steps": 2950, "total_steps": 5000, "loss": 0.446, "lr": 8.656679043582986e-06, "epoch": 5.9, "percentage": 59.0, "elapsed_time": "5:53:13", "remaining_time": "4:05:28"}
296
+ {"current_steps": 2960, "total_steps": 5000, "loss": 0.4533, "lr": 8.587531932036334e-06, "epoch": 5.92, "percentage": 59.2, "elapsed_time": "5:54:26", "remaining_time": "4:04:16"}
297
+ {"current_steps": 2970, "total_steps": 5000, "loss": 0.4451, "lr": 8.518453662190622e-06, "epoch": 5.9399999999999995, "percentage": 59.4, "elapsed_time": "5:55:37", "remaining_time": "4:03:04"}
298
+ {"current_steps": 2980, "total_steps": 5000, "loss": 0.393, "lr": 8.449447600823262e-06, "epoch": 5.96, "percentage": 59.6, "elapsed_time": "5:56:49", "remaining_time": "4:01:52"}
299
+ {"current_steps": 2990, "total_steps": 5000, "loss": 0.4406, "lr": 8.380517111192336e-06, "epoch": 5.98, "percentage": 59.8, "elapsed_time": "5:58:00", "remaining_time": "4:00:39"}
300
+ {"current_steps": 3000, "total_steps": 5000, "loss": 0.474, "lr": 8.311665552872662e-06, "epoch": 6.0, "percentage": 60.0, "elapsed_time": "5:59:11", "remaining_time": "3:59:27"}
301
+ {"current_steps": 3010, "total_steps": 5000, "loss": 0.3953, "lr": 8.242896281592057e-06, "epoch": 6.02, "percentage": 60.2, "elapsed_time": "6:00:25", "remaining_time": "3:58:17"}
302
+ {"current_steps": 3020, "total_steps": 5000, "loss": 0.4117, "lr": 8.174212649067781e-06, "epoch": 6.04, "percentage": 60.4, "elapsed_time": "6:01:37", "remaining_time": "3:57:05"}
303
+ {"current_steps": 3030, "total_steps": 5000, "loss": 0.3988, "lr": 8.10561800284319e-06, "epoch": 6.06, "percentage": 60.6, "elapsed_time": "6:02:48", "remaining_time": "3:55:52"}
304
+ {"current_steps": 3040, "total_steps": 5000, "loss": 0.418, "lr": 8.037115686124564e-06, "epoch": 6.08, "percentage": 60.8, "elapsed_time": "6:04:00", "remaining_time": "3:54:41"}
305
+ {"current_steps": 3050, "total_steps": 5000, "loss": 0.4084, "lr": 7.96870903761818e-06, "epoch": 6.1, "percentage": 61.0, "elapsed_time": "6:05:11", "remaining_time": "3:53:29"}
306
+ {"current_steps": 3060, "total_steps": 5000, "loss": 0.3739, "lr": 7.900401391367576e-06, "epoch": 6.12, "percentage": 61.2, "elapsed_time": "6:06:22", "remaining_time": "3:52:16"}
307
+ {"current_steps": 3070, "total_steps": 5000, "loss": 0.3763, "lr": 7.832196076591067e-06, "epoch": 6.14, "percentage": 61.4, "elapsed_time": "6:07:33", "remaining_time": "3:51:04"}
308
+ {"current_steps": 3080, "total_steps": 5000, "loss": 0.4522, "lr": 7.76409641751947e-06, "epoch": 6.16, "percentage": 61.6, "elapsed_time": "6:08:45", "remaining_time": "3:49:52"}
309
+ {"current_steps": 3090, "total_steps": 5000, "loss": 0.4661, "lr": 7.696105733234099e-06, "epoch": 6.18, "percentage": 61.8, "elapsed_time": "6:09:57", "remaining_time": "3:48:41"}
310
+ {"current_steps": 3100, "total_steps": 5000, "loss": 0.4519, "lr": 7.628227337504972e-06, "epoch": 6.2, "percentage": 62.0, "elapsed_time": "6:11:09", "remaining_time": "3:47:29"}
311
+ {"current_steps": 3110, "total_steps": 5000, "loss": 0.4073, "lr": 7.560464538629345e-06, "epoch": 6.22, "percentage": 62.2, "elapsed_time": "6:12:23", "remaining_time": "3:46:18"}
312
+ {"current_steps": 3120, "total_steps": 5000, "loss": 0.4458, "lr": 7.492820639270435e-06, "epoch": 6.24, "percentage": 62.4, "elapsed_time": "6:13:35", "remaining_time": "3:45:06"}
313
+ {"current_steps": 3130, "total_steps": 5000, "loss": 0.3703, "lr": 7.4252989362964635e-06, "epoch": 6.26, "percentage": 62.6, "elapsed_time": "6:14:47", "remaining_time": "3:43:54"}
314
+ {"current_steps": 3140, "total_steps": 5000, "loss": 0.4393, "lr": 7.357902720619976e-06, "epoch": 6.28, "percentage": 62.8, "elapsed_time": "6:15:59", "remaining_time": "3:42:43"}
315
+ {"current_steps": 3150, "total_steps": 5000, "loss": 0.437, "lr": 7.290635277037442e-06, "epoch": 6.3, "percentage": 63.0, "elapsed_time": "6:17:11", "remaining_time": "3:41:31"}
316
+ {"current_steps": 3160, "total_steps": 5000, "loss": 0.3979, "lr": 7.22349988406916e-06, "epoch": 6.32, "percentage": 63.2, "elapsed_time": "6:18:23", "remaining_time": "3:40:19"}
317
+ {"current_steps": 3170, "total_steps": 5000, "loss": 0.4078, "lr": 7.156499813799477e-06, "epoch": 6.34, "percentage": 63.4, "elapsed_time": "6:19:35", "remaining_time": "3:39:07"}
318
+ {"current_steps": 3180, "total_steps": 5000, "loss": 0.4182, "lr": 7.0896383317172845e-06, "epoch": 6.36, "percentage": 63.6, "elapsed_time": "6:20:47", "remaining_time": "3:37:55"}
319
+ {"current_steps": 3190, "total_steps": 5000, "loss": 0.4239, "lr": 7.022918696556896e-06, "epoch": 6.38, "percentage": 63.8, "elapsed_time": "6:21:59", "remaining_time": "3:36:44"}
320
+ {"current_steps": 3200, "total_steps": 5000, "loss": 0.4369, "lr": 6.956344160139201e-06, "epoch": 6.4, "percentage": 64.0, "elapsed_time": "6:23:11", "remaining_time": "3:35:32"}
321
+ {"current_steps": 3210, "total_steps": 5000, "loss": 0.4469, "lr": 6.889917967213184e-06, "epoch": 6.42, "percentage": 64.2, "elapsed_time": "6:24:25", "remaining_time": "3:34:22"}
322
+ {"current_steps": 3220, "total_steps": 5000, "loss": 0.4312, "lr": 6.823643355297774e-06, "epoch": 6.44, "percentage": 64.4, "elapsed_time": "6:25:37", "remaining_time": "3:33:10"}
323
+ {"current_steps": 3230, "total_steps": 5000, "loss": 0.4465, "lr": 6.757523554524056e-06, "epoch": 6.46, "percentage": 64.6, "elapsed_time": "6:26:50", "remaining_time": "3:31:59"}
324
+ {"current_steps": 3240, "total_steps": 5000, "loss": 0.4201, "lr": 6.69156178747784e-06, "epoch": 6.48, "percentage": 64.8, "elapsed_time": "6:28:01", "remaining_time": "3:30:46"}
325
+ {"current_steps": 3250, "total_steps": 5000, "loss": 0.3882, "lr": 6.62576126904259e-06, "epoch": 6.5, "percentage": 65.0, "elapsed_time": "6:29:12", "remaining_time": "3:29:34"}
326
+ {"current_steps": 3260, "total_steps": 5000, "loss": 0.4448, "lr": 6.560125206242746e-06, "epoch": 6.52, "percentage": 65.2, "elapsed_time": "6:30:24", "remaining_time": "3:28:22"}
327
+ {"current_steps": 3270, "total_steps": 5000, "loss": 0.3915, "lr": 6.494656798087412e-06, "epoch": 6.54, "percentage": 65.4, "elapsed_time": "6:31:35", "remaining_time": "3:27:10"}
328
+ {"current_steps": 3280, "total_steps": 5000, "loss": 0.3769, "lr": 6.4293592354144365e-06, "epoch": 6.5600000000000005, "percentage": 65.6, "elapsed_time": "6:32:47", "remaining_time": "3:25:58"}
329
+ {"current_steps": 3290, "total_steps": 5000, "loss": 0.4503, "lr": 6.364235700734903e-06, "epoch": 6.58, "percentage": 65.8, "elapsed_time": "6:33:59", "remaining_time": "3:24:46"}
330
+ {"current_steps": 3300, "total_steps": 5000, "loss": 0.4398, "lr": 6.299289368078016e-06, "epoch": 6.6, "percentage": 66.0, "elapsed_time": "6:35:10", "remaining_time": "3:23:34"}
331
+ {"current_steps": 3310, "total_steps": 5000, "loss": 0.4199, "lr": 6.234523402836408e-06, "epoch": 6.62, "percentage": 66.2, "elapsed_time": "6:36:23", "remaining_time": "3:22:23"}
332
+ {"current_steps": 3320, "total_steps": 5000, "loss": 0.4574, "lr": 6.169940961611853e-06, "epoch": 6.64, "percentage": 66.4, "elapsed_time": "6:37:36", "remaining_time": "3:21:11"}
333
+ {"current_steps": 3330, "total_steps": 5000, "loss": 0.4252, "lr": 6.1055451920614165e-06, "epoch": 6.66, "percentage": 66.6, "elapsed_time": "6:38:47", "remaining_time": "3:19:59"}
334
+ {"current_steps": 3340, "total_steps": 5000, "loss": 0.4069, "lr": 6.0413392327440635e-06, "epoch": 6.68, "percentage": 66.8, "elapsed_time": "6:39:59", "remaining_time": "3:18:47"}
335
+ {"current_steps": 3350, "total_steps": 5000, "loss": 0.4173, "lr": 5.977326212967671e-06, "epoch": 6.7, "percentage": 67.0, "elapsed_time": "6:41:11", "remaining_time": "3:17:36"}
336
+ {"current_steps": 3360, "total_steps": 5000, "loss": 0.3737, "lr": 5.913509252636511e-06, "epoch": 6.72, "percentage": 67.2, "elapsed_time": "6:42:22", "remaining_time": "3:16:23"}
337
+ {"current_steps": 3370, "total_steps": 5000, "loss": 0.437, "lr": 5.849891462099199e-06, "epoch": 6.74, "percentage": 67.4, "elapsed_time": "6:43:34", "remaining_time": "3:15:11"}
338
+ {"current_steps": 3380, "total_steps": 5000, "loss": 0.4457, "lr": 5.786475941997094e-06, "epoch": 6.76, "percentage": 67.6, "elapsed_time": "6:44:46", "remaining_time": "3:14:00"}
339
+ {"current_steps": 3390, "total_steps": 5000, "loss": 0.3989, "lr": 5.723265783113181e-06, "epoch": 6.78, "percentage": 67.8, "elapsed_time": "6:45:58", "remaining_time": "3:12:48"}
340
+ {"current_steps": 3400, "total_steps": 5000, "loss": 0.4314, "lr": 5.660264066221426e-06, "epoch": 6.8, "percentage": 68.0, "elapsed_time": "6:47:10", "remaining_time": "3:11:36"}
341
+ {"current_steps": 3410, "total_steps": 5000, "loss": 0.3989, "lr": 5.59747386193663e-06, "epoch": 6.82, "percentage": 68.2, "elapsed_time": "6:48:23", "remaining_time": "3:10:25"}
342
+ {"current_steps": 3420, "total_steps": 5000, "loss": 0.3792, "lr": 5.534898230564765e-06, "epoch": 6.84, "percentage": 68.4, "elapsed_time": "6:49:34", "remaining_time": "3:09:13"}
343
+ {"current_steps": 3430, "total_steps": 5000, "loss": 0.4115, "lr": 5.472540221953824e-06, "epoch": 6.86, "percentage": 68.6, "elapsed_time": "6:50:45", "remaining_time": "3:08:01"}
344
+ {"current_steps": 3440, "total_steps": 5000, "loss": 0.4067, "lr": 5.41040287534517e-06, "epoch": 6.88, "percentage": 68.8, "elapsed_time": "6:51:57", "remaining_time": "3:06:49"}
345
+ {"current_steps": 3450, "total_steps": 5000, "loss": 0.4424, "lr": 5.348489219225417e-06, "epoch": 6.9, "percentage": 69.0, "elapsed_time": "6:53:08", "remaining_time": "3:05:37"}
346
+ {"current_steps": 3460, "total_steps": 5000, "loss": 0.4508, "lr": 5.286802271178815e-06, "epoch": 6.92, "percentage": 69.2, "elapsed_time": "6:54:21", "remaining_time": "3:04:25"}
347
+ {"current_steps": 3470, "total_steps": 5000, "loss": 0.3984, "lr": 5.225345037740186e-06, "epoch": 6.9399999999999995, "percentage": 69.4, "elapsed_time": "6:55:32", "remaining_time": "3:03:13"}
348
+ {"current_steps": 3480, "total_steps": 5000, "loss": 0.4499, "lr": 5.16412051424839e-06, "epoch": 6.96, "percentage": 69.6, "elapsed_time": "6:56:43", "remaining_time": "3:02:00"}
349
+ {"current_steps": 3490, "total_steps": 5000, "loss": 0.4154, "lr": 5.103131684700315e-06, "epoch": 6.98, "percentage": 69.8, "elapsed_time": "6:57:54", "remaining_time": "3:00:49"}
350
+ {"current_steps": 3500, "total_steps": 5000, "loss": 0.391, "lr": 5.042381521605473e-06, "epoch": 7.0, "percentage": 70.0, "elapsed_time": "6:59:06", "remaining_time": "2:59:37"}
351
+ {"current_steps": 3510, "total_steps": 5000, "loss": 0.38, "lr": 4.981872985841115e-06, "epoch": 7.02, "percentage": 70.2, "elapsed_time": "7:00:19", "remaining_time": "2:58:25"}
352
+ {"current_steps": 3520, "total_steps": 5000, "loss": 0.3478, "lr": 4.921609026507907e-06, "epoch": 7.04, "percentage": 70.4, "elapsed_time": "7:01:30", "remaining_time": "2:57:13"}
353
+ {"current_steps": 3530, "total_steps": 5000, "loss": 0.3712, "lr": 4.861592580786205e-06, "epoch": 7.06, "percentage": 70.6, "elapsed_time": "7:02:41", "remaining_time": "2:56:01"}
354
+ {"current_steps": 3540, "total_steps": 5000, "loss": 0.3801, "lr": 4.801826573792905e-06, "epoch": 7.08, "percentage": 70.8, "elapsed_time": "7:03:53", "remaining_time": "2:54:49"}
355
+ {"current_steps": 3550, "total_steps": 5000, "loss": 0.3881, "lr": 4.7423139184388725e-06, "epoch": 7.1, "percentage": 71.0, "elapsed_time": "7:05:05", "remaining_time": "2:53:37"}
356
+ {"current_steps": 3560, "total_steps": 5000, "loss": 0.3489, "lr": 4.6830575152869615e-06, "epoch": 7.12, "percentage": 71.2, "elapsed_time": "7:06:17", "remaining_time": "2:52:25"}
357
+ {"current_steps": 3570, "total_steps": 5000, "loss": 0.4284, "lr": 4.62406025241067e-06, "epoch": 7.14, "percentage": 71.4, "elapsed_time": "7:07:28", "remaining_time": "2:51:13"}
358
+ {"current_steps": 3580, "total_steps": 5000, "loss": 0.4055, "lr": 4.565325005253356e-06, "epoch": 7.16, "percentage": 71.6, "elapsed_time": "7:08:40", "remaining_time": "2:50:01"}
359
+ {"current_steps": 3590, "total_steps": 5000, "loss": 0.3627, "lr": 4.506854636488103e-06, "epoch": 7.18, "percentage": 71.8, "elapsed_time": "7:09:51", "remaining_time": "2:48:49"}
360
+ {"current_steps": 3600, "total_steps": 5000, "loss": 0.3866, "lr": 4.44865199587819e-06, "epoch": 7.2, "percentage": 72.0, "elapsed_time": "7:11:03", "remaining_time": "2:47:37"}
361
+ {"current_steps": 3610, "total_steps": 5000, "loss": 0.3947, "lr": 4.39071992013822e-06, "epoch": 7.22, "percentage": 72.2, "elapsed_time": "7:12:17", "remaining_time": "2:46:27"}
362
+ {"current_steps": 3620, "total_steps": 5000, "loss": 0.4266, "lr": 4.3330612327958265e-06, "epoch": 7.24, "percentage": 72.4, "elapsed_time": "7:13:30", "remaining_time": "2:45:15"}
363
+ {"current_steps": 3630, "total_steps": 5000, "loss": 0.3495, "lr": 4.275678744054094e-06, "epoch": 7.26, "percentage": 72.6, "elapsed_time": "7:14:41", "remaining_time": "2:44:03"}
364
+ {"current_steps": 3640, "total_steps": 5000, "loss": 0.4153, "lr": 4.218575250654559e-06, "epoch": 7.28, "percentage": 72.8, "elapsed_time": "7:15:54", "remaining_time": "2:42:52"}
365
+ {"current_steps": 3650, "total_steps": 5000, "loss": 0.3819, "lr": 4.161753535740932e-06, "epoch": 7.3, "percentage": 73.0, "elapsed_time": "7:17:06", "remaining_time": "2:41:40"}
366
+ {"current_steps": 3660, "total_steps": 5000, "loss": 0.4032, "lr": 4.105216368723437e-06, "epoch": 7.32, "percentage": 73.2, "elapsed_time": "7:18:18", "remaining_time": "2:40:28"}
367
+ {"current_steps": 3670, "total_steps": 5000, "loss": 0.358, "lr": 4.048966505143831e-06, "epoch": 7.34, "percentage": 73.4, "elapsed_time": "7:19:30", "remaining_time": "2:39:16"}
368
+ {"current_steps": 3680, "total_steps": 5000, "loss": 0.4101, "lr": 3.993006686541108e-06, "epoch": 7.36, "percentage": 73.6, "elapsed_time": "7:20:41", "remaining_time": "2:38:04"}
369
+ {"current_steps": 3690, "total_steps": 5000, "loss": 0.3803, "lr": 3.937339640317879e-06, "epoch": 7.38, "percentage": 73.8, "elapsed_time": "7:21:54", "remaining_time": "2:36:52"}
370
+ {"current_steps": 3700, "total_steps": 5000, "loss": 0.3844, "lr": 3.88196807960744e-06, "epoch": 7.4, "percentage": 74.0, "elapsed_time": "7:23:06", "remaining_time": "2:35:41"}
371
+ {"current_steps": 3710, "total_steps": 5000, "loss": 0.3536, "lr": 3.826894703141552e-06, "epoch": 7.42, "percentage": 74.2, "elapsed_time": "7:24:19", "remaining_time": "2:34:29"}
372
+ {"current_steps": 3720, "total_steps": 5000, "loss": 0.3957, "lr": 3.772122195118877e-06, "epoch": 7.44, "percentage": 74.4, "elapsed_time": "7:25:31", "remaining_time": "2:33:17"}
373
+ {"current_steps": 3730, "total_steps": 5000, "loss": 0.4308, "lr": 3.7176532250741857e-06, "epoch": 7.46, "percentage": 74.6, "elapsed_time": "7:26:44", "remaining_time": "2:32:06"}
374
+ {"current_steps": 3740, "total_steps": 5000, "loss": 0.3988, "lr": 3.663490447748236e-06, "epoch": 7.48, "percentage": 74.8, "elapsed_time": "7:27:56", "remaining_time": "2:30:54"}
375
+ {"current_steps": 3750, "total_steps": 5000, "loss": 0.3983, "lr": 3.6096365029583803e-06, "epoch": 7.5, "percentage": 75.0, "elapsed_time": "7:29:08", "remaining_time": "2:29:42"}
376
+ {"current_steps": 3760, "total_steps": 5000, "loss": 0.37, "lr": 3.5560940154699133e-06, "epoch": 7.52, "percentage": 75.2, "elapsed_time": "7:30:19", "remaining_time": "2:28:30"}
377
+ {"current_steps": 3770, "total_steps": 5000, "loss": 0.3645, "lr": 3.502865594868136e-06, "epoch": 7.54, "percentage": 75.4, "elapsed_time": "7:31:30", "remaining_time": "2:27:18"}
378
+ {"current_steps": 3780, "total_steps": 5000, "loss": 0.4179, "lr": 3.4499538354311757e-06, "epoch": 7.5600000000000005, "percentage": 75.6, "elapsed_time": "7:32:42", "remaining_time": "2:26:06"}
379
+ {"current_steps": 3790, "total_steps": 5000, "loss": 0.324, "lr": 3.397361316003539e-06, "epoch": 7.58, "percentage": 75.8, "elapsed_time": "7:33:54", "remaining_time": "2:24:54"}
380
+ {"current_steps": 3800, "total_steps": 5000, "loss": 0.3789, "lr": 3.3450905998704274e-06, "epoch": 7.6, "percentage": 76.0, "elapsed_time": "7:35:06", "remaining_time": "2:23:42"}
381
+ {"current_steps": 3810, "total_steps": 5000, "loss": 0.3608, "lr": 3.2931442346328e-06, "epoch": 7.62, "percentage": 76.2, "elapsed_time": "7:36:19", "remaining_time": "2:22:31"}
382
+ {"current_steps": 3820, "total_steps": 5000, "loss": 0.3985, "lr": 3.241524752083215e-06, "epoch": 7.64, "percentage": 76.4, "elapsed_time": "7:37:30", "remaining_time": "2:21:19"}
383
+ {"current_steps": 3830, "total_steps": 5000, "loss": 0.3447, "lr": 3.190234668082427e-06, "epoch": 7.66, "percentage": 76.6, "elapsed_time": "7:38:42", "remaining_time": "2:20:07"}
384
+ {"current_steps": 3840, "total_steps": 5000, "loss": 0.3426, "lr": 3.1392764824367706e-06, "epoch": 7.68, "percentage": 76.8, "elapsed_time": "7:39:53", "remaining_time": "2:18:55"}
385
+ {"current_steps": 3850, "total_steps": 5000, "loss": 0.3576, "lr": 3.0886526787763237e-06, "epoch": 7.7, "percentage": 77.0, "elapsed_time": "7:41:05", "remaining_time": "2:17:43"}
386
+ {"current_steps": 3860, "total_steps": 5000, "loss": 0.3928, "lr": 3.038365724433858e-06, "epoch": 7.72, "percentage": 77.2, "elapsed_time": "7:42:16", "remaining_time": "2:16:31"}
387
+ {"current_steps": 3870, "total_steps": 5000, "loss": 0.3589, "lr": 2.988418070324577e-06, "epoch": 7.74, "percentage": 77.4, "elapsed_time": "7:43:28", "remaining_time": "2:15:19"}
388
+ {"current_steps": 3880, "total_steps": 5000, "loss": 0.3851, "lr": 2.938812150826684e-06, "epoch": 7.76, "percentage": 77.6, "elapsed_time": "7:44:40", "remaining_time": "2:14:07"}
389
+ {"current_steps": 3890, "total_steps": 5000, "loss": 0.3688, "lr": 2.8895503836627105e-06, "epoch": 7.78, "percentage": 77.8, "elapsed_time": "7:45:51", "remaining_time": "2:12:55"}
390
+ {"current_steps": 3900, "total_steps": 5000, "loss": 0.3585, "lr": 2.840635169781688e-06, "epoch": 7.8, "percentage": 78.0, "elapsed_time": "7:47:03", "remaining_time": "2:11:44"}
391
+ {"current_steps": 3910, "total_steps": 5000, "loss": 0.3653, "lr": 2.7920688932421337e-06, "epoch": 7.82, "percentage": 78.2, "elapsed_time": "7:48:16", "remaining_time": "2:10:32"}
392
+ {"current_steps": 3920, "total_steps": 5000, "loss": 0.3512, "lr": 2.7438539210958483e-06, "epoch": 7.84, "percentage": 78.4, "elapsed_time": "7:49:28", "remaining_time": "2:09:20"}
393
+ {"current_steps": 3930, "total_steps": 5000, "loss": 0.3717, "lr": 2.6959926032725537e-06, "epoch": 7.86, "percentage": 78.6, "elapsed_time": "7:50:39", "remaining_time": "2:08:08"}
394
+ {"current_steps": 3940, "total_steps": 5000, "loss": 0.3806, "lr": 2.648487272465361e-06, "epoch": 7.88, "percentage": 78.8, "elapsed_time": "7:51:51", "remaining_time": "2:06:56"}
395
+ {"current_steps": 3950, "total_steps": 5000, "loss": 0.3993, "lr": 2.6013402440170676e-06, "epoch": 7.9, "percentage": 79.0, "elapsed_time": "7:53:04", "remaining_time": "2:05:45"}
396
+ {"current_steps": 3960, "total_steps": 5000, "loss": 0.3387, "lr": 2.5545538158073278e-06, "epoch": 7.92, "percentage": 79.2, "elapsed_time": "7:54:15", "remaining_time": "2:04:33"}
397
+ {"current_steps": 3970, "total_steps": 5000, "loss": 0.37, "lr": 2.512756228659141e-06, "epoch": 7.9399999999999995, "percentage": 79.4, "elapsed_time": "7:55:26", "remaining_time": "2:03:21"}
398
+ {"current_steps": 3980, "total_steps": 5000, "loss": 0.3967, "lr": 2.4666612085261344e-06, "epoch": 7.96, "percentage": 79.6, "elapsed_time": "7:56:38", "remaining_time": "2:02:09"}
399
+ {"current_steps": 3990, "total_steps": 5000, "loss": 0.4029, "lr": 2.420933352697865e-06, "epoch": 7.98, "percentage": 79.8, "elapsed_time": "7:57:50", "remaining_time": "2:00:57"}
400
+ {"current_steps": 4000, "total_steps": 5000, "loss": 0.3713, "lr": 2.37557488988552e-06, "epoch": 8.0, "percentage": 80.0, "elapsed_time": "7:59:01", "remaining_time": "1:59:45"}
401
+ {"current_steps": 4010, "total_steps": 5000, "loss": 0.3232, "lr": 2.3305880307965834e-06, "epoch": 8.02, "percentage": 80.2, "elapsed_time": "8:00:14", "remaining_time": "1:58:33"}
402
+ {"current_steps": 4020, "total_steps": 5000, "loss": 0.331, "lr": 2.2859749680270983e-06, "epoch": 8.04, "percentage": 80.4, "elapsed_time": "8:01:26", "remaining_time": "1:57:22"}
403
+ {"current_steps": 4030, "total_steps": 5000, "loss": 0.3818, "lr": 2.241737875954808e-06, "epoch": 8.06, "percentage": 80.6, "elapsed_time": "8:02:38", "remaining_time": "1:56:10"}
404
+ {"current_steps": 4040, "total_steps": 5000, "loss": 0.3482, "lr": 2.1978789106331666e-06, "epoch": 8.08, "percentage": 80.8, "elapsed_time": "8:03:50", "remaining_time": "1:54:58"}
405
+ {"current_steps": 4050, "total_steps": 5000, "loss": 0.3195, "lr": 2.154400209686268e-06, "epoch": 8.1, "percentage": 81.0, "elapsed_time": "8:05:02", "remaining_time": "1:53:46"}
406
+ {"current_steps": 4060, "total_steps": 5000, "loss": 0.3557, "lr": 2.1113038922046603e-06, "epoch": 8.12, "percentage": 81.2, "elapsed_time": "8:06:13", "remaining_time": "1:52:34"}
407
+ {"current_steps": 4070, "total_steps": 5000, "loss": 0.2853, "lr": 2.0685920586420562e-06, "epoch": 8.14, "percentage": 81.4, "elapsed_time": "8:07:24", "remaining_time": "1:51:22"}
408
+ {"current_steps": 4080, "total_steps": 5000, "loss": 0.316, "lr": 2.026266790712965e-06, "epoch": 8.16, "percentage": 81.6, "elapsed_time": "8:08:35", "remaining_time": "1:50:10"}
409
+ {"current_steps": 4090, "total_steps": 5000, "loss": 0.3328, "lr": 1.984330151291233e-06, "epoch": 8.18, "percentage": 81.8, "elapsed_time": "8:09:47", "remaining_time": "1:48:58"}
410
+ {"current_steps": 4100, "total_steps": 5000, "loss": 0.3338, "lr": 1.9427841843095063e-06, "epoch": 8.2, "percentage": 82.0, "elapsed_time": "8:10:59", "remaining_time": "1:47:46"}
411
+ {"current_steps": 4110, "total_steps": 5000, "loss": 0.3226, "lr": 1.9016309146596024e-06, "epoch": 8.22, "percentage": 82.2, "elapsed_time": "8:12:12", "remaining_time": "1:46:35"}
412
+ {"current_steps": 4120, "total_steps": 5000, "loss": 0.3147, "lr": 1.8608723480938207e-06, "epoch": 8.24, "percentage": 82.4, "elapsed_time": "8:13:23", "remaining_time": "1:45:23"}
413
+ {"current_steps": 4130, "total_steps": 5000, "loss": 0.3549, "lr": 1.820510471127196e-06, "epoch": 8.26, "percentage": 82.6, "elapsed_time": "8:14:35", "remaining_time": "1:44:11"}
414
+ {"current_steps": 4140, "total_steps": 5000, "loss": 0.3701, "lr": 1.7805472509406695e-06, "epoch": 8.28, "percentage": 82.8, "elapsed_time": "8:15:47", "remaining_time": "1:42:59"}
415
+ {"current_steps": 4150, "total_steps": 5000, "loss": 0.341, "lr": 1.7409846352852144e-06, "epoch": 8.3, "percentage": 83.0, "elapsed_time": "8:16:59", "remaining_time": "1:41:47"}
416
+ {"current_steps": 4160, "total_steps": 5000, "loss": 0.2754, "lr": 1.7018245523869038e-06, "epoch": 8.32, "percentage": 83.2, "elapsed_time": "8:18:11", "remaining_time": "1:40:35"}
417
+ {"current_steps": 4170, "total_steps": 5000, "loss": 0.3958, "lr": 1.6630689108529286e-06, "epoch": 8.34, "percentage": 83.4, "elapsed_time": "8:19:23", "remaining_time": "1:39:23"}
418
+ {"current_steps": 4180, "total_steps": 5000, "loss": 0.3512, "lr": 1.6247195995785836e-06, "epoch": 8.36, "percentage": 83.6, "elapsed_time": "8:20:35", "remaining_time": "1:38:12"}
419
+ {"current_steps": 4190, "total_steps": 5000, "loss": 0.3533, "lr": 1.5867784876551973e-06, "epoch": 8.38, "percentage": 83.8, "elapsed_time": "8:21:46", "remaining_time": "1:37:00"}
420
+ {"current_steps": 4200, "total_steps": 5000, "loss": 0.3746, "lr": 1.5492474242790368e-06, "epoch": 8.4, "percentage": 84.0, "elapsed_time": "8:22:59", "remaining_time": "1:35:48"}
421
+ {"current_steps": 4210, "total_steps": 5000, "loss": 0.3274, "lr": 1.5121282386611823e-06, "epoch": 8.42, "percentage": 84.2, "elapsed_time": "8:24:12", "remaining_time": "1:34:36"}
422
+ {"current_steps": 4220, "total_steps": 5000, "loss": 0.3055, "lr": 1.4754227399383758e-06, "epoch": 8.44, "percentage": 84.4, "elapsed_time": "8:25:23", "remaining_time": "1:33:24"}
423
+ {"current_steps": 4230, "total_steps": 5000, "loss": 0.3078, "lr": 1.439132717084839e-06, "epoch": 8.46, "percentage": 84.6, "elapsed_time": "8:26:35", "remaining_time": "1:32:13"}
424
+ {"current_steps": 4240, "total_steps": 5000, "loss": 0.3194, "lr": 1.40325993882509e-06, "epoch": 8.48, "percentage": 84.8, "elapsed_time": "8:27:47", "remaining_time": "1:31:01"}
425
+ {"current_steps": 4250, "total_steps": 5000, "loss": 0.352, "lr": 1.3678061535477305e-06, "epoch": 8.5, "percentage": 85.0, "elapsed_time": "8:28:59", "remaining_time": "1:29:49"}
426
+ {"current_steps": 4260, "total_steps": 5000, "loss": 0.3061, "lr": 1.3327730892202384e-06, "epoch": 8.52, "percentage": 85.2, "elapsed_time": "8:30:11", "remaining_time": "1:28:37"}
427
+ {"current_steps": 4270, "total_steps": 5000, "loss": 0.406, "lr": 1.2981624533047432e-06, "epoch": 8.54, "percentage": 85.4, "elapsed_time": "8:31:24", "remaining_time": "1:27:25"}
428
+ {"current_steps": 4280, "total_steps": 5000, "loss": 0.3335, "lr": 1.2639759326748136e-06, "epoch": 8.56, "percentage": 85.6, "elapsed_time": "8:32:35", "remaining_time": "1:26:13"}
429
+ {"current_steps": 4290, "total_steps": 5000, "loss": 0.4048, "lr": 1.230215193533233e-06, "epoch": 8.58, "percentage": 85.8, "elapsed_time": "8:33:47", "remaining_time": "1:25:02"}
430
+ {"current_steps": 4300, "total_steps": 5000, "loss": 0.3388, "lr": 1.196881881330798e-06, "epoch": 8.6, "percentage": 86.0, "elapsed_time": "8:34:59", "remaining_time": "1:23:50"}
431
+ {"current_steps": 4310, "total_steps": 5000, "loss": 0.358, "lr": 1.1639776206861197e-06, "epoch": 8.62, "percentage": 86.2, "elapsed_time": "8:36:11", "remaining_time": "1:22:38"}
432
+ {"current_steps": 4320, "total_steps": 5000, "loss": 0.3628, "lr": 1.1315040153064416e-06, "epoch": 8.64, "percentage": 86.4, "elapsed_time": "8:37:23", "remaining_time": "1:21:26"}
433
+ {"current_steps": 4330, "total_steps": 5000, "loss": 0.3585, "lr": 1.0994626479094749e-06, "epoch": 8.66, "percentage": 86.6, "elapsed_time": "8:38:35", "remaining_time": "1:20:14"}
434
+ {"current_steps": 4340, "total_steps": 5000, "loss": 0.3583, "lr": 1.0678550801462662e-06, "epoch": 8.68, "percentage": 86.8, "elapsed_time": "8:39:47", "remaining_time": "1:19:02"}
435
+ {"current_steps": 4350, "total_steps": 5000, "loss": 0.2861, "lr": 1.0366828525250728e-06, "epoch": 8.7, "percentage": 87.0, "elapsed_time": "8:40:58", "remaining_time": "1:17:50"}
436
+ {"current_steps": 4360, "total_steps": 5000, "loss": 0.3422, "lr": 1.0059474843362893e-06, "epoch": 8.72, "percentage": 87.2, "elapsed_time": "8:42:10", "remaining_time": "1:16:38"}
437
+ {"current_steps": 4370, "total_steps": 5000, "loss": 0.3337, "lr": 9.756504735784067e-07, "epoch": 8.74, "percentage": 87.4, "elapsed_time": "8:43:21", "remaining_time": "1:15:26"}
438
+ {"current_steps": 4380, "total_steps": 5000, "loss": 0.3163, "lr": 9.457932968849826e-07, "epoch": 8.76, "percentage": 87.6, "elapsed_time": "8:44:33", "remaining_time": "1:14:15"}
439
+ {"current_steps": 4390, "total_steps": 5000, "loss": 0.3132, "lr": 9.16377409452689e-07, "epoch": 8.78, "percentage": 87.8, "elapsed_time": "8:45:45", "remaining_time": "1:13:03"}
440
+ {"current_steps": 4400, "total_steps": 5000, "loss": 0.3108, "lr": 8.874042449703779e-07, "epoch": 8.8, "percentage": 88.0, "elapsed_time": "8:46:56", "remaining_time": "1:11:51"}
441
+ {"current_steps": 4410, "total_steps": 5000, "loss": 0.3444, "lr": 8.58875215549212e-07, "epoch": 8.82, "percentage": 88.2, "elapsed_time": "8:48:10", "remaining_time": "1:10:39"}
442
+ {"current_steps": 4420, "total_steps": 5000, "loss": 0.3582, "lr": 8.307917116538378e-07, "epoch": 8.84, "percentage": 88.4, "elapsed_time": "8:49:22", "remaining_time": "1:09:27"}
443
+ {"current_steps": 4430, "total_steps": 5000, "loss": 0.3014, "lr": 8.031551020346129e-07, "epoch": 8.86, "percentage": 88.6, "elapsed_time": "8:50:33", "remaining_time": "1:08:16"}
444
+ {"current_steps": 4440, "total_steps": 5000, "loss": 0.3578, "lr": 7.759667336609011e-07, "epoch": 8.88, "percentage": 88.8, "elapsed_time": "8:51:45", "remaining_time": "1:07:04"}
445
+ {"current_steps": 4450, "total_steps": 5000, "loss": 0.3253, "lr": 7.492279316554207e-07, "epoch": 8.9, "percentage": 89.0, "elapsed_time": "8:52:57", "remaining_time": "1:05:52"}
446
+ {"current_steps": 4460, "total_steps": 5000, "loss": 0.3839, "lr": 7.22939999229657e-07, "epoch": 8.92, "percentage": 89.2, "elapsed_time": "8:54:08", "remaining_time": "1:04:40"}
447
+ {"current_steps": 4470, "total_steps": 5000, "loss": 0.268, "lr": 6.971042176203535e-07, "epoch": 8.94, "percentage": 89.4, "elapsed_time": "8:55:20", "remaining_time": "1:03:28"}
448
+ {"current_steps": 4480, "total_steps": 5000, "loss": 0.332, "lr": 6.717218460270536e-07, "epoch": 8.96, "percentage": 89.6, "elapsed_time": "8:56:31", "remaining_time": "1:02:16"}
449
+ {"current_steps": 4490, "total_steps": 5000, "loss": 0.361, "lr": 6.467941215507434e-07, "epoch": 8.98, "percentage": 89.8, "elapsed_time": "8:57:43", "remaining_time": "1:01:04"}
450
+ {"current_steps": 4500, "total_steps": 5000, "loss": 0.3358, "lr": 6.223222591335409e-07, "epoch": 9.0, "percentage": 90.0, "elapsed_time": "8:58:54", "remaining_time": "0:59:52"}
451
+ {"current_steps": 4510, "total_steps": 5000, "loss": 0.2874, "lr": 5.98307451499498e-07, "epoch": 9.02, "percentage": 90.2, "elapsed_time": "9:00:07", "remaining_time": "0:58:40"}
452
+ {"current_steps": 4520, "total_steps": 5000, "loss": 0.361, "lr": 5.747508690964599e-07, "epoch": 9.04, "percentage": 90.4, "elapsed_time": "9:01:18", "remaining_time": "0:57:29"}
453
+ {"current_steps": 4530, "total_steps": 5000, "loss": 0.2929, "lr": 5.516536600390188e-07, "epoch": 9.06, "percentage": 90.6, "elapsed_time": "9:02:30", "remaining_time": "0:56:17"}
454
+ {"current_steps": 4540, "total_steps": 5000, "loss": 0.2854, "lr": 5.290169500525577e-07, "epoch": 9.08, "percentage": 90.8, "elapsed_time": "9:03:41", "remaining_time": "0:55:05"}
455
+ {"current_steps": 4550, "total_steps": 5000, "loss": 0.3173, "lr": 5.068418424183874e-07, "epoch": 9.1, "percentage": 91.0, "elapsed_time": "9:04:52", "remaining_time": "0:53:53"}
456
+ {"current_steps": 4560, "total_steps": 5000, "loss": 0.3683, "lr": 4.851294179199673e-07, "epoch": 9.12, "percentage": 91.2, "elapsed_time": "9:06:03", "remaining_time": "0:52:41"}
457
+ {"current_steps": 4570, "total_steps": 5000, "loss": 0.3256, "lr": 4.638807347902408e-07, "epoch": 9.14, "percentage": 91.4, "elapsed_time": "9:07:15", "remaining_time": "0:51:29"}
458
+ {"current_steps": 4580, "total_steps": 5000, "loss": 0.319, "lr": 4.4309682866004124e-07, "epoch": 9.16, "percentage": 91.6, "elapsed_time": "9:08:26", "remaining_time": "0:50:17"}
459
+ {"current_steps": 4590, "total_steps": 5000, "loss": 0.3221, "lr": 4.2277871250763327e-07, "epoch": 9.18, "percentage": 91.8, "elapsed_time": "9:09:37", "remaining_time": "0:49:05"}
460
+ {"current_steps": 4600, "total_steps": 5000, "loss": 0.2951, "lr": 4.0292737660933335e-07, "epoch": 9.2, "percentage": 92.0, "elapsed_time": "9:10:49", "remaining_time": "0:47:53"}
461
+ {"current_steps": 4610, "total_steps": 5000, "loss": 0.3738, "lr": 3.835437884912474e-07, "epoch": 9.22, "percentage": 92.2, "elapsed_time": "9:12:03", "remaining_time": "0:46:42"}
462
+ {"current_steps": 4620, "total_steps": 5000, "loss": 0.2898, "lr": 3.646288928821151e-07, "epoch": 9.24, "percentage": 92.4, "elapsed_time": "9:13:15", "remaining_time": "0:45:30"}
463
+ {"current_steps": 4630, "total_steps": 5000, "loss": 0.3792, "lr": 3.4618361166726123e-07, "epoch": 9.26, "percentage": 92.6, "elapsed_time": "9:14:26", "remaining_time": "0:44:18"}
464
+ {"current_steps": 4640, "total_steps": 5000, "loss": 0.3424, "lr": 3.282088438436715e-07, "epoch": 9.28, "percentage": 92.8, "elapsed_time": "9:15:38", "remaining_time": "0:43:06"}
465
+ {"current_steps": 4650, "total_steps": 5000, "loss": 0.358, "lr": 3.10705465476171e-07, "epoch": 9.3, "percentage": 93.0, "elapsed_time": "9:16:51", "remaining_time": "0:41:54"}
466
+ {"current_steps": 4660, "total_steps": 5000, "loss": 0.32, "lr": 2.936743296547273e-07, "epoch": 9.32, "percentage": 93.2, "elapsed_time": "9:18:03", "remaining_time": "0:40:43"}
467
+ {"current_steps": 4670, "total_steps": 5000, "loss": 0.3079, "lr": 2.771162664528726e-07, "epoch": 9.34, "percentage": 93.4, "elapsed_time": "9:19:14", "remaining_time": "0:39:31"}
468
+ {"current_steps": 4680, "total_steps": 5000, "loss": 0.2834, "lr": 2.6103208288724815e-07, "epoch": 9.36, "percentage": 93.6, "elapsed_time": "9:20:26", "remaining_time": "0:38:19"}
469
+ {"current_steps": 4690, "total_steps": 5000, "loss": 0.354, "lr": 2.4542256287826915e-07, "epoch": 9.38, "percentage": 93.8, "elapsed_time": "9:21:39", "remaining_time": "0:37:07"}
470
+ {"current_steps": 4700, "total_steps": 5000, "loss": 0.3243, "lr": 2.3028846721191878e-07, "epoch": 9.4, "percentage": 94.0, "elapsed_time": "9:22:50", "remaining_time": "0:35:55"}
471
+ {"current_steps": 4710, "total_steps": 5000, "loss": 0.3121, "lr": 2.1563053350266983e-07, "epoch": 9.42, "percentage": 94.2, "elapsed_time": "9:24:04", "remaining_time": "0:34:43"}
472
+ {"current_steps": 4720, "total_steps": 5000, "loss": 0.3142, "lr": 2.014494761575314e-07, "epoch": 9.44, "percentage": 94.4, "elapsed_time": "9:25:15", "remaining_time": "0:33:31"}
473
+ {"current_steps": 4730, "total_steps": 5000, "loss": 0.3287, "lr": 1.877459863412323e-07, "epoch": 9.46, "percentage": 94.6, "elapsed_time": "9:26:27", "remaining_time": "0:32:20"}
474
+ {"current_steps": 4740, "total_steps": 5000, "loss": 0.2989, "lr": 1.7452073194253237e-07, "epoch": 9.48, "percentage": 94.8, "elapsed_time": "9:27:38", "remaining_time": "0:31:08"}
475
+ {"current_steps": 4750, "total_steps": 5000, "loss": 0.3632, "lr": 1.6177435754167413e-07, "epoch": 9.5, "percentage": 95.0, "elapsed_time": "9:28:49", "remaining_time": "0:29:56"}
476
+ {"current_steps": 4760, "total_steps": 5000, "loss": 0.265, "lr": 1.4950748437896235e-07, "epoch": 9.52, "percentage": 95.2, "elapsed_time": "9:30:01", "remaining_time": "0:28:44"}
477
+ {"current_steps": 4770, "total_steps": 5000, "loss": 0.283, "lr": 1.377207103244904e-07, "epoch": 9.54, "percentage": 95.4, "elapsed_time": "9:31:12", "remaining_time": "0:27:32"}
478
+ {"current_steps": 4780, "total_steps": 5000, "loss": 0.2264, "lr": 1.26414609848996e-07, "epoch": 9.56, "percentage": 95.6, "elapsed_time": "9:32:23", "remaining_time": "0:26:20"}
479
+ {"current_steps": 4790, "total_steps": 5000, "loss": 0.3198, "lr": 1.1558973399586671e-07, "epoch": 9.58, "percentage": 95.8, "elapsed_time": "9:33:35", "remaining_time": "0:25:08"}
480
+ {"current_steps": 4800, "total_steps": 5000, "loss": 0.2258, "lr": 1.052466103542793e-07, "epoch": 9.6, "percentage": 96.0, "elapsed_time": "9:34:45", "remaining_time": "0:23:56"}
481
+ {"current_steps": 4810, "total_steps": 5000, "loss": 0.3053, "lr": 9.538574303348813e-08, "epoch": 9.62, "percentage": 96.2, "elapsed_time": "9:35:59", "remaining_time": "0:22:45"}
482
+ {"current_steps": 4820, "total_steps": 5000, "loss": 0.278, "lr": 8.600761263825475e-08, "epoch": 9.64, "percentage": 96.4, "elapsed_time": "9:37:10", "remaining_time": "0:21:33"}
483
+ {"current_steps": 4830, "total_steps": 5000, "loss": 0.3211, "lr": 7.71126762454233e-08, "epoch": 9.66, "percentage": 96.6, "elapsed_time": "9:38:21", "remaining_time": "0:20:21"}
484
+ {"current_steps": 4840, "total_steps": 5000, "loss": 0.3079, "lr": 6.870136738164612e-08, "epoch": 9.68, "percentage": 96.8, "elapsed_time": "9:39:33", "remaining_time": "0:19:09"}
485
+ {"current_steps": 4850, "total_steps": 5000, "loss": 0.3717, "lr": 6.07740960022507e-08, "epoch": 9.7, "percentage": 97.0, "elapsed_time": "9:40:45", "remaining_time": "0:17:57"}
486
+ {"current_steps": 4860, "total_steps": 5000, "loss": 0.3052, "lr": 5.3331248471258926e-08, "epoch": 9.72, "percentage": 97.2, "elapsed_time": "9:41:57", "remaining_time": "0:16:45"}
487
+ {"current_steps": 4870, "total_steps": 5000, "loss": 0.3018, "lr": 4.6373187542561036e-08, "epoch": 9.74, "percentage": 97.4, "elapsed_time": "9:43:08", "remaining_time": "0:15:33"}
488
+ {"current_steps": 4880, "total_steps": 5000, "loss": 0.2694, "lr": 3.990025234222872e-08, "epoch": 9.76, "percentage": 97.6, "elapsed_time": "9:44:19", "remaining_time": "0:14:22"}
489
+ {"current_steps": 4890, "total_steps": 5000, "loss": 0.323, "lr": 3.391275835199159e-08, "epoch": 9.78, "percentage": 97.8, "elapsed_time": "9:45:31", "remaining_time": "0:13:10"}
490
+ {"current_steps": 4900, "total_steps": 5000, "loss": 0.302, "lr": 2.8410997393860663e-08, "epoch": 9.8, "percentage": 98.0, "elapsed_time": "9:46:42", "remaining_time": "0:11:58"}
491
+ {"current_steps": 4910, "total_steps": 5000, "loss": 0.3561, "lr": 2.339523761590301e-08, "epoch": 9.82, "percentage": 98.2, "elapsed_time": "9:47:56", "remaining_time": "0:10:46"}
492
+ {"current_steps": 4920, "total_steps": 5000, "loss": 0.3486, "lr": 1.886572347917337e-08, "epoch": 9.84, "percentage": 98.4, "elapsed_time": "9:49:07", "remaining_time": "0:09:34"}
493
+ {"current_steps": 4930, "total_steps": 5000, "loss": 0.3651, "lr": 1.482267574580143e-08, "epoch": 9.86, "percentage": 98.6, "elapsed_time": "9:50:19", "remaining_time": "0:08:22"}
494
+ {"current_steps": 4940, "total_steps": 5000, "loss": 0.2544, "lr": 1.126629146822933e-08, "epoch": 9.88, "percentage": 98.8, "elapsed_time": "9:51:30", "remaining_time": "0:07:11"}
495
+ {"current_steps": 4950, "total_steps": 5000, "loss": 0.2575, "lr": 8.196743979610455e-09, "epoch": 9.9, "percentage": 99.0, "elapsed_time": "9:52:42", "remaining_time": "0:05:59"}
496
+ {"current_steps": 4960, "total_steps": 5000, "loss": 0.2833, "lr": 5.614182885357311e-09, "epoch": 9.92, "percentage": 99.2, "elapsed_time": "9:53:53", "remaining_time": "0:04:47"}
497
+ {"current_steps": 4970, "total_steps": 5000, "loss": 0.3345, "lr": 3.518734055855122e-09, "epoch": 9.94, "percentage": 99.4, "elapsed_time": "9:55:05", "remaining_time": "0:03:35"}
498
+ {"current_steps": 4980, "total_steps": 5000, "loss": 0.3034, "lr": 1.910499620322304e-09, "epoch": 9.96, "percentage": 99.6, "elapsed_time": "9:56:16", "remaining_time": "0:02:23"}
499
+ {"current_steps": 4990, "total_steps": 5000, "loss": 0.2543, "lr": 7.895579618388827e-10, "epoch": 9.98, "percentage": 99.8, "elapsed_time": "9:57:27", "remaining_time": "0:01:11"}
500
+ {"current_steps": 5000, "total_steps": 5000, "loss": 0.3149, "lr": 1.559637135173375e-10, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "9:58:39", "remaining_time": "0:00:00"}
501
+ {"current_steps": 5000, "total_steps": 5000, "epoch": 10.0, "percentage": 100.0, "elapsed_time": "9:58:40", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,3542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 5000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": NaN,
14
+ "learning_rate": 2.4000000000000003e-07,
15
+ "loss": 1.3992,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.04,
20
+ "grad_norm": 1.1072826385498047,
21
+ "learning_rate": 6.000000000000001e-07,
22
+ "loss": 1.4497,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.06,
27
+ "grad_norm": 3.9411494731903076,
28
+ "learning_rate": 1.0000000000000002e-06,
29
+ "loss": 1.2599,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.08,
34
+ "grad_norm": 3.928187847137451,
35
+ "learning_rate": 1.4000000000000001e-06,
36
+ "loss": 1.4124,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.1,
41
+ "grad_norm": 0.6260775923728943,
42
+ "learning_rate": 1.8000000000000001e-06,
43
+ "loss": 1.3542,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.12,
48
+ "grad_norm": 1.5522129535675049,
49
+ "learning_rate": 2.2e-06,
50
+ "loss": 1.2123,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.14,
55
+ "grad_norm": 1.6981067657470703,
56
+ "learning_rate": 2.6e-06,
57
+ "loss": 1.2098,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.16,
62
+ "grad_norm": 5.6391496658325195,
63
+ "learning_rate": 3e-06,
64
+ "loss": 1.2226,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.18,
69
+ "grad_norm": 0.5151563286781311,
70
+ "learning_rate": 3.3600000000000004e-06,
71
+ "loss": 1.0624,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.2,
76
+ "grad_norm": 1.4428874254226685,
77
+ "learning_rate": 3.7600000000000004e-06,
78
+ "loss": 1.0969,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.22,
83
+ "grad_norm": 2.130734920501709,
84
+ "learning_rate": 4.16e-06,
85
+ "loss": 1.0879,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.24,
90
+ "grad_norm": 2.011439561843872,
91
+ "learning_rate": 4.56e-06,
92
+ "loss": 1.0195,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.26,
97
+ "grad_norm": 2.217247486114502,
98
+ "learning_rate": 4.960000000000001e-06,
99
+ "loss": 0.9765,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.28,
104
+ "grad_norm": 0.946567952632904,
105
+ "learning_rate": 5.36e-06,
106
+ "loss": 1.0205,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.3,
111
+ "grad_norm": 6.922210693359375,
112
+ "learning_rate": 5.76e-06,
113
+ "loss": 0.9517,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.32,
118
+ "grad_norm": 3.384115219116211,
119
+ "learning_rate": 6.16e-06,
120
+ "loss": 0.9324,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.34,
125
+ "grad_norm": 1.7061117887496948,
126
+ "learning_rate": 6.560000000000001e-06,
127
+ "loss": 0.804,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.36,
132
+ "grad_norm": 1.5616205930709839,
133
+ "learning_rate": 6.96e-06,
134
+ "loss": 0.7821,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.38,
139
+ "grad_norm": 1.8393518924713135,
140
+ "learning_rate": 7.360000000000001e-06,
141
+ "loss": 0.8086,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.4,
146
+ "grad_norm": 1.4879248142242432,
147
+ "learning_rate": 7.76e-06,
148
+ "loss": 0.7655,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.42,
153
+ "grad_norm": 0.6370295882225037,
154
+ "learning_rate": 8.16e-06,
155
+ "loss": 0.7508,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.44,
160
+ "grad_norm": 0.5269752144813538,
161
+ "learning_rate": 8.560000000000001e-06,
162
+ "loss": 0.7429,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.46,
167
+ "grad_norm": 1.2695356607437134,
168
+ "learning_rate": 8.96e-06,
169
+ "loss": 0.7502,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.48,
174
+ "grad_norm": 1.5492205619812012,
175
+ "learning_rate": 9.360000000000002e-06,
176
+ "loss": 0.7029,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.5,
181
+ "grad_norm": 1.7380893230438232,
182
+ "learning_rate": 9.760000000000001e-06,
183
+ "loss": 0.7324,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.52,
188
+ "grad_norm": 1.8911452293395996,
189
+ "learning_rate": 1.0160000000000001e-05,
190
+ "loss": 0.7521,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.54,
195
+ "grad_norm": 2.0408151149749756,
196
+ "learning_rate": 1.056e-05,
197
+ "loss": 0.698,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.56,
202
+ "grad_norm": 1.9015631675720215,
203
+ "learning_rate": 1.0960000000000002e-05,
204
+ "loss": 0.6859,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.58,
209
+ "grad_norm": 1.5284056663513184,
210
+ "learning_rate": 1.136e-05,
211
+ "loss": 0.6716,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.6,
216
+ "grad_norm": 1.3547126054763794,
217
+ "learning_rate": 1.1760000000000001e-05,
218
+ "loss": 0.6978,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.62,
223
+ "grad_norm": 1.4986441135406494,
224
+ "learning_rate": 1.216e-05,
225
+ "loss": 0.6584,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.64,
230
+ "grad_norm": 1.1250969171524048,
231
+ "learning_rate": 1.2560000000000002e-05,
232
+ "loss": 0.7188,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.66,
237
+ "grad_norm": 1.1186408996582031,
238
+ "learning_rate": 1.2960000000000001e-05,
239
+ "loss": 0.6687,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.68,
244
+ "grad_norm": 1.1250578165054321,
245
+ "learning_rate": 1.3360000000000003e-05,
246
+ "loss": 0.623,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.7,
251
+ "grad_norm": 0.8931149840354919,
252
+ "learning_rate": 1.376e-05,
253
+ "loss": 0.6795,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.72,
258
+ "grad_norm": 1.5052251815795898,
259
+ "learning_rate": 1.416e-05,
260
+ "loss": 0.6455,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.74,
265
+ "grad_norm": 1.8043763637542725,
266
+ "learning_rate": 1.4560000000000001e-05,
267
+ "loss": 0.6548,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.76,
272
+ "grad_norm": 1.7357759475708008,
273
+ "learning_rate": 1.496e-05,
274
+ "loss": 0.6508,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.78,
279
+ "grad_norm": 1.4080872535705566,
280
+ "learning_rate": 1.5360000000000002e-05,
281
+ "loss": 0.7105,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.8,
286
+ "grad_norm": 0.7395206689834595,
287
+ "learning_rate": 1.576e-05,
288
+ "loss": 0.6738,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.82,
293
+ "grad_norm": 1.4803540706634521,
294
+ "learning_rate": 1.616e-05,
295
+ "loss": 0.6741,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.84,
300
+ "grad_norm": 0.9231945276260376,
301
+ "learning_rate": 1.656e-05,
302
+ "loss": 0.6385,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.86,
307
+ "grad_norm": 1.0000849962234497,
308
+ "learning_rate": 1.696e-05,
309
+ "loss": 0.6304,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.88,
314
+ "grad_norm": 1.8198318481445312,
315
+ "learning_rate": 1.736e-05,
316
+ "loss": 0.652,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.9,
321
+ "grad_norm": 0.8213591575622559,
322
+ "learning_rate": 1.7760000000000003e-05,
323
+ "loss": 0.6517,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.92,
328
+ "grad_norm": 2.0881271362304688,
329
+ "learning_rate": 1.8160000000000002e-05,
330
+ "loss": 0.7044,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.94,
335
+ "grad_norm": 2.583738088607788,
336
+ "learning_rate": 1.8560000000000002e-05,
337
+ "loss": 0.6801,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.96,
342
+ "grad_norm": 2.3651039600372314,
343
+ "learning_rate": 1.896e-05,
344
+ "loss": 0.6531,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.98,
349
+ "grad_norm": 1.174816608428955,
350
+ "learning_rate": 1.936e-05,
351
+ "loss": 0.6837,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 1.0,
356
+ "grad_norm": 1.93290376663208,
357
+ "learning_rate": 1.976e-05,
358
+ "loss": 0.6659,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 1.02,
363
+ "grad_norm": 1.616199254989624,
364
+ "learning_rate": 1.9999961008995607e-05,
365
+ "loss": 0.6212,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 1.04,
370
+ "grad_norm": 1.4221971035003662,
371
+ "learning_rate": 1.99995223636881e-05,
372
+ "loss": 0.6705,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 1.06,
377
+ "grad_norm": 1.4205855131149292,
378
+ "learning_rate": 1.9998596355767805e-05,
379
+ "loss": 0.6346,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 1.08,
384
+ "grad_norm": 1.5640236139297485,
385
+ "learning_rate": 1.999718303036705e-05,
386
+ "loss": 0.6698,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 1.1,
391
+ "grad_norm": 1.3953174352645874,
392
+ "learning_rate": 1.9995282456369313e-05,
393
+ "loss": 0.5925,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 1.12,
398
+ "grad_norm": 1.3201889991760254,
399
+ "learning_rate": 1.9992894726405894e-05,
400
+ "loss": 0.6795,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 1.1400000000000001,
405
+ "grad_norm": 1.8795799016952515,
406
+ "learning_rate": 1.9990019956851384e-05,
407
+ "loss": 0.6096,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 1.16,
412
+ "grad_norm": 1.9675489664077759,
413
+ "learning_rate": 1.998665828781799e-05,
414
+ "loss": 0.5971,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 1.18,
419
+ "grad_norm": 1.0514065027236938,
420
+ "learning_rate": 1.998280988314872e-05,
421
+ "loss": 0.6055,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 1.2,
426
+ "grad_norm": 0.7312430143356323,
427
+ "learning_rate": 1.9978474930409396e-05,
428
+ "loss": 0.6304,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 1.22,
433
+ "grad_norm": 11.12009048461914,
434
+ "learning_rate": 1.9973653640879486e-05,
435
+ "loss": 0.6812,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 1.24,
440
+ "grad_norm": 2.484487771987915,
441
+ "learning_rate": 1.9968346249541848e-05,
442
+ "loss": 0.5842,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 1.26,
447
+ "grad_norm": 1.4185147285461426,
448
+ "learning_rate": 1.996255301507125e-05,
449
+ "loss": 0.6478,
450
+ "step": 630
451
+ },
452
+ {
453
+ "epoch": 1.28,
454
+ "grad_norm": 0.939939022064209,
455
+ "learning_rate": 1.995627421982176e-05,
456
+ "loss": 0.6003,
457
+ "step": 640
458
+ },
459
+ {
460
+ "epoch": 1.3,
461
+ "grad_norm": 2.303175687789917,
462
+ "learning_rate": 1.9949510169813006e-05,
463
+ "loss": 0.6087,
464
+ "step": 650
465
+ },
466
+ {
467
+ "epoch": 1.32,
468
+ "grad_norm": 1.1472731828689575,
469
+ "learning_rate": 1.9942261194715236e-05,
470
+ "loss": 0.5905,
471
+ "step": 660
472
+ },
473
+ {
474
+ "epoch": 1.34,
475
+ "grad_norm": 0.8142107129096985,
476
+ "learning_rate": 1.9934527647833276e-05,
477
+ "loss": 0.593,
478
+ "step": 670
479
+ },
480
+ {
481
+ "epoch": 1.3599999999999999,
482
+ "grad_norm": 1.4084548950195312,
483
+ "learning_rate": 1.992630990608929e-05,
484
+ "loss": 0.6253,
485
+ "step": 680
486
+ },
487
+ {
488
+ "epoch": 1.38,
489
+ "grad_norm": 1.6054160594940186,
490
+ "learning_rate": 1.9917608370004417e-05,
491
+ "loss": 0.6338,
492
+ "step": 690
493
+ },
494
+ {
495
+ "epoch": 1.4,
496
+ "grad_norm": 2.442007541656494,
497
+ "learning_rate": 1.9908423463679246e-05,
498
+ "loss": 0.6148,
499
+ "step": 700
500
+ },
501
+ {
502
+ "epoch": 1.42,
503
+ "grad_norm": 0.7885130047798157,
504
+ "learning_rate": 1.989875563477316e-05,
505
+ "loss": 0.6271,
506
+ "step": 710
507
+ },
508
+ {
509
+ "epoch": 1.44,
510
+ "grad_norm": 2.8302054405212402,
511
+ "learning_rate": 1.9888605354482494e-05,
512
+ "loss": 0.638,
513
+ "step": 720
514
+ },
515
+ {
516
+ "epoch": 1.46,
517
+ "grad_norm": 1.134475827217102,
518
+ "learning_rate": 1.987797311751759e-05,
519
+ "loss": 0.6304,
520
+ "step": 730
521
+ },
522
+ {
523
+ "epoch": 1.48,
524
+ "grad_norm": 0.7951676249504089,
525
+ "learning_rate": 1.986685944207868e-05,
526
+ "loss": 0.5877,
527
+ "step": 740
528
+ },
529
+ {
530
+ "epoch": 1.5,
531
+ "grad_norm": 1.5261722803115845,
532
+ "learning_rate": 1.985526486983063e-05,
533
+ "loss": 0.5747,
534
+ "step": 750
535
+ },
536
+ {
537
+ "epoch": 1.52,
538
+ "grad_norm": 4.633131980895996,
539
+ "learning_rate": 1.9843189965876525e-05,
540
+ "loss": 0.6514,
541
+ "step": 760
542
+ },
543
+ {
544
+ "epoch": 1.54,
545
+ "grad_norm": 0.7688568830490112,
546
+ "learning_rate": 1.9830635318730155e-05,
547
+ "loss": 0.5879,
548
+ "step": 770
549
+ },
550
+ {
551
+ "epoch": 1.56,
552
+ "grad_norm": 0.859425961971283,
553
+ "learning_rate": 1.981760154028731e-05,
554
+ "loss": 0.6152,
555
+ "step": 780
556
+ },
557
+ {
558
+ "epoch": 1.58,
559
+ "grad_norm": 1.939563274383545,
560
+ "learning_rate": 1.980408926579596e-05,
561
+ "loss": 0.6342,
562
+ "step": 790
563
+ },
564
+ {
565
+ "epoch": 1.6,
566
+ "grad_norm": 0.7612221837043762,
567
+ "learning_rate": 1.97900991538253e-05,
568
+ "loss": 0.6167,
569
+ "step": 800
570
+ },
571
+ {
572
+ "epoch": 1.62,
573
+ "grad_norm": 2.2331180572509766,
574
+ "learning_rate": 1.9775631886233655e-05,
575
+ "loss": 0.5688,
576
+ "step": 810
577
+ },
578
+ {
579
+ "epoch": 1.6400000000000001,
580
+ "grad_norm": 3.1707897186279297,
581
+ "learning_rate": 1.9760688168135233e-05,
582
+ "loss": 0.6023,
583
+ "step": 820
584
+ },
585
+ {
586
+ "epoch": 1.6600000000000001,
587
+ "grad_norm": 1.5035152435302734,
588
+ "learning_rate": 1.9745268727865774e-05,
589
+ "loss": 0.5957,
590
+ "step": 830
591
+ },
592
+ {
593
+ "epoch": 1.6800000000000002,
594
+ "grad_norm": 1.8150962591171265,
595
+ "learning_rate": 1.972937431694704e-05,
596
+ "loss": 0.5409,
597
+ "step": 840
598
+ },
599
+ {
600
+ "epoch": 1.7,
601
+ "grad_norm": 1.5338727235794067,
602
+ "learning_rate": 1.9713005710050203e-05,
603
+ "loss": 0.6286,
604
+ "step": 850
605
+ },
606
+ {
607
+ "epoch": 1.72,
608
+ "grad_norm": 1.0299500226974487,
609
+ "learning_rate": 1.969616370495806e-05,
610
+ "loss": 0.5936,
611
+ "step": 860
612
+ },
613
+ {
614
+ "epoch": 1.74,
615
+ "grad_norm": 3.0667810440063477,
616
+ "learning_rate": 1.967884912252619e-05,
617
+ "loss": 0.6535,
618
+ "step": 870
619
+ },
620
+ {
621
+ "epoch": 1.76,
622
+ "grad_norm": 0.9165984988212585,
623
+ "learning_rate": 1.9661062806642903e-05,
624
+ "loss": 0.5864,
625
+ "step": 880
626
+ },
627
+ {
628
+ "epoch": 1.78,
629
+ "grad_norm": 1.398553729057312,
630
+ "learning_rate": 1.964280562418815e-05,
631
+ "loss": 0.6181,
632
+ "step": 890
633
+ },
634
+ {
635
+ "epoch": 1.8,
636
+ "grad_norm": 1.1232646703720093,
637
+ "learning_rate": 1.962407846499124e-05,
638
+ "loss": 0.5736,
639
+ "step": 900
640
+ },
641
+ {
642
+ "epoch": 1.8199999999999998,
643
+ "grad_norm": 1.905674695968628,
644
+ "learning_rate": 1.96048822417875e-05,
645
+ "loss": 0.5769,
646
+ "step": 910
647
+ },
648
+ {
649
+ "epoch": 1.8399999999999999,
650
+ "grad_norm": 0.9369404911994934,
651
+ "learning_rate": 1.958521789017376e-05,
652
+ "loss": 0.6056,
653
+ "step": 920
654
+ },
655
+ {
656
+ "epoch": 1.8599999999999999,
657
+ "grad_norm": 1.0187280178070068,
658
+ "learning_rate": 1.956508636856278e-05,
659
+ "loss": 0.6632,
660
+ "step": 930
661
+ },
662
+ {
663
+ "epoch": 1.88,
664
+ "grad_norm": 1.4954912662506104,
665
+ "learning_rate": 1.9546569379242446e-05,
666
+ "loss": 0.5803,
667
+ "step": 940
668
+ },
669
+ {
670
+ "epoch": 1.9,
671
+ "grad_norm": 1.1077364683151245,
672
+ "learning_rate": 1.9525552956573244e-05,
673
+ "loss": 0.6028,
674
+ "step": 950
675
+ },
676
+ {
677
+ "epoch": 1.92,
678
+ "grad_norm": 1.777431607246399,
679
+ "learning_rate": 1.9504072271891486e-05,
680
+ "loss": 0.5932,
681
+ "step": 960
682
+ },
683
+ {
684
+ "epoch": 1.94,
685
+ "grad_norm": 1.1748569011688232,
686
+ "learning_rate": 1.9482128372135446e-05,
687
+ "loss": 0.581,
688
+ "step": 970
689
+ },
690
+ {
691
+ "epoch": 1.96,
692
+ "grad_norm": 1.5400365591049194,
693
+ "learning_rate": 1.945972232681984e-05,
694
+ "loss": 0.6207,
695
+ "step": 980
696
+ },
697
+ {
698
+ "epoch": 1.98,
699
+ "grad_norm": 1.3064255714416504,
700
+ "learning_rate": 1.9436855227983695e-05,
701
+ "loss": 0.5576,
702
+ "step": 990
703
+ },
704
+ {
705
+ "epoch": 2.0,
706
+ "grad_norm": 1.6377707719802856,
707
+ "learning_rate": 1.9413528190137158e-05,
708
+ "loss": 0.6121,
709
+ "step": 1000
710
+ },
711
+ {
712
+ "epoch": 2.02,
713
+ "grad_norm": 0.8060258030891418,
714
+ "learning_rate": 1.938974235020714e-05,
715
+ "loss": 0.5909,
716
+ "step": 1010
717
+ },
718
+ {
719
+ "epoch": 2.04,
720
+ "grad_norm": 2.174511671066284,
721
+ "learning_rate": 1.9365498867481926e-05,
722
+ "loss": 0.5459,
723
+ "step": 1020
724
+ },
725
+ {
726
+ "epoch": 2.06,
727
+ "grad_norm": 2.0883657932281494,
728
+ "learning_rate": 1.9340798923554657e-05,
729
+ "loss": 0.5781,
730
+ "step": 1030
731
+ },
732
+ {
733
+ "epoch": 2.08,
734
+ "grad_norm": 1.4199424982070923,
735
+ "learning_rate": 1.931564372226576e-05,
736
+ "loss": 0.5284,
737
+ "step": 1040
738
+ },
739
+ {
740
+ "epoch": 2.1,
741
+ "grad_norm": 1.2759448289871216,
742
+ "learning_rate": 1.9290034489644247e-05,
743
+ "loss": 0.5476,
744
+ "step": 1050
745
+ },
746
+ {
747
+ "epoch": 2.12,
748
+ "grad_norm": 0.9871682524681091,
749
+ "learning_rate": 1.9263972473847995e-05,
750
+ "loss": 0.5386,
751
+ "step": 1060
752
+ },
753
+ {
754
+ "epoch": 2.14,
755
+ "grad_norm": 1.325584053993225,
756
+ "learning_rate": 1.923745894510288e-05,
757
+ "loss": 0.5864,
758
+ "step": 1070
759
+ },
760
+ {
761
+ "epoch": 2.16,
762
+ "grad_norm": 1.4707293510437012,
763
+ "learning_rate": 1.9210495195640895e-05,
764
+ "loss": 0.5413,
765
+ "step": 1080
766
+ },
767
+ {
768
+ "epoch": 2.18,
769
+ "grad_norm": 1.222374439239502,
770
+ "learning_rate": 1.918308253963715e-05,
771
+ "loss": 0.5201,
772
+ "step": 1090
773
+ },
774
+ {
775
+ "epoch": 2.2,
776
+ "grad_norm": 2.6065587997436523,
777
+ "learning_rate": 1.9155222313145817e-05,
778
+ "loss": 0.5658,
779
+ "step": 1100
780
+ },
781
+ {
782
+ "epoch": 2.22,
783
+ "grad_norm": 1.2004817724227905,
784
+ "learning_rate": 1.912691587403503e-05,
785
+ "loss": 0.5578,
786
+ "step": 1110
787
+ },
788
+ {
789
+ "epoch": 2.24,
790
+ "grad_norm": 2.0765957832336426,
791
+ "learning_rate": 1.9098164601920702e-05,
792
+ "loss": 0.4792,
793
+ "step": 1120
794
+ },
795
+ {
796
+ "epoch": 2.26,
797
+ "grad_norm": 1.2682489156723022,
798
+ "learning_rate": 1.906896989809927e-05,
799
+ "loss": 0.6048,
800
+ "step": 1130
801
+ },
802
+ {
803
+ "epoch": 2.2800000000000002,
804
+ "grad_norm": 3.5352697372436523,
805
+ "learning_rate": 1.903933318547942e-05,
806
+ "loss": 0.567,
807
+ "step": 1140
808
+ },
809
+ {
810
+ "epoch": 2.3,
811
+ "grad_norm": 1.590265154838562,
812
+ "learning_rate": 1.9009255908512704e-05,
813
+ "loss": 0.5965,
814
+ "step": 1150
815
+ },
816
+ {
817
+ "epoch": 2.32,
818
+ "grad_norm": 2.2210605144500732,
819
+ "learning_rate": 1.897873953312317e-05,
820
+ "loss": 0.5561,
821
+ "step": 1160
822
+ },
823
+ {
824
+ "epoch": 2.34,
825
+ "grad_norm": 1.0093090534210205,
826
+ "learning_rate": 1.8947785546635905e-05,
827
+ "loss": 0.5529,
828
+ "step": 1170
829
+ },
830
+ {
831
+ "epoch": 2.36,
832
+ "grad_norm": 1.523364543914795,
833
+ "learning_rate": 1.8916395457704536e-05,
834
+ "loss": 0.5818,
835
+ "step": 1180
836
+ },
837
+ {
838
+ "epoch": 2.38,
839
+ "grad_norm": 0.9908381700515747,
840
+ "learning_rate": 1.888457079623772e-05,
841
+ "loss": 0.5558,
842
+ "step": 1190
843
+ },
844
+ {
845
+ "epoch": 2.4,
846
+ "grad_norm": 1.499123215675354,
847
+ "learning_rate": 1.8852313113324553e-05,
848
+ "loss": 0.5833,
849
+ "step": 1200
850
+ },
851
+ {
852
+ "epoch": 2.42,
853
+ "grad_norm": 1.700926661491394,
854
+ "learning_rate": 1.8819623981158996e-05,
855
+ "loss": 0.527,
856
+ "step": 1210
857
+ },
858
+ {
859
+ "epoch": 2.44,
860
+ "grad_norm": 0.7639631032943726,
861
+ "learning_rate": 1.878650499296323e-05,
862
+ "loss": 0.5605,
863
+ "step": 1220
864
+ },
865
+ {
866
+ "epoch": 2.46,
867
+ "grad_norm": 0.9155722856521606,
868
+ "learning_rate": 1.8752957762910016e-05,
869
+ "loss": 0.5528,
870
+ "step": 1230
871
+ },
872
+ {
873
+ "epoch": 2.48,
874
+ "grad_norm": 1.4307914972305298,
875
+ "learning_rate": 1.871898392604402e-05,
876
+ "loss": 0.5239,
877
+ "step": 1240
878
+ },
879
+ {
880
+ "epoch": 2.5,
881
+ "grad_norm": 0.9646509885787964,
882
+ "learning_rate": 1.8684585138202122e-05,
883
+ "loss": 0.5825,
884
+ "step": 1250
885
+ },
886
+ {
887
+ "epoch": 2.52,
888
+ "grad_norm": 0.823187530040741,
889
+ "learning_rate": 1.864976307593271e-05,
890
+ "loss": 0.5816,
891
+ "step": 1260
892
+ },
893
+ {
894
+ "epoch": 2.54,
895
+ "grad_norm": 0.9524659514427185,
896
+ "learning_rate": 1.8614519436413968e-05,
897
+ "loss": 0.595,
898
+ "step": 1270
899
+ },
900
+ {
901
+ "epoch": 2.56,
902
+ "grad_norm": 3.476361036300659,
903
+ "learning_rate": 1.8578855937371176e-05,
904
+ "loss": 0.5828,
905
+ "step": 1280
906
+ },
907
+ {
908
+ "epoch": 2.58,
909
+ "grad_norm": 0.8790251016616821,
910
+ "learning_rate": 1.8542774316992953e-05,
911
+ "loss": 0.5408,
912
+ "step": 1290
913
+ },
914
+ {
915
+ "epoch": 2.6,
916
+ "grad_norm": 1.8205924034118652,
917
+ "learning_rate": 1.850627633384658e-05,
918
+ "loss": 0.5683,
919
+ "step": 1300
920
+ },
921
+ {
922
+ "epoch": 2.62,
923
+ "grad_norm": 0.7987878918647766,
924
+ "learning_rate": 1.8469363766792258e-05,
925
+ "loss": 0.5734,
926
+ "step": 1310
927
+ },
928
+ {
929
+ "epoch": 2.64,
930
+ "grad_norm": 1.1774803400039673,
931
+ "learning_rate": 1.8432038414896432e-05,
932
+ "loss": 0.5581,
933
+ "step": 1320
934
+ },
935
+ {
936
+ "epoch": 2.66,
937
+ "grad_norm": 1.4109526872634888,
938
+ "learning_rate": 1.8394302097344103e-05,
939
+ "loss": 0.5781,
940
+ "step": 1330
941
+ },
942
+ {
943
+ "epoch": 2.68,
944
+ "grad_norm": 2.104931116104126,
945
+ "learning_rate": 1.8356156653350138e-05,
946
+ "loss": 0.5468,
947
+ "step": 1340
948
+ },
949
+ {
950
+ "epoch": 2.7,
951
+ "grad_norm": 1.7022193670272827,
952
+ "learning_rate": 1.8317603942069665e-05,
953
+ "loss": 0.543,
954
+ "step": 1350
955
+ },
956
+ {
957
+ "epoch": 2.7199999999999998,
958
+ "grad_norm": 1.9831078052520752,
959
+ "learning_rate": 1.8278645842507448e-05,
960
+ "loss": 0.5416,
961
+ "step": 1360
962
+ },
963
+ {
964
+ "epoch": 2.74,
965
+ "grad_norm": 1.4346486330032349,
966
+ "learning_rate": 1.8239284253426294e-05,
967
+ "loss": 0.5692,
968
+ "step": 1370
969
+ },
970
+ {
971
+ "epoch": 2.76,
972
+ "grad_norm": 3.786205768585205,
973
+ "learning_rate": 1.8199521093254524e-05,
974
+ "loss": 0.5372,
975
+ "step": 1380
976
+ },
977
+ {
978
+ "epoch": 2.7800000000000002,
979
+ "grad_norm": 1.2855556011199951,
980
+ "learning_rate": 1.815935829999247e-05,
981
+ "loss": 0.5205,
982
+ "step": 1390
983
+ },
984
+ {
985
+ "epoch": 2.8,
986
+ "grad_norm": 1.4858529567718506,
987
+ "learning_rate": 1.811879783111801e-05,
988
+ "loss": 0.5159,
989
+ "step": 1400
990
+ },
991
+ {
992
+ "epoch": 2.82,
993
+ "grad_norm": 1.3300436735153198,
994
+ "learning_rate": 1.8077841663491174e-05,
995
+ "loss": 0.5405,
996
+ "step": 1410
997
+ },
998
+ {
999
+ "epoch": 2.84,
1000
+ "grad_norm": 1.2626831531524658,
1001
+ "learning_rate": 1.80364917932578e-05,
1002
+ "loss": 0.5769,
1003
+ "step": 1420
1004
+ },
1005
+ {
1006
+ "epoch": 2.86,
1007
+ "grad_norm": 1.4615288972854614,
1008
+ "learning_rate": 1.799475023575222e-05,
1009
+ "loss": 0.5724,
1010
+ "step": 1430
1011
+ },
1012
+ {
1013
+ "epoch": 2.88,
1014
+ "grad_norm": 1.2775332927703857,
1015
+ "learning_rate": 1.795261902539906e-05,
1016
+ "loss": 0.5603,
1017
+ "step": 1440
1018
+ },
1019
+ {
1020
+ "epoch": 2.9,
1021
+ "grad_norm": 1.9211925268173218,
1022
+ "learning_rate": 1.791010021561407e-05,
1023
+ "loss": 0.5609,
1024
+ "step": 1450
1025
+ },
1026
+ {
1027
+ "epoch": 2.92,
1028
+ "grad_norm": 3.850076675415039,
1029
+ "learning_rate": 1.7867195878704062e-05,
1030
+ "loss": 0.585,
1031
+ "step": 1460
1032
+ },
1033
+ {
1034
+ "epoch": 2.94,
1035
+ "grad_norm": 1.0455843210220337,
1036
+ "learning_rate": 1.7823908105765883e-05,
1037
+ "loss": 0.5818,
1038
+ "step": 1470
1039
+ },
1040
+ {
1041
+ "epoch": 2.96,
1042
+ "grad_norm": 2.406511068344116,
1043
+ "learning_rate": 1.7780239006584515e-05,
1044
+ "loss": 0.5453,
1045
+ "step": 1480
1046
+ },
1047
+ {
1048
+ "epoch": 2.98,
1049
+ "grad_norm": 1.212849736213684,
1050
+ "learning_rate": 1.773619070953025e-05,
1051
+ "loss": 0.5526,
1052
+ "step": 1490
1053
+ },
1054
+ {
1055
+ "epoch": 3.0,
1056
+ "grad_norm": 1.608502984046936,
1057
+ "learning_rate": 1.769176536145494e-05,
1058
+ "loss": 0.5664,
1059
+ "step": 1500
1060
+ },
1061
+ {
1062
+ "epoch": 3.02,
1063
+ "grad_norm": 2.083935499191284,
1064
+ "learning_rate": 1.7646965127587373e-05,
1065
+ "loss": 0.4993,
1066
+ "step": 1510
1067
+ },
1068
+ {
1069
+ "epoch": 3.04,
1070
+ "grad_norm": 3.191681146621704,
1071
+ "learning_rate": 1.760179219142774e-05,
1072
+ "loss": 0.5302,
1073
+ "step": 1520
1074
+ },
1075
+ {
1076
+ "epoch": 3.06,
1077
+ "grad_norm": 2.241614580154419,
1078
+ "learning_rate": 1.7556248754641237e-05,
1079
+ "loss": 0.4995,
1080
+ "step": 1530
1081
+ },
1082
+ {
1083
+ "epoch": 3.08,
1084
+ "grad_norm": 2.2549211978912354,
1085
+ "learning_rate": 1.7510337036950703e-05,
1086
+ "loss": 0.4902,
1087
+ "step": 1540
1088
+ },
1089
+ {
1090
+ "epoch": 3.1,
1091
+ "grad_norm": 2.623849630355835,
1092
+ "learning_rate": 1.7464059276028497e-05,
1093
+ "loss": 0.5571,
1094
+ "step": 1550
1095
+ },
1096
+ {
1097
+ "epoch": 3.12,
1098
+ "grad_norm": 1.099225401878357,
1099
+ "learning_rate": 1.7417417727387392e-05,
1100
+ "loss": 0.5441,
1101
+ "step": 1560
1102
+ },
1103
+ {
1104
+ "epoch": 3.14,
1105
+ "grad_norm": 1.0318654775619507,
1106
+ "learning_rate": 1.7370414664270675e-05,
1107
+ "loss": 0.5498,
1108
+ "step": 1570
1109
+ },
1110
+ {
1111
+ "epoch": 3.16,
1112
+ "grad_norm": 1.4295272827148438,
1113
+ "learning_rate": 1.732305237754132e-05,
1114
+ "loss": 0.4799,
1115
+ "step": 1580
1116
+ },
1117
+ {
1118
+ "epoch": 3.18,
1119
+ "grad_norm": 3.886106014251709,
1120
+ "learning_rate": 1.727533317557037e-05,
1121
+ "loss": 0.5285,
1122
+ "step": 1590
1123
+ },
1124
+ {
1125
+ "epoch": 3.2,
1126
+ "grad_norm": 0.9527933597564697,
1127
+ "learning_rate": 1.7227259384124408e-05,
1128
+ "loss": 0.5328,
1129
+ "step": 1600
1130
+ },
1131
+ {
1132
+ "epoch": 3.22,
1133
+ "grad_norm": 1.0082141160964966,
1134
+ "learning_rate": 1.7178833346252208e-05,
1135
+ "loss": 0.5333,
1136
+ "step": 1610
1137
+ },
1138
+ {
1139
+ "epoch": 3.24,
1140
+ "grad_norm": 0.9779771566390991,
1141
+ "learning_rate": 1.713005742217053e-05,
1142
+ "loss": 0.5163,
1143
+ "step": 1620
1144
+ },
1145
+ {
1146
+ "epoch": 3.26,
1147
+ "grad_norm": 2.230241060256958,
1148
+ "learning_rate": 1.7080933989149112e-05,
1149
+ "loss": 0.5173,
1150
+ "step": 1630
1151
+ },
1152
+ {
1153
+ "epoch": 3.2800000000000002,
1154
+ "grad_norm": 4.50337028503418,
1155
+ "learning_rate": 1.7031465441394766e-05,
1156
+ "loss": 0.5187,
1157
+ "step": 1640
1158
+ },
1159
+ {
1160
+ "epoch": 3.3,
1161
+ "grad_norm": 1.1370171308517456,
1162
+ "learning_rate": 1.698165418993473e-05,
1163
+ "loss": 0.5611,
1164
+ "step": 1650
1165
+ },
1166
+ {
1167
+ "epoch": 3.32,
1168
+ "grad_norm": 0.9167156219482422,
1169
+ "learning_rate": 1.6931502662499116e-05,
1170
+ "loss": 0.5381,
1171
+ "step": 1660
1172
+ },
1173
+ {
1174
+ "epoch": 3.34,
1175
+ "grad_norm": 2.5473690032958984,
1176
+ "learning_rate": 1.688101330340263e-05,
1177
+ "loss": 0.5089,
1178
+ "step": 1670
1179
+ },
1180
+ {
1181
+ "epoch": 3.36,
1182
+ "grad_norm": 0.9971883893013,
1183
+ "learning_rate": 1.683018857342539e-05,
1184
+ "loss": 0.5538,
1185
+ "step": 1680
1186
+ },
1187
+ {
1188
+ "epoch": 3.38,
1189
+ "grad_norm": 1.7401093244552612,
1190
+ "learning_rate": 1.6779030949693044e-05,
1191
+ "loss": 0.5216,
1192
+ "step": 1690
1193
+ },
1194
+ {
1195
+ "epoch": 3.4,
1196
+ "grad_norm": 6.5078935623168945,
1197
+ "learning_rate": 1.6727542925556e-05,
1198
+ "loss": 0.5356,
1199
+ "step": 1700
1200
+ },
1201
+ {
1202
+ "epoch": 3.42,
1203
+ "grad_norm": 0.9460225105285645,
1204
+ "learning_rate": 1.667572701046791e-05,
1205
+ "loss": 0.497,
1206
+ "step": 1710
1207
+ },
1208
+ {
1209
+ "epoch": 3.44,
1210
+ "grad_norm": 1.153794765472412,
1211
+ "learning_rate": 1.662358572986337e-05,
1212
+ "loss": 0.4934,
1213
+ "step": 1720
1214
+ },
1215
+ {
1216
+ "epoch": 3.46,
1217
+ "grad_norm": 1.1257123947143555,
1218
+ "learning_rate": 1.6571121625034847e-05,
1219
+ "loss": 0.5327,
1220
+ "step": 1730
1221
+ },
1222
+ {
1223
+ "epoch": 3.48,
1224
+ "grad_norm": 2.454582691192627,
1225
+ "learning_rate": 1.651833725300879e-05,
1226
+ "loss": 0.4995,
1227
+ "step": 1740
1228
+ },
1229
+ {
1230
+ "epoch": 3.5,
1231
+ "grad_norm": 1.7266925573349,
1232
+ "learning_rate": 1.6465235186421024e-05,
1233
+ "loss": 0.4945,
1234
+ "step": 1750
1235
+ },
1236
+ {
1237
+ "epoch": 3.52,
1238
+ "grad_norm": 2.5186126232147217,
1239
+ "learning_rate": 1.6411818013391357e-05,
1240
+ "loss": 0.4969,
1241
+ "step": 1760
1242
+ },
1243
+ {
1244
+ "epoch": 3.54,
1245
+ "grad_norm": 2.2978458404541016,
1246
+ "learning_rate": 1.6358088337397444e-05,
1247
+ "loss": 0.5133,
1248
+ "step": 1770
1249
+ },
1250
+ {
1251
+ "epoch": 3.56,
1252
+ "grad_norm": 9.083796501159668,
1253
+ "learning_rate": 1.630404877714789e-05,
1254
+ "loss": 0.4598,
1255
+ "step": 1780
1256
+ },
1257
+ {
1258
+ "epoch": 3.58,
1259
+ "grad_norm": 1.687730312347412,
1260
+ "learning_rate": 1.6249701966454626e-05,
1261
+ "loss": 0.5721,
1262
+ "step": 1790
1263
+ },
1264
+ {
1265
+ "epoch": 3.6,
1266
+ "grad_norm": 0.949276328086853,
1267
+ "learning_rate": 1.619505055410453e-05,
1268
+ "loss": 0.5549,
1269
+ "step": 1800
1270
+ },
1271
+ {
1272
+ "epoch": 3.62,
1273
+ "grad_norm": 1.7653878927230835,
1274
+ "learning_rate": 1.614009720373034e-05,
1275
+ "loss": 0.5192,
1276
+ "step": 1810
1277
+ },
1278
+ {
1279
+ "epoch": 3.64,
1280
+ "grad_norm": 0.6442993879318237,
1281
+ "learning_rate": 1.608484459368082e-05,
1282
+ "loss": 0.4927,
1283
+ "step": 1820
1284
+ },
1285
+ {
1286
+ "epoch": 3.66,
1287
+ "grad_norm": 2.4791717529296875,
1288
+ "learning_rate": 1.602929541689025e-05,
1289
+ "loss": 0.5319,
1290
+ "step": 1830
1291
+ },
1292
+ {
1293
+ "epoch": 3.68,
1294
+ "grad_norm": 1.8095901012420654,
1295
+ "learning_rate": 1.5973452380747125e-05,
1296
+ "loss": 0.5025,
1297
+ "step": 1840
1298
+ },
1299
+ {
1300
+ "epoch": 3.7,
1301
+ "grad_norm": 1.8692318201065063,
1302
+ "learning_rate": 1.591731820696224e-05,
1303
+ "loss": 0.497,
1304
+ "step": 1850
1305
+ },
1306
+ {
1307
+ "epoch": 3.7199999999999998,
1308
+ "grad_norm": 1.3001285791397095,
1309
+ "learning_rate": 1.5860895631436044e-05,
1310
+ "loss": 0.556,
1311
+ "step": 1860
1312
+ },
1313
+ {
1314
+ "epoch": 3.74,
1315
+ "grad_norm": 1.0697414875030518,
1316
+ "learning_rate": 1.580418740412526e-05,
1317
+ "loss": 0.506,
1318
+ "step": 1870
1319
+ },
1320
+ {
1321
+ "epoch": 3.76,
1322
+ "grad_norm": 3.5158541202545166,
1323
+ "learning_rate": 1.5747196288908887e-05,
1324
+ "loss": 0.5154,
1325
+ "step": 1880
1326
+ },
1327
+ {
1328
+ "epoch": 3.7800000000000002,
1329
+ "grad_norm": 1.699308156967163,
1330
+ "learning_rate": 1.5689925063453483e-05,
1331
+ "loss": 0.5887,
1332
+ "step": 1890
1333
+ },
1334
+ {
1335
+ "epoch": 3.8,
1336
+ "grad_norm": 0.825036883354187,
1337
+ "learning_rate": 1.563237651907777e-05,
1338
+ "loss": 0.508,
1339
+ "step": 1900
1340
+ },
1341
+ {
1342
+ "epoch": 3.82,
1343
+ "grad_norm": 1.7058846950531006,
1344
+ "learning_rate": 1.5574553460616608e-05,
1345
+ "loss": 0.4954,
1346
+ "step": 1910
1347
+ },
1348
+ {
1349
+ "epoch": 3.84,
1350
+ "grad_norm": 1.7128701210021973,
1351
+ "learning_rate": 1.5516458706284306e-05,
1352
+ "loss": 0.5628,
1353
+ "step": 1920
1354
+ },
1355
+ {
1356
+ "epoch": 3.86,
1357
+ "grad_norm": 0.8009471297264099,
1358
+ "learning_rate": 1.5458095087537216e-05,
1359
+ "loss": 0.4494,
1360
+ "step": 1930
1361
+ },
1362
+ {
1363
+ "epoch": 3.88,
1364
+ "grad_norm": 1.8152306079864502,
1365
+ "learning_rate": 1.5399465448935788e-05,
1366
+ "loss": 0.522,
1367
+ "step": 1940
1368
+ },
1369
+ {
1370
+ "epoch": 3.9,
1371
+ "grad_norm": 2.59840989112854,
1372
+ "learning_rate": 1.5340572648005887e-05,
1373
+ "loss": 0.5225,
1374
+ "step": 1950
1375
+ },
1376
+ {
1377
+ "epoch": 3.92,
1378
+ "grad_norm": 3.8816378116607666,
1379
+ "learning_rate": 1.5281419555099547e-05,
1380
+ "loss": 0.5092,
1381
+ "step": 1960
1382
+ },
1383
+ {
1384
+ "epoch": 3.94,
1385
+ "grad_norm": 1.4815788269042969,
1386
+ "learning_rate": 1.5222009053255061e-05,
1387
+ "loss": 0.5167,
1388
+ "step": 1970
1389
+ },
1390
+ {
1391
+ "epoch": 3.96,
1392
+ "grad_norm": 1.5924495458602905,
1393
+ "learning_rate": 1.5162344038056476e-05,
1394
+ "loss": 0.5198,
1395
+ "step": 1980
1396
+ },
1397
+ {
1398
+ "epoch": 3.98,
1399
+ "grad_norm": 1.44657301902771,
1400
+ "learning_rate": 1.510242741749246e-05,
1401
+ "loss": 0.5723,
1402
+ "step": 1990
1403
+ },
1404
+ {
1405
+ "epoch": 4.0,
1406
+ "grad_norm": 2.8102152347564697,
1407
+ "learning_rate": 1.5042262111814566e-05,
1408
+ "loss": 0.4707,
1409
+ "step": 2000
1410
+ },
1411
+ {
1412
+ "epoch": 4.02,
1413
+ "grad_norm": 1.1032963991165161,
1414
+ "learning_rate": 1.498185105339491e-05,
1415
+ "loss": 0.523,
1416
+ "step": 2010
1417
+ },
1418
+ {
1419
+ "epoch": 4.04,
1420
+ "grad_norm": 1.0523751974105835,
1421
+ "learning_rate": 1.4921197186583256e-05,
1422
+ "loss": 0.4433,
1423
+ "step": 2020
1424
+ },
1425
+ {
1426
+ "epoch": 4.06,
1427
+ "grad_norm": 3.989997148513794,
1428
+ "learning_rate": 1.4860303467563504e-05,
1429
+ "loss": 0.4861,
1430
+ "step": 2030
1431
+ },
1432
+ {
1433
+ "epoch": 4.08,
1434
+ "grad_norm": 2.7621233463287354,
1435
+ "learning_rate": 1.4799172864209607e-05,
1436
+ "loss": 0.4621,
1437
+ "step": 2040
1438
+ },
1439
+ {
1440
+ "epoch": 4.1,
1441
+ "grad_norm": 0.960752010345459,
1442
+ "learning_rate": 1.4737808355940932e-05,
1443
+ "loss": 0.5454,
1444
+ "step": 2050
1445
+ },
1446
+ {
1447
+ "epoch": 4.12,
1448
+ "grad_norm": 1.279720664024353,
1449
+ "learning_rate": 1.467621293357704e-05,
1450
+ "loss": 0.4984,
1451
+ "step": 2060
1452
+ },
1453
+ {
1454
+ "epoch": 4.14,
1455
+ "grad_norm": 2.1226606369018555,
1456
+ "learning_rate": 1.4614389599191917e-05,
1457
+ "loss": 0.5375,
1458
+ "step": 2070
1459
+ },
1460
+ {
1461
+ "epoch": 4.16,
1462
+ "grad_norm": 1.6314127445220947,
1463
+ "learning_rate": 1.455234136596766e-05,
1464
+ "loss": 0.5191,
1465
+ "step": 2080
1466
+ },
1467
+ {
1468
+ "epoch": 4.18,
1469
+ "grad_norm": 2.72589111328125,
1470
+ "learning_rate": 1.4490071258047625e-05,
1471
+ "loss": 0.4872,
1472
+ "step": 2090
1473
+ },
1474
+ {
1475
+ "epoch": 4.2,
1476
+ "grad_norm": 4.3185648918151855,
1477
+ "learning_rate": 1.442758231038902e-05,
1478
+ "loss": 0.4883,
1479
+ "step": 2100
1480
+ },
1481
+ {
1482
+ "epoch": 4.22,
1483
+ "grad_norm": 0.8917685747146606,
1484
+ "learning_rate": 1.436487756861499e-05,
1485
+ "loss": 0.4564,
1486
+ "step": 2110
1487
+ },
1488
+ {
1489
+ "epoch": 4.24,
1490
+ "grad_norm": 1.1269738674163818,
1491
+ "learning_rate": 1.4301960088866187e-05,
1492
+ "loss": 0.4311,
1493
+ "step": 2120
1494
+ },
1495
+ {
1496
+ "epoch": 4.26,
1497
+ "grad_norm": 4.341186046600342,
1498
+ "learning_rate": 1.4238832937651816e-05,
1499
+ "loss": 0.4663,
1500
+ "step": 2130
1501
+ },
1502
+ {
1503
+ "epoch": 4.28,
1504
+ "grad_norm": 0.9497487545013428,
1505
+ "learning_rate": 1.4175499191700169e-05,
1506
+ "loss": 0.5048,
1507
+ "step": 2140
1508
+ },
1509
+ {
1510
+ "epoch": 4.3,
1511
+ "grad_norm": 1.7001278400421143,
1512
+ "learning_rate": 1.4111961937808665e-05,
1513
+ "loss": 0.493,
1514
+ "step": 2150
1515
+ },
1516
+ {
1517
+ "epoch": 4.32,
1518
+ "grad_norm": 1.7037960290908813,
1519
+ "learning_rate": 1.4048224272693426e-05,
1520
+ "loss": 0.4712,
1521
+ "step": 2160
1522
+ },
1523
+ {
1524
+ "epoch": 4.34,
1525
+ "grad_norm": 0.9490267634391785,
1526
+ "learning_rate": 1.3984289302838327e-05,
1527
+ "loss": 0.4641,
1528
+ "step": 2170
1529
+ },
1530
+ {
1531
+ "epoch": 4.36,
1532
+ "grad_norm": 2.2001664638519287,
1533
+ "learning_rate": 1.3920160144343604e-05,
1534
+ "loss": 0.4929,
1535
+ "step": 2180
1536
+ },
1537
+ {
1538
+ "epoch": 4.38,
1539
+ "grad_norm": 1.0687891244888306,
1540
+ "learning_rate": 1.3855839922773968e-05,
1541
+ "loss": 0.5269,
1542
+ "step": 2190
1543
+ },
1544
+ {
1545
+ "epoch": 4.4,
1546
+ "grad_norm": 1.1579177379608154,
1547
+ "learning_rate": 1.3791331773006272e-05,
1548
+ "loss": 0.4857,
1549
+ "step": 2200
1550
+ },
1551
+ {
1552
+ "epoch": 4.42,
1553
+ "grad_norm": 0.9180253744125366,
1554
+ "learning_rate": 1.3726638839076732e-05,
1555
+ "loss": 0.5613,
1556
+ "step": 2210
1557
+ },
1558
+ {
1559
+ "epoch": 4.44,
1560
+ "grad_norm": 1.170154333114624,
1561
+ "learning_rate": 1.3661764274027678e-05,
1562
+ "loss": 0.4884,
1563
+ "step": 2220
1564
+ },
1565
+ {
1566
+ "epoch": 4.46,
1567
+ "grad_norm": 1.8389673233032227,
1568
+ "learning_rate": 1.3596711239753889e-05,
1569
+ "loss": 0.4849,
1570
+ "step": 2230
1571
+ },
1572
+ {
1573
+ "epoch": 4.48,
1574
+ "grad_norm": 1.43435537815094,
1575
+ "learning_rate": 1.3531482906848474e-05,
1576
+ "loss": 0.4752,
1577
+ "step": 2240
1578
+ },
1579
+ {
1580
+ "epoch": 4.5,
1581
+ "grad_norm": 1.2561147212982178,
1582
+ "learning_rate": 1.3466082454448364e-05,
1583
+ "loss": 0.4804,
1584
+ "step": 2250
1585
+ },
1586
+ {
1587
+ "epoch": 4.52,
1588
+ "grad_norm": 1.3098878860473633,
1589
+ "learning_rate": 1.340051307007933e-05,
1590
+ "loss": 0.4719,
1591
+ "step": 2260
1592
+ },
1593
+ {
1594
+ "epoch": 4.54,
1595
+ "grad_norm": 7.966071128845215,
1596
+ "learning_rate": 1.3334777949500673e-05,
1597
+ "loss": 0.4599,
1598
+ "step": 2270
1599
+ },
1600
+ {
1601
+ "epoch": 4.5600000000000005,
1602
+ "grad_norm": 2.28067946434021,
1603
+ "learning_rate": 1.3268880296549424e-05,
1604
+ "loss": 0.4712,
1605
+ "step": 2280
1606
+ },
1607
+ {
1608
+ "epoch": 4.58,
1609
+ "grad_norm": 1.1881040334701538,
1610
+ "learning_rate": 1.3202823322984228e-05,
1611
+ "loss": 0.4772,
1612
+ "step": 2290
1613
+ },
1614
+ {
1615
+ "epoch": 4.6,
1616
+ "grad_norm": 1.482059359550476,
1617
+ "learning_rate": 1.3136610248328779e-05,
1618
+ "loss": 0.453,
1619
+ "step": 2300
1620
+ },
1621
+ {
1622
+ "epoch": 4.62,
1623
+ "grad_norm": 2.1320247650146484,
1624
+ "learning_rate": 1.307024429971492e-05,
1625
+ "loss": 0.4657,
1626
+ "step": 2310
1627
+ },
1628
+ {
1629
+ "epoch": 4.64,
1630
+ "grad_norm": 1.2124568223953247,
1631
+ "learning_rate": 1.3003728711725364e-05,
1632
+ "loss": 0.4791,
1633
+ "step": 2320
1634
+ },
1635
+ {
1636
+ "epoch": 4.66,
1637
+ "grad_norm": 0.7249093055725098,
1638
+ "learning_rate": 1.2937066726236029e-05,
1639
+ "loss": 0.5586,
1640
+ "step": 2330
1641
+ },
1642
+ {
1643
+ "epoch": 4.68,
1644
+ "grad_norm": 1.5038686990737915,
1645
+ "learning_rate": 1.2870261592258038e-05,
1646
+ "loss": 0.4603,
1647
+ "step": 2340
1648
+ },
1649
+ {
1650
+ "epoch": 4.7,
1651
+ "grad_norm": 1.320827603340149,
1652
+ "learning_rate": 1.2803316565779378e-05,
1653
+ "loss": 0.4618,
1654
+ "step": 2350
1655
+ },
1656
+ {
1657
+ "epoch": 4.72,
1658
+ "grad_norm": 1.4801580905914307,
1659
+ "learning_rate": 1.2736234909606186e-05,
1660
+ "loss": 0.4643,
1661
+ "step": 2360
1662
+ },
1663
+ {
1664
+ "epoch": 4.74,
1665
+ "grad_norm": 2.649113655090332,
1666
+ "learning_rate": 1.2669019893203758e-05,
1667
+ "loss": 0.5017,
1668
+ "step": 2370
1669
+ },
1670
+ {
1671
+ "epoch": 4.76,
1672
+ "grad_norm": 1.3265902996063232,
1673
+ "learning_rate": 1.2601674792537157e-05,
1674
+ "loss": 0.451,
1675
+ "step": 2380
1676
+ },
1677
+ {
1678
+ "epoch": 4.78,
1679
+ "grad_norm": 1.6874167919158936,
1680
+ "learning_rate": 1.2534202889911584e-05,
1681
+ "loss": 0.4763,
1682
+ "step": 2390
1683
+ },
1684
+ {
1685
+ "epoch": 4.8,
1686
+ "grad_norm": 0.7948456406593323,
1687
+ "learning_rate": 1.2466607473812386e-05,
1688
+ "loss": 0.4984,
1689
+ "step": 2400
1690
+ },
1691
+ {
1692
+ "epoch": 4.82,
1693
+ "grad_norm": 1.3831831216812134,
1694
+ "learning_rate": 1.2398891838744777e-05,
1695
+ "loss": 0.4594,
1696
+ "step": 2410
1697
+ },
1698
+ {
1699
+ "epoch": 4.84,
1700
+ "grad_norm": 2.660630941390991,
1701
+ "learning_rate": 1.233105928507328e-05,
1702
+ "loss": 0.476,
1703
+ "step": 2420
1704
+ },
1705
+ {
1706
+ "epoch": 4.86,
1707
+ "grad_norm": 1.7315685749053955,
1708
+ "learning_rate": 1.226311311886086e-05,
1709
+ "loss": 0.4599,
1710
+ "step": 2430
1711
+ },
1712
+ {
1713
+ "epoch": 4.88,
1714
+ "grad_norm": 1.0656920671463013,
1715
+ "learning_rate": 1.2195056651707806e-05,
1716
+ "loss": 0.4786,
1717
+ "step": 2440
1718
+ },
1719
+ {
1720
+ "epoch": 4.9,
1721
+ "grad_norm": 1.317185878753662,
1722
+ "learning_rate": 1.2126893200590309e-05,
1723
+ "loss": 0.539,
1724
+ "step": 2450
1725
+ },
1726
+ {
1727
+ "epoch": 4.92,
1728
+ "grad_norm": 2.1588151454925537,
1729
+ "learning_rate": 1.2058626087698814e-05,
1730
+ "loss": 0.442,
1731
+ "step": 2460
1732
+ },
1733
+ {
1734
+ "epoch": 4.9399999999999995,
1735
+ "grad_norm": 1.3337817192077637,
1736
+ "learning_rate": 1.1990258640276094e-05,
1737
+ "loss": 0.4829,
1738
+ "step": 2470
1739
+ },
1740
+ {
1741
+ "epoch": 4.96,
1742
+ "grad_norm": 1.3558602333068848,
1743
+ "learning_rate": 1.1921794190455082e-05,
1744
+ "loss": 0.5055,
1745
+ "step": 2480
1746
+ },
1747
+ {
1748
+ "epoch": 4.98,
1749
+ "grad_norm": 1.18630850315094,
1750
+ "learning_rate": 1.1853236075096474e-05,
1751
+ "loss": 0.4857,
1752
+ "step": 2490
1753
+ },
1754
+ {
1755
+ "epoch": 5.0,
1756
+ "grad_norm": 1.4073606729507446,
1757
+ "learning_rate": 1.1784587635626095e-05,
1758
+ "loss": 0.4962,
1759
+ "step": 2500
1760
+ },
1761
+ {
1762
+ "epoch": 5.02,
1763
+ "grad_norm": 2.1152431964874268,
1764
+ "learning_rate": 1.171585221787203e-05,
1765
+ "loss": 0.4185,
1766
+ "step": 2510
1767
+ },
1768
+ {
1769
+ "epoch": 5.04,
1770
+ "grad_norm": 1.4434751272201538,
1771
+ "learning_rate": 1.1647033171901573e-05,
1772
+ "loss": 0.4545,
1773
+ "step": 2520
1774
+ },
1775
+ {
1776
+ "epoch": 5.06,
1777
+ "grad_norm": 2.575100898742676,
1778
+ "learning_rate": 1.157813385185794e-05,
1779
+ "loss": 0.4162,
1780
+ "step": 2530
1781
+ },
1782
+ {
1783
+ "epoch": 5.08,
1784
+ "grad_norm": 1.808670163154602,
1785
+ "learning_rate": 1.1509157615796775e-05,
1786
+ "loss": 0.425,
1787
+ "step": 2540
1788
+ },
1789
+ {
1790
+ "epoch": 5.1,
1791
+ "grad_norm": 2.0756947994232178,
1792
+ "learning_rate": 1.1440107825522522e-05,
1793
+ "loss": 0.4514,
1794
+ "step": 2550
1795
+ },
1796
+ {
1797
+ "epoch": 5.12,
1798
+ "grad_norm": 1.3622123003005981,
1799
+ "learning_rate": 1.1370987846424547e-05,
1800
+ "loss": 0.4687,
1801
+ "step": 2560
1802
+ },
1803
+ {
1804
+ "epoch": 5.14,
1805
+ "grad_norm": 1.938477873802185,
1806
+ "learning_rate": 1.1301801047313106e-05,
1807
+ "loss": 0.4892,
1808
+ "step": 2570
1809
+ },
1810
+ {
1811
+ "epoch": 5.16,
1812
+ "grad_norm": 1.3552794456481934,
1813
+ "learning_rate": 1.1232550800255188e-05,
1814
+ "loss": 0.4675,
1815
+ "step": 2580
1816
+ },
1817
+ {
1818
+ "epoch": 5.18,
1819
+ "grad_norm": 2.3927013874053955,
1820
+ "learning_rate": 1.1163240480410136e-05,
1821
+ "loss": 0.4336,
1822
+ "step": 2590
1823
+ },
1824
+ {
1825
+ "epoch": 5.2,
1826
+ "grad_norm": 1.3408219814300537,
1827
+ "learning_rate": 1.1093873465865156e-05,
1828
+ "loss": 0.4358,
1829
+ "step": 2600
1830
+ },
1831
+ {
1832
+ "epoch": 5.22,
1833
+ "grad_norm": 2.9869275093078613,
1834
+ "learning_rate": 1.1024453137470677e-05,
1835
+ "loss": 0.4709,
1836
+ "step": 2610
1837
+ },
1838
+ {
1839
+ "epoch": 5.24,
1840
+ "grad_norm": 2.7815663814544678,
1841
+ "learning_rate": 1.0954982878675564e-05,
1842
+ "loss": 0.4349,
1843
+ "step": 2620
1844
+ },
1845
+ {
1846
+ "epoch": 5.26,
1847
+ "grad_norm": 1.9527360200881958,
1848
+ "learning_rate": 1.0885466075362224e-05,
1849
+ "loss": 0.4581,
1850
+ "step": 2630
1851
+ },
1852
+ {
1853
+ "epoch": 5.28,
1854
+ "grad_norm": 1.804969072341919,
1855
+ "learning_rate": 1.0815906115681579e-05,
1856
+ "loss": 0.4482,
1857
+ "step": 2640
1858
+ },
1859
+ {
1860
+ "epoch": 5.3,
1861
+ "grad_norm": 1.6230815649032593,
1862
+ "learning_rate": 1.0746306389887924e-05,
1863
+ "loss": 0.4771,
1864
+ "step": 2650
1865
+ },
1866
+ {
1867
+ "epoch": 5.32,
1868
+ "grad_norm": 2.6288340091705322,
1869
+ "learning_rate": 1.067667029017371e-05,
1870
+ "loss": 0.4893,
1871
+ "step": 2660
1872
+ },
1873
+ {
1874
+ "epoch": 5.34,
1875
+ "grad_norm": 0.7628895044326782,
1876
+ "learning_rate": 1.060700121050419e-05,
1877
+ "loss": 0.4823,
1878
+ "step": 2670
1879
+ },
1880
+ {
1881
+ "epoch": 5.36,
1882
+ "grad_norm": 1.395524501800537,
1883
+ "learning_rate": 1.0537302546452022e-05,
1884
+ "loss": 0.45,
1885
+ "step": 2680
1886
+ },
1887
+ {
1888
+ "epoch": 5.38,
1889
+ "grad_norm": 1.0264369249343872,
1890
+ "learning_rate": 1.0467577695031763e-05,
1891
+ "loss": 0.4817,
1892
+ "step": 2690
1893
+ },
1894
+ {
1895
+ "epoch": 5.4,
1896
+ "grad_norm": 1.3651304244995117,
1897
+ "learning_rate": 1.03978300545343e-05,
1898
+ "loss": 0.4472,
1899
+ "step": 2700
1900
+ },
1901
+ {
1902
+ "epoch": 5.42,
1903
+ "grad_norm": 1.520727276802063,
1904
+ "learning_rate": 1.0328063024361232e-05,
1905
+ "loss": 0.4351,
1906
+ "step": 2710
1907
+ },
1908
+ {
1909
+ "epoch": 5.44,
1910
+ "grad_norm": 2.283327102661133,
1911
+ "learning_rate": 1.0258280004859189e-05,
1912
+ "loss": 0.4052,
1913
+ "step": 2720
1914
+ },
1915
+ {
1916
+ "epoch": 5.46,
1917
+ "grad_norm": 5.243598937988281,
1918
+ "learning_rate": 1.0188484397154083e-05,
1919
+ "loss": 0.51,
1920
+ "step": 2730
1921
+ },
1922
+ {
1923
+ "epoch": 5.48,
1924
+ "grad_norm": 2.3326563835144043,
1925
+ "learning_rate": 1.0118679602985373e-05,
1926
+ "loss": 0.4678,
1927
+ "step": 2740
1928
+ },
1929
+ {
1930
+ "epoch": 5.5,
1931
+ "grad_norm": 1.8756747245788574,
1932
+ "learning_rate": 1.0048869024540247e-05,
1933
+ "loss": 0.4802,
1934
+ "step": 2750
1935
+ },
1936
+ {
1937
+ "epoch": 5.52,
1938
+ "grad_norm": 2.212642192840576,
1939
+ "learning_rate": 9.979056064287807e-06,
1940
+ "loss": 0.4416,
1941
+ "step": 2760
1942
+ },
1943
+ {
1944
+ "epoch": 5.54,
1945
+ "grad_norm": 1.893557071685791,
1946
+ "learning_rate": 9.909244124813246e-06,
1947
+ "loss": 0.4613,
1948
+ "step": 2770
1949
+ },
1950
+ {
1951
+ "epoch": 5.5600000000000005,
1952
+ "grad_norm": 3.211782217025757,
1953
+ "learning_rate": 9.839436608652007e-06,
1954
+ "loss": 0.4163,
1955
+ "step": 2780
1956
+ },
1957
+ {
1958
+ "epoch": 5.58,
1959
+ "grad_norm": 1.4164925813674927,
1960
+ "learning_rate": 9.76963691812394e-06,
1961
+ "loss": 0.4753,
1962
+ "step": 2790
1963
+ },
1964
+ {
1965
+ "epoch": 5.6,
1966
+ "grad_norm": 1.139273762702942,
1967
+ "learning_rate": 9.699848455167489e-06,
1968
+ "loss": 0.4725,
1969
+ "step": 2800
1970
+ },
1971
+ {
1972
+ "epoch": 5.62,
1973
+ "grad_norm": 1.5774643421173096,
1974
+ "learning_rate": 9.630074621173882e-06,
1975
+ "loss": 0.4521,
1976
+ "step": 2810
1977
+ },
1978
+ {
1979
+ "epoch": 5.64,
1980
+ "grad_norm": 2.0061256885528564,
1981
+ "learning_rate": 9.560318816821354e-06,
1982
+ "loss": 0.3838,
1983
+ "step": 2820
1984
+ },
1985
+ {
1986
+ "epoch": 5.66,
1987
+ "grad_norm": 3.7671396732330322,
1988
+ "learning_rate": 9.490584441909392e-06,
1989
+ "loss": 0.4603,
1990
+ "step": 2830
1991
+ },
1992
+ {
1993
+ "epoch": 5.68,
1994
+ "grad_norm": 1.6197257041931152,
1995
+ "learning_rate": 9.420874895193056e-06,
1996
+ "loss": 0.4538,
1997
+ "step": 2840
1998
+ },
1999
+ {
2000
+ "epoch": 5.7,
2001
+ "grad_norm": 3.386794328689575,
2002
+ "learning_rate": 9.351193574217305e-06,
2003
+ "loss": 0.4527,
2004
+ "step": 2850
2005
+ },
2006
+ {
2007
+ "epoch": 5.72,
2008
+ "grad_norm": 1.4062769412994385,
2009
+ "learning_rate": 9.281543875151419e-06,
2010
+ "loss": 0.4915,
2011
+ "step": 2860
2012
+ },
2013
+ {
2014
+ "epoch": 5.74,
2015
+ "grad_norm": 2.367417573928833,
2016
+ "learning_rate": 9.211929192623466e-06,
2017
+ "loss": 0.4338,
2018
+ "step": 2870
2019
+ },
2020
+ {
2021
+ "epoch": 5.76,
2022
+ "grad_norm": 1.7326956987380981,
2023
+ "learning_rate": 9.142352919554862e-06,
2024
+ "loss": 0.4573,
2025
+ "step": 2880
2026
+ },
2027
+ {
2028
+ "epoch": 5.78,
2029
+ "grad_norm": 2.8575878143310547,
2030
+ "learning_rate": 9.072818446995e-06,
2031
+ "loss": 0.4494,
2032
+ "step": 2890
2033
+ },
2034
+ {
2035
+ "epoch": 5.8,
2036
+ "grad_norm": 1.1295793056488037,
2037
+ "learning_rate": 9.003329163955973e-06,
2038
+ "loss": 0.5061,
2039
+ "step": 2900
2040
+ },
2041
+ {
2042
+ "epoch": 5.82,
2043
+ "grad_norm": 1.31191885471344,
2044
+ "learning_rate": 8.933888457247402e-06,
2045
+ "loss": 0.4537,
2046
+ "step": 2910
2047
+ },
2048
+ {
2049
+ "epoch": 5.84,
2050
+ "grad_norm": 1.8330936431884766,
2051
+ "learning_rate": 8.864499711311362e-06,
2052
+ "loss": 0.4764,
2053
+ "step": 2920
2054
+ },
2055
+ {
2056
+ "epoch": 5.86,
2057
+ "grad_norm": 1.8839877843856812,
2058
+ "learning_rate": 8.79516630805745e-06,
2059
+ "loss": 0.4563,
2060
+ "step": 2930
2061
+ },
2062
+ {
2063
+ "epoch": 5.88,
2064
+ "grad_norm": 2.6970791816711426,
2065
+ "learning_rate": 8.725891626697912e-06,
2066
+ "loss": 0.4887,
2067
+ "step": 2940
2068
+ },
2069
+ {
2070
+ "epoch": 5.9,
2071
+ "grad_norm": 3.178072214126587,
2072
+ "learning_rate": 8.656679043582986e-06,
2073
+ "loss": 0.446,
2074
+ "step": 2950
2075
+ },
2076
+ {
2077
+ "epoch": 5.92,
2078
+ "grad_norm": 2.423067569732666,
2079
+ "learning_rate": 8.587531932036334e-06,
2080
+ "loss": 0.4533,
2081
+ "step": 2960
2082
+ },
2083
+ {
2084
+ "epoch": 5.9399999999999995,
2085
+ "grad_norm": 3.043440580368042,
2086
+ "learning_rate": 8.518453662190622e-06,
2087
+ "loss": 0.4451,
2088
+ "step": 2970
2089
+ },
2090
+ {
2091
+ "epoch": 5.96,
2092
+ "grad_norm": 2.4324257373809814,
2093
+ "learning_rate": 8.449447600823262e-06,
2094
+ "loss": 0.393,
2095
+ "step": 2980
2096
+ },
2097
+ {
2098
+ "epoch": 5.98,
2099
+ "grad_norm": 7.399738311767578,
2100
+ "learning_rate": 8.380517111192336e-06,
2101
+ "loss": 0.4406,
2102
+ "step": 2990
2103
+ },
2104
+ {
2105
+ "epoch": 6.0,
2106
+ "grad_norm": 0.8923618197441101,
2107
+ "learning_rate": 8.311665552872662e-06,
2108
+ "loss": 0.474,
2109
+ "step": 3000
2110
+ },
2111
+ {
2112
+ "epoch": 6.02,
2113
+ "grad_norm": 1.7500466108322144,
2114
+ "learning_rate": 8.242896281592057e-06,
2115
+ "loss": 0.3953,
2116
+ "step": 3010
2117
+ },
2118
+ {
2119
+ "epoch": 6.04,
2120
+ "grad_norm": 3.8324530124664307,
2121
+ "learning_rate": 8.174212649067781e-06,
2122
+ "loss": 0.4117,
2123
+ "step": 3020
2124
+ },
2125
+ {
2126
+ "epoch": 6.06,
2127
+ "grad_norm": 2.275822639465332,
2128
+ "learning_rate": 8.10561800284319e-06,
2129
+ "loss": 0.3988,
2130
+ "step": 3030
2131
+ },
2132
+ {
2133
+ "epoch": 6.08,
2134
+ "grad_norm": 1.2562943696975708,
2135
+ "learning_rate": 8.037115686124564e-06,
2136
+ "loss": 0.418,
2137
+ "step": 3040
2138
+ },
2139
+ {
2140
+ "epoch": 6.1,
2141
+ "grad_norm": 1.3214370012283325,
2142
+ "learning_rate": 7.96870903761818e-06,
2143
+ "loss": 0.4084,
2144
+ "step": 3050
2145
+ },
2146
+ {
2147
+ "epoch": 6.12,
2148
+ "grad_norm": 2.595797061920166,
2149
+ "learning_rate": 7.900401391367576e-06,
2150
+ "loss": 0.3739,
2151
+ "step": 3060
2152
+ },
2153
+ {
2154
+ "epoch": 6.14,
2155
+ "grad_norm": 2.055779457092285,
2156
+ "learning_rate": 7.832196076591067e-06,
2157
+ "loss": 0.3763,
2158
+ "step": 3070
2159
+ },
2160
+ {
2161
+ "epoch": 6.16,
2162
+ "grad_norm": 2.5182206630706787,
2163
+ "learning_rate": 7.76409641751947e-06,
2164
+ "loss": 0.4522,
2165
+ "step": 3080
2166
+ },
2167
+ {
2168
+ "epoch": 6.18,
2169
+ "grad_norm": 6.86693000793457,
2170
+ "learning_rate": 7.696105733234099e-06,
2171
+ "loss": 0.4661,
2172
+ "step": 3090
2173
+ },
2174
+ {
2175
+ "epoch": 6.2,
2176
+ "grad_norm": 0.7651225924491882,
2177
+ "learning_rate": 7.628227337504972e-06,
2178
+ "loss": 0.4519,
2179
+ "step": 3100
2180
+ },
2181
+ {
2182
+ "epoch": 6.22,
2183
+ "grad_norm": 1.531447172164917,
2184
+ "learning_rate": 7.560464538629345e-06,
2185
+ "loss": 0.4073,
2186
+ "step": 3110
2187
+ },
2188
+ {
2189
+ "epoch": 6.24,
2190
+ "grad_norm": 2.4921135902404785,
2191
+ "learning_rate": 7.492820639270435e-06,
2192
+ "loss": 0.4458,
2193
+ "step": 3120
2194
+ },
2195
+ {
2196
+ "epoch": 6.26,
2197
+ "grad_norm": 1.5849100351333618,
2198
+ "learning_rate": 7.4252989362964635e-06,
2199
+ "loss": 0.3703,
2200
+ "step": 3130
2201
+ },
2202
+ {
2203
+ "epoch": 6.28,
2204
+ "grad_norm": 1.8190685510635376,
2205
+ "learning_rate": 7.357902720619976e-06,
2206
+ "loss": 0.4393,
2207
+ "step": 3140
2208
+ },
2209
+ {
2210
+ "epoch": 6.3,
2211
+ "grad_norm": 1.195379376411438,
2212
+ "learning_rate": 7.290635277037442e-06,
2213
+ "loss": 0.437,
2214
+ "step": 3150
2215
+ },
2216
+ {
2217
+ "epoch": 6.32,
2218
+ "grad_norm": 1.6389209032058716,
2219
+ "learning_rate": 7.22349988406916e-06,
2220
+ "loss": 0.3979,
2221
+ "step": 3160
2222
+ },
2223
+ {
2224
+ "epoch": 6.34,
2225
+ "grad_norm": 1.7568351030349731,
2226
+ "learning_rate": 7.156499813799477e-06,
2227
+ "loss": 0.4078,
2228
+ "step": 3170
2229
+ },
2230
+ {
2231
+ "epoch": 6.36,
2232
+ "grad_norm": 2.5609893798828125,
2233
+ "learning_rate": 7.0896383317172845e-06,
2234
+ "loss": 0.4182,
2235
+ "step": 3180
2236
+ },
2237
+ {
2238
+ "epoch": 6.38,
2239
+ "grad_norm": 2.070969343185425,
2240
+ "learning_rate": 7.022918696556896e-06,
2241
+ "loss": 0.4239,
2242
+ "step": 3190
2243
+ },
2244
+ {
2245
+ "epoch": 6.4,
2246
+ "grad_norm": 3.3474345207214355,
2247
+ "learning_rate": 6.956344160139201e-06,
2248
+ "loss": 0.4369,
2249
+ "step": 3200
2250
+ },
2251
+ {
2252
+ "epoch": 6.42,
2253
+ "grad_norm": 1.3559445142745972,
2254
+ "learning_rate": 6.889917967213184e-06,
2255
+ "loss": 0.4469,
2256
+ "step": 3210
2257
+ },
2258
+ {
2259
+ "epoch": 6.44,
2260
+ "grad_norm": 1.4630825519561768,
2261
+ "learning_rate": 6.823643355297774e-06,
2262
+ "loss": 0.4312,
2263
+ "step": 3220
2264
+ },
2265
+ {
2266
+ "epoch": 6.46,
2267
+ "grad_norm": 2.0589451789855957,
2268
+ "learning_rate": 6.757523554524056e-06,
2269
+ "loss": 0.4465,
2270
+ "step": 3230
2271
+ },
2272
+ {
2273
+ "epoch": 6.48,
2274
+ "grad_norm": 2.448317766189575,
2275
+ "learning_rate": 6.69156178747784e-06,
2276
+ "loss": 0.4201,
2277
+ "step": 3240
2278
+ },
2279
+ {
2280
+ "epoch": 6.5,
2281
+ "grad_norm": 1.911927580833435,
2282
+ "learning_rate": 6.62576126904259e-06,
2283
+ "loss": 0.3882,
2284
+ "step": 3250
2285
+ },
2286
+ {
2287
+ "epoch": 6.52,
2288
+ "grad_norm": 3.176950216293335,
2289
+ "learning_rate": 6.560125206242746e-06,
2290
+ "loss": 0.4448,
2291
+ "step": 3260
2292
+ },
2293
+ {
2294
+ "epoch": 6.54,
2295
+ "grad_norm": 1.4145492315292358,
2296
+ "learning_rate": 6.494656798087412e-06,
2297
+ "loss": 0.3915,
2298
+ "step": 3270
2299
+ },
2300
+ {
2301
+ "epoch": 6.5600000000000005,
2302
+ "grad_norm": 4.982487201690674,
2303
+ "learning_rate": 6.4293592354144365e-06,
2304
+ "loss": 0.3769,
2305
+ "step": 3280
2306
+ },
2307
+ {
2308
+ "epoch": 6.58,
2309
+ "grad_norm": 3.9301717281341553,
2310
+ "learning_rate": 6.364235700734903e-06,
2311
+ "loss": 0.4503,
2312
+ "step": 3290
2313
+ },
2314
+ {
2315
+ "epoch": 6.6,
2316
+ "grad_norm": 3.643587112426758,
2317
+ "learning_rate": 6.299289368078016e-06,
2318
+ "loss": 0.4398,
2319
+ "step": 3300
2320
+ },
2321
+ {
2322
+ "epoch": 6.62,
2323
+ "grad_norm": 2.1195595264434814,
2324
+ "learning_rate": 6.234523402836408e-06,
2325
+ "loss": 0.4199,
2326
+ "step": 3310
2327
+ },
2328
+ {
2329
+ "epoch": 6.64,
2330
+ "grad_norm": 1.3784760236740112,
2331
+ "learning_rate": 6.169940961611853e-06,
2332
+ "loss": 0.4574,
2333
+ "step": 3320
2334
+ },
2335
+ {
2336
+ "epoch": 6.66,
2337
+ "grad_norm": 4.373683452606201,
2338
+ "learning_rate": 6.1055451920614165e-06,
2339
+ "loss": 0.4252,
2340
+ "step": 3330
2341
+ },
2342
+ {
2343
+ "epoch": 6.68,
2344
+ "grad_norm": 1.9143246412277222,
2345
+ "learning_rate": 6.0413392327440635e-06,
2346
+ "loss": 0.4069,
2347
+ "step": 3340
2348
+ },
2349
+ {
2350
+ "epoch": 6.7,
2351
+ "grad_norm": 1.9608592987060547,
2352
+ "learning_rate": 5.977326212967671e-06,
2353
+ "loss": 0.4173,
2354
+ "step": 3350
2355
+ },
2356
+ {
2357
+ "epoch": 6.72,
2358
+ "grad_norm": 2.7381646633148193,
2359
+ "learning_rate": 5.913509252636511e-06,
2360
+ "loss": 0.3737,
2361
+ "step": 3360
2362
+ },
2363
+ {
2364
+ "epoch": 6.74,
2365
+ "grad_norm": 1.9294111728668213,
2366
+ "learning_rate": 5.849891462099199e-06,
2367
+ "loss": 0.437,
2368
+ "step": 3370
2369
+ },
2370
+ {
2371
+ "epoch": 6.76,
2372
+ "grad_norm": 0.8937060236930847,
2373
+ "learning_rate": 5.786475941997094e-06,
2374
+ "loss": 0.4457,
2375
+ "step": 3380
2376
+ },
2377
+ {
2378
+ "epoch": 6.78,
2379
+ "grad_norm": 1.2962634563446045,
2380
+ "learning_rate": 5.723265783113181e-06,
2381
+ "loss": 0.3989,
2382
+ "step": 3390
2383
+ },
2384
+ {
2385
+ "epoch": 6.8,
2386
+ "grad_norm": 1.0391006469726562,
2387
+ "learning_rate": 5.660264066221426e-06,
2388
+ "loss": 0.4314,
2389
+ "step": 3400
2390
+ },
2391
+ {
2392
+ "epoch": 6.82,
2393
+ "grad_norm": 7.157230377197266,
2394
+ "learning_rate": 5.59747386193663e-06,
2395
+ "loss": 0.3989,
2396
+ "step": 3410
2397
+ },
2398
+ {
2399
+ "epoch": 6.84,
2400
+ "grad_norm": 2.1349549293518066,
2401
+ "learning_rate": 5.534898230564765e-06,
2402
+ "loss": 0.3792,
2403
+ "step": 3420
2404
+ },
2405
+ {
2406
+ "epoch": 6.86,
2407
+ "grad_norm": 1.362468957901001,
2408
+ "learning_rate": 5.472540221953824e-06,
2409
+ "loss": 0.4115,
2410
+ "step": 3430
2411
+ },
2412
+ {
2413
+ "epoch": 6.88,
2414
+ "grad_norm": 3.6166296005249023,
2415
+ "learning_rate": 5.41040287534517e-06,
2416
+ "loss": 0.4067,
2417
+ "step": 3440
2418
+ },
2419
+ {
2420
+ "epoch": 6.9,
2421
+ "grad_norm": 2.424628257751465,
2422
+ "learning_rate": 5.348489219225417e-06,
2423
+ "loss": 0.4424,
2424
+ "step": 3450
2425
+ },
2426
+ {
2427
+ "epoch": 6.92,
2428
+ "grad_norm": 2.7839276790618896,
2429
+ "learning_rate": 5.286802271178815e-06,
2430
+ "loss": 0.4508,
2431
+ "step": 3460
2432
+ },
2433
+ {
2434
+ "epoch": 6.9399999999999995,
2435
+ "grad_norm": 3.2447237968444824,
2436
+ "learning_rate": 5.225345037740186e-06,
2437
+ "loss": 0.3984,
2438
+ "step": 3470
2439
+ },
2440
+ {
2441
+ "epoch": 6.96,
2442
+ "grad_norm": 1.6200125217437744,
2443
+ "learning_rate": 5.16412051424839e-06,
2444
+ "loss": 0.4499,
2445
+ "step": 3480
2446
+ },
2447
+ {
2448
+ "epoch": 6.98,
2449
+ "grad_norm": 1.5760829448699951,
2450
+ "learning_rate": 5.103131684700315e-06,
2451
+ "loss": 0.4154,
2452
+ "step": 3490
2453
+ },
2454
+ {
2455
+ "epoch": 7.0,
2456
+ "grad_norm": 11.93620491027832,
2457
+ "learning_rate": 5.042381521605473e-06,
2458
+ "loss": 0.391,
2459
+ "step": 3500
2460
+ },
2461
+ {
2462
+ "epoch": 7.02,
2463
+ "grad_norm": 4.725837707519531,
2464
+ "learning_rate": 4.981872985841115e-06,
2465
+ "loss": 0.38,
2466
+ "step": 3510
2467
+ },
2468
+ {
2469
+ "epoch": 7.04,
2470
+ "grad_norm": 1.6126409769058228,
2471
+ "learning_rate": 4.921609026507907e-06,
2472
+ "loss": 0.3478,
2473
+ "step": 3520
2474
+ },
2475
+ {
2476
+ "epoch": 7.06,
2477
+ "grad_norm": 2.6732842922210693,
2478
+ "learning_rate": 4.861592580786205e-06,
2479
+ "loss": 0.3712,
2480
+ "step": 3530
2481
+ },
2482
+ {
2483
+ "epoch": 7.08,
2484
+ "grad_norm": 2.0151851177215576,
2485
+ "learning_rate": 4.801826573792905e-06,
2486
+ "loss": 0.3801,
2487
+ "step": 3540
2488
+ },
2489
+ {
2490
+ "epoch": 7.1,
2491
+ "grad_norm": 1.151302456855774,
2492
+ "learning_rate": 4.7423139184388725e-06,
2493
+ "loss": 0.3881,
2494
+ "step": 3550
2495
+ },
2496
+ {
2497
+ "epoch": 7.12,
2498
+ "grad_norm": 3.5267462730407715,
2499
+ "learning_rate": 4.6830575152869615e-06,
2500
+ "loss": 0.3489,
2501
+ "step": 3560
2502
+ },
2503
+ {
2504
+ "epoch": 7.14,
2505
+ "grad_norm": 1.8429639339447021,
2506
+ "learning_rate": 4.62406025241067e-06,
2507
+ "loss": 0.4284,
2508
+ "step": 3570
2509
+ },
2510
+ {
2511
+ "epoch": 7.16,
2512
+ "grad_norm": 4.2320051193237305,
2513
+ "learning_rate": 4.565325005253356e-06,
2514
+ "loss": 0.4055,
2515
+ "step": 3580
2516
+ },
2517
+ {
2518
+ "epoch": 7.18,
2519
+ "grad_norm": 2.368800163269043,
2520
+ "learning_rate": 4.506854636488103e-06,
2521
+ "loss": 0.3627,
2522
+ "step": 3590
2523
+ },
2524
+ {
2525
+ "epoch": 7.2,
2526
+ "grad_norm": 7.869661331176758,
2527
+ "learning_rate": 4.44865199587819e-06,
2528
+ "loss": 0.3866,
2529
+ "step": 3600
2530
+ },
2531
+ {
2532
+ "epoch": 7.22,
2533
+ "grad_norm": 1.2418557405471802,
2534
+ "learning_rate": 4.39071992013822e-06,
2535
+ "loss": 0.3947,
2536
+ "step": 3610
2537
+ },
2538
+ {
2539
+ "epoch": 7.24,
2540
+ "grad_norm": 1.7556277513504028,
2541
+ "learning_rate": 4.3330612327958265e-06,
2542
+ "loss": 0.4266,
2543
+ "step": 3620
2544
+ },
2545
+ {
2546
+ "epoch": 7.26,
2547
+ "grad_norm": 4.239712238311768,
2548
+ "learning_rate": 4.275678744054094e-06,
2549
+ "loss": 0.3495,
2550
+ "step": 3630
2551
+ },
2552
+ {
2553
+ "epoch": 7.28,
2554
+ "grad_norm": 2.917245626449585,
2555
+ "learning_rate": 4.218575250654559e-06,
2556
+ "loss": 0.4153,
2557
+ "step": 3640
2558
+ },
2559
+ {
2560
+ "epoch": 7.3,
2561
+ "grad_norm": 1.490869402885437,
2562
+ "learning_rate": 4.161753535740932e-06,
2563
+ "loss": 0.3819,
2564
+ "step": 3650
2565
+ },
2566
+ {
2567
+ "epoch": 7.32,
2568
+ "grad_norm": 1.5143734216690063,
2569
+ "learning_rate": 4.105216368723437e-06,
2570
+ "loss": 0.4032,
2571
+ "step": 3660
2572
+ },
2573
+ {
2574
+ "epoch": 7.34,
2575
+ "grad_norm": 3.434727907180786,
2576
+ "learning_rate": 4.048966505143831e-06,
2577
+ "loss": 0.358,
2578
+ "step": 3670
2579
+ },
2580
+ {
2581
+ "epoch": 7.36,
2582
+ "grad_norm": 1.666413426399231,
2583
+ "learning_rate": 3.993006686541108e-06,
2584
+ "loss": 0.4101,
2585
+ "step": 3680
2586
+ },
2587
+ {
2588
+ "epoch": 7.38,
2589
+ "grad_norm": 2.142817974090576,
2590
+ "learning_rate": 3.937339640317879e-06,
2591
+ "loss": 0.3803,
2592
+ "step": 3690
2593
+ },
2594
+ {
2595
+ "epoch": 7.4,
2596
+ "grad_norm": 0.9919471740722656,
2597
+ "learning_rate": 3.88196807960744e-06,
2598
+ "loss": 0.3844,
2599
+ "step": 3700
2600
+ },
2601
+ {
2602
+ "epoch": 7.42,
2603
+ "grad_norm": 2.370820999145508,
2604
+ "learning_rate": 3.826894703141552e-06,
2605
+ "loss": 0.3536,
2606
+ "step": 3710
2607
+ },
2608
+ {
2609
+ "epoch": 7.44,
2610
+ "grad_norm": 1.761391520500183,
2611
+ "learning_rate": 3.772122195118877e-06,
2612
+ "loss": 0.3957,
2613
+ "step": 3720
2614
+ },
2615
+ {
2616
+ "epoch": 7.46,
2617
+ "grad_norm": 1.3135240077972412,
2618
+ "learning_rate": 3.7176532250741857e-06,
2619
+ "loss": 0.4308,
2620
+ "step": 3730
2621
+ },
2622
+ {
2623
+ "epoch": 7.48,
2624
+ "grad_norm": 7.84911584854126,
2625
+ "learning_rate": 3.663490447748236e-06,
2626
+ "loss": 0.3988,
2627
+ "step": 3740
2628
+ },
2629
+ {
2630
+ "epoch": 7.5,
2631
+ "grad_norm": 2.369114875793457,
2632
+ "learning_rate": 3.6096365029583803e-06,
2633
+ "loss": 0.3983,
2634
+ "step": 3750
2635
+ },
2636
+ {
2637
+ "epoch": 7.52,
2638
+ "grad_norm": 3.6399729251861572,
2639
+ "learning_rate": 3.5560940154699133e-06,
2640
+ "loss": 0.37,
2641
+ "step": 3760
2642
+ },
2643
+ {
2644
+ "epoch": 7.54,
2645
+ "grad_norm": 3.580399751663208,
2646
+ "learning_rate": 3.502865594868136e-06,
2647
+ "loss": 0.3645,
2648
+ "step": 3770
2649
+ },
2650
+ {
2651
+ "epoch": 7.5600000000000005,
2652
+ "grad_norm": 3.4667141437530518,
2653
+ "learning_rate": 3.4499538354311757e-06,
2654
+ "loss": 0.4179,
2655
+ "step": 3780
2656
+ },
2657
+ {
2658
+ "epoch": 7.58,
2659
+ "grad_norm": 2.440298318862915,
2660
+ "learning_rate": 3.397361316003539e-06,
2661
+ "loss": 0.324,
2662
+ "step": 3790
2663
+ },
2664
+ {
2665
+ "epoch": 7.6,
2666
+ "grad_norm": 2.0638418197631836,
2667
+ "learning_rate": 3.3450905998704274e-06,
2668
+ "loss": 0.3789,
2669
+ "step": 3800
2670
+ },
2671
+ {
2672
+ "epoch": 7.62,
2673
+ "grad_norm": 2.4778010845184326,
2674
+ "learning_rate": 3.2931442346328e-06,
2675
+ "loss": 0.3608,
2676
+ "step": 3810
2677
+ },
2678
+ {
2679
+ "epoch": 7.64,
2680
+ "grad_norm": 2.4514052867889404,
2681
+ "learning_rate": 3.241524752083215e-06,
2682
+ "loss": 0.3985,
2683
+ "step": 3820
2684
+ },
2685
+ {
2686
+ "epoch": 7.66,
2687
+ "grad_norm": 1.3875998258590698,
2688
+ "learning_rate": 3.190234668082427e-06,
2689
+ "loss": 0.3447,
2690
+ "step": 3830
2691
+ },
2692
+ {
2693
+ "epoch": 7.68,
2694
+ "grad_norm": 1.2562263011932373,
2695
+ "learning_rate": 3.1392764824367706e-06,
2696
+ "loss": 0.3426,
2697
+ "step": 3840
2698
+ },
2699
+ {
2700
+ "epoch": 7.7,
2701
+ "grad_norm": 1.41866135597229,
2702
+ "learning_rate": 3.0886526787763237e-06,
2703
+ "loss": 0.3576,
2704
+ "step": 3850
2705
+ },
2706
+ {
2707
+ "epoch": 7.72,
2708
+ "grad_norm": 3.5626509189605713,
2709
+ "learning_rate": 3.038365724433858e-06,
2710
+ "loss": 0.3928,
2711
+ "step": 3860
2712
+ },
2713
+ {
2714
+ "epoch": 7.74,
2715
+ "grad_norm": 5.021074295043945,
2716
+ "learning_rate": 2.988418070324577e-06,
2717
+ "loss": 0.3589,
2718
+ "step": 3870
2719
+ },
2720
+ {
2721
+ "epoch": 7.76,
2722
+ "grad_norm": 1.392564058303833,
2723
+ "learning_rate": 2.938812150826684e-06,
2724
+ "loss": 0.3851,
2725
+ "step": 3880
2726
+ },
2727
+ {
2728
+ "epoch": 7.78,
2729
+ "grad_norm": 3.992396116256714,
2730
+ "learning_rate": 2.8895503836627105e-06,
2731
+ "loss": 0.3688,
2732
+ "step": 3890
2733
+ },
2734
+ {
2735
+ "epoch": 7.8,
2736
+ "grad_norm": 2.653534173965454,
2737
+ "learning_rate": 2.840635169781688e-06,
2738
+ "loss": 0.3585,
2739
+ "step": 3900
2740
+ },
2741
+ {
2742
+ "epoch": 7.82,
2743
+ "grad_norm": 6.615116596221924,
2744
+ "learning_rate": 2.7920688932421337e-06,
2745
+ "loss": 0.3653,
2746
+ "step": 3910
2747
+ },
2748
+ {
2749
+ "epoch": 7.84,
2750
+ "grad_norm": 5.369716644287109,
2751
+ "learning_rate": 2.7438539210958483e-06,
2752
+ "loss": 0.3512,
2753
+ "step": 3920
2754
+ },
2755
+ {
2756
+ "epoch": 7.86,
2757
+ "grad_norm": 1.679790735244751,
2758
+ "learning_rate": 2.6959926032725537e-06,
2759
+ "loss": 0.3717,
2760
+ "step": 3930
2761
+ },
2762
+ {
2763
+ "epoch": 7.88,
2764
+ "grad_norm": 2.233903646469116,
2765
+ "learning_rate": 2.648487272465361e-06,
2766
+ "loss": 0.3806,
2767
+ "step": 3940
2768
+ },
2769
+ {
2770
+ "epoch": 7.9,
2771
+ "grad_norm": 3.0028107166290283,
2772
+ "learning_rate": 2.6013402440170676e-06,
2773
+ "loss": 0.3993,
2774
+ "step": 3950
2775
+ },
2776
+ {
2777
+ "epoch": 7.92,
2778
+ "grad_norm": 3.600489854812622,
2779
+ "learning_rate": 2.5545538158073278e-06,
2780
+ "loss": 0.3387,
2781
+ "step": 3960
2782
+ },
2783
+ {
2784
+ "epoch": 7.9399999999999995,
2785
+ "grad_norm": 7.545295715332031,
2786
+ "learning_rate": 2.512756228659141e-06,
2787
+ "loss": 0.37,
2788
+ "step": 3970
2789
+ },
2790
+ {
2791
+ "epoch": 7.96,
2792
+ "grad_norm": 2.566960573196411,
2793
+ "learning_rate": 2.4666612085261344e-06,
2794
+ "loss": 0.3967,
2795
+ "step": 3980
2796
+ },
2797
+ {
2798
+ "epoch": 7.98,
2799
+ "grad_norm": 2.3997247219085693,
2800
+ "learning_rate": 2.420933352697865e-06,
2801
+ "loss": 0.4029,
2802
+ "step": 3990
2803
+ },
2804
+ {
2805
+ "epoch": 8.0,
2806
+ "grad_norm": 2.916670560836792,
2807
+ "learning_rate": 2.37557488988552e-06,
2808
+ "loss": 0.3713,
2809
+ "step": 4000
2810
+ },
2811
+ {
2812
+ "epoch": 8.02,
2813
+ "grad_norm": 1.7952624559402466,
2814
+ "learning_rate": 2.3305880307965834e-06,
2815
+ "loss": 0.3232,
2816
+ "step": 4010
2817
+ },
2818
+ {
2819
+ "epoch": 8.04,
2820
+ "grad_norm": 1.91434645652771,
2821
+ "learning_rate": 2.2859749680270983e-06,
2822
+ "loss": 0.331,
2823
+ "step": 4020
2824
+ },
2825
+ {
2826
+ "epoch": 8.06,
2827
+ "grad_norm": 3.671706438064575,
2828
+ "learning_rate": 2.241737875954808e-06,
2829
+ "loss": 0.3818,
2830
+ "step": 4030
2831
+ },
2832
+ {
2833
+ "epoch": 8.08,
2834
+ "grad_norm": 1.5308889150619507,
2835
+ "learning_rate": 2.1978789106331666e-06,
2836
+ "loss": 0.3482,
2837
+ "step": 4040
2838
+ },
2839
+ {
2840
+ "epoch": 8.1,
2841
+ "grad_norm": 1.8674166202545166,
2842
+ "learning_rate": 2.154400209686268e-06,
2843
+ "loss": 0.3195,
2844
+ "step": 4050
2845
+ },
2846
+ {
2847
+ "epoch": 8.12,
2848
+ "grad_norm": 1.5842407941818237,
2849
+ "learning_rate": 2.1113038922046603e-06,
2850
+ "loss": 0.3557,
2851
+ "step": 4060
2852
+ },
2853
+ {
2854
+ "epoch": 8.14,
2855
+ "grad_norm": 2.2769813537597656,
2856
+ "learning_rate": 2.0685920586420562e-06,
2857
+ "loss": 0.2853,
2858
+ "step": 4070
2859
+ },
2860
+ {
2861
+ "epoch": 8.16,
2862
+ "grad_norm": 1.7789726257324219,
2863
+ "learning_rate": 2.026266790712965e-06,
2864
+ "loss": 0.316,
2865
+ "step": 4080
2866
+ },
2867
+ {
2868
+ "epoch": 8.18,
2869
+ "grad_norm": 2.544579029083252,
2870
+ "learning_rate": 1.984330151291233e-06,
2871
+ "loss": 0.3328,
2872
+ "step": 4090
2873
+ },
2874
+ {
2875
+ "epoch": 8.2,
2876
+ "grad_norm": 5.644877910614014,
2877
+ "learning_rate": 1.9427841843095063e-06,
2878
+ "loss": 0.3338,
2879
+ "step": 4100
2880
+ },
2881
+ {
2882
+ "epoch": 8.22,
2883
+ "grad_norm": 2.9125707149505615,
2884
+ "learning_rate": 1.9016309146596024e-06,
2885
+ "loss": 0.3226,
2886
+ "step": 4110
2887
+ },
2888
+ {
2889
+ "epoch": 8.24,
2890
+ "grad_norm": 2.9386703968048096,
2891
+ "learning_rate": 1.8608723480938207e-06,
2892
+ "loss": 0.3147,
2893
+ "step": 4120
2894
+ },
2895
+ {
2896
+ "epoch": 8.26,
2897
+ "grad_norm": 5.057535648345947,
2898
+ "learning_rate": 1.820510471127196e-06,
2899
+ "loss": 0.3549,
2900
+ "step": 4130
2901
+ },
2902
+ {
2903
+ "epoch": 8.28,
2904
+ "grad_norm": 1.1568169593811035,
2905
+ "learning_rate": 1.7805472509406695e-06,
2906
+ "loss": 0.3701,
2907
+ "step": 4140
2908
+ },
2909
+ {
2910
+ "epoch": 8.3,
2911
+ "grad_norm": 2.978498697280884,
2912
+ "learning_rate": 1.7409846352852144e-06,
2913
+ "loss": 0.341,
2914
+ "step": 4150
2915
+ },
2916
+ {
2917
+ "epoch": 8.32,
2918
+ "grad_norm": 1.8623732328414917,
2919
+ "learning_rate": 1.7018245523869038e-06,
2920
+ "loss": 0.2754,
2921
+ "step": 4160
2922
+ },
2923
+ {
2924
+ "epoch": 8.34,
2925
+ "grad_norm": 3.46683406829834,
2926
+ "learning_rate": 1.6630689108529286e-06,
2927
+ "loss": 0.3958,
2928
+ "step": 4170
2929
+ },
2930
+ {
2931
+ "epoch": 8.36,
2932
+ "grad_norm": 5.015219211578369,
2933
+ "learning_rate": 1.6247195995785836e-06,
2934
+ "loss": 0.3512,
2935
+ "step": 4180
2936
+ },
2937
+ {
2938
+ "epoch": 8.38,
2939
+ "grad_norm": 1.5242810249328613,
2940
+ "learning_rate": 1.5867784876551973e-06,
2941
+ "loss": 0.3533,
2942
+ "step": 4190
2943
+ },
2944
+ {
2945
+ "epoch": 8.4,
2946
+ "grad_norm": 4.693676948547363,
2947
+ "learning_rate": 1.5492474242790368e-06,
2948
+ "loss": 0.3746,
2949
+ "step": 4200
2950
+ },
2951
+ {
2952
+ "epoch": 8.42,
2953
+ "grad_norm": 2.436262845993042,
2954
+ "learning_rate": 1.5121282386611823e-06,
2955
+ "loss": 0.3274,
2956
+ "step": 4210
2957
+ },
2958
+ {
2959
+ "epoch": 8.44,
2960
+ "grad_norm": 2.2660608291625977,
2961
+ "learning_rate": 1.4754227399383758e-06,
2962
+ "loss": 0.3055,
2963
+ "step": 4220
2964
+ },
2965
+ {
2966
+ "epoch": 8.46,
2967
+ "grad_norm": 2.7948834896087646,
2968
+ "learning_rate": 1.439132717084839e-06,
2969
+ "loss": 0.3078,
2970
+ "step": 4230
2971
+ },
2972
+ {
2973
+ "epoch": 8.48,
2974
+ "grad_norm": 1.3765865564346313,
2975
+ "learning_rate": 1.40325993882509e-06,
2976
+ "loss": 0.3194,
2977
+ "step": 4240
2978
+ },
2979
+ {
2980
+ "epoch": 8.5,
2981
+ "grad_norm": 1.2223212718963623,
2982
+ "learning_rate": 1.3678061535477305e-06,
2983
+ "loss": 0.352,
2984
+ "step": 4250
2985
+ },
2986
+ {
2987
+ "epoch": 8.52,
2988
+ "grad_norm": 2.556001663208008,
2989
+ "learning_rate": 1.3327730892202384e-06,
2990
+ "loss": 0.3061,
2991
+ "step": 4260
2992
+ },
2993
+ {
2994
+ "epoch": 8.54,
2995
+ "grad_norm": 4.0893168449401855,
2996
+ "learning_rate": 1.2981624533047432e-06,
2997
+ "loss": 0.406,
2998
+ "step": 4270
2999
+ },
3000
+ {
3001
+ "epoch": 8.56,
3002
+ "grad_norm": 1.8102929592132568,
3003
+ "learning_rate": 1.2639759326748136e-06,
3004
+ "loss": 0.3335,
3005
+ "step": 4280
3006
+ },
3007
+ {
3008
+ "epoch": 8.58,
3009
+ "grad_norm": 0.6934239268302917,
3010
+ "learning_rate": 1.230215193533233e-06,
3011
+ "loss": 0.4048,
3012
+ "step": 4290
3013
+ },
3014
+ {
3015
+ "epoch": 8.6,
3016
+ "grad_norm": 2.5495901107788086,
3017
+ "learning_rate": 1.196881881330798e-06,
3018
+ "loss": 0.3388,
3019
+ "step": 4300
3020
+ },
3021
+ {
3022
+ "epoch": 8.62,
3023
+ "grad_norm": 2.681366443634033,
3024
+ "learning_rate": 1.1639776206861197e-06,
3025
+ "loss": 0.358,
3026
+ "step": 4310
3027
+ },
3028
+ {
3029
+ "epoch": 8.64,
3030
+ "grad_norm": 1.624990463256836,
3031
+ "learning_rate": 1.1315040153064416e-06,
3032
+ "loss": 0.3628,
3033
+ "step": 4320
3034
+ },
3035
+ {
3036
+ "epoch": 8.66,
3037
+ "grad_norm": 7.331467151641846,
3038
+ "learning_rate": 1.0994626479094749e-06,
3039
+ "loss": 0.3585,
3040
+ "step": 4330
3041
+ },
3042
+ {
3043
+ "epoch": 8.68,
3044
+ "grad_norm": 1.2213658094406128,
3045
+ "learning_rate": 1.0678550801462662e-06,
3046
+ "loss": 0.3583,
3047
+ "step": 4340
3048
+ },
3049
+ {
3050
+ "epoch": 8.7,
3051
+ "grad_norm": 2.539713144302368,
3052
+ "learning_rate": 1.0366828525250728e-06,
3053
+ "loss": 0.2861,
3054
+ "step": 4350
3055
+ },
3056
+ {
3057
+ "epoch": 8.72,
3058
+ "grad_norm": 4.281270980834961,
3059
+ "learning_rate": 1.0059474843362893e-06,
3060
+ "loss": 0.3422,
3061
+ "step": 4360
3062
+ },
3063
+ {
3064
+ "epoch": 8.74,
3065
+ "grad_norm": 1.515568733215332,
3066
+ "learning_rate": 9.756504735784067e-07,
3067
+ "loss": 0.3337,
3068
+ "step": 4370
3069
+ },
3070
+ {
3071
+ "epoch": 8.76,
3072
+ "grad_norm": 1.730093240737915,
3073
+ "learning_rate": 9.457932968849826e-07,
3074
+ "loss": 0.3163,
3075
+ "step": 4380
3076
+ },
3077
+ {
3078
+ "epoch": 8.78,
3079
+ "grad_norm": 4.305525302886963,
3080
+ "learning_rate": 9.16377409452689e-07,
3081
+ "loss": 0.3132,
3082
+ "step": 4390
3083
+ },
3084
+ {
3085
+ "epoch": 8.8,
3086
+ "grad_norm": 1.6857116222381592,
3087
+ "learning_rate": 8.874042449703779e-07,
3088
+ "loss": 0.3108,
3089
+ "step": 4400
3090
+ },
3091
+ {
3092
+ "epoch": 8.82,
3093
+ "grad_norm": 1.7638370990753174,
3094
+ "learning_rate": 8.58875215549212e-07,
3095
+ "loss": 0.3444,
3096
+ "step": 4410
3097
+ },
3098
+ {
3099
+ "epoch": 8.84,
3100
+ "grad_norm": 2.9410483837127686,
3101
+ "learning_rate": 8.307917116538378e-07,
3102
+ "loss": 0.3582,
3103
+ "step": 4420
3104
+ },
3105
+ {
3106
+ "epoch": 8.86,
3107
+ "grad_norm": 1.4133245944976807,
3108
+ "learning_rate": 8.031551020346129e-07,
3109
+ "loss": 0.3014,
3110
+ "step": 4430
3111
+ },
3112
+ {
3113
+ "epoch": 8.88,
3114
+ "grad_norm": 2.466925621032715,
3115
+ "learning_rate": 7.759667336609011e-07,
3116
+ "loss": 0.3578,
3117
+ "step": 4440
3118
+ },
3119
+ {
3120
+ "epoch": 8.9,
3121
+ "grad_norm": 1.979108214378357,
3122
+ "learning_rate": 7.492279316554207e-07,
3123
+ "loss": 0.3253,
3124
+ "step": 4450
3125
+ },
3126
+ {
3127
+ "epoch": 8.92,
3128
+ "grad_norm": 1.6241674423217773,
3129
+ "learning_rate": 7.22939999229657e-07,
3130
+ "loss": 0.3839,
3131
+ "step": 4460
3132
+ },
3133
+ {
3134
+ "epoch": 8.94,
3135
+ "grad_norm": 2.58152174949646,
3136
+ "learning_rate": 6.971042176203535e-07,
3137
+ "loss": 0.268,
3138
+ "step": 4470
3139
+ },
3140
+ {
3141
+ "epoch": 8.96,
3142
+ "grad_norm": 2.5680618286132812,
3143
+ "learning_rate": 6.717218460270536e-07,
3144
+ "loss": 0.332,
3145
+ "step": 4480
3146
+ },
3147
+ {
3148
+ "epoch": 8.98,
3149
+ "grad_norm": 2.1524252891540527,
3150
+ "learning_rate": 6.467941215507434e-07,
3151
+ "loss": 0.361,
3152
+ "step": 4490
3153
+ },
3154
+ {
3155
+ "epoch": 9.0,
3156
+ "grad_norm": 1.6696678400039673,
3157
+ "learning_rate": 6.223222591335409e-07,
3158
+ "loss": 0.3358,
3159
+ "step": 4500
3160
+ },
3161
+ {
3162
+ "epoch": 9.02,
3163
+ "grad_norm": 1.1763229370117188,
3164
+ "learning_rate": 5.98307451499498e-07,
3165
+ "loss": 0.2874,
3166
+ "step": 4510
3167
+ },
3168
+ {
3169
+ "epoch": 9.04,
3170
+ "grad_norm": 1.381579041481018,
3171
+ "learning_rate": 5.747508690964599e-07,
3172
+ "loss": 0.361,
3173
+ "step": 4520
3174
+ },
3175
+ {
3176
+ "epoch": 9.06,
3177
+ "grad_norm": 1.560950756072998,
3178
+ "learning_rate": 5.516536600390188e-07,
3179
+ "loss": 0.2929,
3180
+ "step": 4530
3181
+ },
3182
+ {
3183
+ "epoch": 9.08,
3184
+ "grad_norm": 2.703350782394409,
3185
+ "learning_rate": 5.290169500525577e-07,
3186
+ "loss": 0.2854,
3187
+ "step": 4540
3188
+ },
3189
+ {
3190
+ "epoch": 9.1,
3191
+ "grad_norm": 1.5632970333099365,
3192
+ "learning_rate": 5.068418424183874e-07,
3193
+ "loss": 0.3173,
3194
+ "step": 4550
3195
+ },
3196
+ {
3197
+ "epoch": 9.12,
3198
+ "grad_norm": 1.8101422786712646,
3199
+ "learning_rate": 4.851294179199673e-07,
3200
+ "loss": 0.3683,
3201
+ "step": 4560
3202
+ },
3203
+ {
3204
+ "epoch": 9.14,
3205
+ "grad_norm": 1.0653437376022339,
3206
+ "learning_rate": 4.638807347902408e-07,
3207
+ "loss": 0.3256,
3208
+ "step": 4570
3209
+ },
3210
+ {
3211
+ "epoch": 9.16,
3212
+ "grad_norm": 2.522818088531494,
3213
+ "learning_rate": 4.4309682866004124e-07,
3214
+ "loss": 0.319,
3215
+ "step": 4580
3216
+ },
3217
+ {
3218
+ "epoch": 9.18,
3219
+ "grad_norm": 3.289451837539673,
3220
+ "learning_rate": 4.2277871250763327e-07,
3221
+ "loss": 0.3221,
3222
+ "step": 4590
3223
+ },
3224
+ {
3225
+ "epoch": 9.2,
3226
+ "grad_norm": 2.0382297039031982,
3227
+ "learning_rate": 4.0292737660933335e-07,
3228
+ "loss": 0.2951,
3229
+ "step": 4600
3230
+ },
3231
+ {
3232
+ "epoch": 9.22,
3233
+ "grad_norm": 2.1435165405273438,
3234
+ "learning_rate": 3.835437884912474e-07,
3235
+ "loss": 0.3738,
3236
+ "step": 4610
3237
+ },
3238
+ {
3239
+ "epoch": 9.24,
3240
+ "grad_norm": 1.6461173295974731,
3241
+ "learning_rate": 3.646288928821151e-07,
3242
+ "loss": 0.2898,
3243
+ "step": 4620
3244
+ },
3245
+ {
3246
+ "epoch": 9.26,
3247
+ "grad_norm": 2.4130804538726807,
3248
+ "learning_rate": 3.4618361166726123e-07,
3249
+ "loss": 0.3792,
3250
+ "step": 4630
3251
+ },
3252
+ {
3253
+ "epoch": 9.28,
3254
+ "grad_norm": 2.4446017742156982,
3255
+ "learning_rate": 3.282088438436715e-07,
3256
+ "loss": 0.3424,
3257
+ "step": 4640
3258
+ },
3259
+ {
3260
+ "epoch": 9.3,
3261
+ "grad_norm": 1.3320859670639038,
3262
+ "learning_rate": 3.10705465476171e-07,
3263
+ "loss": 0.358,
3264
+ "step": 4650
3265
+ },
3266
+ {
3267
+ "epoch": 9.32,
3268
+ "grad_norm": 3.8511667251586914,
3269
+ "learning_rate": 2.936743296547273e-07,
3270
+ "loss": 0.32,
3271
+ "step": 4660
3272
+ },
3273
+ {
3274
+ "epoch": 9.34,
3275
+ "grad_norm": 5.630286693572998,
3276
+ "learning_rate": 2.771162664528726e-07,
3277
+ "loss": 0.3079,
3278
+ "step": 4670
3279
+ },
3280
+ {
3281
+ "epoch": 9.36,
3282
+ "grad_norm": 1.600219964981079,
3283
+ "learning_rate": 2.6103208288724815e-07,
3284
+ "loss": 0.2834,
3285
+ "step": 4680
3286
+ },
3287
+ {
3288
+ "epoch": 9.38,
3289
+ "grad_norm": 1.0820380449295044,
3290
+ "learning_rate": 2.4542256287826915e-07,
3291
+ "loss": 0.354,
3292
+ "step": 4690
3293
+ },
3294
+ {
3295
+ "epoch": 9.4,
3296
+ "grad_norm": 1.4870035648345947,
3297
+ "learning_rate": 2.3028846721191878e-07,
3298
+ "loss": 0.3243,
3299
+ "step": 4700
3300
+ },
3301
+ {
3302
+ "epoch": 9.42,
3303
+ "grad_norm": 3.148569107055664,
3304
+ "learning_rate": 2.1563053350266983e-07,
3305
+ "loss": 0.3121,
3306
+ "step": 4710
3307
+ },
3308
+ {
3309
+ "epoch": 9.44,
3310
+ "grad_norm": 1.8829195499420166,
3311
+ "learning_rate": 2.014494761575314e-07,
3312
+ "loss": 0.3142,
3313
+ "step": 4720
3314
+ },
3315
+ {
3316
+ "epoch": 9.46,
3317
+ "grad_norm": 3.1038215160369873,
3318
+ "learning_rate": 1.877459863412323e-07,
3319
+ "loss": 0.3287,
3320
+ "step": 4730
3321
+ },
3322
+ {
3323
+ "epoch": 9.48,
3324
+ "grad_norm": 1.9286001920700073,
3325
+ "learning_rate": 1.7452073194253237e-07,
3326
+ "loss": 0.2989,
3327
+ "step": 4740
3328
+ },
3329
+ {
3330
+ "epoch": 9.5,
3331
+ "grad_norm": 2.0495471954345703,
3332
+ "learning_rate": 1.6177435754167413e-07,
3333
+ "loss": 0.3632,
3334
+ "step": 4750
3335
+ },
3336
+ {
3337
+ "epoch": 9.52,
3338
+ "grad_norm": 2.1833696365356445,
3339
+ "learning_rate": 1.4950748437896235e-07,
3340
+ "loss": 0.265,
3341
+ "step": 4760
3342
+ },
3343
+ {
3344
+ "epoch": 9.54,
3345
+ "grad_norm": 3.15493106842041,
3346
+ "learning_rate": 1.377207103244904e-07,
3347
+ "loss": 0.283,
3348
+ "step": 4770
3349
+ },
3350
+ {
3351
+ "epoch": 9.56,
3352
+ "grad_norm": 1.2273836135864258,
3353
+ "learning_rate": 1.26414609848996e-07,
3354
+ "loss": 0.2264,
3355
+ "step": 4780
3356
+ },
3357
+ {
3358
+ "epoch": 9.58,
3359
+ "grad_norm": 1.6316149234771729,
3360
+ "learning_rate": 1.1558973399586671e-07,
3361
+ "loss": 0.3198,
3362
+ "step": 4790
3363
+ },
3364
+ {
3365
+ "epoch": 9.6,
3366
+ "grad_norm": 2.2700629234313965,
3367
+ "learning_rate": 1.052466103542793e-07,
3368
+ "loss": 0.2258,
3369
+ "step": 4800
3370
+ },
3371
+ {
3372
+ "epoch": 9.62,
3373
+ "grad_norm": 1.132501244544983,
3374
+ "learning_rate": 9.538574303348813e-08,
3375
+ "loss": 0.3053,
3376
+ "step": 4810
3377
+ },
3378
+ {
3379
+ "epoch": 9.64,
3380
+ "grad_norm": 1.9846259355545044,
3381
+ "learning_rate": 8.600761263825475e-08,
3382
+ "loss": 0.278,
3383
+ "step": 4820
3384
+ },
3385
+ {
3386
+ "epoch": 9.66,
3387
+ "grad_norm": 1.3777711391448975,
3388
+ "learning_rate": 7.71126762454233e-08,
3389
+ "loss": 0.3211,
3390
+ "step": 4830
3391
+ },
3392
+ {
3393
+ "epoch": 9.68,
3394
+ "grad_norm": 1.355865240097046,
3395
+ "learning_rate": 6.870136738164612e-08,
3396
+ "loss": 0.3079,
3397
+ "step": 4840
3398
+ },
3399
+ {
3400
+ "epoch": 9.7,
3401
+ "grad_norm": 2.8824238777160645,
3402
+ "learning_rate": 6.07740960022507e-08,
3403
+ "loss": 0.3717,
3404
+ "step": 4850
3405
+ },
3406
+ {
3407
+ "epoch": 9.72,
3408
+ "grad_norm": 2.4788002967834473,
3409
+ "learning_rate": 5.3331248471258926e-08,
3410
+ "loss": 0.3052,
3411
+ "step": 4860
3412
+ },
3413
+ {
3414
+ "epoch": 9.74,
3415
+ "grad_norm": 1.3350954055786133,
3416
+ "learning_rate": 4.6373187542561036e-08,
3417
+ "loss": 0.3018,
3418
+ "step": 4870
3419
+ },
3420
+ {
3421
+ "epoch": 9.76,
3422
+ "grad_norm": 1.485877513885498,
3423
+ "learning_rate": 3.990025234222872e-08,
3424
+ "loss": 0.2694,
3425
+ "step": 4880
3426
+ },
3427
+ {
3428
+ "epoch": 9.78,
3429
+ "grad_norm": 2.1320180892944336,
3430
+ "learning_rate": 3.391275835199159e-08,
3431
+ "loss": 0.323,
3432
+ "step": 4890
3433
+ },
3434
+ {
3435
+ "epoch": 9.8,
3436
+ "grad_norm": 1.2034118175506592,
3437
+ "learning_rate": 2.8410997393860663e-08,
3438
+ "loss": 0.302,
3439
+ "step": 4900
3440
+ },
3441
+ {
3442
+ "epoch": 9.82,
3443
+ "grad_norm": 1.9911288022994995,
3444
+ "learning_rate": 2.339523761590301e-08,
3445
+ "loss": 0.3561,
3446
+ "step": 4910
3447
+ },
3448
+ {
3449
+ "epoch": 9.84,
3450
+ "grad_norm": 4.588063716888428,
3451
+ "learning_rate": 1.886572347917337e-08,
3452
+ "loss": 0.3486,
3453
+ "step": 4920
3454
+ },
3455
+ {
3456
+ "epoch": 9.86,
3457
+ "grad_norm": 1.2661594152450562,
3458
+ "learning_rate": 1.482267574580143e-08,
3459
+ "loss": 0.3651,
3460
+ "step": 4930
3461
+ },
3462
+ {
3463
+ "epoch": 9.88,
3464
+ "grad_norm": 1.7865337133407593,
3465
+ "learning_rate": 1.126629146822933e-08,
3466
+ "loss": 0.2544,
3467
+ "step": 4940
3468
+ },
3469
+ {
3470
+ "epoch": 9.9,
3471
+ "grad_norm": 3.7908692359924316,
3472
+ "learning_rate": 8.196743979610455e-09,
3473
+ "loss": 0.2575,
3474
+ "step": 4950
3475
+ },
3476
+ {
3477
+ "epoch": 9.92,
3478
+ "grad_norm": 2.3560678958892822,
3479
+ "learning_rate": 5.614182885357311e-09,
3480
+ "loss": 0.2833,
3481
+ "step": 4960
3482
+ },
3483
+ {
3484
+ "epoch": 9.94,
3485
+ "grad_norm": 1.629074215888977,
3486
+ "learning_rate": 3.518734055855122e-09,
3487
+ "loss": 0.3345,
3488
+ "step": 4970
3489
+ },
3490
+ {
3491
+ "epoch": 9.96,
3492
+ "grad_norm": 2.738832473754883,
3493
+ "learning_rate": 1.910499620322304e-09,
3494
+ "loss": 0.3034,
3495
+ "step": 4980
3496
+ },
3497
+ {
3498
+ "epoch": 9.98,
3499
+ "grad_norm": 1.9189926385879517,
3500
+ "learning_rate": 7.895579618388827e-10,
3501
+ "loss": 0.2543,
3502
+ "step": 4990
3503
+ },
3504
+ {
3505
+ "epoch": 10.0,
3506
+ "grad_norm": 1.597418189048767,
3507
+ "learning_rate": 1.559637135173375e-10,
3508
+ "loss": 0.3149,
3509
+ "step": 5000
3510
+ },
3511
+ {
3512
+ "epoch": 10.0,
3513
+ "step": 5000,
3514
+ "total_flos": 1.1899956660640154e+17,
3515
+ "train_loss": 0.49240388979911803,
3516
+ "train_runtime": 35927.6647,
3517
+ "train_samples_per_second": 0.278,
3518
+ "train_steps_per_second": 0.139
3519
+ }
3520
+ ],
3521
+ "logging_steps": 10,
3522
+ "max_steps": 5000,
3523
+ "num_input_tokens_seen": 0,
3524
+ "num_train_epochs": 10,
3525
+ "save_steps": 100,
3526
+ "stateful_callbacks": {
3527
+ "TrainerControl": {
3528
+ "args": {
3529
+ "should_epoch_stop": false,
3530
+ "should_evaluate": false,
3531
+ "should_log": false,
3532
+ "should_save": true,
3533
+ "should_training_stop": true
3534
+ },
3535
+ "attributes": {}
3536
+ }
3537
+ },
3538
+ "total_flos": 1.1899956660640154e+17,
3539
+ "train_batch_size": 1,
3540
+ "trial_name": null,
3541
+ "trial_params": null
3542
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:692cb1fd6a6230812580978c9375ef6ab236f04422b096db6a0f3b485bb57b52
3
+ size 5688
vocab.json ADDED
The diff for this file is too large to render. See raw diff