patrickramos commited on
Commit
21bee68
·
1 Parent(s): 2707673

distillclip-different-moon-37

Browse files
last-checkpoint/config.json DELETED
@@ -1,169 +0,0 @@
1
- {
2
- "_commit_hash": null,
3
- "architectures": [
4
- "CLIPForOfflineDistillation"
5
- ],
6
- "initializer_factor": 1.0,
7
- "logit_scale_init_value": 2.6592,
8
- "loss_type": "mse",
9
- "model_type": "clip",
10
- "projection_dim": 256,
11
- "teacher_name_or_path": "openai/clip-vit-base-patch32",
12
- "text_config": {
13
- "_name_or_path": "",
14
- "add_cross_attention": false,
15
- "architectures": null,
16
- "attention_dropout": 0.0,
17
- "bad_words_ids": null,
18
- "begin_suppress_tokens": null,
19
- "bos_token_id": 0,
20
- "chunk_size_feed_forward": 0,
21
- "cross_attention_hidden_size": null,
22
- "decoder_start_token_id": null,
23
- "diversity_penalty": 0.0,
24
- "do_sample": false,
25
- "early_stopping": false,
26
- "encoder_no_repeat_ngram_size": 0,
27
- "eos_token_id": 2,
28
- "exponential_decay_length_penalty": null,
29
- "finetuning_task": null,
30
- "forced_bos_token_id": null,
31
- "forced_eos_token_id": null,
32
- "hidden_act": "quick_gelu",
33
- "hidden_size": 512,
34
- "id2label": {
35
- "0": "LABEL_0",
36
- "1": "LABEL_1"
37
- },
38
- "initializer_factor": 1.0,
39
- "initializer_range": 0.02,
40
- "intermediate_size": 2048,
41
- "is_decoder": false,
42
- "is_encoder_decoder": false,
43
- "label2id": {
44
- "LABEL_0": 0,
45
- "LABEL_1": 1
46
- },
47
- "layer_norm_eps": 1e-05,
48
- "length_penalty": 1.0,
49
- "max_length": 20,
50
- "max_position_embeddings": 77,
51
- "min_length": 0,
52
- "model_type": "clip_text_model",
53
- "no_repeat_ngram_size": 0,
54
- "num_attention_heads": 8,
55
- "num_beam_groups": 1,
56
- "num_beams": 1,
57
- "num_hidden_layers": 6,
58
- "num_return_sequences": 1,
59
- "output_attentions": false,
60
- "output_hidden_states": false,
61
- "output_scores": false,
62
- "pad_token_id": 1,
63
- "prefix": null,
64
- "problem_type": null,
65
- "projection_dim": 512,
66
- "pruned_heads": {},
67
- "remove_invalid_values": false,
68
- "repetition_penalty": 1.0,
69
- "return_dict": true,
70
- "return_dict_in_generate": false,
71
- "sep_token_id": null,
72
- "suppress_tokens": null,
73
- "task_specific_params": null,
74
- "temperature": 1.0,
75
- "tf_legacy_loss": false,
76
- "tie_encoder_decoder": false,
77
- "tie_word_embeddings": true,
78
- "tokenizer_class": null,
79
- "top_k": 50,
80
- "top_p": 1.0,
81
- "torch_dtype": null,
82
- "torchscript": false,
83
- "transformers_version": "4.29.2",
84
- "typical_p": 1.0,
85
- "use_bfloat16": false,
86
- "vocab_size": 49408
87
- },
88
- "torch_dtype": "float32",
89
- "transformers_version": null,
90
- "vision_config": {
91
- "_name_or_path": "",
92
- "add_cross_attention": false,
93
- "architectures": null,
94
- "attention_dropout": 0.0,
95
- "bad_words_ids": null,
96
- "begin_suppress_tokens": null,
97
- "bos_token_id": null,
98
- "chunk_size_feed_forward": 0,
99
- "cross_attention_hidden_size": null,
100
- "decoder_start_token_id": null,
101
- "diversity_penalty": 0.0,
102
- "do_sample": false,
103
- "early_stopping": false,
104
- "encoder_no_repeat_ngram_size": 0,
105
- "eos_token_id": null,
106
- "exponential_decay_length_penalty": null,
107
- "finetuning_task": null,
108
- "forced_bos_token_id": null,
109
- "forced_eos_token_id": null,
110
- "hidden_act": "quick_gelu",
111
- "hidden_size": 384,
112
- "id2label": {
113
- "0": "LABEL_0",
114
- "1": "LABEL_1"
115
- },
116
- "image_size": 224,
117
- "initializer_factor": 1.0,
118
- "initializer_range": 0.02,
119
- "intermediate_size": 1536,
120
- "is_decoder": false,
121
- "is_encoder_decoder": false,
122
- "label2id": {
123
- "LABEL_0": 0,
124
- "LABEL_1": 1
125
- },
126
- "layer_norm_eps": 1e-05,
127
- "length_penalty": 1.0,
128
- "max_length": 20,
129
- "min_length": 0,
130
- "model_type": "clip_vision_model",
131
- "no_repeat_ngram_size": 0,
132
- "num_attention_heads": 6,
133
- "num_beam_groups": 1,
134
- "num_beams": 1,
135
- "num_channels": 3,
136
- "num_hidden_layers": 12,
137
- "num_return_sequences": 1,
138
- "output_attentions": false,
139
- "output_hidden_states": false,
140
- "output_scores": false,
141
- "pad_token_id": null,
142
- "patch_size": 16,
143
- "prefix": null,
144
- "problem_type": null,
145
- "projection_dim": 512,
146
- "pruned_heads": {},
147
- "remove_invalid_values": false,
148
- "repetition_penalty": 1.0,
149
- "return_dict": true,
150
- "return_dict_in_generate": false,
151
- "sep_token_id": null,
152
- "suppress_tokens": null,
153
- "task_specific_params": null,
154
- "temperature": 1.0,
155
- "tf_legacy_loss": false,
156
- "tie_encoder_decoder": false,
157
- "tie_word_embeddings": true,
158
- "tokenizer_class": null,
159
- "top_k": 50,
160
- "top_p": 1.0,
161
- "torch_dtype": null,
162
- "torchscript": false,
163
- "transformers_version": "4.29.2",
164
- "typical_p": 1.0,
165
- "use_bfloat16": false
166
- },
167
- "weight_r": 1.0,
168
- "weight_s": 1.0
169
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:359f0bb9c7fdaf6b9a47c37a482edc40df921b373339ce1a6c7b9f57147c1ae3
3
- size 264627500
 
 
 
 
last-checkpoint/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1d9a371842fd61230488662e5eb855b7be4e71789c97f32ee4d639b6966bcc40
3
- size 529349637
 
 
 
 
last-checkpoint/preprocessor_config.json DELETED
@@ -1,29 +0,0 @@
1
- {
2
- "crop_size": {
3
- "height": 224,
4
- "width": 224
5
- },
6
- "do_center_crop": true,
7
- "do_convert_rgb": true,
8
- "do_normalize": true,
9
- "do_rescale": true,
10
- "do_resize": true,
11
- "feature_extractor_type": "CLIPFeatureExtractor",
12
- "image_mean": [
13
- 0.48145466,
14
- 0.4578275,
15
- 0.40821073
16
- ],
17
- "image_processor_type": "CLIPImageProcessor",
18
- "image_std": [
19
- 0.26862954,
20
- 0.26130258,
21
- 0.27577711
22
- ],
23
- "processor_class": "CLIPProcessor",
24
- "resample": 3,
25
- "rescale_factor": 0.00392156862745098,
26
- "size": {
27
- "shortest_edge": 224
28
- }
29
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:003dae60e1aad3bdb3943ababdac5d57f551939a6e1d0a3e9dee1a6ba7e680e3
3
- size 14575
 
 
 
 
last-checkpoint/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:456a4f0a789f4cbcc35bee1ee478eb09c60afb1bbecc6578d3c7ab655a844d1a
3
- size 557
 
 
 
 
last-checkpoint/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cad79c842bd93ea8f1ac897defc5b2d9d56cb0cbc0054b1a2ca367dd45b0c77a
3
- size 627
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<|startoftext|>",
4
- "lstrip": false,
5
- "normalized": true,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": true,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<|endoftext|>",
17
- "unk_token": {
18
- "content": "<|endoftext|>",
19
- "lstrip": false,
20
- "normalized": true,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,34 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "bos_token": {
4
- "__type": "AddedToken",
5
- "content": "<|startoftext|>",
6
- "lstrip": false,
7
- "normalized": true,
8
- "rstrip": false,
9
- "single_word": false
10
- },
11
- "clean_up_tokenization_spaces": true,
12
- "do_lower_case": true,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "<|endoftext|>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "errors": "replace",
22
- "model_max_length": 77,
23
- "pad_token": "<|endoftext|>",
24
- "processor_class": "CLIPProcessor",
25
- "tokenizer_class": "CLIPTokenizer",
26
- "unk_token": {
27
- "__type": "AddedToken",
28
- "content": "<|endoftext|>",
29
- "lstrip": false,
30
- "normalized": true,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/trainer_state.json DELETED
@@ -1,3366 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 0.999612090830424,
5
- "global_step": 33500,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.0,
12
- "learning_rate": 3.0000000000000004e-07,
13
- "loss": 0.1505,
14
- "r_loss": 0.04637562483549118,
15
- "s_loss": 0.008538205176591873,
16
- "step": 100
17
- },
18
- {
19
- "epoch": 0.01,
20
- "learning_rate": 6.000000000000001e-07,
21
- "loss": 0.0461,
22
- "r_loss": 0.03377725929021835,
23
- "s_loss": 0.003884976962581277,
24
- "step": 200
25
- },
26
- {
27
- "epoch": 0.01,
28
- "learning_rate": 9e-07,
29
- "loss": 0.034,
30
- "r_loss": 0.028842270374298096,
31
- "s_loss": 0.00349122891202569,
32
- "step": 300
33
- },
34
- {
35
- "epoch": 0.01,
36
- "learning_rate": 1.2000000000000002e-06,
37
- "loss": 0.0294,
38
- "r_loss": 0.03719829022884369,
39
- "s_loss": 0.0032082502730190754,
40
- "step": 400
41
- },
42
- {
43
- "epoch": 0.01,
44
- "learning_rate": 1.5e-06,
45
- "loss": 0.0259,
46
- "r_loss": 0.02185184881091118,
47
- "s_loss": 0.002866600640118122,
48
- "step": 500
49
- },
50
- {
51
- "epoch": 0.01,
52
- "eval_loss": 0.022319668903946877,
53
- "eval_r_loss": 0.019447144120931625,
54
- "eval_runtime": 5.725,
55
- "eval_s_loss": 0.0028725252486765385,
56
- "eval_samples_per_second": 117.38,
57
- "eval_steps_per_second": 117.38,
58
- "step": 500
59
- },
60
- {
61
- "epoch": 0.02,
62
- "learning_rate": 1.8e-06,
63
- "loss": 0.0238,
64
- "r_loss": 0.01666960120201111,
65
- "s_loss": 0.0029884909745305777,
66
- "step": 600
67
- },
68
- {
69
- "epoch": 0.02,
70
- "learning_rate": 2.1000000000000002e-06,
71
- "loss": 0.0226,
72
- "r_loss": 0.01765752211213112,
73
- "s_loss": 0.003066658042371273,
74
- "step": 700
75
- },
76
- {
77
- "epoch": 0.02,
78
- "learning_rate": 2.4000000000000003e-06,
79
- "loss": 0.0212,
80
- "r_loss": 0.01967948116362095,
81
- "s_loss": 0.002733011730015278,
82
- "step": 800
83
- },
84
- {
85
- "epoch": 0.03,
86
- "learning_rate": 2.7e-06,
87
- "loss": 0.0205,
88
- "r_loss": 0.016631901264190674,
89
- "s_loss": 0.0027432385832071304,
90
- "step": 900
91
- },
92
- {
93
- "epoch": 0.03,
94
- "learning_rate": 3e-06,
95
- "loss": 0.0197,
96
- "r_loss": 0.016916554421186447,
97
- "s_loss": 0.002646064618602395,
98
- "step": 1000
99
- },
100
- {
101
- "epoch": 0.03,
102
- "eval_loss": 0.01776285283267498,
103
- "eval_r_loss": 0.015205347910523415,
104
- "eval_runtime": 5.7797,
105
- "eval_s_loss": 0.0025575056206434965,
106
- "eval_samples_per_second": 116.268,
107
- "eval_steps_per_second": 116.268,
108
- "step": 1000
109
- },
110
- {
111
- "epoch": 0.03,
112
- "learning_rate": 3.3e-06,
113
- "loss": 0.0189,
114
- "r_loss": 0.016178447753190994,
115
- "s_loss": 0.002462042961269617,
116
- "step": 1100
117
- },
118
- {
119
- "epoch": 0.04,
120
- "learning_rate": 3.6e-06,
121
- "loss": 0.0182,
122
- "r_loss": 0.01640625111758709,
123
- "s_loss": 0.0024527523200958967,
124
- "step": 1200
125
- },
126
- {
127
- "epoch": 0.04,
128
- "learning_rate": 3.9e-06,
129
- "loss": 0.0181,
130
- "r_loss": 0.013397695496678352,
131
- "s_loss": 0.002130940556526184,
132
- "step": 1300
133
- },
134
- {
135
- "epoch": 0.04,
136
- "learning_rate": 4.2000000000000004e-06,
137
- "loss": 0.017,
138
- "r_loss": 0.015975676476955414,
139
- "s_loss": 0.002383481478318572,
140
- "step": 1400
141
- },
142
- {
143
- "epoch": 0.04,
144
- "learning_rate": 4.5e-06,
145
- "loss": 0.017,
146
- "r_loss": 0.013270380906760693,
147
- "s_loss": 0.0021610523108392954,
148
- "step": 1500
149
- },
150
- {
151
- "epoch": 0.04,
152
- "eval_loss": 0.015290064737200737,
153
- "eval_r_loss": 0.012945108115673065,
154
- "eval_runtime": 5.6931,
155
- "eval_s_loss": 0.002344956621527672,
156
- "eval_samples_per_second": 118.039,
157
- "eval_steps_per_second": 118.039,
158
- "step": 1500
159
- },
160
- {
161
- "epoch": 0.05,
162
- "learning_rate": 4.800000000000001e-06,
163
- "loss": 0.0161,
164
- "r_loss": 0.013699322938919067,
165
- "s_loss": 0.0022580809891223907,
166
- "step": 1600
167
- },
168
- {
169
- "epoch": 0.05,
170
- "learning_rate": 5.1e-06,
171
- "loss": 0.016,
172
- "r_loss": 0.014964626170694828,
173
- "s_loss": 0.0021504017058759928,
174
- "step": 1700
175
- },
176
- {
177
- "epoch": 0.05,
178
- "learning_rate": 5.4e-06,
179
- "loss": 0.016,
180
- "r_loss": 0.013486243784427643,
181
- "s_loss": 0.0020603144075721502,
182
- "step": 1800
183
- },
184
- {
185
- "epoch": 0.06,
186
- "learning_rate": 5.7000000000000005e-06,
187
- "loss": 0.0154,
188
- "r_loss": 0.012973317876458168,
189
- "s_loss": 0.0021089715883135796,
190
- "step": 1900
191
- },
192
- {
193
- "epoch": 0.06,
194
- "learning_rate": 6e-06,
195
- "loss": 0.0153,
196
- "r_loss": 0.015371004119515419,
197
- "s_loss": 0.002191081177443266,
198
- "step": 2000
199
- },
200
- {
201
- "epoch": 0.06,
202
- "eval_loss": 0.013254178687930107,
203
- "eval_r_loss": 0.011181775480508804,
204
- "eval_runtime": 5.6968,
205
- "eval_s_loss": 0.0020724027417600155,
206
- "eval_samples_per_second": 117.961,
207
- "eval_steps_per_second": 117.961,
208
- "step": 2000
209
- },
210
- {
211
- "epoch": 0.06,
212
- "learning_rate": 6.3e-06,
213
- "loss": 0.015,
214
- "r_loss": 0.011732030659914017,
215
- "s_loss": 0.0019391687819734216,
216
- "step": 2100
217
- },
218
- {
219
- "epoch": 0.07,
220
- "learning_rate": 6.6e-06,
221
- "loss": 0.0147,
222
- "r_loss": 0.015546457841992378,
223
- "s_loss": 0.0020762544590979815,
224
- "step": 2200
225
- },
226
- {
227
- "epoch": 0.07,
228
- "learning_rate": 6.900000000000001e-06,
229
- "loss": 0.0146,
230
- "r_loss": 0.009786777198314667,
231
- "s_loss": 0.0019506356911733747,
232
- "step": 2300
233
- },
234
- {
235
- "epoch": 0.07,
236
- "learning_rate": 7.2e-06,
237
- "loss": 0.0144,
238
- "r_loss": 0.012893063016235828,
239
- "s_loss": 0.002081776736304164,
240
- "step": 2400
241
- },
242
- {
243
- "epoch": 0.07,
244
- "learning_rate": 7.5e-06,
245
- "loss": 0.0142,
246
- "r_loss": 0.00976946298032999,
247
- "s_loss": 0.0017642343882471323,
248
- "step": 2500
249
- },
250
- {
251
- "epoch": 0.07,
252
- "eval_loss": 0.01350394356995821,
253
- "eval_r_loss": 0.01156766340136528,
254
- "eval_runtime": 5.7349,
255
- "eval_s_loss": 0.001936280052177608,
256
- "eval_samples_per_second": 117.177,
257
- "eval_steps_per_second": 117.177,
258
- "step": 2500
259
- },
260
- {
261
- "epoch": 0.08,
262
- "learning_rate": 7.8e-06,
263
- "loss": 0.0136,
264
- "r_loss": 0.01364838145673275,
265
- "s_loss": 0.0019653863273561,
266
- "step": 2600
267
- },
268
- {
269
- "epoch": 0.08,
270
- "learning_rate": 8.1e-06,
271
- "loss": 0.014,
272
- "r_loss": 0.013060592114925385,
273
- "s_loss": 0.00197937642224133,
274
- "step": 2700
275
- },
276
- {
277
- "epoch": 0.08,
278
- "learning_rate": 8.400000000000001e-06,
279
- "loss": 0.0134,
280
- "r_loss": 0.010259253904223442,
281
- "s_loss": 0.0018942919559776783,
282
- "step": 2800
283
- },
284
- {
285
- "epoch": 0.09,
286
- "learning_rate": 8.7e-06,
287
- "loss": 0.0136,
288
- "r_loss": 0.010634353384375572,
289
- "s_loss": 0.002018529223278165,
290
- "step": 2900
291
- },
292
- {
293
- "epoch": 0.09,
294
- "learning_rate": 9e-06,
295
- "loss": 0.0134,
296
- "r_loss": 0.01226745918393135,
297
- "s_loss": 0.0018310793675482273,
298
- "step": 3000
299
- },
300
- {
301
- "epoch": 0.09,
302
- "eval_loss": 0.01376401074230671,
303
- "eval_r_loss": 0.011921562254428864,
304
- "eval_runtime": 5.8031,
305
- "eval_s_loss": 0.0018424488371238112,
306
- "eval_samples_per_second": 115.8,
307
- "eval_steps_per_second": 115.8,
308
- "step": 3000
309
- },
310
- {
311
- "epoch": 0.09,
312
- "learning_rate": 9.3e-06,
313
- "loss": 0.0131,
314
- "r_loss": 0.011366230435669422,
315
- "s_loss": 0.0018870271742343903,
316
- "step": 3100
317
- },
318
- {
319
- "epoch": 0.1,
320
- "learning_rate": 9.600000000000001e-06,
321
- "loss": 0.0129,
322
- "r_loss": 0.015041200444102287,
323
- "s_loss": 0.0019711025524884462,
324
- "step": 3200
325
- },
326
- {
327
- "epoch": 0.1,
328
- "learning_rate": 9.9e-06,
329
- "loss": 0.0126,
330
- "r_loss": 0.010199671611189842,
331
- "s_loss": 0.0018134403508156538,
332
- "step": 3300
333
- },
334
- {
335
- "epoch": 0.1,
336
- "learning_rate": 1.02e-05,
337
- "loss": 0.0127,
338
- "r_loss": 0.009185859933495522,
339
- "s_loss": 0.0016790288500487804,
340
- "step": 3400
341
- },
342
- {
343
- "epoch": 0.1,
344
- "learning_rate": 1.05e-05,
345
- "loss": 0.0127,
346
- "r_loss": 0.011276321485638618,
347
- "s_loss": 0.0016570865409448743,
348
- "step": 3500
349
- },
350
- {
351
- "epoch": 0.1,
352
- "eval_loss": 0.011658551171422005,
353
- "eval_r_loss": 0.009879879653453827,
354
- "eval_runtime": 5.7067,
355
- "eval_s_loss": 0.001778671983629465,
356
- "eval_samples_per_second": 117.756,
357
- "eval_steps_per_second": 117.756,
358
- "step": 3500
359
- },
360
- {
361
- "epoch": 0.11,
362
- "learning_rate": 1.08e-05,
363
- "loss": 0.0126,
364
- "r_loss": 0.008440356701612473,
365
- "s_loss": 0.0017231483943760395,
366
- "step": 3600
367
- },
368
- {
369
- "epoch": 0.11,
370
- "learning_rate": 1.11e-05,
371
- "loss": 0.0122,
372
- "r_loss": 0.009153485298156738,
373
- "s_loss": 0.0017831036821007729,
374
- "step": 3700
375
- },
376
- {
377
- "epoch": 0.11,
378
- "learning_rate": 1.1400000000000001e-05,
379
- "loss": 0.012,
380
- "r_loss": 0.011471357196569443,
381
- "s_loss": 0.001937872963026166,
382
- "step": 3800
383
- },
384
- {
385
- "epoch": 0.12,
386
- "learning_rate": 1.1700000000000001e-05,
387
- "loss": 0.0121,
388
- "r_loss": 0.009361416101455688,
389
- "s_loss": 0.001662532682530582,
390
- "step": 3900
391
- },
392
- {
393
- "epoch": 0.12,
394
- "learning_rate": 1.2e-05,
395
- "loss": 0.012,
396
- "r_loss": 0.013415796682238579,
397
- "s_loss": 0.0016665093135088682,
398
- "step": 4000
399
- },
400
- {
401
- "epoch": 0.12,
402
- "eval_loss": 0.011559335514903069,
403
- "eval_r_loss": 0.009901713579893112,
404
- "eval_runtime": 5.7585,
405
- "eval_s_loss": 0.0016576218185946345,
406
- "eval_samples_per_second": 116.697,
407
- "eval_steps_per_second": 116.697,
408
- "step": 4000
409
- },
410
- {
411
- "epoch": 0.12,
412
- "learning_rate": 1.2299999999999999e-05,
413
- "loss": 0.0122,
414
- "r_loss": 0.009355029091238976,
415
- "s_loss": 0.001734372228384018,
416
- "step": 4100
417
- },
418
- {
419
- "epoch": 0.13,
420
- "learning_rate": 1.26e-05,
421
- "loss": 0.0118,
422
- "r_loss": 0.009848803281784058,
423
- "s_loss": 0.0016517819603905082,
424
- "step": 4200
425
- },
426
- {
427
- "epoch": 0.13,
428
- "learning_rate": 1.29e-05,
429
- "loss": 0.0117,
430
- "r_loss": 0.008925353176891804,
431
- "s_loss": 0.0016535022296011448,
432
- "step": 4300
433
- },
434
- {
435
- "epoch": 0.13,
436
- "learning_rate": 1.32e-05,
437
- "loss": 0.0117,
438
- "r_loss": 0.009858155623078346,
439
- "s_loss": 0.001591067761182785,
440
- "step": 4400
441
- },
442
- {
443
- "epoch": 0.13,
444
- "learning_rate": 1.3500000000000001e-05,
445
- "loss": 0.0115,
446
- "r_loss": 0.009000759571790695,
447
- "s_loss": 0.001509728142991662,
448
- "step": 4500
449
- },
450
- {
451
- "epoch": 0.13,
452
- "eval_loss": 0.011256811209022999,
453
- "eval_r_loss": 0.009671147912740707,
454
- "eval_runtime": 5.7598,
455
- "eval_s_loss": 0.0015856630634516478,
456
- "eval_samples_per_second": 116.67,
457
- "eval_steps_per_second": 116.67,
458
- "step": 4500
459
- },
460
- {
461
- "epoch": 0.14,
462
- "learning_rate": 1.3800000000000002e-05,
463
- "loss": 0.0113,
464
- "r_loss": 0.00949438102543354,
465
- "s_loss": 0.0015365839935839176,
466
- "step": 4600
467
- },
468
- {
469
- "epoch": 0.14,
470
- "learning_rate": 1.4099999999999999e-05,
471
- "loss": 0.0114,
472
- "r_loss": 0.01006004773080349,
473
- "s_loss": 0.0015926426276564598,
474
- "step": 4700
475
- },
476
- {
477
- "epoch": 0.14,
478
- "learning_rate": 1.44e-05,
479
- "loss": 0.0114,
480
- "r_loss": 0.010145231150090694,
481
- "s_loss": 0.0015732853207737207,
482
- "step": 4800
483
- },
484
- {
485
- "epoch": 0.15,
486
- "learning_rate": 1.47e-05,
487
- "loss": 0.0112,
488
- "r_loss": 0.00975881703197956,
489
- "s_loss": 0.0014939116081222892,
490
- "step": 4900
491
- },
492
- {
493
- "epoch": 0.15,
494
- "learning_rate": 1.5e-05,
495
- "loss": 0.0111,
496
- "r_loss": 0.009045148268342018,
497
- "s_loss": 0.0014695697464048862,
498
- "step": 5000
499
- },
500
- {
501
- "epoch": 0.15,
502
- "eval_loss": 0.011217299848794937,
503
- "eval_r_loss": 0.009782791137695312,
504
- "eval_runtime": 5.6702,
505
- "eval_s_loss": 0.0014345089439302683,
506
- "eval_samples_per_second": 118.514,
507
- "eval_steps_per_second": 118.514,
508
- "step": 5000
509
- },
510
- {
511
- "epoch": 0.15,
512
- "learning_rate": 1.53e-05,
513
- "loss": 0.0111,
514
- "r_loss": 0.009601066820323467,
515
- "s_loss": 0.001514170435257256,
516
- "step": 5100
517
- },
518
- {
519
- "epoch": 0.16,
520
- "learning_rate": 1.56e-05,
521
- "loss": 0.0112,
522
- "r_loss": 0.00870747584849596,
523
- "s_loss": 0.0014461548998951912,
524
- "step": 5200
525
- },
526
- {
527
- "epoch": 0.16,
528
- "learning_rate": 1.59e-05,
529
- "loss": 0.0109,
530
- "r_loss": 0.008360641077160835,
531
- "s_loss": 0.0014414612669497728,
532
- "step": 5300
533
- },
534
- {
535
- "epoch": 0.16,
536
- "learning_rate": 1.62e-05,
537
- "loss": 0.0108,
538
- "r_loss": 0.00921720638871193,
539
- "s_loss": 0.0014027974102646112,
540
- "step": 5400
541
- },
542
- {
543
- "epoch": 0.16,
544
- "learning_rate": 1.65e-05,
545
- "loss": 0.0108,
546
- "r_loss": 0.00895563792437315,
547
- "s_loss": 0.0014760666526854038,
548
- "step": 5500
549
- },
550
- {
551
- "epoch": 0.16,
552
- "eval_loss": 0.01118839718401432,
553
- "eval_r_loss": 0.00973587017506361,
554
- "eval_runtime": 5.7152,
555
- "eval_s_loss": 0.0014525266597047448,
556
- "eval_samples_per_second": 117.581,
557
- "eval_steps_per_second": 117.581,
558
- "step": 5500
559
- },
560
- {
561
- "epoch": 0.17,
562
- "learning_rate": 1.6800000000000002e-05,
563
- "loss": 0.0107,
564
- "r_loss": 0.007762259803712368,
565
- "s_loss": 0.0015427314210683107,
566
- "step": 5600
567
- },
568
- {
569
- "epoch": 0.17,
570
- "learning_rate": 1.71e-05,
571
- "loss": 0.0108,
572
- "r_loss": 0.009653204120695591,
573
- "s_loss": 0.0015951216919347644,
574
- "step": 5700
575
- },
576
- {
577
- "epoch": 0.17,
578
- "learning_rate": 1.74e-05,
579
- "loss": 0.0106,
580
- "r_loss": 0.007989178411662579,
581
- "s_loss": 0.001397020067088306,
582
- "step": 5800
583
- },
584
- {
585
- "epoch": 0.18,
586
- "learning_rate": 1.77e-05,
587
- "loss": 0.0107,
588
- "r_loss": 0.009275095537304878,
589
- "s_loss": 0.001448939205147326,
590
- "step": 5900
591
- },
592
- {
593
- "epoch": 0.18,
594
- "learning_rate": 1.8e-05,
595
- "loss": 0.0106,
596
- "r_loss": 0.008456457406282425,
597
- "s_loss": 0.0014687306247651577,
598
- "step": 6000
599
- },
600
- {
601
- "epoch": 0.18,
602
- "eval_loss": 0.010676538571715355,
603
- "eval_r_loss": 0.009258019737899303,
604
- "eval_runtime": 5.7202,
605
- "eval_s_loss": 0.0014185188338160515,
606
- "eval_samples_per_second": 117.477,
607
- "eval_steps_per_second": 117.477,
608
- "step": 6000
609
- },
610
- {
611
- "epoch": 0.18,
612
- "learning_rate": 1.83e-05,
613
- "loss": 0.0105,
614
- "r_loss": 0.008195489645004272,
615
- "s_loss": 0.0014291288098320365,
616
- "step": 6100
617
- },
618
- {
619
- "epoch": 0.19,
620
- "learning_rate": 1.86e-05,
621
- "loss": 0.0102,
622
- "r_loss": 0.008191170170903206,
623
- "s_loss": 0.0015366484876722097,
624
- "step": 6200
625
- },
626
- {
627
- "epoch": 0.19,
628
- "learning_rate": 1.8900000000000002e-05,
629
- "loss": 0.0102,
630
- "r_loss": 0.00822894275188446,
631
- "s_loss": 0.0014636135892942548,
632
- "step": 6300
633
- },
634
- {
635
- "epoch": 0.19,
636
- "learning_rate": 1.9200000000000003e-05,
637
- "loss": 0.0103,
638
- "r_loss": 0.0088454969227314,
639
- "s_loss": 0.001345100230537355,
640
- "step": 6400
641
- },
642
- {
643
- "epoch": 0.19,
644
- "learning_rate": 1.95e-05,
645
- "loss": 0.0105,
646
- "r_loss": 0.01023932732641697,
647
- "s_loss": 0.001511400449089706,
648
- "step": 6500
649
- },
650
- {
651
- "epoch": 0.19,
652
- "eval_loss": 0.010188945569097996,
653
- "eval_r_loss": 0.00889565609395504,
654
- "eval_runtime": 6.5641,
655
- "eval_s_loss": 0.0012932894751429558,
656
- "eval_samples_per_second": 102.375,
657
- "eval_steps_per_second": 102.375,
658
- "step": 6500
659
- },
660
- {
661
- "epoch": 0.2,
662
- "learning_rate": 1.98e-05,
663
- "loss": 0.0104,
664
- "r_loss": 0.00900462456047535,
665
- "s_loss": 0.001456625759601593,
666
- "step": 6600
667
- },
668
- {
669
- "epoch": 0.2,
670
- "learning_rate": 2.01e-05,
671
- "loss": 0.0102,
672
- "r_loss": 0.00964261218905449,
673
- "s_loss": 0.001401584129780531,
674
- "step": 6700
675
- },
676
- {
677
- "epoch": 0.2,
678
- "learning_rate": 2.04e-05,
679
- "loss": 0.0101,
680
- "r_loss": 0.007694972679018974,
681
- "s_loss": 0.0014238519361242652,
682
- "step": 6800
683
- },
684
- {
685
- "epoch": 0.21,
686
- "learning_rate": 2.07e-05,
687
- "loss": 0.0102,
688
- "r_loss": 0.009449545294046402,
689
- "s_loss": 0.0014661129098385572,
690
- "step": 6900
691
- },
692
- {
693
- "epoch": 0.21,
694
- "learning_rate": 2.1e-05,
695
- "loss": 0.0101,
696
- "r_loss": 0.00955035537481308,
697
- "s_loss": 0.001392255537211895,
698
- "step": 7000
699
- },
700
- {
701
- "epoch": 0.21,
702
- "eval_loss": 0.010023693554103374,
703
- "eval_r_loss": 0.008697763085365295,
704
- "eval_runtime": 5.7082,
705
- "eval_s_loss": 0.001325930585153401,
706
- "eval_samples_per_second": 117.726,
707
- "eval_steps_per_second": 117.726,
708
- "step": 7000
709
- },
710
- {
711
- "epoch": 0.21,
712
- "learning_rate": 2.13e-05,
713
- "loss": 0.0102,
714
- "r_loss": 0.00833200104534626,
715
- "s_loss": 0.0014237057184800506,
716
- "step": 7100
717
- },
718
- {
719
- "epoch": 0.21,
720
- "learning_rate": 2.16e-05,
721
- "loss": 0.0099,
722
- "r_loss": 0.008094298653304577,
723
- "s_loss": 0.0014223785838112235,
724
- "step": 7200
725
- },
726
- {
727
- "epoch": 0.22,
728
- "learning_rate": 2.19e-05,
729
- "loss": 0.01,
730
- "r_loss": 0.00752690713852644,
731
- "s_loss": 0.0013399685267359018,
732
- "step": 7300
733
- },
734
- {
735
- "epoch": 0.22,
736
- "learning_rate": 2.22e-05,
737
- "loss": 0.0099,
738
- "r_loss": 0.00953773781657219,
739
- "s_loss": 0.001381666399538517,
740
- "step": 7400
741
- },
742
- {
743
- "epoch": 0.22,
744
- "learning_rate": 2.25e-05,
745
- "loss": 0.0098,
746
- "r_loss": 0.008129315450787544,
747
- "s_loss": 0.0011789320269599557,
748
- "step": 7500
749
- },
750
- {
751
- "epoch": 0.22,
752
- "eval_loss": 0.010138859041035175,
753
- "eval_r_loss": 0.00888618640601635,
754
- "eval_runtime": 5.8833,
755
- "eval_s_loss": 0.001252672984264791,
756
- "eval_samples_per_second": 114.221,
757
- "eval_steps_per_second": 114.221,
758
- "step": 7500
759
- },
760
- {
761
- "epoch": 0.23,
762
- "learning_rate": 2.2800000000000002e-05,
763
- "loss": 0.0098,
764
- "r_loss": 0.009464412927627563,
765
- "s_loss": 0.001395503873936832,
766
- "step": 7600
767
- },
768
- {
769
- "epoch": 0.23,
770
- "learning_rate": 2.3100000000000002e-05,
771
- "loss": 0.0097,
772
- "r_loss": 0.00881454348564148,
773
- "s_loss": 0.001235602656379342,
774
- "step": 7700
775
- },
776
- {
777
- "epoch": 0.23,
778
- "learning_rate": 2.3400000000000003e-05,
779
- "loss": 0.01,
780
- "r_loss": 0.00844954326748848,
781
- "s_loss": 0.0012883525341749191,
782
- "step": 7800
783
- },
784
- {
785
- "epoch": 0.24,
786
- "learning_rate": 2.37e-05,
787
- "loss": 0.0095,
788
- "r_loss": 0.008428743109107018,
789
- "s_loss": 0.001225842977873981,
790
- "step": 7900
791
- },
792
- {
793
- "epoch": 0.24,
794
- "learning_rate": 2.4e-05,
795
- "loss": 0.0098,
796
- "r_loss": 0.008358178660273552,
797
- "s_loss": 0.0013608363224193454,
798
- "step": 8000
799
- },
800
- {
801
- "epoch": 0.24,
802
- "eval_loss": 0.010027415119111538,
803
- "eval_r_loss": 0.008776991628110409,
804
- "eval_runtime": 5.7172,
805
- "eval_s_loss": 0.0012504233745858073,
806
- "eval_samples_per_second": 117.54,
807
- "eval_steps_per_second": 117.54,
808
- "step": 8000
809
- },
810
- {
811
- "epoch": 0.24,
812
- "learning_rate": 2.43e-05,
813
- "loss": 0.0097,
814
- "r_loss": 0.007988743484020233,
815
- "s_loss": 0.0012902095913887024,
816
- "step": 8100
817
- },
818
- {
819
- "epoch": 0.24,
820
- "learning_rate": 2.4599999999999998e-05,
821
- "loss": 0.0097,
822
- "r_loss": 0.00881593581289053,
823
- "s_loss": 0.0012137378798797727,
824
- "step": 8200
825
- },
826
- {
827
- "epoch": 0.25,
828
- "learning_rate": 2.49e-05,
829
- "loss": 0.0093,
830
- "r_loss": 0.008240088820457458,
831
- "s_loss": 0.0012537827715277672,
832
- "step": 8300
833
- },
834
- {
835
- "epoch": 0.25,
836
- "learning_rate": 2.52e-05,
837
- "loss": 0.0096,
838
- "r_loss": 0.008193010464310646,
839
- "s_loss": 0.0012971541145816445,
840
- "step": 8400
841
- },
842
- {
843
- "epoch": 0.25,
844
- "learning_rate": 2.55e-05,
845
- "loss": 0.0098,
846
- "r_loss": 0.007697759196162224,
847
- "s_loss": 0.0012005027383565903,
848
- "step": 8500
849
- },
850
- {
851
- "epoch": 0.25,
852
- "eval_loss": 0.010029895231127739,
853
- "eval_r_loss": 0.008869171142578125,
854
- "eval_runtime": 5.779,
855
- "eval_s_loss": 0.0011607238557189703,
856
- "eval_samples_per_second": 116.282,
857
- "eval_steps_per_second": 116.282,
858
- "step": 8500
859
- },
860
- {
861
- "epoch": 0.26,
862
- "learning_rate": 2.58e-05,
863
- "loss": 0.0094,
864
- "r_loss": 0.00764896534383297,
865
- "s_loss": 0.0012557113077491522,
866
- "step": 8600
867
- },
868
- {
869
- "epoch": 0.26,
870
- "learning_rate": 2.61e-05,
871
- "loss": 0.0096,
872
- "r_loss": 0.007685758639127016,
873
- "s_loss": 0.0011759819462895393,
874
- "step": 8700
875
- },
876
- {
877
- "epoch": 0.26,
878
- "learning_rate": 2.64e-05,
879
- "loss": 0.0095,
880
- "r_loss": 0.008306157775223255,
881
- "s_loss": 0.0011866830755025148,
882
- "step": 8800
883
- },
884
- {
885
- "epoch": 0.27,
886
- "learning_rate": 2.6700000000000002e-05,
887
- "loss": 0.0095,
888
- "r_loss": 0.008763814345002174,
889
- "s_loss": 0.0012581332121044397,
890
- "step": 8900
891
- },
892
- {
893
- "epoch": 0.27,
894
- "learning_rate": 2.7000000000000002e-05,
895
- "loss": 0.0094,
896
- "r_loss": 0.007176906801760197,
897
- "s_loss": 0.0012848544865846634,
898
- "step": 9000
899
- },
900
- {
901
- "epoch": 0.27,
902
- "eval_loss": 0.009547967463731766,
903
- "eval_r_loss": 0.008400797843933105,
904
- "eval_runtime": 5.8725,
905
- "eval_s_loss": 0.0011471696197986603,
906
- "eval_samples_per_second": 114.432,
907
- "eval_steps_per_second": 114.432,
908
- "step": 9000
909
- },
910
- {
911
- "epoch": 0.27,
912
- "learning_rate": 2.7300000000000003e-05,
913
- "loss": 0.0091,
914
- "r_loss": 0.007933049462735653,
915
- "s_loss": 0.0013153750915080309,
916
- "step": 9100
917
- },
918
- {
919
- "epoch": 0.27,
920
- "learning_rate": 2.7600000000000003e-05,
921
- "loss": 0.0093,
922
- "r_loss": 0.008543523028492928,
923
- "s_loss": 0.0012904139002785087,
924
- "step": 9200
925
- },
926
- {
927
- "epoch": 0.28,
928
- "learning_rate": 2.79e-05,
929
- "loss": 0.0093,
930
- "r_loss": 0.007229233160614967,
931
- "s_loss": 0.0011527976021170616,
932
- "step": 9300
933
- },
934
- {
935
- "epoch": 0.28,
936
- "learning_rate": 2.8199999999999998e-05,
937
- "loss": 0.0093,
938
- "r_loss": 0.007913423702120781,
939
- "s_loss": 0.0011675741989165545,
940
- "step": 9400
941
- },
942
- {
943
- "epoch": 0.28,
944
- "learning_rate": 2.8499999999999998e-05,
945
- "loss": 0.0092,
946
- "r_loss": 0.007350780535489321,
947
- "s_loss": 0.0012241221265867352,
948
- "step": 9500
949
- },
950
- {
951
- "epoch": 0.28,
952
- "eval_loss": 0.009159870445728302,
953
- "eval_r_loss": 0.008046709932386875,
954
- "eval_runtime": 5.7652,
955
- "eval_s_loss": 0.0011131602805107832,
956
- "eval_samples_per_second": 116.562,
957
- "eval_steps_per_second": 116.562,
958
- "step": 9500
959
- },
960
- {
961
- "epoch": 0.29,
962
- "learning_rate": 2.88e-05,
963
- "loss": 0.0092,
964
- "r_loss": 0.007505511865019798,
965
- "s_loss": 0.0012209609849378467,
966
- "step": 9600
967
- },
968
- {
969
- "epoch": 0.29,
970
- "learning_rate": 2.91e-05,
971
- "loss": 0.0092,
972
- "r_loss": 0.00815967470407486,
973
- "s_loss": 0.0011755439918488264,
974
- "step": 9700
975
- },
976
- {
977
- "epoch": 0.29,
978
- "learning_rate": 2.94e-05,
979
- "loss": 0.0092,
980
- "r_loss": 0.007525102701038122,
981
- "s_loss": 0.0012379743857309222,
982
- "step": 9800
983
- },
984
- {
985
- "epoch": 0.3,
986
- "learning_rate": 2.97e-05,
987
- "loss": 0.009,
988
- "r_loss": 0.008029448799788952,
989
- "s_loss": 0.0012208997504785657,
990
- "step": 9900
991
- },
992
- {
993
- "epoch": 0.3,
994
- "learning_rate": 3e-05,
995
- "loss": 0.0091,
996
- "r_loss": 0.007686637807637453,
997
- "s_loss": 0.0012359985848888755,
998
- "step": 10000
999
- },
1000
- {
1001
- "epoch": 0.3,
1002
- "eval_loss": 0.009688720107078552,
1003
- "eval_r_loss": 0.008566668257117271,
1004
- "eval_runtime": 5.8537,
1005
- "eval_s_loss": 0.0011220521992072463,
1006
- "eval_samples_per_second": 114.8,
1007
- "eval_steps_per_second": 114.8,
1008
- "step": 10000
1009
- },
1010
- {
1011
- "epoch": 0.3,
1012
- "learning_rate": 2.9998687772974054e-05,
1013
- "loss": 0.0091,
1014
- "r_loss": 0.007961354218423367,
1015
- "s_loss": 0.0010860951151698828,
1016
- "step": 10100
1017
- },
1018
- {
1019
- "epoch": 0.3,
1020
- "learning_rate": 2.9994698173783606e-05,
1021
- "loss": 0.0091,
1022
- "r_loss": 0.008185433223843575,
1023
- "s_loss": 0.0011867923894897103,
1024
- "step": 10200
1025
- },
1026
- {
1027
- "epoch": 0.31,
1028
- "learning_rate": 2.998803178074702e-05,
1029
- "loss": 0.0092,
1030
- "r_loss": 0.008926862850785255,
1031
- "s_loss": 0.0011962838470935822,
1032
- "step": 10300
1033
- },
1034
- {
1035
- "epoch": 0.31,
1036
- "learning_rate": 2.997868978392226e-05,
1037
- "loss": 0.009,
1038
- "r_loss": 0.008092904463410378,
1039
- "s_loss": 0.0011486653238534927,
1040
- "step": 10400
1041
- },
1042
- {
1043
- "epoch": 0.31,
1044
- "learning_rate": 2.996667385100541e-05,
1045
- "loss": 0.0091,
1046
- "r_loss": 0.007265549618750811,
1047
- "s_loss": 0.0010849842801690102,
1048
- "step": 10500
1049
- },
1050
- {
1051
- "epoch": 0.31,
1052
- "eval_loss": 0.0097579974681139,
1053
- "eval_r_loss": 0.008664416149258614,
1054
- "eval_runtime": 5.8205,
1055
- "eval_s_loss": 0.0010935813188552856,
1056
- "eval_samples_per_second": 115.453,
1057
- "eval_steps_per_second": 115.453,
1058
- "step": 10500
1059
- },
1060
- {
1061
- "epoch": 0.32,
1062
- "learning_rate": 2.995198612703301e-05,
1063
- "loss": 0.0091,
1064
- "r_loss": 0.007716472260653973,
1065
- "s_loss": 0.0011130705242976546,
1066
- "step": 10600
1067
- },
1068
- {
1069
- "epoch": 0.32,
1070
- "learning_rate": 2.9934629233999088e-05,
1071
- "loss": 0.0088,
1072
- "r_loss": 0.0071815671399235725,
1073
- "s_loss": 0.0010680295526981354,
1074
- "step": 10700
1075
- },
1076
- {
1077
- "epoch": 0.32,
1078
- "learning_rate": 2.991460627038711e-05,
1079
- "loss": 0.0089,
1080
- "r_loss": 0.008051994256675243,
1081
- "s_loss": 0.0011568560730665922,
1082
- "step": 10800
1083
- },
1084
- {
1085
- "epoch": 0.33,
1086
- "learning_rate": 2.9891920810616865e-05,
1087
- "loss": 0.0089,
1088
- "r_loss": 0.007709968835115433,
1089
- "s_loss": 0.0010994401527568698,
1090
- "step": 10900
1091
- },
1092
- {
1093
- "epoch": 0.33,
1094
- "learning_rate": 2.986657690440635e-05,
1095
- "loss": 0.0087,
1096
- "r_loss": 0.007149288430809975,
1097
- "s_loss": 0.0010991152375936508,
1098
- "step": 11000
1099
- },
1100
- {
1101
- "epoch": 0.33,
1102
- "eval_loss": 0.008960756473243237,
1103
- "eval_r_loss": 0.0079101687297225,
1104
- "eval_runtime": 5.8735,
1105
- "eval_s_loss": 0.001050587510690093,
1106
- "eval_samples_per_second": 114.412,
1107
- "eval_steps_per_second": 114.412,
1108
- "step": 11000
1109
- },
1110
- {
1111
- "epoch": 0.33,
1112
- "learning_rate": 2.983857907604885e-05,
1113
- "loss": 0.0087,
1114
- "r_loss": 0.006731455214321613,
1115
- "s_loss": 0.0011426934506744146,
1116
- "step": 11100
1117
- },
1118
- {
1119
- "epoch": 0.33,
1120
- "learning_rate": 2.9807932323605262e-05,
1121
- "loss": 0.009,
1122
- "r_loss": 0.008175029419362545,
1123
- "s_loss": 0.0010848107049241662,
1124
- "step": 11200
1125
- },
1126
- {
1127
- "epoch": 0.34,
1128
- "learning_rate": 2.977464211801187e-05,
1129
- "loss": 0.0085,
1130
- "r_loss": 0.007222716696560383,
1131
- "s_loss": 0.0010968766873702407,
1132
- "step": 11300
1133
- },
1134
- {
1135
- "epoch": 0.34,
1136
- "learning_rate": 2.9738714402103696e-05,
1137
- "loss": 0.0086,
1138
- "r_loss": 0.007525290362536907,
1139
- "s_loss": 0.0010664989240467548,
1140
- "step": 11400
1141
- },
1142
- {
1143
- "epoch": 0.34,
1144
- "learning_rate": 2.9700155589553614e-05,
1145
- "loss": 0.0085,
1146
- "r_loss": 0.00836112443357706,
1147
- "s_loss": 0.0012076541315764189,
1148
- "step": 11500
1149
- },
1150
- {
1151
- "epoch": 0.34,
1152
- "eval_loss": 0.008930221199989319,
1153
- "eval_r_loss": 0.007907573133707047,
1154
- "eval_runtime": 5.8343,
1155
- "eval_s_loss": 0.0010226481826975942,
1156
- "eval_samples_per_second": 115.18,
1157
- "eval_steps_per_second": 115.18,
1158
- "step": 11500
1159
- },
1160
- {
1161
- "epoch": 0.35,
1162
- "learning_rate": 2.9658972563727394e-05,
1163
- "loss": 0.0087,
1164
- "r_loss": 0.007679732982069254,
1165
- "s_loss": 0.000998866162262857,
1166
- "step": 11600
1167
- },
1168
- {
1169
- "epoch": 0.35,
1170
- "learning_rate": 2.9615172676454915e-05,
1171
- "loss": 0.0085,
1172
- "r_loss": 0.007193954661488533,
1173
- "s_loss": 0.00100530288182199,
1174
- "step": 11700
1175
- },
1176
- {
1177
- "epoch": 0.35,
1178
- "learning_rate": 2.956876374671775e-05,
1179
- "loss": 0.0086,
1180
- "r_loss": 0.008202124387025833,
1181
- "s_loss": 0.001100401277653873,
1182
- "step": 11800
1183
- },
1184
- {
1185
- "epoch": 0.36,
1186
- "learning_rate": 2.9519754059253352e-05,
1187
- "loss": 0.0086,
1188
- "r_loss": 0.005764150992035866,
1189
- "s_loss": 0.001019550021737814,
1190
- "step": 11900
1191
- },
1192
- {
1193
- "epoch": 0.36,
1194
- "learning_rate": 2.946815236307609e-05,
1195
- "loss": 0.0088,
1196
- "r_loss": 0.006989379413425922,
1197
- "s_loss": 0.0010334283579140902,
1198
- "step": 12000
1199
- },
1200
- {
1201
- "epoch": 0.36,
1202
- "eval_loss": 0.008558472618460655,
1203
- "eval_r_loss": 0.007531056646257639,
1204
- "eval_runtime": 5.8831,
1205
- "eval_s_loss": 0.0010274164378643036,
1206
- "eval_samples_per_second": 114.225,
1207
- "eval_steps_per_second": 114.225,
1208
- "step": 12000
1209
- },
1210
- {
1211
- "epoch": 0.36,
1212
- "learning_rate": 2.941396786991542e-05,
1213
- "loss": 0.0084,
1214
- "r_loss": 0.007328535430133343,
1215
- "s_loss": 0.0010166720021516085,
1216
- "step": 12100
1217
- },
1218
- {
1219
- "epoch": 0.36,
1220
- "learning_rate": 2.9357210252571423e-05,
1221
- "loss": 0.0084,
1222
- "r_loss": 0.006314602214843035,
1223
- "s_loss": 0.0010744825704023242,
1224
- "step": 12200
1225
- },
1226
- {
1227
- "epoch": 0.37,
1228
- "learning_rate": 2.929788964318808e-05,
1229
- "loss": 0.0084,
1230
- "r_loss": 0.007112974300980568,
1231
- "s_loss": 0.0010199513053521514,
1232
- "step": 12300
1233
- },
1234
- {
1235
- "epoch": 0.37,
1236
- "learning_rate": 2.923601663144452e-05,
1237
- "loss": 0.0083,
1238
- "r_loss": 0.0075445109978318214,
1239
- "s_loss": 0.001125972718000412,
1240
- "step": 12400
1241
- },
1242
- {
1243
- "epoch": 0.37,
1244
- "learning_rate": 2.9171602262664564e-05,
1245
- "loss": 0.0082,
1246
- "r_loss": 0.006783840246498585,
1247
- "s_loss": 0.0010000347392633557,
1248
- "step": 12500
1249
- },
1250
- {
1251
- "epoch": 0.37,
1252
- "eval_loss": 0.008438749238848686,
1253
- "eval_r_loss": 0.007456656079739332,
1254
- "eval_runtime": 6.6399,
1255
- "eval_s_loss": 0.0009820933919399977,
1256
- "eval_samples_per_second": 101.206,
1257
- "eval_steps_per_second": 101.206,
1258
- "step": 12500
1259
- },
1260
- {
1261
- "epoch": 0.38,
1262
- "learning_rate": 2.9104658035844992e-05,
1263
- "loss": 0.0084,
1264
- "r_loss": 0.008811071515083313,
1265
- "s_loss": 0.0010695005767047405,
1266
- "step": 12600
1267
- },
1268
- {
1269
- "epoch": 0.38,
1270
- "learning_rate": 2.9035195901602766e-05,
1271
- "loss": 0.0081,
1272
- "r_loss": 0.00737602636218071,
1273
- "s_loss": 0.001059257541783154,
1274
- "step": 12700
1275
- },
1276
- {
1277
- "epoch": 0.38,
1278
- "learning_rate": 2.896322826004167e-05,
1279
- "loss": 0.0083,
1280
- "r_loss": 0.006479831412434578,
1281
- "s_loss": 0.00109660136513412,
1282
- "step": 12800
1283
- },
1284
- {
1285
- "epoch": 0.38,
1286
- "learning_rate": 2.8888767958538672e-05,
1287
- "loss": 0.0081,
1288
- "r_loss": 0.006183864548802376,
1289
- "s_loss": 0.000993464607745409,
1290
- "step": 12900
1291
- },
1292
- {
1293
- "epoch": 0.39,
1294
- "learning_rate": 2.881182828945048e-05,
1295
- "loss": 0.0082,
1296
- "r_loss": 0.007777994964271784,
1297
- "s_loss": 0.0010871184058487415,
1298
- "step": 13000
1299
- },
1300
- {
1301
- "epoch": 0.39,
1302
- "eval_loss": 0.007983732037246227,
1303
- "eval_r_loss": 0.007039351388812065,
1304
- "eval_runtime": 5.8966,
1305
- "eval_s_loss": 0.0009443803573958576,
1306
- "eval_samples_per_second": 113.965,
1307
- "eval_steps_per_second": 113.965,
1308
- "step": 13000
1309
- },
1310
- {
1311
- "epoch": 0.39,
1312
- "learning_rate": 2.873242298774064e-05,
1313
- "loss": 0.0082,
1314
- "r_loss": 0.010438592173159122,
1315
- "s_loss": 0.0010730213252827525,
1316
- "step": 13100
1317
- },
1318
- {
1319
- "epoch": 0.39,
1320
- "learning_rate": 2.865056622852762e-05,
1321
- "loss": 0.0081,
1322
- "r_loss": 0.006384184584021568,
1323
- "s_loss": 0.0008851269376464188,
1324
- "step": 13200
1325
- },
1326
- {
1327
- "epoch": 0.4,
1328
- "learning_rate": 2.8566272624554314e-05,
1329
- "loss": 0.0081,
1330
- "r_loss": 0.007597366347908974,
1331
- "s_loss": 0.001074151019565761,
1332
- "step": 13300
1333
- },
1334
- {
1335
- "epoch": 0.4,
1336
- "learning_rate": 2.847955722357946e-05,
1337
- "loss": 0.0081,
1338
- "r_loss": 0.006059112958610058,
1339
- "s_loss": 0.0010072380537167192,
1340
- "step": 13400
1341
- },
1342
- {
1343
- "epoch": 0.4,
1344
- "learning_rate": 2.8390435505691352e-05,
1345
- "loss": 0.008,
1346
- "r_loss": 0.006421985570341349,
1347
- "s_loss": 0.0009825531160458922,
1348
- "step": 13500
1349
- },
1350
- {
1351
- "epoch": 0.4,
1352
- "eval_loss": 0.008024024777114391,
1353
- "eval_r_loss": 0.007059826515614986,
1354
- "eval_runtime": 5.9301,
1355
- "eval_s_loss": 0.0009641979122534394,
1356
- "eval_samples_per_second": 113.321,
1357
- "eval_steps_per_second": 113.321,
1358
- "step": 13500
1359
- },
1360
- {
1361
- "epoch": 0.41,
1362
- "learning_rate": 2.8298923380544406e-05,
1363
- "loss": 0.008,
1364
- "r_loss": 0.006581292487680912,
1365
- "s_loss": 0.0010704277083277702,
1366
- "step": 13600
1367
- },
1368
- {
1369
- "epoch": 0.41,
1370
- "learning_rate": 2.8205037184519026e-05,
1371
- "loss": 0.008,
1372
- "r_loss": 0.008079946041107178,
1373
- "s_loss": 0.0009939800947904587,
1374
- "step": 13700
1375
- },
1376
- {
1377
- "epoch": 0.41,
1378
- "learning_rate": 2.8108793677805307e-05,
1379
- "loss": 0.0079,
1380
- "r_loss": 0.005696495994925499,
1381
- "s_loss": 0.0009651000145822763,
1382
- "step": 13800
1383
- },
1384
- {
1385
- "epoch": 0.41,
1386
- "learning_rate": 2.8010210041411057e-05,
1387
- "loss": 0.008,
1388
- "r_loss": 0.006050314754247665,
1389
- "s_loss": 0.0010551176965236664,
1390
- "step": 13900
1391
- },
1392
- {
1393
- "epoch": 0.42,
1394
- "learning_rate": 2.7909303874094737e-05,
1395
- "loss": 0.008,
1396
- "r_loss": 0.007171120494604111,
1397
- "s_loss": 0.001050697872415185,
1398
- "step": 14000
1399
- },
1400
- {
1401
- "epoch": 0.42,
1402
- "eval_loss": 0.0087936632335186,
1403
- "eval_r_loss": 0.007837551645934582,
1404
- "eval_runtime": 5.8165,
1405
- "eval_s_loss": 0.0009561114711686969,
1406
- "eval_samples_per_second": 115.534,
1407
- "eval_steps_per_second": 115.534,
1408
- "step": 14000
1409
- },
1410
- {
1411
- "epoch": 0.42,
1412
- "learning_rate": 2.7806093189223774e-05,
1413
- "loss": 0.0079,
1414
- "r_loss": 0.007168681360781193,
1415
- "s_loss": 0.0011435865890234709,
1416
- "step": 14100
1417
- },
1418
- {
1419
- "epoch": 0.42,
1420
- "learning_rate": 2.7700596411558902e-05,
1421
- "loss": 0.0078,
1422
- "r_loss": 0.007676262408494949,
1423
- "s_loss": 0.0010355566628277302,
1424
- "step": 14200
1425
- },
1426
- {
1427
- "epoch": 0.43,
1428
- "learning_rate": 2.7592832373965038e-05,
1429
- "loss": 0.008,
1430
- "r_loss": 0.007139571011066437,
1431
- "s_loss": 0.001015349174849689,
1432
- "step": 14300
1433
- },
1434
- {
1435
- "epoch": 0.43,
1436
- "learning_rate": 2.7482820314049326e-05,
1437
- "loss": 0.0079,
1438
- "r_loss": 0.00660574808716774,
1439
- "s_loss": 0.000994799891486764,
1440
- "step": 14400
1441
- },
1442
- {
1443
- "epoch": 0.43,
1444
- "learning_rate": 2.7370579870726906e-05,
1445
- "loss": 0.0078,
1446
- "r_loss": 0.006421282887458801,
1447
- "s_loss": 0.0009415894746780396,
1448
- "step": 14500
1449
- },
1450
- {
1451
- "epoch": 0.43,
1452
- "eval_loss": 0.008639033883810043,
1453
- "eval_r_loss": 0.007638509385287762,
1454
- "eval_runtime": 5.8664,
1455
- "eval_s_loss": 0.0010005244985222816,
1456
- "eval_samples_per_second": 114.55,
1457
- "eval_steps_per_second": 114.55,
1458
- "step": 14500
1459
- },
1460
- {
1461
- "epoch": 0.44,
1462
- "learning_rate": 2.7256131080715053e-05,
1463
- "loss": 0.0078,
1464
- "r_loss": 0.006864764261990786,
1465
- "s_loss": 0.0010434763971716166,
1466
- "step": 14600
1467
- },
1468
- {
1469
- "epoch": 0.44,
1470
- "learning_rate": 2.7139494374956316e-05,
1471
- "loss": 0.0078,
1472
- "r_loss": 0.00723334401845932,
1473
- "s_loss": 0.0010841034818440676,
1474
- "step": 14700
1475
- },
1476
- {
1477
- "epoch": 0.44,
1478
- "learning_rate": 2.7020690574971236e-05,
1479
- "loss": 0.0078,
1480
- "r_loss": 0.006216096691787243,
1481
- "s_loss": 0.0008721616468392313,
1482
- "step": 14800
1483
- },
1484
- {
1485
- "epoch": 0.44,
1486
- "learning_rate": 2.6899740889141407e-05,
1487
- "loss": 0.0077,
1488
- "r_loss": 0.006137696094810963,
1489
- "s_loss": 0.000969752436503768,
1490
- "step": 14900
1491
- },
1492
- {
1493
- "epoch": 0.45,
1494
- "learning_rate": 2.677666690892343e-05,
1495
- "loss": 0.0077,
1496
- "r_loss": 0.006585408002138138,
1497
- "s_loss": 0.0009770711185410619,
1498
- "step": 15000
1499
- },
1500
- {
1501
- "epoch": 0.45,
1502
- "eval_loss": 0.008090370334684849,
1503
- "eval_r_loss": 0.007137539330869913,
1504
- "eval_runtime": 5.9249,
1505
- "eval_s_loss": 0.0009528312948532403,
1506
- "eval_samples_per_second": 113.42,
1507
- "eval_steps_per_second": 113.42,
1508
- "step": 15000
1509
- },
1510
- {
1511
- "epoch": 0.45,
1512
- "learning_rate": 2.6651490604994458e-05,
1513
- "loss": 0.0076,
1514
- "r_loss": 0.006115331780165434,
1515
- "s_loss": 0.0010219502728432417,
1516
- "step": 15100
1517
- },
1518
- {
1519
- "epoch": 0.45,
1520
- "learning_rate": 2.6524234323330147e-05,
1521
- "loss": 0.0076,
1522
- "r_loss": 0.006430969573557377,
1523
- "s_loss": 0.0010282087605446577,
1524
- "step": 15200
1525
- },
1526
- {
1527
- "epoch": 0.46,
1528
- "learning_rate": 2.6394920781215467e-05,
1529
- "loss": 0.0076,
1530
- "r_loss": 0.010935202240943909,
1531
- "s_loss": 0.001126307644881308,
1532
- "step": 15300
1533
- },
1534
- {
1535
- "epoch": 0.46,
1536
- "learning_rate": 2.6264896532848944e-05,
1537
- "loss": 0.0076,
1538
- "r_loss": 0.006224375218153,
1539
- "s_loss": 0.0009270138689316809,
1540
- "step": 15400
1541
- },
1542
- {
1543
- "epoch": 0.46,
1544
- "learning_rate": 2.6131558076617624e-05,
1545
- "loss": 0.0076,
1546
- "r_loss": 0.007503915578126907,
1547
- "s_loss": 0.001000251155346632,
1548
- "step": 15500
1549
- },
1550
- {
1551
- "epoch": 0.46,
1552
- "eval_loss": 0.007729613222181797,
1553
- "eval_r_loss": 0.006781161762773991,
1554
- "eval_runtime": 8.5719,
1555
- "eval_s_loss": 0.0009484515758231282,
1556
- "eval_samples_per_second": 78.396,
1557
- "eval_steps_per_second": 78.396,
1558
- "step": 15500
1559
- },
1560
- {
1561
- "epoch": 0.47,
1562
- "learning_rate": 2.5996232458936835e-05,
1563
- "loss": 0.0077,
1564
- "r_loss": 0.0065051475539803505,
1565
- "s_loss": 0.0009143096976913512,
1566
- "step": 15600
1567
- },
1568
- {
1569
- "epoch": 0.47,
1570
- "learning_rate": 2.5858943837597314e-05,
1571
- "loss": 0.0075,
1572
- "r_loss": 0.0064110783860087395,
1573
- "s_loss": 0.0009684975957497954,
1574
- "step": 15700
1575
- },
1576
- {
1577
- "epoch": 0.47,
1578
- "learning_rate": 2.5719716720817392e-05,
1579
- "loss": 0.0074,
1580
- "r_loss": 0.007918891496956348,
1581
- "s_loss": 0.001000194693915546,
1582
- "step": 15800
1583
- },
1584
- {
1585
- "epoch": 0.47,
1586
- "learning_rate": 2.5578575962867906e-05,
1587
- "loss": 0.0075,
1588
- "r_loss": 0.007117274217307568,
1589
- "s_loss": 0.0009594152215868235,
1590
- "step": 15900
1591
- },
1592
- {
1593
- "epoch": 0.48,
1594
- "learning_rate": 2.543554675963528e-05,
1595
- "loss": 0.0075,
1596
- "r_loss": 0.006600790657103062,
1597
- "s_loss": 0.0009607726242393255,
1598
- "step": 16000
1599
- },
1600
- {
1601
- "epoch": 0.48,
1602
- "eval_loss": 0.007644696161150932,
1603
- "eval_r_loss": 0.006735973991453648,
1604
- "eval_runtime": 8.488,
1605
- "eval_s_loss": 0.0009087221696972847,
1606
- "eval_samples_per_second": 79.171,
1607
- "eval_steps_per_second": 79.171,
1608
- "step": 16000
1609
- },
1610
- {
1611
- "epoch": 0.48,
1612
- "learning_rate": 2.5290654644123703e-05,
1613
- "loss": 0.0075,
1614
- "r_loss": 0.005946667864918709,
1615
- "s_loss": 0.0009118504240177572,
1616
- "step": 16100
1617
- },
1618
- {
1619
- "epoch": 0.48,
1620
- "learning_rate": 2.5143925481897017e-05,
1621
- "loss": 0.0076,
1622
- "r_loss": 0.005937603302299976,
1623
- "s_loss": 0.0009287346038036048,
1624
- "step": 16200
1625
- },
1626
- {
1627
- "epoch": 0.49,
1628
- "learning_rate": 2.499538546646136e-05,
1629
- "loss": 0.0073,
1630
- "r_loss": 0.0064862994477152824,
1631
- "s_loss": 0.0008813185268081725,
1632
- "step": 16300
1633
- },
1634
- {
1635
- "epoch": 0.49,
1636
- "learning_rate": 2.4845061114589165e-05,
1637
- "loss": 0.0074,
1638
- "r_loss": 0.006060485728085041,
1639
- "s_loss": 0.0009768878808245063,
1640
- "step": 16400
1641
- },
1642
- {
1643
- "epoch": 0.49,
1644
- "learning_rate": 2.4692979261585507e-05,
1645
- "loss": 0.0074,
1646
- "r_loss": 0.005972530692815781,
1647
- "s_loss": 0.0009232640732079744,
1648
- "step": 16500
1649
- },
1650
- {
1651
- "epoch": 0.49,
1652
- "eval_loss": 0.007525917608290911,
1653
- "eval_r_loss": 0.006600276567041874,
1654
- "eval_runtime": 8.5748,
1655
- "eval_s_loss": 0.000925640866626054,
1656
- "eval_samples_per_second": 78.369,
1657
- "eval_steps_per_second": 78.369,
1658
- "step": 16500
1659
- },
1660
- {
1661
- "epoch": 0.5,
1662
- "learning_rate": 2.4539167056497572e-05,
1663
- "loss": 0.0075,
1664
- "r_loss": 0.006335953716188669,
1665
- "s_loss": 0.0009909672662615776,
1666
- "step": 16600
1667
- },
1668
- {
1669
- "epoch": 0.5,
1670
- "learning_rate": 2.4383651957268106e-05,
1671
- "loss": 0.0074,
1672
- "r_loss": 0.0072190104983747005,
1673
- "s_loss": 0.0009832193609327078,
1674
- "step": 16700
1675
- },
1676
- {
1677
- "epoch": 0.5,
1678
- "learning_rate": 2.4226461725833757e-05,
1679
- "loss": 0.0075,
1680
- "r_loss": 0.0063974312506616116,
1681
- "s_loss": 0.0009214280871674418,
1682
- "step": 16800
1683
- },
1684
- {
1685
- "epoch": 0.5,
1686
- "learning_rate": 2.4067624423169087e-05,
1687
- "loss": 0.0074,
1688
- "r_loss": 0.006921318359673023,
1689
- "s_loss": 0.000975217146333307,
1690
- "step": 16900
1691
- },
1692
- {
1693
- "epoch": 0.51,
1694
- "learning_rate": 2.3907168404277275e-05,
1695
- "loss": 0.0072,
1696
- "r_loss": 0.005746101029217243,
1697
- "s_loss": 0.0009283066028729081,
1698
- "step": 17000
1699
- },
1700
- {
1701
- "epoch": 0.51,
1702
- "eval_loss": 0.006981786340475082,
1703
- "eval_r_loss": 0.006118214689195156,
1704
- "eval_runtime": 9.4285,
1705
- "eval_s_loss": 0.0008635715930722654,
1706
- "eval_samples_per_second": 71.273,
1707
- "eval_steps_per_second": 71.273,
1708
- "step": 17000
1709
- },
1710
- {
1711
- "epoch": 0.51,
1712
- "learning_rate": 2.3745122313128274e-05,
1713
- "loss": 0.0072,
1714
- "r_loss": 0.005779191851615906,
1715
- "s_loss": 0.0009009677451103926,
1716
- "step": 17100
1717
- },
1718
- {
1719
- "epoch": 0.51,
1720
- "learning_rate": 2.3581515077545418e-05,
1721
- "loss": 0.0072,
1722
- "r_loss": 0.007839716970920563,
1723
- "s_loss": 0.0010175108909606934,
1724
- "step": 17200
1725
- },
1726
- {
1727
- "epoch": 0.52,
1728
- "learning_rate": 2.34163759040413e-05,
1729
- "loss": 0.0074,
1730
- "r_loss": 0.006432846188545227,
1731
- "s_loss": 0.001021060859784484,
1732
- "step": 17300
1733
- },
1734
- {
1735
- "epoch": 0.52,
1736
- "learning_rate": 2.324973427260402e-05,
1737
- "loss": 0.0073,
1738
- "r_loss": 0.0062257954850792885,
1739
- "s_loss": 0.0009748931624926627,
1740
- "step": 17400
1741
- },
1742
- {
1743
- "epoch": 0.52,
1744
- "learning_rate": 2.3081619931434452e-05,
1745
- "loss": 0.0072,
1746
- "r_loss": 0.006158421281725168,
1747
- "s_loss": 0.0010701502906158566,
1748
- "step": 17500
1749
- },
1750
- {
1751
- "epoch": 0.52,
1752
- "eval_loss": 0.007478422485291958,
1753
- "eval_r_loss": 0.006587797310203314,
1754
- "eval_runtime": 10.1462,
1755
- "eval_s_loss": 0.0008906250586733222,
1756
- "eval_samples_per_second": 66.232,
1757
- "eval_steps_per_second": 66.232,
1758
- "step": 17500
1759
- },
1760
- {
1761
- "epoch": 0.53,
1762
- "learning_rate": 2.2912062891635778e-05,
1763
- "loss": 0.0072,
1764
- "r_loss": 0.006115030962973833,
1765
- "s_loss": 0.0009346662554889917,
1766
- "step": 17600
1767
- },
1768
- {
1769
- "epoch": 0.53,
1770
- "learning_rate": 2.274109342185598e-05,
1771
- "loss": 0.0074,
1772
- "r_loss": 0.00592977087944746,
1773
- "s_loss": 0.000866447517182678,
1774
- "step": 17700
1775
- },
1776
- {
1777
- "epoch": 0.53,
1778
- "learning_rate": 2.256874204288442e-05,
1779
- "loss": 0.0071,
1780
- "r_loss": 0.006099973805248737,
1781
- "s_loss": 0.0008897872176021338,
1782
- "step": 17800
1783
- },
1784
- {
1785
- "epoch": 0.53,
1786
- "learning_rate": 2.2395039522203403e-05,
1787
- "loss": 0.0073,
1788
- "r_loss": 0.006193576380610466,
1789
- "s_loss": 0.0008475868962705135,
1790
- "step": 17900
1791
- },
1792
- {
1793
- "epoch": 0.54,
1794
- "learning_rate": 2.222001686849566e-05,
1795
- "loss": 0.0071,
1796
- "r_loss": 0.0067910789512097836,
1797
- "s_loss": 0.0010297299595549703,
1798
- "step": 18000
1799
- },
1800
- {
1801
- "epoch": 0.54,
1802
- "eval_loss": 0.007175224833190441,
1803
- "eval_r_loss": 0.006267193704843521,
1804
- "eval_runtime": 8.8162,
1805
- "eval_s_loss": 0.0009080312447622418,
1806
- "eval_samples_per_second": 76.223,
1807
- "eval_steps_per_second": 76.223,
1808
- "step": 18000
1809
- },
1810
- {
1811
- "epoch": 0.54,
1812
- "learning_rate": 2.2043705326108824e-05,
1813
- "loss": 0.0072,
1814
- "r_loss": 0.006607139483094215,
1815
- "s_loss": 0.0009428428602404892,
1816
- "step": 18100
1817
- },
1818
- {
1819
- "epoch": 0.54,
1820
- "learning_rate": 2.1866136369477807e-05,
1821
- "loss": 0.0071,
1822
- "r_loss": 0.0053857965394854546,
1823
- "s_loss": 0.0008554364321753383,
1824
- "step": 18200
1825
- },
1826
- {
1827
- "epoch": 0.55,
1828
- "learning_rate": 2.1687341697506106e-05,
1829
- "loss": 0.0071,
1830
- "r_loss": 0.005985291674733162,
1831
- "s_loss": 0.0008826229604892433,
1832
- "step": 18300
1833
- },
1834
- {
1835
- "epoch": 0.55,
1836
- "learning_rate": 2.150735322790704e-05,
1837
- "loss": 0.0071,
1838
- "r_loss": 0.005803743377327919,
1839
- "s_loss": 0.0009314118069596589,
1840
- "step": 18400
1841
- },
1842
- {
1843
- "epoch": 0.55,
1844
- "learning_rate": 2.1326203091505936e-05,
1845
- "loss": 0.0071,
1846
- "r_loss": 0.007132242433726788,
1847
- "s_loss": 0.0009220225038006902,
1848
- "step": 18500
1849
- },
1850
- {
1851
- "epoch": 0.55,
1852
- "eval_loss": 0.007128148805350065,
1853
- "eval_r_loss": 0.006268758792430162,
1854
- "eval_runtime": 8.6423,
1855
- "eval_s_loss": 0.0008593900711275637,
1856
- "eval_samples_per_second": 77.757,
1857
- "eval_steps_per_second": 77.757,
1858
- "step": 18500
1859
- },
1860
- {
1861
- "epoch": 0.56,
1862
- "learning_rate": 2.114392362650425e-05,
1863
- "loss": 0.007,
1864
- "r_loss": 0.006902765482664108,
1865
- "s_loss": 0.0009045482147485018,
1866
- "step": 18600
1867
- },
1868
- {
1869
- "epoch": 0.56,
1870
- "learning_rate": 2.096054737270669e-05,
1871
- "loss": 0.0071,
1872
- "r_loss": 0.006586451083421707,
1873
- "s_loss": 0.000991364591754973,
1874
- "step": 18700
1875
- },
1876
- {
1877
- "epoch": 0.56,
1878
- "learning_rate": 2.0776107065712326e-05,
1879
- "loss": 0.007,
1880
- "r_loss": 0.004980746190994978,
1881
- "s_loss": 0.0007370096282102168,
1882
- "step": 18800
1883
- },
1884
- {
1885
- "epoch": 0.56,
1886
- "learning_rate": 2.059063563107079e-05,
1887
- "loss": 0.0071,
1888
- "r_loss": 0.006382007151842117,
1889
- "s_loss": 0.0008771989960223436,
1890
- "step": 18900
1891
- },
1892
- {
1893
- "epoch": 0.57,
1894
- "learning_rate": 2.040416617840449e-05,
1895
- "loss": 0.007,
1896
- "r_loss": 0.0058512031100690365,
1897
- "s_loss": 0.0008522871066816151,
1898
- "step": 19000
1899
- },
1900
- {
1901
- "epoch": 0.57,
1902
- "eval_loss": 0.007616510149091482,
1903
- "eval_r_loss": 0.006736051291227341,
1904
- "eval_runtime": 8.8015,
1905
- "eval_s_loss": 0.0008804587414488196,
1906
- "eval_samples_per_second": 76.35,
1907
- "eval_steps_per_second": 76.35,
1908
- "step": 19000
1909
- },
1910
- {
1911
- "epoch": 0.57,
1912
- "learning_rate": 2.021673199549806e-05,
1913
- "loss": 0.007,
1914
- "r_loss": 0.005660332273691893,
1915
- "s_loss": 0.0009998377645388246,
1916
- "step": 19100
1917
- },
1918
- {
1919
- "epoch": 0.57,
1920
- "learning_rate": 2.002836654235594e-05,
1921
- "loss": 0.0069,
1922
- "r_loss": 0.005866233725100756,
1923
- "s_loss": 0.0009188687545247376,
1924
- "step": 19200
1925
- },
1926
- {
1927
- "epoch": 0.58,
1928
- "learning_rate": 1.983910344522921e-05,
1929
- "loss": 0.0069,
1930
- "r_loss": 0.0057606166228652,
1931
- "s_loss": 0.000834951177239418,
1932
- "step": 19300
1933
- },
1934
- {
1935
- "epoch": 0.58,
1936
- "learning_rate": 1.9648976490612795e-05,
1937
- "loss": 0.0067,
1938
- "r_loss": 0.004965795204043388,
1939
- "s_loss": 0.0008518850081600249,
1940
- "step": 19400
1941
- },
1942
- {
1943
- "epoch": 0.58,
1944
- "learning_rate": 1.9459933184205116e-05,
1945
- "loss": 0.0069,
1946
- "r_loss": 0.006777866743505001,
1947
- "s_loss": 0.0010352524695917964,
1948
- "step": 19500
1949
- },
1950
- {
1951
- "epoch": 0.58,
1952
- "eval_loss": 0.007355178706347942,
1953
- "eval_r_loss": 0.006485129706561565,
1954
- "eval_runtime": 8.9082,
1955
- "eval_s_loss": 0.0008700488251633942,
1956
- "eval_samples_per_second": 75.436,
1957
- "eval_steps_per_second": 75.436,
1958
- "step": 19500
1959
- },
1960
- {
1961
- "epoch": 0.58,
1962
- "learning_rate": 1.9268188273952553e-05,
1963
- "loss": 0.0068,
1964
- "r_loss": 0.007018654141575098,
1965
- "s_loss": 0.0009793075732886791,
1966
- "step": 19600
1967
- },
1968
- {
1969
- "epoch": 0.59,
1970
- "learning_rate": 1.9075681423714705e-05,
1971
- "loss": 0.0069,
1972
- "r_loss": 0.006022634916007519,
1973
- "s_loss": 0.0008647244540043175,
1974
- "step": 19700
1975
- },
1976
- {
1977
- "epoch": 0.59,
1978
- "learning_rate": 1.8882446999048395e-05,
1979
- "loss": 0.0068,
1980
- "r_loss": 0.005010883789509535,
1981
- "s_loss": 0.0008228466031141579,
1982
- "step": 19800
1983
- },
1984
- {
1985
- "epoch": 0.59,
1986
- "learning_rate": 1.8688519495394125e-05,
1987
- "loss": 0.0069,
1988
- "r_loss": 0.006728707812726498,
1989
- "s_loss": 0.000956969684921205,
1990
- "step": 19900
1991
- },
1992
- {
1993
- "epoch": 0.6,
1994
- "learning_rate": 1.8493933531918117e-05,
1995
- "loss": 0.0068,
1996
- "r_loss": 0.007086700294166803,
1997
- "s_loss": 0.0008907000883482397,
1998
- "step": 20000
1999
- },
2000
- {
2001
- "epoch": 0.6,
2002
- "eval_loss": 0.006719064898788929,
2003
- "eval_r_loss": 0.005855009891092777,
2004
- "eval_runtime": 8.5414,
2005
- "eval_s_loss": 0.0008640547748655081,
2006
- "eval_samples_per_second": 78.676,
2007
- "eval_steps_per_second": 78.676,
2008
- "step": 20000
2009
- },
2010
- {
2011
- "epoch": 0.6,
2012
- "learning_rate": 1.8298723845332198e-05,
2013
- "loss": 0.0067,
2014
- "r_loss": 0.005535440053790808,
2015
- "s_loss": 0.0009100943570956588,
2016
- "step": 20100
2017
- },
2018
- {
2019
- "epoch": 0.6,
2020
- "learning_rate": 1.8102925283692782e-05,
2021
- "loss": 0.0068,
2022
- "r_loss": 0.006036119069904089,
2023
- "s_loss": 0.0008750570705160499,
2024
- "step": 20200
2025
- },
2026
- {
2027
- "epoch": 0.61,
2028
- "learning_rate": 1.79065728001799e-05,
2029
- "loss": 0.0068,
2030
- "r_loss": 0.005460575222969055,
2031
- "s_loss": 0.0009543396299704909,
2032
- "step": 20300
2033
- },
2034
- {
2035
- "epoch": 0.61,
2036
- "learning_rate": 1.7709701446857527e-05,
2037
- "loss": 0.0068,
2038
- "r_loss": 0.005476498045027256,
2039
- "s_loss": 0.0008106306777335703,
2040
- "step": 20400
2041
- },
2042
- {
2043
- "epoch": 0.61,
2044
- "learning_rate": 1.75123463684162e-05,
2045
- "loss": 0.0069,
2046
- "r_loss": 0.006093060597777367,
2047
- "s_loss": 0.0009121097973547876,
2048
- "step": 20500
2049
- },
2050
- {
2051
- "epoch": 0.61,
2052
- "eval_loss": 0.006673221942037344,
2053
- "eval_r_loss": 0.005839366465806961,
2054
- "eval_runtime": 8.4759,
2055
- "eval_s_loss": 0.0008338554762303829,
2056
- "eval_samples_per_second": 79.284,
2057
- "eval_steps_per_second": 79.284,
2058
- "step": 20500
2059
- },
2060
- {
2061
- "epoch": 0.61,
2062
- "learning_rate": 1.7314542795899137e-05,
2063
- "loss": 0.0067,
2064
- "r_loss": 0.005582145415246487,
2065
- "s_loss": 0.0008446918218396604,
2066
- "step": 20600
2067
- },
2068
- {
2069
- "epoch": 0.62,
2070
- "learning_rate": 1.7116326040412943e-05,
2071
- "loss": 0.0067,
2072
- "r_loss": 0.005406418815255165,
2073
- "s_loss": 0.0010036693420261145,
2074
- "step": 20700
2075
- },
2076
- {
2077
- "epoch": 0.62,
2078
- "learning_rate": 1.6917731486823998e-05,
2079
- "loss": 0.0068,
2080
- "r_loss": 0.005483163520693779,
2081
- "s_loss": 0.0009342134580947459,
2082
- "step": 20800
2083
- },
2084
- {
2085
- "epoch": 0.62,
2086
- "learning_rate": 1.6718794587441696e-05,
2087
- "loss": 0.0067,
2088
- "r_loss": 0.005454606376588345,
2089
- "s_loss": 0.0008848806610330939,
2090
- "step": 20900
2091
- },
2092
- {
2093
- "epoch": 0.63,
2094
- "learning_rate": 1.6519550855689638e-05,
2095
- "loss": 0.0067,
2096
- "r_loss": 0.005080068949609995,
2097
- "s_loss": 0.0008759861811995506,
2098
- "step": 21000
2099
- },
2100
- {
2101
- "epoch": 0.63,
2102
- "eval_loss": 0.006933785974979401,
2103
- "eval_r_loss": 0.0060889944434165955,
2104
- "eval_runtime": 8.6029,
2105
- "eval_s_loss": 0.0008447913569398224,
2106
- "eval_samples_per_second": 78.113,
2107
- "eval_steps_per_second": 78.113,
2108
- "step": 21000
2109
- },
2110
- {
2111
- "epoch": 0.63,
2112
- "learning_rate": 1.6320035859765918e-05,
2113
- "loss": 0.0068,
2114
- "r_loss": 0.006143931299448013,
2115
- "s_loss": 0.000945593579672277,
2116
- "step": 21100
2117
- },
2118
- {
2119
- "epoch": 0.63,
2120
- "learning_rate": 1.612028521629364e-05,
2121
- "loss": 0.0069,
2122
- "r_loss": 0.006441016681492329,
2123
- "s_loss": 0.0009456037660129368,
2124
- "step": 21200
2125
- },
2126
- {
2127
- "epoch": 0.64,
2128
- "learning_rate": 1.5920334583962753e-05,
2129
- "loss": 0.0067,
2130
- "r_loss": 0.006160522345453501,
2131
- "s_loss": 0.0008229748345911503,
2132
- "step": 21300
2133
- },
2134
- {
2135
- "epoch": 0.64,
2136
- "learning_rate": 1.5720219657164435e-05,
2137
- "loss": 0.0067,
2138
- "r_loss": 0.005891709588468075,
2139
- "s_loss": 0.000853882054798305,
2140
- "step": 21400
2141
- },
2142
- {
2143
- "epoch": 0.64,
2144
- "learning_rate": 1.551997615961906e-05,
2145
- "loss": 0.0067,
2146
- "r_loss": 0.005442744120955467,
2147
- "s_loss": 0.0008191297529265285,
2148
- "step": 21500
2149
- },
2150
- {
2151
- "epoch": 0.64,
2152
- "eval_loss": 0.007066499907523394,
2153
- "eval_r_loss": 0.006228615529835224,
2154
- "eval_runtime": 10.9272,
2155
- "eval_s_loss": 0.0008378842030651867,
2156
- "eval_samples_per_second": 61.498,
2157
- "eval_steps_per_second": 61.498,
2158
- "step": 21500
2159
- },
2160
- {
2161
- "epoch": 0.64,
2162
- "learning_rate": 1.5319639837998926e-05,
2163
- "loss": 0.0066,
2164
- "r_loss": 0.006707844324409962,
2165
- "s_loss": 0.0009414084488525987,
2166
- "step": 21600
2167
- },
2168
- {
2169
- "epoch": 0.65,
2170
- "learning_rate": 1.5119246455546931e-05,
2171
- "loss": 0.0066,
2172
- "r_loss": 0.0060582030564546585,
2173
- "s_loss": 0.0009419742273166776,
2174
- "step": 21700
2175
- },
2176
- {
2177
- "epoch": 0.65,
2178
- "learning_rate": 1.4918831785692232e-05,
2179
- "loss": 0.0067,
2180
- "r_loss": 0.006246947217732668,
2181
- "s_loss": 0.0008474804344587028,
2182
- "step": 21800
2183
- },
2184
- {
2185
- "epoch": 0.65,
2186
- "learning_rate": 1.4718431605664146e-05,
2187
- "loss": 0.0066,
2188
- "r_loss": 0.00554366409778595,
2189
- "s_loss": 0.000851424119900912,
2190
- "step": 21900
2191
- },
2192
- {
2193
- "epoch": 0.66,
2194
- "learning_rate": 1.4518081690105308e-05,
2195
- "loss": 0.0065,
2196
- "r_loss": 0.005924141500145197,
2197
- "s_loss": 0.0008292071870528162,
2198
- "step": 22000
2199
- },
2200
- {
2201
- "epoch": 0.66,
2202
- "eval_loss": 0.006922111380845308,
2203
- "eval_r_loss": 0.00609211903065443,
2204
- "eval_runtime": 8.5409,
2205
- "eval_s_loss": 0.0008299925248138607,
2206
- "eval_samples_per_second": 78.68,
2207
- "eval_steps_per_second": 78.68,
2208
- "step": 22000
2209
- },
2210
- {
2211
- "epoch": 0.66,
2212
- "learning_rate": 1.431781780468534e-05,
2213
- "loss": 0.0066,
2214
- "r_loss": 0.006054874509572983,
2215
- "s_loss": 0.0008940041880123317,
2216
- "step": 22100
2217
- },
2218
- {
2219
- "epoch": 0.66,
2220
- "learning_rate": 1.4119676400616625e-05,
2221
- "loss": 0.0065,
2222
- "r_loss": 0.005405670963227749,
2223
- "s_loss": 0.0009109702077694237,
2224
- "step": 22200
2225
- },
2226
- {
2227
- "epoch": 0.67,
2228
- "learning_rate": 1.3919690052810628e-05,
2229
- "loss": 0.0064,
2230
- "r_loss": 0.004920615814626217,
2231
- "s_loss": 0.0008736539166420698,
2232
- "step": 22300
2233
- },
2234
- {
2235
- "epoch": 0.67,
2236
- "learning_rate": 1.3719896557637283e-05,
2237
- "loss": 0.0064,
2238
- "r_loss": 0.0057085175067186356,
2239
- "s_loss": 0.0008784402743913233,
2240
- "step": 22400
2241
- },
2242
- {
2243
- "epoch": 0.67,
2244
- "learning_rate": 1.3520331581436254e-05,
2245
- "loss": 0.0065,
2246
- "r_loss": 0.004970056004822254,
2247
- "s_loss": 0.0008474025526084006,
2248
- "step": 22500
2249
- },
2250
- {
2251
- "epoch": 0.67,
2252
- "eval_loss": 0.00664812745526433,
2253
- "eval_r_loss": 0.005824685096740723,
2254
- "eval_runtime": 8.4868,
2255
- "eval_s_loss": 0.0008234424167312682,
2256
- "eval_samples_per_second": 79.181,
2257
- "eval_steps_per_second": 79.181,
2258
- "step": 22500
2259
- },
2260
- {
2261
- "epoch": 0.67,
2262
- "learning_rate": 1.3321030749752928e-05,
2263
- "loss": 0.0065,
2264
- "r_loss": 0.006657294929027557,
2265
- "s_loss": 0.0008960987906903028,
2266
- "step": 22600
2267
- },
2268
- {
2269
- "epoch": 0.68,
2270
- "learning_rate": 1.3122029640978642e-05,
2271
- "loss": 0.0065,
2272
- "r_loss": 0.0066483840346336365,
2273
- "s_loss": 0.0008470122702419758,
2274
- "step": 22700
2275
- },
2276
- {
2277
- "epoch": 0.68,
2278
- "learning_rate": 1.2923363779999415e-05,
2279
- "loss": 0.0064,
2280
- "r_loss": 0.006104170344769955,
2281
- "s_loss": 0.000873480923473835,
2282
- "step": 22800
2283
- },
2284
- {
2285
- "epoch": 0.68,
2286
- "learning_rate": 1.2725068631854143e-05,
2287
- "loss": 0.0065,
2288
- "r_loss": 0.0065166400745511055,
2289
- "s_loss": 0.0008359896601177752,
2290
- "step": 22900
2291
- },
2292
- {
2293
- "epoch": 0.69,
2294
- "learning_rate": 1.2527179595403555e-05,
2295
- "loss": 0.0065,
2296
- "r_loss": 0.005165203474462032,
2297
- "s_loss": 0.0008489371393807232,
2298
- "step": 23000
2299
- },
2300
- {
2301
- "epoch": 0.69,
2302
- "eval_loss": 0.007038415875285864,
2303
- "eval_r_loss": 0.006207308266311884,
2304
- "eval_runtime": 8.684,
2305
- "eval_s_loss": 0.00083110760897398,
2306
- "eval_samples_per_second": 77.384,
2307
- "eval_steps_per_second": 77.384,
2308
- "step": 23000
2309
- },
2310
- {
2311
- "epoch": 0.69,
2312
- "learning_rate": 1.2329731997010932e-05,
2313
- "loss": 0.0064,
2314
- "r_loss": 0.005528077483177185,
2315
- "s_loss": 0.0008984919404610991,
2316
- "step": 23100
2317
- },
2318
- {
2319
- "epoch": 0.69,
2320
- "learning_rate": 1.2132761084235799e-05,
2321
- "loss": 0.0064,
2322
- "r_loss": 0.005268075503408909,
2323
- "s_loss": 0.0008533818763680756,
2324
- "step": 23200
2325
- },
2326
- {
2327
- "epoch": 0.7,
2328
- "learning_rate": 1.1936302019541638e-05,
2329
- "loss": 0.0065,
2330
- "r_loss": 0.004923930391669273,
2331
- "s_loss": 0.0008330261334776878,
2332
- "step": 23300
2333
- },
2334
- {
2335
- "epoch": 0.7,
2336
- "learning_rate": 1.1740389874018872e-05,
2337
- "loss": 0.0064,
2338
- "r_loss": 0.0052954284474253654,
2339
- "s_loss": 0.0008716843440197408,
2340
- "step": 23400
2341
- },
2342
- {
2343
- "epoch": 0.7,
2344
- "learning_rate": 1.1545059621124078e-05,
2345
- "loss": 0.0064,
2346
- "r_loss": 0.005905309226363897,
2347
- "s_loss": 0.0008605217444710433,
2348
- "step": 23500
2349
- },
2350
- {
2351
- "epoch": 0.7,
2352
- "eval_loss": 0.006766035221517086,
2353
- "eval_r_loss": 0.005948369391262531,
2354
- "eval_runtime": 9.2172,
2355
- "eval_s_loss": 0.0008176658302545547,
2356
- "eval_samples_per_second": 72.908,
2357
- "eval_steps_per_second": 72.908,
2358
- "step": 23500
2359
- },
2360
- {
2361
- "epoch": 0.7,
2362
- "learning_rate": 1.1350346130436692e-05,
2363
- "loss": 0.0064,
2364
- "r_loss": 0.005245131440460682,
2365
- "s_loss": 0.0008455686620436609,
2366
- "step": 23600
2367
- },
2368
- {
2369
- "epoch": 0.71,
2370
- "learning_rate": 1.1156284161434186e-05,
2371
- "loss": 0.0064,
2372
- "r_loss": 0.006068192422389984,
2373
- "s_loss": 0.0008869940065778792,
2374
- "step": 23700
2375
- },
2376
- {
2377
- "epoch": 0.71,
2378
- "learning_rate": 1.0962908357287e-05,
2379
- "loss": 0.0063,
2380
- "r_loss": 0.00627498421818018,
2381
- "s_loss": 0.0008408837020397186,
2382
- "step": 23800
2383
- },
2384
- {
2385
- "epoch": 0.71,
2386
- "learning_rate": 1.0770253238674148e-05,
2387
- "loss": 0.0065,
2388
- "r_loss": 0.006341907661408186,
2389
- "s_loss": 0.0008763980586081743,
2390
- "step": 23900
2391
- },
2392
- {
2393
- "epoch": 0.72,
2394
- "learning_rate": 1.0578353197620722e-05,
2395
- "loss": 0.0064,
2396
- "r_loss": 0.005438666325062513,
2397
- "s_loss": 0.0008361585787497461,
2398
- "step": 24000
2399
- },
2400
- {
2401
- "epoch": 0.72,
2402
- "eval_loss": 0.006421332713216543,
2403
- "eval_r_loss": 0.005617132410407066,
2404
- "eval_runtime": 8.6645,
2405
- "eval_s_loss": 0.000804200186394155,
2406
- "eval_samples_per_second": 77.558,
2407
- "eval_steps_per_second": 77.558,
2408
- "step": 24000
2409
- },
2410
- {
2411
- "epoch": 0.72,
2412
- "learning_rate": 1.0387242491358379e-05,
2413
- "loss": 0.0064,
2414
- "r_loss": 0.004842773545533419,
2415
- "s_loss": 0.0008286428637802601,
2416
- "step": 24100
2417
- },
2418
- {
2419
- "epoch": 0.72,
2420
- "learning_rate": 1.0196955236209875e-05,
2421
- "loss": 0.0064,
2422
- "r_loss": 0.005401437636464834,
2423
- "s_loss": 0.0008410606533288956,
2424
- "step": 24200
2425
- },
2426
- {
2427
- "epoch": 0.73,
2428
- "learning_rate": 1.0007525401498747e-05,
2429
- "loss": 0.0064,
2430
- "r_loss": 0.005490908399224281,
2431
- "s_loss": 0.0009162276983261108,
2432
- "step": 24300
2433
- },
2434
- {
2435
- "epoch": 0.73,
2436
- "learning_rate": 9.818986803485237e-06,
2437
- "loss": 0.0064,
2438
- "r_loss": 0.004949535708874464,
2439
- "s_loss": 0.0008561740978620946,
2440
- "step": 24400
2441
- },
2442
- {
2443
- "epoch": 0.73,
2444
- "learning_rate": 9.63137309932957e-06,
2445
- "loss": 0.0063,
2446
- "r_loss": 0.004786663688719273,
2447
- "s_loss": 0.0008052530465647578,
2448
- "step": 24500
2449
- },
2450
- {
2451
- "epoch": 0.73,
2452
- "eval_loss": 0.006642198655754328,
2453
- "eval_r_loss": 0.005832642316818237,
2454
- "eval_runtime": 8.6342,
2455
- "eval_s_loss": 0.0008095565135590732,
2456
- "eval_samples_per_second": 77.83,
2457
- "eval_steps_per_second": 77.83,
2458
- "step": 24500
2459
- },
2460
- {
2461
- "epoch": 0.73,
2462
- "learning_rate": 9.444717781083603e-06,
2463
- "loss": 0.0064,
2464
- "r_loss": 0.004507116507738829,
2465
- "s_loss": 0.0007864255458116531,
2466
- "step": 24600
2467
- },
2468
- {
2469
- "epoch": 0.74,
2470
- "learning_rate": 9.25905416971195e-06,
2471
- "loss": 0.0063,
2472
- "r_loss": 0.005326538346707821,
2473
- "s_loss": 0.0008638648432679474,
2474
- "step": 24700
2475
- },
2476
- {
2477
- "epoch": 0.74,
2478
- "learning_rate": 9.074415409143651e-06,
2479
- "loss": 0.0064,
2480
- "r_loss": 0.005989129655063152,
2481
- "s_loss": 0.0009261829545721412,
2482
- "step": 24800
2483
- },
2484
- {
2485
- "epoch": 0.74,
2486
- "learning_rate": 8.890834460355467e-06,
2487
- "loss": 0.0063,
2488
- "r_loss": 0.004806933458894491,
2489
- "s_loss": 0.0008874195045791566,
2490
- "step": 24900
2491
- },
2492
- {
2493
- "epoch": 0.75,
2494
- "learning_rate": 8.708344095487813e-06,
2495
- "loss": 0.0063,
2496
- "r_loss": 0.0051068831235170364,
2497
- "s_loss": 0.000819625158328563,
2498
- "step": 25000
2499
- },
2500
- {
2501
- "epoch": 0.75,
2502
- "eval_loss": 0.006493990775197744,
2503
- "eval_r_loss": 0.005688361823558807,
2504
- "eval_runtime": 8.6045,
2505
- "eval_s_loss": 0.000805628951638937,
2506
- "eval_samples_per_second": 78.098,
2507
- "eval_steps_per_second": 78.098,
2508
- "step": 25000
2509
- },
2510
- {
2511
- "epoch": 0.75,
2512
- "learning_rate": 8.526976891994414e-06,
2513
- "loss": 0.0062,
2514
- "r_loss": 0.006579666864126921,
2515
- "s_loss": 0.0009487958159297705,
2516
- "step": 25100
2517
- },
2518
- {
2519
- "epoch": 0.75,
2520
- "learning_rate": 8.346765226826655e-06,
2521
- "loss": 0.0063,
2522
- "r_loss": 0.004679057281464338,
2523
- "s_loss": 0.0007369701052084565,
2524
- "step": 25200
2525
- },
2526
- {
2527
- "epoch": 0.75,
2528
- "learning_rate": 8.167741270653863e-06,
2529
- "loss": 0.0062,
2530
- "r_loss": 0.005413247272372246,
2531
- "s_loss": 0.000827790005132556,
2532
- "step": 25300
2533
- },
2534
- {
2535
- "epoch": 0.76,
2536
- "learning_rate": 7.989936982120253e-06,
2537
- "loss": 0.0062,
2538
- "r_loss": 0.0051015885546803474,
2539
- "s_loss": 0.0007679238333366811,
2540
- "step": 25400
2541
- },
2542
- {
2543
- "epoch": 0.76,
2544
- "learning_rate": 7.813384102139837e-06,
2545
- "loss": 0.0062,
2546
- "r_loss": 0.007656958419829607,
2547
- "s_loss": 0.0009585011284798384,
2548
- "step": 25500
2549
- },
2550
- {
2551
- "epoch": 0.76,
2552
- "eval_loss": 0.006621798500418663,
2553
- "eval_r_loss": 0.005829837638884783,
2554
- "eval_runtime": 8.6093,
2555
- "eval_s_loss": 0.0007919610943645239,
2556
- "eval_samples_per_second": 78.055,
2557
- "eval_steps_per_second": 78.055,
2558
- "step": 25500
2559
- },
2560
- {
2561
- "epoch": 0.76,
2562
- "learning_rate": 7.63811414823016e-06,
2563
- "loss": 0.0063,
2564
- "r_loss": 0.00443157646805048,
2565
- "s_loss": 0.0008406736305914819,
2566
- "step": 25600
2567
- },
2568
- {
2569
- "epoch": 0.77,
2570
- "learning_rate": 7.464158408885898e-06,
2571
- "loss": 0.0061,
2572
- "r_loss": 0.005125071853399277,
2573
- "s_loss": 0.0008355857571586967,
2574
- "step": 25700
2575
- },
2576
- {
2577
- "epoch": 0.77,
2578
- "learning_rate": 7.291547937993373e-06,
2579
- "loss": 0.0062,
2580
- "r_loss": 0.00465528666973114,
2581
- "s_loss": 0.000775384483858943,
2582
- "step": 25800
2583
- },
2584
- {
2585
- "epoch": 0.77,
2586
- "learning_rate": 7.1203135492869385e-06,
2587
- "loss": 0.0061,
2588
- "r_loss": 0.004815374501049519,
2589
- "s_loss": 0.0007990074809640646,
2590
- "step": 25900
2591
- },
2592
- {
2593
- "epoch": 0.78,
2594
- "learning_rate": 6.952177025557144e-06,
2595
- "loss": 0.0062,
2596
- "r_loss": 0.004868610296398401,
2597
- "s_loss": 0.0007737652049399912,
2598
- "step": 26000
2599
- },
2600
- {
2601
- "epoch": 0.78,
2602
- "eval_loss": 0.0064270892180502415,
2603
- "eval_r_loss": 0.005634445697069168,
2604
- "eval_runtime": 9.0404,
2605
- "eval_s_loss": 0.0007926435209810734,
2606
- "eval_samples_per_second": 74.333,
2607
- "eval_steps_per_second": 74.333,
2608
- "step": 26000
2609
- },
2610
- {
2611
- "epoch": 0.78,
2612
- "learning_rate": 6.78377173545463e-06,
2613
- "loss": 0.0062,
2614
- "r_loss": 0.004927606321871281,
2615
- "s_loss": 0.0008687089430168271,
2616
- "step": 26100
2617
- },
2618
- {
2619
- "epoch": 0.78,
2620
- "learning_rate": 6.616833173725176e-06,
2621
- "loss": 0.0062,
2622
- "r_loss": 0.004880106542259455,
2623
- "s_loss": 0.0008371942676603794,
2624
- "step": 26200
2625
- },
2626
- {
2627
- "epoch": 0.78,
2628
- "learning_rate": 6.451391141576487e-06,
2629
- "loss": 0.0062,
2630
- "r_loss": 0.004924027249217033,
2631
- "s_loss": 0.0008956523961387575,
2632
- "step": 26300
2633
- },
2634
- {
2635
- "epoch": 0.79,
2636
- "learning_rate": 6.287475173061754e-06,
2637
- "loss": 0.0061,
2638
- "r_loss": 0.005853409878909588,
2639
- "s_loss": 0.0008864904521033168,
2640
- "step": 26400
2641
- },
2642
- {
2643
- "epoch": 0.79,
2644
- "learning_rate": 6.12511452980738e-06,
2645
- "loss": 0.0062,
2646
- "r_loss": 0.0052686696872115135,
2647
- "s_loss": 0.000804445066023618,
2648
- "step": 26500
2649
- },
2650
- {
2651
- "epoch": 0.79,
2652
- "eval_loss": 0.006505042780190706,
2653
- "eval_r_loss": 0.005706477910280228,
2654
- "eval_runtime": 8.801,
2655
- "eval_s_loss": 0.0007985649281181395,
2656
- "eval_samples_per_second": 76.355,
2657
- "eval_steps_per_second": 76.355,
2658
- "step": 26500
2659
- },
2660
- {
2661
- "epoch": 0.79,
2662
- "learning_rate": 5.9643381957892725e-06,
2663
- "loss": 0.0062,
2664
- "r_loss": 0.0047812857665121555,
2665
- "s_loss": 0.0008621865999884903,
2666
- "step": 26600
2667
- },
2668
- {
2669
- "epoch": 0.8,
2670
- "learning_rate": 5.805174872158762e-06,
2671
- "loss": 0.0061,
2672
- "r_loss": 0.004578050691634417,
2673
- "s_loss": 0.0008361663785763085,
2674
- "step": 26700
2675
- },
2676
- {
2677
- "epoch": 0.8,
2678
- "learning_rate": 5.647652972118998e-06,
2679
- "loss": 0.0062,
2680
- "r_loss": 0.0053869327530264854,
2681
- "s_loss": 0.0008459505042992532,
2682
- "step": 26800
2683
- },
2684
- {
2685
- "epoch": 0.8,
2686
- "learning_rate": 5.491800615852702e-06,
2687
- "loss": 0.0061,
2688
- "r_loss": 0.006079402752220631,
2689
- "s_loss": 0.0008894907077774405,
2690
- "step": 26900
2691
- },
2692
- {
2693
- "epoch": 0.81,
2694
- "learning_rate": 5.337645625502312e-06,
2695
- "loss": 0.0061,
2696
- "r_loss": 0.005296847317367792,
2697
- "s_loss": 0.0007577301003038883,
2698
- "step": 27000
2699
- },
2700
- {
2701
- "epoch": 0.81,
2702
- "eval_loss": 0.006496031768620014,
2703
- "eval_r_loss": 0.00570162758231163,
2704
- "eval_runtime": 8.61,
2705
- "eval_s_loss": 0.0007944039534777403,
2706
- "eval_samples_per_second": 78.048,
2707
- "eval_steps_per_second": 78.048,
2708
- "step": 27000
2709
- },
2710
- {
2711
- "epoch": 0.81,
2712
- "learning_rate": 5.18521552020322e-06,
2713
- "loss": 0.0062,
2714
- "r_loss": 0.0052690450102090836,
2715
- "s_loss": 0.0009067388018593192,
2716
- "step": 27100
2717
- },
2718
- {
2719
- "epoch": 0.81,
2720
- "learning_rate": 5.034537511171238e-06,
2721
- "loss": 0.0061,
2722
- "r_loss": 0.005185704678297043,
2723
- "s_loss": 0.0008527652826160192,
2724
- "step": 27200
2725
- },
2726
- {
2727
- "epoch": 0.81,
2728
- "learning_rate": 4.885638496844903e-06,
2729
- "loss": 0.006,
2730
- "r_loss": 0.005816085264086723,
2731
- "s_loss": 0.0008826929260976613,
2732
- "step": 27300
2733
- },
2734
- {
2735
- "epoch": 0.82,
2736
- "learning_rate": 4.738545058083715e-06,
2737
- "loss": 0.006,
2738
- "r_loss": 0.004605771973729134,
2739
- "s_loss": 0.0008187288185581565,
2740
- "step": 27400
2741
- },
2742
- {
2743
- "epoch": 0.82,
2744
- "learning_rate": 4.593283453422984e-06,
2745
- "loss": 0.0061,
2746
- "r_loss": 0.0052945734933018684,
2747
- "s_loss": 0.0008752761059440672,
2748
- "step": 27500
2749
- },
2750
- {
2751
- "epoch": 0.82,
2752
- "eval_loss": 0.006313642952591181,
2753
- "eval_r_loss": 0.005525515414774418,
2754
- "eval_runtime": 10.8866,
2755
- "eval_s_loss": 0.0007881273631937802,
2756
- "eval_samples_per_second": 61.727,
2757
- "eval_steps_per_second": 61.727,
2758
- "step": 27500
2759
- },
2760
- {
2761
- "epoch": 0.82,
2762
- "learning_rate": 4.449879614386313e-06,
2763
- "loss": 0.0061,
2764
- "r_loss": 0.005148299969732761,
2765
- "s_loss": 0.0007778692524880171,
2766
- "step": 27600
2767
- },
2768
- {
2769
- "epoch": 0.83,
2770
- "learning_rate": 4.308359140856364e-06,
2771
- "loss": 0.006,
2772
- "r_loss": 0.004953067749738693,
2773
- "s_loss": 0.0008753555594012141,
2774
- "step": 27700
2775
- },
2776
- {
2777
- "epoch": 0.83,
2778
- "learning_rate": 4.168747296504881e-06,
2779
- "loss": 0.0061,
2780
- "r_loss": 0.005346274934709072,
2781
- "s_loss": 0.0008310099365189672,
2782
- "step": 27800
2783
- },
2784
- {
2785
- "epoch": 0.83,
2786
- "learning_rate": 4.031069004282739e-06,
2787
- "loss": 0.006,
2788
- "r_loss": 0.00487141078338027,
2789
- "s_loss": 0.0008442049147561193,
2790
- "step": 27900
2791
- },
2792
- {
2793
- "epoch": 0.84,
2794
- "learning_rate": 3.895348841970758e-06,
2795
- "loss": 0.0059,
2796
- "r_loss": 0.004993779119104147,
2797
- "s_loss": 0.0007509638089686632,
2798
- "step": 28000
2799
- },
2800
- {
2801
- "epoch": 0.84,
2802
- "eval_loss": 0.006438162177801132,
2803
- "eval_r_loss": 0.005650770850479603,
2804
- "eval_runtime": 8.473,
2805
- "eval_s_loss": 0.0007873910944908857,
2806
- "eval_samples_per_second": 79.311,
2807
- "eval_steps_per_second": 79.311,
2808
- "step": 28000
2809
- },
2810
- {
2811
- "epoch": 0.84,
2812
- "learning_rate": 3.7616110377922263e-06,
2813
- "loss": 0.0061,
2814
- "r_loss": 0.006000261753797531,
2815
- "s_loss": 0.0009082874748855829,
2816
- "step": 28100
2817
- },
2818
- {
2819
- "epoch": 0.84,
2820
- "learning_rate": 3.6298794660877154e-06,
2821
- "loss": 0.0061,
2822
- "r_loss": 0.006097930949181318,
2823
- "s_loss": 0.0009037306881509721,
2824
- "step": 28200
2825
- },
2826
- {
2827
- "epoch": 0.84,
2828
- "learning_rate": 3.50017764305319e-06,
2829
- "loss": 0.0061,
2830
- "r_loss": 0.006982284598052502,
2831
- "s_loss": 0.0008789664716459811,
2832
- "step": 28300
2833
- },
2834
- {
2835
- "epoch": 0.85,
2836
- "learning_rate": 3.3725287225419365e-06,
2837
- "loss": 0.0061,
2838
- "r_loss": 0.004647083580493927,
2839
- "s_loss": 0.0008192590321414173,
2840
- "step": 28400
2841
- },
2842
- {
2843
- "epoch": 0.85,
2844
- "learning_rate": 3.2469554919312733e-06,
2845
- "loss": 0.006,
2846
- "r_loss": 0.00463445670902729,
2847
- "s_loss": 0.0008258245070464909,
2848
- "step": 28500
2849
- },
2850
- {
2851
- "epoch": 0.85,
2852
- "eval_loss": 0.006398391909897327,
2853
- "eval_r_loss": 0.005606560967862606,
2854
- "eval_runtime": 8.6124,
2855
- "eval_s_loss": 0.0007918307092040777,
2856
- "eval_samples_per_second": 78.027,
2857
- "eval_steps_per_second": 78.027,
2858
- "step": 28500
2859
- },
2860
- {
2861
- "epoch": 0.85,
2862
- "learning_rate": 3.12470466097668e-06,
2863
- "loss": 0.0059,
2864
- "r_loss": 0.004881501197814941,
2865
- "s_loss": 0.0008381219813600183,
2866
- "step": 28600
2867
- },
2868
- {
2869
- "epoch": 0.86,
2870
- "learning_rate": 3.0033283767617047e-06,
2871
- "loss": 0.006,
2872
- "r_loss": 0.004456360824406147,
2873
- "s_loss": 0.0007237752433866262,
2874
- "step": 28700
2875
- },
2876
- {
2877
- "epoch": 0.86,
2878
- "learning_rate": 2.8840936906237913e-06,
2879
- "loss": 0.006,
2880
- "r_loss": 0.005578363314270973,
2881
- "s_loss": 0.0008311424753628671,
2882
- "step": 28800
2883
- },
2884
- {
2885
- "epoch": 0.86,
2886
- "learning_rate": 2.7670218878646273e-06,
2887
- "loss": 0.0059,
2888
- "r_loss": 0.005354044958949089,
2889
- "s_loss": 0.0008516995585523546,
2890
- "step": 28900
2891
- },
2892
- {
2893
- "epoch": 0.87,
2894
- "learning_rate": 2.6521338676765317e-06,
2895
- "loss": 0.006,
2896
- "r_loss": 0.0043981922790408134,
2897
- "s_loss": 0.0007206485024653375,
2898
- "step": 29000
2899
- },
2900
- {
2901
- "epoch": 0.87,
2902
- "eval_loss": 0.006455121096223593,
2903
- "eval_r_loss": 0.00566498190164566,
2904
- "eval_runtime": 10.8485,
2905
- "eval_s_loss": 0.0007901391945779324,
2906
- "eval_samples_per_second": 61.944,
2907
- "eval_steps_per_second": 61.944,
2908
- "step": 29000
2909
- },
2910
- {
2911
- "epoch": 0.87,
2912
- "learning_rate": 2.5394501394116805e-06,
2913
- "loss": 0.006,
2914
- "r_loss": 0.0046515436843037605,
2915
- "s_loss": 0.0008327921386808157,
2916
- "step": 29100
2917
- },
2918
- {
2919
- "epoch": 0.87,
2920
- "learning_rate": 2.4289908189208086e-06,
2921
- "loss": 0.006,
2922
- "r_loss": 0.005409231409430504,
2923
- "s_loss": 0.0008956742822192609,
2924
- "step": 29200
2925
- },
2926
- {
2927
- "epoch": 0.87,
2928
- "learning_rate": 2.320775624962243e-06,
2929
- "loss": 0.0059,
2930
- "r_loss": 0.005478174425661564,
2931
- "s_loss": 0.0007779388688504696,
2932
- "step": 29300
2933
- },
2934
- {
2935
- "epoch": 0.88,
2936
- "learning_rate": 2.214823875681753e-06,
2937
- "loss": 0.006,
2938
- "r_loss": 0.005899087525904179,
2939
- "s_loss": 0.0008006141288205981,
2940
- "step": 29400
2941
- },
2942
- {
2943
- "epoch": 0.88,
2944
- "learning_rate": 2.1111544851639887e-06,
2945
- "loss": 0.006,
2946
- "r_loss": 0.004816343542188406,
2947
- "s_loss": 0.0008681662729941308,
2948
- "step": 29500
2949
- },
2950
- {
2951
- "epoch": 0.88,
2952
- "eval_loss": 0.006466238759458065,
2953
- "eval_r_loss": 0.005683359690010548,
2954
- "eval_runtime": 8.6417,
2955
- "eval_s_loss": 0.0007828791276551783,
2956
- "eval_samples_per_second": 77.763,
2957
- "eval_steps_per_second": 77.763,
2958
- "step": 29500
2959
- },
2960
- {
2961
- "epoch": 0.88,
2962
- "learning_rate": 2.009785960055983e-06,
2963
- "loss": 0.0059,
2964
- "r_loss": 0.005228263325989246,
2965
- "s_loss": 0.0008743102662265301,
2966
- "step": 29600
2967
- },
2968
- {
2969
- "epoch": 0.89,
2970
- "learning_rate": 1.910736396263449e-06,
2971
- "loss": 0.006,
2972
- "r_loss": 0.005140687804669142,
2973
- "s_loss": 0.0007702410221099854,
2974
- "step": 29700
2975
- },
2976
- {
2977
- "epoch": 0.89,
2978
- "learning_rate": 1.8140234757203395e-06,
2979
- "loss": 0.006,
2980
- "r_loss": 0.004959780722856522,
2981
- "s_loss": 0.0007955725886859,
2982
- "step": 29800
2983
- },
2984
- {
2985
- "epoch": 0.89,
2986
- "learning_rate": 1.7196644632323532e-06,
2987
- "loss": 0.0059,
2988
- "r_loss": 0.00630287267267704,
2989
- "s_loss": 0.0008392567397095263,
2990
- "step": 29900
2991
- },
2992
- {
2993
- "epoch": 0.9,
2994
- "learning_rate": 1.6276762033949055e-06,
2995
- "loss": 0.006,
2996
- "r_loss": 0.00496282521635294,
2997
- "s_loss": 0.0008307393291033804,
2998
- "step": 30000
2999
- },
3000
- {
3001
- "epoch": 0.9,
3002
- "eval_loss": 0.006491221487522125,
3003
- "eval_r_loss": 0.005704117473214865,
3004
- "eval_runtime": 13.2979,
3005
- "eval_s_loss": 0.0007871038978919387,
3006
- "eval_samples_per_second": 50.534,
3007
- "eval_steps_per_second": 50.534,
3008
- "step": 30000
3009
- },
3010
- {
3011
- "epoch": 0.9,
3012
- "learning_rate": 1.5380751175860619e-06,
3013
- "loss": 0.006,
3014
- "r_loss": 0.004992773290723562,
3015
- "s_loss": 0.0007617148803547025,
3016
- "step": 30100
3017
- },
3018
- {
3019
- "epoch": 0.9,
3020
- "learning_rate": 1.4508772010350967e-06,
3021
- "loss": 0.0059,
3022
- "r_loss": 0.005444124806672335,
3023
- "s_loss": 0.000857758685015142,
3024
- "step": 30200
3025
- },
3026
- {
3027
- "epoch": 0.9,
3028
- "learning_rate": 1.3660980199670624e-06,
3029
- "loss": 0.006,
3030
- "r_loss": 0.004532184451818466,
3031
- "s_loss": 0.0007677034009248018,
3032
- "step": 30300
3033
- },
3034
- {
3035
- "epoch": 0.91,
3036
- "learning_rate": 1.2837527088239886e-06,
3037
- "loss": 0.0058,
3038
- "r_loss": 0.003988460171967745,
3039
- "s_loss": 0.0007943719392642379,
3040
- "step": 30400
3041
- },
3042
- {
3043
- "epoch": 0.91,
3044
- "learning_rate": 1.2038559675631167e-06,
3045
- "loss": 0.006,
3046
- "r_loss": 0.005079349968582392,
3047
- "s_loss": 0.0008392308373004198,
3048
- "step": 30500
3049
- },
3050
- {
3051
- "epoch": 0.91,
3052
- "eval_loss": 0.0063784122467041016,
3053
- "eval_r_loss": 0.005593848414719105,
3054
- "eval_runtime": 11.2008,
3055
- "eval_s_loss": 0.0007845640648156404,
3056
- "eval_samples_per_second": 59.996,
3057
- "eval_steps_per_second": 59.996,
3058
- "step": 30500
3059
- },
3060
- {
3061
- "epoch": 0.91,
3062
- "learning_rate": 1.1264220590327507e-06,
3063
- "loss": 0.0061,
3064
- "r_loss": 0.004688158631324768,
3065
- "s_loss": 0.0008526835008524358,
3066
- "step": 30600
3067
- },
3068
- {
3069
- "epoch": 0.92,
3070
- "learning_rate": 1.0514648064260858e-06,
3071
- "loss": 0.0059,
3072
- "r_loss": 0.005976270884275436,
3073
- "s_loss": 0.0009033031528815627,
3074
- "step": 30700
3075
- },
3076
- {
3077
- "epoch": 0.92,
3078
- "learning_rate": 9.789975908135684e-07,
3079
- "loss": 0.006,
3080
- "r_loss": 0.004994697868824005,
3081
- "s_loss": 0.0007998015498742461,
3082
- "step": 30800
3083
- },
3084
- {
3085
- "epoch": 0.92,
3086
- "learning_rate": 9.09033348754143e-07,
3087
- "loss": 0.0059,
3088
- "r_loss": 0.004837275482714176,
3089
- "s_loss": 0.0008055656799115241,
3090
- "step": 30900
3091
- },
3092
- {
3093
- "epoch": 0.93,
3094
- "learning_rate": 8.415845699858748e-07,
3095
- "loss": 0.0059,
3096
- "r_loss": 0.00475228950381279,
3097
- "s_loss": 0.0007835312280803919,
3098
- "step": 31000
3099
- },
3100
- {
3101
- "epoch": 0.93,
3102
- "eval_loss": 0.006395082455128431,
3103
- "eval_r_loss": 0.005607670173048973,
3104
- "eval_runtime": 9.003,
3105
- "eval_s_loss": 0.0007874123984947801,
3106
- "eval_samples_per_second": 74.642,
3107
- "eval_steps_per_second": 74.642,
3108
- "step": 31000
3109
- },
3110
- {
3111
- "epoch": 0.93,
3112
- "learning_rate": 7.766632951963343e-07,
3113
- "loss": 0.0059,
3114
- "r_loss": 0.005764458328485489,
3115
- "s_loss": 0.0008497489034198225,
3116
- "step": 31100
3117
- },
3118
- {
3119
- "epoch": 0.93,
3120
- "learning_rate": 7.142811138731459e-07,
3121
- "loss": 0.006,
3122
- "r_loss": 0.005635739304125309,
3123
- "s_loss": 0.0007945407414808869,
3124
- "step": 31200
3125
- },
3126
- {
3127
- "epoch": 0.93,
3128
- "learning_rate": 6.544491622350635e-07,
3129
- "loss": 0.006,
3130
- "r_loss": 0.005129328928887844,
3131
- "s_loss": 0.0008712293347343802,
3132
- "step": 31300
3133
- },
3134
- {
3135
- "epoch": 0.94,
3136
- "learning_rate": 5.971781212439903e-07,
3137
- "loss": 0.0059,
3138
- "r_loss": 0.00499305035918951,
3139
- "s_loss": 0.0008739501936361194,
3140
- "step": 31400
3141
- },
3142
- {
3143
- "epoch": 0.94,
3144
- "learning_rate": 5.42478214698256e-07,
3145
- "loss": 0.006,
3146
- "r_loss": 0.005520367994904518,
3147
- "s_loss": 0.0008639748557470739,
3148
- "step": 31500
3149
- },
3150
- {
3151
- "epoch": 0.94,
3152
- "eval_loss": 0.006423806771636009,
3153
- "eval_r_loss": 0.005639917217195034,
3154
- "eval_runtime": 9.5481,
3155
- "eval_s_loss": 0.0007838893216103315,
3156
- "eval_samples_per_second": 70.381,
3157
- "eval_steps_per_second": 70.381,
3158
- "step": 31500
3159
- },
3160
- {
3161
- "epoch": 0.94,
3162
- "learning_rate": 4.903592074074914e-07,
3163
- "loss": 0.0059,
3164
- "r_loss": 0.004892979748547077,
3165
- "s_loss": 0.0007891397108323872,
3166
- "step": 31600
3167
- },
3168
- {
3169
- "epoch": 0.95,
3170
- "learning_rate": 4.408304034494748e-07,
3171
- "loss": 0.006,
3172
- "r_loss": 0.00487134512513876,
3173
- "s_loss": 0.0007864002254791558,
3174
- "step": 31700
3175
- },
3176
- {
3177
- "epoch": 0.95,
3178
- "learning_rate": 3.9390064450918195e-07,
3179
- "loss": 0.0059,
3180
- "r_loss": 0.004203725606203079,
3181
- "s_loss": 0.0007678180118091404,
3182
- "step": 31800
3183
- },
3184
- {
3185
- "epoch": 0.95,
3186
- "learning_rate": 3.495783083004273e-07,
3187
- "loss": 0.0059,
3188
- "r_loss": 0.005060626659542322,
3189
- "s_loss": 0.0008796448237262666,
3190
- "step": 31900
3191
- },
3192
- {
3193
- "epoch": 0.95,
3194
- "learning_rate": 3.0787130707028155e-07,
3195
- "loss": 0.0059,
3196
- "r_loss": 0.00450053671374917,
3197
- "s_loss": 0.0007761311717331409,
3198
- "step": 32000
3199
- },
3200
- {
3201
- "epoch": 0.95,
3202
- "eval_loss": 0.006359361112117767,
3203
- "eval_r_loss": 0.005576182622462511,
3204
- "eval_runtime": 8.5745,
3205
- "eval_s_loss": 0.0007831782568246126,
3206
- "eval_samples_per_second": 78.372,
3207
- "eval_steps_per_second": 78.372,
3208
- "step": 32000
3209
- },
3210
- {
3211
- "epoch": 0.96,
3212
- "learning_rate": 2.687870861866354e-07,
3213
- "loss": 0.0058,
3214
- "r_loss": 0.004816841334104538,
3215
- "s_loss": 0.0008092151256278157,
3216
- "step": 32100
3217
- },
3218
- {
3219
- "epoch": 0.96,
3220
- "learning_rate": 2.3233262280905887e-07,
3221
- "loss": 0.0058,
3222
- "r_loss": 0.005017046816647053,
3223
- "s_loss": 0.0008538334514014423,
3224
- "step": 32200
3225
- },
3226
- {
3227
- "epoch": 0.96,
3228
- "learning_rate": 1.985144246432896e-07,
3229
- "loss": 0.0059,
3230
- "r_loss": 0.004773393739014864,
3231
- "s_loss": 0.0008774587768130004,
3232
- "step": 32300
3233
- },
3234
- {
3235
- "epoch": 0.97,
3236
- "learning_rate": 1.673385287794771e-07,
3237
- "loss": 0.0059,
3238
- "r_loss": 0.004963357001543045,
3239
- "s_loss": 0.0008022256079129875,
3240
- "step": 32400
3241
- },
3242
- {
3243
- "epoch": 0.97,
3244
- "learning_rate": 1.3881050061448963e-07,
3245
- "loss": 0.0058,
3246
- "r_loss": 0.005114908795803785,
3247
- "s_loss": 0.0008205736521631479,
3248
- "step": 32500
3249
- },
3250
- {
3251
- "epoch": 0.97,
3252
- "eval_loss": 0.006366991437971592,
3253
- "eval_r_loss": 0.005584825295954943,
3254
- "eval_runtime": 8.9973,
3255
- "eval_s_loss": 0.0007821662584319711,
3256
- "eval_samples_per_second": 74.689,
3257
- "eval_steps_per_second": 74.689,
3258
- "step": 32500
3259
- },
3260
- {
3261
- "epoch": 0.97,
3262
- "learning_rate": 1.1342690454311188e-07,
3263
- "loss": 0.006,
3264
- "r_loss": 0.0049808090552687645,
3265
- "s_loss": 0.0008341091452166438,
3266
- "step": 32600
3267
- },
3268
- {
3269
- "epoch": 0.98,
3270
- "learning_rate": 9.015622251951239e-08,
3271
- "loss": 0.006,
3272
- "r_loss": 0.005653849337249994,
3273
- "s_loss": 0.0008855736814439297,
3274
- "step": 32700
3275
- },
3276
- {
3277
- "epoch": 0.98,
3278
- "learning_rate": 6.95471864728997e-08,
3279
- "loss": 0.0059,
3280
- "r_loss": 0.006156204268336296,
3281
- "s_loss": 0.0008836152264848351,
3282
- "step": 32800
3283
- },
3284
- {
3285
- "epoch": 0.98,
3286
- "learning_rate": 5.1603475446377335e-08,
3287
- "loss": 0.0059,
3288
- "r_loss": 0.005304061807692051,
3289
- "s_loss": 0.0009092881809920073,
3290
- "step": 32900
3291
- },
3292
- {
3293
- "epoch": 0.98,
3294
- "learning_rate": 3.6328292679828624e-08,
3295
- "loss": 0.0059,
3296
- "r_loss": 0.004687966778874397,
3297
- "s_loss": 0.0007698750705458224,
3298
- "step": 33000
3299
- },
3300
- {
3301
- "epoch": 0.98,
3302
- "eval_loss": 0.006356844212859869,
3303
- "eval_r_loss": 0.005574433133006096,
3304
- "eval_runtime": 8.7324,
3305
- "eval_s_loss": 0.0007824110798537731,
3306
- "eval_samples_per_second": 76.955,
3307
- "eval_steps_per_second": 76.955,
3308
- "step": 33000
3309
- },
3310
- {
3311
- "epoch": 0.99,
3312
- "learning_rate": 2.37243650380925e-08,
3313
- "loss": 0.0058,
3314
- "r_loss": 0.0053665488958358765,
3315
- "s_loss": 0.0008012793259695172,
3316
- "step": 33100
3317
- },
3318
- {
3319
- "epoch": 0.99,
3320
- "learning_rate": 1.3793942524174541e-08,
3321
- "loss": 0.0059,
3322
- "r_loss": 0.004292100202292204,
3323
- "s_loss": 0.0007570346933789551,
3324
- "step": 33200
3325
- },
3326
- {
3327
- "epoch": 0.99,
3328
- "learning_rate": 6.538797877583269e-09,
3329
- "loss": 0.0059,
3330
- "r_loss": 0.005391569808125496,
3331
- "s_loss": 0.0008714336436241865,
3332
- "step": 33300
3333
- },
3334
- {
3335
- "epoch": 1.0,
3336
- "learning_rate": 1.9602262578682917e-09,
3337
- "loss": 0.0059,
3338
- "r_loss": 0.004852307494729757,
3339
- "s_loss": 0.0007645284058526158,
3340
- "step": 33400
3341
- },
3342
- {
3343
- "epoch": 1.0,
3344
- "learning_rate": 5.90450134152487e-11,
3345
- "loss": 0.0059,
3346
- "r_loss": 0.0045965323224663734,
3347
- "s_loss": 0.0008654047851450741,
3348
- "step": 33500
3349
- },
3350
- {
3351
- "epoch": 1.0,
3352
- "eval_loss": 0.006353782489895821,
3353
- "eval_r_loss": 0.0055714258924126625,
3354
- "eval_runtime": 9.2592,
3355
- "eval_s_loss": 0.0007823564810678363,
3356
- "eval_samples_per_second": 72.576,
3357
- "eval_steps_per_second": 72.576,
3358
- "step": 33500
3359
- }
3360
- ],
3361
- "max_steps": 33513,
3362
- "num_train_epochs": 9223372036854775807,
3363
- "total_flos": 2.119174456955371e+16,
3364
- "trial_name": null,
3365
- "trial_params": null
3366
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:308f6c0d3916cfaa1801ef8138ac88dd659462c3200361fad5cfa39ff8dd4faf
3
- size 3899
 
 
 
 
last-checkpoint/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:359f0bb9c7fdaf6b9a47c37a482edc40df921b373339ce1a6c7b9f57147c1ae3
3
  size 264627500
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34e13ead3f3ca792bed6d415823f1aaf158e7fa1020ff23991dc7bd3acab5151
3
  size 264627500