hr16 committed on
Commit
70e60a8
·
verified ·
1 Parent(s): 34ea879

Training in progress, epoch 3, checkpoint

Browse files
checkpoint-1176/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "vinai/PhoWhisper-small",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "ViSpeechClassification"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 768,
17
+ "decoder_attention_heads": 12,
18
+ "decoder_ffn_dim": 3072,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 12,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 12,
24
+ "encoder_ffn_dim": 3072,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 12,
27
+ "eos_token_id": 50257,
28
+ "forced_decoder_ids": null,
29
+ "id2label": {
30
+ "0": "male, northern dialect",
31
+ "1": "male, central dialect",
32
+ "2": "male, highland central dialect",
33
+ "3": "male, southern dialect",
34
+ "4": "female, northern dialect",
35
+ "5": "female, central dialect",
36
+ "6": "female, highland central dialect",
37
+ "7": "female, southern dialect"
38
+ },
39
+ "init_std": 0.02,
40
+ "is_encoder_decoder": true,
41
+ "label2id": {
42
+ "female, central dialect": "5",
43
+ "female, highland central dialect": "6",
44
+ "female, northern dialect": "4",
45
+ "female, southern dialect": "7",
46
+ "male, central dialect": "1",
47
+ "male, highland central dialect": "2",
48
+ "male, northern dialect": "0",
49
+ "male, southern dialect": "3"
50
+ },
51
+ "mask_feature_length": 10,
52
+ "mask_feature_min_masks": 0,
53
+ "mask_feature_prob": 0.0,
54
+ "mask_time_length": 10,
55
+ "mask_time_min_masks": 2,
56
+ "mask_time_prob": 0.05,
57
+ "max_length": 448,
58
+ "max_source_positions": 1500,
59
+ "max_target_positions": 448,
60
+ "median_filter_width": 7,
61
+ "model_type": "whisper",
62
+ "num_hidden_layers": 12,
63
+ "num_mel_bins": 80,
64
+ "pad_token_id": 50257,
65
+ "scale_embedding": false,
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.41.2",
68
+ "use_cache": true,
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 51865
71
+ }
checkpoint-1176/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b68389db4184fcddf8babebeaf6ecf6999ca6e2cfc5f40b7f0732f93d7f7ee
3
+ size 358419776
checkpoint-1176/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:327c03ef146a6e3459eda80f4556fbee3215181c22d8fd1f70d12bf4f5bdbd22
3
+ size 11572486
checkpoint-1176/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_extractor_type": "WhisperFeatureExtractor",
4
+ "feature_size": 80,
5
+ "hop_length": 160,
6
+ "n_fft": 400,
7
+ "n_samples": 480000,
8
+ "nb_max_frames": 3000,
9
+ "padding_side": "right",
10
+ "padding_value": 0.0,
11
+ "processor_class": "WhisperProcessor",
12
+ "return_attention_mask": false,
13
+ "sampling_rate": 16000
14
+ }
checkpoint-1176/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98568fadb38cbbfb546077baa5dff5d3749afaf00a76163724a6f7fcaf1cc6b1
3
+ size 14244
checkpoint-1176/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:712af847cd25fd3b46c8c2cd41167ccbb67b9b017fe8b3338f1d77450b81b959
3
+ size 1064
checkpoint-1176/trainer_state.json ADDED
@@ -0,0 +1,1705 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.35918039373242266,
3
+ "best_model_checkpoint": "PhoWhisper-small-vispeech-classifier-v3/checkpoint-1176",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1176,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012755102040816327,
13
+ "grad_norm": 48589.94921875,
14
+ "learning_rate": 6.377551020408164e-08,
15
+ "loss": 2.0884,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.025510204081632654,
20
+ "grad_norm": 50532.125,
21
+ "learning_rate": 1.2755102040816328e-07,
22
+ "loss": 2.0887,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.03826530612244898,
27
+ "grad_norm": 50246.4296875,
28
+ "learning_rate": 1.913265306122449e-07,
29
+ "loss": 2.0885,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.05102040816326531,
34
+ "grad_norm": 57370.44140625,
35
+ "learning_rate": 2.5510204081632656e-07,
36
+ "loss": 2.0908,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.06377551020408163,
41
+ "grad_norm": 40133.53125,
42
+ "learning_rate": 3.188775510204082e-07,
43
+ "loss": 2.0889,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.07653061224489796,
48
+ "grad_norm": 58377.3671875,
49
+ "learning_rate": 3.826530612244898e-07,
50
+ "loss": 2.0874,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.08928571428571429,
55
+ "grad_norm": 50888.7421875,
56
+ "learning_rate": 4.4642857142857147e-07,
57
+ "loss": 2.088,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.10204081632653061,
62
+ "grad_norm": 44107.75390625,
63
+ "learning_rate": 5.102040816326531e-07,
64
+ "loss": 2.0872,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.11479591836734694,
69
+ "grad_norm": 62939.18359375,
70
+ "learning_rate": 5.739795918367347e-07,
71
+ "loss": 2.0873,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.12755102040816327,
76
+ "grad_norm": 42077.53515625,
77
+ "learning_rate": 6.377551020408164e-07,
78
+ "loss": 2.0852,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.14030612244897958,
83
+ "grad_norm": 50233.00390625,
84
+ "learning_rate": 7.015306122448979e-07,
85
+ "loss": 2.0861,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.15306122448979592,
90
+ "grad_norm": 60839.4765625,
91
+ "learning_rate": 7.653061224489796e-07,
92
+ "loss": 2.0856,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.16581632653061223,
97
+ "grad_norm": 57499.3515625,
98
+ "learning_rate": 8.290816326530612e-07,
99
+ "loss": 2.0838,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.17857142857142858,
104
+ "grad_norm": 65030.765625,
105
+ "learning_rate": 8.928571428571429e-07,
106
+ "loss": 2.083,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.1913265306122449,
111
+ "grad_norm": 53192.35546875,
112
+ "learning_rate": 9.566326530612244e-07,
113
+ "loss": 2.0841,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.20408163265306123,
118
+ "grad_norm": 52991.80078125,
119
+ "learning_rate": 1.0204081632653063e-06,
120
+ "loss": 2.0817,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.21683673469387754,
125
+ "grad_norm": 48656.234375,
126
+ "learning_rate": 1.0841836734693879e-06,
127
+ "loss": 2.0813,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.22959183673469388,
132
+ "grad_norm": 63459.2421875,
133
+ "learning_rate": 1.1479591836734695e-06,
134
+ "loss": 2.081,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.2423469387755102,
139
+ "grad_norm": 47800.3515625,
140
+ "learning_rate": 1.211734693877551e-06,
141
+ "loss": 2.0775,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.25510204081632654,
146
+ "grad_norm": 64353.37890625,
147
+ "learning_rate": 1.2755102040816329e-06,
148
+ "loss": 2.0772,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.26785714285714285,
153
+ "grad_norm": 59652.29296875,
154
+ "learning_rate": 1.3392857142857143e-06,
155
+ "loss": 2.0779,
156
+ "step": 105
157
+ },
158
+ {
159
+ "epoch": 0.28061224489795916,
160
+ "grad_norm": 44752.69140625,
161
+ "learning_rate": 1.4030612244897959e-06,
162
+ "loss": 2.0764,
163
+ "step": 110
164
+ },
165
+ {
166
+ "epoch": 0.29336734693877553,
167
+ "grad_norm": 49883.0,
168
+ "learning_rate": 1.4668367346938777e-06,
169
+ "loss": 2.0763,
170
+ "step": 115
171
+ },
172
+ {
173
+ "epoch": 0.30612244897959184,
174
+ "grad_norm": 65950.28125,
175
+ "learning_rate": 1.5306122448979593e-06,
176
+ "loss": 2.0739,
177
+ "step": 120
178
+ },
179
+ {
180
+ "epoch": 0.31887755102040816,
181
+ "grad_norm": 61354.83203125,
182
+ "learning_rate": 1.5943877551020409e-06,
183
+ "loss": 2.0706,
184
+ "step": 125
185
+ },
186
+ {
187
+ "epoch": 0.33163265306122447,
188
+ "grad_norm": 53169.91796875,
189
+ "learning_rate": 1.6581632653061225e-06,
190
+ "loss": 2.0709,
191
+ "step": 130
192
+ },
193
+ {
194
+ "epoch": 0.34438775510204084,
195
+ "grad_norm": 55883.31640625,
196
+ "learning_rate": 1.7219387755102043e-06,
197
+ "loss": 2.0695,
198
+ "step": 135
199
+ },
200
+ {
201
+ "epoch": 0.35714285714285715,
202
+ "grad_norm": 38134.765625,
203
+ "learning_rate": 1.7857142857142859e-06,
204
+ "loss": 2.0675,
205
+ "step": 140
206
+ },
207
+ {
208
+ "epoch": 0.36989795918367346,
209
+ "grad_norm": 53920.7265625,
210
+ "learning_rate": 1.8494897959183675e-06,
211
+ "loss": 2.0659,
212
+ "step": 145
213
+ },
214
+ {
215
+ "epoch": 0.3826530612244898,
216
+ "grad_norm": 68777.28125,
217
+ "learning_rate": 1.913265306122449e-06,
218
+ "loss": 2.0638,
219
+ "step": 150
220
+ },
221
+ {
222
+ "epoch": 0.39540816326530615,
223
+ "grad_norm": 46247.8984375,
224
+ "learning_rate": 1.977040816326531e-06,
225
+ "loss": 2.0656,
226
+ "step": 155
227
+ },
228
+ {
229
+ "epoch": 0.40816326530612246,
230
+ "grad_norm": 49636.04296875,
231
+ "learning_rate": 2.0408163265306125e-06,
232
+ "loss": 2.0621,
233
+ "step": 160
234
+ },
235
+ {
236
+ "epoch": 0.42091836734693877,
237
+ "grad_norm": 51880.3515625,
238
+ "learning_rate": 2.104591836734694e-06,
239
+ "loss": 2.057,
240
+ "step": 165
241
+ },
242
+ {
243
+ "epoch": 0.4336734693877551,
244
+ "grad_norm": 52808.734375,
245
+ "learning_rate": 2.1683673469387757e-06,
246
+ "loss": 2.0585,
247
+ "step": 170
248
+ },
249
+ {
250
+ "epoch": 0.44642857142857145,
251
+ "grad_norm": 46398.125,
252
+ "learning_rate": 2.2321428571428573e-06,
253
+ "loss": 2.0505,
254
+ "step": 175
255
+ },
256
+ {
257
+ "epoch": 0.45918367346938777,
258
+ "grad_norm": 59347.2578125,
259
+ "learning_rate": 2.295918367346939e-06,
260
+ "loss": 2.0487,
261
+ "step": 180
262
+ },
263
+ {
264
+ "epoch": 0.4719387755102041,
265
+ "grad_norm": 58343.73828125,
266
+ "learning_rate": 2.3596938775510205e-06,
267
+ "loss": 2.0563,
268
+ "step": 185
269
+ },
270
+ {
271
+ "epoch": 0.4846938775510204,
272
+ "grad_norm": 43836.42578125,
273
+ "learning_rate": 2.423469387755102e-06,
274
+ "loss": 2.0486,
275
+ "step": 190
276
+ },
277
+ {
278
+ "epoch": 0.49744897959183676,
279
+ "grad_norm": 69661.5859375,
280
+ "learning_rate": 2.487244897959184e-06,
281
+ "loss": 2.0443,
282
+ "step": 195
283
+ },
284
+ {
285
+ "epoch": 0.5102040816326531,
286
+ "grad_norm": 53888.24609375,
287
+ "learning_rate": 2.5510204081632657e-06,
288
+ "loss": 2.0475,
289
+ "step": 200
290
+ },
291
+ {
292
+ "epoch": 0.5229591836734694,
293
+ "grad_norm": 50369.1640625,
294
+ "learning_rate": 2.6147959183673473e-06,
295
+ "loss": 2.035,
296
+ "step": 205
297
+ },
298
+ {
299
+ "epoch": 0.5357142857142857,
300
+ "grad_norm": 62733.9609375,
301
+ "learning_rate": 2.6785714285714285e-06,
302
+ "loss": 2.0354,
303
+ "step": 210
304
+ },
305
+ {
306
+ "epoch": 0.548469387755102,
307
+ "grad_norm": 48340.67578125,
308
+ "learning_rate": 2.74234693877551e-06,
309
+ "loss": 2.0373,
310
+ "step": 215
311
+ },
312
+ {
313
+ "epoch": 0.5612244897959183,
314
+ "grad_norm": 50353.734375,
315
+ "learning_rate": 2.8061224489795917e-06,
316
+ "loss": 2.0389,
317
+ "step": 220
318
+ },
319
+ {
320
+ "epoch": 0.5739795918367347,
321
+ "grad_norm": 62351.765625,
322
+ "learning_rate": 2.869897959183674e-06,
323
+ "loss": 2.0263,
324
+ "step": 225
325
+ },
326
+ {
327
+ "epoch": 0.5867346938775511,
328
+ "grad_norm": 73855.609375,
329
+ "learning_rate": 2.9336734693877553e-06,
330
+ "loss": 2.0269,
331
+ "step": 230
332
+ },
333
+ {
334
+ "epoch": 0.5994897959183674,
335
+ "grad_norm": 84417.859375,
336
+ "learning_rate": 2.997448979591837e-06,
337
+ "loss": 2.0311,
338
+ "step": 235
339
+ },
340
+ {
341
+ "epoch": 0.6122448979591837,
342
+ "grad_norm": 57347.55078125,
343
+ "learning_rate": 3.0612244897959185e-06,
344
+ "loss": 2.0239,
345
+ "step": 240
346
+ },
347
+ {
348
+ "epoch": 0.625,
349
+ "grad_norm": 65611.1875,
350
+ "learning_rate": 3.125e-06,
351
+ "loss": 2.0064,
352
+ "step": 245
353
+ },
354
+ {
355
+ "epoch": 0.6377551020408163,
356
+ "grad_norm": 70810.75,
357
+ "learning_rate": 3.1887755102040818e-06,
358
+ "loss": 2.0224,
359
+ "step": 250
360
+ },
361
+ {
362
+ "epoch": 0.6505102040816326,
363
+ "grad_norm": 46479.8828125,
364
+ "learning_rate": 3.2525510204081634e-06,
365
+ "loss": 2.0057,
366
+ "step": 255
367
+ },
368
+ {
369
+ "epoch": 0.6632653061224489,
370
+ "grad_norm": 34124.89453125,
371
+ "learning_rate": 3.316326530612245e-06,
372
+ "loss": 2.0135,
373
+ "step": 260
374
+ },
375
+ {
376
+ "epoch": 0.6760204081632653,
377
+ "grad_norm": 53196.58203125,
378
+ "learning_rate": 3.3801020408163266e-06,
379
+ "loss": 2.0087,
380
+ "step": 265
381
+ },
382
+ {
383
+ "epoch": 0.6887755102040817,
384
+ "grad_norm": 64588.515625,
385
+ "learning_rate": 3.4438775510204086e-06,
386
+ "loss": 2.0006,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 0.701530612244898,
391
+ "grad_norm": 73780.546875,
392
+ "learning_rate": 3.50765306122449e-06,
393
+ "loss": 1.9965,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 0.7142857142857143,
398
+ "grad_norm": 63101.05859375,
399
+ "learning_rate": 3.5714285714285718e-06,
400
+ "loss": 2.0005,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 0.7270408163265306,
405
+ "grad_norm": 78616.109375,
406
+ "learning_rate": 3.6352040816326534e-06,
407
+ "loss": 1.9904,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 0.7397959183673469,
412
+ "grad_norm": 79120.6484375,
413
+ "learning_rate": 3.698979591836735e-06,
414
+ "loss": 1.9891,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 0.7525510204081632,
419
+ "grad_norm": 92711.09375,
420
+ "learning_rate": 3.7627551020408166e-06,
421
+ "loss": 1.9611,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 0.7653061224489796,
426
+ "grad_norm": 77935.4765625,
427
+ "learning_rate": 3.826530612244898e-06,
428
+ "loss": 1.9746,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 0.7780612244897959,
433
+ "grad_norm": 85825.375,
434
+ "learning_rate": 3.89030612244898e-06,
435
+ "loss": 1.9753,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 0.7908163265306123,
440
+ "grad_norm": 65178.671875,
441
+ "learning_rate": 3.954081632653062e-06,
442
+ "loss": 1.9534,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 0.8035714285714286,
447
+ "grad_norm": 71973.671875,
448
+ "learning_rate": 4.017857142857143e-06,
449
+ "loss": 1.9474,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 0.8163265306122449,
454
+ "grad_norm": 88983.125,
455
+ "learning_rate": 4.081632653061225e-06,
456
+ "loss": 1.9492,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 0.8290816326530612,
461
+ "grad_norm": 76345.84375,
462
+ "learning_rate": 4.145408163265306e-06,
463
+ "loss": 1.9513,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 0.8418367346938775,
468
+ "grad_norm": 64801.640625,
469
+ "learning_rate": 4.209183673469388e-06,
470
+ "loss": 1.9583,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 0.8545918367346939,
475
+ "grad_norm": 72903.8203125,
476
+ "learning_rate": 4.272959183673469e-06,
477
+ "loss": 1.9579,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 0.8673469387755102,
482
+ "grad_norm": 65284.796875,
483
+ "learning_rate": 4.336734693877551e-06,
484
+ "loss": 1.9272,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 0.8801020408163265,
489
+ "grad_norm": 65237.38671875,
490
+ "learning_rate": 4.400510204081633e-06,
491
+ "loss": 1.9065,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 0.8928571428571429,
496
+ "grad_norm": 79606.9296875,
497
+ "learning_rate": 4.464285714285715e-06,
498
+ "loss": 1.9013,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 0.9056122448979592,
503
+ "grad_norm": 76137.6328125,
504
+ "learning_rate": 4.528061224489797e-06,
505
+ "loss": 1.8892,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 0.9183673469387755,
510
+ "grad_norm": 40717.60546875,
511
+ "learning_rate": 4.591836734693878e-06,
512
+ "loss": 1.9072,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 0.9311224489795918,
517
+ "grad_norm": 110416.125,
518
+ "learning_rate": 4.65561224489796e-06,
519
+ "loss": 1.9024,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 0.9438775510204082,
524
+ "grad_norm": 99101.6875,
525
+ "learning_rate": 4.719387755102041e-06,
526
+ "loss": 1.8979,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 0.9566326530612245,
531
+ "grad_norm": 86707.7109375,
532
+ "learning_rate": 4.783163265306123e-06,
533
+ "loss": 1.878,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 0.9693877551020408,
538
+ "grad_norm": 85074.75,
539
+ "learning_rate": 4.846938775510204e-06,
540
+ "loss": 1.871,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 0.9821428571428571,
545
+ "grad_norm": 47668.7109375,
546
+ "learning_rate": 4.910714285714286e-06,
547
+ "loss": 1.8762,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 0.9948979591836735,
552
+ "grad_norm": 133846.6875,
553
+ "learning_rate": 4.974489795918368e-06,
554
+ "loss": 1.8344,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 1.0,
559
+ "eval_accuracy": 0.0857774206508638,
560
+ "eval_loss": 1.9574071168899536,
561
+ "eval_runtime": 313.0247,
562
+ "eval_samples_per_second": 15.903,
563
+ "eval_steps_per_second": 0.53,
564
+ "step": 392
565
+ },
566
+ {
567
+ "epoch": 1.0076530612244898,
568
+ "grad_norm": 95410.25,
569
+ "learning_rate": 5.0382653061224495e-06,
570
+ "loss": 1.8635,
571
+ "step": 395
572
+ },
573
+ {
574
+ "epoch": 1.0204081632653061,
575
+ "grad_norm": 86022.1484375,
576
+ "learning_rate": 5.1020408163265315e-06,
577
+ "loss": 1.8147,
578
+ "step": 400
579
+ },
580
+ {
581
+ "epoch": 1.0331632653061225,
582
+ "grad_norm": 154860.171875,
583
+ "learning_rate": 5.165816326530613e-06,
584
+ "loss": 1.8016,
585
+ "step": 405
586
+ },
587
+ {
588
+ "epoch": 1.0459183673469388,
589
+ "grad_norm": 105390.4609375,
590
+ "learning_rate": 5.229591836734695e-06,
591
+ "loss": 1.8458,
592
+ "step": 410
593
+ },
594
+ {
595
+ "epoch": 1.058673469387755,
596
+ "grad_norm": 94511.6171875,
597
+ "learning_rate": 5.293367346938776e-06,
598
+ "loss": 1.7888,
599
+ "step": 415
600
+ },
601
+ {
602
+ "epoch": 1.0714285714285714,
603
+ "grad_norm": 93031.234375,
604
+ "learning_rate": 5.357142857142857e-06,
605
+ "loss": 1.7809,
606
+ "step": 420
607
+ },
608
+ {
609
+ "epoch": 1.0841836734693877,
610
+ "grad_norm": 71911.4296875,
611
+ "learning_rate": 5.420918367346939e-06,
612
+ "loss": 1.8523,
613
+ "step": 425
614
+ },
615
+ {
616
+ "epoch": 1.096938775510204,
617
+ "grad_norm": 59985.96875,
618
+ "learning_rate": 5.48469387755102e-06,
619
+ "loss": 1.7427,
620
+ "step": 430
621
+ },
622
+ {
623
+ "epoch": 1.1096938775510203,
624
+ "grad_norm": 86531.875,
625
+ "learning_rate": 5.548469387755102e-06,
626
+ "loss": 1.7646,
627
+ "step": 435
628
+ },
629
+ {
630
+ "epoch": 1.1224489795918366,
631
+ "grad_norm": 90977.859375,
632
+ "learning_rate": 5.6122448979591834e-06,
633
+ "loss": 1.7529,
634
+ "step": 440
635
+ },
636
+ {
637
+ "epoch": 1.135204081632653,
638
+ "grad_norm": 70691.375,
639
+ "learning_rate": 5.6760204081632655e-06,
640
+ "loss": 1.7754,
641
+ "step": 445
642
+ },
643
+ {
644
+ "epoch": 1.1479591836734695,
645
+ "grad_norm": 137077.8125,
646
+ "learning_rate": 5.739795918367348e-06,
647
+ "loss": 1.73,
648
+ "step": 450
649
+ },
650
+ {
651
+ "epoch": 1.1607142857142858,
652
+ "grad_norm": 133976.8125,
653
+ "learning_rate": 5.8035714285714295e-06,
654
+ "loss": 1.7906,
655
+ "step": 455
656
+ },
657
+ {
658
+ "epoch": 1.1734693877551021,
659
+ "grad_norm": 84754.7421875,
660
+ "learning_rate": 5.867346938775511e-06,
661
+ "loss": 1.7722,
662
+ "step": 460
663
+ },
664
+ {
665
+ "epoch": 1.1862244897959184,
666
+ "grad_norm": 67924.859375,
667
+ "learning_rate": 5.931122448979593e-06,
668
+ "loss": 1.8274,
669
+ "step": 465
670
+ },
671
+ {
672
+ "epoch": 1.1989795918367347,
673
+ "grad_norm": 110309.015625,
674
+ "learning_rate": 5.994897959183674e-06,
675
+ "loss": 1.7558,
676
+ "step": 470
677
+ },
678
+ {
679
+ "epoch": 1.211734693877551,
680
+ "grad_norm": 108489.703125,
681
+ "learning_rate": 6.058673469387756e-06,
682
+ "loss": 1.7387,
683
+ "step": 475
684
+ },
685
+ {
686
+ "epoch": 1.2244897959183674,
687
+ "grad_norm": 99533.59375,
688
+ "learning_rate": 6.122448979591837e-06,
689
+ "loss": 1.7367,
690
+ "step": 480
691
+ },
692
+ {
693
+ "epoch": 1.2372448979591837,
694
+ "grad_norm": 85590.296875,
695
+ "learning_rate": 6.186224489795919e-06,
696
+ "loss": 1.7304,
697
+ "step": 485
698
+ },
699
+ {
700
+ "epoch": 1.25,
701
+ "grad_norm": 242811.53125,
702
+ "learning_rate": 6.25e-06,
703
+ "loss": 1.6764,
704
+ "step": 490
705
+ },
706
+ {
707
+ "epoch": 1.2627551020408163,
708
+ "grad_norm": 88326.6171875,
709
+ "learning_rate": 6.313775510204082e-06,
710
+ "loss": 1.7684,
711
+ "step": 495
712
+ },
713
+ {
714
+ "epoch": 1.2755102040816326,
715
+ "grad_norm": 64742.67578125,
716
+ "learning_rate": 6.3775510204081635e-06,
717
+ "loss": 1.788,
718
+ "step": 500
719
+ },
720
+ {
721
+ "epoch": 1.288265306122449,
722
+ "grad_norm": 130764.1015625,
723
+ "learning_rate": 6.4413265306122455e-06,
724
+ "loss": 1.6374,
725
+ "step": 505
726
+ },
727
+ {
728
+ "epoch": 1.3010204081632653,
729
+ "grad_norm": 105262.7578125,
730
+ "learning_rate": 6.505102040816327e-06,
731
+ "loss": 1.7669,
732
+ "step": 510
733
+ },
734
+ {
735
+ "epoch": 1.3137755102040816,
736
+ "grad_norm": 132783.3125,
737
+ "learning_rate": 6.568877551020409e-06,
738
+ "loss": 1.7428,
739
+ "step": 515
740
+ },
741
+ {
742
+ "epoch": 1.3265306122448979,
743
+ "grad_norm": 109851.921875,
744
+ "learning_rate": 6.63265306122449e-06,
745
+ "loss": 1.7023,
746
+ "step": 520
747
+ },
748
+ {
749
+ "epoch": 1.3392857142857144,
750
+ "grad_norm": 65197.0859375,
751
+ "learning_rate": 6.696428571428571e-06,
752
+ "loss": 1.7407,
753
+ "step": 525
754
+ },
755
+ {
756
+ "epoch": 1.3520408163265305,
757
+ "grad_norm": 158283.9375,
758
+ "learning_rate": 6.760204081632653e-06,
759
+ "loss": 1.7629,
760
+ "step": 530
761
+ },
762
+ {
763
+ "epoch": 1.364795918367347,
764
+ "grad_norm": 121252.4140625,
765
+ "learning_rate": 6.823979591836736e-06,
766
+ "loss": 1.7242,
767
+ "step": 535
768
+ },
769
+ {
770
+ "epoch": 1.3775510204081631,
771
+ "grad_norm": 110898.328125,
772
+ "learning_rate": 6.887755102040817e-06,
773
+ "loss": 1.6922,
774
+ "step": 540
775
+ },
776
+ {
777
+ "epoch": 1.3903061224489797,
778
+ "grad_norm": 127766.8046875,
779
+ "learning_rate": 6.951530612244899e-06,
780
+ "loss": 1.7274,
781
+ "step": 545
782
+ },
783
+ {
784
+ "epoch": 1.403061224489796,
785
+ "grad_norm": 138821.59375,
786
+ "learning_rate": 7.01530612244898e-06,
787
+ "loss": 1.6859,
788
+ "step": 550
789
+ },
790
+ {
791
+ "epoch": 1.4158163265306123,
792
+ "grad_norm": 154254.75,
793
+ "learning_rate": 7.079081632653062e-06,
794
+ "loss": 1.6988,
795
+ "step": 555
796
+ },
797
+ {
798
+ "epoch": 1.4285714285714286,
799
+ "grad_norm": 122805.65625,
800
+ "learning_rate": 7.1428571428571436e-06,
801
+ "loss": 1.7003,
802
+ "step": 560
803
+ },
804
+ {
805
+ "epoch": 1.441326530612245,
806
+ "grad_norm": 35300.796875,
807
+ "learning_rate": 7.206632653061226e-06,
808
+ "loss": 1.7224,
809
+ "step": 565
810
+ },
811
+ {
812
+ "epoch": 1.4540816326530612,
813
+ "grad_norm": 125603.1015625,
814
+ "learning_rate": 7.270408163265307e-06,
815
+ "loss": 1.8201,
816
+ "step": 570
817
+ },
818
+ {
819
+ "epoch": 1.4668367346938775,
820
+ "grad_norm": 167185.671875,
821
+ "learning_rate": 7.334183673469388e-06,
822
+ "loss": 1.7959,
823
+ "step": 575
824
+ },
825
+ {
826
+ "epoch": 1.4795918367346939,
827
+ "grad_norm": 59943.359375,
828
+ "learning_rate": 7.39795918367347e-06,
829
+ "loss": 1.6133,
830
+ "step": 580
831
+ },
832
+ {
833
+ "epoch": 1.4923469387755102,
834
+ "grad_norm": 197770.234375,
835
+ "learning_rate": 7.461734693877551e-06,
836
+ "loss": 1.709,
837
+ "step": 585
838
+ },
839
+ {
840
+ "epoch": 1.5051020408163265,
841
+ "grad_norm": 182674.5,
842
+ "learning_rate": 7.525510204081633e-06,
843
+ "loss": 1.688,
844
+ "step": 590
845
+ },
846
+ {
847
+ "epoch": 1.5178571428571428,
848
+ "grad_norm": 90262.0390625,
849
+ "learning_rate": 7.589285714285714e-06,
850
+ "loss": 1.7639,
851
+ "step": 595
852
+ },
853
+ {
854
+ "epoch": 1.5306122448979593,
855
+ "grad_norm": 119569.0859375,
856
+ "learning_rate": 7.653061224489796e-06,
857
+ "loss": 1.7587,
858
+ "step": 600
859
+ },
860
+ {
861
+ "epoch": 1.5433673469387754,
862
+ "grad_norm": 134583.546875,
863
+ "learning_rate": 7.716836734693878e-06,
864
+ "loss": 1.7174,
865
+ "step": 605
866
+ },
867
+ {
868
+ "epoch": 1.556122448979592,
869
+ "grad_norm": 123799.515625,
870
+ "learning_rate": 7.78061224489796e-06,
871
+ "loss": 1.668,
872
+ "step": 610
873
+ },
874
+ {
875
+ "epoch": 1.568877551020408,
876
+ "grad_norm": 124765.65625,
877
+ "learning_rate": 7.844387755102042e-06,
878
+ "loss": 1.691,
879
+ "step": 615
880
+ },
881
+ {
882
+ "epoch": 1.5816326530612246,
883
+ "grad_norm": 180459.28125,
884
+ "learning_rate": 7.908163265306124e-06,
885
+ "loss": 1.6133,
886
+ "step": 620
887
+ },
888
+ {
889
+ "epoch": 1.5943877551020407,
890
+ "grad_norm": 115367.0859375,
891
+ "learning_rate": 7.971938775510205e-06,
892
+ "loss": 1.7529,
893
+ "step": 625
894
+ },
895
+ {
896
+ "epoch": 1.6071428571428572,
897
+ "grad_norm": 66548.40625,
898
+ "learning_rate": 8.035714285714286e-06,
899
+ "loss": 1.7157,
900
+ "step": 630
901
+ },
902
+ {
903
+ "epoch": 1.6198979591836735,
904
+ "grad_norm": 152392.640625,
905
+ "learning_rate": 8.099489795918369e-06,
906
+ "loss": 1.7417,
907
+ "step": 635
908
+ },
909
+ {
910
+ "epoch": 1.6326530612244898,
911
+ "grad_norm": 131193.015625,
912
+ "learning_rate": 8.16326530612245e-06,
913
+ "loss": 1.7704,
914
+ "step": 640
915
+ },
916
+ {
917
+ "epoch": 1.6454081632653061,
918
+ "grad_norm": 154705.71875,
919
+ "learning_rate": 8.227040816326531e-06,
920
+ "loss": 1.6573,
921
+ "step": 645
922
+ },
923
+ {
924
+ "epoch": 1.6581632653061225,
925
+ "grad_norm": 113254.484375,
926
+ "learning_rate": 8.290816326530612e-06,
927
+ "loss": 1.6394,
928
+ "step": 650
929
+ },
930
+ {
931
+ "epoch": 1.6709183673469388,
932
+ "grad_norm": 163821.484375,
933
+ "learning_rate": 8.354591836734695e-06,
934
+ "loss": 1.6884,
935
+ "step": 655
936
+ },
937
+ {
938
+ "epoch": 1.683673469387755,
939
+ "grad_norm": 141981.828125,
940
+ "learning_rate": 8.418367346938776e-06,
941
+ "loss": 1.5955,
942
+ "step": 660
943
+ },
944
+ {
945
+ "epoch": 1.6964285714285714,
946
+ "grad_norm": 158287.6875,
947
+ "learning_rate": 8.482142857142858e-06,
948
+ "loss": 1.6885,
949
+ "step": 665
950
+ },
951
+ {
952
+ "epoch": 1.7091836734693877,
953
+ "grad_norm": 166876.4375,
954
+ "learning_rate": 8.545918367346939e-06,
955
+ "loss": 1.7253,
956
+ "step": 670
957
+ },
958
+ {
959
+ "epoch": 1.7219387755102042,
960
+ "grad_norm": 111882.171875,
961
+ "learning_rate": 8.609693877551022e-06,
962
+ "loss": 1.7057,
963
+ "step": 675
964
+ },
965
+ {
966
+ "epoch": 1.7346938775510203,
967
+ "grad_norm": 237840.828125,
968
+ "learning_rate": 8.673469387755103e-06,
969
+ "loss": 1.7132,
970
+ "step": 680
971
+ },
972
+ {
973
+ "epoch": 1.7474489795918369,
974
+ "grad_norm": 83344.59375,
975
+ "learning_rate": 8.737244897959184e-06,
976
+ "loss": 1.733,
977
+ "step": 685
978
+ },
979
+ {
980
+ "epoch": 1.760204081632653,
981
+ "grad_norm": 30312.462890625,
982
+ "learning_rate": 8.801020408163265e-06,
983
+ "loss": 1.7926,
984
+ "step": 690
985
+ },
986
+ {
987
+ "epoch": 1.7729591836734695,
988
+ "grad_norm": 89868.9609375,
989
+ "learning_rate": 8.864795918367348e-06,
990
+ "loss": 1.6389,
991
+ "step": 695
992
+ },
993
+ {
994
+ "epoch": 1.7857142857142856,
995
+ "grad_norm": 182723.09375,
996
+ "learning_rate": 8.92857142857143e-06,
997
+ "loss": 1.7533,
998
+ "step": 700
999
+ },
1000
+ {
1001
+ "epoch": 1.7984693877551021,
1002
+ "grad_norm": 104328.2421875,
1003
+ "learning_rate": 8.992346938775512e-06,
1004
+ "loss": 1.7424,
1005
+ "step": 705
1006
+ },
1007
+ {
1008
+ "epoch": 1.8112244897959182,
1009
+ "grad_norm": 110120.609375,
1010
+ "learning_rate": 9.056122448979593e-06,
1011
+ "loss": 1.6961,
1012
+ "step": 710
1013
+ },
1014
+ {
1015
+ "epoch": 1.8239795918367347,
1016
+ "grad_norm": 114349.7578125,
1017
+ "learning_rate": 9.119897959183674e-06,
1018
+ "loss": 1.7753,
1019
+ "step": 715
1020
+ },
1021
+ {
1022
+ "epoch": 1.836734693877551,
1023
+ "grad_norm": 103603.8828125,
1024
+ "learning_rate": 9.183673469387756e-06,
1025
+ "loss": 1.777,
1026
+ "step": 720
1027
+ },
1028
+ {
1029
+ "epoch": 1.8494897959183674,
1030
+ "grad_norm": 173858.828125,
1031
+ "learning_rate": 9.247448979591837e-06,
1032
+ "loss": 1.7218,
1033
+ "step": 725
1034
+ },
1035
+ {
1036
+ "epoch": 1.8622448979591837,
1037
+ "grad_norm": 255108.96875,
1038
+ "learning_rate": 9.31122448979592e-06,
1039
+ "loss": 1.7319,
1040
+ "step": 730
1041
+ },
1042
+ {
1043
+ "epoch": 1.875,
1044
+ "grad_norm": 38653.734375,
1045
+ "learning_rate": 9.375000000000001e-06,
1046
+ "loss": 1.6574,
1047
+ "step": 735
1048
+ },
1049
+ {
1050
+ "epoch": 1.8877551020408163,
1051
+ "grad_norm": 141534.234375,
1052
+ "learning_rate": 9.438775510204082e-06,
1053
+ "loss": 1.7595,
1054
+ "step": 740
1055
+ },
1056
+ {
1057
+ "epoch": 1.9005102040816326,
1058
+ "grad_norm": 182591.890625,
1059
+ "learning_rate": 9.502551020408163e-06,
1060
+ "loss": 1.6796,
1061
+ "step": 745
1062
+ },
1063
+ {
1064
+ "epoch": 1.913265306122449,
1065
+ "grad_norm": 75052.21875,
1066
+ "learning_rate": 9.566326530612246e-06,
1067
+ "loss": 1.754,
1068
+ "step": 750
1069
+ },
1070
+ {
1071
+ "epoch": 1.9260204081632653,
1072
+ "grad_norm": 108983.921875,
1073
+ "learning_rate": 9.630102040816327e-06,
1074
+ "loss": 1.6907,
1075
+ "step": 755
1076
+ },
1077
+ {
1078
+ "epoch": 1.9387755102040818,
1079
+ "grad_norm": 110641.828125,
1080
+ "learning_rate": 9.693877551020408e-06,
1081
+ "loss": 1.7041,
1082
+ "step": 760
1083
+ },
1084
+ {
1085
+ "epoch": 1.9515306122448979,
1086
+ "grad_norm": 119189.8203125,
1087
+ "learning_rate": 9.75765306122449e-06,
1088
+ "loss": 1.6054,
1089
+ "step": 765
1090
+ },
1091
+ {
1092
+ "epoch": 1.9642857142857144,
1093
+ "grad_norm": 62365.9453125,
1094
+ "learning_rate": 9.821428571428573e-06,
1095
+ "loss": 1.6862,
1096
+ "step": 770
1097
+ },
1098
+ {
1099
+ "epoch": 1.9770408163265305,
1100
+ "grad_norm": 103043.6171875,
1101
+ "learning_rate": 9.885204081632654e-06,
1102
+ "loss": 1.6596,
1103
+ "step": 775
1104
+ },
1105
+ {
1106
+ "epoch": 1.989795918367347,
1107
+ "grad_norm": 117368.546875,
1108
+ "learning_rate": 9.948979591836737e-06,
1109
+ "loss": 1.6737,
1110
+ "step": 780
1111
+ },
1112
+ {
1113
+ "epoch": 2.0,
1114
+ "eval_accuracy": 0.0857774206508638,
1115
+ "eval_loss": 1.8835231065750122,
1116
+ "eval_runtime": 308.4538,
1117
+ "eval_samples_per_second": 16.139,
1118
+ "eval_steps_per_second": 0.538,
1119
+ "step": 784
1120
+ },
1121
+ {
1122
+ "epoch": 2.002551020408163,
1123
+ "grad_norm": 219828.25,
1124
+ "learning_rate": 9.996811224489797e-06,
1125
+ "loss": 1.604,
1126
+ "step": 785
1127
+ },
1128
+ {
1129
+ "epoch": 2.0153061224489797,
1130
+ "grad_norm": 132893.65625,
1131
+ "learning_rate": 9.980867346938775e-06,
1132
+ "loss": 1.7278,
1133
+ "step": 790
1134
+ },
1135
+ {
1136
+ "epoch": 2.0280612244897958,
1137
+ "grad_norm": 103370.2109375,
1138
+ "learning_rate": 9.964923469387756e-06,
1139
+ "loss": 1.6631,
1140
+ "step": 795
1141
+ },
1142
+ {
1143
+ "epoch": 2.0408163265306123,
1144
+ "grad_norm": 201988.0625,
1145
+ "learning_rate": 9.948979591836737e-06,
1146
+ "loss": 1.7244,
1147
+ "step": 800
1148
+ },
1149
+ {
1150
+ "epoch": 2.0535714285714284,
1151
+ "grad_norm": 67003.359375,
1152
+ "learning_rate": 9.933035714285715e-06,
1153
+ "loss": 1.7057,
1154
+ "step": 805
1155
+ },
1156
+ {
1157
+ "epoch": 2.066326530612245,
1158
+ "grad_norm": 306698.625,
1159
+ "learning_rate": 9.917091836734694e-06,
1160
+ "loss": 1.7915,
1161
+ "step": 810
1162
+ },
1163
+ {
1164
+ "epoch": 2.079081632653061,
1165
+ "grad_norm": 168723.765625,
1166
+ "learning_rate": 9.901147959183675e-06,
1167
+ "loss": 1.7387,
1168
+ "step": 815
1169
+ },
1170
+ {
1171
+ "epoch": 2.0918367346938775,
1172
+ "grad_norm": 159844.828125,
1173
+ "learning_rate": 9.885204081632654e-06,
1174
+ "loss": 1.716,
1175
+ "step": 820
1176
+ },
1177
+ {
1178
+ "epoch": 2.104591836734694,
1179
+ "grad_norm": 131381.34375,
1180
+ "learning_rate": 9.869260204081633e-06,
1181
+ "loss": 1.7793,
1182
+ "step": 825
1183
+ },
1184
+ {
1185
+ "epoch": 2.11734693877551,
1186
+ "grad_norm": 134116.8125,
1187
+ "learning_rate": 9.853316326530613e-06,
1188
+ "loss": 1.6806,
1189
+ "step": 830
1190
+ },
1191
+ {
1192
+ "epoch": 2.1301020408163267,
1193
+ "grad_norm": 222184.515625,
1194
+ "learning_rate": 9.837372448979594e-06,
1195
+ "loss": 1.6068,
1196
+ "step": 835
1197
+ },
1198
+ {
1199
+ "epoch": 2.142857142857143,
1200
+ "grad_norm": 151780.703125,
1201
+ "learning_rate": 9.821428571428573e-06,
1202
+ "loss": 1.7369,
1203
+ "step": 840
1204
+ },
1205
+ {
1206
+ "epoch": 2.1556122448979593,
1207
+ "grad_norm": 150746.140625,
1208
+ "learning_rate": 9.805484693877551e-06,
1209
+ "loss": 1.6551,
1210
+ "step": 845
1211
+ },
1212
+ {
1213
+ "epoch": 2.1683673469387754,
1214
+ "grad_norm": 136112.78125,
1215
+ "learning_rate": 9.789540816326532e-06,
1216
+ "loss": 1.7609,
1217
+ "step": 850
1218
+ },
1219
+ {
1220
+ "epoch": 2.181122448979592,
1221
+ "grad_norm": 101995.4140625,
1222
+ "learning_rate": 9.77359693877551e-06,
1223
+ "loss": 1.7171,
1224
+ "step": 855
1225
+ },
1226
+ {
1227
+ "epoch": 2.193877551020408,
1228
+ "grad_norm": 46676.8046875,
1229
+ "learning_rate": 9.75765306122449e-06,
1230
+ "loss": 1.5907,
1231
+ "step": 860
1232
+ },
1233
+ {
1234
+ "epoch": 2.2066326530612246,
1235
+ "grad_norm": 143790.296875,
1236
+ "learning_rate": 9.74170918367347e-06,
1237
+ "loss": 1.6003,
1238
+ "step": 865
1239
+ },
1240
+ {
1241
+ "epoch": 2.2193877551020407,
1242
+ "grad_norm": 152429.078125,
1243
+ "learning_rate": 9.72576530612245e-06,
1244
+ "loss": 1.5825,
1245
+ "step": 870
1246
+ },
1247
+ {
1248
+ "epoch": 2.232142857142857,
1249
+ "grad_norm": 232178.03125,
1250
+ "learning_rate": 9.70982142857143e-06,
1251
+ "loss": 1.6564,
1252
+ "step": 875
1253
+ },
1254
+ {
1255
+ "epoch": 2.2448979591836733,
1256
+ "grad_norm": 87462.2109375,
1257
+ "learning_rate": 9.693877551020408e-06,
1258
+ "loss": 1.6744,
1259
+ "step": 880
1260
+ },
1261
+ {
1262
+ "epoch": 2.25765306122449,
1263
+ "grad_norm": 144245.515625,
1264
+ "learning_rate": 9.677933673469389e-06,
1265
+ "loss": 1.6895,
1266
+ "step": 885
1267
+ },
1268
+ {
1269
+ "epoch": 2.270408163265306,
1270
+ "grad_norm": 110020.5546875,
1271
+ "learning_rate": 9.661989795918368e-06,
1272
+ "loss": 1.7014,
1273
+ "step": 890
1274
+ },
1275
+ {
1276
+ "epoch": 2.2831632653061225,
1277
+ "grad_norm": 192982.6875,
1278
+ "learning_rate": 9.646045918367348e-06,
1279
+ "loss": 1.6613,
1280
+ "step": 895
1281
+ },
1282
+ {
1283
+ "epoch": 2.295918367346939,
1284
+ "grad_norm": 103864.7734375,
1285
+ "learning_rate": 9.630102040816327e-06,
1286
+ "loss": 1.6505,
1287
+ "step": 900
1288
+ },
1289
+ {
1290
+ "epoch": 2.308673469387755,
1291
+ "grad_norm": 171332.34375,
1292
+ "learning_rate": 9.614158163265306e-06,
1293
+ "loss": 1.764,
1294
+ "step": 905
1295
+ },
1296
+ {
1297
+ "epoch": 2.3214285714285716,
1298
+ "grad_norm": 151310.984375,
1299
+ "learning_rate": 9.598214285714287e-06,
1300
+ "loss": 1.7049,
1301
+ "step": 910
1302
+ },
1303
+ {
1304
+ "epoch": 2.3341836734693877,
1305
+ "grad_norm": 50600.53125,
1306
+ "learning_rate": 9.582270408163266e-06,
1307
+ "loss": 1.7203,
1308
+ "step": 915
1309
+ },
1310
+ {
1311
+ "epoch": 2.3469387755102042,
1312
+ "grad_norm": 219012.625,
1313
+ "learning_rate": 9.566326530612246e-06,
1314
+ "loss": 1.7004,
1315
+ "step": 920
1316
+ },
1317
+ {
1318
+ "epoch": 2.3596938775510203,
1319
+ "grad_norm": 120568.4765625,
1320
+ "learning_rate": 9.550382653061225e-06,
1321
+ "loss": 1.6287,
1322
+ "step": 925
1323
+ },
1324
+ {
1325
+ "epoch": 2.372448979591837,
1326
+ "grad_norm": 103431.390625,
1327
+ "learning_rate": 9.534438775510206e-06,
1328
+ "loss": 1.6989,
1329
+ "step": 930
1330
+ },
1331
+ {
1332
+ "epoch": 2.385204081632653,
1333
+ "grad_norm": 75834.46875,
1334
+ "learning_rate": 9.518494897959184e-06,
1335
+ "loss": 1.666,
1336
+ "step": 935
1337
+ },
1338
+ {
1339
+ "epoch": 2.3979591836734695,
1340
+ "grad_norm": 106555.3828125,
1341
+ "learning_rate": 9.502551020408163e-06,
1342
+ "loss": 1.6855,
1343
+ "step": 940
1344
+ },
1345
+ {
1346
+ "epoch": 2.4107142857142856,
1347
+ "grad_norm": 172204.640625,
1348
+ "learning_rate": 9.486607142857144e-06,
1349
+ "loss": 1.5667,
1350
+ "step": 945
1351
+ },
1352
+ {
1353
+ "epoch": 2.423469387755102,
1354
+ "grad_norm": 234761.484375,
1355
+ "learning_rate": 9.470663265306124e-06,
1356
+ "loss": 1.5812,
1357
+ "step": 950
1358
+ },
1359
+ {
1360
+ "epoch": 2.436224489795918,
1361
+ "grad_norm": 184700.15625,
1362
+ "learning_rate": 9.454719387755103e-06,
1363
+ "loss": 1.5293,
1364
+ "step": 955
1365
+ },
1366
+ {
1367
+ "epoch": 2.4489795918367347,
1368
+ "grad_norm": 189192.515625,
1369
+ "learning_rate": 9.438775510204082e-06,
1370
+ "loss": 1.5867,
1371
+ "step": 960
1372
+ },
1373
+ {
1374
+ "epoch": 2.461734693877551,
1375
+ "grad_norm": 123937.0078125,
1376
+ "learning_rate": 9.422831632653063e-06,
1377
+ "loss": 1.692,
1378
+ "step": 965
1379
+ },
1380
+ {
1381
+ "epoch": 2.4744897959183674,
1382
+ "grad_norm": 215251.9375,
1383
+ "learning_rate": 9.406887755102041e-06,
1384
+ "loss": 1.6901,
1385
+ "step": 970
1386
+ },
1387
+ {
1388
+ "epoch": 2.487244897959184,
1389
+ "grad_norm": 176675.5625,
1390
+ "learning_rate": 9.39094387755102e-06,
1391
+ "loss": 1.59,
1392
+ "step": 975
1393
+ },
1394
+ {
1395
+ "epoch": 2.5,
1396
+ "grad_norm": 134754.96875,
1397
+ "learning_rate": 9.375000000000001e-06,
1398
+ "loss": 1.5592,
1399
+ "step": 980
1400
+ },
1401
+ {
1402
+ "epoch": 2.512755102040816,
1403
+ "grad_norm": 103404.9921875,
1404
+ "learning_rate": 9.359056122448981e-06,
1405
+ "loss": 1.6001,
1406
+ "step": 985
1407
+ },
1408
+ {
1409
+ "epoch": 2.5255102040816326,
1410
+ "grad_norm": 147972.421875,
1411
+ "learning_rate": 9.343112244897959e-06,
1412
+ "loss": 1.6028,
1413
+ "step": 990
1414
+ },
1415
+ {
1416
+ "epoch": 2.538265306122449,
1417
+ "grad_norm": 152136.828125,
1418
+ "learning_rate": 9.327168367346939e-06,
1419
+ "loss": 1.6255,
1420
+ "step": 995
1421
+ },
1422
+ {
1423
+ "epoch": 2.5510204081632653,
1424
+ "grad_norm": 118834.9609375,
1425
+ "learning_rate": 9.31122448979592e-06,
1426
+ "loss": 1.6274,
1427
+ "step": 1000
1428
+ },
1429
+ {
1430
+ "epoch": 2.563775510204082,
1431
+ "grad_norm": 127266.28125,
1432
+ "learning_rate": 9.295280612244899e-06,
1433
+ "loss": 1.6929,
1434
+ "step": 1005
1435
+ },
1436
+ {
1437
+ "epoch": 2.576530612244898,
1438
+ "grad_norm": 131886.953125,
1439
+ "learning_rate": 9.279336734693877e-06,
1440
+ "loss": 1.5158,
1441
+ "step": 1010
1442
+ },
1443
+ {
1444
+ "epoch": 2.5892857142857144,
1445
+ "grad_norm": 221068.84375,
1446
+ "learning_rate": 9.263392857142858e-06,
1447
+ "loss": 1.6322,
1448
+ "step": 1015
1449
+ },
1450
+ {
1451
+ "epoch": 2.6020408163265305,
1452
+ "grad_norm": 260819.6875,
1453
+ "learning_rate": 9.247448979591837e-06,
1454
+ "loss": 1.6103,
1455
+ "step": 1020
1456
+ },
1457
+ {
1458
+ "epoch": 2.614795918367347,
1459
+ "grad_norm": 106676.7421875,
1460
+ "learning_rate": 9.231505102040817e-06,
1461
+ "loss": 1.5057,
1462
+ "step": 1025
1463
+ },
1464
+ {
1465
+ "epoch": 2.627551020408163,
1466
+ "grad_norm": 217982.078125,
1467
+ "learning_rate": 9.215561224489796e-06,
1468
+ "loss": 1.6204,
1469
+ "step": 1030
1470
+ },
1471
+ {
1472
+ "epoch": 2.6403061224489797,
1473
+ "grad_norm": 110120.5390625,
1474
+ "learning_rate": 9.199617346938777e-06,
1475
+ "loss": 1.5733,
1476
+ "step": 1035
1477
+ },
1478
+ {
1479
+ "epoch": 2.6530612244897958,
1480
+ "grad_norm": 80831.828125,
1481
+ "learning_rate": 9.183673469387756e-06,
1482
+ "loss": 1.569,
1483
+ "step": 1040
1484
+ },
1485
+ {
1486
+ "epoch": 2.6658163265306123,
1487
+ "grad_norm": 181651.515625,
1488
+ "learning_rate": 9.167729591836736e-06,
1489
+ "loss": 1.6483,
1490
+ "step": 1045
1491
+ },
1492
+ {
1493
+ "epoch": 2.678571428571429,
1494
+ "grad_norm": 272886.125,
1495
+ "learning_rate": 9.151785714285715e-06,
1496
+ "loss": 1.4335,
1497
+ "step": 1050
1498
+ },
1499
+ {
1500
+ "epoch": 2.691326530612245,
1501
+ "grad_norm": 108633.09375,
1502
+ "learning_rate": 9.135841836734694e-06,
1503
+ "loss": 1.5432,
1504
+ "step": 1055
1505
+ },
1506
+ {
1507
+ "epoch": 2.704081632653061,
1508
+ "grad_norm": 165070.234375,
1509
+ "learning_rate": 9.119897959183674e-06,
1510
+ "loss": 1.6833,
1511
+ "step": 1060
1512
+ },
1513
+ {
1514
+ "epoch": 2.7168367346938775,
1515
+ "grad_norm": 165150.328125,
1516
+ "learning_rate": 9.103954081632653e-06,
1517
+ "loss": 1.6997,
1518
+ "step": 1065
1519
+ },
1520
+ {
1521
+ "epoch": 2.729591836734694,
1522
+ "grad_norm": 97369.4765625,
1523
+ "learning_rate": 9.088010204081634e-06,
1524
+ "loss": 1.638,
1525
+ "step": 1070
1526
+ },
1527
+ {
1528
+ "epoch": 2.74234693877551,
1529
+ "grad_norm": 175838.71875,
1530
+ "learning_rate": 9.072066326530613e-06,
1531
+ "loss": 1.7033,
1532
+ "step": 1075
1533
+ },
1534
+ {
1535
+ "epoch": 2.7551020408163263,
1536
+ "grad_norm": 162969.171875,
1537
+ "learning_rate": 9.056122448979593e-06,
1538
+ "loss": 1.5911,
1539
+ "step": 1080
1540
+ },
1541
+ {
1542
+ "epoch": 2.767857142857143,
1543
+ "grad_norm": 189243.03125,
1544
+ "learning_rate": 9.040178571428572e-06,
1545
+ "loss": 1.4564,
1546
+ "step": 1085
1547
+ },
1548
+ {
1549
+ "epoch": 2.7806122448979593,
1550
+ "grad_norm": 191947.484375,
1551
+ "learning_rate": 9.024234693877551e-06,
1552
+ "loss": 1.7732,
1553
+ "step": 1090
1554
+ },
1555
+ {
1556
+ "epoch": 2.7933673469387754,
1557
+ "grad_norm": 143765.234375,
1558
+ "learning_rate": 9.008290816326532e-06,
1559
+ "loss": 1.5865,
1560
+ "step": 1095
1561
+ },
1562
+ {
1563
+ "epoch": 2.806122448979592,
1564
+ "grad_norm": 193076.421875,
1565
+ "learning_rate": 8.992346938775512e-06,
1566
+ "loss": 1.3999,
1567
+ "step": 1100
1568
+ },
1569
+ {
1570
+ "epoch": 2.818877551020408,
1571
+ "grad_norm": 116660.359375,
1572
+ "learning_rate": 8.97640306122449e-06,
1573
+ "loss": 1.5108,
1574
+ "step": 1105
1575
+ },
1576
+ {
1577
+ "epoch": 2.8316326530612246,
1578
+ "grad_norm": 130521.6484375,
1579
+ "learning_rate": 8.96045918367347e-06,
1580
+ "loss": 1.5463,
1581
+ "step": 1110
1582
+ },
1583
+ {
1584
+ "epoch": 2.8443877551020407,
1585
+ "grad_norm": 122570.4609375,
1586
+ "learning_rate": 8.94451530612245e-06,
1587
+ "loss": 1.5438,
1588
+ "step": 1115
1589
+ },
1590
+ {
1591
+ "epoch": 2.857142857142857,
1592
+ "grad_norm": 162270.4375,
1593
+ "learning_rate": 8.92857142857143e-06,
1594
+ "loss": 1.4326,
1595
+ "step": 1120
1596
+ },
1597
+ {
1598
+ "epoch": 2.8698979591836737,
1599
+ "grad_norm": 98064.515625,
1600
+ "learning_rate": 8.912627551020408e-06,
1601
+ "loss": 1.5871,
1602
+ "step": 1125
1603
+ },
1604
+ {
1605
+ "epoch": 2.88265306122449,
1606
+ "grad_norm": 211316.21875,
1607
+ "learning_rate": 8.896683673469389e-06,
1608
+ "loss": 1.4914,
1609
+ "step": 1130
1610
+ },
1611
+ {
1612
+ "epoch": 2.895408163265306,
1613
+ "grad_norm": 156801.890625,
1614
+ "learning_rate": 8.880739795918368e-06,
1615
+ "loss": 1.6379,
1616
+ "step": 1135
1617
+ },
1618
+ {
1619
+ "epoch": 2.9081632653061225,
1620
+ "grad_norm": 142554.421875,
1621
+ "learning_rate": 8.864795918367348e-06,
1622
+ "loss": 1.604,
1623
+ "step": 1140
1624
+ },
1625
+ {
1626
+ "epoch": 2.920918367346939,
1627
+ "grad_norm": 173248.734375,
1628
+ "learning_rate": 8.848852040816327e-06,
1629
+ "loss": 1.6187,
1630
+ "step": 1145
1631
+ },
1632
+ {
1633
+ "epoch": 2.933673469387755,
1634
+ "grad_norm": 108233.78125,
1635
+ "learning_rate": 8.832908163265307e-06,
1636
+ "loss": 1.5321,
1637
+ "step": 1150
1638
+ },
1639
+ {
1640
+ "epoch": 2.946428571428571,
1641
+ "grad_norm": 190112.34375,
1642
+ "learning_rate": 8.816964285714286e-06,
1643
+ "loss": 1.584,
1644
+ "step": 1155
1645
+ },
1646
+ {
1647
+ "epoch": 2.9591836734693877,
1648
+ "grad_norm": 244386.828125,
1649
+ "learning_rate": 8.801020408163265e-06,
1650
+ "loss": 1.5631,
1651
+ "step": 1160
1652
+ },
1653
+ {
1654
+ "epoch": 2.9719387755102042,
1655
+ "grad_norm": 169686.921875,
1656
+ "learning_rate": 8.785076530612246e-06,
1657
+ "loss": 1.5537,
1658
+ "step": 1165
1659
+ },
1660
+ {
1661
+ "epoch": 2.9846938775510203,
1662
+ "grad_norm": 162361.109375,
1663
+ "learning_rate": 8.769132653061225e-06,
1664
+ "loss": 1.4138,
1665
+ "step": 1170
1666
+ },
1667
+ {
1668
+ "epoch": 2.997448979591837,
1669
+ "grad_norm": 203224.546875,
1670
+ "learning_rate": 8.753188775510205e-06,
1671
+ "loss": 1.4787,
1672
+ "step": 1175
1673
+ },
1674
+ {
1675
+ "epoch": 3.0,
1676
+ "eval_accuracy": 0.35918039373242266,
1677
+ "eval_loss": 1.7329986095428467,
1678
+ "eval_runtime": 304.5388,
1679
+ "eval_samples_per_second": 16.346,
1680
+ "eval_steps_per_second": 0.545,
1681
+ "step": 1176
1682
+ }
1683
+ ],
1684
+ "logging_steps": 5,
1685
+ "max_steps": 3920,
1686
+ "num_input_tokens_seen": 0,
1687
+ "num_train_epochs": 10,
1688
+ "save_steps": 500,
1689
+ "stateful_callbacks": {
1690
+ "TrainerControl": {
1691
+ "args": {
1692
+ "should_epoch_stop": false,
1693
+ "should_evaluate": false,
1694
+ "should_log": false,
1695
+ "should_save": true,
1696
+ "should_training_stop": false
1697
+ },
1698
+ "attributes": {}
1699
+ }
1700
+ },
1701
+ "total_flos": 4.49037208009728e+18,
1702
+ "train_batch_size": 30,
1703
+ "trial_name": null,
1704
+ "trial_params": null
1705
+ }
checkpoint-1176/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62d4afa24b2e5344ef9e2ace3afc7237855a36ba40c6701cd39cbf7db81b4cdc
3
+ size 5176