Lauler committed on
Commit
6691a94
·
verified ·
1 Parent(s): bcd76ae

Add files using upload-large-folder tool

Browse files
config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/leonardo_work/EUHPC_A02_045/models/ltg_norbert3-base",
3
+ "architectures": [
4
+ "NorbertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_norbert.NorbertConfig",
9
+ "AutoModel": "ltg/norbert3-base--modeling_norbert.NorbertModel",
10
+ "AutoModelForMaskedLM": "ltg/norbert3-base--modeling_norbert.NorbertForMaskedLM",
11
+ "AutoModelForMultipleChoice": "ltg/norbert3-base--modeling_norbert.NorbertForMultipleChoice",
12
+ "AutoModelForQuestionAnswering": "ltg/norbert3-base--modeling_norbert.NorbertForQuestionAnswering",
13
+ "AutoModelForSequenceClassification": "modeling_norbert.NorbertForSequenceClassification",
14
+ "AutoModelForTokenClassification": "ltg/norbert3-base--modeling_norbert.NorbertForTokenClassification"
15
+ },
16
+ "hidden_dropout_prob": 0.0,
17
+ "hidden_size": 768,
18
+ "id2label": {
19
+ "0": "LABEL_0"
20
+ },
21
+ "intermediate_size": 2048,
22
+ "label2id": {
23
+ "LABEL_0": 0
24
+ },
25
+ "layer_norm_eps": 1e-07,
26
+ "max_position_embeddings": 512,
27
+ "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "output_all_encoded_layers": true,
30
+ "position_bucket_size": 32,
31
+ "problem_type": "regression",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.44.2",
34
+ "vocab_size": 50000
35
+ }
configuration_norbert.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+
3
+
4
class NorbertConfig(PretrainedConfig):
    """Configuration object holding the hyper-parameters of a `NorbertModel`.

    Each named argument is stored unchanged as an attribute of the same
    name. Any remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        vocab_size=50000,
        attention_probs_dropout_prob=0.1,
        hidden_dropout_prob=0.1,
        hidden_size=768,
        intermediate_size=2048,
        max_position_embeddings=512,
        position_bucket_size=32,
        num_attention_heads=12,
        num_hidden_layers=12,
        layer_norm_eps=1.0e-7,
        output_all_encoded_layers=True,
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Vocabulary and layer sizes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size

        # Dropout probabilities.
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        # Positional settings, output flag and numerical stability.
        self.max_position_embeddings = max_position_embeddings
        self.output_all_encoded_layers = output_all_encoded_layers
        self.position_bucket_size = position_bucket_size
        self.layer_norm_eps = layer_norm_eps
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8219c3afc3f9d4c8eff51823d770f2ec916a96a845c9c3596e900dce45f56ed0
3
+ size 496139908
modeling_norbert.py ADDED
@@ -0,0 +1,635 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch.utils import checkpoint
8
+
9
+ from .configuration_norbert import NorbertConfig
10
+ from transformers.modeling_utils import PreTrainedModel
11
+ from transformers.activations import gelu_new
12
+ from transformers.modeling_outputs import (
13
+ MaskedLMOutput,
14
+ MultipleChoiceModelOutput,
15
+ QuestionAnsweringModelOutput,
16
+ SequenceClassifierOutput,
17
+ TokenClassifierOutput,
18
+ BaseModelOutput
19
+ )
20
+ from transformers.pytorch_utils import softmax_backward_data
21
+
22
+
23
class Encoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` `EncoderLayer` modules."""

    def __init__(self, config, activation_checkpointing=False):
        super().__init__()
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_hidden_layers)])

        # Scale both feed-forward projection matrices of the layer at
        # 1-based depth d by sqrt(1 / (2 * d)).
        for depth, layer in enumerate(self.layers, start=1):
            shrink = math.sqrt(1.0 / (2.0 * depth))
            layer.mlp.mlp[1].weight.data *= shrink
            layer.mlp.mlp[-2].weight.data *= shrink

        self.activation_checkpointing = activation_checkpointing

    def forward(self, hidden_states, attention_mask, relative_embedding):
        """Run every layer in order.

        Returns:
            A pair ``(states, probs)``: ``states`` is a list starting with
            the input followed by the output of each layer, and ``probs``
            holds the attention probabilities returned by each layer.
        """
        all_states = [hidden_states]
        all_probs = []

        for layer in self.layers:
            layer_input = all_states[-1]
            if self.activation_checkpointing:
                layer_output, layer_probs = checkpoint.checkpoint(layer, layer_input, attention_mask, relative_embedding)
            else:
                layer_output, layer_probs = layer(layer_input, attention_mask, relative_embedding)

            all_states.append(layer_output)
            all_probs.append(layer_probs)

        return all_states, all_probs
47
+
48
+
49
class MaskClassifier(nn.Module):
    """Head that maps hidden states to one score per row of ``subword_embedding``."""

    def __init__(self, config, subword_embedding):
        super().__init__()
        # Only the shape of the embedding matrix is used here.
        num_subwords = subword_embedding.size(0)
        embedding_dim = subword_embedding.size(1)

        stages = [
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Dropout(config.hidden_dropout_prob),
            nn.Linear(embedding_dim, num_subwords),
        ]
        self.nonlinearity = nn.Sequential(*stages)

    def forward(self, x, masked_lm_labels=None):
        """Score ``x``.

        When ``masked_lm_labels`` is given, the first two dimensions of ``x``
        are flattened and only the rows whose label differs from ``-100``
        are scored.
        """
        if masked_lm_labels is None:
            return self.nonlinearity(x)

        selected_rows = torch.nonzero(masked_lm_labels.flatten() != -100).squeeze()
        selected = torch.index_select(x.flatten(0, 1), 0, selected_rows)
        return self.nonlinearity(selected)
66
+
67
+
68
class EncoderLayer(nn.Module):
    """One encoder layer: attention, then feed-forward, each added residually."""

    def __init__(self, config):
        super().__init__()
        self.attention = Attention(config)
        self.mlp = FeedForward(config)

    def forward(self, x, padding_mask, relative_embedding):
        """Return ``(output, attention_probs)`` for input ``x``."""
        attended, attention_probs = self.attention(x, padding_mask, relative_embedding)
        after_attention = x + attended
        output = after_attention + self.mlp(after_attention)
        return output, attention_probs
79
+
80
+
81
class GeGLU(nn.Module):
    """Gated activation: split the last dimension in two halves and gate one with the other."""

    def forward(self, x):
        """Return ``first_half * gelu_new(second_half)``; the last dimension is halved."""
        value, gate = x.chunk(2, dim=-1)
        return value * gelu_new(gate)
86
+
87
+
88
class FeedForward(nn.Module):
    """Feed-forward sub-layer: norm, widening projection, GeGLU, norm, narrowing projection, dropout."""

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        eps = config.layer_norm_eps

        self.mlp = nn.Sequential(
            nn.LayerNorm(hidden, eps=eps, elementwise_affine=False),
            # Twice the intermediate width: GeGLU halves the last dimension again.
            nn.Linear(hidden, 2*inner, bias=False),
            GeGLU(),
            nn.LayerNorm(inner, eps=eps, elementwise_affine=False),
            nn.Linear(inner, hidden, bias=False),
            nn.Dropout(config.hidden_dropout_prob)
        )

    def forward(self, x):
        """Apply the sequential stack to ``x``."""
        return self.mlp(x)
102
+
103
+
104
class MaskedSoftmax(torch.autograd.Function):
    """Softmax that ignores positions where ``mask`` is true.

    NOTE: ``forward`` overwrites the masked entries of its input tensor in
    place with ``-inf``.
    """

    @staticmethod
    def forward(ctx, x, mask, dim):
        """Return the softmax of ``x`` along ``dim`` with masked entries set to zero."""
        ctx.dim = dim
        # In-place fill: the caller's tensor is modified.
        x.masked_fill_(mask, float('-inf'))
        probs = torch.softmax(x, ctx.dim)
        # Zero the masked slots (this also replaces the NaNs produced by
        # slices that are masked everywhere).
        probs.masked_fill_(mask, 0.0)
        ctx.save_for_backward(probs)
        return probs

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``x``; ``mask`` and ``dim`` receive none."""
        (output,) = ctx.saved_tensors
        input_grad = softmax_backward_data(ctx, grad_output, output, ctx.dim, output)
        return input_grad, None, None
119
+
120
+
121
class Attention(nn.Module):
    """Multi-head self-attention with bucketed relative-position scores.

    The raw score between two positions is the sum of three terms
    (content-to-content, content-to-position and position-to-content), each
    scaled by ``1 / sqrt(3 * head_size)``. Inputs are laid out as
    ``[seq_len, batch, hidden]``.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}")

        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_size = config.hidden_size // config.num_attention_heads

        self.in_proj_qk = nn.Linear(config.hidden_size, 2*config.hidden_size, bias=True)
        self.in_proj_v = nn.Linear(config.hidden_size, config.hidden_size, bias=True)
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=True)

        self.pre_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False)
        self.post_layer_norm = nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=True)

        # Non-persistent buffer: moves with the module but is not saved in the state dict.
        position_indices = self._build_position_indices(config.max_position_embeddings)
        self.register_buffer("position_indices", position_indices, persistent=False)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.scale = 1.0 / math.sqrt(3 * self.head_size)

    def _build_position_indices(self, length):
        """Return the ``[length, length]`` table of bucketed relative positions.

        Entry ``(i, j)`` is the bucket of ``i - j`` offset by
        ``position_bucket_size - 1``. The bucketing always uses
        ``config.max_position_embeddings``, so a table rebuilt for a longer
        sequence agrees with the one built at construction time.
        """
        positions = torch.arange(length, dtype=torch.long)
        relative = positions.unsqueeze(1) - positions.unsqueeze(0)
        buckets = self.make_log_bucket_position(
            relative, self.config.position_bucket_size, self.config.max_position_embeddings
        )
        return self.config.position_bucket_size - 1 + buckets

    def make_log_bucket_position(self, relative_pos, bucket_size, max_position):
        """Map signed relative positions to signed bucket ids.

        Offsets whose magnitude is at most ``bucket_size // 2`` are kept as
        they are; larger magnitudes are clamped to ``max_position - 1`` and
        mapped to logarithmically spaced buckets that keep the sign.
        """
        sign = torch.sign(relative_pos)
        mid = bucket_size // 2
        abs_pos = torch.where((relative_pos < mid) & (relative_pos > -mid), mid - 1, torch.abs(relative_pos).clamp(max=max_position - 1))
        log_pos = torch.ceil(torch.log(abs_pos / mid) / math.log((max_position-1) / mid) * (mid - 1)).int() + mid
        bucket_pos = torch.where(abs_pos <= mid, relative_pos, log_pos * sign).long()
        return bucket_pos

    def compute_attention_scores(self, hidden_states, relative_embedding):
        """Return ``(scores, value)`` for ``hidden_states`` of shape ``[T, B, D]``.

        ``scores`` has shape ``[B, heads, T, T]`` and ``value`` has shape
        ``[B * heads, T, head_size]``.
        """
        key_len, batch_size, _ = hidden_states.size()
        query_len = key_len

        # Grow the cached table when the sequence is longer than what is stored.
        if self.position_indices.size(0) < query_len:
            self.position_indices = self._build_position_indices(query_len).to(hidden_states.device)

        hidden_states = self.pre_layer_norm(hidden_states)

        query, key = self.in_proj_qk(hidden_states).chunk(2, dim=2)  # shape: [T, B, D]
        value = self.in_proj_v(hidden_states)  # shape: [T, B, D]

        query = query.reshape(query_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
        key = key.reshape(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)
        value = value.view(key_len, batch_size * self.num_heads, self.head_size).transpose(0, 1)

        # Content-to-content term.
        attention_scores = torch.bmm(query, key.transpose(1, 2) * self.scale)

        # The relative embeddings go through the same query/key projection.
        pos = self.in_proj_qk(self.dropout(relative_embedding))  # shape: [2T-1, 2D]
        query_pos, key_pos = pos.view(-1, self.num_heads, 2*self.head_size).chunk(2, dim=2)
        query = query.view(batch_size, self.num_heads, query_len, self.head_size)
        key = key.view(batch_size, self.num_heads, query_len, self.head_size)

        # Content-to-position and position-to-content terms, one score per bucket.
        attention_c_p = torch.einsum("bhqd,khd->bhqk", query, key_pos.squeeze(1) * self.scale)
        attention_p_c = torch.einsum("bhkd,qhd->bhqk", key * self.scale, query_pos.squeeze(1))

        # Pick, for every (query, key) pair, the score of its bucket.
        position_indices = self.position_indices[:query_len, :key_len].expand(batch_size, self.num_heads, -1, -1)
        attention_c_p = attention_c_p.gather(3, position_indices)
        attention_p_c = attention_p_c.gather(2, position_indices)

        attention_scores = attention_scores.view(batch_size, self.num_heads, query_len, key_len)
        attention_scores.add_(attention_c_p)
        attention_scores.add_(attention_p_c)

        return attention_scores, value

    def compute_output(self, attention_probs, value):
        """Combine ``value`` with ``attention_probs``, then project, normalise and apply dropout."""
        attention_probs = self.dropout(attention_probs)
        context = torch.bmm(attention_probs.flatten(0, 1), value)  # shape: [B*H, Q, D]
        context = context.transpose(0, 1).reshape(context.size(1), -1, self.hidden_size)  # shape: [Q, B, H*D]
        context = self.out_proj(context)
        context = self.post_layer_norm(context)
        context = self.dropout(context)
        return context

    def forward(self, hidden_states, attention_mask, relative_embedding):
        """Return ``(context, attention_probs)``; the probabilities are detached."""
        attention_scores, value = self.compute_attention_scores(hidden_states, relative_embedding)
        attention_probs = MaskedSoftmax.apply(attention_scores, attention_mask, -1)
        return self.compute_output(attention_probs, value), attention_probs.detach()
211
+
212
+
213
class Embedding(nn.Module):
    """Word embeddings plus a learned table of relative-position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.word_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.word_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps, elementwise_affine=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # One row per relative-position bucket; allocated uninitialised here.
        num_buckets = 2 * config.position_bucket_size - 1
        self.relative_embedding = nn.Parameter(torch.empty(num_buckets, config.hidden_size))
        self.relative_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, input_ids):
        """Return ``(word_embeddings, relative_embeddings)``.

        The word embeddings are normalised and passed through dropout; the
        relative-position table is normalised only.
        """
        looked_up = self.word_embedding(input_ids)
        word_embedding = self.dropout(self.word_layer_norm(looked_up))
        relative_embeddings = self.relative_layer_norm(self.relative_embedding)
        return word_embedding, relative_embeddings
229
+
230
+
231
+ #
232
+ # HuggingFace wrappers
233
+ #
234
+
235
class NorbertPreTrainedModel(PreTrainedModel):
    """Common base class wiring the Norbert modules into the `PreTrainedModel` machinery."""

    config_class = NorbertConfig
    base_model_prefix = "norbert3"
    supports_gradient_checkpointing = True

    def _set_gradient_checkpointing(self, module, value=False):
        """Toggle activation checkpointing on ``module`` if it is an `Encoder`."""
        if isinstance(module, Encoder):
            module.activation_checkpointing = value

    def _init_weights(self, module):
        """Initialise ``module`` in place.

        Linear and embedding weights are drawn from a normal distribution
        truncated at two standard deviations, with
        ``std = sqrt(2 / (5 * hidden_size))``; linear biases are zeroed.
        Layer norms are reset to weight one and bias zero when they have
        affine parameters.
        """
        # Read the size from the config so this also works on subclasses
        # that do not define a ``hidden_size`` attribute.
        std = math.sqrt(2.0 / (5.0 * self.config.hidden_size))

        if isinstance(module, nn.Linear):
            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std, a=-2*std, b=2*std)
        elif isinstance(module, nn.LayerNorm):
            # Layer norms built with elementwise_affine=False have neither
            # weight nor bias, so both must be checked before use.
            if module.bias is not None:
                module.bias.data.zero_()
            if module.weight is not None:
                module.weight.data.fill_(1.0)
256
+
257
+
258
class NorbertModel(NorbertPreTrainedModel):
    """Encoder without a task head (an MLM classifier is attached only on request)."""

    def __init__(self, config, add_mlm_layer=False, gradient_checkpointing=False, **kwargs):
        super().__init__(config, **kwargs)
        self.config = config
        self.hidden_size = config.hidden_size

        self.embedding = Embedding(config)
        self.transformer = Encoder(config, activation_checkpointing=gradient_checkpointing)
        self.classifier = MaskClassifier(config, self.embedding.word_embedding.weight) if add_mlm_layer else None

    def get_input_embeddings(self):
        """Return the word-embedding module."""
        return self.embedding.word_embedding

    def set_input_embeddings(self, value):
        """Replace the word-embedding module."""
        self.embedding.word_embedding = value

    def get_contextualized_embeddings(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]:
        """Encode ``input_ids`` of shape ``[batch, seq]``.

        Args:
            input_ids: Token ids; required.
            attention_mask: Optional ``[batch, seq]`` mask. It is converted
                to bool and inverted before use, so zero entries mark the
                positions handed to the encoder as masked. When omitted,
                nothing is masked.

        Returns:
            A triple ``(last_layer, per_layer, attention_probs)``.
            ``last_layer`` is the final encoder output. ``per_layer`` starts
            with the embedding output, followed by the difference between
            each layer's output and the previous one. ``attention_probs``
            holds one tensor per layer.

        Raises:
            ValueError: If ``input_ids`` is ``None``.
        """
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            raise ValueError("You have to specify input_ids")

        batch_size, seq_length = input_shape
        device = input_ids.device

        if attention_mask is None:
            attention_mask = torch.zeros(batch_size, seq_length, dtype=torch.bool, device=device)
        else:
            attention_mask = ~attention_mask.bool()
        # Add two singleton dimensions so the mask becomes [batch, 1, 1, seq].
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # The encoder works on [seq, batch, hidden]; transpose in and back out.
        static_embeddings, relative_embedding = self.embedding(input_ids.t())
        contextualized_embeddings, attention_probs = self.transformer(static_embeddings, attention_mask, relative_embedding)
        contextualized_embeddings = [e.transpose(0, 1) for e in contextualized_embeddings]
        last_layer = contextualized_embeddings[-1]
        contextualized_embeddings = [contextualized_embeddings[0]] + [
            contextualized_embeddings[i] - contextualized_embeddings[i - 1]
            for i in range(1, len(contextualized_embeddings))
        ]
        return last_layer, contextualized_embeddings, attention_probs

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], BaseModelOutput]:
        """Encode the input.

        ``token_type_ids``, ``position_ids`` and extra keyword arguments are
        accepted but not used. Hidden states and attentions are included
        only when the corresponding flag is truthy.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)

        if not return_dict:
            return (
                sequence_output,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )
330
+
331
+
332
class NorbertForMaskedLM(NorbertModel):
    """Encoder with the masked-language-modelling classifier attached."""

    _keys_to_ignore_on_load_unexpected = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=True, **kwargs)

    def get_output_embeddings(self):
        """Return the weight of the classifier's final linear layer."""
        output_layer = self.classifier.nonlinearity[-1]
        return output_layer.weight

    def set_output_embeddings(self, new_embeddings):
        """Replace the weight of the classifier's final linear layer."""
        output_layer = self.classifier.nonlinearity[-1]
        output_layer.weight = new_embeddings

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
        """Score every position over the vocabulary.

        When ``labels`` is given, the cross-entropy between the flattened
        scores and the flattened labels is returned as the loss.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        subword_prediction = self.classifier(sequence_output)
        # The first 107 vocabulary ids are never predicted.
        subword_prediction[:, :, :106+1] = float("-inf")

        masked_lm_loss = None
        if labels is not None:
            masked_lm_loss = F.cross_entropy(subword_prediction.flatten(0, 1), labels.flatten())

        if return_dict:
            return MaskedLMOutput(
                loss=masked_lm_loss,
                logits=subword_prediction,
                hidden_states=contextualized_embeddings if output_hidden_states else None,
                attentions=attention_probs if output_attentions else None
            )

        extras = []
        if output_hidden_states:
            extras.append(contextualized_embeddings)
        if output_attentions:
            extras.append(attention_probs)
        output = (subword_prediction, *extras)
        if masked_lm_loss is None:
            return output
        return (masked_lm_loss,) + output
380
+
381
+
382
class Classifier(nn.Module):
    """Task head producing ``num_labels`` scores per input vector."""

    def __init__(self, config, num_labels: int):
        super().__init__()

        # ``config.cls_dropout`` wins when present and not None; otherwise
        # the general hidden dropout probability is used.
        drop_out = getattr(config, "cls_dropout", None)
        if drop_out is None:
            drop_out = config.hidden_dropout_prob

        stages = [
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.GELU(),
            nn.LayerNorm(config.hidden_size, config.layer_norm_eps, elementwise_affine=False),
            nn.Dropout(drop_out),
            nn.Linear(config.hidden_size, num_labels),
        ]
        self.nonlinearity = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the head to ``x``."""
        return self.nonlinearity(x)
401
+
402
+
403
class NorbertForSequenceClassification(NorbertModel):
    """Encoder with a classification/regression head on the first sequence position."""

    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
        """Score each sequence.

        When ``labels`` is given, the loss depends on
        ``config.problem_type``, which is inferred from ``num_labels`` and
        the label dtype (and stored on the config) if it is unset.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        first_position = sequence_output[:, 0, :]
        logits = self.head(first_position)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                # Infer the task once and remember it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            problem_type = self.config.problem_type
            if problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=contextualized_embeddings if output_hidden_states else None,
                attentions=attention_probs if output_attentions else None
            )

        extras = []
        if output_hidden_states:
            extras.append(contextualized_embeddings)
        if output_attentions:
            extras.append(attention_probs)
        output = (logits, *extras)
        if loss is None:
            return output
        return (loss,) + output
467
+
468
+
469
class NorbertForTokenClassification(NorbertModel):
    """Encoder with a classification head applied to every position."""

    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        """Score every position; with ``labels``, also return the cross-entropy loss."""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        logits = self.head(sequence_output)

        loss = None
        if labels is not None:
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=contextualized_embeddings if output_hidden_states else None,
                attentions=attention_probs if output_attentions else None
            )

        extras = []
        if output_hidden_states:
            extras.append(contextualized_embeddings)
        if output_attentions:
            extras.append(attention_probs)
        output = (logits, *extras)
        if loss is None:
            return output
        return (loss,) + output
515
+
516
+
517
class NorbertForQuestionAnswering(NorbertModel):
    """Encoder with a head whose per-position outputs are split into start and end scores."""

    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = config.num_labels
        self.head = Classifier(config, self.num_labels)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        start_positions: Optional[torch.Tensor] = None,
        end_positions: Optional[torch.Tensor] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
        """Return start/end scores per position.

        When both ``start_positions`` and ``end_positions`` are given, the
        loss is the mean of the two cross-entropy losses.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(input_ids, attention_mask)
        logits = self.head(sequence_output)

        # The unpacking below requires the head to emit exactly two scores.
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Drop a trailing extra dimension (it can appear on multi-GPU runs).
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            # Targets outside the input are clamped to an index that the
            # loss is told to ignore.
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if return_dict:
            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=contextualized_embeddings if output_hidden_states else None,
                attentions=attention_probs if output_attentions else None
            )

        extras = []
        if output_hidden_states:
            extras.append(contextualized_embeddings)
        if output_attentions:
            extras.append(attention_probs)
        output = (start_logits, end_logits, *extras)
        if total_loss is None:
            return output
        return (total_loss,) + output
583
+
584
+
585
class NorbertForMultipleChoice(NorbertModel):
    """Encoder with a head that assigns one score to every (example, choice) pair."""

    _keys_to_ignore_on_load_unexpected = ["classifier"]
    _keys_to_ignore_on_load_missing = ["head"]

    def __init__(self, config, **kwargs):
        super().__init__(config, add_mlm_layer=False, **kwargs)

        self.num_labels = getattr(config, "num_labels", 2)
        # Each choice is scored by a single number; the number of choices
        # comes from the input shape, not from ``num_labels``.
        self.head = Classifier(config, 1)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
        """Score the choices of each example.

        Args:
            input_ids: Token ids of shape ``[batch, num_choices, seq]``.
            attention_mask: Optional mask with the same shape.
            labels: Optional index of the correct choice per example.

        Returns:
            Logits of shape ``[batch, num_choices]`` and, when ``labels`` is
            given, the cross-entropy loss over the choices.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1]

        # Fold the choice dimension into the batch dimension for encoding.
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

        sequence_output, contextualized_embeddings, attention_probs = self.get_contextualized_embeddings(flat_input_ids, flat_attention_mask)
        # Score only the first position of every flattened sequence so that
        # exactly one logit per (example, choice) remains.
        logits = self.head(sequence_output[:, 0, :])
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (
                reshaped_logits,
                *([contextualized_embeddings] if output_hidden_states else []),
                *([attention_probs] if output_attentions else [])
            )
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=contextualized_embeddings if output_hidden_states else None,
            attentions=attention_probs if output_attentions else None
        )
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144912b41c99ce11c9ff41ade8ce11e6432b0eedfae30a55a60152bdfb7a5c5d
3
+ size 4735250
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e78c2b2dfd67dd86ab81cb66717a46e57aba66c9b9e57f20766946e1e48703d2
3
+ size 14180
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a20310f80bd8c93b2778c2b9d92d7e31c911403d1bad2087e2913900421f86d1
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "[BOS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "[CLS]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "[EOS]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "[MASK]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "[SEP]",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,870 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[BOS]",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[EOS]",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "[MASK_0]",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "[MASK_1]",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "[MASK_2]",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "[MASK_3]",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "[MASK_4]",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "[MASK_5]",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "[MASK_6]",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "[MASK_7]",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "[MASK_8]",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "16": {
132
+ "content": "[MASK_9]",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "17": {
140
+ "content": "[MASK_10]",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "18": {
148
+ "content": "[MASK_11]",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "19": {
156
+ "content": "[MASK_12]",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20": {
164
+ "content": "[MASK_13]",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "21": {
172
+ "content": "[MASK_14]",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "22": {
180
+ "content": "[MASK_15]",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "23": {
188
+ "content": "[MASK_16]",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "24": {
196
+ "content": "[MASK_17]",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "25": {
204
+ "content": "[MASK_18]",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "26": {
212
+ "content": "[MASK_19]",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "27": {
220
+ "content": "[MASK_20]",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "28": {
228
+ "content": "[MASK_21]",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "29": {
236
+ "content": "[MASK_22]",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "30": {
244
+ "content": "[MASK_23]",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "31": {
252
+ "content": "[MASK_24]",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "32": {
260
+ "content": "[MASK_25]",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "33": {
268
+ "content": "[MASK_26]",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "34": {
276
+ "content": "[MASK_27]",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "35": {
284
+ "content": "[MASK_28]",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "36": {
292
+ "content": "[MASK_29]",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "37": {
300
+ "content": "[MASK_30]",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "38": {
308
+ "content": "[MASK_31]",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "39": {
316
+ "content": "[MASK_32]",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "40": {
324
+ "content": "[MASK_33]",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "41": {
332
+ "content": "[MASK_34]",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "42": {
340
+ "content": "[MASK_35]",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "43": {
348
+ "content": "[MASK_36]",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "44": {
356
+ "content": "[MASK_37]",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "45": {
364
+ "content": "[MASK_38]",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "46": {
372
+ "content": "[MASK_39]",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "47": {
380
+ "content": "[MASK_40]",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "48": {
388
+ "content": "[MASK_41]",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "49": {
396
+ "content": "[MASK_42]",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "50": {
404
+ "content": "[MASK_43]",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "51": {
412
+ "content": "[MASK_44]",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "52": {
420
+ "content": "[MASK_45]",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "53": {
428
+ "content": "[MASK_46]",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "54": {
436
+ "content": "[MASK_47]",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "55": {
444
+ "content": "[MASK_48]",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "56": {
452
+ "content": "[MASK_49]",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "57": {
460
+ "content": "[MASK_50]",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "58": {
468
+ "content": "[MASK_51]",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "59": {
476
+ "content": "[MASK_52]",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "60": {
484
+ "content": "[MASK_53]",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "61": {
492
+ "content": "[MASK_54]",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "62": {
500
+ "content": "[MASK_55]",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "63": {
508
+ "content": "[MASK_56]",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "64": {
516
+ "content": "[MASK_57]",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "65": {
524
+ "content": "[MASK_58]",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "66": {
532
+ "content": "[MASK_59]",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "67": {
540
+ "content": "[MASK_60]",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "68": {
548
+ "content": "[MASK_61]",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "69": {
556
+ "content": "[MASK_62]",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "70": {
564
+ "content": "[MASK_63]",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "71": {
572
+ "content": "[MASK_64]",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": "[MASK_65]",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": "[MASK_66]",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "74": {
596
+ "content": "[MASK_67]",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "75": {
604
+ "content": "[MASK_68]",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "76": {
612
+ "content": "[MASK_69]",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "77": {
620
+ "content": "[MASK_70]",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "78": {
628
+ "content": "[MASK_71]",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "79": {
636
+ "content": "[MASK_72]",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "80": {
644
+ "content": "[MASK_73]",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "81": {
652
+ "content": "[MASK_74]",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "82": {
660
+ "content": "[MASK_75]",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "83": {
668
+ "content": "[MASK_76]",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "84": {
676
+ "content": "[MASK_77]",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "85": {
684
+ "content": "[MASK_78]",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "86": {
692
+ "content": "[MASK_79]",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "87": {
700
+ "content": "[MASK_80]",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "88": {
708
+ "content": "[MASK_81]",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "89": {
716
+ "content": "[MASK_82]",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "90": {
724
+ "content": "[MASK_83]",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "91": {
732
+ "content": "[MASK_84]",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "92": {
740
+ "content": "[MASK_85]",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "93": {
748
+ "content": "[MASK_86]",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "94": {
756
+ "content": "[MASK_87]",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "95": {
764
+ "content": "[MASK_88]",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "96": {
772
+ "content": "[MASK_89]",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "97": {
780
+ "content": "[MASK_90]",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "98": {
788
+ "content": "[MASK_91]",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "99": {
796
+ "content": "[MASK_92]",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "100": {
804
+ "content": "[MASK_93]",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "101": {
812
+ "content": "[MASK_94]",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "102": {
820
+ "content": "[MASK_95]",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "103": {
828
+ "content": "[MASK_96]",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "104": {
836
+ "content": "[MASK_97]",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ },
843
+ "105": {
844
+ "content": "[MASK_98]",
845
+ "lstrip": false,
846
+ "normalized": false,
847
+ "rstrip": false,
848
+ "single_word": false,
849
+ "special": true
850
+ },
851
+ "106": {
852
+ "content": "[MASK_99]",
853
+ "lstrip": false,
854
+ "normalized": false,
855
+ "rstrip": false,
856
+ "single_word": false,
857
+ "special": true
858
+ }
859
+ },
860
+ "bos_token": "[BOS]",
861
+ "clean_up_tokenization_spaces": true,
862
+ "cls_token": "[CLS]",
863
+ "eos_token": "[EOS]",
864
+ "mask_token": "[MASK]",
865
+ "model_max_length": 512,
866
+ "pad_token": "[PAD]",
867
+ "sep_token": "[SEP]",
868
+ "tokenizer_class": "PreTrainedTokenizerFast",
869
+ "unk_token": "[UNK]"
870
+ }
trainer_state.json ADDED
@@ -0,0 +1,2833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.47654790915623974,
3
+ "best_model_checkpoint": "/leonardo_work/EUHPC_A02_045/scandinavian-lm/robin/fw-classifier-checkpoints-no-70b/checkpoint-33000",
4
+ "epoch": 19.976498237367803,
5
+ "eval_steps": 1000,
6
+ "global_step": 34000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.6101936630634727,
14
+ "eval_f1_macro": 0.12631889713244165,
15
+ "eval_loss": 4.240240097045898,
16
+ "eval_precision": 0.1016989438439121,
17
+ "eval_recall": 0.16666666666666666,
18
+ "eval_runtime": 571.773,
19
+ "eval_samples_per_second": 84.619,
20
+ "eval_steps_per_second": 0.661,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.05875440658049354,
25
+ "grad_norm": 4.152867794036865,
26
+ "learning_rate": 0.00029911868390129255,
27
+ "loss": 0.5817,
28
+ "step": 100
29
+ },
30
+ {
31
+ "epoch": 0.11750881316098707,
32
+ "grad_norm": 8.71384334564209,
33
+ "learning_rate": 0.0002982373678025852,
34
+ "loss": 0.3984,
35
+ "step": 200
36
+ },
37
+ {
38
+ "epoch": 0.1762632197414806,
39
+ "grad_norm": 13.754929542541504,
40
+ "learning_rate": 0.00029735605170387776,
41
+ "loss": 0.3821,
42
+ "step": 300
43
+ },
44
+ {
45
+ "epoch": 0.23501762632197415,
46
+ "grad_norm": 0.5506241917610168,
47
+ "learning_rate": 0.0002964747356051704,
48
+ "loss": 0.3869,
49
+ "step": 400
50
+ },
51
+ {
52
+ "epoch": 0.2937720329024677,
53
+ "grad_norm": 7.549045562744141,
54
+ "learning_rate": 0.00029559341950646296,
55
+ "loss": 0.346,
56
+ "step": 500
57
+ },
58
+ {
59
+ "epoch": 0.3525264394829612,
60
+ "grad_norm": 8.952414512634277,
61
+ "learning_rate": 0.00029471210340775554,
62
+ "loss": 0.3535,
63
+ "step": 600
64
+ },
65
+ {
66
+ "epoch": 0.4112808460634548,
67
+ "grad_norm": 23.95926856994629,
68
+ "learning_rate": 0.00029383078730904817,
69
+ "loss": 0.3638,
70
+ "step": 700
71
+ },
72
+ {
73
+ "epoch": 0.4700352526439483,
74
+ "grad_norm": 5.874898910522461,
75
+ "learning_rate": 0.00029294947121034074,
76
+ "loss": 0.3458,
77
+ "step": 800
78
+ },
79
+ {
80
+ "epoch": 0.5287896592244419,
81
+ "grad_norm": 1.2818429470062256,
82
+ "learning_rate": 0.0002920681551116333,
83
+ "loss": 0.3183,
84
+ "step": 900
85
+ },
86
+ {
87
+ "epoch": 0.5875440658049353,
88
+ "grad_norm": 11.916783332824707,
89
+ "learning_rate": 0.00029118683901292595,
90
+ "loss": 0.3415,
91
+ "step": 1000
92
+ },
93
+ {
94
+ "epoch": 0.5875440658049353,
95
+ "eval_accuracy": 0.7214517495814645,
96
+ "eval_f1_macro": 0.36894750243003943,
97
+ "eval_loss": 0.28744974732398987,
98
+ "eval_precision": 0.4977336949727908,
99
+ "eval_recall": 0.35626936254459146,
100
+ "eval_runtime": 546.9073,
101
+ "eval_samples_per_second": 88.467,
102
+ "eval_steps_per_second": 0.691,
103
+ "step": 1000
104
+ },
105
+ {
106
+ "epoch": 0.6462984723854289,
107
+ "grad_norm": 9.043974876403809,
108
+ "learning_rate": 0.0002903055229142185,
109
+ "loss": 0.3187,
110
+ "step": 1100
111
+ },
112
+ {
113
+ "epoch": 0.7050528789659224,
114
+ "grad_norm": 4.782003879547119,
115
+ "learning_rate": 0.00028942420681551115,
116
+ "loss": 0.3466,
117
+ "step": 1200
118
+ },
119
+ {
120
+ "epoch": 0.763807285546416,
121
+ "grad_norm": 18.947124481201172,
122
+ "learning_rate": 0.00028854289071680373,
123
+ "loss": 0.3365,
124
+ "step": 1300
125
+ },
126
+ {
127
+ "epoch": 0.8225616921269095,
128
+ "grad_norm": 3.7607452869415283,
129
+ "learning_rate": 0.00028766157461809636,
130
+ "loss": 0.3307,
131
+ "step": 1400
132
+ },
133
+ {
134
+ "epoch": 0.881316098707403,
135
+ "grad_norm": 7.354115009307861,
136
+ "learning_rate": 0.00028678025851938894,
137
+ "loss": 0.327,
138
+ "step": 1500
139
+ },
140
+ {
141
+ "epoch": 0.9400705052878966,
142
+ "grad_norm": 12.593878746032715,
143
+ "learning_rate": 0.00028589894242068157,
144
+ "loss": 0.3143,
145
+ "step": 1600
146
+ },
147
+ {
148
+ "epoch": 0.9988249118683902,
149
+ "grad_norm": 10.552128791809082,
150
+ "learning_rate": 0.00028501762632197414,
151
+ "loss": 0.3082,
152
+ "step": 1700
153
+ },
154
+ {
155
+ "epoch": 1.0575793184488838,
156
+ "grad_norm": 16.90986442565918,
157
+ "learning_rate": 0.0002841363102232667,
158
+ "loss": 0.3153,
159
+ "step": 1800
160
+ },
161
+ {
162
+ "epoch": 1.1163337250293772,
163
+ "grad_norm": 5.408417224884033,
164
+ "learning_rate": 0.00028325499412455935,
165
+ "loss": 0.3051,
166
+ "step": 1900
167
+ },
168
+ {
169
+ "epoch": 1.1750881316098707,
170
+ "grad_norm": 10.757403373718262,
171
+ "learning_rate": 0.0002823736780258519,
172
+ "loss": 0.2999,
173
+ "step": 2000
174
+ },
175
+ {
176
+ "epoch": 1.1750881316098707,
177
+ "eval_accuracy": 0.6296013062439286,
178
+ "eval_f1_macro": 0.39274972019560317,
179
+ "eval_loss": 0.34983837604522705,
180
+ "eval_precision": 0.4577861635371156,
181
+ "eval_recall": 0.40350488729829226,
182
+ "eval_runtime": 573.3711,
183
+ "eval_samples_per_second": 84.383,
184
+ "eval_steps_per_second": 0.659,
185
+ "step": 2000
186
+ },
187
+ {
188
+ "epoch": 1.2338425381903644,
189
+ "grad_norm": 1.7697181701660156,
190
+ "learning_rate": 0.0002814923619271445,
191
+ "loss": 0.3158,
192
+ "step": 2100
193
+ },
194
+ {
195
+ "epoch": 1.2925969447708578,
196
+ "grad_norm": 9.139455795288086,
197
+ "learning_rate": 0.0002806110458284371,
198
+ "loss": 0.2964,
199
+ "step": 2200
200
+ },
201
+ {
202
+ "epoch": 1.3513513513513513,
203
+ "grad_norm": 1.4484944343566895,
204
+ "learning_rate": 0.0002797297297297297,
205
+ "loss": 0.296,
206
+ "step": 2300
207
+ },
208
+ {
209
+ "epoch": 1.410105757931845,
210
+ "grad_norm": 1.3425700664520264,
211
+ "learning_rate": 0.0002788484136310223,
212
+ "loss": 0.2906,
213
+ "step": 2400
214
+ },
215
+ {
216
+ "epoch": 1.4688601645123385,
217
+ "grad_norm": 9.240585327148438,
218
+ "learning_rate": 0.0002779670975323149,
219
+ "loss": 0.2901,
220
+ "step": 2500
221
+ },
222
+ {
223
+ "epoch": 1.527614571092832,
224
+ "grad_norm": 8.707176208496094,
225
+ "learning_rate": 0.0002770857814336075,
226
+ "loss": 0.2903,
227
+ "step": 2600
228
+ },
229
+ {
230
+ "epoch": 1.5863689776733256,
231
+ "grad_norm": 3.509387969970703,
232
+ "learning_rate": 0.0002762044653349001,
233
+ "loss": 0.2811,
234
+ "step": 2700
235
+ },
236
+ {
237
+ "epoch": 1.6451233842538189,
238
+ "grad_norm": 3.0751891136169434,
239
+ "learning_rate": 0.0002753231492361927,
240
+ "loss": 0.2759,
241
+ "step": 2800
242
+ },
243
+ {
244
+ "epoch": 1.7038777908343126,
245
+ "grad_norm": 4.9634013175964355,
246
+ "learning_rate": 0.0002744418331374853,
247
+ "loss": 0.2842,
248
+ "step": 2900
249
+ },
250
+ {
251
+ "epoch": 1.7626321974148063,
252
+ "grad_norm": 10.635833740234375,
253
+ "learning_rate": 0.0002735605170387779,
254
+ "loss": 0.2909,
255
+ "step": 3000
256
+ },
257
+ {
258
+ "epoch": 1.7626321974148063,
259
+ "eval_accuracy": 0.7319306367939152,
260
+ "eval_f1_macro": 0.3785908786929529,
261
+ "eval_loss": 0.26183947920799255,
262
+ "eval_precision": 0.5055778316715768,
263
+ "eval_recall": 0.3727759846099492,
264
+ "eval_runtime": 546.7019,
265
+ "eval_samples_per_second": 88.5,
266
+ "eval_steps_per_second": 0.691,
267
+ "step": 3000
268
+ },
269
+ {
270
+ "epoch": 1.8213866039952995,
271
+ "grad_norm": 10.297226905822754,
272
+ "learning_rate": 0.0002726792009400705,
273
+ "loss": 0.2826,
274
+ "step": 3100
275
+ },
276
+ {
277
+ "epoch": 1.8801410105757932,
278
+ "grad_norm": 2.603403329849243,
279
+ "learning_rate": 0.0002717978848413631,
280
+ "loss": 0.2787,
281
+ "step": 3200
282
+ },
283
+ {
284
+ "epoch": 1.9388954171562869,
285
+ "grad_norm": 12.185776710510254,
286
+ "learning_rate": 0.0002709165687426557,
287
+ "loss": 0.276,
288
+ "step": 3300
289
+ },
290
+ {
291
+ "epoch": 1.9976498237367801,
292
+ "grad_norm": 4.290465354919434,
293
+ "learning_rate": 0.00027003525264394825,
294
+ "loss": 0.2846,
295
+ "step": 3400
296
+ },
297
+ {
298
+ "epoch": 2.056404230317274,
299
+ "grad_norm": 3.0092501640319824,
300
+ "learning_rate": 0.00026915393654524083,
301
+ "loss": 0.2805,
302
+ "step": 3500
303
+ },
304
+ {
305
+ "epoch": 2.1151586368977675,
306
+ "grad_norm": 4.7245893478393555,
307
+ "learning_rate": 0.00026827262044653346,
308
+ "loss": 0.27,
309
+ "step": 3600
310
+ },
311
+ {
312
+ "epoch": 2.1739130434782608,
313
+ "grad_norm": 9.71957778930664,
314
+ "learning_rate": 0.00026739130434782604,
315
+ "loss": 0.2722,
316
+ "step": 3700
317
+ },
318
+ {
319
+ "epoch": 2.2326674500587544,
320
+ "grad_norm": 2.2740237712860107,
321
+ "learning_rate": 0.00026650998824911867,
322
+ "loss": 0.2663,
323
+ "step": 3800
324
+ },
325
+ {
326
+ "epoch": 2.291421856639248,
327
+ "grad_norm": 1.9909628629684448,
328
+ "learning_rate": 0.00026562867215041124,
329
+ "loss": 0.266,
330
+ "step": 3900
331
+ },
332
+ {
333
+ "epoch": 2.3501762632197414,
334
+ "grad_norm": 5.9694366455078125,
335
+ "learning_rate": 0.00026474735605170387,
336
+ "loss": 0.2724,
337
+ "step": 4000
338
+ },
339
+ {
340
+ "epoch": 2.3501762632197414,
341
+ "eval_accuracy": 0.7637393299299341,
342
+ "eval_f1_macro": 0.4012224284754011,
343
+ "eval_loss": 0.24033646285533905,
344
+ "eval_precision": 0.5199015574271643,
345
+ "eval_recall": 0.37999768266709827,
346
+ "eval_runtime": 549.807,
347
+ "eval_samples_per_second": 88.0,
348
+ "eval_steps_per_second": 0.688,
349
+ "step": 4000
350
+ },
351
+ {
352
+ "epoch": 2.408930669800235,
353
+ "grad_norm": 0.8608851432800293,
354
+ "learning_rate": 0.00026386603995299645,
355
+ "loss": 0.2665,
356
+ "step": 4100
357
+ },
358
+ {
359
+ "epoch": 2.4676850763807288,
360
+ "grad_norm": 3.54764723777771,
361
+ "learning_rate": 0.0002629847238542891,
362
+ "loss": 0.262,
363
+ "step": 4200
364
+ },
365
+ {
366
+ "epoch": 2.526439482961222,
367
+ "grad_norm": 4.2886481285095215,
368
+ "learning_rate": 0.00026210340775558165,
369
+ "loss": 0.2651,
370
+ "step": 4300
371
+ },
372
+ {
373
+ "epoch": 2.5851938895417157,
374
+ "grad_norm": 9.533616065979004,
375
+ "learning_rate": 0.00026122209165687423,
376
+ "loss": 0.2705,
377
+ "step": 4400
378
+ },
379
+ {
380
+ "epoch": 2.6439482961222094,
381
+ "grad_norm": 1.6619293689727783,
382
+ "learning_rate": 0.00026034077555816686,
383
+ "loss": 0.2728,
384
+ "step": 4500
385
+ },
386
+ {
387
+ "epoch": 2.7027027027027026,
388
+ "grad_norm": 5.174167633056641,
389
+ "learning_rate": 0.00025945945945945944,
390
+ "loss": 0.265,
391
+ "step": 4600
392
+ },
393
+ {
394
+ "epoch": 2.7614571092831963,
395
+ "grad_norm": 3.4489777088165283,
396
+ "learning_rate": 0.000258578143360752,
397
+ "loss": 0.2608,
398
+ "step": 4700
399
+ },
400
+ {
401
+ "epoch": 2.82021151586369,
402
+ "grad_norm": 5.9784111976623535,
403
+ "learning_rate": 0.00025769682726204464,
404
+ "loss": 0.268,
405
+ "step": 4800
406
+ },
407
+ {
408
+ "epoch": 2.8789659224441833,
409
+ "grad_norm": 1.8021718263626099,
410
+ "learning_rate": 0.0002568155111633372,
411
+ "loss": 0.2604,
412
+ "step": 4900
413
+ },
414
+ {
415
+ "epoch": 2.937720329024677,
416
+ "grad_norm": 1.6711304187774658,
417
+ "learning_rate": 0.0002559341950646298,
418
+ "loss": 0.2673,
419
+ "step": 5000
420
+ },
421
+ {
422
+ "epoch": 2.937720329024677,
423
+ "eval_accuracy": 0.7662815451708245,
424
+ "eval_f1_macro": 0.42256323386502537,
425
+ "eval_loss": 0.23298443853855133,
426
+ "eval_precision": 0.4985552431790052,
427
+ "eval_recall": 0.4017059313674283,
428
+ "eval_runtime": 572.543,
429
+ "eval_samples_per_second": 84.505,
430
+ "eval_steps_per_second": 0.66,
431
+ "step": 5000
432
+ },
433
+ {
434
+ "epoch": 2.9964747356051706,
435
+ "grad_norm": 0.876846432685852,
436
+ "learning_rate": 0.0002550528789659224,
437
+ "loss": 0.2559,
438
+ "step": 5100
439
+ },
440
+ {
441
+ "epoch": 3.055229142185664,
442
+ "grad_norm": 2.291898250579834,
443
+ "learning_rate": 0.000254171562867215,
444
+ "loss": 0.2614,
445
+ "step": 5200
446
+ },
447
+ {
448
+ "epoch": 3.1139835487661576,
449
+ "grad_norm": 2.2618095874786377,
450
+ "learning_rate": 0.00025329024676850763,
451
+ "loss": 0.2578,
452
+ "step": 5300
453
+ },
454
+ {
455
+ "epoch": 3.172737955346651,
456
+ "grad_norm": 2.6534600257873535,
457
+ "learning_rate": 0.0002524089306698002,
458
+ "loss": 0.2568,
459
+ "step": 5400
460
+ },
461
+ {
462
+ "epoch": 3.2314923619271445,
463
+ "grad_norm": 3.1308279037475586,
464
+ "learning_rate": 0.00025152761457109283,
465
+ "loss": 0.2531,
466
+ "step": 5500
467
+ },
468
+ {
469
+ "epoch": 3.290246768507638,
470
+ "grad_norm": 2.781928300857544,
471
+ "learning_rate": 0.0002506462984723854,
472
+ "loss": 0.2526,
473
+ "step": 5600
474
+ },
475
+ {
476
+ "epoch": 3.3490011750881314,
477
+ "grad_norm": 3.065544366836548,
478
+ "learning_rate": 0.00024976498237367804,
479
+ "loss": 0.2654,
480
+ "step": 5700
481
+ },
482
+ {
483
+ "epoch": 3.407755581668625,
484
+ "grad_norm": 1.8798550367355347,
485
+ "learning_rate": 0.0002488836662749706,
486
+ "loss": 0.2504,
487
+ "step": 5800
488
+ },
489
+ {
490
+ "epoch": 3.466509988249119,
491
+ "grad_norm": 6.618080139160156,
492
+ "learning_rate": 0.0002480023501762632,
493
+ "loss": 0.2593,
494
+ "step": 5900
495
+ },
496
+ {
497
+ "epoch": 3.525264394829612,
498
+ "grad_norm": 3.1250927448272705,
499
+ "learning_rate": 0.00024712103407755577,
500
+ "loss": 0.2592,
501
+ "step": 6000
502
+ },
503
+ {
504
+ "epoch": 3.525264394829612,
505
+ "eval_accuracy": 0.7667155819192691,
506
+ "eval_f1_macro": 0.38853739645942603,
507
+ "eval_loss": 0.24237428605556488,
508
+ "eval_precision": 0.5286680026993323,
509
+ "eval_recall": 0.36739233287894374,
510
+ "eval_runtime": 551.8425,
511
+ "eval_samples_per_second": 87.675,
512
+ "eval_steps_per_second": 0.685,
513
+ "step": 6000
514
+ },
515
+ {
516
+ "epoch": 3.5840188014101058,
517
+ "grad_norm": 1.4983100891113281,
518
+ "learning_rate": 0.0002462397179788484,
519
+ "loss": 0.2492,
520
+ "step": 6100
521
+ },
522
+ {
523
+ "epoch": 3.6427732079905994,
524
+ "grad_norm": 7.058569431304932,
525
+ "learning_rate": 0.000245358401880141,
526
+ "loss": 0.2594,
527
+ "step": 6200
528
+ },
529
+ {
530
+ "epoch": 3.7015276145710927,
531
+ "grad_norm": 1.7073079347610474,
532
+ "learning_rate": 0.0002444770857814336,
533
+ "loss": 0.259,
534
+ "step": 6300
535
+ },
536
+ {
537
+ "epoch": 3.7602820211515864,
538
+ "grad_norm": 3.5612850189208984,
539
+ "learning_rate": 0.00024359576968272618,
540
+ "loss": 0.2499,
541
+ "step": 6400
542
+ },
543
+ {
544
+ "epoch": 3.8190364277320796,
545
+ "grad_norm": 3.4439921379089355,
546
+ "learning_rate": 0.00024271445358401875,
547
+ "loss": 0.2478,
548
+ "step": 6500
549
+ },
550
+ {
551
+ "epoch": 3.8777908343125733,
552
+ "grad_norm": 1.5288629531860352,
553
+ "learning_rate": 0.00024183313748531138,
554
+ "loss": 0.251,
555
+ "step": 6600
556
+ },
557
+ {
558
+ "epoch": 3.936545240893067,
559
+ "grad_norm": 4.820594787597656,
560
+ "learning_rate": 0.00024095182138660396,
561
+ "loss": 0.2556,
562
+ "step": 6700
563
+ },
564
+ {
565
+ "epoch": 3.9952996474735603,
566
+ "grad_norm": 2.041408061981201,
567
+ "learning_rate": 0.00024007050528789656,
568
+ "loss": 0.2559,
569
+ "step": 6800
570
+ },
571
+ {
572
+ "epoch": 4.054054054054054,
573
+ "grad_norm": 7.40335750579834,
574
+ "learning_rate": 0.00023918918918918917,
575
+ "loss": 0.252,
576
+ "step": 6900
577
+ },
578
+ {
579
+ "epoch": 4.112808460634548,
580
+ "grad_norm": 2.86159086227417,
581
+ "learning_rate": 0.00023830787309048177,
582
+ "loss": 0.2504,
583
+ "step": 7000
584
+ },
585
+ {
586
+ "epoch": 4.112808460634548,
587
+ "eval_accuracy": 0.7713866440691979,
588
+ "eval_f1_macro": 0.4170037789256203,
589
+ "eval_loss": 0.23320935666561127,
590
+ "eval_precision": 0.520598590788087,
591
+ "eval_recall": 0.39262170670987384,
592
+ "eval_runtime": 570.908,
593
+ "eval_samples_per_second": 84.747,
594
+ "eval_steps_per_second": 0.662,
595
+ "step": 7000
596
+ },
597
+ {
598
+ "epoch": 4.171562867215041,
599
+ "grad_norm": 4.3792805671691895,
600
+ "learning_rate": 0.00023742655699177434,
601
+ "loss": 0.2511,
602
+ "step": 7100
603
+ },
604
+ {
605
+ "epoch": 4.230317273795535,
606
+ "grad_norm": 1.2670930624008179,
607
+ "learning_rate": 0.00023654524089306697,
608
+ "loss": 0.2467,
609
+ "step": 7200
610
+ },
611
+ {
612
+ "epoch": 4.289071680376028,
613
+ "grad_norm": 4.641327381134033,
614
+ "learning_rate": 0.00023566392479435955,
615
+ "loss": 0.252,
616
+ "step": 7300
617
+ },
618
+ {
619
+ "epoch": 4.3478260869565215,
620
+ "grad_norm": 2.356194257736206,
621
+ "learning_rate": 0.00023478260869565215,
622
+ "loss": 0.2471,
623
+ "step": 7400
624
+ },
625
+ {
626
+ "epoch": 4.406580493537016,
627
+ "grad_norm": 3.5696866512298584,
628
+ "learning_rate": 0.00023390129259694476,
629
+ "loss": 0.2444,
630
+ "step": 7500
631
+ },
632
+ {
633
+ "epoch": 4.465334900117509,
634
+ "grad_norm": 8.092639923095703,
635
+ "learning_rate": 0.00023301997649823736,
636
+ "loss": 0.2458,
637
+ "step": 7600
638
+ },
639
+ {
640
+ "epoch": 4.524089306698002,
641
+ "grad_norm": 3.4449079036712646,
642
+ "learning_rate": 0.00023213866039952993,
643
+ "loss": 0.2512,
644
+ "step": 7700
645
+ },
646
+ {
647
+ "epoch": 4.582843713278496,
648
+ "grad_norm": 5.513228416442871,
649
+ "learning_rate": 0.00023125734430082256,
650
+ "loss": 0.2518,
651
+ "step": 7800
652
+ },
653
+ {
654
+ "epoch": 4.6415981198589895,
655
+ "grad_norm": 2.5132598876953125,
656
+ "learning_rate": 0.00023037602820211514,
657
+ "loss": 0.2519,
658
+ "step": 7900
659
+ },
660
+ {
661
+ "epoch": 4.700352526439483,
662
+ "grad_norm": 2.164031982421875,
663
+ "learning_rate": 0.00022949471210340774,
664
+ "loss": 0.2455,
665
+ "step": 8000
666
+ },
667
+ {
668
+ "epoch": 4.700352526439483,
669
+ "eval_accuracy": 0.7628505880164521,
670
+ "eval_f1_macro": 0.4485850425849249,
671
+ "eval_loss": 0.2332322597503662,
672
+ "eval_precision": 0.484949663618545,
673
+ "eval_recall": 0.4351442261907779,
674
+ "eval_runtime": 569.2371,
675
+ "eval_samples_per_second": 84.996,
676
+ "eval_steps_per_second": 0.664,
677
+ "step": 8000
678
+ },
679
+ {
680
+ "epoch": 4.759106933019977,
681
+ "grad_norm": 8.147943496704102,
682
+ "learning_rate": 0.00022861339600470035,
683
+ "loss": 0.2538,
684
+ "step": 8100
685
+ },
686
+ {
687
+ "epoch": 4.81786133960047,
688
+ "grad_norm": 3.1226038932800293,
689
+ "learning_rate": 0.00022773207990599292,
690
+ "loss": 0.2481,
691
+ "step": 8200
692
+ },
693
+ {
694
+ "epoch": 4.876615746180963,
695
+ "grad_norm": 1.5910353660583496,
696
+ "learning_rate": 0.00022685076380728553,
697
+ "loss": 0.2432,
698
+ "step": 8300
699
+ },
700
+ {
701
+ "epoch": 4.9353701527614575,
702
+ "grad_norm": 2.3687844276428223,
703
+ "learning_rate": 0.0002259694477085781,
704
+ "loss": 0.2464,
705
+ "step": 8400
706
+ },
707
+ {
708
+ "epoch": 4.994124559341951,
709
+ "grad_norm": 1.5202206373214722,
710
+ "learning_rate": 0.00022508813160987073,
711
+ "loss": 0.2491,
712
+ "step": 8500
713
+ },
714
+ {
715
+ "epoch": 5.052878965922444,
716
+ "grad_norm": 8.09229850769043,
717
+ "learning_rate": 0.0002242068155111633,
718
+ "loss": 0.245,
719
+ "step": 8600
720
+ },
721
+ {
722
+ "epoch": 5.111633372502938,
723
+ "grad_norm": 4.977721691131592,
724
+ "learning_rate": 0.0002233254994124559,
725
+ "loss": 0.2414,
726
+ "step": 8700
727
+ },
728
+ {
729
+ "epoch": 5.170387779083431,
730
+ "grad_norm": 2.690870523452759,
731
+ "learning_rate": 0.0002224441833137485,
732
+ "loss": 0.2437,
733
+ "step": 8800
734
+ },
735
+ {
736
+ "epoch": 5.229142185663925,
737
+ "grad_norm": 4.5524373054504395,
738
+ "learning_rate": 0.00022156286721504112,
739
+ "loss": 0.2476,
740
+ "step": 8900
741
+ },
742
+ {
743
+ "epoch": 5.287896592244419,
744
+ "grad_norm": 3.5273966789245605,
745
+ "learning_rate": 0.0002206815511163337,
746
+ "loss": 0.2424,
747
+ "step": 9000
748
+ },
749
+ {
750
+ "epoch": 5.287896592244419,
751
+ "eval_accuracy": 0.7704152284893454,
752
+ "eval_f1_macro": 0.40212235960322845,
753
+ "eval_loss": 0.22698020935058594,
754
+ "eval_precision": 0.5605989028870036,
755
+ "eval_recall": 0.3866441363925465,
756
+ "eval_runtime": 588.8368,
757
+ "eval_samples_per_second": 82.167,
758
+ "eval_steps_per_second": 0.642,
759
+ "step": 9000
760
+ },
761
+ {
762
+ "epoch": 5.346650998824912,
763
+ "grad_norm": 4.150397777557373,
764
+ "learning_rate": 0.00021980023501762632,
765
+ "loss": 0.2476,
766
+ "step": 9100
767
+ },
768
+ {
769
+ "epoch": 5.405405405405405,
770
+ "grad_norm": 1.5151199102401733,
771
+ "learning_rate": 0.0002189189189189189,
772
+ "loss": 0.2458,
773
+ "step": 9200
774
+ },
775
+ {
776
+ "epoch": 5.464159811985899,
777
+ "grad_norm": 2.2463040351867676,
778
+ "learning_rate": 0.0002180376028202115,
779
+ "loss": 0.2388,
780
+ "step": 9300
781
+ },
782
+ {
783
+ "epoch": 5.522914218566393,
784
+ "grad_norm": 2.767045259475708,
785
+ "learning_rate": 0.0002171562867215041,
786
+ "loss": 0.2433,
787
+ "step": 9400
788
+ },
789
+ {
790
+ "epoch": 5.581668625146886,
791
+ "grad_norm": 5.879153728485107,
792
+ "learning_rate": 0.0002162749706227967,
793
+ "loss": 0.2452,
794
+ "step": 9500
795
+ },
796
+ {
797
+ "epoch": 5.64042303172738,
798
+ "grad_norm": 4.529464244842529,
799
+ "learning_rate": 0.00021539365452408928,
800
+ "loss": 0.2437,
801
+ "step": 9600
802
+ },
803
+ {
804
+ "epoch": 5.699177438307873,
805
+ "grad_norm": 2.579648017883301,
806
+ "learning_rate": 0.00021451233842538186,
807
+ "loss": 0.2429,
808
+ "step": 9700
809
+ },
810
+ {
811
+ "epoch": 5.7579318448883665,
812
+ "grad_norm": 1.4765149354934692,
813
+ "learning_rate": 0.0002136310223266745,
814
+ "loss": 0.2347,
815
+ "step": 9800
816
+ },
817
+ {
818
+ "epoch": 5.816686251468861,
819
+ "grad_norm": 6.136841297149658,
820
+ "learning_rate": 0.00021274970622796706,
821
+ "loss": 0.2407,
822
+ "step": 9900
823
+ },
824
+ {
825
+ "epoch": 5.875440658049354,
826
+ "grad_norm": 5.470715045928955,
827
+ "learning_rate": 0.0002118683901292597,
828
+ "loss": 0.2476,
829
+ "step": 10000
830
+ },
831
+ {
832
+ "epoch": 5.875440658049354,
833
+ "eval_accuracy": 0.7542525267139284,
834
+ "eval_f1_macro": 0.4479732740452163,
835
+ "eval_loss": 0.2347133606672287,
836
+ "eval_precision": 0.495252873527299,
837
+ "eval_recall": 0.4323174862666415,
838
+ "eval_runtime": 555.2826,
839
+ "eval_samples_per_second": 87.132,
840
+ "eval_steps_per_second": 0.681,
841
+ "step": 10000
842
+ },
843
+ {
844
+ "epoch": 5.934195064629847,
845
+ "grad_norm": 2.95473575592041,
846
+ "learning_rate": 0.00021098707403055227,
847
+ "loss": 0.2443,
848
+ "step": 10100
849
+ },
850
+ {
851
+ "epoch": 5.992949471210341,
852
+ "grad_norm": 1.8928413391113281,
853
+ "learning_rate": 0.00021010575793184487,
854
+ "loss": 0.237,
855
+ "step": 10200
856
+ },
857
+ {
858
+ "epoch": 6.0517038777908345,
859
+ "grad_norm": 5.004413604736328,
860
+ "learning_rate": 0.00020922444183313745,
861
+ "loss": 0.2419,
862
+ "step": 10300
863
+ },
864
+ {
865
+ "epoch": 6.110458284371328,
866
+ "grad_norm": 2.2294819355010986,
867
+ "learning_rate": 0.00020834312573443008,
868
+ "loss": 0.2393,
869
+ "step": 10400
870
+ },
871
+ {
872
+ "epoch": 6.169212690951821,
873
+ "grad_norm": 3.4622702598571777,
874
+ "learning_rate": 0.00020746180963572265,
875
+ "loss": 0.2467,
876
+ "step": 10500
877
+ },
878
+ {
879
+ "epoch": 6.227967097532315,
880
+ "grad_norm": 11.164566993713379,
881
+ "learning_rate": 0.00020658049353701526,
882
+ "loss": 0.2405,
883
+ "step": 10600
884
+ },
885
+ {
886
+ "epoch": 6.286721504112808,
887
+ "grad_norm": 8.36145305633545,
888
+ "learning_rate": 0.00020569917743830786,
889
+ "loss": 0.2366,
890
+ "step": 10700
891
+ },
892
+ {
893
+ "epoch": 6.345475910693302,
894
+ "grad_norm": 2.6593246459960938,
895
+ "learning_rate": 0.00020481786133960046,
896
+ "loss": 0.244,
897
+ "step": 10800
898
+ },
899
+ {
900
+ "epoch": 6.404230317273796,
901
+ "grad_norm": 1.6066139936447144,
902
+ "learning_rate": 0.00020393654524089304,
903
+ "loss": 0.2412,
904
+ "step": 10900
905
+ },
906
+ {
907
+ "epoch": 6.462984723854289,
908
+ "grad_norm": 2.962965250015259,
909
+ "learning_rate": 0.00020305522914218567,
910
+ "loss": 0.2385,
911
+ "step": 11000
912
+ },
913
+ {
914
+ "epoch": 6.462984723854289,
915
+ "eval_accuracy": 0.7729161068970506,
916
+ "eval_f1_macro": 0.4305720986934003,
917
+ "eval_loss": 0.22273238003253937,
918
+ "eval_precision": 0.5147561915779448,
919
+ "eval_recall": 0.40734695218979783,
920
+ "eval_runtime": 531.4648,
921
+ "eval_samples_per_second": 91.037,
922
+ "eval_steps_per_second": 0.711,
923
+ "step": 11000
924
+ },
925
+ {
926
+ "epoch": 6.521739130434782,
927
+ "grad_norm": 3.933185338973999,
928
+ "learning_rate": 0.00020217391304347824,
929
+ "loss": 0.2389,
930
+ "step": 11100
931
+ },
932
+ {
933
+ "epoch": 6.580493537015276,
934
+ "grad_norm": 6.2505316734313965,
935
+ "learning_rate": 0.00020129259694477085,
936
+ "loss": 0.2345,
937
+ "step": 11200
938
+ },
939
+ {
940
+ "epoch": 6.63924794359577,
941
+ "grad_norm": 2.8899261951446533,
942
+ "learning_rate": 0.00020041128084606345,
943
+ "loss": 0.2404,
944
+ "step": 11300
945
+ },
946
+ {
947
+ "epoch": 6.698002350176263,
948
+ "grad_norm": 4.886023998260498,
949
+ "learning_rate": 0.00019952996474735602,
950
+ "loss": 0.2405,
951
+ "step": 11400
952
+ },
953
+ {
954
+ "epoch": 6.756756756756757,
955
+ "grad_norm": 3.6124258041381836,
956
+ "learning_rate": 0.00019864864864864863,
957
+ "loss": 0.2388,
958
+ "step": 11500
959
+ },
960
+ {
961
+ "epoch": 6.81551116333725,
962
+ "grad_norm": 2.6905336380004883,
963
+ "learning_rate": 0.0001977673325499412,
964
+ "loss": 0.2419,
965
+ "step": 11600
966
+ },
967
+ {
968
+ "epoch": 6.8742655699177435,
969
+ "grad_norm": 1.7078518867492676,
970
+ "learning_rate": 0.00019688601645123383,
971
+ "loss": 0.2351,
972
+ "step": 11700
973
+ },
974
+ {
975
+ "epoch": 6.933019976498238,
976
+ "grad_norm": 4.933712482452393,
977
+ "learning_rate": 0.0001960047003525264,
978
+ "loss": 0.2357,
979
+ "step": 11800
980
+ },
981
+ {
982
+ "epoch": 6.991774383078731,
983
+ "grad_norm": 4.086423873901367,
984
+ "learning_rate": 0.00019512338425381904,
985
+ "loss": 0.241,
986
+ "step": 11900
987
+ },
988
+ {
989
+ "epoch": 7.050528789659224,
990
+ "grad_norm": 3.3847010135650635,
991
+ "learning_rate": 0.00019424206815511161,
992
+ "loss": 0.2343,
993
+ "step": 12000
994
+ },
995
+ {
996
+ "epoch": 7.050528789659224,
997
+ "eval_accuracy": 0.7735568278114213,
998
+ "eval_f1_macro": 0.4611187564793586,
999
+ "eval_loss": 0.2231457531452179,
1000
+ "eval_precision": 0.48558443620097314,
1001
+ "eval_recall": 0.4472545142062536,
1002
+ "eval_runtime": 576.3861,
1003
+ "eval_samples_per_second": 83.942,
1004
+ "eval_steps_per_second": 0.656,
1005
+ "step": 12000
1006
+ },
1007
+ {
1008
+ "epoch": 7.109283196239718,
1009
+ "grad_norm": 2.0805375576019287,
1010
+ "learning_rate": 0.00019336075205640422,
1011
+ "loss": 0.238,
1012
+ "step": 12100
1013
+ },
1014
+ {
1015
+ "epoch": 7.1680376028202115,
1016
+ "grad_norm": 3.5111265182495117,
1017
+ "learning_rate": 0.0001924794359576968,
1018
+ "loss": 0.2365,
1019
+ "step": 12200
1020
+ },
1021
+ {
1022
+ "epoch": 7.226792009400705,
1023
+ "grad_norm": 2.2142751216888428,
1024
+ "learning_rate": 0.00019159811985898942,
1025
+ "loss": 0.2387,
1026
+ "step": 12300
1027
+ },
1028
+ {
1029
+ "epoch": 7.285546415981199,
1030
+ "grad_norm": 5.468302249908447,
1031
+ "learning_rate": 0.000190716803760282,
1032
+ "loss": 0.2366,
1033
+ "step": 12400
1034
+ },
1035
+ {
1036
+ "epoch": 7.344300822561692,
1037
+ "grad_norm": 4.9833598136901855,
1038
+ "learning_rate": 0.0001898354876615746,
1039
+ "loss": 0.2345,
1040
+ "step": 12500
1041
+ },
1042
+ {
1043
+ "epoch": 7.403055229142185,
1044
+ "grad_norm": 2.4710216522216797,
1045
+ "learning_rate": 0.0001889541715628672,
1046
+ "loss": 0.2335,
1047
+ "step": 12600
1048
+ },
1049
+ {
1050
+ "epoch": 7.4618096357226795,
1051
+ "grad_norm": 1.4311057329177856,
1052
+ "learning_rate": 0.0001880728554641598,
1053
+ "loss": 0.242,
1054
+ "step": 12700
1055
+ },
1056
+ {
1057
+ "epoch": 7.520564042303173,
1058
+ "grad_norm": 3.9087047576904297,
1059
+ "learning_rate": 0.00018719153936545238,
1060
+ "loss": 0.2369,
1061
+ "step": 12800
1062
+ },
1063
+ {
1064
+ "epoch": 7.579318448883666,
1065
+ "grad_norm": 2.0385680198669434,
1066
+ "learning_rate": 0.000186310223266745,
1067
+ "loss": 0.2295,
1068
+ "step": 12900
1069
+ },
1070
+ {
1071
+ "epoch": 7.63807285546416,
1072
+ "grad_norm": 3.9989728927612305,
1073
+ "learning_rate": 0.0001854289071680376,
1074
+ "loss": 0.2353,
1075
+ "step": 13000
1076
+ },
1077
+ {
1078
+ "epoch": 7.63807285546416,
1079
+ "eval_accuracy": 0.7637393299299341,
1080
+ "eval_f1_macro": 0.42822074709027813,
1081
+ "eval_loss": 0.2291877716779709,
1082
+ "eval_precision": 0.5240563218793618,
1083
+ "eval_recall": 0.4106384213710441,
1084
+ "eval_runtime": 580.7226,
1085
+ "eval_samples_per_second": 83.315,
1086
+ "eval_steps_per_second": 0.651,
1087
+ "step": 13000
1088
+ },
1089
+ {
1090
+ "epoch": 7.696827262044653,
1091
+ "grad_norm": 1.630708932876587,
1092
+ "learning_rate": 0.00018454759106933017,
1093
+ "loss": 0.2373,
1094
+ "step": 13100
1095
+ },
1096
+ {
1097
+ "epoch": 7.755581668625147,
1098
+ "grad_norm": 3.2567617893218994,
1099
+ "learning_rate": 0.0001836662749706228,
1100
+ "loss": 0.2326,
1101
+ "step": 13200
1102
+ },
1103
+ {
1104
+ "epoch": 7.814336075205641,
1105
+ "grad_norm": 2.3300867080688477,
1106
+ "learning_rate": 0.00018278495887191537,
1107
+ "loss": 0.2369,
1108
+ "step": 13300
1109
+ },
1110
+ {
1111
+ "epoch": 7.873090481786134,
1112
+ "grad_norm": 2.068678379058838,
1113
+ "learning_rate": 0.00018190364277320797,
1114
+ "loss": 0.2337,
1115
+ "step": 13400
1116
+ },
1117
+ {
1118
+ "epoch": 7.931844888366627,
1119
+ "grad_norm": 2.0448477268218994,
1120
+ "learning_rate": 0.00018102232667450055,
1121
+ "loss": 0.2335,
1122
+ "step": 13500
1123
+ },
1124
+ {
1125
+ "epoch": 7.990599294947121,
1126
+ "grad_norm": 2.7080137729644775,
1127
+ "learning_rate": 0.00018014101057579318,
1128
+ "loss": 0.238,
1129
+ "step": 13600
1130
+ },
1131
+ {
1132
+ "epoch": 8.049353701527615,
1133
+ "grad_norm": 1.9964938163757324,
1134
+ "learning_rate": 0.00017925969447708576,
1135
+ "loss": 0.2359,
1136
+ "step": 13700
1137
+ },
1138
+ {
1139
+ "epoch": 8.108108108108109,
1140
+ "grad_norm": 2.795433759689331,
1141
+ "learning_rate": 0.00017837837837837839,
1142
+ "loss": 0.2374,
1143
+ "step": 13800
1144
+ },
1145
+ {
1146
+ "epoch": 8.166862514688601,
1147
+ "grad_norm": 2.685382843017578,
1148
+ "learning_rate": 0.00017749706227967096,
1149
+ "loss": 0.2364,
1150
+ "step": 13900
1151
+ },
1152
+ {
1153
+ "epoch": 8.225616921269095,
1154
+ "grad_norm": 2.214505195617676,
1155
+ "learning_rate": 0.00017661574618096356,
1156
+ "loss": 0.2328,
1157
+ "step": 14000
1158
+ },
1159
+ {
1160
+ "epoch": 8.225616921269095,
1161
+ "eval_accuracy": 0.7689477709112705,
1162
+ "eval_f1_macro": 0.4571007537767639,
1163
+ "eval_loss": 0.22145947813987732,
1164
+ "eval_precision": 0.4969408215399489,
1165
+ "eval_recall": 0.44354312726305817,
1166
+ "eval_runtime": 570.5829,
1167
+ "eval_samples_per_second": 84.796,
1168
+ "eval_steps_per_second": 0.662,
1169
+ "step": 14000
1170
+ },
1171
+ {
1172
+ "epoch": 8.28437132784959,
1173
+ "grad_norm": 3.0027620792388916,
1174
+ "learning_rate": 0.00017573443008225614,
1175
+ "loss": 0.2328,
1176
+ "step": 14100
1177
+ },
1178
+ {
1179
+ "epoch": 8.343125734430082,
1180
+ "grad_norm": 1.9569076299667358,
1181
+ "learning_rate": 0.00017485311398354877,
1182
+ "loss": 0.2315,
1183
+ "step": 14200
1184
+ },
1185
+ {
1186
+ "epoch": 8.401880141010576,
1187
+ "grad_norm": 2.1613545417785645,
1188
+ "learning_rate": 0.00017397179788484135,
1189
+ "loss": 0.2314,
1190
+ "step": 14300
1191
+ },
1192
+ {
1193
+ "epoch": 8.46063454759107,
1194
+ "grad_norm": 4.501012802124023,
1195
+ "learning_rate": 0.00017309048178613395,
1196
+ "loss": 0.2307,
1197
+ "step": 14400
1198
+ },
1199
+ {
1200
+ "epoch": 8.519388954171562,
1201
+ "grad_norm": 4.018213272094727,
1202
+ "learning_rate": 0.00017220916568742655,
1203
+ "loss": 0.2337,
1204
+ "step": 14500
1205
+ },
1206
+ {
1207
+ "epoch": 8.578143360752057,
1208
+ "grad_norm": 3.4571876525878906,
1209
+ "learning_rate": 0.00017132784958871913,
1210
+ "loss": 0.2309,
1211
+ "step": 14600
1212
+ },
1213
+ {
1214
+ "epoch": 8.63689776733255,
1215
+ "grad_norm": 1.6010338068008423,
1216
+ "learning_rate": 0.00017044653349001173,
1217
+ "loss": 0.2304,
1218
+ "step": 14700
1219
+ },
1220
+ {
1221
+ "epoch": 8.695652173913043,
1222
+ "grad_norm": 5.177122592926025,
1223
+ "learning_rate": 0.00016956521739130433,
1224
+ "loss": 0.2315,
1225
+ "step": 14800
1226
+ },
1227
+ {
1228
+ "epoch": 8.754406580493537,
1229
+ "grad_norm": 1.3421051502227783,
1230
+ "learning_rate": 0.00016868390129259694,
1231
+ "loss": 0.2358,
1232
+ "step": 14900
1233
+ },
1234
+ {
1235
+ "epoch": 8.813160987074031,
1236
+ "grad_norm": 5.40761137008667,
1237
+ "learning_rate": 0.0001678025851938895,
1238
+ "loss": 0.2297,
1239
+ "step": 15000
1240
+ },
1241
+ {
1242
+ "epoch": 8.813160987074031,
1243
+ "eval_accuracy": 0.7784759109604613,
1244
+ "eval_f1_macro": 0.42357466614962985,
1245
+ "eval_loss": 0.22359371185302734,
1246
+ "eval_precision": 0.5285253073603687,
1247
+ "eval_recall": 0.3968963203390616,
1248
+ "eval_runtime": 558.7605,
1249
+ "eval_samples_per_second": 86.59,
1250
+ "eval_steps_per_second": 0.676,
1251
+ "step": 15000
1252
+ },
1253
+ {
1254
+ "epoch": 8.871915393654524,
1255
+ "grad_norm": 2.718010902404785,
1256
+ "learning_rate": 0.00016692126909518214,
1257
+ "loss": 0.2282,
1258
+ "step": 15100
1259
+ },
1260
+ {
1261
+ "epoch": 8.930669800235018,
1262
+ "grad_norm": 2.1908445358276367,
1263
+ "learning_rate": 0.00016603995299647472,
1264
+ "loss": 0.2342,
1265
+ "step": 15200
1266
+ },
1267
+ {
1268
+ "epoch": 8.989424206815512,
1269
+ "grad_norm": 1.3827928304672241,
1270
+ "learning_rate": 0.00016515863689776732,
1271
+ "loss": 0.2336,
1272
+ "step": 15300
1273
+ },
1274
+ {
1275
+ "epoch": 9.048178613396004,
1276
+ "grad_norm": 2.2856316566467285,
1277
+ "learning_rate": 0.0001642773207990599,
1278
+ "loss": 0.2316,
1279
+ "step": 15400
1280
+ },
1281
+ {
1282
+ "epoch": 9.106933019976498,
1283
+ "grad_norm": 9.475784301757812,
1284
+ "learning_rate": 0.00016339600470035253,
1285
+ "loss": 0.2338,
1286
+ "step": 15500
1287
+ },
1288
+ {
1289
+ "epoch": 9.165687426556993,
1290
+ "grad_norm": 5.561756610870361,
1291
+ "learning_rate": 0.0001625146886016451,
1292
+ "loss": 0.2345,
1293
+ "step": 15600
1294
+ },
1295
+ {
1296
+ "epoch": 9.224441833137485,
1297
+ "grad_norm": 3.0887973308563232,
1298
+ "learning_rate": 0.00016163337250293773,
1299
+ "loss": 0.2377,
1300
+ "step": 15700
1301
+ },
1302
+ {
1303
+ "epoch": 9.283196239717979,
1304
+ "grad_norm": 2.1840600967407227,
1305
+ "learning_rate": 0.0001607520564042303,
1306
+ "loss": 0.2279,
1307
+ "step": 15800
1308
+ },
1309
+ {
1310
+ "epoch": 9.341950646298473,
1311
+ "grad_norm": 1.5278443098068237,
1312
+ "learning_rate": 0.0001598707403055229,
1313
+ "loss": 0.2281,
1314
+ "step": 15900
1315
+ },
1316
+ {
1317
+ "epoch": 9.400705052878966,
1318
+ "grad_norm": 1.8652377128601074,
1319
+ "learning_rate": 0.00015898942420681549,
1320
+ "loss": 0.2261,
1321
+ "step": 16000
1322
+ },
1323
+ {
1324
+ "epoch": 9.400705052878966,
1325
+ "eval_accuracy": 0.778496579377054,
1326
+ "eval_f1_macro": 0.4118529460883977,
1327
+ "eval_loss": 0.2205253690481186,
1328
+ "eval_precision": 0.5265129533046812,
1329
+ "eval_recall": 0.38989546728234464,
1330
+ "eval_runtime": 564.1949,
1331
+ "eval_samples_per_second": 85.756,
1332
+ "eval_steps_per_second": 0.67,
1333
+ "step": 16000
1334
+ },
1335
+ {
1336
+ "epoch": 9.45945945945946,
1337
+ "grad_norm": 3.021077871322632,
1338
+ "learning_rate": 0.0001581081081081081,
1339
+ "loss": 0.2291,
1340
+ "step": 16100
1341
+ },
1342
+ {
1343
+ "epoch": 9.518213866039954,
1344
+ "grad_norm": 1.2995972633361816,
1345
+ "learning_rate": 0.0001572267920094007,
1346
+ "loss": 0.2279,
1347
+ "step": 16200
1348
+ },
1349
+ {
1350
+ "epoch": 9.576968272620446,
1351
+ "grad_norm": 4.3413801193237305,
1352
+ "learning_rate": 0.00015634547591069327,
1353
+ "loss": 0.2254,
1354
+ "step": 16300
1355
+ },
1356
+ {
1357
+ "epoch": 9.63572267920094,
1358
+ "grad_norm": 1.8537020683288574,
1359
+ "learning_rate": 0.0001554641598119859,
1360
+ "loss": 0.2284,
1361
+ "step": 16400
1362
+ },
1363
+ {
1364
+ "epoch": 9.694477085781434,
1365
+ "grad_norm": 2.3501524925231934,
1366
+ "learning_rate": 0.00015458284371327847,
1367
+ "loss": 0.2315,
1368
+ "step": 16500
1369
+ },
1370
+ {
1371
+ "epoch": 9.753231492361927,
1372
+ "grad_norm": 4.062187671661377,
1373
+ "learning_rate": 0.00015370152761457108,
1374
+ "loss": 0.2271,
1375
+ "step": 16600
1376
+ },
1377
+ {
1378
+ "epoch": 9.811985898942421,
1379
+ "grad_norm": 2.9765398502349854,
1380
+ "learning_rate": 0.00015282021151586368,
1381
+ "loss": 0.2341,
1382
+ "step": 16700
1383
+ },
1384
+ {
1385
+ "epoch": 9.870740305522915,
1386
+ "grad_norm": 3.3737270832061768,
1387
+ "learning_rate": 0.00015193889541715628,
1388
+ "loss": 0.2302,
1389
+ "step": 16800
1390
+ },
1391
+ {
1392
+ "epoch": 9.929494712103407,
1393
+ "grad_norm": 3.7637851238250732,
1394
+ "learning_rate": 0.00015105757931844886,
1395
+ "loss": 0.2306,
1396
+ "step": 16900
1397
+ },
1398
+ {
1399
+ "epoch": 9.988249118683902,
1400
+ "grad_norm": 4.947080135345459,
1401
+ "learning_rate": 0.0001501762632197415,
1402
+ "loss": 0.2226,
1403
+ "step": 17000
1404
+ },
1405
+ {
1406
+ "epoch": 9.988249118683902,
1407
+ "eval_accuracy": 0.7804600789533513,
1408
+ "eval_f1_macro": 0.45841854146967403,
1409
+ "eval_loss": 0.21761466562747955,
1410
+ "eval_precision": 0.4921930948461919,
1411
+ "eval_recall": 0.43836252511254314,
1412
+ "eval_runtime": 558.7853,
1413
+ "eval_samples_per_second": 86.586,
1414
+ "eval_steps_per_second": 0.676,
1415
+ "step": 17000
1416
+ },
1417
+ {
1418
+ "epoch": 10.047003525264396,
1419
+ "grad_norm": 4.100146293640137,
1420
+ "learning_rate": 0.00014929494712103406,
1421
+ "loss": 0.2301,
1422
+ "step": 17100
1423
+ },
1424
+ {
1425
+ "epoch": 10.105757931844888,
1426
+ "grad_norm": 2.0130274295806885,
1427
+ "learning_rate": 0.00014841363102232667,
1428
+ "loss": 0.2288,
1429
+ "step": 17200
1430
+ },
1431
+ {
1432
+ "epoch": 10.164512338425382,
1433
+ "grad_norm": 2.4523582458496094,
1434
+ "learning_rate": 0.00014753231492361924,
1435
+ "loss": 0.2257,
1436
+ "step": 17300
1437
+ },
1438
+ {
1439
+ "epoch": 10.223266745005876,
1440
+ "grad_norm": 2.4732425212860107,
1441
+ "learning_rate": 0.00014665099882491185,
1442
+ "loss": 0.2253,
1443
+ "step": 17400
1444
+ },
1445
+ {
1446
+ "epoch": 10.282021151586369,
1447
+ "grad_norm": 2.8159022331237793,
1448
+ "learning_rate": 0.00014576968272620445,
1449
+ "loss": 0.2297,
1450
+ "step": 17500
1451
+ },
1452
+ {
1453
+ "epoch": 10.340775558166863,
1454
+ "grad_norm": 2.784027338027954,
1455
+ "learning_rate": 0.00014488836662749705,
1456
+ "loss": 0.2282,
1457
+ "step": 17600
1458
+ },
1459
+ {
1460
+ "epoch": 10.399529964747355,
1461
+ "grad_norm": 2.8521196842193604,
1462
+ "learning_rate": 0.00014400705052878965,
1463
+ "loss": 0.2294,
1464
+ "step": 17700
1465
+ },
1466
+ {
1467
+ "epoch": 10.45828437132785,
1468
+ "grad_norm": 3.453033685684204,
1469
+ "learning_rate": 0.00014312573443008226,
1470
+ "loss": 0.2252,
1471
+ "step": 17800
1472
+ },
1473
+ {
1474
+ "epoch": 10.517038777908343,
1475
+ "grad_norm": 1.889672875404358,
1476
+ "learning_rate": 0.00014224441833137483,
1477
+ "loss": 0.2266,
1478
+ "step": 17900
1479
+ },
1480
+ {
1481
+ "epoch": 10.575793184488838,
1482
+ "grad_norm": 2.153575897216797,
1483
+ "learning_rate": 0.00014136310223266744,
1484
+ "loss": 0.2248,
1485
+ "step": 18000
1486
+ },
1487
+ {
1488
+ "epoch": 10.575793184488838,
1489
+ "eval_accuracy": 0.778909947708906,
1490
+ "eval_f1_macro": 0.44892131069383673,
1491
+ "eval_loss": 0.2148253470659256,
1492
+ "eval_precision": 0.4964295361361209,
1493
+ "eval_recall": 0.42785352086722733,
1494
+ "eval_runtime": 556.2941,
1495
+ "eval_samples_per_second": 86.974,
1496
+ "eval_steps_per_second": 0.679,
1497
+ "step": 18000
1498
+ },
1499
+ {
1500
+ "epoch": 10.63454759106933,
1501
+ "grad_norm": 2.2354812622070312,
1502
+ "learning_rate": 0.00014048178613396004,
1503
+ "loss": 0.2303,
1504
+ "step": 18100
1505
+ },
1506
+ {
1507
+ "epoch": 10.693301997649824,
1508
+ "grad_norm": 6.267688751220703,
1509
+ "learning_rate": 0.00013960047003525264,
1510
+ "loss": 0.2219,
1511
+ "step": 18200
1512
+ },
1513
+ {
1514
+ "epoch": 10.752056404230316,
1515
+ "grad_norm": 4.277271270751953,
1516
+ "learning_rate": 0.00013871915393654524,
1517
+ "loss": 0.2256,
1518
+ "step": 18300
1519
+ },
1520
+ {
1521
+ "epoch": 10.81081081081081,
1522
+ "grad_norm": 4.068231582641602,
1523
+ "learning_rate": 0.00013783783783783782,
1524
+ "loss": 0.2278,
1525
+ "step": 18400
1526
+ },
1527
+ {
1528
+ "epoch": 10.869565217391305,
1529
+ "grad_norm": 1.9653985500335693,
1530
+ "learning_rate": 0.00013695652173913042,
1531
+ "loss": 0.2326,
1532
+ "step": 18500
1533
+ },
1534
+ {
1535
+ "epoch": 10.928319623971799,
1536
+ "grad_norm": 1.7713191509246826,
1537
+ "learning_rate": 0.00013607520564042303,
1538
+ "loss": 0.2312,
1539
+ "step": 18600
1540
+ },
1541
+ {
1542
+ "epoch": 10.987074030552291,
1543
+ "grad_norm": 3.856257438659668,
1544
+ "learning_rate": 0.0001351938895417156,
1545
+ "loss": 0.2313,
1546
+ "step": 18700
1547
+ },
1548
+ {
1549
+ "epoch": 11.045828437132785,
1550
+ "grad_norm": 3.794623851776123,
1551
+ "learning_rate": 0.0001343125734430082,
1552
+ "loss": 0.2237,
1553
+ "step": 18800
1554
+ },
1555
+ {
1556
+ "epoch": 11.104582843713278,
1557
+ "grad_norm": 2.6339073181152344,
1558
+ "learning_rate": 0.0001334312573443008,
1559
+ "loss": 0.2237,
1560
+ "step": 18900
1561
+ },
1562
+ {
1563
+ "epoch": 11.163337250293772,
1564
+ "grad_norm": 2.57064151763916,
1565
+ "learning_rate": 0.0001325499412455934,
1566
+ "loss": 0.2238,
1567
+ "step": 19000
1568
+ },
1569
+ {
1570
+ "epoch": 11.163337250293772,
1571
+ "eval_accuracy": 0.783353657276316,
1572
+ "eval_f1_macro": 0.456551925690663,
1573
+ "eval_loss": 0.21650880575180054,
1574
+ "eval_precision": 0.5127229337198491,
1575
+ "eval_recall": 0.42967693938948215,
1576
+ "eval_runtime": 585.3328,
1577
+ "eval_samples_per_second": 82.659,
1578
+ "eval_steps_per_second": 0.646,
1579
+ "step": 19000
1580
+ },
1581
+ {
1582
+ "epoch": 11.222091656874266,
1583
+ "grad_norm": 2.160663366317749,
1584
+ "learning_rate": 0.000131668625146886,
1585
+ "loss": 0.2275,
1586
+ "step": 19100
1587
+ },
1588
+ {
1589
+ "epoch": 11.280846063454758,
1590
+ "grad_norm": 5.847405910491943,
1591
+ "learning_rate": 0.0001307873090481786,
1592
+ "loss": 0.2271,
1593
+ "step": 19200
1594
+ },
1595
+ {
1596
+ "epoch": 11.339600470035252,
1597
+ "grad_norm": 2.103134870529175,
1598
+ "learning_rate": 0.0001299059929494712,
1599
+ "loss": 0.2295,
1600
+ "step": 19300
1601
+ },
1602
+ {
1603
+ "epoch": 11.398354876615747,
1604
+ "grad_norm": 2.2549660205841064,
1605
+ "learning_rate": 0.0001290246768507638,
1606
+ "loss": 0.2229,
1607
+ "step": 19400
1608
+ },
1609
+ {
1610
+ "epoch": 11.457109283196239,
1611
+ "grad_norm": 3.1517040729522705,
1612
+ "learning_rate": 0.0001281433607520564,
1613
+ "loss": 0.2221,
1614
+ "step": 19500
1615
+ },
1616
+ {
1617
+ "epoch": 11.515863689776733,
1618
+ "grad_norm": 2.703953266143799,
1619
+ "learning_rate": 0.000127262044653349,
1620
+ "loss": 0.2216,
1621
+ "step": 19600
1622
+ },
1623
+ {
1624
+ "epoch": 11.574618096357227,
1625
+ "grad_norm": 1.5301584005355835,
1626
+ "learning_rate": 0.0001263807285546416,
1627
+ "loss": 0.2301,
1628
+ "step": 19700
1629
+ },
1630
+ {
1631
+ "epoch": 11.63337250293772,
1632
+ "grad_norm": 3.967664957046509,
1633
+ "learning_rate": 0.00012549941245593418,
1634
+ "loss": 0.2257,
1635
+ "step": 19800
1636
+ },
1637
+ {
1638
+ "epoch": 11.692126909518214,
1639
+ "grad_norm": 3.278876543045044,
1640
+ "learning_rate": 0.00012461809635722678,
1641
+ "loss": 0.2277,
1642
+ "step": 19900
1643
+ },
1644
+ {
1645
+ "epoch": 11.750881316098708,
1646
+ "grad_norm": 3.274348497390747,
1647
+ "learning_rate": 0.00012373678025851938,
1648
+ "loss": 0.2262,
1649
+ "step": 20000
1650
+ },
1651
+ {
1652
+ "epoch": 11.750881316098708,
1653
+ "eval_accuracy": 0.7706632494884567,
1654
+ "eval_f1_macro": 0.46808422191837634,
1655
+ "eval_loss": 0.21933460235595703,
1656
+ "eval_precision": 0.4863537664371959,
1657
+ "eval_recall": 0.4608368369449391,
1658
+ "eval_runtime": 542.8512,
1659
+ "eval_samples_per_second": 89.128,
1660
+ "eval_steps_per_second": 0.696,
1661
+ "step": 20000
1662
+ },
1663
+ {
1664
+ "epoch": 11.8096357226792,
1665
+ "grad_norm": 2.5668694972991943,
1666
+ "learning_rate": 0.00012285546415981196,
1667
+ "loss": 0.2212,
1668
+ "step": 20100
1669
+ },
1670
+ {
1671
+ "epoch": 11.868390129259694,
1672
+ "grad_norm": 2.187702178955078,
1673
+ "learning_rate": 0.00012197414806110456,
1674
+ "loss": 0.2254,
1675
+ "step": 20200
1676
+ },
1677
+ {
1678
+ "epoch": 11.927144535840188,
1679
+ "grad_norm": 2.247164487838745,
1680
+ "learning_rate": 0.00012109283196239717,
1681
+ "loss": 0.2268,
1682
+ "step": 20300
1683
+ },
1684
+ {
1685
+ "epoch": 11.98589894242068,
1686
+ "grad_norm": 4.919483184814453,
1687
+ "learning_rate": 0.00012021151586368976,
1688
+ "loss": 0.2251,
1689
+ "step": 20400
1690
+ },
1691
+ {
1692
+ "epoch": 12.044653349001175,
1693
+ "grad_norm": 2.580787181854248,
1694
+ "learning_rate": 0.00011933019976498236,
1695
+ "loss": 0.2255,
1696
+ "step": 20500
1697
+ },
1698
+ {
1699
+ "epoch": 12.103407755581669,
1700
+ "grad_norm": 3.7776031494140625,
1701
+ "learning_rate": 0.00011844888366627496,
1702
+ "loss": 0.2217,
1703
+ "step": 20600
1704
+ },
1705
+ {
1706
+ "epoch": 12.162162162162161,
1707
+ "grad_norm": 2.159958839416504,
1708
+ "learning_rate": 0.00011756756756756755,
1709
+ "loss": 0.2213,
1710
+ "step": 20700
1711
+ },
1712
+ {
1713
+ "epoch": 12.220916568742656,
1714
+ "grad_norm": 1.7245205640792847,
1715
+ "learning_rate": 0.00011668625146886015,
1716
+ "loss": 0.2217,
1717
+ "step": 20800
1718
+ },
1719
+ {
1720
+ "epoch": 12.27967097532315,
1721
+ "grad_norm": 1.598755955696106,
1722
+ "learning_rate": 0.00011580493537015276,
1723
+ "loss": 0.2227,
1724
+ "step": 20900
1725
+ },
1726
+ {
1727
+ "epoch": 12.338425381903642,
1728
+ "grad_norm": 1.9064700603485107,
1729
+ "learning_rate": 0.00011492361927144535,
1730
+ "loss": 0.2239,
1731
+ "step": 21000
1732
+ },
1733
+ {
1734
+ "epoch": 12.338425381903642,
1735
+ "eval_accuracy": 0.7744042328917181,
1736
+ "eval_f1_macro": 0.46072293060919217,
1737
+ "eval_loss": 0.21473053097724915,
1738
+ "eval_precision": 0.5166471813664621,
1739
+ "eval_recall": 0.44065307647654417,
1740
+ "eval_runtime": 586.4383,
1741
+ "eval_samples_per_second": 82.503,
1742
+ "eval_steps_per_second": 0.645,
1743
+ "step": 21000
1744
+ },
1745
+ {
1746
+ "epoch": 12.397179788484136,
1747
+ "grad_norm": 2.0441579818725586,
1748
+ "learning_rate": 0.00011404230317273795,
1749
+ "loss": 0.2208,
1750
+ "step": 21100
1751
+ },
1752
+ {
1753
+ "epoch": 12.45593419506463,
1754
+ "grad_norm": 4.460620880126953,
1755
+ "learning_rate": 0.00011316098707403055,
1756
+ "loss": 0.2233,
1757
+ "step": 21200
1758
+ },
1759
+ {
1760
+ "epoch": 12.514688601645123,
1761
+ "grad_norm": 2.7372050285339355,
1762
+ "learning_rate": 0.00011227967097532314,
1763
+ "loss": 0.2204,
1764
+ "step": 21300
1765
+ },
1766
+ {
1767
+ "epoch": 12.573443008225617,
1768
+ "grad_norm": 3.1166772842407227,
1769
+ "learning_rate": 0.00011139835487661574,
1770
+ "loss": 0.2283,
1771
+ "step": 21400
1772
+ },
1773
+ {
1774
+ "epoch": 12.632197414806111,
1775
+ "grad_norm": 3.481877565383911,
1776
+ "learning_rate": 0.00011051703877790835,
1777
+ "loss": 0.2206,
1778
+ "step": 21500
1779
+ },
1780
+ {
1781
+ "epoch": 12.690951821386603,
1782
+ "grad_norm": 2.6548030376434326,
1783
+ "learning_rate": 0.00010963572267920094,
1784
+ "loss": 0.2241,
1785
+ "step": 21600
1786
+ },
1787
+ {
1788
+ "epoch": 12.749706227967097,
1789
+ "grad_norm": 2.3535709381103516,
1790
+ "learning_rate": 0.00010875440658049353,
1791
+ "loss": 0.2213,
1792
+ "step": 21700
1793
+ },
1794
+ {
1795
+ "epoch": 12.808460634547592,
1796
+ "grad_norm": 2.523663282394409,
1797
+ "learning_rate": 0.00010787309048178611,
1798
+ "loss": 0.2222,
1799
+ "step": 21800
1800
+ },
1801
+ {
1802
+ "epoch": 12.867215041128084,
1803
+ "grad_norm": 1.9537861347198486,
1804
+ "learning_rate": 0.00010699177438307872,
1805
+ "loss": 0.2221,
1806
+ "step": 21900
1807
+ },
1808
+ {
1809
+ "epoch": 12.925969447708578,
1810
+ "grad_norm": 1.9098992347717285,
1811
+ "learning_rate": 0.00010611045828437131,
1812
+ "loss": 0.2196,
1813
+ "step": 22000
1814
+ },
1815
+ {
1816
+ "epoch": 12.925969447708578,
1817
+ "eval_accuracy": 0.782361573279871,
1818
+ "eval_f1_macro": 0.45321392135920985,
1819
+ "eval_loss": 0.21087060868740082,
1820
+ "eval_precision": 0.5160027197721393,
1821
+ "eval_recall": 0.4277450812600511,
1822
+ "eval_runtime": 561.4808,
1823
+ "eval_samples_per_second": 86.17,
1824
+ "eval_steps_per_second": 0.673,
1825
+ "step": 22000
1826
+ },
1827
+ {
1828
+ "epoch": 12.984723854289072,
1829
+ "grad_norm": 2.665001153945923,
1830
+ "learning_rate": 0.00010522914218566391,
1831
+ "loss": 0.2224,
1832
+ "step": 22100
1833
+ },
1834
+ {
1835
+ "epoch": 13.043478260869565,
1836
+ "grad_norm": 3.2731380462646484,
1837
+ "learning_rate": 0.00010434782608695651,
1838
+ "loss": 0.2161,
1839
+ "step": 22200
1840
+ },
1841
+ {
1842
+ "epoch": 13.102232667450059,
1843
+ "grad_norm": 1.7394378185272217,
1844
+ "learning_rate": 0.0001034665099882491,
1845
+ "loss": 0.2213,
1846
+ "step": 22300
1847
+ },
1848
+ {
1849
+ "epoch": 13.160987074030553,
1850
+ "grad_norm": 4.027496337890625,
1851
+ "learning_rate": 0.0001025851938895417,
1852
+ "loss": 0.2232,
1853
+ "step": 22400
1854
+ },
1855
+ {
1856
+ "epoch": 13.219741480611045,
1857
+ "grad_norm": 4.968031883239746,
1858
+ "learning_rate": 0.00010170387779083431,
1859
+ "loss": 0.2228,
1860
+ "step": 22500
1861
+ },
1862
+ {
1863
+ "epoch": 13.27849588719154,
1864
+ "grad_norm": 2.2942428588867188,
1865
+ "learning_rate": 0.0001008225616921269,
1866
+ "loss": 0.2243,
1867
+ "step": 22600
1868
+ },
1869
+ {
1870
+ "epoch": 13.337250293772033,
1871
+ "grad_norm": 1.5325312614440918,
1872
+ "learning_rate": 9.99412455934195e-05,
1873
+ "loss": 0.2191,
1874
+ "step": 22700
1875
+ },
1876
+ {
1877
+ "epoch": 13.396004700352526,
1878
+ "grad_norm": 4.171008586883545,
1879
+ "learning_rate": 9.90599294947121e-05,
1880
+ "loss": 0.225,
1881
+ "step": 22800
1882
+ },
1883
+ {
1884
+ "epoch": 13.45475910693302,
1885
+ "grad_norm": 2.144474506378174,
1886
+ "learning_rate": 9.817861339600469e-05,
1887
+ "loss": 0.2191,
1888
+ "step": 22900
1889
+ },
1890
+ {
1891
+ "epoch": 13.513513513513514,
1892
+ "grad_norm": 1.8419458866119385,
1893
+ "learning_rate": 9.72972972972973e-05,
1894
+ "loss": 0.2244,
1895
+ "step": 23000
1896
+ },
1897
+ {
1898
+ "epoch": 13.513513513513514,
1899
+ "eval_accuracy": 0.7836016782754274,
1900
+ "eval_f1_macro": 0.46679865317019514,
1901
+ "eval_loss": 0.21025818586349487,
1902
+ "eval_precision": 0.5055179262576418,
1903
+ "eval_recall": 0.4468885382196594,
1904
+ "eval_runtime": 567.4084,
1905
+ "eval_samples_per_second": 85.27,
1906
+ "eval_steps_per_second": 0.666,
1907
+ "step": 23000
1908
+ },
1909
+ {
1910
+ "epoch": 13.572267920094006,
1911
+ "grad_norm": 2.069737672805786,
1912
+ "learning_rate": 9.64159811985899e-05,
1913
+ "loss": 0.2171,
1914
+ "step": 23100
1915
+ },
1916
+ {
1917
+ "epoch": 13.6310223266745,
1918
+ "grad_norm": 2.8181750774383545,
1919
+ "learning_rate": 9.553466509988249e-05,
1920
+ "loss": 0.2199,
1921
+ "step": 23200
1922
+ },
1923
+ {
1924
+ "epoch": 13.689776733254995,
1925
+ "grad_norm": 1.9151453971862793,
1926
+ "learning_rate": 9.465334900117508e-05,
1927
+ "loss": 0.2215,
1928
+ "step": 23300
1929
+ },
1930
+ {
1931
+ "epoch": 13.748531139835487,
1932
+ "grad_norm": 2.501735210418701,
1933
+ "learning_rate": 9.377203290246767e-05,
1934
+ "loss": 0.2224,
1935
+ "step": 23400
1936
+ },
1937
+ {
1938
+ "epoch": 13.807285546415981,
1939
+ "grad_norm": 4.269018173217773,
1940
+ "learning_rate": 9.289071680376027e-05,
1941
+ "loss": 0.2226,
1942
+ "step": 23500
1943
+ },
1944
+ {
1945
+ "epoch": 13.866039952996475,
1946
+ "grad_norm": 3.0230236053466797,
1947
+ "learning_rate": 9.200940070505287e-05,
1948
+ "loss": 0.2254,
1949
+ "step": 23600
1950
+ },
1951
+ {
1952
+ "epoch": 13.924794359576968,
1953
+ "grad_norm": 1.6129169464111328,
1954
+ "learning_rate": 9.112808460634546e-05,
1955
+ "loss": 0.223,
1956
+ "step": 23700
1957
+ },
1958
+ {
1959
+ "epoch": 13.983548766157462,
1960
+ "grad_norm": 3.050380229949951,
1961
+ "learning_rate": 9.024676850763806e-05,
1962
+ "loss": 0.2157,
1963
+ "step": 23800
1964
+ },
1965
+ {
1966
+ "epoch": 14.042303172737956,
1967
+ "grad_norm": 1.8896129131317139,
1968
+ "learning_rate": 8.936545240893067e-05,
1969
+ "loss": 0.2203,
1970
+ "step": 23900
1971
+ },
1972
+ {
1973
+ "epoch": 14.101057579318448,
1974
+ "grad_norm": 2.357605218887329,
1975
+ "learning_rate": 8.848413631022326e-05,
1976
+ "loss": 0.2181,
1977
+ "step": 24000
1978
+ },
1979
+ {
1980
+ "epoch": 14.101057579318448,
1981
+ "eval_accuracy": 0.7799227001219436,
1982
+ "eval_f1_macro": 0.4646864264769805,
1983
+ "eval_loss": 0.2089788019657135,
1984
+ "eval_precision": 0.5189278565795705,
1985
+ "eval_recall": 0.4427888401365953,
1986
+ "eval_runtime": 561.1944,
1987
+ "eval_samples_per_second": 86.214,
1988
+ "eval_steps_per_second": 0.674,
1989
+ "step": 24000
1990
+ },
1991
+ {
1992
+ "epoch": 14.159811985898942,
1993
+ "grad_norm": 2.8449645042419434,
1994
+ "learning_rate": 8.760282021151586e-05,
1995
+ "loss": 0.2158,
1996
+ "step": 24100
1997
+ },
1998
+ {
1999
+ "epoch": 14.218566392479437,
2000
+ "grad_norm": 3.220463752746582,
2001
+ "learning_rate": 8.672150411280845e-05,
2002
+ "loss": 0.2234,
2003
+ "step": 24200
2004
+ },
2005
+ {
2006
+ "epoch": 14.277320799059929,
2007
+ "grad_norm": 2.0377910137176514,
2008
+ "learning_rate": 8.584018801410105e-05,
2009
+ "loss": 0.222,
2010
+ "step": 24300
2011
+ },
2012
+ {
2013
+ "epoch": 14.336075205640423,
2014
+ "grad_norm": 2.213088274002075,
2015
+ "learning_rate": 8.495887191539365e-05,
2016
+ "loss": 0.217,
2017
+ "step": 24400
2018
+ },
2019
+ {
2020
+ "epoch": 14.394829612220917,
2021
+ "grad_norm": 3.5318024158477783,
2022
+ "learning_rate": 8.407755581668624e-05,
2023
+ "loss": 0.218,
2024
+ "step": 24500
2025
+ },
2026
+ {
2027
+ "epoch": 14.45358401880141,
2028
+ "grad_norm": 2.010096549987793,
2029
+ "learning_rate": 8.319623971797885e-05,
2030
+ "loss": 0.2189,
2031
+ "step": 24600
2032
+ },
2033
+ {
2034
+ "epoch": 14.512338425381904,
2035
+ "grad_norm": 1.9498238563537598,
2036
+ "learning_rate": 8.231492361927145e-05,
2037
+ "loss": 0.2238,
2038
+ "step": 24700
2039
+ },
2040
+ {
2041
+ "epoch": 14.571092831962398,
2042
+ "grad_norm": 2.8972408771514893,
2043
+ "learning_rate": 8.143360752056404e-05,
2044
+ "loss": 0.2133,
2045
+ "step": 24800
2046
+ },
2047
+ {
2048
+ "epoch": 14.62984723854289,
2049
+ "grad_norm": 1.39292311668396,
2050
+ "learning_rate": 8.055229142185663e-05,
2051
+ "loss": 0.2229,
2052
+ "step": 24900
2053
+ },
2054
+ {
2055
+ "epoch": 14.688601645123384,
2056
+ "grad_norm": 3.813009738922119,
2057
+ "learning_rate": 7.967097532314922e-05,
2058
+ "loss": 0.2165,
2059
+ "step": 25000
2060
+ },
2061
+ {
2062
+ "epoch": 14.688601645123384,
2063
+ "eval_accuracy": 0.7839530413575017,
2064
+ "eval_f1_macro": 0.4677806630551704,
2065
+ "eval_loss": 0.21110670268535614,
2066
+ "eval_precision": 0.5050811199516158,
2067
+ "eval_recall": 0.44609252100680624,
2068
+ "eval_runtime": 551.7967,
2069
+ "eval_samples_per_second": 87.683,
2070
+ "eval_steps_per_second": 0.685,
2071
+ "step": 25000
2072
+ },
2073
+ {
2074
+ "epoch": 14.747356051703878,
2075
+ "grad_norm": 2.7742602825164795,
2076
+ "learning_rate": 7.878965922444182e-05,
2077
+ "loss": 0.2139,
2078
+ "step": 25100
2079
+ },
2080
+ {
2081
+ "epoch": 14.80611045828437,
2082
+ "grad_norm": 2.4670934677124023,
2083
+ "learning_rate": 7.790834312573442e-05,
2084
+ "loss": 0.2183,
2085
+ "step": 25200
2086
+ },
2087
+ {
2088
+ "epoch": 14.864864864864865,
2089
+ "grad_norm": 3.2507874965667725,
2090
+ "learning_rate": 7.702702702702701e-05,
2091
+ "loss": 0.2229,
2092
+ "step": 25300
2093
+ },
2094
+ {
2095
+ "epoch": 14.923619271445359,
2096
+ "grad_norm": 1.7584885358810425,
2097
+ "learning_rate": 7.614571092831962e-05,
2098
+ "loss": 0.2171,
2099
+ "step": 25400
2100
+ },
2101
+ {
2102
+ "epoch": 14.982373678025851,
2103
+ "grad_norm": 2.5273969173431396,
2104
+ "learning_rate": 7.526439482961222e-05,
2105
+ "loss": 0.2184,
2106
+ "step": 25500
2107
+ },
2108
+ {
2109
+ "epoch": 15.041128084606346,
2110
+ "grad_norm": 2.622952699661255,
2111
+ "learning_rate": 7.438307873090481e-05,
2112
+ "loss": 0.2162,
2113
+ "step": 25600
2114
+ },
2115
+ {
2116
+ "epoch": 15.09988249118684,
2117
+ "grad_norm": 2.1974570751190186,
2118
+ "learning_rate": 7.350176263219741e-05,
2119
+ "loss": 0.2104,
2120
+ "step": 25700
2121
+ },
2122
+ {
2123
+ "epoch": 15.158636897767332,
2124
+ "grad_norm": 2.2584497928619385,
2125
+ "learning_rate": 7.262044653349001e-05,
2126
+ "loss": 0.2152,
2127
+ "step": 25800
2128
+ },
2129
+ {
2130
+ "epoch": 15.217391304347826,
2131
+ "grad_norm": 3.1817431449890137,
2132
+ "learning_rate": 7.17391304347826e-05,
2133
+ "loss": 0.2144,
2134
+ "step": 25900
2135
+ },
2136
+ {
2137
+ "epoch": 15.27614571092832,
2138
+ "grad_norm": 3.306057929992676,
2139
+ "learning_rate": 7.08578143360752e-05,
2140
+ "loss": 0.2197,
2141
+ "step": 26000
2142
+ },
2143
+ {
2144
+ "epoch": 15.27614571092832,
2145
+ "eval_accuracy": 0.7750036169729037,
2146
+ "eval_f1_macro": 0.4676251296697811,
2147
+ "eval_loss": 0.21305988729000092,
2148
+ "eval_precision": 0.49501914130610697,
2149
+ "eval_recall": 0.45513146662613874,
2150
+ "eval_runtime": 565.178,
2151
+ "eval_samples_per_second": 85.607,
2152
+ "eval_steps_per_second": 0.669,
2153
+ "step": 26000
2154
+ },
2155
+ {
2156
+ "epoch": 15.334900117508813,
2157
+ "grad_norm": 2.0208778381347656,
2158
+ "learning_rate": 6.99764982373678e-05,
2159
+ "loss": 0.2214,
2160
+ "step": 26100
2161
+ },
2162
+ {
2163
+ "epoch": 15.393654524089307,
2164
+ "grad_norm": 3.2097744941711426,
2165
+ "learning_rate": 6.909518213866038e-05,
2166
+ "loss": 0.2195,
2167
+ "step": 26200
2168
+ },
2169
+ {
2170
+ "epoch": 15.452408930669801,
2171
+ "grad_norm": 2.718372344970703,
2172
+ "learning_rate": 6.821386603995299e-05,
2173
+ "loss": 0.2233,
2174
+ "step": 26300
2175
+ },
2176
+ {
2177
+ "epoch": 15.511163337250293,
2178
+ "grad_norm": 3.371232032775879,
2179
+ "learning_rate": 6.733254994124559e-05,
2180
+ "loss": 0.2148,
2181
+ "step": 26400
2182
+ },
2183
+ {
2184
+ "epoch": 15.569917743830787,
2185
+ "grad_norm": 1.748062014579773,
2186
+ "learning_rate": 6.645123384253818e-05,
2187
+ "loss": 0.2151,
2188
+ "step": 26500
2189
+ },
2190
+ {
2191
+ "epoch": 15.628672150411282,
2192
+ "grad_norm": 2.6323885917663574,
2193
+ "learning_rate": 6.556991774383078e-05,
2194
+ "loss": 0.2193,
2195
+ "step": 26600
2196
+ },
2197
+ {
2198
+ "epoch": 15.687426556991774,
2199
+ "grad_norm": 3.380427598953247,
2200
+ "learning_rate": 6.468860164512338e-05,
2201
+ "loss": 0.2151,
2202
+ "step": 26700
2203
+ },
2204
+ {
2205
+ "epoch": 15.746180963572268,
2206
+ "grad_norm": 2.617914915084839,
2207
+ "learning_rate": 6.380728554641597e-05,
2208
+ "loss": 0.2175,
2209
+ "step": 26800
2210
+ },
2211
+ {
2212
+ "epoch": 15.804935370152762,
2213
+ "grad_norm": 2.5959670543670654,
2214
+ "learning_rate": 6.292596944770856e-05,
2215
+ "loss": 0.2158,
2216
+ "step": 26900
2217
+ },
2218
+ {
2219
+ "epoch": 15.863689776733255,
2220
+ "grad_norm": 1.8247867822647095,
2221
+ "learning_rate": 6.204465334900117e-05,
2222
+ "loss": 0.2173,
2223
+ "step": 27000
2224
+ },
2225
+ {
2226
+ "epoch": 15.863689776733255,
2227
+ "eval_accuracy": 0.7821548891139449,
2228
+ "eval_f1_macro": 0.46965749568257437,
2229
+ "eval_loss": 0.2087218463420868,
2230
+ "eval_precision": 0.5036667780561973,
2231
+ "eval_recall": 0.45166012346117074,
2232
+ "eval_runtime": 551.32,
2233
+ "eval_samples_per_second": 87.758,
2234
+ "eval_steps_per_second": 0.686,
2235
+ "step": 27000
2236
+ },
2237
+ {
2238
+ "epoch": 15.922444183313749,
2239
+ "grad_norm": 2.068295478820801,
2240
+ "learning_rate": 6.116333725029377e-05,
2241
+ "loss": 0.2161,
2242
+ "step": 27100
2243
+ },
2244
+ {
2245
+ "epoch": 15.981198589894243,
2246
+ "grad_norm": 2.173018455505371,
2247
+ "learning_rate": 6.0282021151586365e-05,
2248
+ "loss": 0.2192,
2249
+ "step": 27200
2250
+ },
2251
+ {
2252
+ "epoch": 16.039952996474735,
2253
+ "grad_norm": 3.564419746398926,
2254
+ "learning_rate": 5.940070505287896e-05,
2255
+ "loss": 0.213,
2256
+ "step": 27300
2257
+ },
2258
+ {
2259
+ "epoch": 16.09870740305523,
2260
+ "grad_norm": 2.131643772125244,
2261
+ "learning_rate": 5.851938895417156e-05,
2262
+ "loss": 0.219,
2263
+ "step": 27400
2264
+ },
2265
+ {
2266
+ "epoch": 16.157461809635723,
2267
+ "grad_norm": 1.6084250211715698,
2268
+ "learning_rate": 5.7638072855464154e-05,
2269
+ "loss": 0.2168,
2270
+ "step": 27500
2271
+ },
2272
+ {
2273
+ "epoch": 16.216216216216218,
2274
+ "grad_norm": 1.8609333038330078,
2275
+ "learning_rate": 5.6756756756756757e-05,
2276
+ "loss": 0.2203,
2277
+ "step": 27600
2278
+ },
2279
+ {
2280
+ "epoch": 16.274970622796708,
2281
+ "grad_norm": 1.929494857788086,
2282
+ "learning_rate": 5.5875440658049346e-05,
2283
+ "loss": 0.2126,
2284
+ "step": 27700
2285
+ },
2286
+ {
2287
+ "epoch": 16.333725029377202,
2288
+ "grad_norm": 1.7891273498535156,
2289
+ "learning_rate": 5.499412455934194e-05,
2290
+ "loss": 0.2125,
2291
+ "step": 27800
2292
+ },
2293
+ {
2294
+ "epoch": 16.392479435957696,
2295
+ "grad_norm": 1.935006022453308,
2296
+ "learning_rate": 5.411280846063454e-05,
2297
+ "loss": 0.2136,
2298
+ "step": 27900
2299
+ },
2300
+ {
2301
+ "epoch": 16.45123384253819,
2302
+ "grad_norm": 2.7039895057678223,
2303
+ "learning_rate": 5.323149236192714e-05,
2304
+ "loss": 0.2204,
2305
+ "step": 28000
2306
+ },
2307
+ {
2308
+ "epoch": 16.45123384253819,
2309
+ "eval_accuracy": 0.781844862865056,
2310
+ "eval_f1_macro": 0.47010557287584404,
2311
+ "eval_loss": 0.20962630212306976,
2312
+ "eval_precision": 0.5006758936230861,
2313
+ "eval_recall": 0.45393923309607365,
2314
+ "eval_runtime": 539.5236,
2315
+ "eval_samples_per_second": 89.677,
2316
+ "eval_steps_per_second": 0.701,
2317
+ "step": 28000
2318
+ },
2319
+ {
2320
+ "epoch": 16.509988249118685,
2321
+ "grad_norm": 2.465174913406372,
2322
+ "learning_rate": 5.235017626321974e-05,
2323
+ "loss": 0.215,
2324
+ "step": 28100
2325
+ },
2326
+ {
2327
+ "epoch": 16.56874265569918,
2328
+ "grad_norm": 3.626897096633911,
2329
+ "learning_rate": 5.146886016451233e-05,
2330
+ "loss": 0.2159,
2331
+ "step": 28200
2332
+ },
2333
+ {
2334
+ "epoch": 16.62749706227967,
2335
+ "grad_norm": 1.9083856344223022,
2336
+ "learning_rate": 5.0587544065804936e-05,
2337
+ "loss": 0.2146,
2338
+ "step": 28300
2339
+ },
2340
+ {
2341
+ "epoch": 16.686251468860164,
2342
+ "grad_norm": 1.8644742965698242,
2343
+ "learning_rate": 4.970622796709753e-05,
2344
+ "loss": 0.2155,
2345
+ "step": 28400
2346
+ },
2347
+ {
2348
+ "epoch": 16.745005875440658,
2349
+ "grad_norm": 2.8223023414611816,
2350
+ "learning_rate": 4.882491186839013e-05,
2351
+ "loss": 0.2127,
2352
+ "step": 28500
2353
+ },
2354
+ {
2355
+ "epoch": 16.803760282021152,
2356
+ "grad_norm": 2.9986822605133057,
2357
+ "learning_rate": 4.794359576968272e-05,
2358
+ "loss": 0.2168,
2359
+ "step": 28600
2360
+ },
2361
+ {
2362
+ "epoch": 16.862514688601646,
2363
+ "grad_norm": 2.5670571327209473,
2364
+ "learning_rate": 4.7062279670975314e-05,
2365
+ "loss": 0.2105,
2366
+ "step": 28700
2367
+ },
2368
+ {
2369
+ "epoch": 16.92126909518214,
2370
+ "grad_norm": 4.0372467041015625,
2371
+ "learning_rate": 4.6180963572267917e-05,
2372
+ "loss": 0.2116,
2373
+ "step": 28800
2374
+ },
2375
+ {
2376
+ "epoch": 16.98002350176263,
2377
+ "grad_norm": 2.199449300765991,
2378
+ "learning_rate": 4.529964747356051e-05,
2379
+ "loss": 0.2188,
2380
+ "step": 28900
2381
+ },
2382
+ {
2383
+ "epoch": 17.038777908343125,
2384
+ "grad_norm": 2.2641525268554688,
2385
+ "learning_rate": 4.441833137485311e-05,
2386
+ "loss": 0.2157,
2387
+ "step": 29000
2388
+ },
2389
+ {
2390
+ "epoch": 17.038777908343125,
2391
+ "eval_accuracy": 0.7866192670979476,
2392
+ "eval_f1_macro": 0.46819371251173775,
2393
+ "eval_loss": 0.2086167186498642,
2394
+ "eval_precision": 0.5101989602452385,
2395
+ "eval_recall": 0.44473033751495855,
2396
+ "eval_runtime": 537.4744,
2397
+ "eval_samples_per_second": 90.019,
2398
+ "eval_steps_per_second": 0.703,
2399
+ "step": 29000
2400
+ },
2401
+ {
2402
+ "epoch": 17.09753231492362,
2403
+ "grad_norm": 1.7599774599075317,
2404
+ "learning_rate": 4.353701527614571e-05,
2405
+ "loss": 0.2147,
2406
+ "step": 29100
2407
+ },
2408
+ {
2409
+ "epoch": 17.156286721504113,
2410
+ "grad_norm": 3.7391819953918457,
2411
+ "learning_rate": 4.265569917743831e-05,
2412
+ "loss": 0.2131,
2413
+ "step": 29200
2414
+ },
2415
+ {
2416
+ "epoch": 17.215041128084607,
2417
+ "grad_norm": 1.628392219543457,
2418
+ "learning_rate": 4.1774383078730904e-05,
2419
+ "loss": 0.211,
2420
+ "step": 29300
2421
+ },
2422
+ {
2423
+ "epoch": 17.2737955346651,
2424
+ "grad_norm": 2.0813705921173096,
2425
+ "learning_rate": 4.089306698002349e-05,
2426
+ "loss": 0.213,
2427
+ "step": 29400
2428
+ },
2429
+ {
2430
+ "epoch": 17.332549941245592,
2431
+ "grad_norm": 1.5833709239959717,
2432
+ "learning_rate": 4.0011750881316096e-05,
2433
+ "loss": 0.2122,
2434
+ "step": 29500
2435
+ },
2436
+ {
2437
+ "epoch": 17.391304347826086,
2438
+ "grad_norm": 2.216641664505005,
2439
+ "learning_rate": 3.913043478260869e-05,
2440
+ "loss": 0.2123,
2441
+ "step": 29600
2442
+ },
2443
+ {
2444
+ "epoch": 17.45005875440658,
2445
+ "grad_norm": 1.9753063917160034,
2446
+ "learning_rate": 3.824911868390129e-05,
2447
+ "loss": 0.2121,
2448
+ "step": 29700
2449
+ },
2450
+ {
2451
+ "epoch": 17.508813160987074,
2452
+ "grad_norm": 2.2607269287109375,
2453
+ "learning_rate": 3.7367802585193884e-05,
2454
+ "loss": 0.2179,
2455
+ "step": 29800
2456
+ },
2457
+ {
2458
+ "epoch": 17.56756756756757,
2459
+ "grad_norm": 4.074460506439209,
2460
+ "learning_rate": 3.648648648648649e-05,
2461
+ "loss": 0.2104,
2462
+ "step": 29900
2463
+ },
2464
+ {
2465
+ "epoch": 17.626321974148063,
2466
+ "grad_norm": 1.758702039718628,
2467
+ "learning_rate": 3.560517038777908e-05,
2468
+ "loss": 0.2135,
2469
+ "step": 30000
2470
+ },
2471
+ {
2472
+ "epoch": 17.626321974148063,
2473
+ "eval_accuracy": 0.781369489283426,
2474
+ "eval_f1_macro": 0.4699363142405637,
2475
+ "eval_loss": 0.20687498152256012,
2476
+ "eval_precision": 0.5076781202687786,
2477
+ "eval_recall": 0.4518498245169957,
2478
+ "eval_runtime": 584.9222,
2479
+ "eval_samples_per_second": 82.717,
2480
+ "eval_steps_per_second": 0.646,
2481
+ "step": 30000
2482
+ },
2483
+ {
2484
+ "epoch": 17.685076380728553,
2485
+ "grad_norm": 2.3470921516418457,
2486
+ "learning_rate": 3.472385428907168e-05,
2487
+ "loss": 0.2152,
2488
+ "step": 30100
2489
+ },
2490
+ {
2491
+ "epoch": 17.743830787309047,
2492
+ "grad_norm": 2.4279568195343018,
2493
+ "learning_rate": 3.3842538190364276e-05,
2494
+ "loss": 0.2144,
2495
+ "step": 30200
2496
+ },
2497
+ {
2498
+ "epoch": 17.80258519388954,
2499
+ "grad_norm": 1.7226651906967163,
2500
+ "learning_rate": 3.296122209165687e-05,
2501
+ "loss": 0.214,
2502
+ "step": 30300
2503
+ },
2504
+ {
2505
+ "epoch": 17.861339600470036,
2506
+ "grad_norm": 3.090634346008301,
2507
+ "learning_rate": 3.207990599294947e-05,
2508
+ "loss": 0.2175,
2509
+ "step": 30400
2510
+ },
2511
+ {
2512
+ "epoch": 17.92009400705053,
2513
+ "grad_norm": 1.787026286125183,
2514
+ "learning_rate": 3.1198589894242064e-05,
2515
+ "loss": 0.2112,
2516
+ "step": 30500
2517
+ },
2518
+ {
2519
+ "epoch": 17.978848413631024,
2520
+ "grad_norm": 1.844420313835144,
2521
+ "learning_rate": 3.0317273795534663e-05,
2522
+ "loss": 0.2132,
2523
+ "step": 30600
2524
+ },
2525
+ {
2526
+ "epoch": 18.037602820211514,
2527
+ "grad_norm": 1.7395440340042114,
2528
+ "learning_rate": 2.9435957696827263e-05,
2529
+ "loss": 0.2091,
2530
+ "step": 30700
2531
+ },
2532
+ {
2533
+ "epoch": 18.09635722679201,
2534
+ "grad_norm": 1.9990227222442627,
2535
+ "learning_rate": 2.8554641598119856e-05,
2536
+ "loss": 0.2123,
2537
+ "step": 30800
2538
+ },
2539
+ {
2540
+ "epoch": 18.155111633372503,
2541
+ "grad_norm": 2.4807918071746826,
2542
+ "learning_rate": 2.7673325499412452e-05,
2543
+ "loss": 0.2082,
2544
+ "step": 30900
2545
+ },
2546
+ {
2547
+ "epoch": 18.213866039952997,
2548
+ "grad_norm": 2.496959924697876,
2549
+ "learning_rate": 2.679200940070505e-05,
2550
+ "loss": 0.2088,
2551
+ "step": 31000
2552
+ },
2553
+ {
2554
+ "epoch": 18.213866039952997,
2555
+ "eval_accuracy": 0.783353657276316,
2556
+ "eval_f1_macro": 0.4700197150315801,
2557
+ "eval_loss": 0.20601825416088104,
2558
+ "eval_precision": 0.5112111682738877,
2559
+ "eval_recall": 0.45050131045693015,
2560
+ "eval_runtime": 542.6455,
2561
+ "eval_samples_per_second": 89.161,
2562
+ "eval_steps_per_second": 0.697,
2563
+ "step": 31000
2564
+ },
2565
+ {
2566
+ "epoch": 18.27262044653349,
2567
+ "grad_norm": 2.345229387283325,
2568
+ "learning_rate": 2.591069330199765e-05,
2569
+ "loss": 0.2152,
2570
+ "step": 31100
2571
+ },
2572
+ {
2573
+ "epoch": 18.331374853113985,
2574
+ "grad_norm": 2.492675542831421,
2575
+ "learning_rate": 2.5029377203290243e-05,
2576
+ "loss": 0.2115,
2577
+ "step": 31200
2578
+ },
2579
+ {
2580
+ "epoch": 18.390129259694476,
2581
+ "grad_norm": 2.251948833465576,
2582
+ "learning_rate": 2.4148061104582843e-05,
2583
+ "loss": 0.2111,
2584
+ "step": 31300
2585
+ },
2586
+ {
2587
+ "epoch": 18.44888366627497,
2588
+ "grad_norm": 2.327437400817871,
2589
+ "learning_rate": 2.326674500587544e-05,
2590
+ "loss": 0.2085,
2591
+ "step": 31400
2592
+ },
2593
+ {
2594
+ "epoch": 18.507638072855464,
2595
+ "grad_norm": 2.292947292327881,
2596
+ "learning_rate": 2.238542890716804e-05,
2597
+ "loss": 0.2141,
2598
+ "step": 31500
2599
+ },
2600
+ {
2601
+ "epoch": 18.566392479435958,
2602
+ "grad_norm": 2.319504499435425,
2603
+ "learning_rate": 2.150411280846063e-05,
2604
+ "loss": 0.2133,
2605
+ "step": 31600
2606
+ },
2607
+ {
2608
+ "epoch": 18.625146886016452,
2609
+ "grad_norm": 2.6240415573120117,
2610
+ "learning_rate": 2.062279670975323e-05,
2611
+ "loss": 0.2161,
2612
+ "step": 31700
2613
+ },
2614
+ {
2615
+ "epoch": 18.683901292596946,
2616
+ "grad_norm": 1.637320637702942,
2617
+ "learning_rate": 1.9741480611045827e-05,
2618
+ "loss": 0.2084,
2619
+ "step": 31800
2620
+ },
2621
+ {
2622
+ "epoch": 18.742655699177437,
2623
+ "grad_norm": 1.8866758346557617,
2624
+ "learning_rate": 1.8860164512338426e-05,
2625
+ "loss": 0.2099,
2626
+ "step": 31900
2627
+ },
2628
+ {
2629
+ "epoch": 18.80141010575793,
2630
+ "grad_norm": 1.5909967422485352,
2631
+ "learning_rate": 1.7978848413631022e-05,
2632
+ "loss": 0.215,
2633
+ "step": 32000
2634
+ },
2635
+ {
2636
+ "epoch": 18.80141010575793,
2637
+ "eval_accuracy": 0.7825475890292045,
2638
+ "eval_f1_macro": 0.4729882480682901,
2639
+ "eval_loss": 0.2060166597366333,
2640
+ "eval_precision": 0.5092421494744399,
2641
+ "eval_recall": 0.45578106777502264,
2642
+ "eval_runtime": 530.5298,
2643
+ "eval_samples_per_second": 91.198,
2644
+ "eval_steps_per_second": 0.712,
2645
+ "step": 32000
2646
+ },
2647
+ {
2648
+ "epoch": 18.860164512338425,
2649
+ "grad_norm": 2.3732552528381348,
2650
+ "learning_rate": 1.709753231492362e-05,
2651
+ "loss": 0.2126,
2652
+ "step": 32100
2653
+ },
2654
+ {
2655
+ "epoch": 18.91891891891892,
2656
+ "grad_norm": 2.5773391723632812,
2657
+ "learning_rate": 1.6216216216216215e-05,
2658
+ "loss": 0.2095,
2659
+ "step": 32200
2660
+ },
2661
+ {
2662
+ "epoch": 18.977673325499413,
2663
+ "grad_norm": 2.3607606887817383,
2664
+ "learning_rate": 1.533490011750881e-05,
2665
+ "loss": 0.2099,
2666
+ "step": 32300
2667
+ },
2668
+ {
2669
+ "epoch": 19.036427732079908,
2670
+ "grad_norm": 3.814934730529785,
2671
+ "learning_rate": 1.445358401880141e-05,
2672
+ "loss": 0.21,
2673
+ "step": 32400
2674
+ },
2675
+ {
2676
+ "epoch": 19.095182138660398,
2677
+ "grad_norm": 1.9512494802474976,
2678
+ "learning_rate": 1.3572267920094006e-05,
2679
+ "loss": 0.2098,
2680
+ "step": 32500
2681
+ },
2682
+ {
2683
+ "epoch": 19.153936545240892,
2684
+ "grad_norm": 2.072913408279419,
2685
+ "learning_rate": 1.2690951821386604e-05,
2686
+ "loss": 0.2156,
2687
+ "step": 32600
2688
+ },
2689
+ {
2690
+ "epoch": 19.212690951821386,
2691
+ "grad_norm": 3.2823166847229004,
2692
+ "learning_rate": 1.18096357226792e-05,
2693
+ "loss": 0.2058,
2694
+ "step": 32700
2695
+ },
2696
+ {
2697
+ "epoch": 19.27144535840188,
2698
+ "grad_norm": 2.4737155437469482,
2699
+ "learning_rate": 1.0928319623971798e-05,
2700
+ "loss": 0.2084,
2701
+ "step": 32800
2702
+ },
2703
+ {
2704
+ "epoch": 19.330199764982375,
2705
+ "grad_norm": 2.195495843887329,
2706
+ "learning_rate": 1.0047003525264394e-05,
2707
+ "loss": 0.2099,
2708
+ "step": 32900
2709
+ },
2710
+ {
2711
+ "epoch": 19.38895417156287,
2712
+ "grad_norm": 2.7346057891845703,
2713
+ "learning_rate": 9.16568742655699e-06,
2714
+ "loss": 0.2138,
2715
+ "step": 33000
2716
+ },
2717
+ {
2718
+ "epoch": 19.38895417156287,
2719
+ "eval_accuracy": 0.7822788996135006,
2720
+ "eval_f1_macro": 0.47654790915623974,
2721
+ "eval_loss": 0.2069149762392044,
2722
+ "eval_precision": 0.5055123102220627,
2723
+ "eval_recall": 0.4617181765781322,
2724
+ "eval_runtime": 563.9047,
2725
+ "eval_samples_per_second": 85.8,
2726
+ "eval_steps_per_second": 0.67,
2727
+ "step": 33000
2728
+ },
2729
+ {
2730
+ "epoch": 19.44770857814336,
2731
+ "grad_norm": 1.7967709302902222,
2732
+ "learning_rate": 8.284371327849588e-06,
2733
+ "loss": 0.2113,
2734
+ "step": 33100
2735
+ },
2736
+ {
2737
+ "epoch": 19.506462984723854,
2738
+ "grad_norm": 2.043199300765991,
2739
+ "learning_rate": 7.403055229142185e-06,
2740
+ "loss": 0.2134,
2741
+ "step": 33200
2742
+ },
2743
+ {
2744
+ "epoch": 19.565217391304348,
2745
+ "grad_norm": 2.2731423377990723,
2746
+ "learning_rate": 6.521739130434782e-06,
2747
+ "loss": 0.2053,
2748
+ "step": 33300
2749
+ },
2750
+ {
2751
+ "epoch": 19.623971797884842,
2752
+ "grad_norm": 1.7844184637069702,
2753
+ "learning_rate": 5.640423031727379e-06,
2754
+ "loss": 0.2102,
2755
+ "step": 33400
2756
+ },
2757
+ {
2758
+ "epoch": 19.682726204465336,
2759
+ "grad_norm": 2.1626009941101074,
2760
+ "learning_rate": 4.759106933019976e-06,
2761
+ "loss": 0.2165,
2762
+ "step": 33500
2763
+ },
2764
+ {
2765
+ "epoch": 19.74148061104583,
2766
+ "grad_norm": 2.5223422050476074,
2767
+ "learning_rate": 3.877790834312573e-06,
2768
+ "loss": 0.2061,
2769
+ "step": 33600
2770
+ },
2771
+ {
2772
+ "epoch": 19.80023501762632,
2773
+ "grad_norm": 2.5433812141418457,
2774
+ "learning_rate": 2.99647473560517e-06,
2775
+ "loss": 0.2083,
2776
+ "step": 33700
2777
+ },
2778
+ {
2779
+ "epoch": 19.858989424206815,
2780
+ "grad_norm": 2.087890148162842,
2781
+ "learning_rate": 2.1151586368977672e-06,
2782
+ "loss": 0.2117,
2783
+ "step": 33800
2784
+ },
2785
+ {
2786
+ "epoch": 19.91774383078731,
2787
+ "grad_norm": 2.9558768272399902,
2788
+ "learning_rate": 1.2338425381903642e-06,
2789
+ "loss": 0.2139,
2790
+ "step": 33900
2791
+ },
2792
+ {
2793
+ "epoch": 19.976498237367803,
2794
+ "grad_norm": 2.22611927986145,
2795
+ "learning_rate": 3.525264394829612e-07,
2796
+ "loss": 0.2125,
2797
+ "step": 34000
2798
+ },
2799
+ {
2800
+ "epoch": 19.976498237367803,
2801
+ "eval_accuracy": 0.7837876940247608,
2802
+ "eval_f1_macro": 0.4752409882222652,
2803
+ "eval_loss": 0.20548085868358612,
2804
+ "eval_precision": 0.506773667085311,
2805
+ "eval_recall": 0.45863653128297405,
2806
+ "eval_runtime": 556.0383,
2807
+ "eval_samples_per_second": 87.014,
2808
+ "eval_steps_per_second": 0.68,
2809
+ "step": 34000
2810
+ }
2811
+ ],
2812
+ "logging_steps": 100,
2813
+ "max_steps": 34040,
2814
+ "num_input_tokens_seen": 0,
2815
+ "num_train_epochs": 20,
2816
+ "save_steps": 1000,
2817
+ "stateful_callbacks": {
2818
+ "TrainerControl": {
2819
+ "args": {
2820
+ "should_epoch_stop": false,
2821
+ "should_evaluate": false,
2822
+ "should_log": false,
2823
+ "should_save": true,
2824
+ "should_training_stop": false
2825
+ },
2826
+ "attributes": {}
2827
+ }
2828
+ },
2829
+ "total_flos": 2.2885422443855616e+18,
2830
+ "train_batch_size": 256,
2831
+ "trial_name": null,
2832
+ "trial_params": null
2833
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e4ae76ec58b487fbfb990b251c8d99d9f248f8fa34093d0f6f5edb76bc44251
3
+ size 5304