fujie committed on
Commit
b608ac3
·
verified ·
1 Parent(s): 1a8bd1c

Training in progress, epoch 1

Browse files
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: facebook/wav2vec2-base
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: esc50-wav2vec2-attn
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # esc50-wav2vec2-attn
18
+
19
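+ This model is a fine-tuned version of [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) on what appears to be the ESC-50 environmental sound classification dataset (the Trainer did not record a dataset name; this is inferred from the model name, the `finetuning_task` value `esc50-audio-classification`, and the 50-class label set in `config.json`).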
20
+ It achieves the following results on the evaluation set (reported for the best checkpoint, step 4275 / epoch 19, rather than the final epoch):
21
+ - Loss: 0.6556
22
+ - Accuracy: 0.875
23
+ - F1 Macro: 0.8752
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
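+ The run used 1,800 training samples and 200 evaluation samples (as recorded in `all_results.json`). More information needed on the data source, split strategy, and preprocessing.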
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 3e-05
43
+ - train_batch_size: 8
44
+ - eval_batch_size: 8
45
+ - seed: 42
46
+ - optimizer: AdamW, PyTorch fused implementation (`OptimizerNames.ADAMW_TORCH_FUSED`), with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
47
+ - lr_scheduler_type: linear
48
+ - lr_scheduler_warmup_steps: 450
49
+ - num_epochs: 20.0
50
+ - mixed_precision_training: Native AMP
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 Macro |
55
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|
56
+ | 3.7226 | 1.0 | 225 | 3.6235 | 0.205 | 0.1414 |
57
+ | 3.0002 | 2.0 | 450 | 2.8299 | 0.435 | 0.3782 |
58
+ | 2.2889 | 3.0 | 675 | 2.0599 | 0.585 | 0.5339 |
59
+ | 1.6197 | 4.0 | 900 | 1.5775 | 0.7 | 0.66 |
60
+ | 1.1835 | 5.0 | 1125 | 1.3031 | 0.72 | 0.7070 |
61
+ | 0.7858 | 6.0 | 1350 | 1.2474 | 0.7 | 0.6953 |
62
+ | 0.5843 | 7.0 | 1575 | 0.9818 | 0.76 | 0.7385 |
63
+ | 0.4295 | 8.0 | 1800 | 0.8253 | 0.8 | 0.7958 |
64
+ | 0.3041 | 9.0 | 2025 | 0.8176 | 0.8 | 0.7926 |
65
+ | 0.2178 | 10.0 | 2250 | 0.8450 | 0.795 | 0.7861 |
66
+ | 0.1874 | 11.0 | 2475 | 0.7450 | 0.81 | 0.8045 |
67
+ | 0.1225 | 12.0 | 2700 | 0.7663 | 0.845 | 0.8409 |
68
+ | 0.0818 | 13.0 | 2925 | 0.7127 | 0.855 | 0.8531 |
69
+ | 0.0874 | 14.0 | 3150 | 0.7242 | 0.84 | 0.8396 |
70
+ | 0.0469 | 15.0 | 3375 | 0.6220 | 0.855 | 0.8562 |
71
+ | 0.0531 | 16.0 | 3600 | 0.5916 | 0.875 | 0.8743 |
72
+ | 0.0351 | 17.0 | 3825 | 0.6738 | 0.85 | 0.8485 |
73
+ | 0.0205 | 18.0 | 4050 | 0.6656 | 0.865 | 0.8666 |
74
+ | 0.0207 | 19.0 | 4275 | 0.6556 | 0.875 | 0.8752 |
75
+ | 0.0194 | 20.0 | 4500 | 0.6624 | 0.875 | 0.8752 |
76
+
77
+
78
+ ### Framework versions
79
+
80
+ - Transformers 4.56.1
81
+ - Pytorch 2.8.0+cu128
82
+ - Datasets 2.19.0
83
+ - Tokenizers 0.22.0
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.875,
4
+ "eval_f1_macro": 0.8751587301587301,
5
+ "eval_loss": 0.655569851398468,
6
+ "eval_runtime": 2.9004,
7
+ "eval_samples": 200,
8
+ "eval_samples_per_second": 68.957,
9
+ "eval_steps_per_second": 8.62,
10
+ "total_flos": 1.64162630016e+18,
11
+ "train_loss": 0.7825442723168267,
12
+ "train_runtime": 840.5538,
13
+ "train_samples": 1800,
14
+ "train_samples_per_second": 42.829,
15
+ "train_steps_per_second": 5.354
16
+ }
config.json ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "adapter_attn_dim": null,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForSequenceClassification"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "dtype": "float32",
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "finetuning_task": "esc50-audio-classification",
56
+ "freeze_feat_extract_train": true,
57
+ "hidden_act": "gelu",
58
+ "hidden_dropout": 0.1,
59
+ "hidden_size": 768,
60
+ "id2label": {
61
+ "0": "airplane",
62
+ "1": "breathing",
63
+ "2": "brushing_teeth",
64
+ "3": "can_opening",
65
+ "4": "car_horn",
66
+ "5": "cat",
67
+ "6": "chainsaw",
68
+ "7": "chirping_birds",
69
+ "8": "church_bells",
70
+ "9": "clapping",
71
+ "10": "clock_alarm",
72
+ "11": "clock_tick",
73
+ "12": "coughing",
74
+ "13": "cow",
75
+ "14": "crackling_fire",
76
+ "15": "crickets",
77
+ "16": "crow",
78
+ "17": "crying_baby",
79
+ "18": "dog",
80
+ "19": "door_wood_creaks",
81
+ "20": "door_wood_knock",
82
+ "21": "drinking_sipping",
83
+ "22": "engine",
84
+ "23": "fireworks",
85
+ "24": "footsteps",
86
+ "25": "frog",
87
+ "26": "glass_breaking",
88
+ "27": "hand_saw",
89
+ "28": "helicopter",
90
+ "29": "hen",
91
+ "30": "insects",
92
+ "31": "keyboard_typing",
93
+ "32": "laughing",
94
+ "33": "mouse_click",
95
+ "34": "pig",
96
+ "35": "pouring_water",
97
+ "36": "rain",
98
+ "37": "rooster",
99
+ "38": "sea_waves",
100
+ "39": "sheep",
101
+ "40": "siren",
102
+ "41": "sneezing",
103
+ "42": "snoring",
104
+ "43": "thunderstorm",
105
+ "44": "toilet_flush",
106
+ "45": "train",
107
+ "46": "vacuum_cleaner",
108
+ "47": "washing_machine",
109
+ "48": "water_drops",
110
+ "49": "wind"
111
+ },
112
+ "initializer_range": 0.02,
113
+ "intermediate_size": 3072,
114
+ "label2id": {
115
+ "airplane": 0,
116
+ "breathing": 1,
117
+ "brushing_teeth": 2,
118
+ "can_opening": 3,
119
+ "car_horn": 4,
120
+ "cat": 5,
121
+ "chainsaw": 6,
122
+ "chirping_birds": 7,
123
+ "church_bells": 8,
124
+ "clapping": 9,
125
+ "clock_alarm": 10,
126
+ "clock_tick": 11,
127
+ "coughing": 12,
128
+ "cow": 13,
129
+ "crackling_fire": 14,
130
+ "crickets": 15,
131
+ "crow": 16,
132
+ "crying_baby": 17,
133
+ "dog": 18,
134
+ "door_wood_creaks": 19,
135
+ "door_wood_knock": 20,
136
+ "drinking_sipping": 21,
137
+ "engine": 22,
138
+ "fireworks": 23,
139
+ "footsteps": 24,
140
+ "frog": 25,
141
+ "glass_breaking": 26,
142
+ "hand_saw": 27,
143
+ "helicopter": 28,
144
+ "hen": 29,
145
+ "insects": 30,
146
+ "keyboard_typing": 31,
147
+ "laughing": 32,
148
+ "mouse_click": 33,
149
+ "pig": 34,
150
+ "pouring_water": 35,
151
+ "rain": 36,
152
+ "rooster": 37,
153
+ "sea_waves": 38,
154
+ "sheep": 39,
155
+ "siren": 40,
156
+ "sneezing": 41,
157
+ "snoring": 42,
158
+ "thunderstorm": 43,
159
+ "toilet_flush": 44,
160
+ "train": 45,
161
+ "vacuum_cleaner": 46,
162
+ "washing_machine": 47,
163
+ "water_drops": 48,
164
+ "wind": 49
165
+ },
166
+ "layer_norm_eps": 1e-05,
167
+ "layerdrop": 0.0,
168
+ "mask_channel_length": 10,
169
+ "mask_channel_min_space": 1,
170
+ "mask_channel_other": 0.0,
171
+ "mask_channel_prob": 0.0,
172
+ "mask_channel_selection": "static",
173
+ "mask_feature_length": 10,
174
+ "mask_feature_min_masks": 0,
175
+ "mask_feature_prob": 0.0,
176
+ "mask_time_length": 10,
177
+ "mask_time_min_masks": 2,
178
+ "mask_time_min_space": 1,
179
+ "mask_time_other": 0.0,
180
+ "mask_time_prob": 0.05,
181
+ "mask_time_selection": "static",
182
+ "model_type": "wav2vec2",
183
+ "no_mask_channel_overlap": false,
184
+ "no_mask_time_overlap": false,
185
+ "num_adapter_layers": 3,
186
+ "num_attention_heads": 12,
187
+ "num_codevector_groups": 2,
188
+ "num_codevectors_per_group": 320,
189
+ "num_conv_pos_embedding_groups": 16,
190
+ "num_conv_pos_embeddings": 128,
191
+ "num_feat_extract_layers": 7,
192
+ "num_hidden_layers": 12,
193
+ "num_negatives": 100,
194
+ "output_hidden_size": 768,
195
+ "pad_token_id": 0,
196
+ "proj_codevector_dim": 256,
197
+ "tdnn_dilation": [
198
+ 1,
199
+ 2,
200
+ 3,
201
+ 1,
202
+ 1
203
+ ],
204
+ "tdnn_dim": [
205
+ 512,
206
+ 512,
207
+ 512,
208
+ 512,
209
+ 1500
210
+ ],
211
+ "tdnn_kernel": [
212
+ 5,
213
+ 3,
214
+ 3,
215
+ 1,
216
+ 1
217
+ ],
218
+ "transformers_version": "4.56.1",
219
+ "use_weighted_layer_sum": false,
220
+ "vocab_size": 32,
221
+ "xvector_output_dim": 512
222
+ }
eval_results.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "eval_accuracy": 0.875,
4
+ "eval_f1_macro": 0.8751587301587301,
5
+ "eval_loss": 0.655569851398468,
6
+ "eval_runtime": 2.9004,
7
+ "eval_samples": 200,
8
+ "eval_samples_per_second": 68.957,
9
+ "eval_steps_per_second": 8.62
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45707f7e593430122518253e21adb4de88e579ce7e9cf4b9f263e5e8aa31cdc0
3
+ size 378351720
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 20.0,
3
+ "total_flos": 1.64162630016e+18,
4
+ "train_loss": 0.7825442723168267,
5
+ "train_runtime": 840.5538,
6
+ "train_samples": 1800,
7
+ "train_samples_per_second": 42.829,
8
+ "train_steps_per_second": 5.354
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,873 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 4275,
3
+ "best_metric": 0.8751587301587301,
4
+ "best_model_checkpoint": "./esc50-wav2vec2-attn/checkpoint-4275",
5
+ "epoch": 20.0,
6
+ "eval_steps": 500,
7
+ "global_step": 4500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.2222222222222222,
14
+ "grad_norm": 4.362800598144531,
15
+ "learning_rate": 3.2666666666666666e-06,
16
+ "loss": 3.9144,
17
+ "step": 50
18
+ },
19
+ {
20
+ "epoch": 0.4444444444444444,
21
+ "grad_norm": Infinity,
22
+ "learning_rate": 6.6e-06,
23
+ "loss": 3.8743,
24
+ "step": 100
25
+ },
26
+ {
27
+ "epoch": 0.6666666666666666,
28
+ "grad_norm": 4.956060409545898,
29
+ "learning_rate": 9.933333333333334e-06,
30
+ "loss": 3.8406,
31
+ "step": 150
32
+ },
33
+ {
34
+ "epoch": 0.8888888888888888,
35
+ "grad_norm": 9.889445304870605,
36
+ "learning_rate": 1.3266666666666668e-05,
37
+ "loss": 3.7226,
38
+ "step": 200
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_accuracy": 0.205,
43
+ "eval_f1_macro": 0.14144966384623137,
44
+ "eval_loss": 3.6234569549560547,
45
+ "eval_runtime": 2.8782,
46
+ "eval_samples_per_second": 69.489,
47
+ "eval_steps_per_second": 8.686,
48
+ "step": 225
49
+ },
50
+ {
51
+ "epoch": 1.1111111111111112,
52
+ "grad_norm": 5.627626419067383,
53
+ "learning_rate": 1.66e-05,
54
+ "loss": 3.5617,
55
+ "step": 250
56
+ },
57
+ {
58
+ "epoch": 1.3333333333333333,
59
+ "grad_norm": 6.8414154052734375,
60
+ "learning_rate": 1.993333333333333e-05,
61
+ "loss": 3.4209,
62
+ "step": 300
63
+ },
64
+ {
65
+ "epoch": 1.5555555555555556,
66
+ "grad_norm": 7.82706356048584,
67
+ "learning_rate": 2.326666666666667e-05,
68
+ "loss": 3.2577,
69
+ "step": 350
70
+ },
71
+ {
72
+ "epoch": 1.7777777777777777,
73
+ "grad_norm": 8.264775276184082,
74
+ "learning_rate": 2.6600000000000003e-05,
75
+ "loss": 3.1186,
76
+ "step": 400
77
+ },
78
+ {
79
+ "epoch": 2.0,
80
+ "grad_norm": 8.502516746520996,
81
+ "learning_rate": 2.9933333333333334e-05,
82
+ "loss": 3.0002,
83
+ "step": 450
84
+ },
85
+ {
86
+ "epoch": 2.0,
87
+ "eval_accuracy": 0.435,
88
+ "eval_f1_macro": 0.3781537935748462,
89
+ "eval_loss": 2.8298535346984863,
90
+ "eval_runtime": 2.8565,
91
+ "eval_samples_per_second": 70.015,
92
+ "eval_steps_per_second": 8.752,
93
+ "step": 450
94
+ },
95
+ {
96
+ "epoch": 2.2222222222222223,
97
+ "grad_norm": 9.52200698852539,
98
+ "learning_rate": 2.963703703703704e-05,
99
+ "loss": 2.6937,
100
+ "step": 500
101
+ },
102
+ {
103
+ "epoch": 2.4444444444444446,
104
+ "grad_norm": 8.961172103881836,
105
+ "learning_rate": 2.9266666666666665e-05,
106
+ "loss": 2.4869,
107
+ "step": 550
108
+ },
109
+ {
110
+ "epoch": 2.6666666666666665,
111
+ "grad_norm": 12.154603004455566,
112
+ "learning_rate": 2.8896296296296298e-05,
113
+ "loss": 2.4099,
114
+ "step": 600
115
+ },
116
+ {
117
+ "epoch": 2.888888888888889,
118
+ "grad_norm": 10.498632431030273,
119
+ "learning_rate": 2.8525925925925924e-05,
120
+ "loss": 2.2889,
121
+ "step": 650
122
+ },
123
+ {
124
+ "epoch": 3.0,
125
+ "eval_accuracy": 0.585,
126
+ "eval_f1_macro": 0.533888888888889,
127
+ "eval_loss": 2.0598816871643066,
128
+ "eval_runtime": 2.8698,
129
+ "eval_samples_per_second": 69.691,
130
+ "eval_steps_per_second": 8.711,
131
+ "step": 675
132
+ },
133
+ {
134
+ "epoch": 3.111111111111111,
135
+ "grad_norm": 7.486043930053711,
136
+ "learning_rate": 2.8155555555555556e-05,
137
+ "loss": 2.055,
138
+ "step": 700
139
+ },
140
+ {
141
+ "epoch": 3.3333333333333335,
142
+ "grad_norm": 9.043672561645508,
143
+ "learning_rate": 2.7785185185185185e-05,
144
+ "loss": 1.8778,
145
+ "step": 750
146
+ },
147
+ {
148
+ "epoch": 3.5555555555555554,
149
+ "grad_norm": 12.435341835021973,
150
+ "learning_rate": 2.7414814814814815e-05,
151
+ "loss": 1.8209,
152
+ "step": 800
153
+ },
154
+ {
155
+ "epoch": 3.7777777777777777,
156
+ "grad_norm": 18.719287872314453,
157
+ "learning_rate": 2.7044444444444444e-05,
158
+ "loss": 1.5985,
159
+ "step": 850
160
+ },
161
+ {
162
+ "epoch": 4.0,
163
+ "grad_norm": 11.806514739990234,
164
+ "learning_rate": 2.6674074074074076e-05,
165
+ "loss": 1.6197,
166
+ "step": 900
167
+ },
168
+ {
169
+ "epoch": 4.0,
170
+ "eval_accuracy": 0.7,
171
+ "eval_f1_macro": 0.66,
172
+ "eval_loss": 1.5774518251419067,
173
+ "eval_runtime": 2.8913,
174
+ "eval_samples_per_second": 69.172,
175
+ "eval_steps_per_second": 8.647,
176
+ "step": 900
177
+ },
178
+ {
179
+ "epoch": 4.222222222222222,
180
+ "grad_norm": 11.915701866149902,
181
+ "learning_rate": 2.6303703703703702e-05,
182
+ "loss": 1.2872,
183
+ "step": 950
184
+ },
185
+ {
186
+ "epoch": 4.444444444444445,
187
+ "grad_norm": 26.78908920288086,
188
+ "learning_rate": 2.5933333333333335e-05,
189
+ "loss": 1.2329,
190
+ "step": 1000
191
+ },
192
+ {
193
+ "epoch": 4.666666666666667,
194
+ "grad_norm": 9.02193546295166,
195
+ "learning_rate": 2.5562962962962964e-05,
196
+ "loss": 1.2363,
197
+ "step": 1050
198
+ },
199
+ {
200
+ "epoch": 4.888888888888889,
201
+ "grad_norm": 7.637599468231201,
202
+ "learning_rate": 2.5192592592592593e-05,
203
+ "loss": 1.1835,
204
+ "step": 1100
205
+ },
206
+ {
207
+ "epoch": 5.0,
208
+ "eval_accuracy": 0.72,
209
+ "eval_f1_macro": 0.7070447330447331,
210
+ "eval_loss": 1.303067684173584,
211
+ "eval_runtime": 2.8767,
212
+ "eval_samples_per_second": 69.525,
213
+ "eval_steps_per_second": 8.691,
214
+ "step": 1125
215
+ },
216
+ {
217
+ "epoch": 5.111111111111111,
218
+ "grad_norm": 31.952880859375,
219
+ "learning_rate": 2.4822222222222222e-05,
220
+ "loss": 1.0582,
221
+ "step": 1150
222
+ },
223
+ {
224
+ "epoch": 5.333333333333333,
225
+ "grad_norm": 11.701904296875,
226
+ "learning_rate": 2.4451851851851855e-05,
227
+ "loss": 0.8821,
228
+ "step": 1200
229
+ },
230
+ {
231
+ "epoch": 5.555555555555555,
232
+ "grad_norm": 13.303455352783203,
233
+ "learning_rate": 2.408148148148148e-05,
234
+ "loss": 0.9102,
235
+ "step": 1250
236
+ },
237
+ {
238
+ "epoch": 5.777777777777778,
239
+ "grad_norm": 12.172894477844238,
240
+ "learning_rate": 2.3711111111111113e-05,
241
+ "loss": 0.8899,
242
+ "step": 1300
243
+ },
244
+ {
245
+ "epoch": 6.0,
246
+ "grad_norm": 17.647672653198242,
247
+ "learning_rate": 2.334074074074074e-05,
248
+ "loss": 0.7858,
249
+ "step": 1350
250
+ },
251
+ {
252
+ "epoch": 6.0,
253
+ "eval_accuracy": 0.7,
254
+ "eval_f1_macro": 0.6953009213009212,
255
+ "eval_loss": 1.2473626136779785,
256
+ "eval_runtime": 2.9867,
257
+ "eval_samples_per_second": 66.963,
258
+ "eval_steps_per_second": 8.37,
259
+ "step": 1350
260
+ },
261
+ {
262
+ "epoch": 6.222222222222222,
263
+ "grad_norm": 3.5193300247192383,
264
+ "learning_rate": 2.297037037037037e-05,
265
+ "loss": 0.6652,
266
+ "step": 1400
267
+ },
268
+ {
269
+ "epoch": 6.444444444444445,
270
+ "grad_norm": 5.703717231750488,
271
+ "learning_rate": 2.26e-05,
272
+ "loss": 0.6172,
273
+ "step": 1450
274
+ },
275
+ {
276
+ "epoch": 6.666666666666667,
277
+ "grad_norm": 5.758047103881836,
278
+ "learning_rate": 2.222962962962963e-05,
279
+ "loss": 0.67,
280
+ "step": 1500
281
+ },
282
+ {
283
+ "epoch": 6.888888888888889,
284
+ "grad_norm": 33.762943267822266,
285
+ "learning_rate": 2.185925925925926e-05,
286
+ "loss": 0.5843,
287
+ "step": 1550
288
+ },
289
+ {
290
+ "epoch": 7.0,
291
+ "eval_accuracy": 0.76,
292
+ "eval_f1_macro": 0.7385472305472305,
293
+ "eval_loss": 0.9817761182785034,
294
+ "eval_runtime": 2.8876,
295
+ "eval_samples_per_second": 69.261,
296
+ "eval_steps_per_second": 8.658,
297
+ "step": 1575
298
+ },
299
+ {
300
+ "epoch": 7.111111111111111,
301
+ "grad_norm": 13.375953674316406,
302
+ "learning_rate": 2.148888888888889e-05,
303
+ "loss": 0.5566,
304
+ "step": 1600
305
+ },
306
+ {
307
+ "epoch": 7.333333333333333,
308
+ "grad_norm": 10.450238227844238,
309
+ "learning_rate": 2.1118518518518517e-05,
310
+ "loss": 0.4772,
311
+ "step": 1650
312
+ },
313
+ {
314
+ "epoch": 7.555555555555555,
315
+ "grad_norm": 9.287495613098145,
316
+ "learning_rate": 2.074814814814815e-05,
317
+ "loss": 0.4799,
318
+ "step": 1700
319
+ },
320
+ {
321
+ "epoch": 7.777777777777778,
322
+ "grad_norm": 8.345856666564941,
323
+ "learning_rate": 2.037777777777778e-05,
324
+ "loss": 0.467,
325
+ "step": 1750
326
+ },
327
+ {
328
+ "epoch": 8.0,
329
+ "grad_norm": 7.566125869750977,
330
+ "learning_rate": 2.0007407407407408e-05,
331
+ "loss": 0.4295,
332
+ "step": 1800
333
+ },
334
+ {
335
+ "epoch": 8.0,
336
+ "eval_accuracy": 0.8,
337
+ "eval_f1_macro": 0.7957647907647908,
338
+ "eval_loss": 0.8252587914466858,
339
+ "eval_runtime": 2.8766,
340
+ "eval_samples_per_second": 69.525,
341
+ "eval_steps_per_second": 8.691,
342
+ "step": 1800
343
+ },
344
+ {
345
+ "epoch": 8.222222222222221,
346
+ "grad_norm": 15.532505989074707,
347
+ "learning_rate": 1.9637037037037037e-05,
348
+ "loss": 0.3501,
349
+ "step": 1850
350
+ },
351
+ {
352
+ "epoch": 8.444444444444445,
353
+ "grad_norm": 6.413458824157715,
354
+ "learning_rate": 1.926666666666667e-05,
355
+ "loss": 0.3558,
356
+ "step": 1900
357
+ },
358
+ {
359
+ "epoch": 8.666666666666666,
360
+ "grad_norm": 4.636125564575195,
361
+ "learning_rate": 1.8896296296296295e-05,
362
+ "loss": 0.3139,
363
+ "step": 1950
364
+ },
365
+ {
366
+ "epoch": 8.88888888888889,
367
+ "grad_norm": 0.6558843851089478,
368
+ "learning_rate": 1.8525925925925928e-05,
369
+ "loss": 0.3041,
370
+ "step": 2000
371
+ },
372
+ {
373
+ "epoch": 9.0,
374
+ "eval_accuracy": 0.8,
375
+ "eval_f1_macro": 0.7926046176046176,
376
+ "eval_loss": 0.8176364302635193,
377
+ "eval_runtime": 2.8845,
378
+ "eval_samples_per_second": 69.336,
379
+ "eval_steps_per_second": 8.667,
380
+ "step": 2025
381
+ },
382
+ {
383
+ "epoch": 9.11111111111111,
384
+ "grad_norm": 18.850481033325195,
385
+ "learning_rate": 1.8155555555555554e-05,
386
+ "loss": 0.3166,
387
+ "step": 2050
388
+ },
389
+ {
390
+ "epoch": 9.333333333333334,
391
+ "grad_norm": 22.011247634887695,
392
+ "learning_rate": 1.7785185185185186e-05,
393
+ "loss": 0.2472,
394
+ "step": 2100
395
+ },
396
+ {
397
+ "epoch": 9.555555555555555,
398
+ "grad_norm": 0.4794563353061676,
399
+ "learning_rate": 1.7414814814814815e-05,
400
+ "loss": 0.2419,
401
+ "step": 2150
402
+ },
403
+ {
404
+ "epoch": 9.777777777777779,
405
+ "grad_norm": 1.2854745388031006,
406
+ "learning_rate": 1.7044444444444445e-05,
407
+ "loss": 0.1906,
408
+ "step": 2200
409
+ },
410
+ {
411
+ "epoch": 10.0,
412
+ "grad_norm": 18.808258056640625,
413
+ "learning_rate": 1.6674074074074074e-05,
414
+ "loss": 0.2178,
415
+ "step": 2250
416
+ },
417
+ {
418
+ "epoch": 10.0,
419
+ "eval_accuracy": 0.795,
420
+ "eval_f1_macro": 0.7861341991341991,
421
+ "eval_loss": 0.845029890537262,
422
+ "eval_runtime": 2.8982,
423
+ "eval_samples_per_second": 69.008,
424
+ "eval_steps_per_second": 8.626,
425
+ "step": 2250
426
+ },
427
+ {
428
+ "epoch": 10.222222222222221,
429
+ "grad_norm": 0.6055029630661011,
430
+ "learning_rate": 1.6303703703703706e-05,
431
+ "loss": 0.2369,
432
+ "step": 2300
433
+ },
434
+ {
435
+ "epoch": 10.444444444444445,
436
+ "grad_norm": 0.2815465033054352,
437
+ "learning_rate": 1.5933333333333332e-05,
438
+ "loss": 0.1514,
439
+ "step": 2350
440
+ },
441
+ {
442
+ "epoch": 10.666666666666666,
443
+ "grad_norm": 0.5796032547950745,
444
+ "learning_rate": 1.5562962962962965e-05,
445
+ "loss": 0.1704,
446
+ "step": 2400
447
+ },
448
+ {
449
+ "epoch": 10.88888888888889,
450
+ "grad_norm": 15.787017822265625,
451
+ "learning_rate": 1.5192592592592592e-05,
452
+ "loss": 0.1874,
453
+ "step": 2450
454
+ },
455
+ {
456
+ "epoch": 11.0,
457
+ "eval_accuracy": 0.81,
458
+ "eval_f1_macro": 0.8044646464646463,
459
+ "eval_loss": 0.7450368404388428,
460
+ "eval_runtime": 2.8672,
461
+ "eval_samples_per_second": 69.755,
462
+ "eval_steps_per_second": 8.719,
463
+ "step": 2475
464
+ },
465
+ {
466
+ "epoch": 11.11111111111111,
467
+ "grad_norm": 5.2899017333984375,
468
+ "learning_rate": 1.4822222222222221e-05,
469
+ "loss": 0.1407,
470
+ "step": 2500
471
+ },
472
+ {
473
+ "epoch": 11.333333333333334,
474
+ "grad_norm": 1.0392882823944092,
475
+ "learning_rate": 1.4451851851851852e-05,
476
+ "loss": 0.1423,
477
+ "step": 2550
478
+ },
479
+ {
480
+ "epoch": 11.555555555555555,
481
+ "grad_norm": 0.22039936482906342,
482
+ "learning_rate": 1.4081481481481481e-05,
483
+ "loss": 0.1214,
484
+ "step": 2600
485
+ },
486
+ {
487
+ "epoch": 11.777777777777779,
488
+ "grad_norm": 1.2269922494888306,
489
+ "learning_rate": 1.371111111111111e-05,
490
+ "loss": 0.1706,
491
+ "step": 2650
492
+ },
493
+ {
494
+ "epoch": 12.0,
495
+ "grad_norm": 0.16958917677402496,
496
+ "learning_rate": 1.3340740740740741e-05,
497
+ "loss": 0.1225,
498
+ "step": 2700
499
+ },
500
+ {
501
+ "epoch": 12.0,
502
+ "eval_accuracy": 0.845,
503
+ "eval_f1_macro": 0.8408946608946608,
504
+ "eval_loss": 0.7663388848304749,
505
+ "eval_runtime": 2.8819,
506
+ "eval_samples_per_second": 69.398,
507
+ "eval_steps_per_second": 8.675,
508
+ "step": 2700
509
+ },
510
+ {
511
+ "epoch": 12.222222222222221,
512
+ "grad_norm": 16.176551818847656,
513
+ "learning_rate": 1.297037037037037e-05,
514
+ "loss": 0.1082,
515
+ "step": 2750
516
+ },
517
+ {
518
+ "epoch": 12.444444444444445,
519
+ "grad_norm": 0.7906608581542969,
520
+ "learning_rate": 1.26e-05,
521
+ "loss": 0.0837,
522
+ "step": 2800
523
+ },
524
+ {
525
+ "epoch": 12.666666666666666,
526
+ "grad_norm": 0.1822936087846756,
527
+ "learning_rate": 1.2229629629629629e-05,
528
+ "loss": 0.0779,
529
+ "step": 2850
530
+ },
531
+ {
532
+ "epoch": 12.88888888888889,
533
+ "grad_norm": 0.1906086951494217,
534
+ "learning_rate": 1.185925925925926e-05,
535
+ "loss": 0.0818,
536
+ "step": 2900
537
+ },
538
+ {
539
+ "epoch": 13.0,
540
+ "eval_accuracy": 0.855,
541
+ "eval_f1_macro": 0.8530952380952379,
542
+ "eval_loss": 0.7127139568328857,
543
+ "eval_runtime": 2.8693,
544
+ "eval_samples_per_second": 69.703,
545
+ "eval_steps_per_second": 8.713,
546
+ "step": 2925
547
+ },
548
+ {
549
+ "epoch": 13.11111111111111,
550
+ "grad_norm": 0.28385430574417114,
551
+ "learning_rate": 1.1488888888888889e-05,
552
+ "loss": 0.0873,
553
+ "step": 2950
554
+ },
555
+ {
556
+ "epoch": 13.333333333333334,
557
+ "grad_norm": 0.17537418007850647,
558
+ "learning_rate": 1.1118518518518518e-05,
559
+ "loss": 0.0536,
560
+ "step": 3000
561
+ },
562
+ {
563
+ "epoch": 13.555555555555555,
564
+ "grad_norm": 0.2917730212211609,
565
+ "learning_rate": 1.0748148148148149e-05,
566
+ "loss": 0.0555,
567
+ "step": 3050
568
+ },
569
+ {
570
+ "epoch": 13.777777777777779,
571
+ "grad_norm": 0.14240045845508575,
572
+ "learning_rate": 1.0377777777777778e-05,
573
+ "loss": 0.0579,
574
+ "step": 3100
575
+ },
576
+ {
577
+ "epoch": 14.0,
578
+ "grad_norm": 0.5113584399223328,
579
+ "learning_rate": 1.0007407407407407e-05,
580
+ "loss": 0.0874,
581
+ "step": 3150
582
+ },
583
+ {
584
+ "epoch": 14.0,
585
+ "eval_accuracy": 0.84,
586
+ "eval_f1_macro": 0.8395616605616606,
587
+ "eval_loss": 0.7242352366447449,
588
+ "eval_runtime": 2.8999,
589
+ "eval_samples_per_second": 68.968,
590
+ "eval_steps_per_second": 8.621,
591
+ "step": 3150
592
+ },
593
+ {
594
+ "epoch": 14.222222222222221,
595
+ "grad_norm": 0.14713504910469055,
596
+ "learning_rate": 9.637037037037036e-06,
597
+ "loss": 0.035,
598
+ "step": 3200
599
+ },
600
+ {
601
+ "epoch": 14.444444444444445,
602
+ "grad_norm": 0.18145354092121124,
603
+ "learning_rate": 9.266666666666667e-06,
604
+ "loss": 0.0658,
605
+ "step": 3250
606
+ },
607
+ {
608
+ "epoch": 14.666666666666666,
609
+ "grad_norm": 0.14780841767787933,
610
+ "learning_rate": 8.896296296296296e-06,
611
+ "loss": 0.0424,
612
+ "step": 3300
613
+ },
614
+ {
615
+ "epoch": 14.88888888888889,
616
+ "grad_norm": 0.2982366383075714,
617
+ "learning_rate": 8.525925925925925e-06,
618
+ "loss": 0.0469,
619
+ "step": 3350
620
+ },
621
+ {
622
+ "epoch": 15.0,
623
+ "eval_accuracy": 0.855,
624
+ "eval_f1_macro": 0.8562178932178931,
625
+ "eval_loss": 0.6220372915267944,
626
+ "eval_runtime": 2.8763,
627
+ "eval_samples_per_second": 69.534,
628
+ "eval_steps_per_second": 8.692,
629
+ "step": 3375
630
+ },
631
+ {
632
+ "epoch": 15.11111111111111,
633
+ "grad_norm": 0.11205285787582397,
634
+ "learning_rate": 8.155555555555556e-06,
635
+ "loss": 0.0355,
636
+ "step": 3400
637
+ },
638
+ {
639
+ "epoch": 15.333333333333334,
640
+ "grad_norm": 0.11701209098100662,
641
+ "learning_rate": 7.785185185185185e-06,
642
+ "loss": 0.0311,
643
+ "step": 3450
644
+ },
645
+ {
646
+ "epoch": 15.555555555555555,
647
+ "grad_norm": 0.15519364178180695,
648
+ "learning_rate": 7.414814814814815e-06,
649
+ "loss": 0.0252,
650
+ "step": 3500
651
+ },
652
+ {
653
+ "epoch": 15.777777777777779,
654
+ "grad_norm": 0.2080957293510437,
655
+ "learning_rate": 7.044444444444445e-06,
656
+ "loss": 0.0355,
657
+ "step": 3550
658
+ },
659
+ {
660
+ "epoch": 16.0,
661
+ "grad_norm": 0.16233320534229279,
662
+ "learning_rate": 6.674074074074074e-06,
663
+ "loss": 0.0531,
664
+ "step": 3600
665
+ },
666
+ {
667
+ "epoch": 16.0,
668
+ "eval_accuracy": 0.875,
669
+ "eval_f1_macro": 0.8743174603174602,
670
+ "eval_loss": 0.5916269421577454,
671
+ "eval_runtime": 2.9356,
672
+ "eval_samples_per_second": 68.13,
673
+ "eval_steps_per_second": 8.516,
674
+ "step": 3600
675
+ },
676
+ {
677
+ "epoch": 16.22222222222222,
678
+ "grad_norm": 0.15035021305084229,
679
+ "learning_rate": 6.303703703703704e-06,
680
+ "loss": 0.0239,
681
+ "step": 3650
682
+ },
683
+ {
684
+ "epoch": 16.444444444444443,
685
+ "grad_norm": 0.12058259546756744,
686
+ "learning_rate": 5.933333333333333e-06,
687
+ "loss": 0.0233,
688
+ "step": 3700
689
+ },
690
+ {
691
+ "epoch": 16.666666666666668,
692
+ "grad_norm": 0.11992493271827698,
693
+ "learning_rate": 5.562962962962963e-06,
694
+ "loss": 0.0254,
695
+ "step": 3750
696
+ },
697
+ {
698
+ "epoch": 16.88888888888889,
699
+ "grad_norm": 0.1040111556649208,
700
+ "learning_rate": 5.192592592592593e-06,
701
+ "loss": 0.0351,
702
+ "step": 3800
703
+ },
704
+ {
705
+ "epoch": 17.0,
706
+ "eval_accuracy": 0.85,
707
+ "eval_f1_macro": 0.8484761904761904,
708
+ "eval_loss": 0.673793613910675,
709
+ "eval_runtime": 2.9183,
710
+ "eval_samples_per_second": 68.534,
711
+ "eval_steps_per_second": 8.567,
712
+ "step": 3825
713
+ },
714
+ {
715
+ "epoch": 17.11111111111111,
716
+ "grad_norm": 0.14537616074085236,
717
+ "learning_rate": 4.822222222222222e-06,
718
+ "loss": 0.0222,
719
+ "step": 3850
720
+ },
721
+ {
722
+ "epoch": 17.333333333333332,
723
+ "grad_norm": 0.14161454141139984,
724
+ "learning_rate": 4.451851851851852e-06,
725
+ "loss": 0.0209,
726
+ "step": 3900
727
+ },
728
+ {
729
+ "epoch": 17.555555555555557,
730
+ "grad_norm": 0.0898861438035965,
731
+ "learning_rate": 4.081481481481481e-06,
732
+ "loss": 0.0206,
733
+ "step": 3950
734
+ },
735
+ {
736
+ "epoch": 17.77777777777778,
737
+ "grad_norm": 0.10038736462593079,
738
+ "learning_rate": 3.7111111111111113e-06,
739
+ "loss": 0.0319,
740
+ "step": 4000
741
+ },
742
+ {
743
+ "epoch": 18.0,
744
+ "grad_norm": 0.1036420613527298,
745
+ "learning_rate": 3.340740740740741e-06,
746
+ "loss": 0.0205,
747
+ "step": 4050
748
+ },
749
+ {
750
+ "epoch": 18.0,
751
+ "eval_accuracy": 0.865,
752
+ "eval_f1_macro": 0.8666349206349205,
753
+ "eval_loss": 0.665629506111145,
754
+ "eval_runtime": 2.8759,
755
+ "eval_samples_per_second": 69.544,
756
+ "eval_steps_per_second": 8.693,
757
+ "step": 4050
758
+ },
759
+ {
760
+ "epoch": 18.22222222222222,
761
+ "grad_norm": 0.11664649099111557,
762
+ "learning_rate": 2.9703703703703705e-06,
763
+ "loss": 0.0202,
764
+ "step": 4100
765
+ },
766
+ {
767
+ "epoch": 18.444444444444443,
768
+ "grad_norm": 0.10043739527463913,
769
+ "learning_rate": 2.6e-06,
770
+ "loss": 0.0198,
771
+ "step": 4150
772
+ },
773
+ {
774
+ "epoch": 18.666666666666668,
775
+ "grad_norm": 0.07799684256315231,
776
+ "learning_rate": 2.2296296296296297e-06,
777
+ "loss": 0.0195,
778
+ "step": 4200
779
+ },
780
+ {
781
+ "epoch": 18.88888888888889,
782
+ "grad_norm": 0.08117303997278214,
783
+ "learning_rate": 1.8592592592592593e-06,
784
+ "loss": 0.0207,
785
+ "step": 4250
786
+ },
787
+ {
788
+ "epoch": 19.0,
789
+ "eval_accuracy": 0.875,
790
+ "eval_f1_macro": 0.8751587301587301,
791
+ "eval_loss": 0.655569851398468,
792
+ "eval_runtime": 2.9199,
793
+ "eval_samples_per_second": 68.496,
794
+ "eval_steps_per_second": 8.562,
795
+ "step": 4275
796
+ },
797
+ {
798
+ "epoch": 19.11111111111111,
799
+ "grad_norm": 0.09353518486022949,
800
+ "learning_rate": 1.4888888888888888e-06,
801
+ "loss": 0.0243,
802
+ "step": 4300
803
+ },
804
+ {
805
+ "epoch": 19.333333333333332,
806
+ "grad_norm": 0.12512549757957458,
807
+ "learning_rate": 1.1185185185185184e-06,
808
+ "loss": 0.019,
809
+ "step": 4350
810
+ },
811
+ {
812
+ "epoch": 19.555555555555557,
813
+ "grad_norm": 0.09048581123352051,
814
+ "learning_rate": 7.481481481481481e-07,
815
+ "loss": 0.0219,
816
+ "step": 4400
817
+ },
818
+ {
819
+ "epoch": 19.77777777777778,
820
+ "grad_norm": 0.10329825431108475,
821
+ "learning_rate": 3.777777777777778e-07,
822
+ "loss": 0.0194,
823
+ "step": 4450
824
+ },
825
+ {
826
+ "epoch": 20.0,
827
+ "grad_norm": 0.09893308579921722,
828
+ "learning_rate": 7.407407407407408e-09,
829
+ "loss": 0.0194,
830
+ "step": 4500
831
+ },
832
+ {
833
+ "epoch": 20.0,
834
+ "eval_accuracy": 0.875,
835
+ "eval_f1_macro": 0.8751587301587301,
836
+ "eval_loss": 0.6624078154563904,
837
+ "eval_runtime": 2.9156,
838
+ "eval_samples_per_second": 68.595,
839
+ "eval_steps_per_second": 8.574,
840
+ "step": 4500
841
+ },
842
+ {
843
+ "epoch": 20.0,
844
+ "step": 4500,
845
+ "total_flos": 1.64162630016e+18,
846
+ "train_loss": 0.7825442723168267,
847
+ "train_runtime": 840.5538,
848
+ "train_samples_per_second": 42.829,
849
+ "train_steps_per_second": 5.354
850
+ }
851
+ ],
852
+ "logging_steps": 50,
853
+ "max_steps": 4500,
854
+ "num_input_tokens_seen": 0,
855
+ "num_train_epochs": 20,
856
+ "save_steps": 500,
857
+ "stateful_callbacks": {
858
+ "TrainerControl": {
859
+ "args": {
860
+ "should_epoch_stop": false,
861
+ "should_evaluate": false,
862
+ "should_log": false,
863
+ "should_save": true,
864
+ "should_training_stop": true
865
+ },
866
+ "attributes": {}
867
+ }
868
+ },
869
+ "total_flos": 1.64162630016e+18,
870
+ "train_batch_size": 8,
871
+ "trial_name": null,
872
+ "trial_params": null
873
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5351cc4a35022a422ddf1e4dfcb892c5e29a158211003340e7ab9c65d55c2c4
3
+ size 5841