alkiskoudounas commited on
Commit
aece8fe
·
verified ·
1 Parent(s): 98e2b74

Upload with huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +298 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +917 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "facebook/wav2vec2-base",
3
+ "activation_dropout": 0.0,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token_id": 1,
14
+ "classifier_proj_size": 256,
15
+ "codevector_dim": 256,
16
+ "contrastive_logits_temperature": 0.1,
17
+ "conv_bias": false,
18
+ "conv_dim": [
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512
26
+ ],
27
+ "conv_kernel": [
28
+ 10,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 2,
34
+ 2
35
+ ],
36
+ "conv_stride": [
37
+ 5,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2
44
+ ],
45
+ "ctc_loss_reduction": "sum",
46
+ "ctc_zero_infinity": false,
47
+ "diversity_loss_weight": 0.1,
48
+ "do_stable_layer_norm": false,
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_norm": "group",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "freeze_feat_extract_train": true,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 768,
59
+ "id2label": {
60
+ "0": "calendar_set",
61
+ "1": "wavs/audio_volume_mute",
62
+ "10": "iot_coffee",
63
+ "11": "calendar_remove",
64
+ "12": "email_sendemail",
65
+ "13": "general_joke",
66
+ "14": "lists_query",
67
+ "15": "iot_cleaning",
68
+ "16": "social_query",
69
+ "17": "cooking_recipe",
70
+ "18": "play_game",
71
+ "19": "weather_query",
72
+ "2": "qa_stock",
73
+ "20": "iot_hue_lightoff",
74
+ "21": "qa_factoid",
75
+ "22": "play_wavs/audiobook",
76
+ "23": "news_query",
77
+ "24": "qa_maths",
78
+ "25": "email_query",
79
+ "26": "recommendation_movies",
80
+ "27": "transport_traffic",
81
+ "28": "takeaway_order",
82
+ "29": "wavs/audio_volume_other",
83
+ "3": "social_post",
84
+ "30": "datetime_convert",
85
+ "31": "music",
86
+ "32": "recommendation_locations",
87
+ "33": "recommendation_events",
88
+ "34": "qa_currency",
89
+ "35": "lists_createoradd",
90
+ "36": "datetime_query",
91
+ "37": "transport_ticket",
92
+ "38": "takeaway_query",
93
+ "39": "general_greet",
94
+ "4": "play_radio",
95
+ "40": "qa_definition",
96
+ "41": "play_podcasts",
97
+ "42": "transport_taxi",
98
+ "43": "alarm_remove",
99
+ "44": "iot_hue_lightchange",
100
+ "45": "email_querycontact",
101
+ "46": "iot_hue_lightdim",
102
+ "47": "alarm_set",
103
+ "48": "iot_hue_lightup",
104
+ "49": "transport_query",
105
+ "5": "calendar_query",
106
+ "50": "iot_wemo_on",
107
+ "51": "music_likeness",
108
+ "52": "alarm_query",
109
+ "53": "music_dislikeness",
110
+ "54": "lists_remove",
111
+ "55": "iot_hue_lighton",
112
+ "56": "wavs/audio_volume_down",
113
+ "57": "factoid",
114
+ "58": "iot_wemo_off",
115
+ "59": "query",
116
+ "6": "music_query",
117
+ "60": "set",
118
+ "61": "quirky",
119
+ "62": "email_addcontact",
120
+ "63": "music_settings",
121
+ "64": "joke",
122
+ "65": "podcasts",
123
+ "66": "game",
124
+ "67": "coffee",
125
+ "68": "radio",
126
+ "69": "post",
127
+ "7": "general_quirky",
128
+ "70": "convert",
129
+ "71": "remove",
130
+ "72": "greet",
131
+ "73": "cooking_query",
132
+ "74": "sendemail",
133
+ "75": "traffic",
134
+ "76": "hue_lightup",
135
+ "77": "hue_lightoff",
136
+ "78": "currency",
137
+ "79": "wemo_off",
138
+ "8": "play_music",
139
+ "80": "hue_lightdim",
140
+ "81": "createoradd",
141
+ "82": "ticket",
142
+ "83": "volume_other",
143
+ "84": "cleaning",
144
+ "85": "querycontact",
145
+ "86": "wemo_on",
146
+ "87": "addcontact",
147
+ "9": "wavs/audio_volume_up"
148
+ },
149
+ "initializer_range": 0.02,
150
+ "intermediate_size": 3072,
151
+ "label2id": {
152
+ "addcontact": "87",
153
+ "alarm_query": "52",
154
+ "alarm_remove": "43",
155
+ "alarm_set": "47",
156
+ "calendar_query": "5",
157
+ "calendar_remove": "11",
158
+ "calendar_set": "0",
159
+ "cleaning": "84",
160
+ "coffee": "67",
161
+ "convert": "70",
162
+ "cooking_query": "73",
163
+ "cooking_recipe": "17",
164
+ "createoradd": "81",
165
+ "currency": "78",
166
+ "datetime_convert": "30",
167
+ "datetime_query": "36",
168
+ "email_addcontact": "62",
169
+ "email_query": "25",
170
+ "email_querycontact": "45",
171
+ "email_sendemail": "12",
172
+ "factoid": "57",
173
+ "game": "66",
174
+ "general_greet": "39",
175
+ "general_joke": "13",
176
+ "general_quirky": "7",
177
+ "greet": "72",
178
+ "hue_lightdim": "80",
179
+ "hue_lightoff": "77",
180
+ "hue_lightup": "76",
181
+ "iot_cleaning": "15",
182
+ "iot_coffee": "10",
183
+ "iot_hue_lightchange": "44",
184
+ "iot_hue_lightdim": "46",
185
+ "iot_hue_lightoff": "20",
186
+ "iot_hue_lighton": "55",
187
+ "iot_hue_lightup": "48",
188
+ "iot_wemo_off": "58",
189
+ "iot_wemo_on": "50",
190
+ "joke": "64",
191
+ "lists_createoradd": "35",
192
+ "lists_query": "14",
193
+ "lists_remove": "54",
194
+ "music": "31",
195
+ "music_dislikeness": "53",
196
+ "music_likeness": "51",
197
+ "music_query": "6",
198
+ "music_settings": "63",
199
+ "news_query": "23",
200
+ "play_game": "18",
201
+ "play_music": "8",
202
+ "play_podcasts": "41",
203
+ "play_radio": "4",
204
+ "play_wavs/audiobook": "22",
205
+ "podcasts": "65",
206
+ "post": "69",
207
+ "qa_currency": "34",
208
+ "qa_definition": "40",
209
+ "qa_factoid": "21",
210
+ "qa_maths": "24",
211
+ "qa_stock": "2",
212
+ "query": "59",
213
+ "querycontact": "85",
214
+ "quirky": "61",
215
+ "radio": "68",
216
+ "recommendation_events": "33",
217
+ "recommendation_locations": "32",
218
+ "recommendation_movies": "26",
219
+ "remove": "71",
220
+ "sendemail": "74",
221
+ "set": "60",
222
+ "social_post": "3",
223
+ "social_query": "16",
224
+ "takeaway_order": "28",
225
+ "takeaway_query": "38",
226
+ "ticket": "82",
227
+ "traffic": "75",
228
+ "transport_query": "49",
229
+ "transport_taxi": "42",
230
+ "transport_ticket": "37",
231
+ "transport_traffic": "27",
232
+ "volume_other": "83",
233
+ "wavs/audio_volume_down": "56",
234
+ "wavs/audio_volume_mute": "1",
235
+ "wavs/audio_volume_other": "29",
236
+ "wavs/audio_volume_up": "9",
237
+ "weather_query": "19",
238
+ "wemo_off": "79",
239
+ "wemo_on": "86"
240
+ },
241
+ "layer_norm_eps": 1e-05,
242
+ "layerdrop": 0.0,
243
+ "mask_channel_length": 10,
244
+ "mask_channel_min_space": 1,
245
+ "mask_channel_other": 0.0,
246
+ "mask_channel_prob": 0.0,
247
+ "mask_channel_selection": "static",
248
+ "mask_feature_length": 10,
249
+ "mask_feature_min_masks": 0,
250
+ "mask_feature_prob": 0.0,
251
+ "mask_time_length": 10,
252
+ "mask_time_min_masks": 2,
253
+ "mask_time_min_space": 1,
254
+ "mask_time_other": 0.0,
255
+ "mask_time_prob": 0.05,
256
+ "mask_time_selection": "static",
257
+ "model_type": "wav2vec2",
258
+ "no_mask_channel_overlap": false,
259
+ "no_mask_time_overlap": false,
260
+ "num_adapter_layers": 3,
261
+ "num_attention_heads": 12,
262
+ "num_codevector_groups": 2,
263
+ "num_codevectors_per_group": 320,
264
+ "num_conv_pos_embedding_groups": 16,
265
+ "num_conv_pos_embeddings": 128,
266
+ "num_feat_extract_layers": 7,
267
+ "num_hidden_layers": 12,
268
+ "num_negatives": 100,
269
+ "output_hidden_size": 768,
270
+ "pad_token_id": 0,
271
+ "proj_codevector_dim": 256,
272
+ "tdnn_dilation": [
273
+ 1,
274
+ 2,
275
+ 3,
276
+ 1,
277
+ 1
278
+ ],
279
+ "tdnn_dim": [
280
+ 512,
281
+ 512,
282
+ 512,
283
+ 512,
284
+ 1500
285
+ ],
286
+ "tdnn_kernel": [
287
+ 5,
288
+ 3,
289
+ 3,
290
+ 1,
291
+ 1
292
+ ],
293
+ "torch_dtype": "float32",
294
+ "transformers_version": "4.45.2",
295
+ "use_weighted_layer_sum": false,
296
+ "vocab_size": 32,
297
+ "xvector_output_dim": 512
298
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb595d9bffebf49d469d7ba67f390dd4a3bbbafe03327475cefbfcbfb2fbf9e
3
+ size 378390784
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7378f65dc440ef3323f4064ec7cdf0a00ec197d73ef4cb485cb02f4e166988c
3
+ size 756909370
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d42fbb35be881a336af3b263d82ee7d97c2eca75108898734086b9ca86cfe08
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1d6302a7353dd67847dce608afaaff547c670ef8ef4c52a5a5ac9772ebc0155
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,917 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8215622660344397,
3
+ "best_model_checkpoint": "results_unlearning/facebook/wav2vec2-base/42/checkpoint-26000",
4
+ "epoch": 57.30027548209367,
5
+ "eval_steps": 500,
6
+ "global_step": 26000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.1019283746556474,
13
+ "grad_norm": 3.954470634460449,
14
+ "learning_rate": 8.333333333333333e-05,
15
+ "loss": 3.9037,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 1.1019283746556474,
20
+ "eval_accuracy": 0.1856750686298977,
21
+ "eval_f1_macro": 0.0338842756461234,
22
+ "eval_loss": 3.2653889656066895,
23
+ "eval_runtime": 201.4102,
24
+ "eval_samples_per_second": 39.789,
25
+ "eval_steps_per_second": 1.246,
26
+ "step": 500
27
+ },
28
+ {
29
+ "epoch": 2.203856749311295,
30
+ "grad_norm": 7.994827747344971,
31
+ "learning_rate": 0.00016666666666666666,
32
+ "loss": 2.4697,
33
+ "step": 1000
34
+ },
35
+ {
36
+ "epoch": 2.203856749311295,
37
+ "eval_accuracy": 0.5384327427002745,
38
+ "eval_f1_macro": 0.2623597839382389,
39
+ "eval_loss": 1.8842540979385376,
40
+ "eval_runtime": 47.0501,
41
+ "eval_samples_per_second": 170.329,
42
+ "eval_steps_per_second": 5.335,
43
+ "step": 1000
44
+ },
45
+ {
46
+ "epoch": 3.3057851239669422,
47
+ "grad_norm": 8.498858451843262,
48
+ "learning_rate": 0.00025,
49
+ "loss": 1.6285,
50
+ "step": 1500
51
+ },
52
+ {
53
+ "epoch": 3.3057851239669422,
54
+ "eval_accuracy": 0.6313950586473671,
55
+ "eval_f1_macro": 0.4048605043298036,
56
+ "eval_loss": 1.509529709815979,
57
+ "eval_runtime": 34.1201,
58
+ "eval_samples_per_second": 234.876,
59
+ "eval_steps_per_second": 7.356,
60
+ "step": 1500
61
+ },
62
+ {
63
+ "epoch": 4.40771349862259,
64
+ "grad_norm": 9.255996704101562,
65
+ "learning_rate": 0.0003333333333333333,
66
+ "loss": 1.4076,
67
+ "step": 2000
68
+ },
69
+ {
70
+ "epoch": 4.40771349862259,
71
+ "eval_accuracy": 0.6400049912652858,
72
+ "eval_f1_macro": 0.42658784153796886,
73
+ "eval_loss": 1.5477856397628784,
74
+ "eval_runtime": 34.313,
75
+ "eval_samples_per_second": 233.556,
76
+ "eval_steps_per_second": 7.315,
77
+ "step": 2000
78
+ },
79
+ {
80
+ "epoch": 5.509641873278237,
81
+ "grad_norm": 8.901646614074707,
82
+ "learning_rate": 0.0004166666666666667,
83
+ "loss": 1.3687,
84
+ "step": 2500
85
+ },
86
+ {
87
+ "epoch": 5.509641873278237,
88
+ "eval_accuracy": 0.6643374095333167,
89
+ "eval_f1_macro": 0.4590986266426292,
90
+ "eval_loss": 1.4123471975326538,
91
+ "eval_runtime": 34.1574,
92
+ "eval_samples_per_second": 234.62,
93
+ "eval_steps_per_second": 7.348,
94
+ "step": 2500
95
+ },
96
+ {
97
+ "epoch": 6.6115702479338845,
98
+ "grad_norm": 6.906811714172363,
99
+ "learning_rate": 0.0005,
100
+ "loss": 1.3757,
101
+ "step": 3000
102
+ },
103
+ {
104
+ "epoch": 6.6115702479338845,
105
+ "eval_accuracy": 0.6093087097579236,
106
+ "eval_f1_macro": 0.42633143981000005,
107
+ "eval_loss": 1.642005205154419,
108
+ "eval_runtime": 34.1921,
109
+ "eval_samples_per_second": 234.381,
110
+ "eval_steps_per_second": 7.341,
111
+ "step": 3000
112
+ },
113
+ {
114
+ "epoch": 7.7134986225895315,
115
+ "grad_norm": 9.686969757080078,
116
+ "learning_rate": 0.0004907407407407408,
117
+ "loss": 1.3816,
118
+ "step": 3500
119
+ },
120
+ {
121
+ "epoch": 7.7134986225895315,
122
+ "eval_accuracy": 0.6442475667581732,
123
+ "eval_f1_macro": 0.4625270466241396,
124
+ "eval_loss": 1.520119309425354,
125
+ "eval_runtime": 176.8139,
126
+ "eval_samples_per_second": 45.324,
127
+ "eval_steps_per_second": 1.42,
128
+ "step": 3500
129
+ },
130
+ {
131
+ "epoch": 8.81542699724518,
132
+ "grad_norm": 9.070294380187988,
133
+ "learning_rate": 0.00048148148148148144,
134
+ "loss": 1.2373,
135
+ "step": 4000
136
+ },
137
+ {
138
+ "epoch": 8.81542699724518,
139
+ "eval_accuracy": 0.6640878462690292,
140
+ "eval_f1_macro": 0.4936672291314334,
141
+ "eval_loss": 1.4355494976043701,
142
+ "eval_runtime": 34.0662,
143
+ "eval_samples_per_second": 235.248,
144
+ "eval_steps_per_second": 7.368,
145
+ "step": 4000
146
+ },
147
+ {
148
+ "epoch": 9.917355371900827,
149
+ "grad_norm": 10.048285484313965,
150
+ "learning_rate": 0.00047222222222222224,
151
+ "loss": 1.1039,
152
+ "step": 4500
153
+ },
154
+ {
155
+ "epoch": 9.917355371900827,
156
+ "eval_accuracy": 0.6771899176441227,
157
+ "eval_f1_macro": 0.48714340971176784,
158
+ "eval_loss": 1.3826853036880493,
159
+ "eval_runtime": 33.9169,
160
+ "eval_samples_per_second": 236.284,
161
+ "eval_steps_per_second": 7.4,
162
+ "step": 4500
163
+ },
164
+ {
165
+ "epoch": 11.019283746556473,
166
+ "grad_norm": 6.188383102416992,
167
+ "learning_rate": 0.000462962962962963,
168
+ "loss": 1.037,
169
+ "step": 5000
170
+ },
171
+ {
172
+ "epoch": 11.019283746556473,
173
+ "eval_accuracy": 0.6820564012977289,
174
+ "eval_f1_macro": 0.5233785564818548,
175
+ "eval_loss": 1.4125980138778687,
176
+ "eval_runtime": 33.9515,
177
+ "eval_samples_per_second": 236.042,
178
+ "eval_steps_per_second": 7.393,
179
+ "step": 5000
180
+ },
181
+ {
182
+ "epoch": 12.121212121212121,
183
+ "grad_norm": 5.904200077056885,
184
+ "learning_rate": 0.0004537037037037037,
185
+ "loss": 0.9419,
186
+ "step": 5500
187
+ },
188
+ {
189
+ "epoch": 12.121212121212121,
190
+ "eval_accuracy": 0.7108809583229349,
191
+ "eval_f1_macro": 0.5375488759123677,
192
+ "eval_loss": 1.2807058095932007,
193
+ "eval_runtime": 34.1787,
194
+ "eval_samples_per_second": 234.473,
195
+ "eval_steps_per_second": 7.344,
196
+ "step": 5500
197
+ },
198
+ {
199
+ "epoch": 13.223140495867769,
200
+ "grad_norm": 6.450353145599365,
201
+ "learning_rate": 0.0004444444444444444,
202
+ "loss": 0.855,
203
+ "step": 6000
204
+ },
205
+ {
206
+ "epoch": 13.223140495867769,
207
+ "eval_accuracy": 0.7106313950586474,
208
+ "eval_f1_macro": 0.551305442830446,
209
+ "eval_loss": 1.3112666606903076,
210
+ "eval_runtime": 50.9709,
211
+ "eval_samples_per_second": 157.227,
212
+ "eval_steps_per_second": 4.924,
213
+ "step": 6000
214
+ },
215
+ {
216
+ "epoch": 14.325068870523417,
217
+ "grad_norm": 8.54566764831543,
218
+ "learning_rate": 0.0004351851851851852,
219
+ "loss": 0.7855,
220
+ "step": 6500
221
+ },
222
+ {
223
+ "epoch": 14.325068870523417,
224
+ "eval_accuracy": 0.7096331420014974,
225
+ "eval_f1_macro": 0.5583041388143256,
226
+ "eval_loss": 1.2415180206298828,
227
+ "eval_runtime": 138.5717,
228
+ "eval_samples_per_second": 57.833,
229
+ "eval_steps_per_second": 1.811,
230
+ "step": 6500
231
+ },
232
+ {
233
+ "epoch": 15.426997245179063,
234
+ "grad_norm": 6.042281627655029,
235
+ "learning_rate": 0.00042592592592592595,
236
+ "loss": 0.7192,
237
+ "step": 7000
238
+ },
239
+ {
240
+ "epoch": 15.426997245179063,
241
+ "eval_accuracy": 0.7216121786872972,
242
+ "eval_f1_macro": 0.5653564656146333,
243
+ "eval_loss": 1.2567986249923706,
244
+ "eval_runtime": 34.1278,
245
+ "eval_samples_per_second": 234.823,
246
+ "eval_steps_per_second": 7.355,
247
+ "step": 7000
248
+ },
249
+ {
250
+ "epoch": 16.52892561983471,
251
+ "grad_norm": 7.013518333435059,
252
+ "learning_rate": 0.0004166666666666667,
253
+ "loss": 0.6721,
254
+ "step": 7500
255
+ },
256
+ {
257
+ "epoch": 16.52892561983471,
258
+ "eval_accuracy": 0.719241327676566,
259
+ "eval_f1_macro": 0.5674881280347366,
260
+ "eval_loss": 1.2789088487625122,
261
+ "eval_runtime": 34.173,
262
+ "eval_samples_per_second": 234.512,
263
+ "eval_steps_per_second": 7.345,
264
+ "step": 7500
265
+ },
266
+ {
267
+ "epoch": 17.63085399449036,
268
+ "grad_norm": 5.327983379364014,
269
+ "learning_rate": 0.0004074074074074074,
270
+ "loss": 0.618,
271
+ "step": 8000
272
+ },
273
+ {
274
+ "epoch": 17.63085399449036,
275
+ "eval_accuracy": 0.7237334664337409,
276
+ "eval_f1_macro": 0.5611898965872011,
277
+ "eval_loss": 1.2278856039047241,
278
+ "eval_runtime": 34.0889,
279
+ "eval_samples_per_second": 235.091,
280
+ "eval_steps_per_second": 7.363,
281
+ "step": 8000
282
+ },
283
+ {
284
+ "epoch": 18.732782369146005,
285
+ "grad_norm": 5.324774265289307,
286
+ "learning_rate": 0.0003981481481481481,
287
+ "loss": 0.5673,
288
+ "step": 8500
289
+ },
290
+ {
291
+ "epoch": 18.732782369146005,
292
+ "eval_accuracy": 0.7378337908659846,
293
+ "eval_f1_macro": 0.5708025777794506,
294
+ "eval_loss": 1.2691614627838135,
295
+ "eval_runtime": 34.0618,
296
+ "eval_samples_per_second": 235.278,
297
+ "eval_steps_per_second": 7.369,
298
+ "step": 8500
299
+ },
300
+ {
301
+ "epoch": 19.834710743801654,
302
+ "grad_norm": 7.3289384841918945,
303
+ "learning_rate": 0.0003888888888888889,
304
+ "loss": 0.523,
305
+ "step": 9000
306
+ },
307
+ {
308
+ "epoch": 19.834710743801654,
309
+ "eval_accuracy": 0.736585974544547,
310
+ "eval_f1_macro": 0.583453506873929,
311
+ "eval_loss": 1.2533445358276367,
312
+ "eval_runtime": 34.1698,
313
+ "eval_samples_per_second": 234.534,
314
+ "eval_steps_per_second": 7.346,
315
+ "step": 9000
316
+ },
317
+ {
318
+ "epoch": 20.9366391184573,
319
+ "grad_norm": 4.2654643058776855,
320
+ "learning_rate": 0.00037962962962962966,
321
+ "loss": 0.4761,
322
+ "step": 9500
323
+ },
324
+ {
325
+ "epoch": 20.9366391184573,
326
+ "eval_accuracy": 0.7508110806089343,
327
+ "eval_f1_macro": 0.5780908306225785,
328
+ "eval_loss": 1.2073005437850952,
329
+ "eval_runtime": 34.1144,
330
+ "eval_samples_per_second": 234.915,
331
+ "eval_steps_per_second": 7.358,
332
+ "step": 9500
333
+ },
334
+ {
335
+ "epoch": 22.038567493112946,
336
+ "grad_norm": 7.886093616485596,
337
+ "learning_rate": 0.00037037037037037035,
338
+ "loss": 0.4481,
339
+ "step": 10000
340
+ },
341
+ {
342
+ "epoch": 22.038567493112946,
343
+ "eval_accuracy": 0.7504367357125031,
344
+ "eval_f1_macro": 0.5902163691095943,
345
+ "eval_loss": 1.2306609153747559,
346
+ "eval_runtime": 34.1357,
347
+ "eval_samples_per_second": 234.769,
348
+ "eval_steps_per_second": 7.353,
349
+ "step": 10000
350
+ },
351
+ {
352
+ "epoch": 23.140495867768596,
353
+ "grad_norm": 5.739956855773926,
354
+ "learning_rate": 0.0003611111111111111,
355
+ "loss": 0.4137,
356
+ "step": 10500
357
+ },
358
+ {
359
+ "epoch": 23.140495867768596,
360
+ "eval_accuracy": 0.7481906663339156,
361
+ "eval_f1_macro": 0.6000675997156686,
362
+ "eval_loss": 1.2535877227783203,
363
+ "eval_runtime": 34.1657,
364
+ "eval_samples_per_second": 234.563,
365
+ "eval_steps_per_second": 7.347,
366
+ "step": 10500
367
+ },
368
+ {
369
+ "epoch": 24.242424242424242,
370
+ "grad_norm": 6.5599589347839355,
371
+ "learning_rate": 0.0003518518518518519,
372
+ "loss": 0.3771,
373
+ "step": 11000
374
+ },
375
+ {
376
+ "epoch": 24.242424242424242,
377
+ "eval_accuracy": 0.7523084601946594,
378
+ "eval_f1_macro": 0.6003714574908554,
379
+ "eval_loss": 1.1841113567352295,
380
+ "eval_runtime": 33.9543,
381
+ "eval_samples_per_second": 236.023,
382
+ "eval_steps_per_second": 7.392,
383
+ "step": 11000
384
+ },
385
+ {
386
+ "epoch": 25.34435261707989,
387
+ "grad_norm": 4.988476753234863,
388
+ "learning_rate": 0.00034259259259259263,
389
+ "loss": 0.3456,
390
+ "step": 11500
391
+ },
392
+ {
393
+ "epoch": 25.34435261707989,
394
+ "eval_accuracy": 0.7511854255053656,
395
+ "eval_f1_macro": 0.613830898261218,
396
+ "eval_loss": 1.2578365802764893,
397
+ "eval_runtime": 34.0485,
398
+ "eval_samples_per_second": 235.37,
399
+ "eval_steps_per_second": 7.372,
400
+ "step": 11500
401
+ },
402
+ {
403
+ "epoch": 26.446280991735538,
404
+ "grad_norm": 10.72720718383789,
405
+ "learning_rate": 0.0003333333333333333,
406
+ "loss": 0.3206,
407
+ "step": 12000
408
+ },
409
+ {
410
+ "epoch": 26.446280991735538,
411
+ "eval_accuracy": 0.7565510356875468,
412
+ "eval_f1_macro": 0.6008526427475953,
413
+ "eval_loss": 1.2595282793045044,
414
+ "eval_runtime": 34.0387,
415
+ "eval_samples_per_second": 235.438,
416
+ "eval_steps_per_second": 7.374,
417
+ "step": 12000
418
+ },
419
+ {
420
+ "epoch": 27.548209366391184,
421
+ "grad_norm": 5.9753098487854,
422
+ "learning_rate": 0.00032407407407407406,
423
+ "loss": 0.3083,
424
+ "step": 12500
425
+ },
426
+ {
427
+ "epoch": 27.548209366391184,
428
+ "eval_accuracy": 0.7525580234589468,
429
+ "eval_f1_macro": 0.6225560622894577,
430
+ "eval_loss": 1.2758995294570923,
431
+ "eval_runtime": 33.9587,
432
+ "eval_samples_per_second": 235.992,
433
+ "eval_steps_per_second": 7.391,
434
+ "step": 12500
435
+ },
436
+ {
437
+ "epoch": 28.650137741046834,
438
+ "grad_norm": 7.985760688781738,
439
+ "learning_rate": 0.0003148148148148148,
440
+ "loss": 0.286,
441
+ "step": 13000
442
+ },
443
+ {
444
+ "epoch": 28.650137741046834,
445
+ "eval_accuracy": 0.7631644621911655,
446
+ "eval_f1_macro": 0.615134929599282,
447
+ "eval_loss": 1.2176945209503174,
448
+ "eval_runtime": 34.0075,
449
+ "eval_samples_per_second": 235.654,
450
+ "eval_steps_per_second": 7.381,
451
+ "step": 13000
452
+ },
453
+ {
454
+ "epoch": 29.75206611570248,
455
+ "grad_norm": 3.7680389881134033,
456
+ "learning_rate": 0.0003055555555555556,
457
+ "loss": 0.2678,
458
+ "step": 13500
459
+ },
460
+ {
461
+ "epoch": 29.75206611570248,
462
+ "eval_accuracy": 0.7686548540054904,
463
+ "eval_f1_macro": 0.6317793794263403,
464
+ "eval_loss": 1.2493242025375366,
465
+ "eval_runtime": 34.052,
466
+ "eval_samples_per_second": 235.346,
467
+ "eval_steps_per_second": 7.371,
468
+ "step": 13500
469
+ },
470
+ {
471
+ "epoch": 30.853994490358126,
472
+ "grad_norm": 2.3487305641174316,
473
+ "learning_rate": 0.0002962962962962963,
474
+ "loss": 0.2488,
475
+ "step": 14000
476
+ },
477
+ {
478
+ "epoch": 30.853994490358126,
479
+ "eval_accuracy": 0.7685300723733467,
480
+ "eval_f1_macro": 0.6379453782996546,
481
+ "eval_loss": 1.2121572494506836,
482
+ "eval_runtime": 33.8764,
483
+ "eval_samples_per_second": 236.566,
484
+ "eval_steps_per_second": 7.409,
485
+ "step": 14000
486
+ },
487
+ {
488
+ "epoch": 31.955922865013775,
489
+ "grad_norm": 5.986231803894043,
490
+ "learning_rate": 0.00028703703703703703,
491
+ "loss": 0.2324,
492
+ "step": 14500
493
+ },
494
+ {
495
+ "epoch": 31.955922865013775,
496
+ "eval_accuracy": 0.7707761417519341,
497
+ "eval_f1_macro": 0.6291965084001815,
498
+ "eval_loss": 1.2486802339553833,
499
+ "eval_runtime": 33.8585,
500
+ "eval_samples_per_second": 236.691,
501
+ "eval_steps_per_second": 7.413,
502
+ "step": 14500
503
+ },
504
+ {
505
+ "epoch": 33.05785123966942,
506
+ "grad_norm": 4.138526439666748,
507
+ "learning_rate": 0.0002777777777777778,
508
+ "loss": 0.212,
509
+ "step": 15000
510
+ },
511
+ {
512
+ "epoch": 33.05785123966942,
513
+ "eval_accuracy": 0.7655353132018967,
514
+ "eval_f1_macro": 0.6182777113261247,
515
+ "eval_loss": 1.2596852779388428,
516
+ "eval_runtime": 33.8607,
517
+ "eval_samples_per_second": 236.675,
518
+ "eval_steps_per_second": 7.413,
519
+ "step": 15000
520
+ },
521
+ {
522
+ "epoch": 34.15977961432507,
523
+ "grad_norm": 4.644781112670898,
524
+ "learning_rate": 0.0002685185185185186,
525
+ "loss": 0.2003,
526
+ "step": 15500
527
+ },
528
+ {
529
+ "epoch": 34.15977961432507,
530
+ "eval_accuracy": 0.7730222111305216,
531
+ "eval_f1_macro": 0.6195180591624613,
532
+ "eval_loss": 1.2190001010894775,
533
+ "eval_runtime": 33.8524,
534
+ "eval_samples_per_second": 236.734,
535
+ "eval_steps_per_second": 7.415,
536
+ "step": 15500
537
+ },
538
+ {
539
+ "epoch": 35.26170798898072,
540
+ "grad_norm": 2.8191895484924316,
541
+ "learning_rate": 0.00025925925925925926,
542
+ "loss": 0.1822,
543
+ "step": 16000
544
+ },
545
+ {
546
+ "epoch": 35.26170798898072,
547
+ "eval_accuracy": 0.7781382580484153,
548
+ "eval_f1_macro": 0.6369508969483741,
549
+ "eval_loss": 1.2363426685333252,
550
+ "eval_runtime": 40.925,
551
+ "eval_samples_per_second": 195.822,
552
+ "eval_steps_per_second": 6.133,
553
+ "step": 16000
554
+ },
555
+ {
556
+ "epoch": 36.36363636363637,
557
+ "grad_norm": 4.859241962432861,
558
+ "learning_rate": 0.00025,
559
+ "loss": 0.1743,
560
+ "step": 16500
561
+ },
562
+ {
563
+ "epoch": 36.36363636363637,
564
+ "eval_accuracy": 0.777763913151984,
565
+ "eval_f1_macro": 0.6468738399385165,
566
+ "eval_loss": 1.2521047592163086,
567
+ "eval_runtime": 40.9791,
568
+ "eval_samples_per_second": 195.563,
569
+ "eval_steps_per_second": 6.125,
570
+ "step": 16500
571
+ },
572
+ {
573
+ "epoch": 37.46556473829201,
574
+ "grad_norm": 3.027918577194214,
575
+ "learning_rate": 0.00024074074074074072,
576
+ "loss": 0.1604,
577
+ "step": 17000
578
+ },
579
+ {
580
+ "epoch": 37.46556473829201,
581
+ "eval_accuracy": 0.7700274519590716,
582
+ "eval_f1_macro": 0.6320426231926091,
583
+ "eval_loss": 1.2722282409667969,
584
+ "eval_runtime": 41.0083,
585
+ "eval_samples_per_second": 195.424,
586
+ "eval_steps_per_second": 6.121,
587
+ "step": 17000
588
+ },
589
+ {
590
+ "epoch": 38.56749311294766,
591
+ "grad_norm": 5.677507400512695,
592
+ "learning_rate": 0.0002314814814814815,
593
+ "loss": 0.1523,
594
+ "step": 17500
595
+ },
596
+ {
597
+ "epoch": 38.56749311294766,
598
+ "eval_accuracy": 0.7745195907162465,
599
+ "eval_f1_macro": 0.6419239322758532,
600
+ "eval_loss": 1.2623705863952637,
601
+ "eval_runtime": 40.8771,
602
+ "eval_samples_per_second": 196.051,
603
+ "eval_steps_per_second": 6.14,
604
+ "step": 17500
605
+ },
606
+ {
607
+ "epoch": 39.66942148760331,
608
+ "grad_norm": 5.041018009185791,
609
+ "learning_rate": 0.0002222222222222222,
610
+ "loss": 0.1384,
611
+ "step": 18000
612
+ },
613
+ {
614
+ "epoch": 39.66942148760331,
615
+ "eval_accuracy": 0.7866234090341901,
616
+ "eval_f1_macro": 0.6659870204162003,
617
+ "eval_loss": 1.189522624015808,
618
+ "eval_runtime": 40.9611,
619
+ "eval_samples_per_second": 195.649,
620
+ "eval_steps_per_second": 6.128,
621
+ "step": 18000
622
+ },
623
+ {
624
+ "epoch": 40.77134986225895,
625
+ "grad_norm": 3.659827947616577,
626
+ "learning_rate": 0.00021296296296296298,
627
+ "loss": 0.135,
628
+ "step": 18500
629
+ },
630
+ {
631
+ "epoch": 40.77134986225895,
632
+ "eval_accuracy": 0.7850012478163214,
633
+ "eval_f1_macro": 0.6720105896795865,
634
+ "eval_loss": 1.1996605396270752,
635
+ "eval_runtime": 40.9502,
636
+ "eval_samples_per_second": 195.701,
637
+ "eval_steps_per_second": 6.129,
638
+ "step": 18500
639
+ },
640
+ {
641
+ "epoch": 41.8732782369146,
642
+ "grad_norm": 5.367224216461182,
643
+ "learning_rate": 0.0002037037037037037,
644
+ "loss": 0.1239,
645
+ "step": 19000
646
+ },
647
+ {
648
+ "epoch": 41.8732782369146,
649
+ "eval_accuracy": 0.7908659845270776,
650
+ "eval_f1_macro": 0.6804665897748318,
651
+ "eval_loss": 1.2242608070373535,
652
+ "eval_runtime": 40.9731,
653
+ "eval_samples_per_second": 195.592,
654
+ "eval_steps_per_second": 6.126,
655
+ "step": 19000
656
+ },
657
+ {
658
+ "epoch": 42.97520661157025,
659
+ "grad_norm": 3.64471173286438,
660
+ "learning_rate": 0.00019444444444444446,
661
+ "loss": 0.1131,
662
+ "step": 19500
663
+ },
664
+ {
665
+ "epoch": 42.97520661157025,
666
+ "eval_accuracy": 0.7947342151235338,
667
+ "eval_f1_macro": 0.653675258584708,
668
+ "eval_loss": 1.1854939460754395,
669
+ "eval_runtime": 40.6521,
670
+ "eval_samples_per_second": 197.136,
671
+ "eval_steps_per_second": 6.174,
672
+ "step": 19500
673
+ },
674
+ {
675
+ "epoch": 44.07713498622589,
676
+ "grad_norm": 7.800271987915039,
677
+ "learning_rate": 0.00018518518518518518,
678
+ "loss": 0.1076,
679
+ "step": 20000
680
+ },
681
+ {
682
+ "epoch": 44.07713498622589,
683
+ "eval_accuracy": 0.7879960069877714,
684
+ "eval_f1_macro": 0.6313626574356537,
685
+ "eval_loss": 1.2797794342041016,
686
+ "eval_runtime": 40.7712,
687
+ "eval_samples_per_second": 196.56,
688
+ "eval_steps_per_second": 6.156,
689
+ "step": 20000
690
+ },
691
+ {
692
+ "epoch": 45.17906336088154,
693
+ "grad_norm": 3.2701990604400635,
694
+ "learning_rate": 0.00017592592592592595,
695
+ "loss": 0.0999,
696
+ "step": 20500
697
+ },
698
+ {
699
+ "epoch": 45.17906336088154,
700
+ "eval_accuracy": 0.7922385824806588,
701
+ "eval_f1_macro": 0.6725445304133565,
702
+ "eval_loss": 1.2082223892211914,
703
+ "eval_runtime": 39.2309,
704
+ "eval_samples_per_second": 204.278,
705
+ "eval_steps_per_second": 6.398,
706
+ "step": 20500
707
+ },
708
+ {
709
+ "epoch": 46.28099173553719,
710
+ "grad_norm": 3.959247350692749,
711
+ "learning_rate": 0.00016666666666666666,
712
+ "loss": 0.0898,
713
+ "step": 21000
714
+ },
715
+ {
716
+ "epoch": 46.28099173553719,
717
+ "eval_accuracy": 0.80059895183429,
718
+ "eval_f1_macro": 0.6765711205444297,
719
+ "eval_loss": 1.1848937273025513,
720
+ "eval_runtime": 40.6064,
721
+ "eval_samples_per_second": 197.358,
722
+ "eval_steps_per_second": 6.181,
723
+ "step": 21000
724
+ },
725
+ {
726
+ "epoch": 47.382920110192835,
727
+ "grad_norm": 3.7939414978027344,
728
+ "learning_rate": 0.0001574074074074074,
729
+ "loss": 0.0861,
730
+ "step": 21500
731
+ },
732
+ {
733
+ "epoch": 47.382920110192835,
734
+ "eval_accuracy": 0.7979785375592713,
735
+ "eval_f1_macro": 0.6842554850331637,
736
+ "eval_loss": 1.177991509437561,
737
+ "eval_runtime": 40.9664,
738
+ "eval_samples_per_second": 195.624,
739
+ "eval_steps_per_second": 6.127,
740
+ "step": 21500
741
+ },
742
+ {
743
+ "epoch": 48.484848484848484,
744
+ "grad_norm": 2.4407143592834473,
745
+ "learning_rate": 0.00014814814814814815,
746
+ "loss": 0.0743,
747
+ "step": 22000
748
+ },
749
+ {
750
+ "epoch": 48.484848484848484,
751
+ "eval_accuracy": 0.8023458946843025,
752
+ "eval_f1_macro": 0.672574066052015,
753
+ "eval_loss": 1.23497474193573,
754
+ "eval_runtime": 40.9537,
755
+ "eval_samples_per_second": 195.684,
756
+ "eval_steps_per_second": 6.129,
757
+ "step": 22000
758
+ },
759
+ {
760
+ "epoch": 49.586776859504134,
761
+ "grad_norm": 1.4111738204956055,
762
+ "learning_rate": 0.0001388888888888889,
763
+ "loss": 0.0725,
764
+ "step": 22500
765
+ },
766
+ {
767
+ "epoch": 49.586776859504134,
768
+ "eval_accuracy": 0.8037184926378838,
769
+ "eval_f1_macro": 0.687093106679298,
770
+ "eval_loss": 1.1898678541183472,
771
+ "eval_runtime": 41.112,
772
+ "eval_samples_per_second": 194.931,
773
+ "eval_steps_per_second": 6.105,
774
+ "step": 22500
775
+ },
776
+ {
777
+ "epoch": 50.68870523415978,
778
+ "grad_norm": 2.446298122406006,
779
+ "learning_rate": 0.00012962962962962963,
780
+ "loss": 0.063,
781
+ "step": 23000
782
+ },
783
+ {
784
+ "epoch": 50.68870523415978,
785
+ "eval_accuracy": 0.7988520089842776,
786
+ "eval_f1_macro": 0.6845006699400662,
787
+ "eval_loss": 1.2227065563201904,
788
+ "eval_runtime": 41.0323,
789
+ "eval_samples_per_second": 195.31,
790
+ "eval_steps_per_second": 6.117,
791
+ "step": 23000
792
+ },
793
+ {
794
+ "epoch": 51.790633608815426,
795
+ "grad_norm": 3.328188419342041,
796
+ "learning_rate": 0.00012037037037037036,
797
+ "loss": 0.061,
798
+ "step": 23500
799
+ },
800
+ {
801
+ "epoch": 51.790633608815426,
802
+ "eval_accuracy": 0.8082106313950587,
803
+ "eval_f1_macro": 0.6785356281203615,
804
+ "eval_loss": 1.1609222888946533,
805
+ "eval_runtime": 42.2613,
806
+ "eval_samples_per_second": 189.63,
807
+ "eval_steps_per_second": 5.939,
808
+ "step": 23500
809
+ },
810
+ {
811
+ "epoch": 52.892561983471076,
812
+ "grad_norm": 2.284496545791626,
813
+ "learning_rate": 0.0001111111111111111,
814
+ "loss": 0.056,
815
+ "step": 24000
816
+ },
817
+ {
818
+ "epoch": 52.892561983471076,
819
+ "eval_accuracy": 0.8119540803593711,
820
+ "eval_f1_macro": 0.6977082191883057,
821
+ "eval_loss": 1.160672664642334,
822
+ "eval_runtime": 40.9648,
823
+ "eval_samples_per_second": 195.632,
824
+ "eval_steps_per_second": 6.127,
825
+ "step": 24000
826
+ },
827
+ {
828
+ "epoch": 53.99449035812672,
829
+ "grad_norm": 3.7205893993377686,
830
+ "learning_rate": 0.00010185185185185185,
831
+ "loss": 0.051,
832
+ "step": 24500
833
+ },
834
+ {
835
+ "epoch": 53.99449035812672,
836
+ "eval_accuracy": 0.8083354130272024,
837
+ "eval_f1_macro": 0.6629286097479332,
838
+ "eval_loss": 1.224273920059204,
839
+ "eval_runtime": 40.9557,
840
+ "eval_samples_per_second": 195.675,
841
+ "eval_steps_per_second": 6.129,
842
+ "step": 24500
843
+ },
844
+ {
845
+ "epoch": 55.09641873278237,
846
+ "grad_norm": 0.219478040933609,
847
+ "learning_rate": 9.259259259259259e-05,
848
+ "loss": 0.0447,
849
+ "step": 25000
850
+ },
851
+ {
852
+ "epoch": 55.09641873278237,
853
+ "eval_accuracy": 0.8098327926129274,
854
+ "eval_f1_macro": 0.666826551929984,
855
+ "eval_loss": 1.199051856994629,
856
+ "eval_runtime": 41.0107,
857
+ "eval_samples_per_second": 195.412,
858
+ "eval_steps_per_second": 6.12,
859
+ "step": 25000
860
+ },
861
+ {
862
+ "epoch": 56.19834710743802,
863
+ "grad_norm": 3.8260011672973633,
864
+ "learning_rate": 8.333333333333333e-05,
865
+ "loss": 0.0409,
866
+ "step": 25500
867
+ },
868
+ {
869
+ "epoch": 56.19834710743802,
870
+ "eval_accuracy": 0.8124532068879461,
871
+ "eval_f1_macro": 0.699449604989361,
872
+ "eval_loss": 1.2268598079681396,
873
+ "eval_runtime": 40.8996,
874
+ "eval_samples_per_second": 195.943,
875
+ "eval_steps_per_second": 6.137,
876
+ "step": 25500
877
+ },
878
+ {
879
+ "epoch": 57.30027548209367,
880
+ "grad_norm": 0.5541914105415344,
881
+ "learning_rate": 7.407407407407407e-05,
882
+ "loss": 0.039,
883
+ "step": 26000
884
+ },
885
+ {
886
+ "epoch": 57.30027548209367,
887
+ "eval_accuracy": 0.8215622660344397,
888
+ "eval_f1_macro": 0.6928701132826408,
889
+ "eval_loss": 1.1886591911315918,
890
+ "eval_runtime": 40.5313,
891
+ "eval_samples_per_second": 197.724,
892
+ "eval_steps_per_second": 6.193,
893
+ "step": 26000
894
+ }
895
+ ],
896
+ "logging_steps": 500,
897
+ "max_steps": 30000,
898
+ "num_input_tokens_seen": 0,
899
+ "num_train_epochs": 67,
900
+ "save_steps": 1000,
901
+ "stateful_callbacks": {
902
+ "TrainerControl": {
903
+ "args": {
904
+ "should_epoch_stop": false,
905
+ "should_evaluate": false,
906
+ "should_log": false,
907
+ "should_save": true,
908
+ "should_training_stop": false
909
+ },
910
+ "attributes": {}
911
+ }
912
+ },
913
+ "total_flos": 1.5103646535813897e+20,
914
+ "train_batch_size": 32,
915
+ "trial_name": null,
916
+ "trial_params": null
917
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1e7e8d962fba8418949e1365bf6d9bd403c8fe8015481536a1cd9d10372b363
3
+ size 5240