Pclanglais committed on
Commit 52b4308 · verified · 1 Parent(s): a8568ac

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+{
+  "[MASK]": 250101
+}
config.json ADDED
@@ -0,0 +1,284 @@
+{
+  "_name_or_path": "deberta-large",
+  "architectures": [
+    "DebertaV2ForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "harvard-cite-them-right",
+    "1": "thieme-german",
+    "2": "american-political-science-association",
+    "3": "american-society-for-horticultural-science",
+    "4": "hiob-ludolf-centre-for-ethiopian-studies",
+    "5": "mary-ann-liebert-vancouver",
+    "6": "the-geological-society-of-america",
+    "7": "modern-language-association",
+    "8": "african-online-scientific-information-systems-harvard",
+    "9": "american-medical-association",
+    "10": "annual-reviews",
+    "11": "oikos",
+    "12": "springer-socpsych",
+    "13": "vancouver",
+    "14": "royal-society-of-chemistry",
+    "15": "elsevier-harvard",
+    "16": "institute-of-mathematics-and-its-applications",
+    "17": "future-science-group",
+    "18": "ieee",
+    "19": "institute-of-mathematical-statistics",
+    "20": "the-rockefeller-university-press",
+    "21": "american-society-of-agricultural-and-biological-engineers",
+    "22": "politeknik-negeri-manado-jurnal-p3m",
+    "23": "chroniques-des-activites-archeologiques-de-l-ecole-francaise-de-rome",
+    "24": "modern-humanities-research-association",
+    "25": "medicina-clinica",
+    "26": "environment-and-planning",
+    "27": "style-manual-for-authors-editors-and-printers-snooks-co",
+    "28": "american-nuclear-society",
+    "29": "aims-press",
+    "30": "springer-vancouver",
+    "31": "proceedings-of-the-royal-society-b",
+    "32": "taylor-and-francis-chicago",
+    "33": "the-journal-of-comparative-neurology",
+    "34": "nature",
+    "35": "american-chemical-society",
+    "36": "die-bachelorarbeit-samac-et-al-in-text",
+    "37": "future-medicine",
+    "38": "international-union-of-crystallography",
+    "39": "copernicus-publications",
+    "40": "medicine-publishing",
+    "41": "american-society-for-microbiology",
+    "42": "springer-humanities",
+    "43": "springer-physics",
+    "44": "style-manual-australian-government-note",
+    "45": "institute-of-physics-harvard",
+    "46": "plos",
+    "47": "american-sociological-association",
+    "48": "taylor-and-francis-national-library-of-medicine",
+    "49": "canadian-journal-of-fisheries-and-aquatic-sciences",
+    "50": "elsevier",
+    "51": "american-society-of-civil-engineers",
+    "52": "inter-research-science-center",
+    "53": "the-lancet",
+    "54": "chicago",
+    "55": "elsevier-vancouver",
+    "56": "landes-bioscience-journals",
+    "57": "institute-for-operations-research-and-the-management-sciences",
+    "58": "american-institute-of-aeronautics-and-astronautics",
+    "59": "baishideng-publishing-group",
+    "60": "the-optical-society",
+    "61": "american-society-of-mechanical-engineers",
+    "62": "association-for-computing-machinery",
+    "63": "bristol-university-press",
+    "64": "cold-spring-harbor-laboratory-press",
+    "65": "spie-journals",
+    "66": "national-institute-of-health-research",
+    "67": "bmj",
+    "68": "mary-ann-liebert-harvard",
+    "69": "international-journal-of-wildland-fire",
+    "70": "institute-of-physics",
+    "71": "american-institute-of-physics",
+    "72": "american-statistical-association",
+    "73": "frontiers-medical-journals",
+    "74": "american-physiological-society",
+    "75": "the-institution-of-engineering-and-technology",
+    "76": "entomological-society-of-america",
+    "77": "african-online-scientific-information-systems-vancouver",
+    "78": "trends-journals",
+    "79": "springer-mathphys",
+    "80": "ecology",
+    "81": "the-company-of-biologists",
+    "82": "springer-basic",
+    "83": "american-society-for-pharmacology-and-experimental-therapeutics",
+    "84": "american-association-for-cancer-research",
+    "85": "american-meteorological-society",
+    "86": "the-geological-society-of-london",
+    "87": "karger-journals",
+    "88": "springer-fachzeitschriften-medizin-psychologie",
+    "89": "canadian-journal-of-soil-science",
+    "90": "begell-house-chicago",
+    "91": "spandidos-publications",
+    "92": "biomed-central",
+    "93": "cell",
+    "94": "council-of-science-editors",
+    "95": "frontiers",
+    "96": "embo-press",
+    "97": "emu-austral-ornithology",
+    "98": "microbiology-society",
+    "99": "pontifical-gregorian-university",
+    "100": "current-opinion",
+    "101": "sage-harvard",
+    "102": "the-institute-of-electronics-information-and-communication-engineers",
+    "103": "taylor-and-francis-council-of-science-editors",
+    "104": "european-journal-of-human-genetics",
+    "105": "american-geophysical-union",
+    "106": "integrated-science-publishing-journals",
+    "107": "universita-pontificia-salesiana",
+    "108": "american-fisheries-society",
+    "109": "international-studies-association",
+    "110": "american-physics-society",
+    "111": "european-society-of-cardiology",
+    "112": "oxford-university-press-scimed",
+    "113": "pensoft-journals",
+    "114": "multidisciplinary-digital-publishing-institute",
+    "115": "endocrine-press",
+    "116": "sage-vancouver",
+    "117": "academy-of-management-review",
+    "118": "american-marketing-association",
+    "119": "the-astrophysical-journal",
+    "120": "hainan-medical-university-journal-publisher",
+    "121": "museum-national-dhistoire-naturelle"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "academy-of-management-review": 117,
+    "african-online-scientific-information-systems-harvard": 8,
+    "african-online-scientific-information-systems-vancouver": 77,
+    "aims-press": 29,
+    "american-association-for-cancer-research": 84,
+    "american-chemical-society": 35,
+    "american-fisheries-society": 108,
+    "american-geophysical-union": 105,
+    "american-institute-of-aeronautics-and-astronautics": 58,
+    "american-institute-of-physics": 71,
+    "american-marketing-association": 118,
+    "american-medical-association": 9,
+    "american-meteorological-society": 85,
+    "american-nuclear-society": 28,
+    "american-physics-society": 110,
+    "american-physiological-society": 74,
+    "american-political-science-association": 2,
+    "american-society-for-horticultural-science": 3,
+    "american-society-for-microbiology": 41,
+    "american-society-for-pharmacology-and-experimental-therapeutics": 83,
+    "american-society-of-agricultural-and-biological-engineers": 21,
+    "american-society-of-civil-engineers": 51,
+    "american-society-of-mechanical-engineers": 61,
+    "american-sociological-association": 47,
+    "american-statistical-association": 72,
+    "annual-reviews": 10,
+    "association-for-computing-machinery": 62,
+    "baishideng-publishing-group": 59,
+    "begell-house-chicago": 90,
+    "biomed-central": 92,
+    "bmj": 67,
+    "bristol-university-press": 63,
+    "canadian-journal-of-fisheries-and-aquatic-sciences": 49,
+    "canadian-journal-of-soil-science": 89,
+    "cell": 93,
+    "chicago": 54,
+    "chroniques-des-activites-archeologiques-de-l-ecole-francaise-de-rome": 23,
+    "cold-spring-harbor-laboratory-press": 64,
+    "copernicus-publications": 39,
+    "council-of-science-editors": 94,
+    "current-opinion": 100,
+    "die-bachelorarbeit-samac-et-al-in-text": 36,
+    "ecology": 80,
+    "elsevier": 50,
+    "elsevier-harvard": 15,
+    "elsevier-vancouver": 55,
+    "embo-press": 96,
+    "emu-austral-ornithology": 97,
+    "endocrine-press": 115,
+    "entomological-society-of-america": 76,
+    "environment-and-planning": 26,
+    "european-journal-of-human-genetics": 104,
+    "european-society-of-cardiology": 111,
+    "frontiers": 95,
+    "frontiers-medical-journals": 73,
+    "future-medicine": 37,
+    "future-science-group": 17,
+    "hainan-medical-university-journal-publisher": 120,
+    "harvard-cite-them-right": 0,
+    "hiob-ludolf-centre-for-ethiopian-studies": 4,
+    "ieee": 18,
+    "institute-for-operations-research-and-the-management-sciences": 57,
+    "institute-of-mathematical-statistics": 19,
+    "institute-of-mathematics-and-its-applications": 16,
+    "institute-of-physics": 70,
+    "institute-of-physics-harvard": 45,
+    "integrated-science-publishing-journals": 106,
+    "inter-research-science-center": 52,
+    "international-journal-of-wildland-fire": 69,
+    "international-studies-association": 109,
+    "international-union-of-crystallography": 38,
+    "karger-journals": 87,
+    "landes-bioscience-journals": 56,
+    "mary-ann-liebert-harvard": 68,
+    "mary-ann-liebert-vancouver": 5,
+    "medicina-clinica": 25,
+    "medicine-publishing": 40,
+    "microbiology-society": 98,
+    "modern-humanities-research-association": 24,
+    "modern-language-association": 7,
+    "multidisciplinary-digital-publishing-institute": 114,
+    "museum-national-dhistoire-naturelle": 121,
+    "national-institute-of-health-research": 66,
+    "nature": 34,
+    "oikos": 11,
+    "oxford-university-press-scimed": 112,
+    "pensoft-journals": 113,
+    "plos": 46,
+    "politeknik-negeri-manado-jurnal-p3m": 22,
+    "pontifical-gregorian-university": 99,
+    "proceedings-of-the-royal-society-b": 31,
+    "royal-society-of-chemistry": 14,
+    "sage-harvard": 101,
+    "sage-vancouver": 116,
+    "spandidos-publications": 91,
+    "spie-journals": 65,
+    "springer-basic": 82,
+    "springer-fachzeitschriften-medizin-psychologie": 88,
+    "springer-humanities": 42,
+    "springer-mathphys": 79,
+    "springer-physics": 43,
+    "springer-socpsych": 12,
+    "springer-vancouver": 30,
+    "style-manual-australian-government-note": 44,
+    "style-manual-for-authors-editors-and-printers-snooks-co": 27,
+    "taylor-and-francis-chicago": 32,
+    "taylor-and-francis-council-of-science-editors": 103,
+    "taylor-and-francis-national-library-of-medicine": 48,
+    "the-astrophysical-journal": 119,
+    "the-company-of-biologists": 81,
+    "the-geological-society-of-america": 6,
+    "the-geological-society-of-london": 86,
+    "the-institute-of-electronics-information-and-communication-engineers": 102,
+    "the-institution-of-engineering-and-technology": 75,
+    "the-journal-of-comparative-neurology": 33,
+    "the-lancet": 53,
+    "the-optical-society": 60,
+    "the-rockefeller-university-press": 20,
+    "thieme-german": 1,
+    "trends-journals": 78,
+    "universita-pontificia-salesiana": 107,
+    "vancouver": 13
+  },
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 768,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "problem_type": "multi_label_classification",
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.41.2",
+  "type_vocab_size": 0,
+  "vocab_size": 251000
+}
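
The config above defines a 122-way citation-style classifier head on a deberta-v2 backbone with `problem_type` set to `multi_label_classification`. A minimal inference sketch, assuming the files from this commit sit in a local directory (the Hub repo id is not stated here) and using an assumed 0.5 sigmoid threshold:

```python
# Sketch only: the directory path, example reference, and 0.5 threshold are assumptions.
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_dir = "./deberta-bib-style-classification"  # assumed local path to the uploaded files

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

# An illustrative bibliographic reference whose citation style we want to identify.
reference = "1. Smith J, Doe A. Example title. J Examples. 2020;12(3):45-67."
inputs = tokenizer(reference, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits[0]

# problem_type is multi_label_classification, so each label gets an independent sigmoid score.
scores = torch.sigmoid(logits)
predicted = [model.config.id2label[i] for i, s in enumerate(scores.tolist()) if s > 0.5]
print(predicted)
```
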
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24e31bd7c3da94f801b31796e2257f5cce1531e82c1839ac7df424e2542c817d
+size 1115637336
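
The three lines above are a Git LFS pointer, not the weights themselves: the actual model.safetensors is fetched by the LFS smudge filter and must hash to the stated oid and byte size. A small sketch for checking a downloaded copy against the pointer (the local file path is an assumption):

```python
# Sketch: verify a downloaded file against the LFS pointer's sha256 oid and size fields.
import hashlib

def matches_lfs_pointer(path, expected_oid, expected_size):
    digest, size = hashlib.sha256(), 0
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest() == expected_oid and size == expected_size

print(matches_lfs_pointer(
    "model.safetensors",  # assumed local path
    "24e31bd7c3da94f801b31796e2257f5cce1531e82c1839ac7df424e2542c817d",
    1115637336,
))
```
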
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f89f8dec433b084b8008bb8a92f4b7594c2339d70aa8a8b51d50480323e010e8
+size 2231394170
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd008cc57f6432b08bf036944ad8c15663aff9f3a0b3c83b8ac4d9837511a2b0
+size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9097c42c7958a57b9b27c5387f231b01d681e88ccc4a9a9d89cc46aeb70fe08f
+size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
+{
+  "bos_token": "[CLS]",
+  "cls_token": "[CLS]",
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
spm.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13c8d666d62a7bc4ac8f040aab68e942c861f93303156cc28f5c7e885d86d6e3
+size 4305025
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7dbb7b63c76007984d0e58a90ee901ceb5b16c8e78252d36ddcde748b3474a1a
+size 16331639
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250101": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "[CLS]",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "eos_token": "[SEP]",
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "sp_model_kwargs": {},
+  "split_by_punct": false,
+  "tokenizer_class": "DebertaV2Tokenizer",
+  "unk_token": "[UNK]",
+  "vocab_type": "spm"
+}
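
Together with spm.model, tokenizer.json, added_tokens.json and special_tokens_map.json above, this config loads as a SentencePiece-based DebertaV2 tokenizer with [MASK] appended at id 250101. A quick check, assuming the same local directory as before:

```python
# Sketch: load the tokenizer files from this commit and confirm the special tokens.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./deberta-bib-style-classification")  # assumed path
print(type(tok).__name__)                   # a DebertaV2 tokenizer class (fast or slow)
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.unk_token, tok.mask_token)
print(tok.convert_tokens_to_ids("[MASK]"))  # 250101, matching added_tokens.json
```
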
trainer_state.json ADDED
@@ -0,0 +1,953 @@
+{
+  "best_metric": 0.9361970057366729,
+  "best_model_checkpoint": "deberta-bib-style-classification/checkpoint-61266",
+  "epoch": 6.0,
+  "eval_steps": 500,
+  "global_step": 61266,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+ {
12
+ "epoch": 0.048966800509254726,
13
+ "grad_norm": 0.18245625495910645,
14
+ "learning_rate": 1.9836777331635818e-05,
15
+ "loss": 0.1329,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.09793360101850945,
20
+ "grad_norm": 0.17618948221206665,
21
+ "learning_rate": 1.9673554663271638e-05,
22
+ "loss": 0.0446,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.14690040152776418,
27
+ "grad_norm": 0.16028529405593872,
28
+ "learning_rate": 1.9510331994907454e-05,
29
+ "loss": 0.0436,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.1958672020370189,
34
+ "grad_norm": 0.13800546526908875,
35
+ "learning_rate": 1.934710932654327e-05,
36
+ "loss": 0.0382,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.24483400254627363,
41
+ "grad_norm": 0.1666691154241562,
42
+ "learning_rate": 1.918388665817909e-05,
43
+ "loss": 0.0317,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.29380080305552836,
48
+ "grad_norm": 0.2547225058078766,
49
+ "learning_rate": 1.9020663989814907e-05,
50
+ "loss": 0.0276,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.3427676035647831,
55
+ "grad_norm": 0.10508100688457489,
56
+ "learning_rate": 1.8857441321450724e-05,
57
+ "loss": 0.0236,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.3917344040740378,
62
+ "grad_norm": 0.1274070292711258,
63
+ "learning_rate": 1.8694218653086543e-05,
64
+ "loss": 0.0208,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.44070120458329254,
69
+ "grad_norm": 0.1652757227420807,
70
+ "learning_rate": 1.853099598472236e-05,
71
+ "loss": 0.0187,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.48966800509254726,
76
+ "grad_norm": 0.15535283088684082,
77
+ "learning_rate": 1.8367773316358176e-05,
78
+ "loss": 0.0164,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.538634805601802,
83
+ "grad_norm": 0.1361575722694397,
84
+ "learning_rate": 1.8204550647993996e-05,
85
+ "loss": 0.0151,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.5876016061110567,
90
+ "grad_norm": 0.14795701205730438,
91
+ "learning_rate": 1.8041327979629813e-05,
92
+ "loss": 0.0141,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.6365684066203114,
97
+ "grad_norm": 0.24855226278305054,
98
+ "learning_rate": 1.787810531126563e-05,
99
+ "loss": 0.0128,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.6855352071295662,
104
+ "grad_norm": 0.11990305036306381,
105
+ "learning_rate": 1.771488264290145e-05,
106
+ "loss": 0.0119,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.7345020076388209,
111
+ "grad_norm": 0.12674401700496674,
112
+ "learning_rate": 1.7551659974537265e-05,
113
+ "loss": 0.0109,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.7834688081480756,
118
+ "grad_norm": 0.05565750598907471,
119
+ "learning_rate": 1.7388437306173082e-05,
120
+ "loss": 0.0103,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.8324356086573304,
125
+ "grad_norm": 0.1362127959728241,
126
+ "learning_rate": 1.72252146378089e-05,
127
+ "loss": 0.0097,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.8814024091665851,
132
+ "grad_norm": 0.1502208560705185,
133
+ "learning_rate": 1.7062318414781447e-05,
134
+ "loss": 0.0096,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.9303692096758398,
139
+ "grad_norm": 0.1307108849287033,
140
+ "learning_rate": 1.689942219175399e-05,
141
+ "loss": 0.009,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.9793360101850945,
146
+ "grad_norm": 0.08258962631225586,
147
+ "learning_rate": 1.6736199523389808e-05,
148
+ "loss": 0.0088,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 1.0,
153
+ "eval_accuracy": 0.7270709032884696,
154
+ "eval_f1": 0.8196123147092359,
155
+ "eval_loss": 0.007612535264343023,
156
+ "eval_roc_auc": 0.8736920143065684,
157
+ "eval_runtime": 56.0999,
158
+ "eval_samples_per_second": 128.467,
159
+ "eval_steps_per_second": 16.061,
160
+ "step": 10211
161
+ },
162
+ {
163
+ "epoch": 1.0283028106943493,
164
+ "grad_norm": 0.15880635380744934,
165
+ "learning_rate": 1.6572976855025628e-05,
166
+ "loss": 0.0086,
167
+ "step": 10500
168
+ },
169
+ {
170
+ "epoch": 1.077269611203604,
171
+ "grad_norm": 0.04505603387951851,
172
+ "learning_rate": 1.6409754186661444e-05,
173
+ "loss": 0.0082,
174
+ "step": 11000
175
+ },
176
+ {
177
+ "epoch": 1.1262364117128587,
178
+ "grad_norm": 0.1436477154493332,
179
+ "learning_rate": 1.624653151829726e-05,
180
+ "loss": 0.0076,
181
+ "step": 11500
182
+ },
183
+ {
184
+ "epoch": 1.1752032122221134,
185
+ "grad_norm": 0.10293476283550262,
186
+ "learning_rate": 1.608330884993308e-05,
187
+ "loss": 0.0074,
188
+ "step": 12000
189
+ },
190
+ {
191
+ "epoch": 1.2241700127313682,
192
+ "grad_norm": 0.12684179842472076,
193
+ "learning_rate": 1.5920086181568897e-05,
194
+ "loss": 0.0076,
195
+ "step": 12500
196
+ },
197
+ {
198
+ "epoch": 1.2731368132406229,
199
+ "grad_norm": 0.09429904818534851,
200
+ "learning_rate": 1.5756863513204713e-05,
201
+ "loss": 0.0073,
202
+ "step": 13000
203
+ },
204
+ {
205
+ "epoch": 1.3221036137498776,
206
+ "grad_norm": 0.24173329770565033,
207
+ "learning_rate": 1.5593640844840533e-05,
208
+ "loss": 0.0068,
209
+ "step": 13500
210
+ },
211
+ {
212
+ "epoch": 1.3710704142591323,
213
+ "grad_norm": 0.15347008407115936,
214
+ "learning_rate": 1.543041817647635e-05,
215
+ "loss": 0.0069,
216
+ "step": 14000
217
+ },
218
+ {
219
+ "epoch": 1.420037214768387,
220
+ "grad_norm": 0.30062130093574524,
221
+ "learning_rate": 1.5267195508112166e-05,
222
+ "loss": 0.0064,
223
+ "step": 14500
224
+ },
225
+ {
226
+ "epoch": 1.4690040152776418,
227
+ "grad_norm": 0.0312146358191967,
228
+ "learning_rate": 1.5103972839747984e-05,
229
+ "loss": 0.0065,
230
+ "step": 15000
231
+ },
232
+ {
233
+ "epoch": 1.5179708157868965,
234
+ "grad_norm": 0.19778664410114288,
235
+ "learning_rate": 1.4941076616720531e-05,
236
+ "loss": 0.0063,
237
+ "step": 15500
238
+ },
239
+ {
240
+ "epoch": 1.5669376162961512,
241
+ "grad_norm": 0.07010342180728912,
242
+ "learning_rate": 1.4778180393693078e-05,
243
+ "loss": 0.0058,
244
+ "step": 16000
245
+ },
246
+ {
247
+ "epoch": 1.615904416805406,
248
+ "grad_norm": 0.11643481999635696,
249
+ "learning_rate": 1.4614957725328896e-05,
250
+ "loss": 0.0058,
251
+ "step": 16500
252
+ },
253
+ {
254
+ "epoch": 1.6648712173146607,
255
+ "grad_norm": 0.14050887525081635,
256
+ "learning_rate": 1.4452061502301439e-05,
257
+ "loss": 0.0059,
258
+ "step": 17000
259
+ },
260
+ {
261
+ "epoch": 1.7138380178239154,
262
+ "grad_norm": 0.09967122972011566,
263
+ "learning_rate": 1.4288838833937257e-05,
264
+ "loss": 0.0053,
265
+ "step": 17500
266
+ },
267
+ {
268
+ "epoch": 1.7628048183331702,
269
+ "grad_norm": 0.06283292174339294,
270
+ "learning_rate": 1.4125616165573075e-05,
271
+ "loss": 0.0057,
272
+ "step": 18000
273
+ },
274
+ {
275
+ "epoch": 1.8117716188424249,
276
+ "grad_norm": 0.04395497962832451,
277
+ "learning_rate": 1.3962393497208892e-05,
278
+ "loss": 0.0051,
279
+ "step": 18500
280
+ },
281
+ {
282
+ "epoch": 1.8607384193516796,
283
+ "grad_norm": 0.15392401814460754,
284
+ "learning_rate": 1.379917082884471e-05,
285
+ "loss": 0.0049,
286
+ "step": 19000
287
+ },
288
+ {
289
+ "epoch": 1.9097052198609343,
290
+ "grad_norm": 0.053341180086135864,
291
+ "learning_rate": 1.3635948160480528e-05,
292
+ "loss": 0.0052,
293
+ "step": 19500
294
+ },
295
+ {
296
+ "epoch": 1.958672020370189,
297
+ "grad_norm": 0.14263851940631866,
298
+ "learning_rate": 1.3472725492116345e-05,
299
+ "loss": 0.005,
300
+ "step": 20000
301
+ },
302
+ {
303
+ "epoch": 2.0,
304
+ "eval_accuracy": 0.8507007076453448,
305
+ "eval_f1": 0.8938003106905804,
306
+ "eval_loss": 0.00466541014611721,
307
+ "eval_roc_auc": 0.9387280731428467,
308
+ "eval_runtime": 55.7959,
309
+ "eval_samples_per_second": 129.167,
310
+ "eval_steps_per_second": 16.148,
311
+ "step": 20422
312
+ },
313
+ {
314
+ "epoch": 2.0076388208794436,
315
+ "grad_norm": 0.16177518665790558,
316
+ "learning_rate": 1.3309502823752163e-05,
317
+ "loss": 0.0048,
318
+ "step": 20500
319
+ },
320
+ {
321
+ "epoch": 2.0566056213886985,
322
+ "grad_norm": 0.13520753383636475,
323
+ "learning_rate": 1.314628015538798e-05,
324
+ "loss": 0.0045,
325
+ "step": 21000
326
+ },
327
+ {
328
+ "epoch": 2.105572421897953,
329
+ "grad_norm": 0.09872843325138092,
330
+ "learning_rate": 1.2983057487023797e-05,
331
+ "loss": 0.0043,
332
+ "step": 21500
333
+ },
334
+ {
335
+ "epoch": 2.154539222407208,
336
+ "grad_norm": 0.07923103123903275,
337
+ "learning_rate": 1.2819834818659615e-05,
338
+ "loss": 0.0042,
339
+ "step": 22000
340
+ },
341
+ {
342
+ "epoch": 2.2035060229164625,
343
+ "grad_norm": 0.01526894886046648,
344
+ "learning_rate": 1.2656612150295434e-05,
345
+ "loss": 0.0041,
346
+ "step": 22500
347
+ },
348
+ {
349
+ "epoch": 2.2524728234257174,
350
+ "grad_norm": 0.09783605486154556,
351
+ "learning_rate": 1.249338948193125e-05,
352
+ "loss": 0.0042,
353
+ "step": 23000
354
+ },
355
+ {
356
+ "epoch": 2.301439623934972,
357
+ "grad_norm": 0.12495558708906174,
358
+ "learning_rate": 1.2330819704240527e-05,
359
+ "loss": 0.0042,
360
+ "step": 23500
361
+ },
362
+ {
363
+ "epoch": 2.350406424444227,
364
+ "grad_norm": 0.09078874439001083,
365
+ "learning_rate": 1.2167597035876343e-05,
366
+ "loss": 0.0045,
367
+ "step": 24000
368
+ },
369
+ {
370
+ "epoch": 2.3993732249534814,
371
+ "grad_norm": 0.0656205415725708,
372
+ "learning_rate": 1.2004374367512162e-05,
373
+ "loss": 0.0039,
374
+ "step": 24500
375
+ },
376
+ {
377
+ "epoch": 2.4483400254627363,
378
+ "grad_norm": 0.22209672629833221,
379
+ "learning_rate": 1.184115169914798e-05,
380
+ "loss": 0.0043,
381
+ "step": 25000
382
+ },
383
+ {
384
+ "epoch": 2.497306825971991,
385
+ "grad_norm": 0.4231460988521576,
386
+ "learning_rate": 1.1677929030783796e-05,
387
+ "loss": 0.0042,
388
+ "step": 25500
389
+ },
390
+ {
391
+ "epoch": 2.5462736264812458,
392
+ "grad_norm": 0.0051184347830712795,
393
+ "learning_rate": 1.1514706362419614e-05,
394
+ "loss": 0.0039,
395
+ "step": 26000
396
+ },
397
+ {
398
+ "epoch": 2.5952404269905003,
399
+ "grad_norm": 0.15397199988365173,
400
+ "learning_rate": 1.1351483694055432e-05,
401
+ "loss": 0.0037,
402
+ "step": 26500
403
+ },
404
+ {
405
+ "epoch": 2.6442072274997552,
406
+ "grad_norm": 0.03935805708169937,
407
+ "learning_rate": 1.1188261025691249e-05,
408
+ "loss": 0.0035,
409
+ "step": 27000
410
+ },
411
+ {
412
+ "epoch": 2.6931740280090097,
413
+ "grad_norm": 0.011553222313523293,
414
+ "learning_rate": 1.1025364802663794e-05,
415
+ "loss": 0.0038,
416
+ "step": 27500
417
+ },
418
+ {
419
+ "epoch": 2.7421408285182647,
420
+ "grad_norm": 0.06817249953746796,
421
+ "learning_rate": 1.0862142134299612e-05,
422
+ "loss": 0.0035,
423
+ "step": 28000
424
+ },
425
+ {
426
+ "epoch": 2.791107629027519,
427
+ "grad_norm": 0.003078105626627803,
428
+ "learning_rate": 1.0699245911272159e-05,
429
+ "loss": 0.0036,
430
+ "step": 28500
431
+ },
432
+ {
433
+ "epoch": 2.840074429536774,
434
+ "grad_norm": 0.11958350241184235,
435
+ "learning_rate": 1.0536023242907975e-05,
436
+ "loss": 0.0037,
437
+ "step": 29000
438
+ },
439
+ {
440
+ "epoch": 2.8890412300460286,
441
+ "grad_norm": 0.17206184566020966,
442
+ "learning_rate": 1.0372800574543793e-05,
443
+ "loss": 0.0036,
444
+ "step": 29500
445
+ },
446
+ {
447
+ "epoch": 2.9380080305552836,
448
+ "grad_norm": 0.018106259405612946,
449
+ "learning_rate": 1.0209577906179611e-05,
450
+ "loss": 0.0035,
451
+ "step": 30000
452
+ },
453
+ {
454
+ "epoch": 2.986974831064538,
455
+ "grad_norm": 0.14708341658115387,
456
+ "learning_rate": 1.0046355237815428e-05,
457
+ "loss": 0.0035,
458
+ "step": 30500
459
+ },
460
+ {
461
+ "epoch": 3.0,
462
+ "eval_accuracy": 0.8998196198140697,
463
+ "eval_f1": 0.9224077451943314,
464
+ "eval_loss": 0.0033930453937500715,
465
+ "eval_roc_auc": 0.9558131614465735,
466
+ "eval_runtime": 55.6722,
467
+ "eval_samples_per_second": 129.454,
468
+ "eval_steps_per_second": 16.184,
469
+ "step": 30633
470
+ },
471
+ {
472
+ "epoch": 3.035941631573793,
473
+ "grad_norm": 0.03020176850259304,
474
+ "learning_rate": 9.883132569451246e-06,
475
+ "loss": 0.0033,
476
+ "step": 31000
477
+ },
478
+ {
479
+ "epoch": 3.0849084320830475,
480
+ "grad_norm": 0.1869770586490631,
481
+ "learning_rate": 9.720236346423793e-06,
482
+ "loss": 0.0029,
483
+ "step": 31500
484
+ },
485
+ {
486
+ "epoch": 3.1338752325923025,
487
+ "grad_norm": 0.02884034253656864,
488
+ "learning_rate": 9.55701367805961e-06,
489
+ "loss": 0.0033,
490
+ "step": 32000
491
+ },
492
+ {
493
+ "epoch": 3.182842033101557,
494
+ "grad_norm": 0.003137261839583516,
495
+ "learning_rate": 9.393791009695427e-06,
496
+ "loss": 0.0031,
497
+ "step": 32500
498
+ },
499
+ {
500
+ "epoch": 3.231808833610812,
501
+ "grad_norm": 0.007661271840333939,
502
+ "learning_rate": 9.230568341331245e-06,
503
+ "loss": 0.0028,
504
+ "step": 33000
505
+ },
506
+ {
507
+ "epoch": 3.2807756341200665,
508
+ "grad_norm": 0.006506490521132946,
509
+ "learning_rate": 9.06767211830379e-06,
510
+ "loss": 0.0029,
511
+ "step": 33500
512
+ },
513
+ {
514
+ "epoch": 3.3297424346293214,
515
+ "grad_norm": 0.03698953613638878,
516
+ "learning_rate": 8.904449449939609e-06,
517
+ "loss": 0.003,
518
+ "step": 34000
519
+ },
520
+ {
521
+ "epoch": 3.378709235138576,
522
+ "grad_norm": 0.1853983998298645,
523
+ "learning_rate": 8.741226781575427e-06,
524
+ "loss": 0.0029,
525
+ "step": 34500
526
+ },
527
+ {
528
+ "epoch": 3.427676035647831,
529
+ "grad_norm": 0.053507931530475616,
530
+ "learning_rate": 8.578004113211243e-06,
531
+ "loss": 0.0029,
532
+ "step": 35000
533
+ },
534
+ {
535
+ "epoch": 3.4766428361570854,
536
+ "grad_norm": 0.10720884054899216,
537
+ "learning_rate": 8.414781444847061e-06,
538
+ "loss": 0.003,
539
+ "step": 35500
540
+ },
541
+ {
542
+ "epoch": 3.5256096366663403,
543
+ "grad_norm": 0.015493680723011494,
544
+ "learning_rate": 8.25155877648288e-06,
545
+ "loss": 0.0031,
546
+ "step": 36000
547
+ },
548
+ {
549
+ "epoch": 3.574576437175595,
550
+ "grad_norm": 0.07669718563556671,
551
+ "learning_rate": 8.088336108118696e-06,
552
+ "loss": 0.0029,
553
+ "step": 36500
554
+ },
555
+ {
556
+ "epoch": 3.6235432376848498,
557
+ "grad_norm": 0.16198168694972992,
558
+ "learning_rate": 7.925439885091243e-06,
559
+ "loss": 0.003,
560
+ "step": 37000
561
+ },
562
+ {
563
+ "epoch": 3.6725100381941043,
564
+ "grad_norm": 0.00896318256855011,
565
+ "learning_rate": 7.76254366206379e-06,
566
+ "loss": 0.0029,
567
+ "step": 37500
568
+ },
569
+ {
570
+ "epoch": 3.721476838703359,
571
+ "grad_norm": 0.12104916572570801,
572
+ "learning_rate": 7.599320993699606e-06,
573
+ "loss": 0.0028,
574
+ "step": 38000
575
+ },
576
+ {
577
+ "epoch": 3.7704436392126137,
578
+ "grad_norm": 0.10880939662456512,
579
+ "learning_rate": 7.436098325335424e-06,
580
+ "loss": 0.0029,
581
+ "step": 38500
582
+ },
583
+ {
584
+ "epoch": 3.8194104397218687,
585
+ "grad_norm": 0.030039768666028976,
586
+ "learning_rate": 7.272875656971241e-06,
587
+ "loss": 0.0027,
588
+ "step": 39000
589
+ },
590
+ {
591
+ "epoch": 3.868377240231123,
592
+ "grad_norm": 0.09756383299827576,
593
+ "learning_rate": 7.109652988607058e-06,
594
+ "loss": 0.0029,
595
+ "step": 39500
596
+ },
597
+ {
598
+ "epoch": 3.917344040740378,
599
+ "grad_norm": 0.002539890818297863,
600
+ "learning_rate": 6.9464303202428765e-06,
601
+ "loss": 0.0027,
602
+ "step": 40000
603
+ },
604
+ {
605
+ "epoch": 3.9663108412496326,
606
+ "grad_norm": 0.08850258588790894,
607
+ "learning_rate": 6.783207651878694e-06,
608
+ "loss": 0.0029,
609
+ "step": 40500
610
+ },
611
+ {
612
+ "epoch": 4.0,
613
+ "eval_accuracy": 0.9124462328291938,
614
+ "eval_f1": 0.9324835411122006,
615
+ "eval_loss": 0.0029195661190897226,
616
+ "eval_roc_auc": 0.9616052804493336,
617
+ "eval_runtime": 55.6822,
618
+ "eval_samples_per_second": 129.431,
619
+ "eval_steps_per_second": 16.181,
620
+ "step": 40844
621
+ },
622
+ {
623
+ "epoch": 4.015277641758887,
624
+ "grad_norm": 0.1031348779797554,
625
+ "learning_rate": 6.619984983514511e-06,
626
+ "loss": 0.0027,
627
+ "step": 41000
628
+ },
629
+ {
630
+ "epoch": 4.0642444422681425,
631
+ "grad_norm": 0.07331918925046921,
632
+ "learning_rate": 6.456762315150329e-06,
633
+ "loss": 0.0025,
634
+ "step": 41500
635
+ },
636
+ {
637
+ "epoch": 4.113211242777397,
638
+ "grad_norm": 0.10652918368577957,
639
+ "learning_rate": 6.2935396467861465e-06,
640
+ "loss": 0.0025,
641
+ "step": 42000
642
+ },
643
+ {
644
+ "epoch": 4.1621780432866515,
645
+ "grad_norm": 0.11473935097455978,
646
+ "learning_rate": 6.130316978421964e-06,
647
+ "loss": 0.0026,
648
+ "step": 42500
649
+ },
650
+ {
651
+ "epoch": 4.211144843795906,
652
+ "grad_norm": 0.0035420297645032406,
653
+ "learning_rate": 5.967094310057782e-06,
654
+ "loss": 0.0025,
655
+ "step": 43000
656
+ },
657
+ {
658
+ "epoch": 4.260111644305161,
659
+ "grad_norm": 0.01006217859685421,
660
+ "learning_rate": 5.8045245323670555e-06,
661
+ "loss": 0.0025,
662
+ "step": 43500
663
+ },
664
+ {
665
+ "epoch": 4.309078444814416,
666
+ "grad_norm": 0.09146247059106827,
667
+ "learning_rate": 5.641301864002873e-06,
668
+ "loss": 0.0022,
669
+ "step": 44000
670
+ },
671
+ {
672
+ "epoch": 4.35804524532367,
673
+ "grad_norm": 0.05604245886206627,
674
+ "learning_rate": 5.47807919563869e-06,
675
+ "loss": 0.0025,
676
+ "step": 44500
677
+ },
678
+ {
679
+ "epoch": 4.407012045832925,
680
+ "grad_norm": 0.017924629151821136,
681
+ "learning_rate": 5.314856527274508e-06,
682
+ "loss": 0.0025,
683
+ "step": 45000
684
+ },
685
+ {
686
+ "epoch": 4.45597884634218,
687
+ "grad_norm": 0.06204945594072342,
688
+ "learning_rate": 5.1516338589103255e-06,
689
+ "loss": 0.0026,
690
+ "step": 45500
691
+ },
692
+ {
693
+ "epoch": 4.504945646851435,
694
+ "grad_norm": 0.06027592718601227,
695
+ "learning_rate": 4.988411190546144e-06,
696
+ "loss": 0.0024,
697
+ "step": 46000
698
+ },
699
+ {
700
+ "epoch": 4.553912447360689,
701
+ "grad_norm": 0.26520290970802307,
702
+ "learning_rate": 4.825188522181961e-06,
703
+ "loss": 0.0022,
704
+ "step": 46500
705
+ },
706
+ {
707
+ "epoch": 4.602879247869944,
708
+ "grad_norm": 0.025959959253668785,
709
+ "learning_rate": 4.661965853817778e-06,
710
+ "loss": 0.0022,
711
+ "step": 47000
712
+ },
713
+ {
714
+ "epoch": 4.651846048379199,
715
+ "grad_norm": 0.0017334806034341455,
716
+ "learning_rate": 4.498743185453596e-06,
717
+ "loss": 0.0026,
718
+ "step": 47500
719
+ },
720
+ {
721
+ "epoch": 4.700812848888454,
722
+ "grad_norm": 0.09476437419652939,
723
+ "learning_rate": 4.335520517089414e-06,
724
+ "loss": 0.0024,
725
+ "step": 48000
726
+ },
727
+ {
728
+ "epoch": 4.749779649397708,
729
+ "grad_norm": 0.011143738403916359,
730
+ "learning_rate": 4.172297848725231e-06,
731
+ "loss": 0.0021,
732
+ "step": 48500
733
+ },
734
+ {
735
+ "epoch": 4.798746449906963,
736
+ "grad_norm": 0.10621017217636108,
737
+ "learning_rate": 4.009075180361049e-06,
738
+ "loss": 0.0023,
739
+ "step": 49000
740
+ },
741
+ {
742
+ "epoch": 4.847713250416218,
743
+ "grad_norm": 0.004438555799424648,
744
+ "learning_rate": 3.845852511996866e-06,
745
+ "loss": 0.0023,
746
+ "step": 49500
747
+ },
748
+ {
749
+ "epoch": 4.896680050925473,
750
+ "grad_norm": 0.09955357015132904,
751
+ "learning_rate": 3.682629843632684e-06,
752
+ "loss": 0.0022,
753
+ "step": 50000
754
+ },
755
+ {
756
+ "epoch": 4.945646851434727,
757
+ "grad_norm": 0.14140157401561737,
758
+ "learning_rate": 3.51973362060523e-06,
759
+ "loss": 0.0021,
760
+ "step": 50500
761
+ },
762
+ {
763
+ "epoch": 4.994613651943982,
764
+ "grad_norm": 0.06944791227579117,
765
+ "learning_rate": 3.3565109522410477e-06,
766
+ "loss": 0.0022,
767
+ "step": 51000
768
+ },
769
+ {
770
+ "epoch": 5.0,
771
+ "eval_accuracy": 0.9174413764395727,
772
+ "eval_f1": 0.9352578475336324,
773
+ "eval_loss": 0.0027621558401733637,
774
+ "eval_roc_auc": 0.9627978767199474,
775
+ "eval_runtime": 55.5741,
776
+ "eval_samples_per_second": 129.683,
777
+ "eval_steps_per_second": 16.213,
778
+ "step": 51055
779
+ },
780
+ {
781
+ "epoch": 5.043580452453237,
782
+ "grad_norm": 0.011898011900484562,
783
+ "learning_rate": 3.1932882838768654e-06,
784
+ "loss": 0.0021,
785
+ "step": 51500
786
+ },
787
+ {
788
+ "epoch": 5.0925472529624916,
789
+ "grad_norm": 0.09524281322956085,
790
+ "learning_rate": 3.0300656155126827e-06,
791
+ "loss": 0.002,
792
+ "step": 52000
793
+ },
794
+ {
795
+ "epoch": 5.141514053471746,
796
+ "grad_norm": 0.08005507290363312,
797
+ "learning_rate": 2.8668429471485004e-06,
798
+ "loss": 0.0019,
799
+ "step": 52500
800
+ },
801
+ {
802
+ "epoch": 5.190480853981001,
803
+ "grad_norm": 0.05041489377617836,
804
+ "learning_rate": 2.703946724121046e-06,
805
+ "loss": 0.0022,
806
+ "step": 53000
807
+ },
808
+ {
809
+ "epoch": 5.239447654490256,
810
+ "grad_norm": 0.001608343911357224,
811
+ "learning_rate": 2.5407240557568635e-06,
812
+ "loss": 0.0019,
813
+ "step": 53500
814
+ },
815
+ {
816
+ "epoch": 5.2884144549995105,
817
+ "grad_norm": 0.11538127809762955,
818
+ "learning_rate": 2.3775013873926812e-06,
819
+ "loss": 0.0019,
820
+ "step": 54000
821
+ },
822
+ {
823
+ "epoch": 5.337381255508765,
824
+ "grad_norm": 0.11458936333656311,
825
+ "learning_rate": 2.214278719028499e-06,
826
+ "loss": 0.002,
827
+ "step": 54500
828
+ },
829
+ {
830
+ "epoch": 5.3863480560180195,
831
+ "grad_norm": 0.07239941507577896,
832
+ "learning_rate": 2.051382496001045e-06,
833
+ "loss": 0.0021,
834
+ "step": 55000
835
+ },
836
+ {
837
+ "epoch": 5.435314856527275,
838
+ "grad_norm": 0.1313902884721756,
839
+ "learning_rate": 1.8881598276368623e-06,
840
+ "loss": 0.0018,
841
+ "step": 55500
842
+ },
843
+ {
844
+ "epoch": 5.484281657036529,
845
+ "grad_norm": 0.0023473671171814203,
846
+ "learning_rate": 1.7249371592726798e-06,
847
+ "loss": 0.0021,
848
+ "step": 56000
849
+ },
850
+ {
851
+ "epoch": 5.533248457545784,
852
+ "grad_norm": 0.09586118161678314,
853
+ "learning_rate": 1.5617144909084975e-06,
854
+ "loss": 0.0019,
855
+ "step": 56500
856
+ },
857
+ {
858
+ "epoch": 5.582215258055038,
859
+ "grad_norm": 0.08927006274461746,
860
+ "learning_rate": 1.398491822544315e-06,
861
+ "loss": 0.0019,
862
+ "step": 57000
863
+ },
864
+ {
865
+ "epoch": 5.631182058564294,
866
+ "grad_norm": 0.011845240369439125,
867
+ "learning_rate": 1.235595599516861e-06,
868
+ "loss": 0.002,
869
+ "step": 57500
870
+ },
871
+ {
872
+ "epoch": 5.680148859073548,
873
+ "grad_norm": 0.041209351271390915,
874
+ "learning_rate": 1.0723729311526786e-06,
875
+ "loss": 0.002,
876
+ "step": 58000
877
+ },
878
+ {
879
+ "epoch": 5.729115659582803,
880
+ "grad_norm": 0.09277820587158203,
881
+ "learning_rate": 9.091502627884961e-07,
882
+ "loss": 0.002,
883
+ "step": 58500
884
+ },
885
+ {
886
+ "epoch": 5.778082460092057,
887
+ "grad_norm": 0.07851295173168182,
888
+ "learning_rate": 7.459275944243137e-07,
889
+ "loss": 0.002,
890
+ "step": 59000
891
+ },
892
+ {
893
+ "epoch": 5.827049260601313,
894
+ "grad_norm": 0.046077970415353775,
895
+ "learning_rate": 5.827049260601313e-07,
896
+ "loss": 0.0018,
897
+ "step": 59500
898
+ },
899
+ {
900
+ "epoch": 5.876016061110567,
901
+ "grad_norm": 0.13472139835357666,
902
+ "learning_rate": 4.198087030326772e-07,
903
+ "loss": 0.0019,
904
+ "step": 60000
905
+ },
906
+ {
907
+ "epoch": 5.924982861619822,
908
+ "grad_norm": 0.05562426894903183,
909
+ "learning_rate": 2.5658603466849477e-07,
910
+ "loss": 0.0019,
911
+ "step": 60500
912
+ },
913
+ {
914
+ "epoch": 5.973949662129076,
915
+ "grad_norm": 0.016598107293248177,
916
+ "learning_rate": 9.368981164104072e-08,
917
+ "loss": 0.0019,
918
+ "step": 61000
919
+ },
920
+ {
921
+ "epoch": 6.0,
922
+ "eval_accuracy": 0.920632718190648,
923
+ "eval_f1": 0.9361970057366729,
924
+ "eval_loss": 0.002764922333881259,
925
+ "eval_roc_auc": 0.9639744188099952,
926
+ "eval_runtime": 55.7082,
927
+ "eval_samples_per_second": 129.37,
928
+ "eval_steps_per_second": 16.174,
929
+ "step": 61266
930
+ }
+  ],
+  "logging_steps": 500,
+  "max_steps": 61266,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 6,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.2909461039327232e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}
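
The eval_accuracy, eval_f1 and eval_roc_auc values logged above are multi-label metrics; the best checkpoint (step 61266, epoch 6) reaches an F1 of about 0.936. A plausible compute_metrics implementation that would produce numbers of this shape is sketched below; the 0.5 threshold and micro averaging are assumptions, not taken from this commit:

```python
# Sketch of a Trainer compute_metrics for multi-label classification (assumed, not from this repo).
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = 1.0 / (1.0 + np.exp(-logits))        # per-label sigmoid
    preds = (probs > 0.5).astype(int)
    return {
        "accuracy": accuracy_score(labels, preds),            # exact-match accuracy
        "f1": f1_score(labels, preds, average="micro"),
        "roc_auc": roc_auc_score(labels, probs, average="micro"),
    }
```
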
training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a588501ad64dc7adbf0d74d74c4fd5d40991870acf2b2b7bde589c083735114d
+size 5176
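
training_args.bin is the serialized TrainingArguments object the Trainer ran with; it can be unpickled to inspect the hyperparameters behind the log above (the local path and the loading keyword are assumptions about the setup):

```python
# Sketch: inspect the saved TrainingArguments (a pickle, so it must come from a trusted source).
import torch

args = torch.load("training_args.bin", weights_only=False)  # assumed local path
print(args.num_train_epochs, args.per_device_train_batch_size, args.learning_rate)
```
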