alex6095 committed on
Commit
660349e
1 Parent(s): a382370

Distilkobert Tokenizer

Browse files
Files changed (1) hide show
  1. tokenization_kobert.py +279 -0
tokenization_kobert.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team and Jangwon Park
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for KoBERT model """
16
+
17
+
18
+ import logging
19
+ import os
20
+ import unicodedata
21
+ from shutil import copyfile
22
+
23
+ from transformers import PreTrainedTokenizer
24
+
25
logger = logging.getLogger(__name__)

# File names of the two vocabulary artefacts. The keys mirror the
# `vocab_file` / `vocab_txt` constructor arguments of the tokenizer.
VOCAB_FILES_NAMES = {
    "vocab_file": "tokenizer_78b3253a26.model",
    "vocab_txt": "vocab.txt",
}

# Download location of each artefact, per published checkpoint name.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/tokenizer_78b3253a26.model",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/tokenizer_78b3253a26.model",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/tokenizer_78b3253a26.model",
    },
    "vocab_txt": {
        "monologg/kobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert/vocab.txt",
        "monologg/kobert-lm": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/kobert-lm/vocab.txt",
        "monologg/distilkobert": "https://s3.amazonaws.com/models.huggingface.co/bert/monologg/distilkobert/vocab.txt",
    },
}

# Every listed checkpoint uses the same positional-embedding size.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    ("monologg/kobert", "monologg/kobert-lm", "monologg/distilkobert"),
    512,
)

# Default constructor options per checkpoint (a separate dict for each entry).
PRETRAINED_INIT_CONFIGURATION = {
    checkpoint: {"do_lower_case": False}
    for checkpoint in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
}

# Marker character that `convert_tokens_to_string` turns back into a space.
SPIECE_UNDERLINE = "▁"
58
+
59
+
60
+ class KoBertTokenizer(PreTrainedTokenizer):
61
+ """
62
+ SentencePiece based tokenizer. Peculiarities:
63
+ - requires `SentencePiece <https://github.com/google/sentencepiece>`_
64
+ """
65
+
66
+ vocab_files_names = VOCAB_FILES_NAMES
67
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
68
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
69
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
70
+
71
def __init__(
    self,
    vocab_file,
    vocab_txt,
    do_lower_case=False,
    remove_space=True,
    keep_accents=False,
    unk_token="[UNK]",
    sep_token="[SEP]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    mask_token="[MASK]",
    **kwargs,
):
    """
    Load the SentencePiece model and the plain-text vocabulary.

    Args:
        vocab_file: path of the SentencePiece model handed to ``SentencePieceProcessor.Load``.
        vocab_txt: path of a UTF-8 text file with one token per line; the (0-based)
            line number becomes the id of the token.
        do_lower_case: lower-case the text in ``preprocess_text``.
        remove_space: collapse whitespace runs and trim the text in ``preprocess_text``.
        keep_accents: when False, ``preprocess_text`` strips combining marks after NFKD.
        unk_token, sep_token, pad_token, cls_token, mask_token: special tokens,
            forwarded to the base class constructor.
        **kwargs: forwarded unchanged to the base class constructor.

    Raises:
        ImportError: if the ``sentencepiece`` package is not installed.
    """
    # Fail with a clear message right away. The previous code only logged a
    # warning and then crashed with a NameError on `spm` a few lines later.
    try:
        import sentencepiece as spm
    except ImportError as err:
        raise ImportError(
            "You need to install SentencePiece to use KoBertTokenizer: "
            "https://github.com/google/sentencepiece\n"
            "pip install sentencepiece"
        ) from err

    self.do_lower_case = do_lower_case
    self.remove_space = remove_space
    self.keep_accents = keep_accents
    self.vocab_file = vocab_file
    self.vocab_txt = vocab_txt

    # Build vocab
    self.token2idx = {}
    self.idx2token = []
    with open(vocab_txt, "r", encoding="utf-8") as f:
        for idx, token in enumerate(f):
            token = token.strip()
            self.token2idx[token] = idx
            self.idx2token.append(token)

    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(vocab_file)

    # The base constructor runs last on purpose: newer transformers releases
    # register the special tokens inside it and call `get_vocab()` while doing
    # so, which needs `token2idx` to exist already. Older releases do not touch
    # any of the attributes set above, so the order is safe for them as well.
    super().__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs,
    )
119
+
120
@property
def vocab_size(self):
    """Number of entries in the id-to-token list (``idx2token``)."""
    vocabulary = self.idx2token
    return len(vocabulary)
123
+
124
def get_vocab(self):
    """Return a new dict holding the base vocabulary overlaid with the added tokens."""
    base_vocab = self.token2idx
    extra_tokens = self.added_tokens_encoder
    # Entries of `extra_tokens` win on key collisions; neither input is mutated.
    return dict(base_vocab, **extra_tokens)
126
+
127
def __getstate__(self):
    """Return a shallow copy of the instance dict with ``sp_model`` replaced by None.

    ``__setstate__`` recreates the processor from ``vocab_file``.
    """
    return {**self.__dict__, "sp_model": None}
131
+
132
def __setstate__(self, d):
    """
    Restore the instance from ``d`` and reload the SentencePiece model.

    Args:
        d: state dict as produced by ``__getstate__`` (``sp_model`` is None in it).

    Raises:
        ImportError: if the ``sentencepiece`` package is not installed.
    """
    self.__dict__ = d
    # Raise instead of logging: the previous code continued after the warning
    # and failed with a NameError on `spm` on the very next statement.
    try:
        import sentencepiece as spm
    except ImportError as err:
        raise ImportError(
            "You need to install SentencePiece to use KoBertTokenizer: "
            "https://github.com/google/sentencepiece\n"
            "pip install sentencepiece"
        ) from err
    # Rebuild the processor from the stored path; it is not part of the state.
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
143
+
144
def preprocess_text(self, inputs):
    """Normalise raw text before it is handed to SentencePiece.

    Steps, in order:
      1. if ``remove_space``: trim the text and collapse whitespace runs to one space;
      2. replace the two-character quote forms `` and '' with a plain double quote;
      3. unless ``keep_accents``: NFKD-decompose and drop combining characters;
      4. if ``do_lower_case``: lower-case the result.
    """
    text = " ".join(inputs.strip().split()) if self.remove_space else inputs
    text = text.replace("``", '"')
    text = text.replace("''", '"')

    if not self.keep_accents:
        decomposed = unicodedata.normalize("NFKD", text)
        text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    return text.lower() if self.do_lower_case else text
158
+
159
def _tokenize(self, text):
    """Split a string into SentencePiece pieces.

    A piece that ends in ``<digit>,`` is encoded a second time without the
    trailing comma (and without the underline marker), and the comma is then
    appended as a piece of its own.
    """
    normalized = self.preprocess_text(text)
    result = []
    for piece in self.sp_model.encode(normalized, out_type=str):
        digit_then_comma = len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit()
        if not digit_then_comma:
            result.append(piece)
            continue

        stem = piece[:-1].replace(SPIECE_UNDERLINE, "")
        sub_pieces = self.sp_model.EncodeAsPieces(stem)
        # The second encode may put an underline marker in front that the
        # original piece did not have; remove it again in that case.
        if piece[0] != SPIECE_UNDERLINE and sub_pieces[0][0] == SPIECE_UNDERLINE:
            if len(sub_pieces[0]) == 1:
                sub_pieces = sub_pieces[1:]
            else:
                sub_pieces[0] = sub_pieces[0][1:]
        sub_pieces.append(piece[-1])
        result.extend(sub_pieces)

    return result
178
+
179
def _convert_token_to_id(self, token):
    """Look up the id of ``token``; tokens missing from the vocab map to the id of ``unk_token``."""
    # The fallback id is resolved first, exactly as the one-line form does.
    fallback_id = self.token2idx[self.unk_token]
    return self.token2idx.get(token, fallback_id)
182
+
183
def _convert_id_to_token(self, index):
    """Return the token stored at position ``index`` of the id-to-token list."""
    id_to_token = self.idx2token
    return id_to_token[index]
186
+
187
def convert_tokens_to_string(self, tokens):
    """Join sub-word tokens into one string, turning the underline marker into spaces."""
    joined = "".join(tokens)
    spaced = joined.replace(SPIECE_UNDERLINE, " ")
    # Strip the space produced by a marker at the very start (or end).
    return spaced.strip()
191
+
192
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Surround one or two id lists with the CLS and SEP ids.

    Layout of the returned list:
        single sequence:    [CLS] X [SEP]
        pair of sequences:  [CLS] A [SEP] B [SEP]
    """
    head = [self.cls_token_id]
    tail = [self.sep_token_id]
    first_part = head + token_ids_0 + tail
    if token_ids_1 is None:
        return first_part
    return first_part + token_ids_1 + tail
205
+
206
def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
    """
    Build a mask that marks the positions of special tokens.

    Args:
        token_ids_0: list of ids. Must not contain special tokens unless
            ``already_has_special_tokens`` is True.
        token_ids_1: optional second list of ids for sequence pairs. Must be
            None when ``already_has_special_tokens`` is True.
        already_has_special_tokens: set to True if ``token_ids_0`` is already
            formatted with special tokens. The mask then has the same length
            as ``token_ids_0`` and marks only the CLS and SEP ids.

    Returns:
        A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        (The previous docstring stated the opposite of what the code returns.)

    Raises:
        ValueError: if ``token_ids_1`` is given together with ``already_has_special_tokens=True``.
    """
    if already_has_special_tokens:
        if token_ids_1 is not None:
            raise ValueError(
                "You should not supply a second sequence if the provided sequence of "
                "ids is already formated with special tokens for the model."
            )
        special_ids = (self.sep_token_id, self.cls_token_id)
        return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

    # Layout without pre-existing specials: [CLS] A [SEP] (B [SEP])
    mask = [1] + [0] * len(token_ids_0) + [1]
    if token_ids_1 is not None:
        mask += [0] * len(token_ids_1) + [1]
    return mask
236
+
237
def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
    """
    Produce segment ids for a single sequence or a sequence pair.

    The span ``[CLS] A [SEP]`` is labelled 0 and, when a second sequence is
    given, the span ``B [SEP]`` is labelled 1:

        [CLS] a a a [SEP] b b [SEP]
          0   0 0 0   0   1 1   1

    With ``token_ids_1`` set to None only the zeros are returned.
    """
    cls_ids = [self.cls_token_id]
    sep_ids = [self.sep_token_id]
    first_segment = [0] * len(cls_ids + token_ids_0 + sep_ids)
    if token_ids_1 is None:
        return first_segment
    second_segment = [1] * len(token_ids_1 + sep_ids)
    return first_segment + second_segment
250
+
251
def save_vocabulary(self, save_directory, filename_prefix=None):
    """Save the sentencepiece vocabulary (copy original file) and the vocab.txt file
    to a directory.

    Args:
        save_directory: existing directory that receives both files.
        filename_prefix: optional prefix; when given, both file names become
            ``<prefix>-<name>``. The parameter is accepted because newer
            transformers releases pass it as a keyword when saving a tokenizer;
            without it that call fails with a TypeError. Omitting it keeps the
            previous file names.

    Returns:
        A tuple ``(sentencepiece_model_path, vocab_txt_path)``, or None (after
        logging an error) when ``save_directory`` is not a directory.
    """
    if not os.path.isdir(save_directory):
        logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
        return

    prefix = filename_prefix + "-" if filename_prefix else ""

    # 1. Save sentencepiece model
    out_vocab_model = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

    # Skip the copy when saving into the directory the model was loaded from.
    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_model):
        copyfile(self.vocab_file, out_vocab_model)

    # 2. Save vocab.txt
    index = 0
    out_vocab_txt = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_txt"])
    with open(out_vocab_txt, "w", encoding="utf-8") as writer:
        # Tokens are written in id order, one per line, so the line number equals the id.
        for token, token_index in sorted(self.token2idx.items(), key=lambda kv: kv[1]):
            if index != token_index:
                logger.warning(
                    "Saving vocabulary to {}: vocabulary indices are not consecutive."
                    " Please check that the vocabulary is not corrupted!".format(out_vocab_txt)
                )
                index = token_index
            writer.write(token + "\n")
            index += 1

    return out_vocab_model, out_vocab_txt