Upload with huggingface_hub
- cctokenizer.py +14 -24
- config.json +1 -0
- pytorch_model.bin +1 -1
- special_tokens_map.json +1 -0
- tokenizer_config.json +7 -7
cctokenizer.py
CHANGED
@@ -1,12 +1,11 @@
-"""Tokenization classes for
-
-from typing import List, Optional, Tuple, Union
+"""Tokenization classes for ChineseCharTokenizer."""
 
+from typing import Optional, Tuple, Union
 from transformers import BertTokenizer
 import numpy as np
 import os
 import re
-
+import shutil
 
 # https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
 # https://www.microfocus.com/documentation/idol/IDOL/Servers/IDOLServer/11.2/Guides/html/English/expert/Content/IDOLExpert/Languages/Script_Ranges.htm
@@ -259,7 +258,8 @@ class ChineseCharTokenizer(BertTokenizer):
             [token for token in self.enclosed_tokens if len(token) == 6],
             [token for token in self.enclosed_tokens if len(token) == 7]
         ]
-        self.
+        self.dir = os.path.join(os.path.dirname(vocab_file))
+        self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))
 
     # # [EOS] behaves like a comma or a newline, so it does not need to be treated as a special token
     def convert_token_to_representative(self, token: str) -> str:
@@ -267,8 +267,8 @@ class ChineseCharTokenizer(BertTokenizer):
         if token in self.vocab:
             return token
         else:
-            assert len(token) == 1
-            if re.match(r'\s', token):
+            assert len(token) == 1, token
+            if re.match(r'\s', token):  # matches \u2003, \t, etc.
                 return ' '
             v = ord(token)
             if _is_chinese_char(v):
@@ -279,6 +279,7 @@ class ChineseCharTokenizer(BertTokenizer):
             else:
                 return '[UNK]'
 
+    # Does BERT's tokenize add a CLS token?
     def _tokenize(self, text):
         # If there are no manually added special symbols, this tokenize is unnecessary; list(text) already gives the tokenized result
         split_tokens = []
@@ -299,24 +300,13 @@ class ChineseCharTokenizer(BertTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        return self.vocab.get(token, self.
+        return self.vocab.get(self.convert_token_to_representative(token), self.vocab.get(self.unk_token))  # out-of-vocab tokens fall back to their representative, then to [UNK]
 
     def convert_tokens_to_string(self, tokens):
         return ''.join(tokens)
 
-
-
-
-
-
-print(tokenizer.tokenize('tの 我བོя🪐㓨 im Ok O[MASK][SEP]'))
-print('')
-print(tokenizer.enclosed_tokens_by_len)
-
-text = 'བོ 我[MASK]я🪐'
-tokens = tokenizer.tokenize(text)
-print(tokens)  # བོ (Tibetan, two code points), space, 我, [MASK], я (Russian), 🪐
-input_ids = tokenizer.convert_tokens_to_ids(tokens)
-print(input_ids)  # [UNK], , 我, [MASK], [U_RUS], [U_EMO]
-inputs = tokenizer(text)
-print(inputs)
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
+        ret = super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
+        shutil.copyfile(os.path.join(self.dir, 'replace.json'), f'{save_directory}/replace.json')
+        shutil.copyfile(os.path.join(self.dir, 'cctokenizer.py'), f'{save_directory}/cctokenizer.py')
+        return ret
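Together with the auto_map entry in tokenizer_config.json below, the files above are enough to load this tokenizer through AutoTokenizer. A minimal usage sketch, assuming the repo id chengzl18/cctokenizer (the name_or_path recorded in tokenizer_config.json) and reusing the demo text from the removed module-level code:

from transformers import AutoTokenizer

# trust_remote_code is needed because auto_map points AutoTokenizer at the
# custom class cctokenizer.ChineseCharTokenizer shipped with the repo.
tokenizer = AutoTokenizer.from_pretrained('chengzl18/cctokenizer', trust_remote_code=True)

text = 'བོ 我[MASK]я🪐'
tokens = tokenizer.tokenize(text)                     # e.g. བོ, ' ', 我, [MASK], я, 🪐
input_ids = tokenizer.convert_tokens_to_ids(tokens)   # rare scripts map to representatives such as [U_RUS], [U_EMO]
print(tokens, input_ids)
print(tokenizer(text))

# The overridden save_pretrained also copies replace.json and cctokenizer.py,
# so the saved directory can be reloaded with the same AutoTokenizer call.
tokenizer.save_pretrained('./cctokenizer-local')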
config.json
CHANGED
@@ -1,4 +1,5 @@
 {
+  "_name_or_path": "output/2023-04-07_05-25-49/save/step_480000",
   "architectures": [
     "BertForMaskedLM"
   ],
pytorch_model.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6779545362cf64328e7074b88796295be61c620c2bbede8e03847c472f1add97
 size 382042873
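Only the Git LFS pointer changes here: it records the sha256 and byte size of the new weights blob. A downloaded copy can be checked against those values; a small sketch, with a hypothetical local path:

import hashlib
import os

path = 'pytorch_model.bin'  # hypothetical path to the downloaded LFS object
h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        h.update(chunk)

# Expected values taken from the pointer file above.
assert h.hexdigest() == '6779545362cf64328e7074b88796295be61c620c2bbede8e03847c472f1add97'
assert os.path.getsize(path) == 382042873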
special_tokens_map.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
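Registering [EOS] here exposes it through the standard special-token properties once the tokenizer is loaded. A quick check, assuming the tokenizer loaded above and that [EOS] is present in vocab.txt:

print(tokenizer.eos_token)                       # '[EOS]'
print(tokenizer.convert_tokens_to_ids('[EOS]'))  # its vocabulary id rather than the [UNK] id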
tokenizer_config.json
CHANGED
@@ -1,10 +1,16 @@
 {
+  "auto_map": {
+    "AutoTokenizer": [
+      "cctokenizer.ChineseCharTokenizer",
+      null
+    ]
+  },
   "cls_token": "[CLS]",
   "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "chengzl18/
+  "name_or_path": "chengzl18/cctokenizer",
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
@@ -12,11 +18,5 @@
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "ChineseCharTokenizer",
-  "auto_map": {
-    "AutoTokenizer": [
-      "cctokenizer.ChineseCharTokenizer",
-      null
-    ]
-  },
   "unk_token": "[UNK]"
 }
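The relocated auto_map block is what wires AutoTokenizer to the custom code: the two-element list names the slow tokenizer class (module.Class, resolved inside cctokenizer.py) and the fast one, and null means no fast implementation is provided. A quick sanity check of that resolution, under the same repo-id assumption as above:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('chengzl18/cctokenizer', trust_remote_code=True)
print(type(tok).__name__)  # ChineseCharTokenizer
print(tok.is_fast)         # False, since the fast entry in auto_map is null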