Upload with huggingface_hub

Files changed:
- cctokenizer.py: +14 -24
- config.json: +1 -0
- pytorch_model.bin: +1 -1
- special_tokens_map.json: +1 -0
- tokenizer_config.json: +7 -7
cctokenizer.py
CHANGED

@@ -1,12 +1,11 @@
-"""Tokenization classes for
-
-from typing import List, Optional, Tuple, Union
+"""Tokenization classes for ChineseCharTokenizer."""
 
+from typing import Optional, Tuple, Union
 from transformers import BertTokenizer
 import numpy as np
 import os
 import re
-
+import shutil
 
 # https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
 # https://www.microfocus.com/documentation/idol/IDOL/Servers/IDOLServer/11.2/Guides/html/English/expert/Content/IDOLExpert/Languages/Script_Ranges.htm

@@ -259,7 +258,8 @@ class ChineseCharTokenizer(BertTokenizer):
            [token for token in self.enclosed_tokens if len(token) == 6],
            [token for token in self.enclosed_tokens if len(token) == 7]
        ]
-        self.
+        self.dir = os.path.join(os.path.dirname(vocab_file))
+        self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))
 
    # # [EOS] acts like a comma or newline, so it need not be treated as a special token
    def convert_token_to_representative(self, token: str) -> str:

@@ -267,8 +267,8 @@ class ChineseCharTokenizer(BertTokenizer):
        if token in self.vocab:
            return token
        else:
-            assert len(token) == 1
-            if re.match(r'\s', token):
+            assert len(token) == 1, token
+            if re.match(r'\s', token):  # matches \u2003, \t, etc.
                return ' '
            v = ord(token)
            if _is_chinese_char(v):

@@ -279,6 +279,7 @@ class ChineseCharTokenizer(BertTokenizer):
            else:
                return '[UNK]'
 
+    # does BERT's tokenize add CLS?
    def _tokenize(self, text):
        # if no special symbols were added manually, this tokenize is unnecessary: list(text) already gives the tokenization result
        split_tokens = []

@@ -299,24 +300,13 @@ class ChineseCharTokenizer(BertTokenizer):
        return split_tokens
 
    def _convert_token_to_id(self, token):
-        return self.vocab.get(token, self.
+        return self.vocab.get(self.convert_token_to_representative(token), self.vocab.get(self.unk_token))  # BUG: convert_token_to_representative is not an id!
 
    def convert_tokens_to_string(self, tokens):
        return ''.join(tokens)
 
-
-
-
-
-
-print(tokenizer.tokenize('tの 我བོя🪐㓨 im Ok O[MASK][SEP]'))
-print('')
-print(tokenizer.enclosed_tokens_by_len)
-
-text = 'བོ 我[MASK]я🪐'
-tokens = tokenizer.tokenize(text)
-print(tokens)  # བོ (Tibetan, two code points), space, 我, [MASK], я (Russian), 🪐
-input_ids = tokenizer.convert_tokens_to_ids(tokens)
-print(input_ids)  # [UNK], , 我, [MASK], [U_RUS], [U_EMO]
-inputs = tokenizer(text)
-print(inputs)
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
+        ret = super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
+        shutil.copyfile(os.path.join(self.dir, 'replace.json'), f'{save_directory}/replace.json')
+        shutil.copyfile(os.path.join(self.dir, 'cctokenizer.py'), f'{save_directory}/cctokenizer.py')
+        return ret
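The reworked _convert_token_to_id routes every lookup through convert_token_to_representative, so single out-of-vocabulary characters appear to collapse to placeholder tokens (the removed test output mentions [U_RUS] and [U_EMO]) before the vocab lookup. Below is a self-contained sketch of that fallback idea only; the vocab and Unicode ranges are made up for illustration and are not the repo's actual tables:

import re

VOCAB = {' ': 0, '[UNK]': 1, '[U_RUS]': 2, '[U_EMO]': 3, '我': 4, '[MASK]': 5}

def representative(token: str) -> str:
    # tokens already in the vocab map to themselves
    if token in VOCAB:
        return token
    assert len(token) == 1, token
    if re.match(r'\s', token):           # any whitespace (\u2003, \t, ...) collapses to ' '
        return ' '
    v = ord(token)
    if 0x0400 <= v <= 0x04FF:            # Cyrillic block -> Russian placeholder (simplified)
        return '[U_RUS]'
    if 0x1F300 <= v <= 0x1FAFF:          # emoji blocks -> emoji placeholder (simplified)
        return '[U_EMO]'
    return '[UNK]'

def token_to_id(token: str) -> int:
    # mirror of _convert_token_to_id: map to a representative first, then look it up, [UNK] as fallback
    return VOCAB.get(representative(token), VOCAB['[UNK]'])

print([token_to_id(t) for t in ['我', 'я', '🪐', '\u2003']])  # [4, 2, 3, 0]

The real ranges come from the Unicode script references linked at the top of cctokenizer.py.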
config.json
CHANGED

@@ -1,4 +1,5 @@
 {
+  "_name_or_path": "output/2023-04-07_05-25-49/save/step_480000",
   "architectures": [
     "BertForMaskedLM"
   ],
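config.json only gains _name_or_path, pointing at the training output directory; architectures still lists BertForMaskedLM, so the checkpoint loads with the standard masked-LM head. A minimal sketch, assuming this commit's files sit in a local folder named "./checkpoint" (the path is illustrative):

from transformers import BertForMaskedLM

# "./checkpoint" is an assumed local directory containing this commit's files; a Hub repo id also works
model = BertForMaskedLM.from_pretrained("./checkpoint")
print(model.num_parameters())  # roughly the size of pytorch_model.bin divided by 4 if weights are stored in fp32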
pytorch_model.bin
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:6779545362cf64328e7074b88796295be61c620c2bbede8e03847c472f1add97
 size 382042873
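pytorch_model.bin is tracked with Git LFS, so the diff only updates the pointer file: a new sha256 oid with an unchanged size. A small, purely illustrative sketch for checking a downloaded copy against this pointer:

import hashlib, os

EXPECTED_SHA256 = "6779545362cf64328e7074b88796295be61c620c2bbede8e03847c472f1add97"
EXPECTED_SIZE = 382042873

def verify(path: str) -> bool:
    # stream the file so large checkpoints do not need to fit in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return os.path.getsize(path) == EXPECTED_SIZE and h.hexdigest() == EXPECTED_SHA256

print(verify("pytorch_model.bin"))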
special_tokens_map.json
CHANGED

@@ -1,5 +1,6 @@
 {
   "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
tokenizer_config.json
CHANGED

@@ -1,10 +1,16 @@
 {
+  "auto_map": {
+    "AutoTokenizer": [
+      "cctokenizer.ChineseCharTokenizer",
+      null
+    ]
+  },
   "cls_token": "[CLS]",
   "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "chengzl18/
+  "name_or_path": "chengzl18/cctokenizer",
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",

@@ -12,11 +18,5 @@
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "ChineseCharTokenizer",
-  "auto_map": {
-    "AutoTokenizer": [
-      "cctokenizer.ChineseCharTokenizer",
-      null
-    ]
-  },
   "unk_token": "[UNK]"
 }
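The auto_map block, now at the top of the file, binds AutoTokenizer to the ChineseCharTokenizer class shipped in cctokenizer.py, which is what makes the repo loadable without installing extra code. A minimal loading sketch; the repo id "chengzl18/cctokenizer" is inferred from name_or_path above and is an assumption:

from transformers import AutoTokenizer

# trust_remote_code=True lets transformers import cctokenizer.py from the repo,
# as declared by auto_map -> ["cctokenizer.ChineseCharTokenizer", null]
tok = AutoTokenizer.from_pretrained("chengzl18/cctokenizer", trust_remote_code=True)  # repo id assumed
print(type(tok).__name__)              # expected: ChineseCharTokenizer
print(tok.tokenize("བོ 我[MASK]я🪐"))   # mixed-script example taken from the removed test code in cctokenizer.py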