Upload with huggingface_hub
cctokenizer.py  CHANGED  (+1 -3)
@@ -251,7 +251,6 @@ class ChineseCharTokenizer(BertTokenizer):
     vocab_files_names = {"vocab_file": "vocab.txt", 'mapping_file': "replace.json"}
 
     def __init__(self, vocab_file, *args, **kwargs):
-        print(vocab_file)
         super(ChineseCharTokenizer, self).__init__(vocab_file, *args, **kwargs)
         self.unicoder_ranges = get_unicode_ranges()
         self.enclosed_tokens = {token for token in self.vocab if token[0] == '[' and token[-1] == ']' and 'unused' not in token}
@@ -260,8 +259,7 @@ class ChineseCharTokenizer(BertTokenizer):
             [token for token in self.enclosed_tokens if len(token) == 6],
             [token for token in self.enclosed_tokens if len(token) == 7]
         ]
-
-        self.replace_map = load_json(os.path.join(self.name_or_path, 'replace.json'))
+        self.replace_map = load_json(os.path.join(os.path.dirname(vocab_file), 'replace.json'))
 
     # # [EOS] acts like a comma or newline; no need to treat it as a special token
     def convert_token_to_representative(self, token: str) -> str:
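The substantive change is how `replace.json` is located: the old code joined it onto `self.name_or_path`, the new code joins it onto `os.path.dirname(vocab_file)` (the dropped `print(vocab_file)` looks like leftover debug output). A plausible motivation, inferred rather than stated in the commit: when a tokenizer is loaded from the Hub, `name_or_path` holds a repo id such as `user/repo`, not a local directory, so joining it with `'replace.json'` yields a path that does not exist on disk; `vocab_file`, by contrast, is always resolved to a real local file, so its parent directory reliably contains the sibling `replace.json` that `vocab_files_names` declares. A minimal sketch of the difference, using hypothetical paths:

import os

# Hypothetical values for illustration only; none of these come from the commit.

# Case 1: tokenizer loaded from a local directory -- both lookups find the file.
name_or_path = "./my-tokenizer"              # value passed to from_pretrained()
vocab_file = "./my-tokenizer/vocab.txt"      # resolved path handed to __init__

print(os.path.join(name_or_path, "replace.json"))
# ./my-tokenizer/replace.json  (exists)
print(os.path.join(os.path.dirname(vocab_file), "replace.json"))
# ./my-tokenizer/replace.json  (same file)

# Case 2: tokenizer loaded from the Hub -- name_or_path is a repo id, not a path,
# while vocab_file has been resolved into the local download cache.
name_or_path = "some-user/some-repo"
vocab_file = "/home/me/.cache/huggingface/hub/snapshot/vocab.txt"

print(os.path.join(name_or_path, "replace.json"))
# some-user/some-repo/replace.json  (no such local path -> load_json would fail)
print(os.path.join(os.path.dirname(vocab_file), "replace.json"))
# /home/me/.cache/huggingface/hub/snapshot/replace.json  (next to the cached vocab.txt)

Because `replace.json` is registered under `vocab_files_names` as `mapping_file`, it is downloaded into the same cache directory as `vocab.txt`, which is what makes the `os.path.dirname(vocab_file)` lookup work in both cases.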