Upload with huggingface_hub
cctokenizer.py  CHANGED  (+1 -3)
@@ -251,7 +251,6 @@ class ChineseCharTokenizer(BertTokenizer):
     vocab_files_names = {"vocab_file": "vocab.txt", 'mapping_file': "replace.json"}
 
     def __init__(self, vocab_file, *args, **kwargs):
-        print(vocab_file)
         super(ChineseCharTokenizer, self).__init__(vocab_file, *args, **kwargs)
         self.unicoder_ranges = get_unicode_ranges()
         self.enclosed_tokens = {token for token in self.vocab if token[0] == '[' and token[-1] == ']' and 'unused' not in token}
@@ -260,8 +259,7 @@ class ChineseCharTokenizer(BertTokenizer):
             [token for token in self.enclosed_tokens if len(token) == 6],
             [token for token in self.enclosed_tokens if len(token) == 7]
         ]
-
-        self.replace_map = load_json(os.path.join(self.name_or_path, 'replace.json'))
+        self.replace_map = load_json(os.path.join(os.path.dirname(vocab_file), 'replace.json'))
 
     # # [EOS] acts like a comma or newline; no need to treat it as a special token
     def convert_token_to_representative(self, token: str) -> str:
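The substantive change is how `replace.json` is located: the old code joined it onto `self.name_or_path`, the new code joins it onto `os.path.dirname(vocab_file)` (the dropped `print(vocab_file)` looks like leftover debug output). A plausible motivation, inferred rather than stated in the commit: when a tokenizer is loaded from the Hub, `name_or_path` holds a repo id such as `user/repo`, not a local directory, so joining it with `'replace.json'` yields a path that does not exist on disk; `vocab_file`, by contrast, is always resolved to a real local file, so its parent directory reliably contains the sibling `replace.json` that `vocab_files_names` declares. A minimal sketch of the difference, using hypothetical paths:

import os

# Hypothetical values for illustration only; none of these come from the commit.

# Case 1: tokenizer loaded from a local directory -- both lookups find the file.
name_or_path = "./my-tokenizer"              # value passed to from_pretrained()
vocab_file = "./my-tokenizer/vocab.txt"      # resolved path handed to __init__

print(os.path.join(name_or_path, "replace.json"))
# ./my-tokenizer/replace.json  (exists)
print(os.path.join(os.path.dirname(vocab_file), "replace.json"))
# ./my-tokenizer/replace.json  (same file)

# Case 2: tokenizer loaded from the Hub -- name_or_path is a repo id, not a path,
# while vocab_file has been resolved into the local download cache.
name_or_path = "some-user/some-repo"
vocab_file = "/home/me/.cache/huggingface/hub/snapshot/vocab.txt"

print(os.path.join(name_or_path, "replace.json"))
# some-user/some-repo/replace.json  (no such local path -> load_json would fail)
print(os.path.join(os.path.dirname(vocab_file), "replace.json"))
# /home/me/.cache/huggingface/hub/snapshot/replace.json  (next to the cached vocab.txt)

Because `replace.json` is registered under `vocab_files_names` as `mapping_file`, it is downloaded into the same cache directory as `vocab.txt`, which is what makes the `os.path.dirname(vocab_file)` lookup work in both cases.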