chengzl18 committed
Commit 22db59c
1 Parent(s): 5b6e56f

Upload with huggingface_hub

cctokenizer.py CHANGED
@@ -1,12 +1,11 @@
-"""Tokenization classes for THUBert."""
-
-from typing import List, Optional, Tuple, Union
+"""Tokenization classes for ChineseCharTokenizer."""
 
+from typing import Optional, Tuple, Union
 from transformers import BertTokenizer
 import numpy as np
 import os
 import re
-
+import shutil
 
 # https://www.ling.upenn.edu/courses/Spring_2003/ling538/UnicodeRanges.html
 # https://www.microfocus.com/documentation/idol/IDOL/Servers/IDOLServer/11.2/Guides/html/English/expert/Content/IDOLExpert/Languages/Script_Ranges.htm
@@ -259,7 +258,8 @@ class ChineseCharTokenizer(BertTokenizer):
             [token for token in self.enclosed_tokens if len(token) == 6],
             [token for token in self.enclosed_tokens if len(token) == 7]
         ]
-        self.replace_map = load_json(os.path.join(os.path.dirname(vocab_file), 'replace.json'))
+        self.dir = os.path.join(os.path.dirname(vocab_file))
+        self.replace_map = load_json(os.path.join(self.dir, 'replace.json'))
 
     # # [EOS] is equivalent to a comma or newline; no need to treat it as a special token
     def convert_token_to_representative(self, token: str) -> str:
@@ -267,8 +267,8 @@ class ChineseCharTokenizer(BertTokenizer):
         if token in self.vocab:
             return token
         else:
-            assert len(token) == 1
-            if re.match(r'\s', token):  # matches \u2003, \t, etc.
+            assert len(token) == 1, token
+            if re.match(r'\s', token):  # matches \u2003, \t, etc.
                 return ' '
             v = ord(token)
             if _is_chinese_char(v):
@@ -279,6 +279,7 @@ class ChineseCharTokenizer(BertTokenizer):
             else:
                 return '[UNK]'
 
+    # does BERT's tokenize add CLS?
     def _tokenize(self, text):
         # if no special symbols were added by hand, this tokenize is unnecessary: list(text) is already the tokenization result
         split_tokens = []
@@ -299,24 +300,13 @@ class ChineseCharTokenizer(BertTokenizer):
         return split_tokens
 
     def _convert_token_to_id(self, token):
-        return self.vocab.get(token, self.convert_token_to_representative(token))
+        return self.vocab.get(self.convert_token_to_representative(token), self.vocab.get(self.unk_token))  # the old line was a bug: convert_token_to_representative returns a token, not an id
 
     def convert_tokens_to_string(self, tokens):
         return ''.join(tokens)
 
-
-if __name__ == '__main__':
-    tokenizer: ChineseCharTokenizer = ChineseCharTokenizer.from_pretrained("/data03/private/chengzhili/pretrain/bert/tokenizer/bert-base-chinese-char-cm")
-    for c in '\tt \nのᄌབོяا ㄞ∥∩①₁我🪐㓨 im Ok O走吧鏍𩐏':
-        print(c, tokenizer.convert_token_to_representative(c))
-    print(tokenizer.tokenize('tの 我བོя🪐㓨 im Ok O[MASK][SEP]'))
-    print('')
-    print(tokenizer.enclosed_tokens_by_len)
-
-    text = 'བོ 我[MASK]я🪐'
-    tokens = tokenizer.tokenize(text)
-    print(tokens)  # བོ (Tibetan, two unicode code points), space, 我, [MASK], я (Russian), 🪐
-    input_ids = tokenizer.convert_tokens_to_ids(tokens)
-    print(input_ids)  # [UNK], , 我, [MASK], [U_RUS], [U_EMO]
-    inputs = tokenizer(text)
-    print(inputs)
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], legacy_format: Optional[bool] = None, filename_prefix: Optional[str] = None, push_to_hub: bool = False, **kwargs) -> Tuple[str]:
+        ret = super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
+        shutil.copyfile(os.path.join(self.dir, 'replace.json'), f'{save_directory}/replace.json')
+        shutil.copyfile(os.path.join(self.dir, 'cctokenizer.py'), f'{save_directory}/cctokenizer.py')
+        return ret
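For context, here is a minimal usage sketch of the tokenizer above, adapted from the `__main__` demo removed in this commit. The repo id `chengzl18/cctokenizer` is taken from the updated `tokenizer_config.json`, `trust_remote_code=True` is required because the class is resolved through `auto_map`, and the outputs in the comments are only what the removed demo indicated, not verified results.

```python
from transformers import AutoTokenizer

# Load the custom tokenizer defined in cctokenizer.py (auto_map -> ChineseCharTokenizer).
tokenizer = AutoTokenizer.from_pretrained("chengzl18/cctokenizer", trust_remote_code=True)

# Out-of-vocab characters are mapped to a per-script representative before id lookup.
for c in '\tのя🪐我':
    print(repr(c), tokenizer.convert_token_to_representative(c))

text = 'བོ 我[MASK]я🪐'
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)     # per the removed demo: two Tibetan code points, ' ', 我, [MASK], я, 🪐
print(input_ids)  # per the removed demo: ids for [UNK], ' ', 我, [MASK], [U_RUS], [U_EMO]
```

With the fixed `_convert_token_to_id`, an out-of-vocab character now resolves to the id of its representative (falling back to `[UNK]`) instead of returning the representative string itself.
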
config.json CHANGED
@@ -1,4 +1,5 @@
 {
+  "_name_or_path": "output/2023-04-07_05-25-49/save/step_480000",
   "architectures": [
     "BertForMaskedLM"
   ],
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:914b38d56f0084daeb0388f7a3282b780f9fd199162188471cad63952678f77f
+oid sha256:6779545362cf64328e7074b88796295be61c620c2bbede8e03847c472f1add97
 size 382042873
special_tokens_map.json CHANGED
@@ -1,5 +1,6 @@
 {
   "cls_token": "[CLS]",
+  "eos_token": "[EOS]",
   "mask_token": "[MASK]",
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
tokenizer_config.json CHANGED
@@ -1,10 +1,16 @@
 {
+  "auto_map": {
+    "AutoTokenizer": [
+      "cctokenizer.ChineseCharTokenizer",
+      null
+    ]
+  },
   "cls_token": "[CLS]",
   "do_basic_tokenize": true,
   "do_lower_case": true,
   "mask_token": "[MASK]",
   "model_max_length": 1000000000000000019884624838656,
-  "name_or_path": "chengzl18/bert-base-chinese-char-cm",
+  "name_or_path": "chengzl18/cctokenizer",
   "never_split": null,
   "pad_token": "[PAD]",
   "sep_token": "[SEP]",
@@ -12,11 +18,5 @@
   "strip_accents": null,
   "tokenize_chinese_chars": true,
   "tokenizer_class": "ChineseCharTokenizer",
-  "auto_map": {
-    "AutoTokenizer": [
-      "cctokenizer.ChineseCharTokenizer",
-      null
-    ]
-  },
   "unk_token": "[UNK]"
 }
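Taken together, the `auto_map` entry, the `eos_token` addition, and the new `save_pretrained` override are what keep a re-saved copy of this tokenizer loadable: `AutoTokenizer` resolves `cctokenizer.ChineseCharTokenizer` via `trust_remote_code`, and saving now also copies `replace.json` and `cctokenizer.py` next to the vocab. A round-trip sketch, with the repo id assumed as above and an illustrative local path:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("chengzl18/cctokenizer", trust_remote_code=True)
print(tok.eos_token, tok.eos_token_id)  # "[EOS]" and its vocab id, exposed after the special_tokens_map change

# save_pretrained (overridden in cctokenizer.py above) also copies replace.json and
# cctokenizer.py into the target directory, so the copy loads the same way.
tok.save_pretrained("./cctokenizer-copy")
reloaded = AutoTokenizer.from_pretrained("./cctokenizer-copy", trust_remote_code=True)
print(reloaded.tokenize('我 Ok'))
```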