luxiao committed
Commit dfe96b1 · 1 Parent(s): 08bee6c

tokenization.py

Files changed (1)
  1. tokenization.py +82 -0
tokenization.py ADDED
@@ -0,0 +1,82 @@
+ """
+ @file   : tokenization.py
+ @author : xiaolu
+ @email  : [email protected]
+ @time   : 2022-02-28
+ """
+ import jieba
+ from transformers import BasicTokenizer, BertTokenizer
+ 
+ 
+ class CustomBasicTokenizer(BasicTokenizer):
+     # A BasicTokenizer that is whole-word aware: jieba-segmented words found in the
+     # vocabulary are kept intact instead of being split into single characters.
+     def __init__(self,
+                  vocab,
+                  do_lower_case=True,
+                  never_split=None,
+                  tokenize_chinese_chars=True,
+                  strip_accents=None):
+         super().__init__(do_lower_case=do_lower_case,
+                          never_split=never_split,
+                          tokenize_chinese_chars=tokenize_chinese_chars,
+                          strip_accents=strip_accents)
+ 
+         self.vocab = vocab
+ 
+     def _tokenize_chinese_chars(self, text):
+         output = []
+         # 1. For an input sentence s, first segment it with pre_tokenize (jieba here)
+         #    to get [w1, w2, ..., wl];
+         # 2. For each wi, keep it whole if it is in the vocabulary; otherwise split wi
+         #    again with BERT's built-in tokenization;
+         # 3. Concatenate the tokenization results of each wi in order as the final result.
+         for wholeword in jieba.cut(text, HMM=False):
+             if wholeword in self.vocab:
+                 # The whole word is in the vocab: emit it as one token, padded with spaces.
+                 output.append(" ")
+                 output.append(wholeword)
+                 output.append(" ")
+             else:
+                 # Otherwise fall back to character level: Chinese characters become
+                 # individual tokens, everything else is passed through unchanged.
+                 for char in wholeword:
+                     cp = ord(char)
+                     if self._is_chinese_char(cp):
+                         output.append(" ")
+                         output.append(char)
+                         output.append(" ")
+                     else:
+                         output.append(char)
+         return "".join(output)
+ 
+ 
+ class WoBertTokenizer(BertTokenizer):
+     def __init__(self,
+                  vocab_file,
+                  do_lower_case=True,
+                  do_basic_tokenize=True,
+                  never_split=None,
+                  unk_token="[UNK]",
+                  sep_token="[SEP]",
+                  pad_token="[PAD]",
+                  cls_token="[CLS]",
+                  mask_token="[MASK]",
+                  tokenize_chinese_chars=True,
+                  strip_accents=None,
+                  **kwargs):
+         super().__init__(vocab_file,
+                          do_lower_case=do_lower_case,
+                          do_basic_tokenize=do_basic_tokenize,
+                          never_split=never_split,
+                          unk_token=unk_token,
+                          sep_token=sep_token,
+                          pad_token=pad_token,
+                          cls_token=cls_token,
+                          mask_token=mask_token,
+                          tokenize_chinese_chars=tokenize_chinese_chars,
+                          strip_accents=strip_accents,
+                          **kwargs)
+         if self.do_basic_tokenize:
+             # Swap in the jieba-aware basic tokenizer so that whole words present in
+             # the vocabulary are kept as single tokens.
+             self.basic_tokenizer = CustomBasicTokenizer(
+                 vocab=self.vocab,
+                 do_lower_case=do_lower_case,
+                 never_split=never_split,
+                 tokenize_chinese_chars=tokenize_chinese_chars,
+                 strip_accents=strip_accents,
+             )
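
A minimal usage sketch (not part of the commit): since WoBertTokenizer subclasses BertTokenizer, it can be loaded with the inherited from_pretrained from any local directory containing a WoBERT-style vocab.txt whose entries include whole words. The directory path and the sample sentence below are hypothetical.

# Illustrative only; "./wobert_pretrain" is a hypothetical local checkpoint directory
# containing vocab.txt (and tokenizer config) for a WoBERT-style vocabulary.
if __name__ == "__main__":
    tokenizer = WoBertTokenizer.from_pretrained("./wobert_pretrain")
    sentence = "今天天气不错"
    # Words that jieba segments and that appear in the vocab stay whole;
    # anything else falls back to character-level pieces.
    print(tokenizer.tokenize(sentence))
    print(tokenizer.encode(sentence))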