tokenization.py
Browse files- tokenization.py +82 -0
tokenization.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
@file : tokenization.py
|
3 |
+
@author : xiaolu
|
4 |
+
@email : [email protected]
|
5 |
+
@time : 2022-02-28
|
6 |
+
"""
|
7 |
+
import jieba
|
8 |
+
from transformers import BasicTokenizer, BertTokenizer
|
9 |
+
|
10 |
+
|
11 |
+
class CustomBasicTokenizer(BasicTokenizer):
    """BasicTokenizer that keeps jieba-segmented words found in ``vocab`` whole.

    Only ``_tokenize_chinese_chars`` is overridden; every other behavior is
    inherited from ``BasicTokenizer``.
    """

    def __init__(
        self,
        vocab,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
    ):
        """Forward the standard options to the parent and remember ``vocab``.

        Args:
            vocab: Container supporting ``in``; it is consulted to decide
                whether a segmented word is kept as a single unit.
            do_lower_case: Passed through to ``BasicTokenizer``.
            never_split: Passed through to ``BasicTokenizer``.
            tokenize_chinese_chars: Passed through to ``BasicTokenizer``.
            strip_accents: Passed through to ``BasicTokenizer``.
        """
        super().__init__(
            do_lower_case=do_lower_case,
            never_split=never_split,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
        )
        self.vocab = vocab

    def _tokenize_chinese_chars(self, text):
        """Insert spaces around vocabulary words and lone Chinese characters.

        Steps:
            1. Pre-segment ``text`` with ``jieba.cut`` (HMM disabled) to get
               the words ``[w1, w2, ..., wl]``.
            2. For each word: if it is in ``self.vocab`` it is emitted whole,
               padded with a space on each side. Otherwise it is emitted
               character by character, padding only the characters that
               ``_is_chinese_char`` accepts.
            3. The pieces are concatenated in their original order.

        Args:
            text: The string to process.

        Returns:
            The re-assembled string with the inserted spaces.
        """
        # NOTE(review): the vocab lookup uses the word exactly as jieba yields
        # it; if lowercasing is applied only later by the caller, mixed-case
        # words may miss the vocab here -- confirm against BasicTokenizer.
        pieces = []
        for word in jieba.cut(text, HMM=False):
            if word in self.vocab:
                pieces.extend((" ", word, " "))
                continue
            # Word is not in the vocabulary: fall back to per-character handling.
            for ch in word:
                if self._is_chinese_char(ord(ch)):
                    pieces.extend((" ", ch, " "))
                else:
                    pieces.append(ch)
        return "".join(pieces)
|
47 |
+
|
48 |
+
|
49 |
+
class WoBertTokenizer(BertTokenizer):
    """BertTokenizer that uses ``CustomBasicTokenizer`` for basic tokenization.

    All constructor arguments are forwarded unchanged to ``BertTokenizer``.
    When ``self.do_basic_tokenize`` is truthy after the parent constructor
    runs, ``self.basic_tokenizer`` is replaced with a ``CustomBasicTokenizer``
    built from ``self.vocab``.
    """

    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        """Initialise the parent tokenizer, then swap in the custom basic tokenizer.

        Args:
            vocab_file: Forwarded positionally to ``BertTokenizer``.
            do_lower_case: Forwarded to the parent and to the custom basic
                tokenizer.
            do_basic_tokenize: Forwarded to the parent.
            never_split: Forwarded to the parent and to the custom basic
                tokenizer.
            unk_token: Forwarded to the parent.
            sep_token: Forwarded to the parent.
            pad_token: Forwarded to the parent.
            cls_token: Forwarded to the parent.
            mask_token: Forwarded to the parent.
            tokenize_chinese_chars: Forwarded to the parent and to the custom
                basic tokenizer.
            strip_accents: Forwarded to the parent and to the custom basic
                tokenizer.
            **kwargs: Any further keyword arguments, forwarded to the parent.
        """
        super().__init__(
            vocab_file,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # Nothing to replace when basic tokenization is switched off.
        if not self.do_basic_tokenize:
            return

        # self.vocab is presumably populated by the parent constructor above
        # -- verify against the installed transformers version.
        basic_options = {
            "do_lower_case": do_lower_case,
            "never_split": never_split,
            "tokenize_chinese_chars": tokenize_chinese_chars,
            "strip_accents": strip_accents,
        }
        self.basic_tokenizer = CustomBasicTokenizer(vocab=self.vocab, **basic_options)
|