xcczach committed on
Commit
f4dfaa9
·
verified ·
1 Parent(s): adb8833

Upload chinese.py

Browse files
Files changed (1) hide show
  1. chinese.py +176 -0
chinese.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ import cn2an
5
+ from pypinyin import lazy_pinyin, Style
6
+
7
+ from .symbols import punctuation
8
+ from .tone_sandhi import ToneSandhi
9
+ from .text_normlization import TextNormalizer
10
+
11
def normalizer(x):
    """Return ``cn2an.transform(x, "an2cn")``.

    Per cn2an's documentation the ``"an2cn"`` mode rewrites Arabic numerals
    found in the text as Chinese numerals.
    """
    return cn2an.transform(x, "an2cn")
12
+
13
# Directory containing this module; the symbol table is looked up next to it.
current_file_path = os.path.dirname(__file__)
# Lazily populated cache for pinyin_to_symbol_map().
_pinyin_to_symbol_map = None


def pinyin_to_symbol_map():
    """Return the pinyin-to-symbol table, loading it on first use.

    The table is read from ``opencpop-strict.txt`` in the same directory as
    this module. Every non-blank line is split on tabs; the first field is
    the key and the second field is the value. The parsed dict is cached in
    the module-level ``_pinyin_to_symbol_map`` so the file is read only once.

    Returns:
        dict: mapping from the first tab-separated field to the second.

    Raises:
        OSError: if the table file cannot be opened.
        IndexError: if a non-blank line contains no tab separator.
    """
    global _pinyin_to_symbol_map
    if _pinyin_to_symbol_map is None:
        table_path = os.path.join(current_file_path, "opencpop-strict.txt")
        table = {}
        # The context manager closes the file handle deterministically.
        with open(table_path, encoding="utf-8") as table_file:
            for line in table_file:
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                fields = stripped.split("\t")
                table[fields[0]] = fields[1]
        _pinyin_to_symbol_map = table
    return _pinyin_to_symbol_map
23
+
24
+ import jieba_fast.posseg as psg
25
+
26
+
27
# Punctuation normalization table consumed by replace_punctuation(): every
# occurrence of a key in the input text is replaced by the mapped value.
# NOTE(review): as written, the ",", "!" and "?" entries map a character to
# itself. Possibly the keys were full-width punctuation that was normalized
# somewhere along the way -- confirm against the upstream file.
rep_map = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    # "...": "…",
    "$": ".",
    "/": ",",
    "—": "-",
}

# Shared ToneSandhi instance; _g2p() calls its pre_merge_for_modify() and
# modified_tone() methods.
tone_modifier = ToneSandhi()
44
+
45
+
46
def replace_punctuation(text):
    """Normalize punctuation in ``text`` and drop unsupported characters.

    Steps, in order:
      1. Replace "嗯" with "恩" and "呣" with "母".
      2. Replace every key of ``rep_map`` with its mapped value.
      3. Remove every run of characters that is neither in the range
         U+4E00..U+9FA5 nor one of the entries of ``punctuation``.

    Args:
        text: input string.

    Returns:
        The cleaned string.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")

    rep_pattern = re.compile("|".join(re.escape(key) for key in rep_map))
    replaced_text = rep_pattern.sub(lambda match: rep_map[match.group()], text)

    # Escape each punctuation entry before placing it inside the character
    # class, so that entries such as "-", "]", "^" or "\" are taken literally
    # instead of forming a range or closing the class.
    kept_punctuation = "".join(re.escape(p) for p in punctuation)
    replaced_text = re.sub(
        r"[^\u4e00-\u9fa5" + kept_punctuation + r"]+", "", replaced_text
    )

    return replaced_text
57
+
58
+
59
def g2p(text):
    """Convert ``text`` to phones by splitting it into sentences and calling ``_g2p``.

    The text is split right after every character listed in ``punctuation``
    (any whitespace following it is consumed by the split); pieces that are
    empty or whitespace-only are dropped.

    Args:
        text: input string; ``text_normalize`` is the normalizer defined in
            this module.

    Returns:
        The ``(phones, word2ph)`` pair produced by ``_g2p``.
    """
    # Escape each entry so that characters special inside a character class
    # ("-", "]", "^", "\") cannot change the meaning of the pattern.
    boundary_chars = "".join(re.escape(p) for p in punctuation)
    pattern = r"(?<=[{0}])\s*".format(boundary_chars)
    sentences = [piece for piece in re.split(pattern, text) if piece.strip() != ""]
    phones, word2ph = _g2p(sentences)
    return phones, word2ph
64
+
65
+
66
def _get_initials_finals(word):
    """Return two parallel lists for ``word``: its initials and its finals.

    Both lists come from ``lazy_pinyin`` with ``neutral_tone_with_five=True``:
    initials use ``Style.INITIALS`` and finals use ``Style.FINALS_TONE3``.
    The two results are paired with ``zip``, so the returned lists always
    have the same length.
    """
    raw_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
    raw_finals = lazy_pinyin(
        word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
    )
    paired = list(zip(raw_initials, raw_finals))
    initials = [initial for initial, _ in paired]
    finals = [final for _, final in paired]
    return initials, finals
77
+
78
+
79
# Finals that are rewritten when the syllable has a non-empty initial.
_FINALS_AFTER_INITIAL = {
    "uei": "ui",
    "iou": "iu",
    "uen": "un",
}
# Syllables without an initial that are replaced as a whole.
_ZERO_INITIAL_SYLLABLES = {
    "ing": "ying",
    "i": "yi",
    "in": "yin",
    "u": "wu",
}
# First-letter rewrites for the remaining syllables without an initial.
_ZERO_INITIAL_LEADS = {
    "v": "yu",
    "e": "e",
    "i": "y",
    "u": "w",
}


def _to_table_pinyin(initial, final):
    """Return the spelling of ``initial + final`` used as key in the symbol table.

    ``final`` must already have its trailing tone digit removed.
    """
    if initial:
        return initial + _FINALS_AFTER_INITIAL.get(final, final)
    if final in _ZERO_INITIAL_SYLLABLES:
        return _ZERO_INITIAL_SYLLABLES[final]
    # Slicing instead of indexing keeps an empty final from raising here; it
    # is then reported by the table-membership assertion in _g2p, which
    # carries diagnostic context.
    lead = final[:1]
    return _ZERO_INITIAL_LEADS.get(lead, lead) + final[1:]


def _g2p(segments):
    """Convert sentence segments into a flat phone list.

    For every segment, Latin letters are removed, the text is cut into
    ``(word, pos)`` pairs with ``psg.lcut``, merged by
    ``tone_modifier.pre_merge_for_modify``, and every word whose tag is not
    ``"eng"`` is turned into initials/finals whose tones are adjusted by
    ``tone_modifier.modified_tone``.

    Args:
        segments: iterable of strings.

    Returns:
        ``(phones_list, word2ph)``. ``phones_list`` is the flat list of
        phones. ``word2ph`` has one entry per input token: 1 for a
        punctuation token and 2 for a syllable (initial symbol plus
        tone-suffixed final symbol).

    Raises:
        AssertionError: if a token with identical initial and final is not in
            ``punctuation``, if a tone is not one of 1-5, or if a syllable is
            missing from the pinyin-to-symbol table.
    """
    phones_list = []
    word2ph = []
    for seg in segments:
        # Remove all English words from the sentence.
        seg = re.sub("[a-zA-Z]+", "", seg)
        seg_cut = tone_modifier.pre_merge_for_modify(psg.lcut(seg))
        initials = []
        finals = []
        for word, pos in seg_cut:
            if pos == "eng":
                continue
            sub_initials, sub_finals = _get_initials_finals(word)
            sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
            # Flatten while collecting instead of sum(list_of_lists, []),
            # which copies the accumulated list on every addition.
            initials.extend(sub_initials)
            finals.extend(sub_finals)

        for c, v in zip(initials, finals):
            raw_pinyin = c + v
            # NOTE: post process for pypinyin outputs
            # we discriminate i, ii and iii
            if c == v:
                # Tokens with identical initial and final are expected to be
                # punctuation and are emitted unchanged.
                assert c in punctuation
                phone = [c]
            else:
                v_without_tone = v[:-1]
                tone = v[-1]
                assert tone in "12345"

                pinyin = _to_table_pinyin(c, v_without_tone)

                # Fetched on demand so segments consisting only of
                # punctuation never trigger loading the table.
                symbol_map = pinyin_to_symbol_map()
                assert pinyin in symbol_map, (pinyin, seg, raw_pinyin)
                new_c, new_v = symbol_map[pinyin].split(" ")
                phone = [new_c, new_v + tone]

            word2ph.append(len(phone))
            phones_list += phone
    return phones_list, word2ph
154
+
155
+
156
def text_normalize(text):
    """Normalize ``text`` and clean its punctuation.

    The text is passed to ``TextNormalizer.normalize``; every sentence it
    yields is run through ``replace_punctuation`` and the results are
    concatenated without a separator.

    Args:
        text: raw input string.

    Returns:
        The normalized string.
    """
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    tx = TextNormalizer()
    sentences = tx.normalize(text)
    return "".join(replace_punctuation(sentence) for sentence in sentences)
164
+
165
+
166
if __name__ == "__main__":
    # Sample inputs. Each assignment overrides the previous one, so only the
    # last sample is actually processed.
    # The backslash is doubled because "\游" is not a valid escape sequence;
    # the runtime value of the string is unchanged.
    text = "啊——但是《原神》是由,米哈\\游自主,研发的一款全.新开放世界.冒险游戏"
    text = "呣呣呣~就是…大人的鼹鼠党吧?"
    text = "你好"
    text = text_normalize(text)
    print(g2p(text))
172
+
173
+
174
# # Example usage
# text = "这是一个示例文本:,你好!这是一个测试..."
# print(g2p_paddle(text))  # Output: 这是一个示例文本你好这是一个测试