Commit
·
e059497
1
Parent(s):
8bf7a01
separate style calculation process
Browse files
- inference.py +17 -18
- run.ipynb +0 -0
inference.py
CHANGED
@@ -33,7 +33,7 @@ def espeak_phn(text, lang):
|
|
33 |
print(e)
|
34 |
|
35 |
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
36 |
-
# Total including extend chars
|
37 |
|
38 |
_pad = "$"
|
39 |
_punctuation = ';:,.!?¡¿—…"«»“” '
|
@@ -135,9 +135,6 @@ class StyleTTS2(torch.nn.Module):
|
|
135 |
self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
|
136 |
|
137 |
self.__load_models(models_path)
|
138 |
-
|
139 |
-
self.ref_s_speakers = None
|
140 |
-
self.speakers = None
|
141 |
|
142 |
def __recursive_munch(self, d):
|
143 |
if isinstance(d, dict):
|
@@ -274,21 +271,23 @@ class StyleTTS2(torch.nn.Module):
|
|
274 |
|
275 |
return out.squeeze().cpu().numpy(), duration.mean()
|
276 |
|
277 |
-
def
|
278 |
-
self.ref_s_speakers = {}
|
279 |
-
self.speakers = speakers
|
280 |
-
for id in speakers:
|
281 |
-
ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
|
282 |
-
self.ref_s_speakers[id] = ref_s
|
283 |
-
|
284 |
-
def generate(self, text, speakers, avg_style=False, stabilize=False, denoise=0.3, n_merge=14, default_speaker= "[id_1]"):
|
285 |
if avg_style: split_dur = 3
|
286 |
else: split_dur = 0
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
if stabilize: smooth_dur=0.2
|
289 |
else: smooth_dur=0
|
290 |
-
|
291 |
-
self.__get_styles(speakers, denoise, split_dur)
|
292 |
|
293 |
list_wav = []
|
294 |
prev_d_mean = 0
|
@@ -324,8 +323,8 @@ class StyleTTS2(torch.nn.Module):
|
|
324 |
if bool(re.match(r'(\[id_\d+\])', i)):
|
325 |
#Set up env for matched speaker
|
326 |
speaker_id = i.strip('[]')
|
327 |
-
current_ref_s =
|
328 |
-
speed =
|
329 |
continue
|
330 |
text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
|
331 |
for sentence in text_norm:
|
@@ -340,7 +339,7 @@ class StyleTTS2(torch.nn.Module):
|
|
340 |
print(e)
|
341 |
|
342 |
replacement_func = self.__init_replacement_func(cus_phonem)
|
343 |
-
phonem = espeak_phn(sentence,
|
344 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
345 |
|
346 |
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
|
|
|
33 |
print(e)
|
34 |
|
35 |
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
36 |
+
# Total including extend chars 189
|
37 |
|
38 |
_pad = "$"
|
39 |
_punctuation = ';:,.!?¡¿—…"«»“” '
|
|
|
135 |
self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
|
136 |
|
137 |
self.__load_models(models_path)
|
|
|
|
|
|
|
138 |
|
139 |
def __recursive_munch(self, d):
|
140 |
if isinstance(d, dict):
|
|
|
271 |
|
272 |
return out.squeeze().cpu().numpy(), duration.mean()
|
273 |
|
274 |
+
def get_styles(self, speakers, denoise=0.3, avg_style=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
if avg_style: split_dur = 3
|
276 |
else: split_dur = 0
|
277 |
+
styles = {}
|
278 |
+
for id in speakers:
|
279 |
+
ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
|
280 |
+
styles[id] = {
|
281 |
+
'style': ref_s,
|
282 |
+
'path': speakers[id]['path'],
|
283 |
+
'lang': speakers[id]['lang'],
|
284 |
+
'speed': speakers[id]['speed'],
|
285 |
+
}
|
286 |
+
return styles
|
287 |
+
|
288 |
+
def generate(self, text, styles, stabilize=False, n_merge=14, default_speaker= "[id_1]"):
|
289 |
if stabilize: smooth_dur=0.2
|
290 |
else: smooth_dur=0
|
|
|
|
|
291 |
|
292 |
list_wav = []
|
293 |
prev_d_mean = 0
|
|
|
323 |
if bool(re.match(r'(\[id_\d+\])', i)):
|
324 |
#Set up env for matched speaker
|
325 |
speaker_id = i.strip('[]')
|
326 |
+
current_ref_s = styles[speaker_id]['style']
|
327 |
+
speed = styles[speaker_id]['speed']
|
328 |
continue
|
329 |
text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
|
330 |
for sentence in text_norm:
|
|
|
339 |
print(e)
|
340 |
|
341 |
replacement_func = self.__init_replacement_func(cus_phonem)
|
342 |
+
phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
|
343 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
344 |
|
345 |
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
|
run.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|