dangtr0408 committed on
Commit
e059497
·
1 Parent(s): 8bf7a01

separate style calculation process

Browse files
Files changed (2) hide show
  1. inference.py +17 -18
  2. run.ipynb +0 -0
inference.py CHANGED
@@ -33,7 +33,7 @@ def espeak_phn(text, lang):
33
  print(e)
34
 
35
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
36
- # Total including extend chars 187
37
 
38
  _pad = "$"
39
  _punctuation = ';:,.!?¡¿—…"«»“” '
@@ -135,9 +135,6 @@ class StyleTTS2(torch.nn.Module):
135
  self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
136
 
137
  self.__load_models(models_path)
138
-
139
- self.ref_s_speakers = None
140
- self.speakers = None
141
 
142
  def __recursive_munch(self, d):
143
  if isinstance(d, dict):
@@ -274,21 +271,23 @@ class StyleTTS2(torch.nn.Module):
274
 
275
  return out.squeeze().cpu().numpy(), duration.mean()
276
 
277
- def __get_styles(self, speakers, denoise, split_dur):
278
- self.ref_s_speakers = {}
279
- self.speakers = speakers
280
- for id in speakers:
281
- ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
282
- self.ref_s_speakers[id] = ref_s
283
-
284
- def generate(self, text, speakers, avg_style=False, stabilize=False, denoise=0.3, n_merge=14, default_speaker= "[id_1]"):
285
  if avg_style: split_dur = 3
286
  else: split_dur = 0
287
-
 
 
 
 
 
 
 
 
 
 
 
288
  if stabilize: smooth_dur=0.2
289
  else: smooth_dur=0
290
-
291
- self.__get_styles(speakers, denoise, split_dur)
292
 
293
  list_wav = []
294
  prev_d_mean = 0
@@ -324,8 +323,8 @@ class StyleTTS2(torch.nn.Module):
324
  if bool(re.match(r'(\[id_\d+\])', i)):
325
  #Set up env for matched speaker
326
  speaker_id = i.strip('[]')
327
- current_ref_s = self.ref_s_speakers[speaker_id]
328
- speed = self.speakers[speaker_id]['speed']
329
  continue
330
  text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
331
  for sentence in text_norm:
@@ -340,7 +339,7 @@ class StyleTTS2(torch.nn.Module):
340
  print(e)
341
 
342
  replacement_func = self.__init_replacement_func(cus_phonem)
343
- phonem = espeak_phn(sentence, self.speakers[speaker_id]['lang'])
344
  phonem = re.sub(lang_pattern, replacement_func, phonem)
345
 
346
  wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
 
33
  print(e)
34
 
35
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
36
+ # Total including extend chars 189
37
 
38
  _pad = "$"
39
  _punctuation = ';:,.!?¡¿—…"«»“” '
 
135
  self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
136
 
137
  self.__load_models(models_path)
 
 
 
138
 
139
  def __recursive_munch(self, d):
140
  if isinstance(d, dict):
 
271
 
272
  return out.squeeze().cpu().numpy(), duration.mean()
273
 
274
+ def get_styles(self, speakers, denoise=0.3, avg_style=True):
 
 
 
 
 
 
 
275
  if avg_style: split_dur = 3
276
  else: split_dur = 0
277
+ styles = {}
278
+ for id in speakers:
279
+ ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
280
+ styles[id] = {
281
+ 'style': ref_s,
282
+ 'path': speakers[id]['path'],
283
+ 'lang': speakers[id]['lang'],
284
+ 'speed': speakers[id]['speed'],
285
+ }
286
+ return styles
287
+
288
+ def generate(self, text, styles, stabilize=False, n_merge=14, default_speaker= "[id_1]"):
289
  if stabilize: smooth_dur=0.2
290
  else: smooth_dur=0
 
 
291
 
292
  list_wav = []
293
  prev_d_mean = 0
 
323
  if bool(re.match(r'(\[id_\d+\])', i)):
324
  #Set up env for matched speaker
325
  speaker_id = i.strip('[]')
326
+ current_ref_s = styles[speaker_id]['style']
327
+ speed = styles[speaker_id]['speed']
328
  continue
329
  text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
330
  for sentence in text_norm:
 
339
  print(e)
340
 
341
  replacement_func = self.__init_replacement_func(cus_phonem)
342
+ phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
343
  phonem = re.sub(lang_pattern, replacement_func, phonem)
344
 
345
  wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
run.ipynb CHANGED
The diff for this file is too large to render. See raw diff