Dionyssos committed
Commit 780c8d5 · 1 Parent(s): bc7f42e

lang numerals

Files changed (4)
  1. Utils/text_utils.py +118 -34
  2. api.py +141 -132
  3. demo.py +12 -24
  4. msinference.py +122 -146
Utils/text_utils.py CHANGED
@@ -2,6 +2,7 @@
 import re
 import codecs
 import textwrap
+from num2words import num2words
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
 
 _pad = "$"
@@ -16,10 +17,12 @@ dicts = {}
 for i in range(len((symbols))):
     dicts[symbols[i]] = i
 
+
 class TextCleaner:
     def __init__(self, dummy=None):
         self.word_index_dictionary = dicts
         print(len(dicts))
+
     def __call__(self, text):
         indexes = []
         for char in text:
@@ -32,7 +35,7 @@ class TextCleaner:
 
 # == Sentence Splitter
 
-alphabets= "([A-Za-z])"
+alphabets = "([A-Za-z])"
 prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
 suffixes = "(Inc|Ltd|Jr|Sr|Co)"
 starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
@@ -42,7 +45,6 @@ digits = "([0-9])"
 multiple_dots = r'\.{2,}'
 
 
-
 def split_into_sentences(text):
     """
     Split the text into sentences.
@@ -59,54 +61,66 @@ def split_into_sentences(text):
     https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
     """
     text = " " + text + " "
-    text = text.replace("\n"," ")
-    text = re.sub(prefixes,"\\1<prd>",text)
-    text = re.sub(websites,"<prd>\\1",text)
-    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
-    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
-    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
-    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
-    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
-    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
-    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
-    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
-    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
-    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
-    if "”" in text: text = text.replace(".”","”.")
-    if "\"" in text: text = text.replace(".\"","\".")
-    if "!" in text: text = text.replace("!\"","\"!")
-    if "?" in text: text = text.replace("?\"","\"?")
-    text = text.replace(".",".<stop>")
-    text = text.replace("?","?<stop>")
-    text = text.replace("!","!<stop>")
-    text = text.replace("<prd>",".")
+    text = text.replace("\n", " ")
+    text = re.sub(prefixes, "\\1<prd>", text)
+    text = re.sub(websites, "<prd>\\1", text)
+    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
+    text = re.sub(multiple_dots, lambda match: "<prd>" *
+                  len(match.group(0)) + "<stop>", text)
+    if "Ph.D" in text:
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+    text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
+    text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" +
+                  alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
+    text = re.sub(alphabets + "[.]" + alphabets +
+                  "[.]", "\\1<prd>\\2<prd>", text)
+    text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
+    text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
+    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
+    if "”" in text:
+        text = text.replace(".”", "”.")
+    if "\"" in text:
+        text = text.replace(".\"", "\".")
+    if "!" in text:
+        text = text.replace("!\"", "\"!")
+    if "?" in text:
+        text = text.replace("?\"", "\"?")
+    text = text.replace(".", ".<stop>")
+    text = text.replace("?", "?<stop>")
+    text = text.replace("!", "!<stop>")
+    text = text.replace("<prd>", ".")
     sentences = text.split("<stop>")
     sentences = [s.strip() for s in sentences]
 
     # Split Very long sentences >500 phoneme - StyleTTS2 crashes
     # -- even 400 phonemes sometimes OOM in cuda:4
-    sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]
-
+    sentences = [
+        sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]
+
     # if sentences and not sentences[-1]:
     #     sentences = sentences[:-1]
     return sentences
 
+
 def store_ssml(text=None,
                voice=None):
     '''create ssml:
        text : list of sentences
        voice: https://github.com/MycroftAI/mimic3-voices
     '''
-    print('\n___________________________\n', len(text), text[0], '\n___________________________________\n')
+    print('\n___________________________\n', len(text),
+          text[0], '\n___________________________________\n')
     _s = '<speak>'
     for short_text in text:
 
-        rate = min(max(.87, len(short_text) / 76), 1.14)  # 1.44) # 1.24 for bieber
-
-
+        # 1.44) # 1.24 for bieber
+        rate = min(max(.87, len(short_text) / 76), 1.14)
+
         volume = int(74 * np.random.rand() + 24)
         # text = ('<speak>'
-        _s += f'<prosody volume=\'{volume}\'>'  # The other voice does not have volume
+        # The other voice does not have volume
+        _s += f'<prosody volume=\'{volume}\'>'
         _s += f'<prosody rate=\'{rate}\'>'
         _s += f'<voice name=\'{voice}\'>'
         _s += '<s>'
@@ -116,7 +130,77 @@ def store_ssml(text=None,
     _s += '</prosody>'
     _s += '</prosody>'
     _s += '</speak>'
-    print(len(text),'\n\n\n\n\n\n\n', _s)
-
+    print(len(text), '\n\n\n\n\n\n\n', _s)
+
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
         f.write(_s)
+
+
+def transliterate_number(number_string, lang='en'):
+    """
+    Converts a number string to words in the specified language,
+    handling decimals, scientific notation, and preserving text
+    before and after the numeral.
+    """
+
+    if lang == 'rmc-script_latin':
+        lang = 'sr'
+        exponential_pronoun = ' puta deset na stepen od '
+        comma = ' tačka '
+    elif lang == 'ron':
+        lang = 'ro'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgulă '
+    elif lang == 'hun':
+        lang = 'hu'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgula '
+    elif lang == 'deu':
+        exponential_pronoun = ' mal zehn hoch '
+        comma = ' komma '
+    else:
+        lang = lang[:2]
+        exponential_pronoun = ' times ten to the power of '
+        comma = ' point '
+
+    def replace_number(match):
+        prefix = match.group(1) or ""
+        number_part = match.group(2)
+        suffix = match.group(5) or ""
+
+        try:
+            if 'e' in number_part.lower():
+                base, exponent = number_part.lower().split('e')
+                base = float(base)
+                exponent = int(exponent)
+                words = num2words(
+                    base, lang=lang) + exponential_pronoun + num2words(exponent, lang=lang)
+            elif '.' in number_part:
+                integer_part, decimal_part = number_part.split('.')
+                words = num2words(int(integer_part), lang=lang) + comma + " ".join(
+                    [num2words(int(digit), lang=lang) for digit in decimal_part])
+            else:
+                words = num2words(int(number_part), lang=lang)
+            return prefix + words + suffix
+        except ValueError:
+            return match.group(0)  # Return original if conversion fails
+
+    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
+    return re.sub(pattern, replace_number, number_string)
+
+
+def discard_leading_numeral(text):
+    """Discards a leading numeral (integer or float) from a string.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with the leading numeral removed, or the original string
+        if it doesn't start with a numeral.
+    """
+    match = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
+    if match:
+        return text[match.end():].lstrip()
+    else:
+        return text
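Reviewer note: the two helpers added above are pure string functions, so they can be sanity-checked without loading any TTS model. A minimal sketch (assuming num2words is installed; inputs and expected outputs are illustrative, not taken from the repo's tests):

    from Utils.text_utils import transliterate_number, discard_leading_numeral

    # decimals: integer part as one number, decimal digits spelled one by one
    print(transliterate_number('a 3.14 radius', lang='en'))
    # -> 'a three point one four radius'

    # scientific notation uses the per-language exponential_pronoun
    print(transliterate_number('2.5e3 Hz', lang='en'))
    # -> 'two point five times ten to the power of three Hz'

    # leading list markers can be dropped before synthesis
    print(discard_leading_numeral('7 dwarfs'))
    # -> 'dwarfs'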
api.py CHANGED
@@ -42,35 +42,41 @@ def resize_with_white_padding(image):
         # Image is wider than the target, pad top and bottom
         new_w = target_w
         new_h = int(new_w / aspect_ratio)
-        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_h = target_h - new_h
         top_padding = padding_h // 2
         bottom_padding = padding_h - top_padding
         padding = [(top_padding, bottom_padding), (0, 0)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
-        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     elif aspect_ratio < target_aspect_ratio:
         # Image is taller than the target, pad left and right
         new_h = target_h
         new_w = int(new_h * aspect_ratio)
-        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_w = target_w - new_w
         left_padding = padding_w // 2
         right_padding = padding_w - left_padding
         padding = [(0, 0), (left_padding, right_padding)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
-        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     else:
         # Aspect ratio matches the target, just resize
-        padded_image = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
+        padded_image = cv2.resize(
+            image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
 
     return padded_image  # image 2 speech
 
 
 def _shorten(filename):
-    return filename.replace("/","")[-6:]
+    return filename.replace("/", "")[-6:]
+
 
 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
@@ -104,20 +110,23 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     # return the resized image
     return resized
 
-def overlay(x,soundscape=None):
+
+def overlay(x, soundscape=None):
     if soundscape is not None:
         # AudioGen sound is sufficient to be ~10s long
         background = sound_generator.generate(soundscape,
-                                              duration=len(x)/16000 + .74, # sound duration = TTS dur
-                                              ).detach().cpu().numpy() # bs, 11400 @.74s
+                                              # sound duration = TTS dur
+                                              duration=len(x)/16000 + .74,
+                                              ).detach().cpu().numpy()  # bs, 11400 @.74s
 
         # len_soundscape = len(background)
 
         # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0
 
         # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
-        #background /= np.abs(background).max() + 1e-7 # amplify speech to full [-1,1]
-        x = .4 * x + .46 * background[:len(x)] # background will be longer by xtra .74s
+        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
+        # background will be longer by xtra .74s
+        x = .47 * x + .46 * background[:len(x)]
     return x  # TTS / AudioGen @ 16kHz
 
 
@@ -134,77 +143,79 @@ def tts_multi_sentence(precomputed_style_vector=None,
     voice : string or None (falls to styleTTS)
     soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
     '''
 
     # StyleTTS2 - English
 
     if precomputed_style_vector is not None:
         x = []
         if not isinstance(text, list):
             text = split_into_sentences(text)  # Avoid OOM in StyleTTS2
         for _sentence in text:
 
             # StyleTTS2 - pronunciation Fx
 
-            _sentence = _sentence.lower() # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            _sentence = _sentence.lower()
             if 'vctk_low#p326' in voice:
                 # fix sounding of sleepy AAABS TRAACT
-                _sentence = _sentence.replace('abstract', 'ahbstract') # 'ahstract'
+                _sentence = _sentence.replace(
+                    'abstract', 'ahbstract')  # 'ahstract'
             x.append(msinference.inference(_sentence,
                                            precomputed_style_vector)
                      )
         x = np.concatenate(x)
 
     # Fallback - MMS TTS - Non-English
 
    else:
 
         # dont split foreign sentences: Avoids speaker change issue
         x = msinference.foreign(text=text,
                                 lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                 speed=speed)  # normalisation externally
 
     # volume
 
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
 
     return overlay(x, soundscape=soundscape)
 
 
 # voices = {}
 # import phonemizer
 # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
-
 app = Flask(__name__)
 
+
 @app.route("/", methods=['GET', 'POST', 'PUT'])
 def serve_wav():
     # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
     # object-into-a-representation-suitable-for-mongodb
     r = request.form.to_dict(flat=False)
 
     # Physically Save Client Files
     for filename, obj in request.files.items():
         obj.save(f'{CACHE_DIR}{_shorten(filename)}')
 
     print('Saved all files on Server Side\n\n')
 
     args = SimpleNamespace(
-        text = None if r.get('text') is None else CACHE_DIR + _shorten(r.get('text' )[0]), # crop last letters from original filename & use as tmp
-        video = None if r.get('video') is None else CACHE_DIR + _shorten(r.get('video')[0]),
-        image = None if r.get('image') is None else CACHE_DIR + _shorten(r.get('image')[0]),
-        native = None if r.get('native') is None else CACHE_DIR + _shorten(r.get('native')[0]),
-        affective = r.get('affective')[0],
-        voice = r.get('voice')[0],
-        speed = float(r.get('speed')[0]), # For Non-English MMS TTS
-        soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
-        )
+        # crop last letters from original filename & use as tmp
+        text=None if r.get('text') is None else CACHE_DIR +
+        _shorten(r.get('text')[0]),
+        video=None if r.get('video') is None else CACHE_DIR +
+        _shorten(r.get('video')[0]),
+        image=None if r.get('image') is None else CACHE_DIR +
+        _shorten(r.get('image')[0]),
+        native=None if r.get('native') is None else CACHE_DIR +
+        _shorten(r.get('native')[0]),
+        affective=r.get('affective')[0],
+        voice=r.get('voice')[0],
+        speed=float(r.get('speed')[0]),  # For Non-English MMS TTS
+        soundscape=r.get('soundscape')[0] if r.get(
+            'soundscape') is not None else None,
+    )
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
 
     print(args, 'ENTER Script')
     do_video_dub = True if args.text.endswith('.srt') else False
@@ -213,10 +224,12 @@ def serve_wav():
     AUDIO_TRACK = '_audio_track.wav'
 
     if do_video_dub:
-        print('==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
+        print(
+            '==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
         with open(args.text, "r") as f:
            s = f.read()
-        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
+        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
+                for j in srt.parse(s)]
         assert args.video is not None
         native_audio_file = '_tmp.wav'
         subprocess.run(
@@ -231,36 +244,38 @@ def serve_wav():
             "-vn",
             native_audio_file])
         x_native, _ = soundfile.read(native_audio_file)  # reads mp3
 
         # stereo in video
         if x_native.ndim > 1:
             x_native = x_native[:, 0]  # stereo
 
         # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
     else:
         with open(args.text, 'r') as f:
             text = ''.join(f)
-        text = re.sub(' +', ' ', text) # delete spaces / split in list in tts_multi_sentence()
-
+        # delete spaces / split in list in tts_multi_sentence()
+        text = re.sub(' +', ' ', text)
+
     # == STYLE VECTOR ==
 
     precomputed_style_vector = None
 
     if args.native:  # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
         except soundfile.LibsndfileError:  # Fallback - internal voice
-            print('\n Could not voice clone audio:', args.native, 'fallback to video or Internal TTS voice.\n')
+            print('\n Could not voice clone audio:', args.native,
+                  'fallback to video or Internal TTS voice.\n')
            if do_video_dub:  # Clone voice via Video
                native_audio_file = args.video.replace('.', '').replace('/', '')
                native_audio_file += '__native_audio_track.wav'
                soundfile.write('tgt_spk.wav',
-                    np.concatenate([
-                        x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
+                               np.concatenate([
+                                   x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
                precomputed_style_vector = msinference.compute_style('tgt_spk.wav')
 
     # NOTE: style vector is normally None here - except if --native arg was passed
 
     # Native English Accent TTS
     if precomputed_style_vector is None:
         if 'en_US' in args.voice or 'en_UK' in args.voice:
@@ -272,53 +287,52 @@ def serve_wav():
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
         # Non-Native English Accent TTS
         elif '_' in args.voice:
             precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
         # Foreign Lang
         else:
             print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')
 
    # NOTE : precomputed_style_vector is still None if MMS TTS
 
    # == SILENT VIDEO ==
 
    if args.video is not None:
         # banner - precomput @ 1920 pixels
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
         font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
         fontScale = 2
         fontColor = (255, 255, 255)
         thickness = 4
         lineType = 2
         cv2.putText(frame_tts, 'TTS',
                     bottomLeftCornerOfText,
                     font,
                     fontScale,
                     fontColor,
                     thickness,
                     lineType)
         # cv2.imshow('i', frame_tts); cv2.waitKey(); cv2.destroyAllWindows()
         # ====================================== NATIVE VOICE
         frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
         font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (101, 74)  # w,h
         fontScale = 2
         fontColor = (255, 255, 255)
         thickness = 4
         lineType = 1000
         cv2.putText(frame_orig, 'ORIGINAL VOICE',
                     bottomLeftCornerOfText,
                     font,
                     fontScale,
                     fontColor,
                     thickness,
                     lineType)
 
         print(f'\n______________________________\n'
               f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
               f'\n______________________________\n')
@@ -336,67 +350,63 @@ def serve_wav():
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
 
         # GET 1st FRAME to OBTAIN frame RESOLUTION
         h, w, _ = vf.get_frame(0).shape
         frame_tts = _resize(frame_tts, width=w)
         frame_orig = _resize(frame_orig, width=w)
         h, w, _ = frame_orig.shape
 
         try:
 
             # inpaint banner to say if native voice
             num = x_native.shape[0]
-            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4)) # fade heaviside
-
+            # fade heaviside
+            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))
+
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voice
                 '''
 
                 im = np.copy(get_frame(t))  # pic
 
                 ix = int(t * 16000)  # ix may overflow the is_tts.shape
                 if ix < num:
                     if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
                         frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
                         # then is considered a "local variable" thus the "outer var"
                         # is not observed by python raising referenced before assign
                     else:
                         frame = frame_orig
                 # For the ix that is out of bounds of num assume frame_tts
                 else:
                     frame = frame_tts
 
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
 
                 offset_h = 24
-
-
-                print(f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
-
-
-
+
+                print(
+                    f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
+
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
 
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frame_tts], 0)
 
         except UnboundLocalError:  # args.native == False
 
             def inpaint_banner(get_frame, t):
 
                 im = np.copy(get_frame(t))
 
                 h, w, _ = frame_tts.shape  # frame = banner
                 if w != im.shape[1]:  # rsz banners to fit video w
                     local_frame = _resize(frame_tts, width=im.shape[1])
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
                                                     + .6 * local_frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
@@ -405,9 +415,9 @@ def serve_wav():
     # ==== TTS .srt ====
 
     if do_video_dub:
-        OUT_FILE = 'tmp.mp4' #args.out_file + '_video_dub.mp4'
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
         subtitles = text
         MAX_LEN = int(subtitles[-1][2] + 17) * 16000
         # 17 extra seconds fail-safe for long-last-segment
         print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
         pieces = []
@@ -423,10 +433,12 @@ def serve_wav():
         # x = audresample.resample(x.astype(np.float32), 24000, 22050)  # reshapes (64,) -> (1,64)
         # PAD SHORTEST of TTS / NATIVE
         if len(x_native) > len(total):
-            total = np.pad(total, (0, max(0, x_native.shape[0] - total.shape[0])))
+            total = np.pad(
+                total, (0, max(0, x_native.shape[0] - total.shape[0])))
 
         else:  # pad native to len of is_tts & total
-            x_native = np.pad(x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
+            x_native = np.pad(
+                x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
         # print(total.shape, x_native.shape, 'PADDED TRACKS')
         soundfile.write(AUDIO_TRACK,
                         # (is_tts * total + (1-is_tts) * x_native)[:, None],
@@ -435,25 +447,25 @@ def serve_wav():
     else:  # Video from plain (.txt)
         OUT_FILE = 'tmp.mp4'
         x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
                               speed=args.speed)
         soundfile.write(AUDIO_TRACK, x, 16000)
 
     # IMAGE 2 SPEECH
 
     if args.image is not None:
 
         # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
 
-        STATIC_FRAME = args.image + '.jpg' # 'assets/image_from_T31.jpg'
+        STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
         cv2.imwrite(
            STATIC_FRAME,
            resize_with_white_padding(cv2.imread(args.image)
                                      ))
 
-        OUT_FILE = 'tmp.mp4' #args.out_file + '_image_to_speech.mp4'
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'
 
     # SILENT CLIP
 
@@ -486,22 +498,19 @@ def serve_wav():
             CACHE_DIR + OUT_FILE])
 
         print(f'\noutput video is saved as {OUT_FILE}')
 
    else:
 
        # Fallback: No image nor video provided - do only tts
        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
                               speed=args.speed)
        OUT_FILE = 'tmp.wav'
        soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
 
    # audios = [msinference.inference(text,
    #                                 msinference.compute_style(f'voices/{voice}.wav'))]
    # # for t in [text]:
    # output_buffer = io.BytesIO()
@@ -511,8 +520,7 @@ def serve_wav():
    # https://stackoverflow.com/questions/67591467/
    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
    # time.sleep(4)
 
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
@@ -520,6 +528,7 @@ def serve_wav():
    print('________________\n ? \n_______________')
    return response
 
+
 if __name__ == "__main__":
    app.run(host="0.0.0.0")
 
@@ -546,4 +555,4 @@ if __name__ == "__main__":
 #         f'fusion.mp4',  # save to correct location is handled in client
 #     ])
 #
 # ffmpeg -f concat -i mylist.txt -c copy output.mp4
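For context on the form fields kept above: a client uploads the text (and optional media) as files, with matching form fields naming them; serve_wav() then writes tmp.wav or tmp.mp4 under CACHE_DIR and sends it back. A hypothetical client sketch (field names are taken from the SimpleNamespace above; host, port and file names are assumptions):

    import requests

    with open('story.txt', 'rb') as f:
        resp = requests.post(
            'http://127.0.0.1:5000/',
            data={'text': 'story.txt',   # filename; shortened server-side by _shorten()
                  'voice': 'en_US/vctk_low#p326',
                  'speed': '1.14',
                  'affective': 'True',
                  'soundscape': 'wind and rain'},
            files={'story.txt': f})      # saved into CACHE_DIR by serve_wav()
    resp.raise_for_status()
    with open('out.wav', 'wb') as out:
        out.write(resp.content)          # tmp.wav, or tmp.mp4 if image/video was sent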
demo.py CHANGED
@@ -2,26 +2,14 @@ import numpy as np
 import soundfile
 import msinference
 from audiocraft.builders import AudioGen
-# Prepend »Vom Prof. Friedrich ist noch eine .. string in the beginning brings the male voice in deu MMS TTS (if later string is much longer
-# sometimes the woman voices pronounces words <dass>) TODO amplify attn weights of first hidden states / certain voice
-
-def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
-              'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
-              'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
-              voice='romanian',  # 'af_ZA_google-nwu_1919', 'serbian', 'en_US/vctk_low#p276', 'isl',
+
+def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
+              voice='en_US/vctk_low#p326',  # 'en_US/vctk_low#p276', 'deu', 'af_ZA_google-nwu_1919', 'serbian', 'isl',
               speed=1.14,
-              affect=True,  # False = higher clarity
+              affect=True,  # False = higher clarity voice
               soundscape='dogs barg in dungeons n dragons'
               ):
-    '''24kHz
+    '''16 KHz
 
     voice : 'en_US/vctk_low#p276'  # Native English voices -> https://audeering.github.io/shift/
@@ -37,8 +25,8 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # StyleTTS2 - find voice from folder
 
     if ('en_US/' in voice) or ('en_UK/' in voice):
-        a = '' if affect else 'v2/'
-        style_vector = msinference.compute_style('assets/wavs/style_vector/' + a + voice.replace(
+        a = '' if affect else '_v2'
+        style_vector = msinference.compute_style('assets/wavs/style_vector' + a + '/' + voice.replace(
             '/', '_').replace('#', '_').replace(
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
@@ -58,7 +46,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
                                  style_vector)
 
 
-    # Fallback - MMS TTS - Non-English voice/language
+    # Fallback - MMS TTS - Non-English voice / langs
 
     else:
         x = msinference.foreign(text=text,
@@ -68,13 +56,13 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # volume
 
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
 
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
         background = sound_gen.generate(soundscape,
-                                        duration=len(x)/24000 + .74,  # sound duration in seconds
-                                        ).detach().cpu().numpy()  # bs, 11400 @.74s
+                                        duration=len(x)/16000 + .74,  # sound duration in seconds
+                                        ).detach().cpu().numpy()
         x = .5 * x + .47 * background[:len(x)]
     return x
 
-soundfile.write(f'demo.wav', tts_entry(), 24000)
+
+soundfile.write(f'demo.wav', tts_entry(), 16000)
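The audible effect of this commit on the demo is the output rate: tts_entry() now works at the 16 kHz rate of AudioGen/MMS instead of writing 24 kHz audio. A quick check after running python demo.py (a sketch, assuming the run succeeds):

    import soundfile

    x, fs = soundfile.read('demo.wav')
    assert fs == 16000             # was 24000 before this commit
    print(x.shape, abs(x).max())   # mono samples; peak should stay below ~1.0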
msinference.py CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  from cached_path import cached_path
3
  # import nltk
@@ -9,6 +19,7 @@ import torchaudio
9
  import librosa
10
  from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
11
  from nltk.tokenize import word_tokenize
 
12
  import textwrap
13
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
14
 
@@ -24,10 +35,15 @@ dicts = {}
24
  for i in range(len((symbols))):
25
  dicts[symbols[i]] = i
26
 
 
 
 
 
27
  class TextCleaner:
28
  def __init__(self, dummy=None):
29
  self.word_index_dictionary = dicts
30
  print(len(dicts))
 
31
  def __call__(self, text):
32
  indexes = []
33
  for char in text:
@@ -38,7 +54,6 @@ class TextCleaner:
38
  return indexes
39
 
40
 
41
-
42
  textclenaer = TextCleaner()
43
 
44
 
@@ -46,17 +61,20 @@ to_mel = torchaudio.transforms.MelSpectrogram(
46
  n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
47
  mean, std = -4, 4
48
 
 
49
  def alpha_num(f):
50
  f = re.sub(' +', ' ', f) # delete spaces
51
  f = re.sub(r'[^A-Z a-z0-9 ]+', '', f) # del non alpha num
52
  return f
53
 
 
54
  def preprocess(wave):
55
  wave_tensor = torch.from_numpy(wave).float()
56
  mel_tensor = to_mel(wave_tensor)
57
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
58
  return mel_tensor
59
 
 
60
  def compute_style(path):
61
  wave, sr = librosa.load(path, sr=24000)
62
  audio, index = librosa.effects.trim(wave, top_db=30)
@@ -67,18 +85,19 @@ def compute_style(path):
67
  with torch.no_grad():
68
  ref_s = style_encoder(mel_tensor.unsqueeze(1))
69
  ref_p = predictor_encoder(mel_tensor.unsqueeze(1)) # [bs, 11, 1, 128]
70
-
71
  s = torch.cat([ref_s, ref_p], dim=3) # [bs, 11, 1, 256]
72
-
73
  s = s[:, :, 0, :].transpose(1, 2) # [1, 128, 11]
74
- return s# [1, 128, 11]
 
75
 
76
  device = 'cpu'
77
  if torch.cuda.is_available():
78
  device = 'cuda'
79
 
80
- import phonemizer
81
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
82
  # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
83
 
84
 
@@ -88,45 +107,43 @@ ASR_config = args['ASR_config']
88
  F0_path = args['F0_path']
89
  pitch_extractor = load_F0_models(F0_path).eval().to(device)
90
 
91
- from Utils.PLBERT.util import load_plbert
92
- from Modules.hifigan import Decoder
93
 
94
  bert = load_plbert(args['PLBERT_dir']).eval().to(device)
95
 
96
- decoder = Decoder(dim_in=512,
97
- style_dim=128,
98
  dim_out=80, # n_mels
99
- resblock_kernel_sizes = [3, 7, 11],
100
- upsample_rates = [10, 5, 3, 2],
101
  upsample_initial_channel=512,
102
  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
103
  upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)
104
 
105
- text_encoder = TextEncoder(channels=512,
106
- kernel_size=5,
107
- depth=3, #args['model_params']['n_layer'],
108
- n_symbols=178, #args['model_params']['n_token']
109
  ).eval().to(device)
110
 
111
- predictor = ProsodyPredictor(style_dim=128,
112
- d_hid=512,
113
  nlayers=3, # OFFICIAL config.nlayers=5;
114
- max_dur=50,
115
  dropout=.2).eval().to(device)
116
 
117
- style_encoder = StyleEncoder(dim_in=64,
118
- style_dim=128,
119
- max_conv_dim=512).eval().to(device) # acoustic style encoder
120
- predictor_encoder = StyleEncoder(dim_in=64,
121
- style_dim=128,
122
- max_conv_dim=512).eval().to(device) # prosodic style encoder
123
  bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
124
 
125
  # params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
126
- params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 
127
  params = params_whole['net']
128
 
129
- from collections import OrderedDict
130
 
131
  def _del_prefix(d):
132
  # del ".module"
@@ -135,14 +152,19 @@ def _del_prefix(d):
135
  out[k[7:]] = v
136
  return out
137
 
138
- bert.load_state_dict( _del_prefix(params['bert']), strict=True)
 
139
  bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
140
- predictor.load_state_dict( _del_prefix(params['predictor']), strict=True) # XTRA non-ckpt LSTMs nlayers add slowiness to voice
141
- decoder.load_state_dict( _del_prefix(params['decoder']), strict=True)
 
142
  text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
143
- predictor_encoder.load_state_dict(_del_prefix(params['predictor_encoder']), strict=True)
144
- style_encoder.load_state_dict(_del_prefix(params['style_encoder']), strict=True)
145
- pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=True)
 
 
 
146
 
147
  # def _shift(x):
148
  # # [bs, samples] shift circular each batch elem of sound
@@ -152,13 +174,13 @@ pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=T
152
  # x[i, ...] = torch.roll(batch_elem, offset, dims=1) # batch_elem = [400000, ]
153
  # return x
154
 
 
155
  def inference(text,
156
  ref_s,
157
  use_gruut=False):
158
- # Ignore .,; AT end of sentence; or just [-50:]
159
-
160
- text = text.strip()
161
-
162
  ps = global_phonemizer.phonemize([text])
163
  # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
164
  ps = word_tokenize(ps[0])
@@ -172,20 +194,20 @@ def inference(text,
172
 
173
  with torch.no_grad():
174
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
175
-
176
  hidden_states = text_encoder(tokens, input_lengths)
177
-
178
  bert_dur = bert(tokens, attention_mask=None)
179
  d_en = bert_encoder(bert_dur).transpose(-1, -2)
180
- ref = ref_s[:, :128, :] # [bs, 128, 11]
181
  s = ref_s[:, 128:, :]
182
  d = predictor.text_encoder(d_en, s, input_lengths)
183
  d = d.transpose(1, 2)
184
  # -------------------------------- pred_aln_trg = clones bert frames as duration
185
-
186
  d = predictor.text_encoder(d_en,
187
- s,
188
- input_lengths)
189
 
190
  x, _ = predictor.lstm(d)
191
 
@@ -194,7 +216,6 @@ def inference(text,
194
  duration = torch.sigmoid(duration).sum(axis=-1)
195
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
196
 
197
-
198
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
199
  c_frame = 0
200
  for i in range(pred_aln_trg.size(0)):
@@ -222,15 +243,15 @@ def inference(text,
222
  N=N_pred,
223
  s=ref)
224
 
225
- x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
226
 
227
  # StyleTTS2 is 24kHz -> Resample to 16kHz ofAudioGen / MMS
228
 
229
  if x.shape[0] > 10:
230
  x /= np.abs(x).max() + 1e-7
231
  x = audresample.resample(signal=x.astype(np.float32),
232
- original_rate=24000,
233
- target_rate=16000)[0, :] # reshapes (64,) -> (1,64)
234
 
235
  else:
236
  print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
@@ -238,26 +259,15 @@ def inference(text,
238
  return x
239
 
240
 
241
-
242
-
243
  # ___________________________________________________________
244
 
245
  # https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
246
  # ___________________________________________________________
247
-
248
  # -*- coding: utf-8 -*-
249
-
250
  # Copyright (c) Facebook, Inc. and its affiliates.
251
  #
252
  # This source code is licensed under the MIT license found in the
253
  # LICENSE file in the root directory of this source tree.
254
- from num2words import num2words
255
- import os
256
- import re
257
- import tempfile
258
- import torch
259
- import sys
260
- from Modules.vits.models import VitsModel, VitsTokenizer
261
 
262
  TTS_LANGUAGES = {}
263
  # with open('_d.csv', 'w') as f2:
@@ -266,96 +276,54 @@ with open(f"Utils/all_langs.csv") as f:
266
  iso, name = line.split(",", 1)
267
  TTS_LANGUAGES[iso.strip()] = name.strip()
268
  # f2.write(iso + ',' + name.replace("a S","")+'\n')
269
- # =============================================================================================
270
- # R O M A N I A N N U M E R A L S
271
- # =============================================================================================
272
-
273
- def _ro_number(number_str):
274
- # Function to convert numbers to their phonetic form in Romanian
275
- # Check if the number is negative
276
- negative = number_str.startswith('-')
277
- if negative:
278
- number_str = number_str[1:] # Remove the minus sign for now
279
-
280
- # Handle floating point numbers by splitting into integer and decimal parts
281
- if '.' in number_str:
282
- integer_part, decimal_part = number_str.split('.')
283
- integer_words = num2words(integer_part, lang='ro')
284
- decimal_words = ' '.join([num2words(digit, lang='ro') for digit in decimal_part])
285
- result = f"{integer_words} virgulă {decimal_words}"
286
- else:
287
- result = num2words(number_str, lang='ro')
288
-
289
- # Add 'minus' if the number is negative
290
- if negative:
291
- result = "minus " + result
292
-
293
- return result
294
-
295
- def romanian_num2str(input_string):
296
- # Function to convert a string with numbers to phonetic representation in Romanian
297
- # Regex pattern to identify numbers in the string (including negative numbers, decimals)
298
- pattern = r'-?\d+(\.\d+)?'
299
-
300
- def replace_with_phoneme(match):
301
- # Extract the matched number and convert it to phonetic representation
302
- number_str = match.group()
303
- return _ro_number(number_str)
304
-
305
- # Use regex to find all numbers in the input string and replace them with their phonetic form
306
- return re.sub(pattern, replace_with_phoneme, input_string)
307
 
308
 
309
  # ==============================================================================================
310
- # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
311
  # ==============================================================================================
312
 
313
  PHONEME_MAP = {
314
- 'služ' : 'sloooozz', # 'službeno'
315
- 'suver': 'siuveeerra', # 'suverena'
316
- 'država': 'dirrezav', # 'država'
317
- 'iči': 'ici', # 'Graniči'
318
- 's ': 'se', # a s with space
319
- 'q': 'ku',
320
- 'w': 'aou',
321
- 'z': 's',
322
- "š": "s",
323
- 'th': 'ta',
324
- 'v': 'vv',
325
- # "ć": "č",
326
- # "đ": "ď",
327
- # "lj": "ľ",
328
- # "nj": "ň",
329
- "ž": "z",
330
- # "c": "č"
331
- }
332
-
333
- # ALLOWED_PHONEMES = set("šč_bďph`-3žt 'ľzj5yuoóx1vfnaiedt́sṁkň2rčlg")
334
-
335
- def number_to_phonemes(match):
336
- number = int(match.group())
337
- words = num2words(number, lang='sr')
338
- return fix_phones(words.lower())
339
- # return words
340
 
341
  def fix_phones(text):
342
  for src, target in PHONEME_MAP.items():
343
  text = text.replace(src, target)
344
  # text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
345
  # text = re.sub(r'\s+', '_ _', text) # almost proper pausing
346
-
347
  return text.replace(',', '_ _').replace('.', '_ _')
348
 
 
349
  def has_cyrillic(text):
350
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
351
  return bool(re.search('[\u0400-\u04FF]', text))
352
 
353
- def foreign(text=None, # split sentences here so we can prepend a txt for german to each sentence to
 
354
  # fall on the male voice (Sink attn)
355
  lang='romanian',
356
  speed=None):
357
 
358
- lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
 
359
 
360
  # https://huggingface.co/spaces/mms-meta/MMS
361
 
@@ -367,11 +335,13 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
367
 
368
  if has_cyrillic(text): # check 0-th sentence if is cyrillic
369
 
370
- lang_code = 'rmc-script_cyrillic' # romani carpathian (also has latin / cyrillic Vlax)
 
371
 
372
  else:
373
 
374
- lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
 
375
 
376
  elif 'rom' in lang:
377
 
@@ -392,12 +362,12 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
392
 
393
  lang_code = lang.split()[0].strip()
394
 
395
- # Load VITS
396
 
397
  net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
398
  tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
399
 
400
- # CALL MMS TTS VITS
401
 
402
  total_audio = []
403
 
@@ -406,41 +376,49 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
406
  if lang_code == 'deu':
407
  # split very long sentences (>500 phonemes) - StyleTTS2 crashes; even 400 phonemes can OOM on cuda:4
408
  # however, prosody is nicer when the MMS TTS input is not split
409
- text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)] # prepend txt snippet
410
- # assert that it chooses unique voice
 
 
411
  else:
412
- text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)] # allow longer non split text
413
- # for non deu MMS TTS lang.
 
 
414
 
415
  for _t in text:
416
 
417
  _t = _t.lower()
418
 
419
  if lang_code == 'rmc-script_latin':
420
 
421
- _t = re.sub(r'\d+', number_to_phonemes, _t)
422
- _t = fix_phones(_t)
423
 
424
  elif lang_code == 'ron':
425
 
426
- # numerals
427
- _t = romanian_num2str(_t)
428
-
429
  # tone
430
  _t = _t.replace("ţ", "ț"
431
- ).replace('ț','ts').replace('î', 'u').replace('â','a').replace('ş','s')
432
 
433
  # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
434
- inputs = tokenizer(_t, return_tensors="pt") # input_ids / attention_mask
 
435
 
436
  with torch.no_grad():
437
 
438
  # MMS
439
 
440
  x = net_g(input_ids=inputs.input_ids.to(device),
441
- attention_mask=inputs.attention_mask.to(device),
442
- speed = speed + .44 * np.random.rand() # variable speed for different sentence
443
- )[0, :]
444
 
445
  # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()
446
 
@@ -455,5 +433,3 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
455
  # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
456
 
457
  return x # 16kHz - only StyleTTS2 output needs resampling from 24kHz -> 16kHz
458
-
459
-
 
1
+ from Modules.vits.models import VitsModel, VitsTokenizer
2
+ import sys
3
+ import tempfile
4
+ import re
5
+ import os
6
+ from num2words import num2words
7
+ from collections import OrderedDict
8
+ from Modules.hifigan import Decoder
9
+ from Utils.PLBERT.util import load_plbert
10
+ import phonemizer
11
  import torch
12
  from cached_path import cached_path
13
  # import nltk
 
19
  import librosa
20
  from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
21
  from nltk.tokenize import word_tokenize
22
+ from Utils.text_utils import transliterate_number
23
  import textwrap
24
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
25
 
 
35
  for i in range(len((symbols))):
36
  dicts[symbols[i]] = i
37
 
38
+
39
+
40
+
41
+
42
  class TextCleaner:
43
  def __init__(self, dummy=None):
44
  self.word_index_dictionary = dicts
45
  print(len(dicts))
46
+
47
  def __call__(self, text):
48
  indexes = []
49
  for char in text:
 
54
  return indexes
55
 
56
 
 
57
  textclenaer = TextCleaner()
58
 
59
 
 
61
  n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
62
  mean, std = -4, 4
63
 
64
+
65
  def alpha_num(f):
66
  f = re.sub(' +', ' ', f) # collapse repeated spaces
67
  f = re.sub(r'[^A-Z a-z0-9 ]+', '', f) # drop non-alphanumeric characters
68
  return f
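For example (illustrative input; accented letters and punctuation are dropped, repeated spaces collapse):
# sketch
assert alpha_num('Héllo,  wörld 42!') == 'Hllo wrld 42'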
69
 
70
+
71
  def preprocess(wave):
72
  wave_tensor = torch.from_numpy(wave).float()
73
  mel_tensor = to_mel(wave_tensor)
74
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
75
  return mel_tensor
76
 
77
+
78
  def compute_style(path):
79
  wave, sr = librosa.load(path, sr=24000)
80
  audio, index = librosa.effects.trim(wave, top_db=30)
 
85
  with torch.no_grad():
86
  ref_s = style_encoder(mel_tensor.unsqueeze(1))
87
  ref_p = predictor_encoder(mel_tensor.unsqueeze(1)) # [bs, 11, 1, 128]
88
+
89
  s = torch.cat([ref_s, ref_p], dim=3) # [bs, 11, 1, 256]
90
+
91
  s = s[:, :, 0, :].transpose(1, 2) # [1, 256, 11]
92
+ return s # [1, 256, 11]
93
+
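A usage sketch (the wav path is a placeholder):
# sketch, not part of this commit
ref = compute_style('speaker.wav')  # concatenated acoustic + prosodic style, [1, 256, 11]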
94
 
95
  device = 'cpu'
96
  if torch.cuda.is_available():
97
  device = 'cuda'
98
 
99
+ global_phonemizer = phonemizer.backend.EspeakBackend(
100
+ language='en-us', preserve_punctuation=True, with_stress=True)
101
  # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
102
 
103
 
 
107
  F0_path = args['F0_path']
108
  pitch_extractor = load_F0_models(F0_path).eval().to(device)
109
 
 
 
110
 
111
  bert = load_plbert(args['PLBERT_dir']).eval().to(device)
112
 
113
+ decoder = Decoder(dim_in=512,
114
+ style_dim=128,
115
  dim_out=80, # n_mels
116
+ resblock_kernel_sizes=[3, 7, 11],
117
+ upsample_rates=[10, 5, 3, 2],
118
  upsample_initial_channel=512,
119
  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
120
  upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)
121
 
122
+ text_encoder = TextEncoder(channels=512,
123
+ kernel_size=5,
124
+ depth=3, # args['model_params']['n_layer'],
125
+ n_symbols=178, # args['model_params']['n_token']
126
  ).eval().to(device)
127
 
128
+ predictor = ProsodyPredictor(style_dim=128,
129
+ d_hid=512,
130
  nlayers=3, # OFFICIAL config.nlayers=5;
131
+ max_dur=50,
132
  dropout=.2).eval().to(device)
133
 
134
+ style_encoder = StyleEncoder(dim_in=64,
135
+ style_dim=128,
136
+ max_conv_dim=512).eval().to(device) # acoustic style encoder
137
+ predictor_encoder = StyleEncoder(dim_in=64,
138
+ style_dim=128,
139
+ max_conv_dim=512).eval().to(device) # prosodic style encoder
140
  bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
141
 
142
  # params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
143
+ params_whole = torch.load(str(cached_path(
144
+ "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
145
  params = params_whole['net']
146
 
 
147
 
148
  def _del_prefix(d):
149
  # strip the "module." prefix (DataParallel checkpoint keys)
 
152
  out[k[7:]] = v
153
  return out
154
 
155
+
156
+ bert.load_state_dict(_del_prefix(params['bert']), strict=True)
157
  bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
158
+ # extra LSTM layers that are not in the checkpoint (nlayers > 3) add slowness to the voice
159
+ predictor.load_state_dict(_del_prefix(params['predictor']), strict=True)
160
+ decoder.load_state_dict(_del_prefix(params['decoder']), strict=True)
161
  text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
162
+ predictor_encoder.load_state_dict(_del_prefix(
163
+ params['predictor_encoder']), strict=True)
164
+ style_encoder.load_state_dict(_del_prefix(
165
+ params['style_encoder']), strict=True)
166
+ pitch_extractor.load_state_dict(_del_prefix(
167
+ params['pitch_extractor']), strict=True)
168
 
169
  # def _shift(x):
170
  # # [bs, samples] shift circular each batch elem of sound
 
174
  # x[i, ...] = torch.roll(batch_elem, offset, dims=1) # batch_elem = [400000, ]
175
  # return x
176
 
177
+
178
  def inference(text,
179
  ref_s,
180
  use_gruut=False):
181
+
182
+ text = transliterate_number(text, lang='en').strip()
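+ # (assumed behavior) transliterate_number spells digits out, e.g. '42' -> 'forty-two',
+ # so the phonemizer below never sees raw numerals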
183
+
 
184
  ps = global_phonemizer.phonemize([text])
185
  # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
186
  ps = word_tokenize(ps[0])
 
194
 
195
  with torch.no_grad():
196
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
197
+
198
  hidden_states = text_encoder(tokens, input_lengths)
199
+
200
  bert_dur = bert(tokens, attention_mask=None)
201
  d_en = bert_encoder(bert_dur).transpose(-1, -2)
202
+ ref = ref_s[:, :128, :] # [bs, 128, 11]
203
  s = ref_s[:, 128:, :]
204
  d = predictor.text_encoder(d_en, s, input_lengths)
205
  d = d.transpose(1, 2)
206
  # -------------------------------- pred_aln_trg = clones bert frames as duration
207
+
208
  d = predictor.text_encoder(d_en,
209
+ s,
210
+ input_lengths)
211
 
212
  x, _ = predictor.lstm(d)
213
 
 
216
  duration = torch.sigmoid(duration).sum(axis=-1)
217
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
218
 
 
219
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
220
  c_frame = 0
221
  for i in range(pred_aln_trg.size(0)):
 
243
  N=N_pred,
244
  s=ref)
245
 
246
+ x = x.cpu().numpy()[0, 0, :-400] # drop the last 400 samples: weird pulse at the end of sentences
247
 
248
  # StyleTTS2 is 24kHz -> resample to the 16kHz of AudioGen / MMS
249
 
250
  if x.shape[0] > 10:
251
  x /= np.abs(x).max() + 1e-7
252
  x = audresample.resample(signal=x.astype(np.float32),
253
+ original_rate=24000,
254
+ target_rate=16000)[0, :] # reshapes (64,) -> (1,64)
255
 
256
  else:
257
  print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n', x.shape)
 
259
  return x
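Putting the English path together (a sketch; the file names are placeholders and soundfile is just one way to write the 16 kHz output):
# sketch, not part of this commit
import soundfile as sf
ref = compute_style('speaker.wav')            # placeholder reference wav
wav = inference('It costs 42 dollars.', ref)  # numerals are spelled out internally
sf.write('out.wav', wav, 16000)               # inference() returns 16 kHz audio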
260
 
261
 
 
 
262
  # ___________________________________________________________
263
 
264
  # https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
265
  # ___________________________________________________________
 
266
  # -*- coding: utf-8 -*-
 
267
  # Copyright (c) Facebook, Inc. and its affiliates.
268
  #
269
  # This source code is licensed under the MIT license found in the
270
  # LICENSE file in the root directory of this source tree.
 
 
 
 
 
 
 
271
 
272
  TTS_LANGUAGES = {}
273
  # with open('_d.csv', 'w') as f2:
 
276
  iso, name = line.split(",", 1)
277
  TTS_LANGUAGES[iso.strip()] = name.strip()
278
  # f2.write(iso + ',' + name.replace("a S","")+'\n')
279
 
280
 
281
  # ==============================================================================================
282
+ # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
283
  # ==============================================================================================
284
 
285
  PHONEME_MAP = {
286
+ 'služ': 'sloooozz', # 'službeno'
287
+ 'suver': 'siuveeerra', # 'suverena'
288
+ 'država': 'dirrezav', # 'država'
289
+ 'iči': 'ici', # 'Graniči'
290
+ 's ': 'se', # an 's' followed by a space
291
+ 'q': 'ku',
292
+ 'w': 'aou',
293
+ 'z': 's',
294
+ "š": "s",
295
+ 'th': 'ta',
296
+ 'v': 'vv',
297
+ # "ć": "č",
298
+ # "đ": "ď",
299
+ # "lj": "ľ",
300
+ # "nj": "ň",
301
+ "ž": "z",
302
+ # "c": "č"
303
+ }
304
+
305
 
306
  def fix_phones(text):
307
  for src, target in PHONEME_MAP.items():
308
  text = text.replace(src, target)
309
  # text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
310
  # text = re.sub(r'\s+', '_ _', text) # almost proper pausing
311
+
312
  return text.replace(',', '_ _').replace('.', '_ _')
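A concrete pass through the map (only the 'iči' rule fires here, then punctuation becomes pause markers):
# sketch
print(fix_phones('graniči, da.'))  # -> 'granici_ _ da_ _'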
313
 
314
+
315
  def has_cyrillic(text):
316
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
317
  return bool(re.search('[\u0400-\u04FF]', text))
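A quick sanity check (only the basic Cyrillic block U+0400-U+04FF is matched):
# sketch
assert has_cyrillic('Привет')
assert not has_cyrillic('Salut')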
318
 
319
+
320
+ def foreign(text=None, # split sentences here so we can prepend a text snippet to each German sentence,
321
  # making it fall on the male voice (attention sink)
322
  lang='romanian',
323
  speed=None):
324
 
325
+ # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
326
+ lang = lang.lower()
327
 
328
  # https://huggingface.co/spaces/mms-meta/MMS
329
 
 
335
 
336
  if has_cyrillic(text): # check 0-th sentence if is cyrillic
337
 
338
+ # romani carpathian (also has latin / cyrillic Vlax)
339
+ lang_code = 'rmc-script_cyrillic'
340
 
341
  else:
342
 
343
+ # romani carpathian (has also Vlax)
344
+ lang_code = 'rmc-script_latin'
345
 
346
  elif 'rom' in lang:
347
 
 
362
 
363
  lang_code = lang.split()[0].strip()
364
 
365
+ # load VITS
366
 
367
  net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
368
  tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
369
 
370
+
371
 
372
  total_audio = []
373
 
 
376
  if lang_code == 'deu':
377
  # split very long sentences (>500 phonemes) - StyleTTS2 crashes; even 400 phonemes can OOM on cuda:4
378
  # however, prosody is nicer when the MMS TTS input is not split
379
+ # prepend text snippet
380
+ text = [
381
+ sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]
382
+ # ensures a consistent (unique) voice is chosen
383
  else:
384
+ # allow longer non split text
385
+ text = [
386
+ sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]
387
+ # for non deu MMS TTS lang.
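Since break_long_words=0, textwrap.wrap only breaks at word boundaries, e.g.:
# sketch
chunks = textwrap.wrap('unu doi trei ' * 200, 640, break_long_words=0)
# every chunk is at most 640 chars and ends on a whole word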
388
 
389
  for _t in text:
390
 
391
  _t = _t.lower()
392
 
393
+ # better applied in api.py -> tts_multi_sentence, before switching between StyleTTS2 and MMS
394
+ print('\n\n\n\nBEFORE transliteration', _t, '\n\n\n\n\n')
395
+ _t = transliterate_number(_t, lang=lang_code)
396
+ print('AFTER numerals', _t, '\n____________________________________________')
397
+
398
+ # however, if we transliterate here, the demo also sees the transliteration
399
+
400
  if lang_code == 'rmc-script_latin':
401
 
402
+ _t = fix_phones(_t) # per-language phoneme replacements
 
403
 
404
  elif lang_code == 'ron':
405
 
406
  # tone
407
  _t = _t.replace("ţ", "ț"
408
+ ).replace('ț', 'ts').replace('î', 'u').replace('â', 'a').replace('ş', 's')
409
 
410
  # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
411
+ # input_ids / attention_mask
412
+ inputs = tokenizer(_t, return_tensors="pt")
413
 
414
  with torch.no_grad():
415
 
416
  # MMS
417
 
418
  x = net_g(input_ids=inputs.input_ids.to(device),
419
+ attention_mask=inputs.attention_mask.to(device),
420
+ speed=speed + .44 * np.random.rand() # variable speed for different sentence
421
+ )[0, :]
422
 
423
  # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()
424
 
 
433
  # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
434
 
435
  return x # 16kHz - only StyleTTS2 output needs resampling from 24kHz -> 16kHz
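And the foreign/MMS path end to end (a sketch; the language name follows Utils/all_langs.csv, and the patched Modules.vits VitsModel accepts the speed kwarg):
# sketch, not part of this commit
wav16 = foreign(text='Prețul este 42 de lei.', lang='romanian', speed=1.0)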