lang numerals

- Utils/text_utils.py  +118 -34
- api.py               +141 -132
- demo.py              +12 -24
- msinference.py       +122 -146
Utils/text_utils.py
CHANGED
@@ -2,6 +2,7 @@
 import re
 import codecs
 import textwrap
+from num2words import num2words
 # IPA Phonemizer: https://github.com/bootphon/phonemizer

 _pad = "$"
@@ -16,10 +17,12 @@ dicts = {}
 for i in range(len((symbols))):
     dicts[symbols[i]] = i

+
 class TextCleaner:
     def __init__(self, dummy=None):
         self.word_index_dictionary = dicts
         print(len(dicts))
+
     def __call__(self, text):
         indexes = []
         for char in text:
@@ -32,7 +35,7 @@ class TextCleaner:

 # == Sentence Splitter

-alphabets= "([A-Za-z])"
+alphabets = "([A-Za-z])"
 prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
 suffixes = "(Inc|Ltd|Jr|Sr|Co)"
 starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
@@ -42,7 +45,6 @@ digits = "([0-9])"
 multiple_dots = r'\.{2,}'

-
 def split_into_sentences(text):
     """
     Split the text into sentences.
@@ -59,54 +61,66 @@ def split_into_sentences(text):
     https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
     """
     text = " " + text + " "
-    text = text.replace("\n"," ")
-    text = re.sub(prefixes,"\\1<prd>",text)
-    text = re.sub(websites,"<prd>\\1",text)
-    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
+    text = text.replace("\n", " ")
+    text = re.sub(prefixes, "\\1<prd>", text)
+    text = re.sub(websites, "<prd>\\1", text)
+    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
+    text = re.sub(multiple_dots, lambda match: "<prd>" *
+                  len(match.group(0)) + "<stop>", text)
+    if "Ph.D" in text:
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+    text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
+    text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" +
+                  alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
+    text = re.sub(alphabets + "[.]" + alphabets +
+                  "[.]", "\\1<prd>\\2<prd>", text)
+    text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
+    text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
+    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
+    if "”" in text:
+        text = text.replace(".”", "”.")
+    if "\"" in text:
+        text = text.replace(".\"", "\".")
+    if "!" in text:
+        text = text.replace("!\"", "\"!")
+    if "?" in text:
+        text = text.replace("?\"", "\"?")
+    text = text.replace(".", ".<stop>")
+    text = text.replace("?", "?<stop>")
+    text = text.replace("!", "!<stop>")
+    text = text.replace("<prd>", ".")
     sentences = text.split("<stop>")
     sentences = [s.strip() for s in sentences]
+
     # Split Very long sentences >500 phoneme - StyleTTS2 crashes
     # -- even 400 phonemes sometimes OOM in cuda:4
+    sentences = [
+        sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]
+
+    # if sentences and not sentences[-1]:
     #     sentences = sentences[:-1]
     return sentences
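
A quick check of the splitter as now re-wrapped at 200 characters (a minimal sketch; assumes the repo root is on PYTHONPATH):

from Utils.text_utils import split_into_sentences

chunks = split_into_sentences("Dr. Smith met Mrs. Jones at 3.5 km. They talked!")
# "Dr.", "Mrs." and the decimal "3.5" are protected via <prd>, so this yields
# two chunks, each wrapped to <= 200 characters to stay within StyleTTS2's budget
print(chunks)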

+
 def store_ssml(text=None,
                voice=None):
     '''create ssml:
     text : list of sentences
     voice: https://github.com/MycroftAI/mimic3-voices
     '''
+    print('\n___________________________\n', len(text),
+          text[0], '\n___________________________________\n')
     _s = '<speak>'
     for short_text in text:

+        # 1.44) # 1.24 for bieber
+        rate = min(max(.87, len(short_text) / 76), 1.14)
+
         volume = int(74 * np.random.rand() + 24)
         # text = ('<speak>'
+        # THe other voice does not have volume
+        _s += f'<prosody volume=\'{volume}\'>'
         _s += f'<prosody rate=\'{rate}\'>'
         _s += f'<voice name=\'{voice}\'>'
         _s += '<s>'
@@ -116,7 +130,77 @@ def store_ssml(text=None,
     _s += '</prosody>'
     _s += '</prosody>'
     _s += '</speak>'
+    print(len(text), '\n\n\n\n\n\n\n', _s)
+
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
         f.write(_s)
+
+
+def transliterate_number(number_string, lang='en'):
+    """
+    Converts a number string to words in the specified language,
+    handling decimals, scientific notation, and preserving text
+    before and after the numeral.
+    """
+
+    if lang == 'rmc-script_latin':
+        lang = 'sr'
+        exponential_pronoun = ' puta deset na stepen od '
+        comma = ' tačka '
+    elif lang == 'ron':
+        lang = 'ro'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgulă '
+    elif lang == 'hun':
+        lang = 'hu'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgula '
+    elif lang == 'deu':
+        exponential_pronoun = ' mal zehn hoch '
+        comma = ' komma '
+    else:
+        lang = lang[:2]
+        exponential_pronoun = ' times ten to the power of '
+        comma = ' point '
+
+    def replace_number(match):
+        prefix = match.group(1) or ""
+        number_part = match.group(2)
+        suffix = match.group(5) or ""
+
+        try:
+            if 'e' in number_part.lower():
+                base, exponent = number_part.lower().split('e')
+                base = float(base)
+                exponent = int(exponent)
+                words = num2words(
+                    base, lang=lang) + exponential_pronoun + num2words(exponent, lang=lang)
+            elif '.' in number_part:
+                integer_part, decimal_part = number_part.split('.')
+                words = num2words(int(integer_part), lang=lang) + comma + " ".join(
+                    [num2words(int(digit), lang=lang) for digit in decimal_part])
+            else:
+                words = num2words(int(number_part), lang=lang)
+            return prefix + words + suffix
+        except ValueError:
+            return match.group(0)  # Return original if conversion fails
+
+    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
+    return re.sub(pattern, replace_number, number_string)
+
+
+def discard_leading_numeral(text):
+    """Discards a leading numeral (integer or float) from a string.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with the leading numeral removed, or the original string
+        if it doesn't start with a numeral.
+    """
+    match = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
+    if match:
+        return text[match.end():].lstrip()
+    else:
+        return text
api.py
CHANGED
@@ -42,35 +42,41 @@ def resize_with_white_padding(image):
         # Image is wider than the target, pad top and bottom
         new_w = target_w
         new_h = int(new_w / aspect_ratio)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_h = target_h - new_h
         top_padding = padding_h // 2
         bottom_padding = padding_h - top_padding
         padding = [(top_padding, bottom_padding), (0, 0)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     elif aspect_ratio < target_aspect_ratio:
         # Image is taller than the target, pad left and right
         new_h = target_h
         new_w = int(new_h * aspect_ratio)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_w = target_w - new_w
         left_padding = padding_w // 2
         right_padding = padding_w - left_padding
         padding = [(0, 0), (left_padding, right_padding)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     else:
         # Aspect ratio matches the target, just resize
+        padded_image = cv2.resize(
+            image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)

+    return padded_image  # image 2 speech
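
The white padding keeps any aspect ratio inside the full-HD canvas that the server's .mp4 path expects (1920x1080 per the comment further down); a hedged sketch, with a hypothetical input path:

img = cv2.imread('poster.png')           # e.g. an 800x1200 portrait image
canvas = resize_with_white_padding(img)  # letterboxed with 255-valued bars
print(canvas.shape)                      # (1080, 1920, 3)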

 def _shorten(filename):
-    return filename.replace("/","")[-6:]
+    return filename.replace("/", "")[-6:]
+

 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
@@ -104,20 +110,23 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     # return the resized image
     return resized

+
+def overlay(x, soundscape=None):
     if soundscape is not None:
         # AudioGen sound is suffice to be ~10s long
         background = sound_generator.generate(soundscape,
+                                              # sound duration = TTS dur
+                                              duration=len(x)/16000 + .74,
+                                              ).detach().cpu().numpy()  # bs, 11400 @.74s

         # len_soundscape = len(background)

         # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0

         # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
-    #background /= np.abs(background).max() + 1e-7 # amplify speech to full [-1,1]
+        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
+        # background will be longer by xtra .74s
+        x = .47 * x + .46 * background[:len(x)]
     return x  # TTS / AudioGen @ 16kHz
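
overlay() mixes at fixed gains rather than normalising the sum; a minimal sketch of the blend, assuming `sound_generator` is the AudioGen instance initialised elsewhere in api.py:

speech = np.zeros(5 * 16000, dtype=np.float32)         # 5 s of (silent) TTS at 16 kHz
mix = overlay(speech, soundscape='rain on a tin roof')
# AudioGen renders ~5.74 s; the extra .74 s tail is dropped by background[:len(x)]
soundfile.write('mix.wav', mix, 16000)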

@@ -134,77 +143,79 @@ def tts_multi_sentence(precomputed_style_vector=None,
     voice : string or None (falls to styleTTS)
     soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
     '''
+
     # StyleTTS2 - English
+
     if precomputed_style_vector is not None:
         x = []
         if not isinstance(text, list):
             text = split_into_sentences(text)  # Avoid OOM in StyleTTS2
         for _sentence in text:
+
             # StyleTTS2 - pronounciation Fx
+
+            # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            _sentence = _sentence.lower()
             if 'vctk_low#p326' in voice:
                 # fix sounding of sleepy AAABS TRAACT
+                _sentence = _sentence.replace(
+                    'abstract', 'ahbstract')  # 'ahstract'
             x.append(msinference.inference(_sentence,
                                            precomputed_style_vector)
                      )
         x = np.concatenate(x)
+
     # Fallback - MMS TTS - Non-English
+
     else:
+
         # dont split foreign sentences: Avoids speaker change issue
         x = msinference.foreign(text=text,
                                 lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                 speed=speed)  # normalisation externally
+
     # volume

+    x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
+
+    return overlay(x, soundscape=soundscape)
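
Both branches return peak-normalised 16 kHz audio, so a caller only chooses between a style vector (StyleTTS2) and a language string (MMS). A hedged call sketch; the reference wav path is hypothetical and defaults are assumed for unlisted parameters:

vec = msinference.compute_style('assets/wavs/style_vector/en_US_vctk_p239.wav')  # hypothetical
wav_en = tts_multi_sentence(precomputed_style_vector=vec,
                            text='One sentence. Another sentence.',
                            voice='en_US/vctk_low#p239')
wav_ro = tts_multi_sentence(text='Buna ziua.', voice='romanian', speed=1.1)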

 # voices = {}
 # import phonemizer
 # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
 app = Flask(__name__)

+
 @app.route("/", methods=['GET', 'POST', 'PUT'])
 def serve_wav():
     # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
     # object-into-a-representation-suitable-for-mongodb
     r = request.form.to_dict(flat=False)
+
     # Physically Save Client Files
     for filename, obj in request.files.items():
         obj.save(f'{CACHE_DIR}{_shorten(filename)}')
+
+    print('Saved all files on Server Side\n\n')

     args = SimpleNamespace(
+        # crop last letters from original filename & use as tmp
+        text=None if r.get('text') is None else CACHE_DIR +
+        _shorten(r.get('text')[0]),
+        video=None if r.get('video') is None else CACHE_DIR +
+        _shorten(r.get('video')[0]),
+        image=None if r.get('image') is None else CACHE_DIR +
+        _shorten(r.get('image')[0]),
+        native=None if r.get('native') is None else CACHE_DIR +
+        _shorten(r.get('native')[0]),
+        affective=r.get('affective')[0],
+        voice=r.get('voice')[0],
+        speed=float(r.get('speed')[0]),  # For Non-English MMS TTS
+        soundscape=r.get('soundscape')[0] if r.get(
+            'soundscape') is not None else None,
+    )
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')

     print(args, 'ENTER Script')
     do_video_dub = True if args.text.endswith('.srt') else False
@@ -213,10 +224,12 @@ def serve_wav():
     AUDIO_TRACK = '_audio_track.wav'

     if do_video_dub:
+        print(
+            '==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
         with open(args.text, "r") as f:
             s = f.read()
+        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
+                for j in srt.parse(s)]
         assert args.video is not None
         native_audio_file = '_tmp.wav'
         subprocess.run(
@@ -231,36 +244,38 @@ def serve_wav():
             "-vn",
             native_audio_file])
         x_native, _ = soundfile.read(native_audio_file)  # reads mp3
+
         # stereo in video
         if x_native.ndim > 1:
             x_native = x_native[:, 0]  # stereo
+
         # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
     else:
         with open(args.text, 'r') as f:
             text = ''.join(f)
+        # delete spaces / split in list in tts_multi_sentence()
+        text = re.sub(' +', ' ', text)
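
For reference, the form fields consumed above can be exercised with a plain multipart POST; a hypothetical client sketch (host/port assume Flask defaults, and the server keys saved uploads by their posted field name):

import requests

payload = {'text': 'subtitles.srt', 'video': 'clip.mp4',
           'voice': 'en_US/vctk_low#p326', 'affective': 'True', 'speed': '1.0'}
files = {'subtitles.srt': open('subtitles.srt', 'rb'),
         'clip.mp4': open('clip.mp4', 'rb')}
resp = requests.post('http://0.0.0.0:5000/', data=payload, files=files)
open('srv_result.mp4', 'wb').write(resp.content)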
+
     # == STYLE VECTOR ==

     precomputed_style_vector = None
+
     if args.native:  # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
         except soundfile.LibsndfileError:  # Fallback - internal voice
+            print('\n Could not voice clone audio:', args.native,
+                  'fallback to video or Internal TTS voice.\n')
         if do_video_dub:  # Clone voice via Video
             native_audio_file = args.video.replace('.', '').replace('/', '')
             native_audio_file += '__native_audio_track.wav'
             soundfile.write('tgt_spk.wav',
+                            np.concatenate([
+                                x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
             precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

     # NOTE: style vector is normally None here - except if --native arg was passed
+
     # Native English Accent TTS
     if precomputed_style_vector is None:
         if 'en_US' in args.voice or 'en_UK' in args.voice:
@@ -272,53 +287,52 @@ def serve_wav():
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
         # Non-Native English Accent TTS
+        elif '_' in args.voice:
             precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                 '/', '_').replace('#', '_').replace(
+                'cmu-arctic', 'cmu_arctic').replace(
+                '_low', '') + '.wav')
         # Foreign Lang
         else:
             print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')
+
     # NOTE : precomputed_style_vector is still None if MMS TTS
+
     # == SILENT VIDEO ==

     if args.video is not None:
         # banner - precomput @ 1920 pixels
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
+        font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
+        fontScale = 2
+        fontColor = (255, 255, 255)
+        thickness = 4
+        lineType = 2
         cv2.putText(frame_tts, 'TTS',
+                    bottomLeftCornerOfText,
+                    font,
+                    fontScale,
+                    fontColor,
+                    thickness,
+                    lineType)
         # cv2.imshow('i', frame_tts); cv2.waitKey(); cv2.destroyAllWindows()
         # ====================================== NATIVE VOICE
         frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
+        font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (101, 74)  # w,h
+        fontScale = 2
+        fontColor = (255, 255, 255)
+        thickness = 4
+        lineType = 1000
         cv2.putText(frame_orig, 'ORIGINAL VOICE',
+                    bottomLeftCornerOfText,
+                    font,
+                    fontScale,
+                    fontColor,
+                    thickness,
+                    lineType)
+
         print(f'\n______________________________\n'
               f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
               f'\n______________________________\n')
@@ -336,67 +350,63 @@ def serve_wav():
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
+
         # GET 1st FRAME to OBTAIN frame RESOLUTION
        h, w, _ = vf.get_frame(0).shape
         frame_tts = _resize(frame_tts, width=w)
         frame_orig = _resize(frame_orig, width=w)
         h, w, _ = frame_orig.shape
+
         try:
+
             # inpaint banner to say if native voice
             num = x_native.shape[0]
+            # fade heaviside
+            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))
+
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voic
                 '''
+
                 im = np.copy(get_frame(t))  # pic

                 ix = int(t * 16000)  # ix may overflow the is_tts.shape
                 if ix < num:
                     if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
                         frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
+                        # then is considered a "local variable" thus the "outer var"
+                        # is not observed by python raising referenced before assign
                     else:
                         frame = frame_orig
                 # For the ix that is out of bounds of num assume frame_tts
                 else:
                     frame = frame_tts
+
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)

                 offset_h = 24
+
+                print(
+                    f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
+
+                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
+
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frane_ttts], 0)
+
         except UnboundLocalError:  # args.native == False
+
             def inpaint_banner(get_frame, t):
+
                 im = np.copy(get_frame(t))
+
                 h, w, _ = frame_tts.shape  # frame = banner
                 if w != im.shape[1]:  # rsz banners to fit video w
                     local_frame = _resize(frame_tts, width=im.shape[1])
                 offset_h = 24
+                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
                                                     + .6 * local_frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
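
The is_tts mask driving the banner choice is a smoothed step, not a hard cut; a tiny numeric sketch:

import numpy as np

num = 8
is_tts = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, num) + 9.4))
print(is_tts.round(3))  # ~[0.008 1. 1. 1. 1. 1. 1. 1.] - brief native lead-in, then TTS banner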
@@ -405,9 +415,9 @@ def serve_wav():
     # ==== TTS .srt ====

     if do_video_dub:
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
         subtitles = text
+        MAX_LEN = int(subtitles[-1][2] + 17) * 16000
         # 17 extra seconds fail-safe for long-last-segment
         print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
         pieces = []
@@ -423,10 +433,12 @@ def serve_wav():
         # x = audresample.resample(x.astype(np.float32), 24000, 22050)  # reshapes (64,) -> (1,64)
         # PAD SHORTEST of TTS / NATIVE
         if len(x_native) > len(total):
+            total = np.pad(
+                total, (0, max(0, x_native.shape[0] - total.shape[0])))

         else:  # pad native to len of is_tts & total
+            x_native = np.pad(
+                x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
         # print(total.shape, x_native.shape, 'PADDED TRACKS')
         soundfile.write(AUDIO_TRACK,
                         # (is_tts * total + (1-is_tts) * x_native)[:, None],
@@ -435,25 +447,25 @@ def serve_wav():
     else:  # Video from plain (.txt)
         OUT_FILE = 'tmp.mp4'
         x = tts_multi_sentence(text=text,
+                               precomputed_style_vector=precomputed_style_vector,
+                               voice=args.voice,
+                               soundscape=args.soundscape,
+                               speed=args.speed)
         soundfile.write(AUDIO_TRACK, x, 16000)

     # IMAGE 2 SPEECH

     if args.image is not None:
+
         # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
+
+        STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
         cv2.imwrite(
             STATIC_FRAME,
             resize_with_white_padding(cv2.imread(args.image)
                                       ))
+
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'

         # SILENT CLIP

@@ -486,22 +498,19 @@ def serve_wav():
             CACHE_DIR + OUT_FILE])

         print(f'\noutput video is saved as {OUT_FILE}')
+
     else:
+
         # Fallback: No image nor video provided - do only tts
         x = tts_multi_sentence(text=text,
+                               precomputed_style_vector=precomputed_style_vector,
                                voice=args.voice,
                                soundscape=args.soundscape,
                                speed=args.speed)
         OUT_FILE = 'tmp.wav'
         soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)

+    # audios = [msinference.inference(text,
     #                                 msinference.compute_style(f'voices/{voice}.wav'))]
     # # for t in [text]:
     #     output_buffer = io.BytesIO()
@@ -511,8 +520,7 @@ def serve_wav():
     # https://stackoverflow.com/questions/67591467/
     # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
     # time.sleep(4)
+
     # send server's output as default file -> srv_result.xx
     print(f'\n=SERVER saved as {OUT_FILE=}\n')
     response = send_from_directory(CACHE_DIR, path=OUT_FILE)
@@ -520,6 +528,7 @@ def serve_wav():
     print('________________\n ? \n_______________')
     return response

+
 if __name__ == "__main__":
     app.run(host="0.0.0.0")

@@ -546,4 +555,4 @@
 # f'fusion.mp4',  # save to correct location is handled in client
 # ])
 #
+# ffmpeg -f concat -i mylist.txt -c copy output.mp4
demo.py
CHANGED
@@ -2,26 +2,14 @@ import numpy as np
 import soundfile
 import msinference
 from audiocraft.builders import AudioGen
-
-
-
-def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
-              'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
-              'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
-              voice='romanian', #'af_ZA_google-nwu_1919', # 'serbian', 'en_US/vctk_low#p276', 'isl',
+
+def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
+              voice='en_US/vctk_low#p326', #'en_US/vctk_low#p276', # 'deu', 'af_ZA_google-nwu_1919', 'serbian', 'isl',
               speed=1.14,
-              affect = True, # False = higher clarity
+              affect = True, # False = higher clarity voice
               soundscape = 'dogs barg in dungeons n dragons'
               ):
-    '''
+    '''16 KHz

     voice : 'en_US/vctk_low#p276' # Native English voices -> https://audeering.github.io/shift/

@@ -37,8 +25,8 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # StyleTTS2 - find voice from folder

     if ('en_US/' in voice) or ('en_UK/' in voice):
+        a = '' if affect else '_v2'
+        style_vector = msinference.compute_style('assets/wavs/style_vector' + a + '/' + voice.replace(
             '/', '_').replace('#', '_').replace(
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
@@ -58,7 +46,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
             style_vector)


-    # Fallback - MMS TTS - Non-English voice/
+    # Fallback - MMS TTS - Non-English voice / langs

     else:
         x = msinference.foreign(text=text,
@@ -68,13 +56,13 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # volume

     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
+
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
         background = sound_gen.generate(soundscape,
+                                        duration=len(x)/16000 + .74,  # sound duration in seconds
+                                        ).detach().cpu().numpy()
         x = .5 * x + .47 * background[:len(x)]
     return x

+soundfile.write(f'demo.wav', tts_entry(), 16000)
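
Running `python demo.py` now writes demo.wav at 16 kHz with the p326 default. To hear the MMS fallback instead, any non-English voice string routes through msinference.foreign(); a hedged sketch, swapping the arguments in the closing call of demo.py:

soundfile.write('demo_ro.wav',
                tts_entry(text='Numarul 3 este norocos.',
                          voice='romanian',      # not en_US/en_UK -> MMS branch
                          soundscape=None),      # skip the AudioGen mix
                16000)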
msinference.py
CHANGED
@@ -1,3 +1,13 @@
+from Modules.vits.models import VitsModel, VitsTokenizer
+import sys
+import tempfile
+import re
+import os
+from num2words import num2words
+from collections import OrderedDict
+from Modules.hifigan import Decoder
+from Utils.PLBERT.util import load_plbert
+import phonemizer
 import torch
 from cached_path import cached_path
 # import nltk
@@ -9,6 +19,7 @@ import torchaudio
 import librosa
 from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
 from nltk.tokenize import word_tokenize
+from Utils.text_utils import transliterate_number
 import textwrap
 # IPA Phonemizer: https://github.com/bootphon/phonemizer

@@ -24,10 +35,15 @@ dicts = {}
 for i in range(len((symbols))):
     dicts[symbols[i]] = i

+
+
+
+
 class TextCleaner:
     def __init__(self, dummy=None):
         self.word_index_dictionary = dicts
         print(len(dicts))
+
     def __call__(self, text):
         indexes = []
         for char in text:
@@ -38,7 +54,6 @@ class TextCleaner:
     return indexes

-
 textclenaer = TextCleaner()

@@ -46,17 +61,20 @@ to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 mean, std = -4, 4

+
 def alpha_num(f):
     f = re.sub(' +', ' ', f)  # delete spaces
     f = re.sub(r'[^A-Z a-z0-9 ]+', '', f)  # del non alpha num
     return f

+
 def preprocess(wave):
     wave_tensor = torch.from_numpy(wave).float()
     mel_tensor = to_mel(wave_tensor)
     mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
     return mel_tensor

+
 def compute_style(path):
     wave, sr = librosa.load(path, sr=24000)
     audio, index = librosa.effects.trim(wave, top_db=30)
@@ -67,18 +85,19 @@ def compute_style(path):
     with torch.no_grad():
         ref_s = style_encoder(mel_tensor.unsqueeze(1))
         ref_p = predictor_encoder(mel_tensor.unsqueeze(1))  # [bs, 11, 1, 128]
+
         s = torch.cat([ref_s, ref_p], dim=3)  # [bs, 11, 1, 256]
+
         s = s[:, :, 0, :].transpose(1, 2)  # [1, 128, 11]
-    return s# [1, 128, 11]
+    return s  # [1, 128, 11]
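
A quick shape check of the style reference (a hedged sketch; the wav path is hypothetical - note that concatenating two 128-dim references yields 256 channels, even though the trailing comment still says 128):

s = compute_style('assets/wavs/style_vector/en_US_m-ailabs_mary_ann.wav')  # hypothetical path
print(s.shape)  # torch.Size([1, 256, 11]) - acoustic + prosodic reference, one column per mel window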

 device = 'cpu'
 if torch.cuda.is_available():
     device = 'cuda'

+global_phonemizer = phonemizer.backend.EspeakBackend(
+    language='en-us', preserve_punctuation=True, with_stress=True)
 # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
@@ -88,45 +107,43 @@ ASR_config = args['ASR_config']
 F0_path = args['F0_path']
 pitch_extractor = load_F0_models(F0_path).eval().to(device)

-from Utils.PLBERT.util import load_plbert
-from Modules.hifigan import Decoder

 bert = load_plbert(args['PLBERT_dir']).eval().to(device)

-decoder = Decoder(dim_in=512,
-                  style_dim=128,
                   dim_out=80,  # n_mels
-                  resblock_kernel_sizes
-                  upsample_rates
                   upsample_initial_channel=512,
                   resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                   upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)

-text_encoder = TextEncoder(channels=512,
-                           kernel_size=5,
-                           depth=3,
-                           n_symbols=178,
                            ).eval().to(device)

-predictor = ProsodyPredictor(style_dim=128,
-                             d_hid=512,
                              nlayers=3,  # OFFICIAL config.nlayers=5;
-                             max_dur=50,
                              dropout=.2).eval().to(device)

-style_encoder = StyleEncoder(dim_in=64,
-                             style_dim=128,
-                             max_conv_dim=512).eval().to(device)
-predictor_encoder = StyleEncoder(dim_in=64,
-                                 style_dim=128,
-                                 max_conv_dim=512).eval().to(device)
 bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)

 # params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
-params_whole = torch.load(str(cached_path(
 params = params_whole['net']

-from collections import OrderedDict

 def _del_prefix(d):
     # del ".module"
@@ -135,14 +152,19 @@ def _del_prefix(d):
         out[k[7:]] = v
     return out

 bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
 text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
-predictor_encoder.load_state_dict(_del_prefix(

 # def _shift(x):
 #     # [bs, samples] shift circular each batch elem of sound
@@ -152,13 +174,13 @@ pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=T
 #         x[i, ...] = torch.roll(batch_elem, offset, dims=1)  # batch_elem = [400000, ]
 #     return x

 def inference(text,
               ref_s,
               use_gruut=False):
-
-
-
-
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n')  # PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])
@@ -172,20 +194,20 @@ def inference(text,

     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
-
         hidden_states = text_encoder(tokens, input_lengths)
-
         bert_dur = bert(tokens, attention_mask=None)
         d_en = bert_encoder(bert_dur).transpose(-1, -2)
-        ref = ref_s[:, :128, :]
         s = ref_s[:, 128:, :]
         d = predictor.text_encoder(d_en, s, input_lengths)
         d = d.transpose(1, 2)
         # -------------------------------- pred_aln_trg = clones bert frames as duration
-
         d = predictor.text_encoder(d_en,
-
-

         x, _ = predictor.lstm(d)

@@ -194,7 +216,6 @@ def inference(text,
     duration = torch.sigmoid(duration).sum(axis=-1)
     pred_dur = torch.round(duration.squeeze()).clamp(min=1)

-
     pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
     c_frame = 0
     for i in range(pred_aln_trg.size(0)):
@@ -222,15 +243,15 @@ def inference(text,
                     N=N_pred,
                     s=ref)

-    x = x.cpu().numpy()[0, 0, :-400]

     # StyleTTS2 is 24kHz -> Resample to 16kHz ofAudioGen / MMS

     if x.shape[0] > 10:
         x /= np.abs(x).max() + 1e-7
         x = audresample.resample(signal=x.astype(np.float32),
-
-

     else:
         print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
@@ -238,26 +259,15 @@
     return x
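
_del_prefix exists because the checkpoint was saved from a DataParallel wrapper; a minimal check, plus the basic inference call (the reference wav path is hypothetical, and soundfile is assumed available):

ckpt = {'module.linear.weight': torch.zeros(2, 2)}
print(_del_prefix(ckpt).keys())  # dict_keys(['linear.weight']) - 'module.' (7 chars) stripped

ref = compute_style('speaker.wav')    # any 24 kHz-loadable reference recording
wav = inference('Hello there.', ref)  # numpy float32, already resampled to 16 kHz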
|
239 |
|
240 |
|
241 |
-
|
242 |
-
|
243 |
# ___________________________________________________________
|
244 |
|
245 |
# https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
|
246 |
# ___________________________________________________________
|
247 |
-
|
248 |
# -*- coding: utf-8 -*-
|
249 |
-
|
250 |
# Copyright (c) Facebook, Inc. and its affiliates.
|
251 |
#
|
252 |
# This source code is licensed under the MIT license found in the
|
253 |
# LICENSE file in the root directory of this source tree.
|
254 |
-
from num2words import num2words
|
255 |
-
import os
|
256 |
-
import re
|
257 |
-
import tempfile
|
258 |
-
import torch
|
259 |
-
import sys
|
260 |
-
from Modules.vits.models import VitsModel, VitsTokenizer
|
261 |
|
262 |
TTS_LANGUAGES = {}
|
263 |
# with open('_d.csv', 'w') as f2:
|
@@ -266,96 +276,54 @@ with open(f"Utils/all_langs.csv") as f:
|
|
266 |
iso, name = line.split(",", 1)
|
267 |
TTS_LANGUAGES[iso.strip()] = name.strip()
|
268 |
# f2.write(iso + ',' + name.replace("a S","")+'\n')
|
269 |
-
# =============================================================================================
|
270 |
-
# R O M A N I A N N U M E R A L S
|
271 |
-
# =============================================================================================
|
272 |
-
|
273 |
-
def _ro_number(number_str):
|
274 |
-
# Function to convert numbers to their phonetic form in Romanian
|
275 |
-
# Check if the number is negative
|
276 |
-
negative = number_str.startswith('-')
|
277 |
-
if negative:
|
278 |
-
number_str = number_str[1:] # Remove the minus sign for now
|
279 |
-
|
280 |
-
# Handle floating point numbers by splitting into integer and decimal parts
|
281 |
-
if '.' in number_str:
|
282 |
-
integer_part, decimal_part = number_str.split('.')
|
283 |
-
integer_words = num2words(integer_part, lang='ro')
|
284 |
-
decimal_words = ' '.join([num2words(digit, lang='ro') for digit in decimal_part])
|
285 |
-
result = f"{integer_words} virgulă {decimal_words}"
|
286 |
-
else:
|
287 |
-
result = num2words(number_str, lang='ro')
|
288 |
-
|
289 |
-
# Add 'minus' if the number is negative
|
290 |
-
if negative:
|
291 |
-
result = "minus " + result
|
292 |
-
|
293 |
-
return result
|
294 |
-
|
295 |
-
def romanian_num2str(input_string):
|
296 |
-
# Function to convert a string with numbers to phonetic representation in Romanian
|
297 |
-
# Regex pattern to identify numbers in the string (including negative numbers, decimals)
|
298 |
-
pattern = r'-?\d+(\.\d+)?'
|
299 |
-
|
300 |
-
def replace_with_phoneme(match):
|
301 |
-
# Extract the matched number and convert it to phonetic representation
|
302 |
-
number_str = match.group()
|
303 |
-
return _ro_number(number_str)
|
304 |
-
|
305 |
-
# Use regex to find all numbers in the input string and replace them with their phonetic form
|
306 |
-
return re.sub(pattern, replace_with_phoneme, input_string)
|
307 |
|
308 |
|
309 |
# ==============================================================================================
|
310 |
-
# LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
|
311 |
# ==============================================================================================
|
312 |
|
313 |
PHONEME_MAP = {
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
# ALLOWED_PHONEMES = set("šč_bďph`-3žt 'ľzj5yuoóx1vfnaiedt́sṁkň2rčlg")
|
334 |
-
|
335 |
-
def number_to_phonemes(match):
|
336 |
-
number = int(match.group())
|
337 |
-
words = num2words(number, lang='sr')
|
338 |
-
return fix_phones(words.lower())
|
339 |
-
# return words
|
340 |
|
341 |
def fix_phones(text):
|
342 |
for src, target in PHONEME_MAP.items():
|
343 |
text = text.replace(src, target)
|
344 |
# text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
|
345 |
# text = re.sub(r'\s+', '_ _', text) # almost proper pausing
|
346 |
-
|
347 |
return text.replace(',', '_ _').replace('.', '_ _')
|
348 |
|
|
|
349 |
def has_cyrillic(text):
|
350 |
# https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
|
351 |
return bool(re.search('[\u0400-\u04FF]', text))
|
352 |
|
353 |
-
|
|
|
354 |
# fall on the male voice (Sink attn)
|
355 |
lang='romanian',
|
356 |
speed=None):
|
357 |
|
358 |
-
|
|
|
359 |
|
360 |
# https://huggingface.co/spaces/mms-meta/MMS
|
361 |
|
@@ -367,11 +335,13 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
|
|
367 |
|
368 |
if has_cyrillic(text): # check 0-th sentence if is cyrillic
|
369 |
|
370 |
-
|
|
|
371 |
|
372 |
else:
|
373 |
|
374 |
-
|
|
|
375 |
|
376 |
elif 'rom' in lang:
|
377 |
|
@@ -392,12 +362,12 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
|
|
392 |
|
393 |
lang_code = lang.split()[0].strip()
|
394 |
|
395 |
-
#
|
396 |
|
397 |
net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
|
398 |
tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
|
399 |
|
400 |
-
|
401 |
|
402 |
total_audio = []
|
403 |
|
@@ -406,41 +376,49 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ
     if lang_code == 'deu':
         # Split very long sentences: >500 phonemes crash StyleTTS2 -- even 400 phonemes sometimes OOM on cuda:4.
         # However, prosody is nicer on non-split text for MMS TTS.
-
     else:
-
     for _t in text:
         _t = _t.lower()
         if lang_code == 'rmc-script_latin':
-            _t =
-            _t = fix_phones(_t)
         elif lang_code == 'ron':
-            # numerals
-            _t = romanian_num2str(_t)
-
             # tone
             _t = _t.replace("ţ", "ț"
-
         # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
-
         with torch.no_grad():
             # MMS
             x = net_g(input_ids=inputs.input_ids.to(device),
-
         # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()
@@ -455,5 +433,3 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ
     # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
     return x  # 16kHz - only StyleTTS2 needs resampling from 24kHz -> 16kHz
-
-
+from Modules.vits.models import VitsModel, VitsTokenizer
+import sys
+import tempfile
+import re
+import os
+from num2words import num2words
+from collections import OrderedDict
+from Modules.hifigan import Decoder
+from Utils.PLBERT.util import load_plbert
+import phonemizer
 import torch
 from cached_path import cached_path
 # import nltk
 import librosa
 from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
 from nltk.tokenize import word_tokenize
+from Utils.text_utils import transliterate_number
 import textwrap
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
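`transliterate_number` is the commit's new numeral helper, called below as `transliterate_number(text, lang=...)` with the MMS language codes. Its implementation lives in Utils/text_utils.py; a minimal sketch of such a dispatcher, where the iso-to-num2words mapping is an assumption:

    import re
    from num2words import num2words

    def transliterate_number_sketch(text, lang='en'):
        # map MMS ISO codes onto num2words codes; fall back to English
        iso2n2w = {'en': 'en', 'deu': 'de', 'ron': 'ro', 'hun': 'hu',
                   'srp': 'sr', 'rmc-script_latin': 'sr'}
        tgt = iso2n2w.get(lang, 'en')
        return re.sub(r'\d+', lambda m: num2words(int(m.group()), lang=tgt), text)

    # transliterate_number_sketch('am 3 mere', lang='ron')  ->  'am trei mere'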
 for i in range(len((symbols))):
     dicts[symbols[i]] = i


 class TextCleaner:
     def __init__(self, dummy=None):
         self.word_index_dictionary = dicts
         print(len(dicts))
+
     def __call__(self, text):
         indexes = []
         for char in text:

         return indexes


 textclenaer = TextCleaner()
                     n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 mean, std = -4, 4

+
 def alpha_num(f):
     f = re.sub(' +', ' ', f)               # collapse repeated spaces
     f = re.sub(r'[^A-Z a-z0-9 ]+', '', f)  # delete non-alphanumeric chars
     return f

+
 def preprocess(wave):
     wave_tensor = torch.from_numpy(wave).float()
     mel_tensor = to_mel(wave_tensor)
     mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
     return mel_tensor
+
 def compute_style(path):
     wave, sr = librosa.load(path, sr=24000)
     audio, index = librosa.effects.trim(wave, top_db=30)

     with torch.no_grad():
         ref_s = style_encoder(mel_tensor.unsqueeze(1))
         ref_p = predictor_encoder(mel_tensor.unsqueeze(1))  # [bs, 11, 1, 128]
+        s = torch.cat([ref_s, ref_p], dim=3)                # [bs, 11, 1, 256]
+        s = s[:, :, 0, :].transpose(1, 2)                   # [1, 128, 11]
+    return s  # [1, 128, 11]


 device = 'cpu'
 if torch.cuda.is_available():
     device = 'cuda'
+global_phonemizer = phonemizer.backend.EspeakBackend(
+    language='en-us', preserve_punctuation=True, with_stress=True)
 # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
 F0_path = args['F0_path']
 pitch_extractor = load_F0_models(F0_path).eval().to(device)

 bert = load_plbert(args['PLBERT_dir']).eval().to(device)

+decoder = Decoder(dim_in=512,
+                  style_dim=128,
                   dim_out=80,  # n_mels
+                  resblock_kernel_sizes=[3, 7, 11],
+                  upsample_rates=[10, 5, 3, 2],
                   upsample_initial_channel=512,
                   resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
                   upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)

+text_encoder = TextEncoder(channels=512,
+                           kernel_size=5,
+                           depth=3,        # args['model_params']['n_layer']
+                           n_symbols=178,  # args['model_params']['n_token']
                            ).eval().to(device)

+predictor = ProsodyPredictor(style_dim=128,
+                             d_hid=512,
                              nlayers=3,  # OFFICIAL config.nlayers=5
+                             max_dur=50,
                              dropout=.2).eval().to(device)

+style_encoder = StyleEncoder(dim_in=64,
+                             style_dim=128,
+                             max_conv_dim=512).eval().to(device)      # acoustic style encoder
+predictor_encoder = StyleEncoder(dim_in=64,
+                                 style_dim=128,
+                                 max_conv_dim=512).eval().to(device)  # prosodic style encoder
 bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)

 # params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
+params_whole = torch.load(str(cached_path(
+    "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 def _del_prefix(d):
     # strip the "module." prefix (7 chars, added by nn.DataParallel) from checkpoint keys

         out[k[7:]] = v
     return out
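The middle of `_del_prefix` is elided in this view; a sketch consistent with the visible lines and the `OrderedDict` import above:

    from collections import OrderedDict

    def _del_prefix(d):
        # drop the 'module.' prefix that nn.DataParallel prepends to every key
        out = OrderedDict()
        for k, v in d.items():
            out[k[7:]] = v  # 'module.decoder.weight' -> 'decoder.weight'
        return out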
+bert.load_state_dict(_del_prefix(params['bert']), strict=True)
 bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
+# extra non-checkpoint LSTM nlayers add slowness to the voice
+predictor.load_state_dict(_del_prefix(params['predictor']), strict=True)
+decoder.load_state_dict(_del_prefix(params['decoder']), strict=True)
 text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
+predictor_encoder.load_state_dict(_del_prefix(
+    params['predictor_encoder']), strict=True)
+style_encoder.load_state_dict(_del_prefix(
+    params['style_encoder']), strict=True)
+pitch_extractor.load_state_dict(_del_prefix(
+    params['pitch_extractor']), strict=True)

 # def _shift(x):
 #     # [bs, samples] circularly shift each batch element of the sound

 #         x[i, ...] = torch.roll(batch_elem, offset, dims=1)  # batch_elem = [400000, ]
 #     return x
+
 def inference(text,
               ref_s,
               use_gruut=False):
+
+    text = transliterate_number(text, lang='en').strip()
+
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n')  # PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])

     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
+
         hidden_states = text_encoder(tokens, input_lengths)
+
         bert_dur = bert(tokens, attention_mask=None)
         d_en = bert_encoder(bert_dur).transpose(-1, -2)
+        ref = ref_s[:, :128, :]  # [bs, 128, 11]
         s = ref_s[:, 128:, :]
         d = predictor.text_encoder(d_en, s, input_lengths)
         d = d.transpose(1, 2)
         # ----------------------------- pred_aln_trg = clones bert frames as duration
+        d = predictor.text_encoder(d_en,
+                                   s,
+                                   input_lengths)

         x, _ = predictor.lstm(d)

         duration = torch.sigmoid(duration).sum(axis=-1)
         pred_dur = torch.round(duration.squeeze()).clamp(min=1)

         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
         c_frame = 0
         for i in range(pred_aln_trg.size(0)):
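The body of this loop is elided in the diff view; from the visible lines it expands each token's predicted duration into a hard 0/1 alignment, roughly as follows (a sketch, not the verbatim code):

    for i in range(pred_aln_trg.size(0)):
        # mark pred_dur[i] consecutive frames as belonging to token i
        pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
        c_frame += int(pred_dur[i].data)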
                     N=N_pred,
                     s=ref)

+    x = x.cpu().numpy()[0, 0, :-400]  # crop a weird pulse at the end of sentences

     # StyleTTS2 is 24kHz -> resample to 16kHz of AudioGen / MMS

     if x.shape[0] > 10:
         x /= np.abs(x).max() + 1e-7
         x = audresample.resample(signal=x.astype(np.float32),
+                                 original_rate=24000,
+                                 target_rate=16000)[0, :]  # resample reshapes (64,) -> (1, 64)
     else:
         print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n', x.shape)

     return x
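For reference, the normalize-and-resample step in isolation; `audresample.resample` returns a (channels, samples) array, hence the trailing `[0, :]` (a sketch):

    import numpy as np
    import audresample

    def to_16k(x):
        # peak-normalize, then resample 24 kHz -> 16 kHz to match AudioGen / MMS
        x = x / (np.abs(x).max() + 1e-7)
        return audresample.resample(signal=x.astype(np.float32),
                                    original_rate=24000,
                                    target_rate=16000)[0, :]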
 # ___________________________________________________________
 # https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
 # ___________________________________________________________
 # -*- coding: utf-8 -*-
 # Copyright (c) Facebook, Inc. and its affiliates.
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 TTS_LANGUAGES = {}
 # with open('_d.csv', 'w') as f2:

         iso, name = line.split(",", 1)
         TTS_LANGUAGES[iso.strip()] = name.strip()
         # f2.write(iso + ',' + name.replace("a S", "") + '\n')
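The lines that open and iterate the language list are elided here; a sketch that matches the visible parsing, assuming the file is the `all_langs.csv` referenced in `foreign()` below:

    TTS_LANGUAGES = {}
    with open('Utils/all_langs.csv') as f:  # path assumed; one 'iso,name' pair per line
        for line in f:
            iso, name = line.split(',', 1)
            TTS_LANGUAGES[iso.strip()] = name.strip()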
 # ==============================================================================================
+# LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
 # ==============================================================================================

 PHONEME_MAP = {
+    'služ': 'sloooozz',     # 'službeno'
+    'suver': 'siuveeerra',  # 'suverena'
+    'država': 'dirrezav',   # 'država'
+    'iči': 'ici',           # 'Graniči'
+    's ': 'se',             # an 's' followed by a space
+    'q': 'ku',
+    'w': 'aou',
+    'z': 's',
+    "š": "s",
+    'th': 'ta',
+    'v': 'vv',
+    # "ć": "č",
+    # "đ": "ď",
+    # "lj": "ľ",
+    # "nj": "ň",
+    "ž": "z",
+    # "c": "č"
+}
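Note that `fix_phones` below applies these rules in dict insertion order, so multi-character keys such as 'služ' and 'država' (both containing 'ž') must come before the single-character 'ž' -> 'z' rule, or they would never match; later rules also rewrite earlier outputs:

    text = 'služba'
    for src, target in PHONEME_MAP.items():
        text = text.replace(src, target)
    print(text)  # 'sloooossba': 'služ' fires first, then 'z' -> 's' rewrites the 'zz'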
 def fix_phones(text):
     for src, target in PHONEME_MAP.items():
         text = text.replace(src, target)
     # text = re.sub(r'\s+', '` `', text)  # .strip() .lower()
     # text = re.sub(r'\s+', '_ _', text)  # almost proper pausing
+
     return text.replace(',', '_ _').replace('.', '_ _')


 def has_cyrillic(text):
     # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
     return bool(re.search('[\u0400-\u04FF]', text))
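A quick sanity check of the script detection that selects between the two Carpathian Romani checkpoints below:

    assert has_cyrillic('тхем')           # any char in U+0400-U+04FF -> rmc-script_cyrillic
    assert not has_cyrillic('them roma')  # pure latin -> rmc-script_latin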
+
+def foreign(text=None,  # split sentences here so we can prepend a txt for german to each sentence
+            # to fall on the male voice (Sink attn)
+            lang='romanian',
+            speed=None):

+    # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
+    lang = lang.lower()

     # https://huggingface.co/spaces/mms-meta/MMS

         if has_cyrillic(text):  # check whether the 0-th sentence is cyrillic
+            # Romani Carpathian (Vlax also exists in latin / cyrillic)
+            lang_code = 'rmc-script_cyrillic'
         else:
+            # Romani Carpathian (Vlax also exists)
+            lang_code = 'rmc-script_latin'

     elif 'rom' in lang:

     lang_code = lang.split()[0].strip()

+    # load VITS
     net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
     tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')

     total_audio = []
     if lang_code == 'deu':
         # Split very long sentences: >500 phonemes crash StyleTTS2 -- even 400 phonemes sometimes OOM on cuda:4.
         # However, prosody is nicer on non-split text for MMS TTS.
+        # prepend txt snippet
+        text = [
+            sub_sent + ' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]
+        # assert that it chooses a unique voice
     else:
+        # allow longer non-split text
+        text = [
+            sub_sent + ' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]
+        # for non-deu MMS TTS languages
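`textwrap.wrap` splits on whitespace and, with `break_long_words` disabled, never cuts inside a word, so every chunk stays within the character budget:

    import textwrap
    chunks = textwrap.wrap('eins zwei drei vier ' * 30, 200, break_long_words=False)
    assert all(len(c) <= 200 for c in chunks)  # words intact, budget respected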
     for _t in text:
         _t = _t.lower()

+        # apply this in api.py -> tts_multi_sentence before switching between StyleTTS2
+        print('\n\n\n\nBEF Transliteration', _t, '\n\n\n\n\n')
+        _t = transliterate_number(_t, lang=lang_code)
+        print('AFT nums', _t, '\n____________________________________________')
+
+        # However, if we transliterate here, the demo also sees the transliteration
         if lang_code == 'rmc-script_latin':
+            _t = fix_phones(_t)  # per-language phoneme replacements

         elif lang_code == 'ron':
             # tone
             _t = _t.replace("ţ", "ț"
+                            ).replace('ț', 'ts').replace('î', 'u').replace('â', 'a').replace('ş', 's')

         # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
+        # input_ids / attention_mask
+        inputs = tokenizer(_t, return_tensors="pt")
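A quick check of the ron normalization chain above: the cedilla ţ is first folded into the comma-below ț, which is then spelled 'ts', and the remaining diacritics map to plain letters accepted by the MMS vocab:

    t = 'ţânţar şi înger'
    t = t.replace("ţ", "ț").replace('ț', 'ts').replace('î', 'u').replace('â', 'a').replace('ş', 's')
    print(t)  # 'tsantsar si unger'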
         with torch.no_grad():
             # MMS
             x = net_g(input_ids=inputs.input_ids.to(device),
+                      attention_mask=inputs.attention_mask.to(device),
+                      speed=speed + .44 * np.random.rand()  # variable speed per sentence
+                      )[0, :]

         # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()

     # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)

     return x  # 16kHz - only StyleTTS2 needs resampling from 24kHz -> 16kHz
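A hypothetical call, under the assumptions above (the matching facebook/mms-tts-* checkpoint is fetched on first use):

    # 16 kHz numpy audio; numerals are spelled out per language before synthesis
    wav = foreign(text='Am cumpărat 3 mere și 14 pere.',
                  lang='romanian',
                  speed=1.1)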