Dionyssos committed
Commit 780c8d5 · 1 Parent(s): bc7f42e

lang numerals

Files changed (4)
  1. Utils/text_utils.py +118 -34
  2. api.py +141 -132
  3. demo.py +12 -24
  4. msinference.py +122 -146
Utils/text_utils.py CHANGED
@@ -2,6 +2,7 @@
 import re
 import codecs
 import textwrap
+from num2words import num2words
 # IPA Phonemizer: https://github.com/bootphon/phonemizer
 
 _pad = "$"
@@ -16,10 +17,12 @@ dicts = {}
 for i in range(len((symbols))):
     dicts[symbols[i]] = i
 
+
 class TextCleaner:
     def __init__(self, dummy=None):
         self.word_index_dictionary = dicts
         print(len(dicts))
+
     def __call__(self, text):
         indexes = []
         for char in text:
@@ -32,7 +35,7 @@ class TextCleaner:
 
 # == Sentence Splitter
 
-alphabets= "([A-Za-z])"
+alphabets = "([A-Za-z])"
 prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
 suffixes = "(Inc|Ltd|Jr|Sr|Co)"
 starters = "(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
@@ -42,7 +45,6 @@ digits = "([0-9])"
 multiple_dots = r'\.{2,}'
 
 
-
 def split_into_sentences(text):
     """
     Split the text into sentences.
@@ -59,54 +61,66 @@ def split_into_sentences(text):
     https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
     """
     text = " " + text + " "
-    text = text.replace("\n"," ")
-    text = re.sub(prefixes,"\\1<prd>",text)
-    text = re.sub(websites,"<prd>\\1",text)
-    text = re.sub(digits + "[.]" + digits,"\\1<prd>\\2",text)
-    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
-    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
-    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
-    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
-    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
-    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
-    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
-    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
-    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
-    if "”" in text: text = text.replace(".”","”.")
-    if "\"" in text: text = text.replace(".\"","\".")
-    if "!" in text: text = text.replace("!\"","\"!")
-    if "?" in text: text = text.replace("?\"","\"?")
-    text = text.replace(".",".<stop>")
-    text = text.replace("?","?<stop>")
-    text = text.replace("!","!<stop>")
-    text = text.replace("<prd>",".")
+    text = text.replace("\n", " ")
+    text = re.sub(prefixes, "\\1<prd>", text)
+    text = re.sub(websites, "<prd>\\1", text)
+    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
+    text = re.sub(multiple_dots, lambda match: "<prd>" *
+                  len(match.group(0)) + "<stop>", text)
+    if "Ph.D" in text:
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+    text = re.sub("\s" + alphabets + "[.] ", " \\1<prd> ", text)
+    text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
+    text = re.sub(alphabets + "[.]" + alphabets + "[.]" +
+                  alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
+    text = re.sub(alphabets + "[.]" + alphabets +
+                  "[.]", "\\1<prd>\\2<prd>", text)
+    text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
+    text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
+    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
+    if "”" in text:
+        text = text.replace(".”", "”.")
+    if "\"" in text:
+        text = text.replace(".\"", "\".")
+    if "!" in text:
+        text = text.replace("!\"", "\"!")
+    if "?" in text:
+        text = text.replace("?\"", "\"?")
+    text = text.replace(".", ".<stop>")
+    text = text.replace("?", "?<stop>")
+    text = text.replace("!", "!<stop>")
+    text = text.replace("<prd>", ".")
     sentences = text.split("<stop>")
     sentences = [s.strip() for s in sentences]
 
     # Split Very long sentences >500 phoneme - StyleTTS2 crashes
     # -- even 400 phonemes sometimes OOM in cuda:4
-    sentences = [sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]
-
+    sentences = [
+        sub_sent+' ' for s in sentences for sub_sent in textwrap.wrap(s, 200, break_long_words=0)]
+
     # if sentences and not sentences[-1]:
     #     sentences = sentences[:-1]
     return sentences
 
+
 def store_ssml(text=None,
                voice=None):
     '''create ssml:
        text : list of sentences
        voice: https://github.com/MycroftAI/mimic3-voices
     '''
-    print('\n___________________________\n', len(text), text[0], '\n___________________________________\n')
+    print('\n___________________________\n', len(text),
+          text[0], '\n___________________________________\n')
     _s = '<speak>'
     for short_text in text:
 
-        rate = min(max(.87, len(short_text) / 76), 1.14)  # 1.44) # 1.24 for bieber
-
-
+        # 1.44) # 1.24 for bieber
+        rate = min(max(.87, len(short_text) / 76), 1.14)
+
         volume = int(74 * np.random.rand() + 24)
         # text = ('<speak>'
-        _s += f'<prosody volume=\'{volume}\'>'  # The other voice does not have volume
+        # The other voice does not have volume
+        _s += f'<prosody volume=\'{volume}\'>'
         _s += f'<prosody rate=\'{rate}\'>'
         _s += f'<voice name=\'{voice}\'>'
         _s += '<s>'
@@ -116,7 +130,77 @@ def store_ssml(text=None,
     _s += '</prosody>'
     _s += '</prosody>'
     _s += '</speak>'
-    print(len(text),'\n\n\n\n\n\n\n', _s)
-
+    print(len(text), '\n\n\n\n\n\n\n', _s)
+
     with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
         f.write(_s)
+
+
+def transliterate_number(number_string, lang='en'):
+    """
+    Converts a number string to words in the specified language,
+    handling decimals, scientific notation, and preserving text
+    before and after the numeral.
+    """
+
+    if lang == 'rmc-script_latin':
+        lang = 'sr'
+        exponential_pronoun = ' puta deset na stepen od '
+        comma = ' tačka '
+    elif lang == 'ron':
+        lang = 'ro'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgulă '
+    elif lang == 'hun':
+        lang = 'hu'
+        exponential_pronoun = ' tízszer a erejéig '
+        comma = ' virgula '
+    elif lang == 'deu':
+        exponential_pronoun = ' mal zehn hoch '
+        comma = ' komma '
+    else:
+        lang = lang[:2]
+        exponential_pronoun = ' times ten to the power of '
+        comma = ' point '
+
+    def replace_number(match):
+        prefix = match.group(1) or ""
+        number_part = match.group(2)
+        suffix = match.group(5) or ""
+
+        try:
+            if 'e' in number_part.lower():
+                base, exponent = number_part.lower().split('e')
+                base = float(base)
+                exponent = int(exponent)
+                words = num2words(
+                    base, lang=lang) + exponential_pronoun + num2words(exponent, lang=lang)
+            elif '.' in number_part:
+                integer_part, decimal_part = number_part.split('.')
+                words = num2words(int(integer_part), lang=lang) + comma + " ".join(
+                    [num2words(int(digit), lang=lang) for digit in decimal_part])
+            else:
+                words = num2words(int(number_part), lang=lang)
+            return prefix + words + suffix
+        except ValueError:
+            return match.group(0)  # Return original if conversion fails
+
+    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
+    return re.sub(pattern, replace_number, number_string)
+
+
+def discard_leading_numeral(text):
+    """Discards a leading numeral (integer or float) from a string.
+
+    Args:
+        text: The input string.
+
+    Returns:
+        The string with the leading numeral removed, or the original string
+        if it doesn't start with a numeral.
+    """
+    match = re.match(r"^\s*(\d+(\.\d*)?)\s*", text)
+    if match:
+        return text[match.end():].lstrip()
+    else:
+        return text
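Reviewer note: the two helpers added above are pure string functions, so they can be sanity-checked without loading any TTS model. A minimal sketch (assuming num2words is installed; inputs and expected outputs are illustrative, not taken from the repo's tests):

    from Utils.text_utils import transliterate_number, discard_leading_numeral

    # decimals: integer part as one number, decimal digits spelled one by one
    print(transliterate_number('a 3.14 radius', lang='en'))
    # -> 'a three point one four radius'

    # scientific notation uses the per-language exponential_pronoun
    print(transliterate_number('2.5e3 Hz', lang='en'))
    # -> 'two point five times ten to the power of three Hz'

    # leading list markers can be dropped before synthesis
    print(discard_leading_numeral('7 dwarfs'))
    # -> 'dwarfs'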
api.py CHANGED
@@ -42,35 +42,41 @@ def resize_with_white_padding(image):
         # Image is wider than the target, pad top and bottom
         new_w = target_w
         new_h = int(new_w / aspect_ratio)
-        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_h = target_h - new_h
         top_padding = padding_h // 2
         bottom_padding = padding_h - top_padding
         padding = [(top_padding, bottom_padding), (0, 0)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
-        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     elif aspect_ratio < target_aspect_ratio:
         # Image is taller than the target, pad left and right
         new_h = target_h
         new_w = int(new_h * aspect_ratio)
-        resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+        resized_image = cv2.resize(
+            image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
         padding_w = target_w - new_w
         left_padding = padding_w // 2
         right_padding = padding_w - left_padding
         padding = [(0, 0), (left_padding, right_padding)]
         if len(image.shape) == 3:
             padding.append((0, 0))  # Add padding for color channels
-        padded_image = np.pad(resized_image, padding, mode='constant', constant_values=255)
+        padded_image = np.pad(resized_image, padding,
+                              mode='constant', constant_values=255)
     else:
         # Aspect ratio matches the target, just resize
-        padded_image = cv2.resize(image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
+        padded_image = cv2.resize(
+            image, (target_w, target_h), interpolation=cv2.INTER_LANCZOS4)
 
     return padded_image  # image 2 speech
 
 
 def _shorten(filename):
-    return filename.replace("/","")[-6:]
+    return filename.replace("/", "")[-6:]
+
 
 def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     '''https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
@@ -104,20 +110,23 @@ def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
     # return the resized image
     return resized
 
-def overlay(x,soundscape=None):
+
+def overlay(x, soundscape=None):
     if soundscape is not None:
         # AudioGen sound is sufficient to be ~10s long
         background = sound_generator.generate(soundscape,
-                                              duration=len(x)/16000 + .74, # sound duration = TTS dur
-                                              ).detach().cpu().numpy() # bs, 11400 @.74s
+                                              # sound duration = TTS dur
+                                              duration=len(x)/16000 + .74,
+                                              ).detach().cpu().numpy()  # bs, 11400 @.74s
 
         # len_soundscape = len(background)
 
         # fading = .5 + .5 * np.tanh(4*(np.linspace(10, -10, len_soundscape) + 9.4))  # fade heaviside 1,1,1,1,...,0
 
         # x = np.concatenate([fading * background, x], 0)  # blend TTS with AudioGen
-        #background /= np.abs(background).max() + 1e-7 # amplify speech to full [-1,1]
-        x = .4 * x + .46 * background[:len(x)] # background will be longer by xtra .74s
+        # background /= np.abs(background).max() + 1e-7  # amplify speech to full [-1,1]
+        # background will be longer by xtra .74s
+        x = .47 * x + .46 * background[:len(x)]
     return x  # TTS / AudioGen @ 16kHz
 
 
@@ -134,77 +143,79 @@ def tts_multi_sentence(precomputed_style_vector=None,
     voice : string or None (falls to styleTTS)
     soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
     '''
 
     # StyleTTS2 - English
 
     if precomputed_style_vector is not None:
         x = []
         if not isinstance(text, list):
             text = split_into_sentences(text)  # Avoid OOM in StyleTTS2
         for _sentence in text:
 
             # StyleTTS2 - pronunciation Fx
 
-            _sentence = _sentence.lower() # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            _sentence = _sentence.lower()
             if 'vctk_low#p326' in voice:
                 # fix sounding of sleepy AAABS TRAACT
-                _sentence = _sentence.replace('abstract', 'ahbstract') # 'ahstract'
+                _sentence = _sentence.replace(
+                    'abstract', 'ahbstract')  # 'ahstract'
             x.append(msinference.inference(_sentence,
                                            precomputed_style_vector)
                      )
         x = np.concatenate(x)
 
     # Fallback - MMS TTS - Non-English
 
    else:
 
         # dont split foreign sentences: Avoids speaker change issue
         x = msinference.foreign(text=text,
                                 lang=voice,  # voice = 'romanian', 'serbian' 'hungarian'
                                 speed=speed)  # normalisation externally
 
     # volume
 
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
 
     return overlay(x, soundscape=soundscape)
 
 
 # voices = {}
 # import phonemizer
 # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
-
 app = Flask(__name__)
 
+
 @app.route("/", methods=['GET', 'POST', 'PUT'])
 def serve_wav():
     # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
     # object-into-a-representation-suitable-for-mongodb
     r = request.form.to_dict(flat=False)
 
     # Physically Save Client Files
     for filename, obj in request.files.items():
         obj.save(f'{CACHE_DIR}{_shorten(filename)}')
 
     print('Saved all files on Server Side\n\n')
 
     args = SimpleNamespace(
-        text = None if r.get('text') is None else CACHE_DIR + _shorten(r.get('text' )[0]), # crop last letters from original filename & use as tmp
-        video = None if r.get('video') is None else CACHE_DIR + _shorten(r.get('video')[0]),
-        image = None if r.get('image') is None else CACHE_DIR + _shorten(r.get('image')[0]),
-        native = None if r.get('native') is None else CACHE_DIR + _shorten(r.get('native')[0]),
-        affective = r.get('affective')[0],
-        voice = r.get('voice')[0],
-        speed = float(r.get('speed')[0]), # For Non-English MMS TTS
-        soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
-        )
+        # crop last letters from original filename & use as tmp
+        text=None if r.get('text') is None else CACHE_DIR +
+        _shorten(r.get('text')[0]),
+        video=None if r.get('video') is None else CACHE_DIR +
+        _shorten(r.get('video')[0]),
+        image=None if r.get('image') is None else CACHE_DIR +
+        _shorten(r.get('image')[0]),
+        native=None if r.get('native') is None else CACHE_DIR +
+        _shorten(r.get('native')[0]),
+        affective=r.get('affective')[0],
+        voice=r.get('voice')[0],
+        speed=float(r.get('speed')[0]),  # For Non-English MMS TTS
+        soundscape=r.get('soundscape')[0] if r.get(
+            'soundscape') is not None else None,
+    )
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
 
     print(args, 'ENTER Script')
     do_video_dub = True if args.text.endswith('.srt') else False
@@ -213,10 +224,12 @@ def serve_wav():
     AUDIO_TRACK = '_audio_track.wav'
 
     if do_video_dub:
-        print('==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
+        print(
+            '==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
         with open(args.text, "r") as f:
            s = f.read()
-        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
+        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
+                for j in srt.parse(s)]
         assert args.video is not None
         native_audio_file = '_tmp.wav'
         subprocess.run(
@@ -231,36 +244,38 @@ def serve_wav():
             "-vn",
             native_audio_file])
         x_native, _ = soundfile.read(native_audio_file)  # reads mp3
 
         # stereo in video
         if x_native.ndim > 1:
             x_native = x_native[:, 0]  # stereo
 
         # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
     else:
         with open(args.text, 'r') as f:
             text = ''.join(f)
-        text = re.sub(' +', ' ', text) # delete spaces / split in list in tts_multi_sentence()
-
+        # delete spaces / split in list in tts_multi_sentence()
+        text = re.sub(' +', ' ', text)
+
     # == STYLE VECTOR ==
 
     precomputed_style_vector = None
 
     if args.native:  # Voice Cloning
         try:
             precomputed_style_vector = msinference.compute_style(args.native)
         except soundfile.LibsndfileError:  # Fallback - internal voice
-            print('\n Could not voice clone audio:', args.native, 'fallback to video or Internal TTS voice.\n')
+            print('\n Could not voice clone audio:', args.native,
+                  'fallback to video or Internal TTS voice.\n')
            if do_video_dub:  # Clone voice via Video
                native_audio_file = args.video.replace('.', '').replace('/', '')
                native_audio_file += '__native_audio_track.wav'
                soundfile.write('tgt_spk.wav',
-                    np.concatenate([
-                        x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
+                               np.concatenate([
+                                   x_native[:int(4 * 16000)]], 0).astype(np.float32), 16000)  # 27400?
                precomputed_style_vector = msinference.compute_style('tgt_spk.wav')
 
     # NOTE: style vector is normally None here - except if --native arg was passed
 
     # Native English Accent TTS
     if precomputed_style_vector is None:
         if 'en_US' in args.voice or 'en_UK' in args.voice:
@@ -272,53 +287,52 @@ def serve_wav():
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
         # Non-Native English Accent TTS
         elif '_' in args.voice:
             precomputed_style_vector = msinference.compute_style('assets/wavs/mimic3_foreign_4x/' + args.voice.replace(
                 '/', '_').replace('#', '_').replace(
                 'cmu-arctic', 'cmu_arctic').replace(
                 '_low', '') + '.wav')
         # Foreign Lang
         else:
             print(f'\n\n\n\n\n FallBack to MMS TTS due to: {args.voice=}')
 
    # NOTE : precomputed_style_vector is still None if MMS TTS
 
    # == SILENT VIDEO ==
 
    if args.video is not None:
         # banner - precomput @ 1920 pixels
         frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
         font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (240, 74)  # w,h
         fontScale = 2
         fontColor = (255, 255, 255)
         thickness = 4
         lineType = 2
         cv2.putText(frame_tts, 'TTS',
                     bottomLeftCornerOfText,
                     font,
                     fontScale,
                     fontColor,
                     thickness,
                     lineType)
         # cv2.imshow('i', frame_tts); cv2.waitKey(); cv2.destroyAllWindows()
         # ====================================== NATIVE VOICE
         frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
         font = cv2.FONT_HERSHEY_SIMPLEX
         bottomLeftCornerOfText = (101, 74)  # w,h
         fontScale = 2
         fontColor = (255, 255, 255)
         thickness = 4
         lineType = 1000
         cv2.putText(frame_orig, 'ORIGINAL VOICE',
                     bottomLeftCornerOfText,
                     font,
                     fontScale,
                     fontColor,
                     thickness,
                     lineType)
 
         print(f'\n______________________________\n'
               f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
               f'\n______________________________\n')
@@ -336,67 +350,63 @@ def serve_wav():
         #
         video_file = args.video
         vf = VideoFileClip(video_file)
 
         # GET 1st FRAME to OBTAIN frame RESOLUTION
         h, w, _ = vf.get_frame(0).shape
         frame_tts = _resize(frame_tts, width=w)
         frame_orig = _resize(frame_orig, width=w)
         h, w, _ = frame_orig.shape
 
         try:
 
             # inpaint banner to say if native voice
             num = x_native.shape[0]
-            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4)) # fade heaviside
-
+            # fade heaviside
+            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))
+
             def inpaint_banner(get_frame, t):
                 '''blend banner - (now plays) tts or native voice
                 '''
 
                 im = np.copy(get_frame(t))  # pic
 
                 ix = int(t * 16000)  # ix may overflow the is_tts.shape
                 if ix < num:
                     if is_tts[ix] > .5:  # mask == 1 => tts / mask == 0 -> native
                         frame = frame_tts  # rename frame to rsz_frame_... because if frame_tts is mod
                         # then is considered a "local variable" thus the "outer var"
                         # is not observed by python raising referenced before assign
                     else:
                         frame = frame_orig
                 # For the ix that is out of bounds of num assume frame_tts
                 else:
                     frame = frame_tts
 
                 # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
 
                 offset_h = 24
-
-
-                print(f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
-
-
-
+
+                print(
+                    f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
+
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                     + .6 * frame).astype(np.uint8)
 
                 # im2 = np.concatenate([im, frame_tts], 0)
                 # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                 return im  # np.concatenate([im, frame_tts], 0)
 
         except UnboundLocalError:  # args.native == False
 
             def inpaint_banner(get_frame, t):
 
                 im = np.copy(get_frame(t))
 
                 h, w, _ = frame_tts.shape  # frame = banner
                 if w != im.shape[1]:  # rsz banners to fit video w
                     local_frame = _resize(frame_tts, width=im.shape[1])
                 offset_h = 24
                 im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :]
                                                     + .6 * local_frame).astype(np.uint8)
                 return im
         vf = vf.fl(inpaint_banner)
@@ -405,9 +415,9 @@ def serve_wav():
     # ==== TTS .srt ====
 
     if do_video_dub:
-        OUT_FILE = 'tmp.mp4' #args.out_file + '_video_dub.mp4'
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_video_dub.mp4'
         subtitles = text
         MAX_LEN = int(subtitles[-1][2] + 17) * 16000
         # 17 extra seconds fail-safe for long-last-segment
         print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
         pieces = []
@@ -423,10 +433,12 @@ def serve_wav():
         # x = audresample.resample(x.astype(np.float32), 24000, 22050)  # reshapes (64,) -> (1,64)
         # PAD SHORTEST of TTS / NATIVE
         if len(x_native) > len(total):
-            total = np.pad(total, (0, max(0, x_native.shape[0] - total.shape[0])))
+            total = np.pad(
+                total, (0, max(0, x_native.shape[0] - total.shape[0])))
 
         else:  # pad native to len of is_tts & total
-            x_native = np.pad(x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
+            x_native = np.pad(
+                x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
         # print(total.shape, x_native.shape, 'PADDED TRACKS')
         soundfile.write(AUDIO_TRACK,
                         # (is_tts * total + (1-is_tts) * x_native)[:, None],
@@ -435,25 +447,25 @@ def serve_wav():
     else:  # Video from plain (.txt)
         OUT_FILE = 'tmp.mp4'
         x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
                               speed=args.speed)
         soundfile.write(AUDIO_TRACK, x, 16000)
 
     # IMAGE 2 SPEECH
 
     if args.image is not None:
 
         # Resize Input Image to 1920x1080 - Issue of .mp4 non visible for other aspect ratios
 
-        STATIC_FRAME = args.image + '.jpg' # 'assets/image_from_T31.jpg'
+        STATIC_FRAME = args.image + '.jpg'  # 'assets/image_from_T31.jpg'
         cv2.imwrite(
            STATIC_FRAME,
            resize_with_white_padding(cv2.imread(args.image)
                                      ))
 
-        OUT_FILE = 'tmp.mp4' #args.out_file + '_image_to_speech.mp4'
+        OUT_FILE = 'tmp.mp4'  # args.out_file + '_image_to_speech.mp4'
 
     # SILENT CLIP
 
@@ -486,22 +498,19 @@ def serve_wav():
             CACHE_DIR + OUT_FILE])
 
         print(f'\noutput video is saved as {OUT_FILE}')
 
    else:
 
        # Fallback: No image nor video provided - do only tts
        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               soundscape=args.soundscape,
                               speed=args.speed)
        OUT_FILE = 'tmp.wav'
        soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
 
    # audios = [msinference.inference(text,
    #                                 msinference.compute_style(f'voices/{voice}.wav'))]
    # # for t in [text]:
    # output_buffer = io.BytesIO()
@@ -511,8 +520,7 @@ def serve_wav():
    # https://stackoverflow.com/questions/67591467/
    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
    # time.sleep(4)
 
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
@@ -520,6 +528,7 @@ def serve_wav():
    print('________________\n ? \n_______________')
    return response
 
+
 if __name__ == "__main__":
    app.run(host="0.0.0.0")
 
@@ -546,4 +555,4 @@ if __name__ == "__main__":
 #         f'fusion.mp4',  # save to correct location is handled in client
 #     ])
 #
 # ffmpeg -f concat -i mylist.txt -c copy output.mp4
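For context on the form fields kept above: a client uploads the text (and optional media) as files, with matching form fields naming them; serve_wav() then writes tmp.wav or tmp.mp4 under CACHE_DIR and sends it back. A hypothetical client sketch (field names are taken from the SimpleNamespace above; host, port and file names are assumptions):

    import requests

    with open('story.txt', 'rb') as f:
        resp = requests.post(
            'http://127.0.0.1:5000/',
            data={'text': 'story.txt',   # filename; shortened server-side by _shorten()
                  'voice': 'en_US/vctk_low#p326',
                  'speed': '1.14',
                  'affective': 'True',
                  'soundscape': 'wind and rain'},
            files={'story.txt': f})      # saved into CACHE_DIR by serve_wav()
    resp.raise_for_status()
    with open('out.wav', 'wb') as out:
        out.write(resp.content)          # tmp.wav, or tmp.mp4 if image/video was sent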
demo.py CHANGED
@@ -2,26 +2,14 @@ import numpy as np
 import soundfile
 import msinference
 from audiocraft.builders import AudioGen
-# Prepend »Vom Prof. Friedrich ist noch eine .. string in the beginning brings the male voice in deu MMS TTS (if later string is much longer
-# sometimes the woman voices pronounces words <dass>) TODO amplify attn weights of first hidden states / certain voice
-
-def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              '»Vom Prof. Friedrich ist noch eine recht schöne große Landschaft hier«, schrieb das Literarische'
-              'Conversations-Blatt anlässlich der Dresdner Akademieausstellung 1825, »eine einsame'
-              'Gebirgsgegend. Trefflich sind die verschiedenen Tinten der höhern Bergregionen dargestellt: vorn,'
-              'zwischen den sich thürmenden Basaltblöcken, drängen sich noch Gras und Bäumchen hervor,'
-              'A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.'
-              'DESCIPTION Bronzezeitlicher Zeremonialhut („Berliner Goldhut“), gefertigt aus einem Stück nahtlos getriebenem Goldblech und mit Kreisornamenten in Repousse-Technik verziert. Kalotte kegelförmig überhöht und mit breiter umlaufender Krempe. Krempe und Kalotte durch flaches Bronzeband verstärkt. An der Krempe außen tordierter Bronzedraht. Die Anordnung der Ornamentik auf Kalotte und Krempe des Zeremonialhutes wird als Darstellung eines Kalendersystems gedeutet, mit dem sich die Verschiebungen zwischen Sonnen- und Mondjahr berechnen und Mondfinsternisse voraussagen lassen.'
-              'Vorderseite: L IVL AVR SVLP ANTONINVS [LP ligiert]. Panzerbüste des Uranius Antoninus mit Lorbeerkranz in der Brustansicht nach l., Pteryges des r. Armansatzes sind zur Angabe eines erhobenen Armes waagerecht dargestellt. Rückseite: CONSERVATO-R AVG. Der Stein des Baal von Emesa auf einem Viergespann (quadriga) nach l. Auf dem Stein, der von zwei Schirmen gerahmt ist, ist das Relief eines Adlers zu sehen. Kommentar: Baldus (1971) 84 ff. 87 zur Frage der Münzstätte, ebd. 128 ff. zur Pterygesanhebung (Andeutung eines erhobenen Armes), die als alexanderhafter Gestus gilt. - Uranius Antoninus wurde im Sommer 253 n. Chr. im syrischen Emesa zum Kaiser erhoben und bewährte sich bald darauf bei der erfolgreichen Abwehr eines Einfalls der Sasaniden. Uranius Antoninus stammte möglicherweise aus der Familie der Iulia Domna, war Priester des Baals von Emesa, und ist mit dem literarisch überlieferten Sampsigeramus identisch, der als Organisator des Widerstandes gegen die Sasaniden in der Region belegt ist. Nach 254 n. Chr. fehlen Informationen über Uranius Antoninus, möglicherweise trat er nach Bereinigung der Notsituation hinter den Kaiser Valerianus zurück. Zu diesem Stück wurden 2017 im Zuge der Ausstellung Syria Antiqua zwei vergrößerte Reproduktionen (3D-Ausdrucke) erstellt, die bei den Galvanos in Schrank 81/121 liegen. Literatur: A. von Sallet, ZfN 17, 1890, 241 f. Taf. 4,9 (dieses Stück); H. R. Baldus, Uranius Antoninus (1971) 198 Nr. 85 Taf. 7,85; 12,85 (dieses Stück, mit Lit., 253/254 n. Chr. bzw. Stempelgruppe VIII ca. Dez. 253-Anfang 254 n. Chr.); RIC IV-3 Nr. 2 c; RPC IX Nr. 1940,2 Taf. 131 (dieses Stück).',
-              voice='romanian',  # 'af_ZA_google-nwu_1919', 'serbian', 'en_US/vctk_low#p276', 'isl',
+
+def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
+              voice='en_US/vctk_low#p326',  # 'en_US/vctk_low#p276', 'deu', 'af_ZA_google-nwu_1919', 'serbian', 'isl',
               speed=1.14,
-              affect=True,  # False = higher clarity
+              affect=True,  # False = higher clarity voice
               soundscape='dogs barg in dungeons n dragons'
               ):
-    '''24kHz
+    '''16 KHz
 
     voice : 'en_US/vctk_low#p276'  # Native English voices -> https://audeering.github.io/shift/
@@ -37,8 +25,8 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # StyleTTS2 - find voice from folder
 
     if ('en_US/' in voice) or ('en_UK/' in voice):
-        a = '' if affect else 'v2/'
-        style_vector = msinference.compute_style('assets/wavs/style_vector/' + a + voice.replace(
+        a = '' if affect else '_v2'
+        style_vector = msinference.compute_style('assets/wavs/style_vector' + a + '/' + voice.replace(
             '/', '_').replace('#', '_').replace(
             'cmu-arctic', 'cmu_arctic').replace(
             '_low', '') + '.wav')
@@ -58,7 +46,7 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
                                  style_vector)
 
 
-    # Fallback - MMS TTS - Non-English voice/language
+    # Fallback - MMS TTS - Non-English voice / langs
 
     else:
         x = msinference.foreign(text=text,
@@ -68,13 +56,13 @@ def tts_entry(text='»Vom Prof. Friedrich ist noch eine recht schöne große Lan
     # volume
 
     x /= np.abs(x).max() + 1e-7  # amplify speech to full [-1,1]
 
     if soundscape is not None:
         sound_gen = AudioGen().to('cuda:0').eval()
         background = sound_gen.generate(soundscape,
-                                        duration=len(x)/24000 + .74,  # sound duration in seconds
-                                        ).detach().cpu().numpy()  # bs, 11400 @.74s
+                                        duration=len(x)/16000 + .74,  # sound duration in seconds
+                                        ).detach().cpu().numpy()
         x = .5 * x + .47 * background[:len(x)]
     return x
 
-soundfile.write(f'demo.wav', tts_entry(), 24000)
+
+soundfile.write(f'demo.wav', tts_entry(), 16000)
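The audible effect of this commit on the demo is the output rate: tts_entry() now works at the 16 kHz rate of AudioGen/MMS instead of writing 24 kHz audio. A quick check after running python demo.py (a sketch, assuming the run succeeds):

    import soundfile

    x, fs = soundfile.read('demo.wav')
    assert fs == 16000             # was 24000 before this commit
    print(x.shape, abs(x).max())   # mono samples; peak should stay below ~1.0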
msinference.py CHANGED
@@ -1,3 +1,13 @@
 
 
 
 
 
 
 
 
 
 
1
  import torch
2
  from cached_path import cached_path
3
  # import nltk
@@ -9,6 +19,7 @@ import torchaudio
9
  import librosa
10
  from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
11
  from nltk.tokenize import word_tokenize
 
12
  import textwrap
13
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
14
 
@@ -24,10 +35,15 @@ dicts = {}
24
  for i in range(len((symbols))):
25
  dicts[symbols[i]] = i
26
 
 
 
 
 
27
  class TextCleaner:
28
  def __init__(self, dummy=None):
29
  self.word_index_dictionary = dicts
30
  print(len(dicts))
 
31
  def __call__(self, text):
32
  indexes = []
33
  for char in text:
@@ -38,7 +54,6 @@ class TextCleaner:
38
  return indexes
39
 
40
 
41
-
42
  textclenaer = TextCleaner()
43
 
44
 
@@ -46,17 +61,20 @@ to_mel = torchaudio.transforms.MelSpectrogram(
46
  n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
47
  mean, std = -4, 4
48
 
 
49
  def alpha_num(f):
50
  f = re.sub(' +', ' ', f) # delete spaces
51
  f = re.sub(r'[^A-Z a-z0-9 ]+', '', f) # del non alpha num
52
  return f
53
 
 
54
  def preprocess(wave):
55
  wave_tensor = torch.from_numpy(wave).float()
56
  mel_tensor = to_mel(wave_tensor)
57
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
58
  return mel_tensor
59
 
 
60
  def compute_style(path):
61
  wave, sr = librosa.load(path, sr=24000)
62
  audio, index = librosa.effects.trim(wave, top_db=30)
@@ -67,18 +85,19 @@ def compute_style(path):
67
  with torch.no_grad():
68
  ref_s = style_encoder(mel_tensor.unsqueeze(1))
69
  ref_p = predictor_encoder(mel_tensor.unsqueeze(1)) # [bs, 11, 1, 128]
70
-
71
  s = torch.cat([ref_s, ref_p], dim=3) # [bs, 11, 1, 256]
72
-
73
  s = s[:, :, 0, :].transpose(1, 2) # [1, 128, 11]
74
- return s# [1, 128, 11]
 
75
 
76
  device = 'cpu'
77
  if torch.cuda.is_available():
78
  device = 'cuda'
79
 
80
- import phonemizer
81
- global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
82
  # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
83
 
84
 
@@ -88,45 +107,43 @@ ASR_config = args['ASR_config']
88
  F0_path = args['F0_path']
89
  pitch_extractor = load_F0_models(F0_path).eval().to(device)
90
 
91
- from Utils.PLBERT.util import load_plbert
92
- from Modules.hifigan import Decoder
93
 
94
  bert = load_plbert(args['PLBERT_dir']).eval().to(device)
95
 
96
- decoder = Decoder(dim_in=512,
97
- style_dim=128,
98
  dim_out=80, # n_mels
99
- resblock_kernel_sizes = [3, 7, 11],
100
- upsample_rates = [10, 5, 3, 2],
101
  upsample_initial_channel=512,
102
  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
103
  upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)
104
 
105
- text_encoder = TextEncoder(channels=512,
106
- kernel_size=5,
107
- depth=3, #args['model_params']['n_layer'],
108
- n_symbols=178, #args['model_params']['n_token']
109
  ).eval().to(device)
110
 
111
- predictor = ProsodyPredictor(style_dim=128,
112
- d_hid=512,
113
  nlayers=3, # OFFICIAL config.nlayers=5;
114
- max_dur=50,
115
  dropout=.2).eval().to(device)
116
 
117
- style_encoder = StyleEncoder(dim_in=64,
118
- style_dim=128,
119
- max_conv_dim=512).eval().to(device) # acoustic style encoder
120
- predictor_encoder = StyleEncoder(dim_in=64,
121
- style_dim=128,
122
- max_conv_dim=512).eval().to(device) # prosodic style encoder
123
  bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
124
 
125
  # params_whole = torch.load('freevc2/yl4579_styletts2.pth' map_location='cpu')
126
- params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 
127
  params = params_whole['net']
128
 
129
- from collections import OrderedDict
130
 
131
  def _del_prefix(d):
132
  # del ".module"
@@ -135,14 +152,19 @@ def _del_prefix(d):
135
  out[k[7:]] = v
136
  return out
137
 
138
- bert.load_state_dict( _del_prefix(params['bert']), strict=True)
 
139
  bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
140
- predictor.load_state_dict( _del_prefix(params['predictor']), strict=True) # XTRA non-ckpt LSTMs nlayers add slowiness to voice
141
- decoder.load_state_dict( _del_prefix(params['decoder']), strict=True)
 
142
  text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
143
- predictor_encoder.load_state_dict(_del_prefix(params['predictor_encoder']), strict=True)
144
- style_encoder.load_state_dict(_del_prefix(params['style_encoder']), strict=True)
145
- pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=True)
 
 
 
146
 
147
  # def _shift(x):
148
  # # [bs, samples] shift circular each batch elem of sound
@@ -152,13 +174,13 @@ pitch_extractor.load_state_dict(_del_prefix(params['pitch_extractor']), strict=T
152
  # x[i, ...] = torch.roll(batch_elem, offset, dims=1) # batch_elem = [400000, ]
153
  # return x
154
 
 
155
  def inference(text,
156
  ref_s,
157
  use_gruut=False):
158
- # Ignore .,; AT end of sentence; or just [-50:]
159
-
160
- text = text.strip()
161
-
162
  ps = global_phonemizer.phonemize([text])
163
  # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
164
  ps = word_tokenize(ps[0])
@@ -172,20 +194,20 @@ def inference(text,
172
 
173
  with torch.no_grad():
174
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
175
-
176
  hidden_states = text_encoder(tokens, input_lengths)
177
-
178
  bert_dur = bert(tokens, attention_mask=None)
179
  d_en = bert_encoder(bert_dur).transpose(-1, -2)
180
- ref = ref_s[:, :128, :] # [bs, 128, 11]
181
  s = ref_s[:, 128:, :]
182
  d = predictor.text_encoder(d_en, s, input_lengths)
183
  d = d.transpose(1, 2)
184
  # -------------------------------- pred_aln_trg = clones bert frames as duration
185
-
186
  d = predictor.text_encoder(d_en,
187
- s,
188
- input_lengths)
189
 
190
  x, _ = predictor.lstm(d)
191
 
@@ -194,7 +216,6 @@ def inference(text,
194
  duration = torch.sigmoid(duration).sum(axis=-1)
195
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
196
 
197
-
198
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
199
  c_frame = 0
200
  for i in range(pred_aln_trg.size(0)):
@@ -222,15 +243,15 @@ def inference(text,
222
  N=N_pred,
223
  s=ref)
224
 
225
- x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
226
 
227
  # StyleTTS2 is 24kHz -> Resample to 16kHz ofAudioGen / MMS
228
 
229
  if x.shape[0] > 10:
230
  x /= np.abs(x).max() + 1e-7
231
  x = audresample.resample(signal=x.astype(np.float32),
232
- original_rate=24000,
233
- target_rate=16000)[0, :] # reshapes (64,) -> (1,64)
234
 
235
  else:
236
  print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
@@ -238,26 +259,15 @@ def inference(text,
238
  return x
239
 
240
 
241
-
242
-
243
  # ___________________________________________________________
244
 
245
  # https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
246
  # ___________________________________________________________
247
-
248
  # -*- coding: utf-8 -*-
249
-
250
  # Copyright (c) Facebook, Inc. and its affiliates.
251
  #
252
  # This source code is licensed under the MIT license found in the
253
  # LICENSE file in the root directory of this source tree.
254
- from num2words import num2words
255
- import os
256
- import re
257
- import tempfile
258
- import torch
259
- import sys
260
- from Modules.vits.models import VitsModel, VitsTokenizer
261
 
262
  TTS_LANGUAGES = {}
263
  # with open('_d.csv', 'w') as f2:
@@ -266,96 +276,54 @@ with open(f"Utils/all_langs.csv") as f:
266
  iso, name = line.split(",", 1)
267
  TTS_LANGUAGES[iso.strip()] = name.strip()
268
  # f2.write(iso + ',' + name.replace("a S","")+'\n')
269
- # =============================================================================================
270
- # R O M A N I A N N U M E R A L S
271
- # =============================================================================================
272
-
273
- def _ro_number(number_str):
274
- # Function to convert numbers to their phonetic form in Romanian
275
- # Check if the number is negative
276
- negative = number_str.startswith('-')
277
- if negative:
278
- number_str = number_str[1:] # Remove the minus sign for now
279
-
280
- # Handle floating point numbers by splitting into integer and decimal parts
281
- if '.' in number_str:
282
- integer_part, decimal_part = number_str.split('.')
283
- integer_words = num2words(integer_part, lang='ro')
284
- decimal_words = ' '.join([num2words(digit, lang='ro') for digit in decimal_part])
285
- result = f"{integer_words} virgulă {decimal_words}"
286
- else:
287
- result = num2words(number_str, lang='ro')
288
-
289
- # Add 'minus' if the number is negative
290
- if negative:
291
- result = "minus " + result
292
-
293
- return result
294
-
295
- def romanian_num2str(input_string):
296
- # Function to convert a string with numbers to phonetic representation in Romanian
297
- # Regex pattern to identify numbers in the string (including negative numbers, decimals)
298
- pattern = r'-?\d+(\.\d+)?'
299
-
300
- def replace_with_phoneme(match):
301
- # Extract the matched number and convert it to phonetic representation
302
- number_str = match.group()
303
- return _ro_number(number_str)
304
-
305
- # Use regex to find all numbers in the input string and replace them with their phonetic form
306
- return re.sub(pattern, replace_with_phoneme, input_string)
307
 
308
 
309
  # ==============================================================================================
310
- # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
311
  # ==============================================================================================
312
 
313
  PHONEME_MAP = {
314
- 'služ' : 'sloooozz', # 'službeno'
315
- 'suver': 'siuveeerra', # 'suverena'
316
- 'država': 'dirrezav', # 'država'
317
- 'iči': 'ici', # 'Graniči'
318
- 's ': 'se', # a s with space
319
- 'q': 'ku',
320
- 'w': 'aou',
321
- 'z': 's',
322
- "š": "s",
323
- 'th': 'ta',
324
- 'v': 'vv',
325
- # "ć": "č",
326
- # "đ": "ď",
327
- # "lj": "ľ",
328
- # "nj": "ň",
329
- "ž": "z",
330
- # "c": "č"
331
- }
332
-
333
- # ALLOWED_PHONEMES = set("šč_bďph`-3žt 'ľzj5yuoóx1vfnaiedt́sṁkň2rčlg")
334
-
335
- def number_to_phonemes(match):
336
- number = int(match.group())
337
- words = num2words(number, lang='sr')
338
- return fix_phones(words.lower())
339
- # return words
340
 
341
  def fix_phones(text):
342
  for src, target in PHONEME_MAP.items():
343
  text = text.replace(src, target)
344
  # text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
345
  # text = re.sub(r'\s+', '_ _', text) # almost proper pausing
346
-
347
  return text.replace(',', '_ _').replace('.', '_ _')
348
 
 
349
  def has_cyrillic(text):
350
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
351
  return bool(re.search('[\u0400-\u04FF]', text))
352
 
353
- def foreign(text=None, # split sentences here so we can prepend a txt for german to each sentence to
 
354
  # fall on the male voice (Sink attn)
355
  lang='romanian',
356
  speed=None):
357
 
358
- lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
 
359
 
360
  # https://huggingface.co/spaces/mms-meta/MMS
361
 
@@ -367,11 +335,13 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
367
 
368
  if has_cyrillic(text): # check 0-th sentence if is cyrillic
369
 
370
- lang_code = 'rmc-script_cyrillic' # romani carpathian (also has latin / cyrillic Vlax)
 
371
 
372
  else:
373
 
374
- lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
 
375
 
376
  elif 'rom' in lang:
377
 
@@ -392,12 +362,12 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
392
 
393
  lang_code = lang.split()[0].strip()
394
 
395
- # Load VITS
396
 
397
  net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
398
  tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
399
 
400
- # CALL MMS TTS VITS
401
 
402
  total_audio = []
403
 
@@ -406,41 +376,49 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
406
  if lang_code == 'deu':
407
  # split very long sentences (>500 phonemes) - StyleTTS2 crashes; even 400 phonemes can OOM on cuda:4
408
  # however, prosody is nicer when the MMS TTS input is not split
409
- text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)] # prepend txt snippet
410
- # assert that it chooses unique voice
 
 
411
  else:
412
- text = [sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)] # allow longer non split text
413
- # for non deu MMS TTS lang.
 
 
414
 
415
  for _t in text:
416
 
417
  _t = _t.lower()
418
 
419
  if lang_code == 'rmc-script_latin':
420
 
421
- _t = re.sub(r'\d+', number_to_phonemes, _t)
422
- _t = fix_phones(_t)
423
 
424
  elif lang_code == 'ron':
425
 
426
- # numerals
427
- _t = romanian_num2str(_t)
428
-
429
  # tone
430
  _t = _t.replace("ţ", "ț"
431
- ).replace('ț','ts').replace('î', 'u').replace('â','a').replace('ş','s')
432
 
433
  # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
434
- inputs = tokenizer(_t, return_tensors="pt") # input_ids / attention_mask
 
435
 
436
  with torch.no_grad():
437
 
438
  # MMS
439
 
440
  x = net_g(input_ids=inputs.input_ids.to(device),
441
- attention_mask=inputs.attention_mask.to(device),
442
- speed = speed + .44 * np.random.rand() # variable speed for different sentence
443
- )[0, :]
444
 
445
  # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()
446
 
@@ -455,5 +433,3 @@ def foreign(text=None, # split sentences here so we can prepend a txt for germ
455
  # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
456
 
457
  return x # 16kHz - only StyleTTS2 output needs resampling from 24kHz -> 16kHz
458
-
459
-
 
1
+ from Modules.vits.models import VitsModel, VitsTokenizer
2
+ import sys
3
+ import tempfile
4
+ import re
5
+ import os
6
+ from num2words import num2words
7
+ from collections import OrderedDict
8
+ from Modules.hifigan import Decoder
9
+ from Utils.PLBERT.util import load_plbert
10
+ import phonemizer
11
  import torch
12
  from cached_path import cached_path
13
  # import nltk
 
19
  import librosa
20
  from models import ProsodyPredictor, TextEncoder, StyleEncoder, load_F0_models
21
  from nltk.tokenize import word_tokenize
22
+ from Utils.text_utils import transliterate_number
23
  import textwrap
24
  # IPA Phonemizer: https://github.com/bootphon/phonemizer
25
 
 
35
  for i in range(len((symbols))):
36
  dicts[symbols[i]] = i
37
 
38
+
39
+
40
+
41
+
42
  class TextCleaner:
43
  def __init__(self, dummy=None):
44
  self.word_index_dictionary = dicts
45
  print(len(dicts))
46
+
47
  def __call__(self, text):
48
  indexes = []
49
  for char in text:
 
54
  return indexes
55
 
56
 
 
57
  textclenaer = TextCleaner()
58
 
59
 
 
61
  n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
62
  mean, std = -4, 4
63
 
64
+
65
  def alpha_num(f):
66
  f = re.sub(' +', ' ', f) # collapse repeated spaces
67
  f = re.sub(r'[^A-Z a-z0-9 ]+', '', f) # drop non-alphanumeric characters
68
  return f
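For example (illustrative input; accented letters and punctuation are dropped, repeated spaces collapse):
# sketch
assert alpha_num('Héllo,  wörld 42!') == 'Hllo wrld 42'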
69
 
70
+
71
  def preprocess(wave):
72
  wave_tensor = torch.from_numpy(wave).float()
73
  mel_tensor = to_mel(wave_tensor)
74
  mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
75
  return mel_tensor
76
 
77
+
78
  def compute_style(path):
79
  wave, sr = librosa.load(path, sr=24000)
80
  audio, index = librosa.effects.trim(wave, top_db=30)
 
85
  with torch.no_grad():
86
  ref_s = style_encoder(mel_tensor.unsqueeze(1))
87
  ref_p = predictor_encoder(mel_tensor.unsqueeze(1)) # [bs, 11, 1, 128]
88
+
89
  s = torch.cat([ref_s, ref_p], dim=3) # [bs, 11, 1, 256]
90
+
91
  s = s[:, :, 0, :].transpose(1, 2) # [1, 256, 11]
92
+ return s # [1, 256, 11]
93
+
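A usage sketch (the wav path is a placeholder):
# sketch, not part of this commit
ref = compute_style('speaker.wav')  # concatenated acoustic + prosodic style, [1, 256, 11]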
94
 
95
  device = 'cpu'
96
  if torch.cuda.is_available():
97
  device = 'cuda'
98
 
99
+ global_phonemizer = phonemizer.backend.EspeakBackend(
100
+ language='en-us', preserve_punctuation=True, with_stress=True)
101
  # phonemizer = Phonemizer.from_checkpoint(str(cached_path('https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_ipa_forward.pt')))
102
 
103
 
 
107
  F0_path = args['F0_path']
108
  pitch_extractor = load_F0_models(F0_path).eval().to(device)
109
 
 
 
110
 
111
  bert = load_plbert(args['PLBERT_dir']).eval().to(device)
112
 
113
+ decoder = Decoder(dim_in=512,
114
+ style_dim=128,
115
  dim_out=80, # n_mels
116
+ resblock_kernel_sizes=[3, 7, 11],
117
+ upsample_rates=[10, 5, 3, 2],
118
  upsample_initial_channel=512,
119
  resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
120
  upsample_kernel_sizes=[20, 10, 6, 4]).eval().to(device)
121
 
122
+ text_encoder = TextEncoder(channels=512,
123
+ kernel_size=5,
124
+ depth=3, # args['model_params']['n_layer'],
125
+ n_symbols=178, # args['model_params']['n_token']
126
  ).eval().to(device)
127
 
128
+ predictor = ProsodyPredictor(style_dim=128,
129
+ d_hid=512,
130
  nlayers=3, # OFFICIAL config.nlayers=5;
131
+ max_dur=50,
132
  dropout=.2).eval().to(device)
133
 
134
+ style_encoder = StyleEncoder(dim_in=64,
135
+ style_dim=128,
136
+ max_conv_dim=512).eval().to(device) # acoustic style encoder
137
+ predictor_encoder = StyleEncoder(dim_in=64,
138
+ style_dim=128,
139
+ max_conv_dim=512).eval().to(device) # prosodic style encoder
140
  bert_encoder = torch.nn.Linear(bert.config.hidden_size, 512).eval().to(device)
141
 
142
  # params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
143
+ params_whole = torch.load(str(cached_path(
144
+ "hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
145
  params = params_whole['net']
146
 
 
147
 
148
  def _del_prefix(d):
149
  # strip the "module." prefix (DataParallel checkpoint keys)
 
152
  out[k[7:]] = v
153
  return out
154
 
155
+
156
+ bert.load_state_dict(_del_prefix(params['bert']), strict=True)
157
  bert_encoder.load_state_dict(_del_prefix(params['bert_encoder']), strict=True)
158
+ # extra LSTM layers that are not in the checkpoint (nlayers > 3) add slowness to the voice
159
+ predictor.load_state_dict(_del_prefix(params['predictor']), strict=True)
160
+ decoder.load_state_dict(_del_prefix(params['decoder']), strict=True)
161
  text_encoder.load_state_dict(_del_prefix(params['text_encoder']), strict=True)
162
+ predictor_encoder.load_state_dict(_del_prefix(
163
+ params['predictor_encoder']), strict=True)
164
+ style_encoder.load_state_dict(_del_prefix(
165
+ params['style_encoder']), strict=True)
166
+ pitch_extractor.load_state_dict(_del_prefix(
167
+ params['pitch_extractor']), strict=True)
168
 
169
  # def _shift(x):
170
  # # [bs, samples] shift circular each batch elem of sound
 
174
  # x[i, ...] = torch.roll(batch_elem, offset, dims=1) # batch_elem = [400000, ]
175
  # return x
176
 
177
+
178
  def inference(text,
179
  ref_s,
180
  use_gruut=False):
181
+
182
+ text = transliterate_number(text, lang='en').strip()
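+ # (assumed behavior) transliterate_number spells digits out, e.g. '42' -> 'forty-two',
+ # so the phonemizer below never sees raw numerals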
183
+
 
184
  ps = global_phonemizer.phonemize([text])
185
  # print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
186
  ps = word_tokenize(ps[0])
 
194
 
195
  with torch.no_grad():
196
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
197
+
198
  hidden_states = text_encoder(tokens, input_lengths)
199
+
200
  bert_dur = bert(tokens, attention_mask=None)
201
  d_en = bert_encoder(bert_dur).transpose(-1, -2)
202
+ ref = ref_s[:, :128, :] # [bs, 128, 11]
203
  s = ref_s[:, 128:, :]
204
  d = predictor.text_encoder(d_en, s, input_lengths)
205
  d = d.transpose(1, 2)
206
  # -------------------------------- pred_aln_trg = clones bert frames as duration
207
+
208
  d = predictor.text_encoder(d_en,
209
+ s,
210
+ input_lengths)
211
 
212
  x, _ = predictor.lstm(d)
213
 
 
216
  duration = torch.sigmoid(duration).sum(axis=-1)
217
  pred_dur = torch.round(duration.squeeze()).clamp(min=1)
218
 
 
219
  pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
220
  c_frame = 0
221
  for i in range(pred_aln_trg.size(0)):
 
243
  N=N_pred,
244
  s=ref)
245
 
246
+ x = x.cpu().numpy()[0, 0, :-400] # drop the last 400 samples: weird pulse at the end of sentences
247
 
248
  # StyleTTS2 is 24kHz -> resample to the 16kHz of AudioGen / MMS
249
 
250
  if x.shape[0] > 10:
251
  x /= np.abs(x).max() + 1e-7
252
  x = audresample.resample(signal=x.astype(np.float32),
253
+ original_rate=24000,
254
+ target_rate=16000)[0, :] # reshapes (64,) -> (1,64)
255
 
256
  else:
257
  print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n', x.shape)
 
259
  return x
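Putting the English path together (a sketch; the file names are placeholders and soundfile is just one way to write the 16 kHz output):
# sketch, not part of this commit
import soundfile as sf
ref = compute_style('speaker.wav')            # placeholder reference wav
wav = inference('It costs 42 dollars.', ref)  # numerals are spelled out internally
sf.write('out.wav', wav, 16000)               # inference() returns 16 kHz audio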
260
 
261
 
 
 
262
  # ___________________________________________________________
263
 
264
  # https://huggingface.co/spaces/mms-meta/MMS/blob/main/tts.py
265
  # ___________________________________________________________
 
266
  # -*- coding: utf-8 -*-
 
267
  # Copyright (c) Facebook, Inc. and its affiliates.
268
  #
269
  # This source code is licensed under the MIT license found in the
270
  # LICENSE file in the root directory of this source tree.
 
 
 
 
 
 
 
271
 
272
  TTS_LANGUAGES = {}
273
  # with open('_d.csv', 'w') as f2:
 
276
  iso, name = line.split(",", 1)
277
  TTS_LANGUAGES[iso.strip()] = name.strip()
278
  # f2.write(iso + ',' + name.replace("a S","")+'\n')
279
 
280
 
281
  # ==============================================================================================
282
+ # LOAD hun / ron / serbian - rmc-script_latin / cyrillic-Carpathian (not Vlax)
283
  # ==============================================================================================
284
 
285
  PHONEME_MAP = {
286
+ 'služ': 'sloooozz', # 'službeno'
287
+ 'suver': 'siuveeerra', # 'suverena'
288
+ 'država': 'dirrezav', # 'država'
289
+ 'iči': 'ici', # 'Graniči'
290
+ 's ': 'se', # an 's' followed by a space
291
+ 'q': 'ku',
292
+ 'w': 'aou',
293
+ 'z': 's',
294
+ "š": "s",
295
+ 'th': 'ta',
296
+ 'v': 'vv',
297
+ # "ć": "č",
298
+ # "đ": "ď",
299
+ # "lj": "ľ",
300
+ # "nj": "ň",
301
+ "ž": "z",
302
+ # "c": "č"
303
+ }
304
+
305
 
306
  def fix_phones(text):
307
  for src, target in PHONEME_MAP.items():
308
  text = text.replace(src, target)
309
  # text = re.sub(r'\s+', '` `', text) #.strip() #.lower()
310
  # text = re.sub(r'\s+', '_ _', text) # almost proper pausing
311
+
312
  return text.replace(',', '_ _').replace('.', '_ _')
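A concrete pass through the map (only the 'iči' rule fires here, then punctuation becomes pause markers):
# sketch
print(fix_phones('graniči, da.'))  # -> 'granici_ _ da_ _'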
313
 
314
+
315
  def has_cyrillic(text):
316
  # https://stackoverflow.com/questions/48255244/python-check-if-a-string-contains-cyrillic-characters
317
  return bool(re.search('[\u0400-\u04FF]', text))
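A quick sanity check (only the basic Cyrillic block U+0400-U+04FF is matched):
# sketch
assert has_cyrillic('Привет')
assert not has_cyrillic('Salut')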
318
 
319
+
320
+ def foreign(text=None, # split sentences here so we can prepend a text snippet to each German sentence,
321
  # making it fall on the male voice (attention sink)
322
  lang='romanian',
323
  speed=None):
324
 
325
+ # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
326
+ lang = lang.lower()
327
 
328
  # https://huggingface.co/spaces/mms-meta/MMS
329
 
 
335
 
336
  if has_cyrillic(text): # check 0-th sentence if is cyrillic
337
 
338
+ # romani carpathian (also has latin / cyrillic Vlax)
339
+ lang_code = 'rmc-script_cyrillic'
340
 
341
  else:
342
 
343
+ # romani carpathian (has also Vlax)
344
+ lang_code = 'rmc-script_latin'
345
 
346
  elif 'rom' in lang:
347
 
 
362
 
363
  lang_code = lang.split()[0].strip()
364
 
365
+ # load VITS
366
 
367
  net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval().to(device)
368
  tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
369
 
370
+
371
 
372
  total_audio = []
373
 
 
376
  if lang_code == 'deu':
377
  # split very long sentences (>500 phonemes) - StyleTTS2 crashes; even 400 phonemes can OOM on cuda:4
378
  # however, prosody is nicer when the MMS TTS input is not split
379
+ # prepend text snippet
380
+ text = [
381
+ sub_sent+' ' for sub_sent in textwrap.wrap(text, 200, break_long_words=0)]
382
+ # ensures a consistent (unique) voice is chosen
383
  else:
384
+ # allow longer non split text
385
+ text = [
386
+ sub_sent+' ' for sub_sent in textwrap.wrap(text, 640, break_long_words=0)]
387
+ # for non deu MMS TTS lang.
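Since break_long_words=0, textwrap.wrap only breaks at word boundaries, e.g.:
# sketch
chunks = textwrap.wrap('unu doi trei ' * 200, 640, break_long_words=0)
# every chunk is at most 640 chars and ends on a whole word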
388
 
389
  for _t in text:
390
 
391
  _t = _t.lower()
392
 
393
+ # better applied in api.py -> tts_multi_sentence, before switching between StyleTTS2 and MMS
394
+ print('\n\n\n\nBEFORE transliteration', _t, '\n\n\n\n\n')
395
+ _t = transliterate_number(_t, lang=lang_code)
396
+ print('AFTER numerals', _t, '\n____________________________________________')
397
+
398
+ # however, if we transliterate here, the demo also sees the transliteration
399
+
400
  if lang_code == 'rmc-script_latin':
401
 
402
+ _t = fix_phones(_t) # per-language phoneme replacements
 
403
 
404
  elif lang_code == 'ron':
405
 
406
  # tone
407
  _t = _t.replace("ţ", "ț"
408
+ ).replace('ț', 'ts').replace('î', 'u').replace('â', 'a').replace('ş', 's')
409
 
410
  # /data/dkounadis/.hf7/hub/models--facebook--mms-tts/snapshots/44cc7fb408064ef9ea6e7c59130d88cac1274671/models/rmc-script_latin/vocab.txt
411
+ # input_ids / attention_mask
412
+ inputs = tokenizer(_t, return_tensors="pt")
413
 
414
  with torch.no_grad():
415
 
416
  # MMS
417
 
418
  x = net_g(input_ids=inputs.input_ids.to(device),
419
+ attention_mask=inputs.attention_mask.to(device),
420
+ speed=speed + .44 * np.random.rand() # variable speed for different sentence
421
+ )[0, :]
422
 
423
  # crop the 1st audio - it is the PREFIX text (156000 samples) used to choose the deu voice / VitsAttention()
424
 
 
433
  # print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
434
 
435
  return x # 16kHz - only StyleTTS2 output needs resampling from 24kHz -> 16kHz
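And the foreign/MMS path end to end (a sketch; the language name follows Utils/all_langs.csv, and the patched Modules.vits VitsModel accepts the speed kwarg):
# sketch, not part of this commit
wav16 = foreign(text='Prețul este 42 de lei.', lang='romanian', speed=1.0)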