Spaces:

tartuNLP
/

XTTSv2-est

Running

App Files Files Community

Rasmus Lellep committed on Aug 8

Commit

03381f4

1 Parent(s): 2bac171

added estonian examples, changed demo description

Browse files

Files changed (4) hide show

.DS_Store +0 -0
app.py +97 -89
app_local.py +0 -62
examples/LJ005-0214.wav +3 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -96,9 +96,7 @@ supported_languages = config.languages
 def predict(
     prompt,
     language,
-    audio_file_pth,
-    mic_file_path,
-    use_mic,
     voice_cleanup,
     no_lang_auto_detect,
     agree,
@@ -146,23 +144,6 @@ def predict(
                     None,
                 )
-        if use_mic == True:
-            if mic_file_path is not None:
-                speaker_wav = mic_file_path
-            else:
-                gr.Warning(
-                    "Please record your voice with Microphone, or uncheck Use Microphone to use reference audios"
-                )
-                return (
-                    None,
-                    None,
-                    None,
-                    None,
-                )
-        else:
-            speaker_wav = audio_file_pth
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
@@ -273,6 +254,7 @@ def predict(
             wav_chunks = []
             ## Direct mode
             print("I: Generating new audio...")
             t0 = time.time()
             out = model.inference(
@@ -293,9 +275,9 @@ def predict(
             print(f"Real-time factor (RTF): {real_time_factor}")
             metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
-            """
             print("I: Generating new audio in streaming mode...")
             t0 = time.time()
             chunks = model.inference_stream(
@@ -330,7 +312,7 @@ def predict(
             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
-            """
         except RuntimeError as e:
             if "device-side assert" in str(e):
@@ -353,9 +335,7 @@ def predict(
                     error_time,
                     prompt,
                     language,
-                    audio_file_pth,
-                    mic_file_path,
-                    use_mic,
                     voice_cleanup,
                     no_lang_auto_detect,
                     agree,
@@ -436,27 +416,15 @@ description = """
 <br/>
-This demo is currently running **XTTS v2.0.3** <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning, however, you can fine-tune XTTS for better results. Leave a star 🌟 on Github <a href="https://github.com/coqui-ai/TTS">🐸TTS</a>, where our open-source inference and training code lives.
-<br/>
-Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, Estonian: et, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
 <br/>
 """
-links = """
-<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
-|                                 |                                         |
-| ------------------------------- | --------------------------------------- |
-| 🐸💬 **CoquiTTS**                | <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>|
-| 💼 **Documentation**            | [ReadTheDocs](https://tts.readthedocs.io/en/latest/)
-| 👩‍💻 **Questions**                | [GitHub Discussions](https://github.com/coqui-ai/TTS/discussions) |
-| 🗯 **Community**         | [![Dicord](https://img.shields.io/discord/1037326658807533628?color=%239B59B6&label=chat%20on%20discord)](https://discord.gg/5eXr5seRrv)  |
-"""
 article = """
 <div style='margin:20px auto;'>
@@ -465,12 +433,91 @@ article = """
 </div>
 """
 examples = [
     [
         "Once when I was six years old I saw a magnificent picture",
         "en",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -479,8 +526,6 @@ examples = [
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
         "fr",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -489,8 +534,6 @@ examples = [
         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
         "de",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -499,8 +542,6 @@ examples = [
         "Cuando tenía seis años, vi una vez una imagen magnífica",
         "es",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -509,8 +550,6 @@ examples = [
         "Kunagi, kui olin kuueaastane, nägin ma ühte imelist pilti",
         "et",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -519,8 +558,6 @@ examples = [
         "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
         "pt",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -529,8 +566,6 @@ examples = [
         "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
         "pl",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -539,8 +574,6 @@ examples = [
         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
         "it",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -549,8 +582,6 @@ examples = [
         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
         "tr",
         "examples/male.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -559,8 +590,6 @@ examples = [
         "Когда мне было шесть лет, я увидел однажды удивительную картинку",
         "ru",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -569,8 +598,6 @@ examples = [
         "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
         "nl",
         "examples/male.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -579,8 +606,6 @@ examples = [
         "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
         "cs",
         "examples/female.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -589,8 +614,6 @@ examples = [
         "当我还只有六岁的时候， 看到了一副精彩的插画",
         "zh-cn",
         "examples/male.wav",
-        None,
-        False,
         False,
         False,
         True,
@@ -599,9 +622,7 @@ examples = [
         "かつて 六歳のとき、素晴らしい絵を見ました",
         "ja",
         "examples/female.wav",
-        None,
         False,
-        True,
         False,
         True,
     ],
@@ -609,9 +630,7 @@ examples = [
         "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
         "ko",
         "examples/male.wav",
-        None,
         False,
-        True,
         False,
         True,
     ],
@@ -619,12 +638,11 @@ examples = [
         "Egyszer hat éves koromban láttam egy csodálatos képet",
         "hu",
         "examples/male.wav",
-        None,
         False,
-        True,
         False,
         True,
     ],
 ]
@@ -644,12 +662,11 @@ with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown(description)
-        with gr.Column():
-            gr.Markdown(links)
     with gr.Row():
         with gr.Column():
             input_text_gr = gr.Textbox(
                 label="Text Prompt",
                 info="One or two sentences at a time is better. Up to 200 text characters.",
                 value="Tere, olen sinu hääle kloon. Ürita mulle lindistada võimalikult hea kvaliteediga klipp, et oskaksin su kõnet paremini jäljendada.",
@@ -679,24 +696,15 @@ with gr.Blocks(analytics_enabled=False) as demo:
                 ],
                 multiselect=False,
                 value="et",
             )
             ref_gr = gr.Audio(
                 label="Reference Audio",
                 #info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value="examples/female.wav",
             )
-            mic_gr = gr.Audio(
-                sources="microphone",
-                #info="Use your microphone to record audio",
-                type="filepath",
-                label="Use Microphone for Reference",
-            )
-            use_mic_gr = gr.Checkbox(
-                label="Use Microphone",
-                value=False,
-                info="Notice: Microphone input may not work properly under traffic",
-            )
             clean_ref_gr = gr.Checkbox(
                 label="Cleanup Reference Voice",
                 value=False,
@@ -724,12 +732,12 @@ with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Row():
         gr.Examples(examples,
                     label="Examples",
-                    inputs=[input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
                     outputs=[audio_gr, out_text_gr, ref_audio_gr],
                     fn=predict,
                     cache_examples=False,)
-    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, mic_gr, use_mic_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[audio_gr, out_text_gr, ref_audio_gr])
 if __name__ == "__main__":
     demo.queue()

 def predict(
     prompt,
     language,
+    speaker_wav,
     voice_cleanup,
     no_lang_auto_detect,
     agree,
                     None,
                 )
         # Filtering for microphone input, as it has BG noise, maybe silence in beginning and end
         # This is fast filtering not perfect
             wav_chunks = []
             ## Direct mode
+            '''
             print("I: Generating new audio...")
             t0 = time.time()
             out = model.inference(
             print(f"Real-time factor (RTF): {real_time_factor}")
             metrics_text+=f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
+            '''
             print("I: Generating new audio in streaming mode...")
             t0 = time.time()
             chunks = model.inference_stream(
             metrics_text += f"Real-time factor (RTF): {real_time_factor:.2f}\n"
             torchaudio.save("output.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
         except RuntimeError as e:
             if "device-side assert" in str(e):
                     error_time,
                     prompt,
                     language,
+                    speaker_wav,
                     voice_cleanup,
                     no_lang_auto_detect,
                     agree,
 <br/>
+This demo is running an XTTS model fine-tuned on Estonian data. <a href="https://huggingface.co/coqui/XTTS-v2">XTTS</a> is a multilingual text-to-speech and voice-cloning model. This demo features zero-shot voice cloning.
 <br/>
 """
+#Supported languages: Arabic: ar, Brazilian Portuguese: pt , Mandarin Chinese: zh-cn, Czech: cs, Dutch: nl, English: en, Estonian: et, French: fr, German: de, Italian: it, Polish: pl, Russian: ru, Spanish: es, Turkish: tr, Japanese: ja, Korean: ko, Hungarian: hu, Hindi: hi
+#<br/>
+#"""
 article = """
 <div style='margin:20px auto;'>
 </div>
 """
 examples = [
+    [
+        "Kuigi ilm oli vihmane ja tuuline, otsustasid matkajad siiski metsa minna, lootes, et pilved hajuvad.",
+        "et",
+        "examples/female.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Vanaema jutustatud muinasjutt oli nii kaasahaarav, et lapsed unustasid aja ja kuulasid vaikselt iga sõna.",
+        "et",
+        "examples/male.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Arvuti tarkvarauuendus lahendas mitu väikest, kuid tüütut probleemi, mis olid kasutusmugavust seganud.",
+        "et",
+        "examples/female.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Pärast pikka ja pingelist töönädalat tundus nädalavahetus maal vanemate juures nagu paradiis.",
+        "et",
+        "examples/male.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Kuigi laulja esitus oli tehniliselt veatu, puudus selles miskipärast südamest tulev emotsioon, mis publikut liigutaks.",
+        "et",
+        "examples/female.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Ajalooeksami ettevalmistamine nõudis väga palju materjali läbitöötamist, alates Vana-Roomast kuni tänapäevani.",
+        "et",
+        "examples/male.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Tõde ja ilu on kaks mõistet, mille tähendus on muutunud ajas ja kultuurides, kuid mis on alati olnud inimkonna arutelude keskmes.",
+        "et",
+        "examples/LJ001-0030.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Kogu pere osales kevadel talgupäeval, koristades parki ja istutades uusi puid, et anda oma panus kogukonda.",
+        "et",
+        "examples/LJ001-0030.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Kuigi ta oli reisinud paljudes maailma paikades, tunnistas ta, et kõige ilusamad päikeseloojangud on siiski Eesti rannikul.",
+        "et",
+        "examples/LJ005-0214.wav",
+        False,
+        False,
+        True,
+    ],
+    [
+        "Sõbrad arutasid pikalt, kas minna suvepuhkusele Lõuna-Euroopasse või avastada hoopis Lõuna-Eesti kauneid loodusradu.",
+        "et",
+        "examples/LJ005-0214.wav",
+        False,
+        False,
+        True,
+    ],
+    '''
     [
         "Once when I was six years old I saw a magnificent picture",
         "en",
         "examples/female.wav",
         False,
         False,
         True,
         "Lorsque j'avais six ans j'ai vu, une fois, une magnifique image",
         "fr",
         "examples/female.wav",
         False,
         False,
         True,
         "Als ich sechs war, sah ich einmal ein wunderbares Bild",
         "de",
         "examples/female.wav",
         False,
         False,
         True,
         "Cuando tenía seis años, vi una vez una imagen magnífica",
         "es",
         "examples/female.wav",
         False,
         False,
         True,
         "Kunagi, kui olin kuueaastane, nägin ma ühte imelist pilti",
         "et",
         "examples/female.wav",
         False,
         False,
         True,
         "Quando eu tinha seis anos eu vi, uma vez, uma imagem magnífica",
         "pt",
         "examples/female.wav",
         False,
         False,
         True,
         "Kiedy miałem sześć lat, zobaczyłem pewnego razu wspaniały obrazek",
         "pl",
         "examples/female.wav",
         False,
         False,
         True,
         "Un tempo lontano, quando avevo sei anni, vidi un magnifico disegno",
         "it",
         "examples/female.wav",
         False,
         False,
         True,
         "Bir zamanlar, altı yaşındayken, muhteşem bir resim gördüm",
         "tr",
         "examples/male.wav",
         False,
         False,
         True,
         "Когда мне было шесть лет, я увидел однажды удивительную картинку",
         "ru",
         "examples/female.wav",
         False,
         False,
         True,
         "Toen ik een jaar of zes was, zag ik op een keer een prachtige plaat",
         "nl",
         "examples/male.wav",
         False,
         False,
         True,
         "Když mi bylo šest let, viděl jsem jednou nádherný obrázek",
         "cs",
         "examples/female.wav",
         False,
         False,
         True,
         "当我还只有六岁的时候， 看到了一副精彩的插画",
         "zh-cn",
         "examples/male.wav",
         False,
         False,
         True,
         "かつて 六歳のとき、素晴らしい絵を見ました",
         "ja",
         "examples/female.wav",
         False,
         False,
         True,
     ],
         "한번은 내가 여섯 살이었을 때 멋진 그림을 보았습니다.",
         "ko",
         "examples/male.wav",
         False,
         False,
         True,
     ],
         "Egyszer hat éves koromban láttam egy csodálatos képet",
         "hu",
         "examples/male.wav",
         False,
         False,
         True,
     ],
+    '''
 ]
     with gr.Row():
         with gr.Column():
             gr.Markdown(description)
     with gr.Row():
         with gr.Column():
             input_text_gr = gr.Textbox(
+                lines=2,
                 label="Text Prompt",
                 info="One or two sentences at a time is better. Up to 200 text characters.",
                 value="Tere, olen sinu hääle kloon. Ürita mulle lindistada võimalikult hea kvaliteediga klipp, et oskaksin su kõnet paremini jäljendada.",
                 ],
                 multiselect=False,
                 value="et",
+                interactive=False,
             )
             ref_gr = gr.Audio(
                 label="Reference Audio",
+                sources=["microphone", "upload"],
                 #info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value="examples/female.wav",
             )
             clean_ref_gr = gr.Checkbox(
                 label="Cleanup Reference Voice",
                 value=False,
     with gr.Row():
         gr.Examples(examples,
                     label="Examples",
+                    inputs=[input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr],
                     outputs=[audio_gr, out_text_gr, ref_audio_gr],
                     fn=predict,
                     cache_examples=False,)
+    tts_button.click(predict, [input_text_gr, language_gr, ref_gr, clean_ref_gr, auto_det_lang_gr, tos_gr], outputs=[audio_gr, out_text_gr, ref_audio_gr])
 if __name__ == "__main__":
     demo.queue()

app_local.py DELETED Viewed

@@ -1,62 +0,0 @@
-import gradio as gr
-import numpy as np
-import torch
-from TTS.tts.configs.xtts_config import XttsConfig
-from TTS.tts.models.xtts import Xtts
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-def load_model():
-    config = XttsConfig()
-    config.load_json("model/config.json")
-    XTTS_MODEL = Xtts.init_from_config(config)
-    XTTS_MODEL.load_checkpoint(
-        config,
-        checkpoint_path="model/model.pth",
-        vocab_path="model/vocab.json",
-        eval=True,
-        use_deepspeed=False
-    )
-    XTTS_MODEL.to(device)
-    return XTTS_MODEL
-model = load_model()
-def predict(sentence, language, reference_clip):
-    if not reference_clip or not reference_clip.split('.')[-1] in ['mp3', 'wav']:
-        return
-    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
-        audio_path=reference_clip,
-        gpt_cond_len=model.config.gpt_cond_len,
-        max_ref_length=model.config.max_ref_len,
-        sound_norm_refs=model.config.sound_norm_refs,
-    )
-    wav_chunks = []
-    for chunk in model.inference_stream(
-        text=sentence,
-        language=language,
-        gpt_cond_latent=gpt_cond_latent,
-        speaker_embedding=speaker_embedding,
-        temperature=model.config.temperature,
-        length_penalty=model.config.length_penalty,
-        repetition_penalty=model.config.repetition_penalty,
-        top_k=model.config.top_k,
-        top_p=model.config.top_p,
-    ):
-        if chunk is not None:
-            wav_chunks.append(chunk)
-    return (22050, torch.cat(wav_chunks, dim=0).unsqueeze(0)[0].numpy())
-demo = gr.Interface(
-    title="XTTSv2-est Demo",
-    description="To get the best results, provide a reference clip around the same length as the output sentence you want.",
-    fn=predict,
-    inputs=["text", gr.Dropdown(["et", "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "hu", "ko", "ja", "hi"]), gr.File()],
-    outputs=[gr.Audio()],
-)
-if __name__ == "__main__":
-    demo.queue()
-    demo.launch()

examples/LJ005-0214.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74ca9692e2c4180768c0eb2de07c056f75425615d47f7c0fcfd8e31589a2e643
+size 424294