Darija-Arabic-TTS

Running on Zero

App Files Files Community

MohamedRashad committed on Jan 27

Commit

69b575f

1 Parent(s): 2d772b8

Add initial implementation of Egyptian-Arabic TTS with Gradio interface

Browse files

Files changed (2) hide show

app.py +80 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import torch
+from TTS.tts.configs.xtts_config import XttsConfig
+from TTS.tts.models.xtts import Xtts
+from pathlib import Path
+import gradio as gr
+import spaces
+CONFIG_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/config.json'
+VOCAB_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/vocab.json'
+MODEL_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/model.pth'
+SPEAKER_AUDIO_URL = 'https://huggingface.co/OmarSamir/EGTTS-V0.1/resolve/main/speaker_reference.wav'
+base_path = Path(__file__).parent
+# Download the files into the base_path
+config_path = base_path / 'config.json'
+if not config_path.exists():
+    torch.hub.download_url_to_file(CONFIG_URL, config_path)
+vocab_path = base_path / 'vocab.json'
+if not vocab_path.exists():
+    torch.hub.download_url_to_file(VOCAB_URL, vocab_path)
+model_path = base_path / 'model.pth'
+if not model_path.exists():
+    torch.hub.download_url_to_file(MODEL_URL, model_path)
+speaker_audio_path = base_path / 'speaker_reference.wav'
+if not speaker_audio_path.exists():
+    torch.hub.download_url_to_file(SPEAKER_AUDIO_URL, speaker_audio_path)
+config_path = str(config_path)
+vocab_path = str(vocab_path)
+model_path = str(model_path.parent)
+speaker_audio_path = str(speaker_audio_path)
+print("Loading model...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+config = XttsConfig()
+config.load_json(config_path)
+model = Xtts.init_from_config(config)
+model.load_checkpoint(config, checkpoint_dir=model_path, use_deepspeed=True, vocab_path=vocab_path)
+model.to(device)
+@spaces.GPU
+def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
+    print("Computing speaker latents...")
+    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])
+    print("Inference...")
+    out = model.inference(
+        text,
+        "ar",
+        gpt_cond_latent,
+        speaker_embedding,
+        temperature=temperature,
+    )
+    return 24000, out["wav"]
+header = """<h1 style="text-align:center">Egyptian-Arabic-TTS (EGTTS)</h1>
+## Instructions:
+1. Enter the text you want to synthesize.
+2. Upload a 4-5 seconds audio file of the speaker you want to clone.
+3. Click on the "Generate" button.
+**This space was only possible because of the amazing work done by [OmarSamir](https://huggingface.co/OmarSamir) on the [EGTTS](https://huggingface.co/OmarSamir/EGTTS-V0.1) model.**
+"""
+with gr.Blocks(title="EGTTS") as app:
+    gr.Markdown(header)
+    with gr.Row():
+        with gr.Column():
+            text = gr.Textbox(label="Text to synthesize", value="السلام عليكم ورحمة الله")
+            speaker_refrence = gr.Audio(label="Speaker reference", value=speaker_audio_path, type="filepath")
+            temperature = gr.Slider(label="Temperature", min_value=0.1, max_value=1.0, value=0.75, step=0.05)
+            generate_btn = gr.Button(value="Generate", variant="primary")
+        output = gr.Audio(label="Synthesized audio")
+    generate_btn.click(infer_EGTTS, inputs=[text, speaker_refrence, temperature], outputs=output)
+app.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+git+https://github.com/coqui-ai/TTS
+transformers
+deepspeed
+torch
+torchaudio
+spaces
+gradio