Spaces:

projecte-aina
/

matxa-alvocat-tts-ca

Running

App Files Files Community

wetdog committed on Mar 6, 2024

Commit

92df4f5

1 Parent(s): cde96b8

add inference app

Browse files

Files changed (1) hide show

infer_onnx.py +180 -0

infer_onnx.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import os
+import tempfile
+
+import gradio as gr
+import numpy as np
+import onnxruntime
+import soundfile as sf
+import torch
+import yaml
+
+import utils
+from text import text_to_sequence, sequence_to_text
+def intersperse(lst, item):
+    result = [item] * (len(lst) * 2 + 1)
+    result[1::2] = lst
+    return result
+def process_text(i: int, text: str, device: torch.device):
+    print(f"[{i}] - Input text: {text}")
+    x = torch.tensor(
+        intersperse(text_to_sequence(text, ["catalan_cleaners"]), 0),
+        dtype=torch.long,
+        device=device,
+    )[None]
+    x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
+    x_phones = sequence_to_text(x.squeeze(0).tolist())
+    print(x_phones)
+    return x.numpy(), x_lengths.numpy()
+MODEL_PATH_MATCHA_MEL="matcha_multispeaker_cat_opset_15.onnx"
+MODEL_PATH_MATCHA="matcha_hifigan_multispeaker_cat.onnx"
+MODEL_PATH_VOCOS="mel_spec_22khz.onnx"
+CONFIG_PATH="/home/jgiraldo/projects/tts-onnx-comparison/config_22khz.yaml"
+sess_options = onnxruntime.SessionOptions()
+model_matcha_mel= onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA_MEL), sess_options=sess_options, providers=["CPUExecutionProvider"])
+model_vocos = onnxruntime.InferenceSession(str(MODEL_PATH_VOCOS), sess_options=sess_options, providers=["CPUExecutionProvider"])
+model_matcha = onnxruntime.InferenceSession(str(MODEL_PATH_MATCHA), sess_options=sess_options, providers=["CPUExecutionProvider"])
+def vocos_inference(mel: torch.Tensor, config):
+    with open(CONFIG_PATH, "r") as f:
+        config = yaml.safe_load(f)
+    params = config["feature_extractor"]["init_args"]
+    sample_rate = params["sample_rate"]
+    n_fft= params["n_fft"]
+    hop_length= params["hop_length"]
+    win_length = n_fft
+    # ONNX inference
+    mag, x, y = model_vocos.run(
+        None,
+        {
+            "mels": mel.float().numpy()
+        },
+    )
+    # complex spectrogram from vocos output
+    spectrogram = mag * (x + 1j * y)
+    window = torch.hann_window(win_length)
+    # Inverse stft
+    pad = (win_length - hop_length) // 2
+    spectrogram = torch.tensor(spectrogram)
+    B, N, T = spectrogram.shape
+    print("Spectrogram synthesized shape", spectrogram.shape)
+    # Inverse FFT
+    ifft = torch.fft.irfft(spectrogram, n_fft, dim=1, norm="backward")
+    ifft = ifft * window[None, :, None]
+    # Overlap and Add
+    output_size = (T - 1) * hop_length + win_length
+    y = torch.nn.functional.fold(
+        ifft, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
+    )[:, 0, 0, pad:-pad]
+    # Window envelope
+    window_sq = window.square().expand(1, T, -1).transpose(1, 2)
+    window_envelope = torch.nn.functional.fold(
+        window_sq, output_size=(1, output_size), kernel_size=(1, win_length), stride=(1, hop_length),
+    ).squeeze()[pad:-pad]
+    # Normalize
+    assert (window_envelope > 1e-11).all()
+    y = y / window_envelope
+    return y
+def tts(text:str, spk_id:int):
+    sid = np.array([int(spk_id)]) if spk_id is not None else None
+    text_matcha , text_lengths = process_text(0,text,"cpu")
+    # MATCHA VOCOS
+    inputs = {
+        "x": text_matcha,
+        "x_lengths": text_lengths,
+        "scales": np.array([0.667, 1.0], dtype=np.float32),
+        "spks": sid
+    }
+    mel, mel_lengths = model_matcha_mel.run(None, inputs)
+    # vocos inference
+    wavs_vocos = vocos_inference(mel, CONFIG_PATH)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha_vocos:
+        sf.write(fp_matcha_vocos.name, wavs_vocos.squeeze(0), 22050, "PCM_24")
+    #MATCHA HIFIGAN
+    inputs = {
+        "x": text_matcha,
+        "x_lengths": text_lengths,
+        "scales": np.array([0.667, 1.0], dtype=np.float32),
+        "spks": sid
+    }
+    wavs, wav_lengths = model_matcha.run(None, inputs)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_matcha:
+        sf.write(fp_matcha.name, wavs.squeeze(0), 22050, "PCM_24")
+    return fp_matcha_vocos.name, fp_matcha.name
+## GUI space
+title = """
+<div style="text-align: center; max-width: 700px; margin: 0 auto;">
+    <div
+        style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
+    > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+        TTS Catalan Comparison
+    </h1> </div>
+</div>
+ """
+description = """
+VITS2 is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. VITS2 improved the
+training and inference efficiency and naturalness by introducing adversarial learning into the duration predictor. The transformer
+block was added to the normalizing flows to capture the long-term dependency when transforming the distribution.
+The synthesis quality was improved by incorporating Gaussian noise into the alignment search.
+🍵 Matcha-TTS, a new approach to non-autoregressive neural TTS, that uses conditional flow matching (similar to rectified flows) to speed up ODE-based speech synthesis
+Models are being trained in openslr69 and festcat datasets
+"""
+article = "Training and demo by BSC."
+vits2_inference = gr.Interface(
+    fn=tts,
+    inputs=[
+        gr.Textbox(
+            value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.",
+            max_lines=1,
+            label="Input text",
+        ),
+        gr.Slider(
+            1,
+            47,
+            value=10,
+            step=1,
+            label="Speaker id",
+            info=f"Models are trained on 47 speakers. You can prompt the model using one of these speaker ids.",
+        ),
+    ],
+    outputs=[gr.Audio(label="Matcha vocos", interactive=False, type="filepath"),
+             gr.Audio(label="Matcha", interactive=False, type="filepath")]
+)
+demo = gr.Blocks()
+with demo:
+    gr.Markdown(title)
+    gr.Markdown(description)
+    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
+    gr.Markdown(article)
+demo.queue(max_size=10)
+demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)