Spaces:

mehdi999
/

pardi-speech

Running on Zero

App Files Files Community

mehdi999 committed 24 days ago

Commit

4af42e5

1 Parent(s): ef11c21

Demo Lina-speech (pardi-speech) on Spaces

Browse files

Files changed (3) hide show

app.py +157 -0
readme.md +11 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# app.py
+import os
+import gradio as gr
+import numpy as np
+import torch
+import soundfile as sf
+import spaces
+from huggingface_hub import login
+from pardi_speech import PardiSpeech, VelocityHeadSamplingParams  # présent dans ce repo
+# Les sous-modules requis se trouvent dans tts/ et codec/
+# Hub repository the checkpoint is loaded from; overridable through the MODEL_REPO_ID environment variable.
+MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "theodorr/pardi-speech-enfr-forbidden")
+# Hugging Face auth (the secret is defined in the Space settings)
+HF_TOKEN = os.environ.get("HF_TOKEN")
+if HF_TOKEN:
+    try:
+        login(token=HF_TOKEN)
+        print("✅ Logged to Hugging Face Hub.")
+    except Exception as e:
+        # Best effort: a failed login is reported but does not abort the module import.
+        print("⚠️ HF login failed:", e)
+# Lazy loading for ZeroGPU (and to reduce cold-start time)
+# Module-level model cache; stays None until _load_model() fills it on its first call.
+_pardi = None
+# Default sampling rate; _load_model() overwrites it with the model's `sampling_rate` attribute when present.
+_sampling_rate = 24000
+def _normalize_text(s: str, lang_hint: str = "fr") -> str:
+    """Normalisation légère façon Whisper: lowercase + chiffres en lettres (si possible)."""
+    s = (s or "").strip().lower()
+    try:
+        import re
+        from num2words import num2words
+        def repl(m): return num2words(int(m.group()), lang=lang_hint)
+        s = re.sub(r"\d+", repl, s)
+    except Exception:
+        pass
+    return s
+def _load_model(device: str = "cuda"):
+    global _pardi, _sampling_rate
+    if _pardi is None:
+        _pardi = PardiSpeech.from_pretrained(MODEL_REPO_ID, map_location=device)
+        _sampling_rate = getattr(_pardi, "sampling_rate", 24000)
+        print(f"✅ PardiSpeech loaded on {device} (sr={_sampling_rate}).")
+    return _pardi
+def _to_mono_float32(arr: np.ndarray) -> np.ndarray:
+    arr = arr.astype(np.float32)
+    if arr.ndim == 2:  # stereo -> mono
+        arr = arr.mean(axis=1)
+    return arr
+@spaces.GPU(duration=120)  # ZeroGPU: alloue un GPU pendant l'appel (noop ailleurs)
+def synthesize(
+    text: str,
+    ref_audio,                      # tuple (sr, np.ndarray) ou chemin
+    ref_text: str,
+    steps: int,
+    cfg: float,
+    cfg_ref: float,
+    temperature: float,
+    max_seq_len: int,
+    seed: int,
+    lang_hint: str
+):
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    torch.manual_seed(int(seed))
+    pardi = _load_model(device)
+    txt = _normalize_text(text, lang_hint=lang_hint)
+    # Prépare cache décodage
+    cache = pardi.tts.audio_decoder.init_cache(int(max_seq_len), device)
+    # Paramètres de sampling (cf. notebook inference & tes notes)
+    vel_params = VelocityHeadSamplingParams(
+        cfg_ref=float(cfg_ref),
+        cfg=float(cfg),
+        num_steps=int(steps),
+        temperature=float(temperature)
+    )
+    # Gestion du prefix (optionnel)
+    prefix = None
+    if ref_audio is not None:
+        # ref_audio peut être un chemin ou (sr, wav)
+        if isinstance(ref_audio, str):
+            wav, sr = sf.read(ref_audio)
+        else:
+            sr, wav = ref_audio
+        wav = _to_mono_float32(np.array(wav))
+        wav_t = torch.from_numpy(wav).to(device)
+        # Resample sur le sr attendu par le codec/ modèle
+        import torchaudio
+        if sr != pardi.sampling_rate:
+            wav_t = torchaudio.functional.resample(wav_t, sr, pardi.sampling_rate)
+        wav_t = wav_t.unsqueeze(0)  # [1, T]
+        with torch.inference_mode():
+            # Encode prefix en tokens via PatchVAE (comme dans inference.ipynb)
+            prefix_tokens = pardi.patchvae.encode(wav_t)  # [1, ...]
+        # ref_text est optionnel ; s’il est vide, on passe une chaîne vide
+        prefix = (ref_text or "", prefix_tokens[0])
+    # Synthèse
+    with torch.inference_mode():
+        wavs, _ = pardi.text_to_speech(
+            [txt],
+            prefix,
+            max_seq_len=int(max_seq_len),
+            velocity_head_sampling_params=vel_params,
+            cache=cache
+        )
+    wav = wavs[0].detach().cpu().numpy()  # float32 [-1,1]
+    return (_sampling_rate, wav)
+def build_demo():
+    with gr.Blocks(title="Lina‑speech / pardi‑speech Demo") as demo:
+        gr.Markdown(
+            "## Lina‑speech (pardi‑speech) – Démo TTS\n"
+            "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence).\n"
+            "Paramètres avancés: *num_steps*, *CFG*, *température*, *max_seq_len*, *seed*."
+        )
+        with gr.Row():
+            text = gr.Textbox(label="Texte à synthétiser", lines=4, placeholder="Tape ton texte ici…")
+        with gr.Accordion("Prefix (optionnel)", open=False):
+            ref_audio = gr.Audio(sources=["upload", "microphone"], type="numpy", label="Audio de référence")
+            ref_text  = gr.Textbox(label="Texte du prefix (si connu)", placeholder="Transcription du prefix (optionnel)")
+        with gr.Accordion("Options avancées", open=False):
+            with gr.Row():
+                steps = gr.Slider(1, 50, value=10, step=1, label="num_steps")
+                cfg = gr.Slider(0.5, 3.0, value=1.4, step=0.05, label="CFG (guidance)")
+                cfg_ref = gr.Slider(0.5, 3.0, value=1.0, step=0.05, label="CFG (réf.)")
+            with gr.Row():
+                temperature = gr.Slider(0.1, 2.0, value=1.0, step=0.05, label="Température")
+                max_seq_len = gr.Slider(50, 1200, value=300, step=10, label="max_seq_len (tokens audio)")
+                seed = gr.Number(value=0, precision=0, label="Seed (reproductibilité)")
+            lang_hint = gr.Dropdown(choices=["fr", "en"], value="fr", label="Langue (normalisation)")
+        btn = gr.Button("Synthétiser")
+        out_audio = gr.Audio(label="Sortie audio", type="numpy")
+        # File d'attente pour GPU (gestion du débit)
+        demo.queue(default_concurrency_limit=1, max_size=32)
+        btn.click(
+            fn=synthesize,
+            inputs=[text, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
+            outputs=[out_audio]
+        )
+    return demo
+# Script entry point: build the UI and start the Gradio server when run directly.
+if __name__ == "__main__":
+    demo = build_demo()
+    demo.launch()

readme.md ADDED Viewed

	@@ -0,0 +1,11 @@

+# Lina-speech (pardi-speech) — Demo Gradio
+- Charge un checkpoint privé: `${MODEL_REPO_ID}` (par défaut `theodorr/pardi-speech-enfr-forbidden`)
+- Nécessite un secret `HF_TOKEN` (Settings ▸ Secrets)
+## Paramètres
+- num_steps, CFG, CFG_ref, température, max_seq_len, seed
+- Prefix optionnel: audio + texte (si disponible)
+## Matériel
+- ZeroGPU (PRO requis pour héberger) ou GPU T4/L4/L40S/A10G…

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+gradio>=4.44.0
+spaces>=0.20.0
+huggingface_hub>=0.24.0
+torch>=2.2.0
+torchaudio>=2.2.0
+numpy
+soundfile
+librosa
+num2words
+tqdm