mehdi999 committed
Commit 9f2e2fc · 1 Parent(s): 2dc4aff

added few things

Files changed (1):
  1. app.py  +127 -168
app.py CHANGED
@@ -1,122 +1,33 @@
  import os
- import time
- import traceback
- import threading
- from concurrent.futures import ThreadPoolExecutor, TimeoutError as FTimeout
-
  import gradio as gr
  import numpy as np
- import soundfile as sf
  import torch
  import spaces

- # ---------- Force safe runtime BEFORE any project imports ----------
  os.environ.setdefault("FLA_CONV_BACKEND", "torch")
  os.environ.setdefault("FLA_USE_FAST_OPS", "0")
- os.environ.setdefault("FLA_DISABLE_TRITON", "1")
- os.environ.setdefault("TORCH_COMPILE_DISABLE", "1")
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
- os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
- os.environ.setdefault("PYTORCH_NO_CUDA_MEMORY_CACHING", "1")
- os.environ.setdefault("PYTORCH_JIT_DISABLE", "1")
- os.environ.setdefault("TORCHINDUCTOR_DISABLE", "1")
- os.environ.setdefault("NVTX_PROFILE", "0")

  torch.backends.cuda.matmul.allow_tf32 = True
  try:
      torch.set_float32_matmul_precision("high")
  except Exception:
      pass

- from huggingface_hub import login
-
- # Delay project imports until after we install stubs/patches
- def _install_fla_stub_and_instrumentation(LOG):
-     """
-     - Replace SimpleGatedLinearAttention by a safe PyTorch stub
-     - Instrument key constructors to log begin/end
-     """
-     try:
-         import importlib
-
-         # --- FLA stub on SimpleGatedLinearAttention
-         sgm = importlib.import_module("tts.model.simple_gla")
-         import torch.nn as nn
-
-         class SafeSimpleGatedLinearAttention(nn.Module):
-             def __init__(self, *args, **kwargs):
-                 super().__init__()
-                 self.kwargs = dict(kwargs)
-
-             def forward(self, x, past_key_values=None, use_cache: bool = False, **kwargs):
-                 conv_state = None
-                 if use_cache and isinstance(past_key_values, dict):
-                     conv_state = past_key_values.get("conv_state")
-                 return x, conv_state
-
-         sgm.SimpleGatedLinearAttention = SafeSimpleGatedLinearAttention
-         LOG("[patch] SimpleGatedLinearAttention -> Safe stub")
-     except Exception as e:
-         LOG(f"[patch] FLA stub failed: {e}")
-
-     # --- Instrument deeper pieces
-     try:
-         tts_mod = importlib.import_module("tts.tts")
-         _orig_ifc = tts_mod.ARTTSModel.instantiate_from_config
-         def _ifc_verbose(cfg):
-             LOG("[inst] ARTTSModel.instantiate_from_config: begin")
-             o = _orig_ifc(cfg)
-             LOG("[inst] ARTTSModel.instantiate_from_config: end")
-             return o
-         tts_mod.ARTTSModel.instantiate_from_config = staticmethod(_ifc_verbose)  # type: ignore
-         LOG("[patch] ARTTSModel.instantiate_from_config instrumented")
-     except Exception as e:
-         LOG(f"[patch] ARTTSModel patch failed: {e}")
-
-     # Patch constructors that previously appeared in traces
-     try:
-         from codec.models.patchvae.model import PatchVAE
-         _orig_p_init = PatchVAE.__init__
-         def _p_init_verbose(self, *a, **kw):
-             LOG("[inst] PatchVAE.__init__: begin")
-             r = _orig_p_init(self, *a, **kw)
-             LOG("[inst] PatchVAE.__init__: end")
-             return r
-         PatchVAE.__init__ = _p_init_verbose  # type: ignore
-         LOG("[patch] PatchVAE.__init__ instrumented")
-     except Exception as e:
-         LOG(f"[patch] PatchVAE patch failed: {e}")
-
-     try:
-         from codec.models.wavvae.model import WavVAE
-         _orig_w_init = WavVAE.__init__
-         def _w_init_verbose(self, *a, **kw):
-             LOG("[inst] WavVAE.__init__: begin")
-             r = _orig_w_init(self, *a, **kw)
-             LOG("[inst] WavVAE.__init__: end")
-             return r
-         WavVAE.__init__ = _w_init_verbose  # type: ignore
-         LOG("[patch] WavVAE.__init__ instrumented")
-     except Exception as e:
-         LOG(f"[patch] WavVAE patch failed: {e}")
-
-
- def _env_diag() -> str:
-     parts = [f"torch={torch.__version__}"]
-     try:
-         import triton  # type: ignore
-         parts.append(f"triton={getattr(triton, '__version__', 'unknown')}")
-     except Exception:
-         parts.append("triton=not_importable")
-     parts.append(f"cuda.is_available={torch.cuda.is_available()}")
-     if torch.cuda.is_available():
-         parts.append(f"cuda.version={torch.version.cuda}")
-         try:
-             free, total = torch.cuda.mem_get_info()
-             parts.append(f"mem_free={free/1e9:.2f}GB/{total/1e9:.2f}GB")
-         except Exception:
-             pass
-     return " | ".join(parts)


  def _normalize_text(s: str, lang_hint: str = "fr") -> str:
@@ -142,54 +53,71 @@ def _to_mono_float32(arr: np.ndarray) -> np.ndarray:
      return arr.astype(np.float32)


- def _full_thread_dump(LOG, label="stack"):
      try:
-         import faulthandler, io, sys
-         buf = io.StringIO()
-         faulthandler.dump_traceback(file=buf, all_threads=True)
-         LOG(f"[{label}] dump begin")
-         LOG(buf.getvalue()[-2000:])
-         LOG(f"[{label}] dump end")
      except Exception as e:
-         LOG(f"[{label}] dump failed: {e}")
-
-
- def _load_model(LOG):
-     # Apply stub & instrumentation BEFORE imports that build the graph
-     _install_fla_stub_and_instrumentation(LOG)
-
-     # Import model AFTER patches
-     from pardi_speech import PardiSpeech, VelocityHeadSamplingParams as _VHSP  # noqa
-
-     dev = "cuda" if torch.cuda.is_available() else "cpu"
-     LOG(f"[load] PardiSpeech.from_pretrained(repo_id=theodorr/pardi-speech-enfr-forbidden, map_location={dev})…")

-     # Start a watchdog dumper thread for extra detail every 20s
-     stop_evt = threading.Event()
-     def dumper():
-         k = 1
-         while not stop_evt.wait(20.0):
-             _full_thread_dump(LOG, label=f"stack@{20*k}s")
-             k += 1
-     th = threading.Thread(target=dumper, daemon=True)
-     th.start()

-     m = PardiSpeech.from_pretrained("theodorr/pardi-speech-enfr-forbidden", map_location=dev)
      m.eval()
-
-     stop_evt.set()
-     th.join(timeout=1.0)
-
      sr = getattr(m, "sampling_rate", 24000)
-     LOG(f"[load] ready (sr={sr})")
      return m, sr


  @spaces.GPU(duration=200)
  def synthesize(
      text: str,
      debug: bool,
-     adv_sampling: bool,
      ref_audio,
      ref_text: str,
      steps: int,
@@ -200,18 +128,19 @@ def synthesize(
      seed: int,
      lang_hint: str,
  ):
      logs = []
      t0 = time.perf_counter()

      def LOG(msg):
          logs.append(str(msg))
          joined = "\n".join(logs)
-         if len(joined) > 12000:
-             joined = joined[-12000:]
          return joined

      try:
-         HF_TOKEN = os.environ.get("HF_TOKEN")
          if HF_TOKEN:
              try:
                  login(token=HF_TOKEN)
@@ -220,32 +149,40 @@ def synthesize(
                  yield None, LOG(f"⚠️ HF login failed: {e}")

          yield None, LOG("[env] " + _env_diag())
          torch.manual_seed(int(seed))

-         # Load model with watchdog + heartbeats
-         yield None, LOG("[init] loading model…")
-         MAX_WALLTIME_S = 110
          with ThreadPoolExecutor(max_workers=1) as ex:
-             fut = ex.submit(_load_model, LOG)
-             last = time.perf_counter()
              while True:
                  try:
-                     pardi, _sr = fut.result(timeout=2.0)
                      break
                  except FTimeout:
                      now = time.perf_counter()
                      elapsed = now - t0
-                     if now - last >= 2.0:
-                         yield None, LOG(f"[init] still loading… {elapsed:.1f}s")
-                         last = now
                      if elapsed > MAX_WALLTIME_S:
-                         _full_thread_dump(LOG, label="stack@timeout")
                          ex.shutdown(cancel_futures=True)
-                         raise TimeoutError(f"Watchdog: dépassement {elapsed:.1f}s pendant from_pretrained")

          yield None, LOG(f"[init] model ready on {'cuda' if torch.cuda.is_available() else 'cpu'}, sr={_sr}")

-         # ---- Prepare text / prefix ----
          txt = _normalize_text(text, lang_hint=lang_hint)
          yield None, LOG(f"[text] normalized: {txt[:120]}{'…' if len(txt)>120 else ''}")
@@ -265,29 +202,45 @@ def synthesize(
                  import torchaudio
                  if sr != getattr(pardi, "sampling_rate", 24000):
                      wav_t = torchaudio.functional.resample(wav_t, sr, getattr(pardi, "sampling_rate", 24000))
-             except Exception:
-                 LOG("⚠️ torchaudio resample not available")
              wav_t = wav_t.unsqueeze(0)
              with torch.inference_mode():
                  prefix_tokens = pardi.patchvae.encode(wav_t)
              prefix = (ref_text or "", prefix_tokens[0])
              yield None, LOG("[prefix] done.")

-         # ---- Generate ----
          yield None, LOG(f"[run] has_prefix={prefix is not None}, steps={steps}, cfg={cfg}, cfg_ref={cfg_ref}, T={temperature}, max_seq_len={max_seq_len}, seed={seed}, adv_sampling={adv_sampling}")

          with torch.inference_mode():
              if adv_sampling:
-                 from pardi_speech import VelocityHeadSamplingParams
                  try:
-                     vel_params = VelocityHeadSamplingParams(cfg_ref=float(cfg_ref), cfg=float(cfg), num_steps=int(steps))
                  except TypeError:
-                     vel_params = VelocityHeadSamplingParams(cfg_ref=float(cfg_ref), cfg=float(cfg), num_steps=int(steps), temperature=float(temperature))
-                 wavs, _ = pardi.text_to_speech([txt], prefix, max_seq_len=int(max_seq_len), velocity_head_sampling_params=vel_params)
              else:
-                 wavs, _ = pardi.text_to_speech([txt], prefix, max_seq_len=int(max_seq_len))
-
          wav = wavs[0].detach().cpu().numpy().astype(np.float32)
-         yield (24000, wav), LOG(f"[ok] walltime={time.perf_counter()-t0:.2f}s")

      except Exception as e:
          tb = traceback.format_exc()
@@ -298,11 +251,13 @@ def build_demo():
      with gr.Blocks(title="Lina-speech / pardi-speech Demo") as demo:
          gr.Markdown(
              "## Lina-speech (pardi-speech) – Démo TTS\n"
-             "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence)."
          )

          with gr.Row():
-             text = gr.Textbox(label="Texte à synthétiser", lines=4, value="Bonjour ! Ceci est un test de la démo Lina-speech.", placeholder="Tape ton texte ici…")
              debug = gr.Checkbox(value=False, label="Mode debug (afficher la stacktrace)")
              adv_sampling = gr.Checkbox(value=False, label="Sampling avancé (Velocity Head)")
@@ -323,13 +278,17 @@ def build_demo():

          btn = gr.Button("Synthétiser")
          out_audio = gr.Audio(label="Sortie audio", type="numpy")
-         logs_box = gr.Textbox(label="Logs (live)", lines=20)

          demo.queue(default_concurrency_limit=1, max_size=32)
-         btn.click(fn=synthesize,
-                   inputs=[text, debug, adv_sampling, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
-                   outputs=[out_audio, logs_box],
-                   api_name="synthesize")
      return demo


app.py AFTER (updated file; added lines marked +)

  import os
  import gradio as gr
  import numpy as np
  import torch
+ import soundfile as sf
  import spaces
+ import traceback
+ import time
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FTimeout

+ # FLA: forcer les convolutions en backend PyTorch (pas de Triton)
  os.environ.setdefault("FLA_CONV_BACKEND", "torch")
  os.environ.setdefault("FLA_USE_FAST_OPS", "0")
  os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

+ # Meilleure perf FP32 sur GPU compatibles
  torch.backends.cuda.matmul.allow_tf32 = True
  try:
      torch.set_float32_matmul_precision("high")
  except Exception:
      pass

+ from huggingface_hub import login, snapshot_download
+ from pardi_speech import PardiSpeech, VelocityHeadSamplingParams  # présent dans ce repo

+ MODEL_REPO_ID = os.environ.get("MODEL_REPO_ID", "theodorr/pardi-speech-enfr-forbidden")
+ HF_TOKEN = os.environ.get("HF_TOKEN")

+ _pardi = None
+ _sampling_rate = 24000


  def _normalize_text(s: str, lang_hint: str = "fr") -> str:

...
      return arr.astype(np.float32)


+ def _env_diag() -> str:
+     parts = []
      try:
+         parts.append(f"torch={torch.__version__}")
+         try:
+             import triton  # type: ignore
+             parts.append(f"triton={getattr(triton, '__version__', 'unknown')}")
+         except Exception:
+             parts.append("triton=not_importable")
+         parts.append(f"cuda.is_available={torch.cuda.is_available()}")
+         if torch.cuda.is_available():
+             parts.append(f"cuda.version={torch.version.cuda}")
+             try:
+                 free, total = torch.cuda.mem_get_info()
+                 parts.append(f"mem_free={free/1e9:.2f}GB/{total/1e9:.2f}GB")
+             except Exception:
+                 pass
      except Exception as e:
+         parts.append(f"env_diag_error={e}")
+     return " | ".join(parts)


+ def _load_model_cpu_first(log):
+     """
+     Essaye de pré-télécharger puis de charger sur CPU en priorité.
+     Si ça échoue ou dépasse le timeout, on réessaie sur CUDA.
+     """
+     # 1) prefetch repo to local cache (évite les blocages de téléchargement cachés)
+     log("[prefetch] snapshot_download…")
+     local_dir = snapshot_download(
+         repo_id=MODEL_REPO_ID,
+         token=HF_TOKEN,
+         local_dir=None,  # hub cache
+         local_files_only=False,
+         allow_patterns=None,  # tout
+         ignore_patterns=None,
+     )
+     log(f"[prefetch] done -> {local_dir}")
+
+     # 2) CPU load
+     log("[load] from_pretrained(map_location='cpu')…")
+     m = PardiSpeech.from_pretrained(local_dir, map_location='cpu')
      m.eval()
      sr = getattr(m, "sampling_rate", 24000)
+     log(f"[load] cpu OK (sr={sr})")
      return m, sr


+ def _move_to_cuda_if_available(m, log):
+     if torch.cuda.is_available():
+         log("[move] moving model to cuda…")
+         # PardiSpeech expose généralement un .to(device) (via nn.Module)
+         try:
+             m = m.to('cuda')  # type: ignore[attr-defined]
+         except Exception as e:
+             log(f"[move] .to('cuda') failed: {e}. Keeping on CPU.")
+         return m
+     return m
+
+
  @spaces.GPU(duration=200)
  def synthesize(
      text: str,
      debug: bool,
+     adv_sampling: bool,  # toggle "Sampling avancé (Velocity Head)"
      ref_audio,
      ref_text: str,
      steps: int,

...
      seed: int,
      lang_hint: str,
  ):
+     # ---- Generator that streams logs to UI ----
      logs = []
      t0 = time.perf_counter()

      def LOG(msg):
          logs.append(str(msg))
+         # Keep last ~8000 chars
          joined = "\n".join(logs)
+         if len(joined) > 8000:
+             joined = joined[-8000:]
          return joined

      try:
          if HF_TOKEN:
              try:
                  login(token=HF_TOKEN)

...

                  yield None, LOG(f"⚠️ HF login failed: {e}")

          yield None, LOG("[env] " + _env_diag())
+
+         device = "cuda" if torch.cuda.is_available() else "cpu"
          torch.manual_seed(int(seed))
+         os.environ.setdefault("CUDA_LAUNCH_BLOCKING", "1")
+
+         # --- CPU-first loader with heartbeats and timeout ---
+         yield None, LOG("[init] prefetch + CPU-first load…")
+         MAX_WALLTIME_S = 110  # UX watchdog

          with ThreadPoolExecutor(max_workers=1) as ex:
+             fut = ex.submit(_load_model_cpu_first, LOG)
+             last_hb = time.perf_counter()
              while True:
                  try:
+                     m, sr = fut.result(timeout=2.0)
+                     pardi = m
+                     _sr = sr
                      break
                  except FTimeout:
                      now = time.perf_counter()
                      elapsed = now - t0
+                     # heartbeat
+                     if now - last_hb >= 2.0:
+                         yield None, LOG(f"[init] still loading on CPU… {elapsed:.1f}s")
+                         last_hb = now
                      if elapsed > MAX_WALLTIME_S:
                          ex.shutdown(cancel_futures=True)
+                         raise TimeoutError(f"Watchdog: dépassement {elapsed:.1f}s pendant le chargement (CPU)")

+         # Move to cuda if possible
+         pardi = _move_to_cuda_if_available(pardi, LOG)
          yield None, LOG(f"[init] model ready on {'cuda' if torch.cuda.is_available() else 'cpu'}, sr={_sr}")

+         # ---- Text & prefix ----
          txt = _normalize_text(text, lang_hint=lang_hint)
          yield None, LOG(f"[text] normalized: {txt[:120]}{'…' if len(txt)>120 else ''}")

...
                  import torchaudio
                  if sr != getattr(pardi, "sampling_rate", 24000):
                      wav_t = torchaudio.functional.resample(wav_t, sr, getattr(pardi, "sampling_rate", 24000))
+             except Exception as _e:
+                 LOG("⚠️ torchaudio not available for resample; using original SR")
              wav_t = wav_t.unsqueeze(0)
              with torch.inference_mode():
                  prefix_tokens = pardi.patchvae.encode(wav_t)
              prefix = (ref_text or "", prefix_tokens[0])
              yield None, LOG("[prefix] done.")

          yield None, LOG(f"[run] has_prefix={prefix is not None}, steps={steps}, cfg={cfg}, cfg_ref={cfg_ref}, T={temperature}, max_seq_len={max_seq_len}, seed={seed}, adv_sampling={adv_sampling}")
+
+         # ---- FAST PATH by default ----
          with torch.inference_mode():
              if adv_sampling:
+                 yield None, LOG("[run] VelocityHeadSamplingParams enabled…")
                  try:
+                     vel_params = VelocityHeadSamplingParams(
+                         cfg_ref=float(cfg_ref),
+                         cfg=float(cfg),
+                         num_steps=int(steps)
+                     )
                  except TypeError:
+                     vel_params = VelocityHeadSamplingParams(
+                         cfg_ref=float(cfg_ref),
+                         cfg=float(cfg),
+                         num_steps=int(steps),
+                         temperature=float(temperature)
+                     )
+                 wavs, _ = pardi.text_to_speech(
+                     [txt], prefix, max_seq_len=int(max_seq_len),
+                     velocity_head_sampling_params=vel_params
+                 )
              else:
+                 yield None, LOG("[run] fast path (notebook) without VelocityHead…")
+                 wavs, _ = pardi.text_to_speech(
+                     [txt], prefix, max_seq_len=int(max_seq_len)
+                 )
          wav = wavs[0].detach().cpu().numpy().astype(np.float32)
+
+         yield (_sampling_rate, wav), LOG(f"[ok] walltime={time.perf_counter()-t0:.2f}s")

      except Exception as e:
          tb = traceback.format_exc()

...
      with gr.Blocks(title="Lina-speech / pardi-speech Demo") as demo:
          gr.Markdown(
              "## Lina-speech (pardi-speech) – Démo TTS\n"
+             "Génère de l'audio à partir de texte, avec ou sans *prefix* (audio de référence).\n"
+             "Par défaut, le chemin **rapide** (comme dans le notebook) est utilisé. "
+             "Active **Sampling avancé** pour passer par Velocity Head."
          )

          with gr.Row():
+             text = gr.Textbox(label="Texte à synthétiser", lines=4, placeholder="Tape ton texte ici…")
              debug = gr.Checkbox(value=False, label="Mode debug (afficher la stacktrace)")
              adv_sampling = gr.Checkbox(value=False, label="Sampling avancé (Velocity Head)")

...

          btn = gr.Button("Synthétiser")
          out_audio = gr.Audio(label="Sortie audio", type="numpy")
+         logs_box = gr.Textbox(label="Logs (live)", lines=18)

          demo.queue(default_concurrency_limit=1, max_size=32)
+
+         # Use generator function: stream logs to UI while running
+         btn.click(
+             fn=synthesize,
+             inputs=[text, debug, adv_sampling, ref_audio, ref_text, steps, cfg, cfg_ref, temperature, max_seq_len, seed, lang_hint],
+             outputs=[out_audio, logs_box],
+             api_name="synthesize"
+         )
      return demo
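Note: the rewired click handler exposes the demo as a named endpoint (api_name="synthesize"). Below is a minimal, illustrative sketch of driving that endpoint from Python with gradio_client; it is not part of the commit. The Space id is a placeholder, the twelve positional inputs follow the order wired in build_demo(), and the numeric values are examples rather than the app's real slider defaults. Because synthesize is a generator, client.predict() waits for the run to finish and returns the final (audio, logs) pair.

    # Hypothetical client-side call; OWNER/SPACE_NAME is a placeholder.
    from gradio_client import Client

    client = Client("OWNER/SPACE_NAME")

    audio_out, logs = client.predict(
        "Bonjour ! Ceci est un test de la démo Lina-speech.",  # text
        False,   # debug
        False,   # adv_sampling (False -> fast path without Velocity Head)
        None,    # ref_audio (no reference prefix)
        "",      # ref_text
        8,       # steps (illustrative value)
        1.0,     # cfg (illustrative value)
        1.0,     # cfg_ref (illustrative value)
        1.0,     # temperature (illustrative value)
        1000,    # max_seq_len (illustrative value)
        0,       # seed
        "fr",    # lang_hint
        api_name="/synthesize",
    )
    # audio_out is typically a path to the downloaded audio file; logs is the log textbox content.
    print(logs)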