Spaces:

espnet
/

cvss-c_es-en_s2st

Running

App Files Files Community

tjysdsg commited on Oct 29, 2023

Commit

ed95978

1 Parent(s): f5a907f

Move model downloading to initialization stage

Browse files

Files changed (2) hide show

app.py +78 -79
utils.py +4 -0

app.py CHANGED Viewed

@@ -1,11 +1,10 @@
 import os
 import gradio as gr
-import numpy as np
-import torch
 import torchaudio
 from typing import Tuple, Optional
 import soundfile as sf
 from s2st_inference import s2st_inference
 SAMPLE_RATE = 16000
 MAX_INPUT_LENGTH = 60  # seconds
@@ -19,85 +18,83 @@ NGPU = 0
 BEAM_SIZE = 1
-def download_model(tag: str, out_dir: str):
-    from huggingface_hub import snapshot_download
-    return snapshot_download(repo_id=tag, local_dir=out_dir)
-def s2st(
-        audio_source: str,
-        input_audio_mic: Optional[str],
-        input_audio_file: Optional[str],
-):
-    if audio_source == 'file':
-        input_path = input_audio_file
-    else:
-        input_path = input_audio_mic
-    if input_path is None:
-        gr.Error(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")
-        return (None, None), None
-    orig_wav, orig_sr = torchaudio.load(input_path)
-    wav = torchaudio.functional.resample(orig_wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
-    max_length = int(MAX_INPUT_LENGTH * SAMPLE_RATE)
-    if wav.shape[1] > max_length:
-        wav = wav[:, :max_length]
-        gr.Warning(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")
-    wav = wav[0]  # mono
-    # Download models
-    os.makedirs(S2UT_DIR, exist_ok=True)
-    os.makedirs(VOCODER_DIR, exist_ok=True)
-    s2ut_path = download_model(S2UT_TAG, S2UT_DIR)
-    vocoder_path = download_model(VOCODER_TAG, VOCODER_DIR)
-    # Temporary change cwd to model dir so that it loads correctly
-    cwd = os.getcwd()
-    os.chdir(s2ut_path)
-    # Translate wav
-    out_wav = s2st_inference(
-        wav,
-        train_config=os.path.join(
-            s2ut_path,
-            'exp',
-            's2st_train_s2st_discrete_unit_raw_fbank_es_en',
-            'config.yaml',
-        ),
-        model_file=os.path.join(
-            s2ut_path,
-            'exp',
-            's2st_train_s2st_discrete_unit_raw_fbank_es_en',
-            '500epoch.pth',
-        ),
-        vocoder_file=os.path.join(
-            vocoder_path,
-            'checkpoint-450000steps.pkl',
-        ),
-        vocoder_config=os.path.join(
-            vocoder_path,
-            'config.yml',
-        ),
-        ngpu=NGPU,
-        beam_size=BEAM_SIZE,
-    )
-    # Restore working directory
-    os.chdir(cwd)
-    # Save result
-    output_path = 'output.wav'
-    sf.write(
-        output_path,
-        out_wav,
-        16000,
-        "PCM_16",
-    )
-    return output_path, f'Source: {audio_source}'
 def update_audio_ui(audio_source: str) -> Tuple[dict, dict]:
@@ -109,6 +106,8 @@ def update_audio_ui(audio_source: str) -> Tuple[dict, dict]:
 def main():
     with gr.Blocks() as demo:
         with gr.Group():
             with gr.Row() as audio_box:
@@ -153,7 +152,7 @@ def main():
         )
         btn.click(
-            fn=s2st,
             inputs=[
                 audio_source,
                 input_audio_mic,

 import os
 import gradio as gr
 import torchaudio
 from typing import Tuple, Optional
 import soundfile as sf
 from s2st_inference import s2st_inference
+from utils import download_model
 SAMPLE_RATE = 16000
 MAX_INPUT_LENGTH = 60  # seconds
 BEAM_SIZE = 1
+class App:
+    def __init__(self):
+        # Download models
+        os.makedirs(S2UT_DIR, exist_ok=True)
+        os.makedirs(VOCODER_DIR, exist_ok=True)
+        self.s2ut_path = download_model(S2UT_TAG, S2UT_DIR)
+        self.vocoder_path = download_model(VOCODER_TAG, VOCODER_DIR)
+    def s2st(
+            self,
+            audio_source: str,
+            input_audio_mic: Optional[str],
+            input_audio_file: Optional[str],
+    ):
+        if audio_source == 'file':
+            input_path = input_audio_file
+        else:
+            input_path = input_audio_mic
+        if input_path is None:
+            gr.Error(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")
+            return (None, None), None
+        orig_wav, orig_sr = torchaudio.load(input_path)
+        wav = torchaudio.functional.resample(orig_wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
+        max_length = int(MAX_INPUT_LENGTH * SAMPLE_RATE)
+        if wav.shape[1] > max_length:
+            wav = wav[:, :max_length]
+            gr.Warning(f"Input audio is too long. Truncated to {MAX_INPUT_LENGTH} seconds.")
+        wav = wav[0]  # mono
+        # Temporary change cwd to model dir so that it loads correctly
+        cwd = os.getcwd()
+        os.chdir(self.s2ut_path)
+        # Translate wav
+        out_wav = s2st_inference(
+            wav,
+            train_config=os.path.join(
+                self.s2ut_path,
+                'exp',
+                's2st_train_s2st_discrete_unit_raw_fbank_es_en',
+                'config.yaml',
+            ),
+            model_file=os.path.join(
+                self.s2ut_path,
+                'exp',
+                's2st_train_s2st_discrete_unit_raw_fbank_es_en',
+                '500epoch.pth',
+            ),
+            vocoder_file=os.path.join(
+                self.vocoder_path,
+                'checkpoint-450000steps.pkl',
+            ),
+            vocoder_config=os.path.join(
+                self.vocoder_path,
+                'config.yml',
+            ),
+            ngpu=NGPU,
+            beam_size=BEAM_SIZE,
+        )
+        # Restore working directory
+        os.chdir(cwd)
+        # Save result
+        output_path = 'output.wav'
+        sf.write(
+            output_path,
+            out_wav,
+            16000,
+            "PCM_16",
+        )
+        return output_path, f'Source: {audio_source}'
 def update_audio_ui(audio_source: str) -> Tuple[dict, dict]:
 def main():
+    app = App()
     with gr.Blocks() as demo:
         with gr.Group():
             with gr.Row() as audio_box:
         )
         btn.click(
+            fn=app.s2st,
             inputs=[
                 audio_source,
                 input_audio_mic,

utils.py ADDED Viewed

	@@ -0,0 +1,4 @@

+def download_model(tag: str, out_dir: str):
+    from huggingface_hub import snapshot_download
+    return snapshot_download(repo_id=tag, local_dir=out_dir)