Spaces:
Running
Running
Try to implement s2st
Browse files- .gitignore +3 -0
- app.py +58 -2
- output.wav +0 -0
- s2st_inference.py +121 -0
.gitignore
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
data
|
| 2 |
|
| 3 |
# Created by https://www.toptal.com/developers/gitignore/api/linux,windows,macos,jetbrains+all,visualstudiocode,python,jupyternotebooks
|
|
|
|
| 1 |
+
*.wav
|
| 2 |
+
model
|
| 3 |
+
vocoder
|
| 4 |
data
|
| 5 |
|
| 6 |
# Created by https://www.toptal.com/developers/gitignore/api/linux,windows,macos,jetbrains+all,visualstudiocode,python,jupyternotebooks
|
app.py
CHANGED
|
@@ -4,10 +4,23 @@ import numpy as np
|
|
| 4 |
import torch
|
| 5 |
import torchaudio
|
| 6 |
from typing import Tuple, Optional
|
|
|
|
|
|
|
| 7 |
|
| 8 |
SAMPLE_RATE = 16000
|
| 9 |
MAX_INPUT_LENGTH = 60 # seconds
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
def s2st(
|
| 13 |
audio_source: str,
|
|
@@ -32,9 +45,52 @@ def s2st(
|
|
| 32 |
|
| 33 |
wav = wav[0] # mono
|
| 34 |
|
| 35 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
output_path = 'output.wav'
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
return output_path, f'Source: {audio_source}'
|
| 40 |
|
|
|
|
| 4 |
import torch
|
| 5 |
import torchaudio
|
| 6 |
from typing import Tuple, Optional
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from s2st_inference import s2st_inference
|
| 9 |
|
| 10 |
SAMPLE_RATE = 16000
|
| 11 |
MAX_INPUT_LENGTH = 60 # seconds
|
| 12 |
|
| 13 |
+
S2UT_TAG = 'espnet/jiyang_tang_cvss-c_es-en_discrete_unit'
|
| 14 |
+
S2UT_DIR = 'model'
|
| 15 |
+
VOCODER_TAG = 'espnet/cvss-c_en_wavegan_hubert_vocoder'
|
| 16 |
+
VOCODER_DIR = 'vocoder'
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def download_model(tag: str, out_dir: str):
    """Download the Hugging Face Hub repository ``tag`` into ``out_dir``.

    Args:
        tag: Repository id passed to ``snapshot_download`` as ``repo_id``.
        out_dir: Directory passed to ``snapshot_download`` as ``local_dir``.

    Returns:
        The value returned by ``huggingface_hub.snapshot_download``.
    """
    # The import is kept function-local, exactly as in the original.
    import huggingface_hub

    local_snapshot = huggingface_hub.snapshot_download(
        repo_id=tag,
        local_dir=out_dir,
    )
    return local_snapshot
|
| 23 |
+
|
| 24 |
|
| 25 |
def s2st(
|
| 26 |
audio_source: str,
|
|
|
|
| 45 |
|
| 46 |
wav = wav[0] # mono
|
| 47 |
|
| 48 |
+
# Download models
|
| 49 |
+
os.makedirs(S2UT_DIR, exist_ok=True)
|
| 50 |
+
os.makedirs(VOCODER_DIR, exist_ok=True)
|
| 51 |
+
s2ut_path = download_model(S2UT_TAG, S2UT_DIR)
|
| 52 |
+
vocoder_path = download_model(VOCODER_TAG, VOCODER_DIR)
|
| 53 |
+
|
| 54 |
+
# Temporarily change cwd to the model dir so that it loads correctly
|
| 55 |
+
cwd = os.getcwd()
|
| 56 |
+
os.chdir(s2ut_path)
|
| 57 |
+
|
| 58 |
+
# Translate wav
|
| 59 |
+
out_wav = s2st_inference(
|
| 60 |
+
wav,
|
| 61 |
+
train_config=os.path.join(
|
| 62 |
+
s2ut_path,
|
| 63 |
+
'exp',
|
| 64 |
+
's2st_train_s2st_discrete_unit_raw_fbank_es_en',
|
| 65 |
+
'config.yaml',
|
| 66 |
+
),
|
| 67 |
+
model_file=os.path.join(
|
| 68 |
+
s2ut_path,
|
| 69 |
+
'exp',
|
| 70 |
+
's2st_train_s2st_discrete_unit_raw_fbank_es_en',
|
| 71 |
+
'500epoch.pth',
|
| 72 |
+
),
|
| 73 |
+
vocoder_file=os.path.join(
|
| 74 |
+
vocoder_path,
|
| 75 |
+
'checkpoint-400000steps.pkl',
|
| 76 |
+
),
|
| 77 |
+
vocoder_config=os.path.join(
|
| 78 |
+
vocoder_path,
|
| 79 |
+
'config.yml',
|
| 80 |
+
),
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
# Restore working directory
|
| 84 |
+
os.chdir(cwd)
|
| 85 |
+
|
| 86 |
+
# Save result
|
| 87 |
output_path = 'output.wav'
|
| 88 |
+
sf.write(
|
| 89 |
+
output_path,
|
| 90 |
+
out_wav,
|
| 91 |
+
16000,
|
| 92 |
+
"PCM_16",
|
| 93 |
+
)
|
| 94 |
|
| 95 |
return output_path, f'Source: {audio_source}'
|
| 96 |
|
output.wav
DELETED
|
Binary file (136 kB)
|
|
|
s2st_inference.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import logging
|
| 3 |
+
import shutil
|
| 4 |
+
import sys
|
| 5 |
+
import time
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
| 8 |
+
import numpy as np
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
import torch
|
| 11 |
+
from typeguard import check_argument_types
|
| 12 |
+
from espnet2.torch_utils.set_all_random_seed import set_all_random_seed
|
| 13 |
+
from espnet2.bin.s2st_inference import Speech2Speech
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def s2st_inference(
    speech: torch.Tensor,
    ngpu: int = 0,
    seed: int = 2023,
    log_level: Union[int, str] = 'INFO',
    train_config: Optional[str] = None,
    model_file: Optional[str] = None,
    threshold: float = 0.5,
    minlenratio: float = 0,
    maxlenratio: float = 10.0,
    st_subtask_minlenratio: float = 0,
    st_subtask_maxlenratio: float = 1.5,
    use_teacher_forcing: bool = False,
    use_att_constraint: bool = False,
    backward_window: int = 1,
    forward_window: int = 3,
    always_fix_seed: bool = False,
    beam_size: int = 5,
    penalty: float = 0,
    st_subtask_beam_size: int = 5,
    st_subtask_penalty: float = 0,
    st_subtask_token_type: Optional[str] = None,
    st_subtask_bpemodel: Optional[str] = None,
    vocoder_config: Optional[str] = None,
    vocoder_file: Optional[str] = None,
    vocoder_tag: Optional[str] = None,
) -> np.ndarray:
    """Run speech-to-speech inference on one utterance and return the waveform.

    A ``Speech2Speech`` model is built on every call, applied to ``speech``,
    and the generated ``"wav"`` entry of its output is returned as a NumPy
    array.

    Args:
        speech: Input tensor. Its first dimension is used as the length and a
            leading batch axis is added with ``unsqueeze(0)``, so a 1-D
            waveform is presumably expected -- TODO confirm against caller.
        ngpu: ``0`` selects ``"cpu"``, ``1`` selects ``"cuda"``; values above
            1 are rejected.
        seed: Passed to ``set_all_random_seed`` and to the model.
        log_level: Level handed to ``logging.basicConfig``.
        vocoder_tag: Passed to ``Speech2Speech.from_pretrained``.

        All remaining keyword parameters are forwarded unchanged to
        ``Speech2Speech.from_pretrained``.

    Returns:
        The generated waveform, moved to CPU and converted to ``numpy``.

    Raises:
        NotImplementedError: If ``ngpu`` is greater than 1.
        RuntimeError: If the model output contains no ``"wav"`` entry.
    """
    assert check_argument_types()
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build model
    speech2speech_kwargs = dict(
        train_config=train_config,
        model_file=model_file,
        threshold=threshold,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        st_subtask_maxlenratio=st_subtask_maxlenratio,
        st_subtask_minlenratio=st_subtask_minlenratio,
        use_teacher_forcing=use_teacher_forcing,
        use_att_constraint=use_att_constraint,
        backward_window=backward_window,
        forward_window=forward_window,
        beam_size=beam_size,
        penalty=penalty,
        st_subtask_beam_size=st_subtask_beam_size,
        st_subtask_penalty=st_subtask_penalty,
        st_subtask_token_type=st_subtask_token_type,
        st_subtask_bpemodel=st_subtask_bpemodel,
        vocoder_config=vocoder_config,
        vocoder_file=vocoder_file,
        device=device,
        seed=seed,
        always_fix_seed=always_fix_seed,
    )
    speech2speech = Speech2Speech.from_pretrained(
        vocoder_tag=vocoder_tag,
        **speech2speech_kwargs,
    )

    # 3. Run inference; time only the forward pass, not model construction.
    start_time = time.perf_counter()
    speech_lengths = torch.as_tensor([speech.shape[0]])
    output_dict = speech2speech(speech.unsqueeze(0), speech_lengths)
    elapsed = time.perf_counter() - start_time

    insize = speech.size(0) + 1
    # standard speech2mel model case
    feat_gen = output_dict["feat_gen"]
    logging.info(
        f"inference speed = {int(feat_gen.size(0)) / elapsed:.1f} frames / sec."
    )
    logging.info(f"(size:{insize}->{feat_gen.size(0)})")
    if feat_gen.size(0) == insize * maxlenratio:
        logging.warning("output length reaches maximum length.")

    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if output_dict.get("wav") is None:
        raise RuntimeError(
            "Speech2Speech output has no 'wav' entry; check that a vocoder "
            "(vocoder_file/vocoder_config or vocoder_tag) is configured."
        )
    wav = output_dict["wav"].cpu().numpy()
    logging.info(f"wav {len(wav)}")

    return wav
|
| 114 |
+
|
| 115 |
+
# if output_dict.get("st_subtask_token") is not None:
|
| 116 |
+
# writer["token"][key] = " ".join(output_dict["st_subtask_token"])
|
| 117 |
+
# writer["token_int"][key] == " ".join(
|
| 118 |
+
# map(str, output_dict["st_subtask_token_int"])
|
| 119 |
+
# )
|
| 120 |
+
# if output_dict.get("st_subtask_text") is not None:
|
| 121 |
+
# writer["text"][key] = output_dict["st_subtask_text"]
|