Spaces: Runtime error
Commit 5ec3488
Parent(s): 446d1e4
Upload 36 files
- main.py +37 -0
- packages.txt +2 -0
- requirements.txt +11 -0
- voicefixer/__init__.py +14 -0
- voicefixer/__main__.py +170 -0
- voicefixer/base.py +145 -0
- voicefixer/restorer/__init__.py +44 -0
- voicefixer/restorer/model.py +680 -0
- voicefixer/restorer/model_kqq_bn.py +186 -0
- voicefixer/restorer/modules.py +217 -0
- voicefixer/tools/__init__.py +11 -0
- voicefixer/tools/base.py +244 -0
- voicefixer/tools/io.py +44 -0
- voicefixer/tools/mel_scale.py +238 -0
- voicefixer/tools/modules/__init__.py +11 -0
- voicefixer/tools/modules/fDomainHelper.py +234 -0
- voicefixer/tools/modules/filters/f_2_64.mat +0 -0
- voicefixer/tools/modules/filters/f_4_64.mat +0 -0
- voicefixer/tools/modules/filters/f_8_64.mat +0 -0
- voicefixer/tools/modules/filters/h_2_64.mat +0 -0
- voicefixer/tools/modules/filters/h_4_64.mat +0 -0
- voicefixer/tools/modules/filters/h_8_64.mat +0 -0
- voicefixer/tools/modules/pqmf.py +116 -0
- voicefixer/tools/path.py +13 -0
- voicefixer/tools/pytorch_util.py +180 -0
- voicefixer/tools/random_.py +52 -0
- voicefixer/tools/wav.py +242 -0
- voicefixer/vocoder/__init__.py +30 -0
- voicefixer/vocoder/base.py +86 -0
- voicefixer/vocoder/config.py +316 -0
- voicefixer/vocoder/model/__init__.py +11 -0
- voicefixer/vocoder/model/generator.py +168 -0
- voicefixer/vocoder/model/modules.py +947 -0
- voicefixer/vocoder/model/pqmf.py +61 -0
- voicefixer/vocoder/model/res_msd.py +71 -0
- voicefixer/vocoder/model/util.py +135 -0
main.py
ADDED
@@ -0,0 +1,37 @@
+from voicefixer.base import VoiceFixer
+import streamlit as st
+from audio_recorder_streamlit import audio_recorder
+from io import BytesIO
+import soundfile as sf
+
+st.set_page_config(page_title="VoiceFixer app", page_icon=":notes:")
+st.title("Voice Fixer App :notes:")
+st.write(
+    """
+    This app combines the [VoiceFixer model](https://github.com/haoheliu/voicefixer) with a custom
+    Streamlit component that [records audio](https://github.com/Joooohan/audio-recorder-streamlit) online.
+    Currently the app shows great results when removing background noise, but
+    speech improvements aren't as obvious.
+    """)
+# Config files are loaded in voicefixer/base and voicefixer/vocoder/config.
+# They were uploaded to Hugging Face.
+voicefixer = VoiceFixer()
+audio_bytes = audio_recorder(
+    pause_threshold=1.5
+)
+try:
+    data, samplerate = sf.read(BytesIO(audio_bytes))
+    print(samplerate)
+    sf.write("original.wav", data, samplerate)
+    st.audio(audio_bytes, format="audio/wav")
+    if data.shape[0] >= 10000:
+        voicefixer.restore(input="original.wav",  # low quality .wav/.flac file
+                           output="enhanced_output.wav",
+                           cuda=False,  # GPU acceleration
+                           mode=0)
+        st.write("The audio without background noise and a little enhancement :ocean:")
+        st.audio("enhanced_output.wav")
+
+    else: st.warning("Recorded audio is too short, try again :relieved:")
+except Exception:
+    st.info("Try to record some audio :relieved:")
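
For reference, a minimal sketch of the same restoration flow outside Streamlit, assuming the checkpoint download in voicefixer/base.py succeeds. The mode values mirror the choices exposed by voicefixer/__main__.py below: modes 0 and 1 run the model in eval mode (mode 1 additionally removes the highest-frequency bins first), while mode 2 keeps train-mode statistics, which base.py notes is more effective on seriously damaged speech.

    from voicefixer.base import VoiceFixer

    vf = VoiceFixer()  # fetches vf.ckpt from the Hugging Face Hub on first use
    vf.restore(
        input="original.wav",          # low-quality .wav input
        output="enhanced_output.wav",
        cuda=False,                    # set True only when a GPU is available
        mode=0,
    )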
packages.txt
ADDED
@@ -0,0 +1,2 @@
+ffmpeg
+libsndfile1
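
These apt packages are installed by the Space at build time: libsndfile1 is the system library behind the soundfile Python package, and ffmpeg lets librosa (via audioread) decode compressed audio formats.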
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+audio_recorder_streamlit>=0.0.7
+soundfile>=0.9.0
+huggingface-hub>=0.11.1
+librosa>=0.8.1,<0.9.0
+torch>=1.7.0
+matplotlib
+progressbar
+torchlibrosa==0.0.7
+GitPython
+streamlit>=1.12.0
+pyyaml
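
The librosa pin below 0.9.0 is presumably there to keep the 0.8-series API this codebase was written against; librosa 0.9 made a number of arguments keyword-only and changed several defaults.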
voicefixer/__init__.py
ADDED
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@File    : __init__.py
+@Contact : [email protected]
+@License : (C)Copyright 2020-2100
+
+@Modify Time      @Author    @Version    @Description
+------------      -------    --------    -----------
+9/14/21 12:31 AM  Haohe Liu  1.0         None
+"""
+
+from voicefixer.vocoder.base import Vocoder
+from voicefixer.base import VoiceFixer
voicefixer/__main__.py
ADDED
@@ -0,0 +1,170 @@
+#!/usr/bin/python3
+from genericpath import exists
+import os.path
+import argparse
+from voicefixer import VoiceFixer
+import torch
+import os
+
+
+def writefile(infile, outfile, mode, append_mode, cuda, verbose=False):
+    if append_mode is True:
+        outbasename, outext = os.path.splitext(os.path.basename(outfile))
+        outfile = os.path.join(
+            os.path.dirname(outfile), "{}-mode{}{}".format(outbasename, mode, outext)
+        )
+    if verbose:
+        print("Processing {}, mode={}".format(infile, mode))
+    voicefixer.restore(input=infile, output=outfile, cuda=cuda, mode=int(mode))
+
+def check_arguments(args):
+    process_file, process_folder = len(args.infile) != 0, len(args.infolder) != 0
+    # assert len(args.infile) == 0 and len(args.outfile) == 0 or process_file, \
+    #     "Error: You should give the input and output file path at the same time. The input and output file path we receive is %s and %s" % (args.infile, args.outfile)
+    # assert len(args.infolder) == 0 and len(args.outfolder) == 0 or process_folder, \
+    #     "Error: You should give the input and output folder path at the same time. The input and output folder path we receive is %s and %s" % (args.infolder, args.outfolder)
+    assert (
+        process_file or process_folder
+    ), "Error: You need to specify an input file path (--infile) or an input folder path (--infolder) to proceed. For more information please run: voicefixer -h"
+
+    # if(args.cuda and not torch.cuda.is_available()):
+    #     print("Warning: You set --cuda while no cuda device found on your machine. We will use CPU instead.")
+    if process_file:
+        assert os.path.exists(args.infile), (
+            "Error: The input file %s is not found." % args.infile
+        )
+        output_dirname = os.path.dirname(args.outfile)
+        if len(output_dirname) > 1:
+            os.makedirs(output_dirname, exist_ok=True)
+    if process_folder:
+        assert os.path.exists(args.infolder), (
+            "Error: The input folder %s is not found." % args.infolder
+        )
+        output_dirname = args.outfolder
+        if len(output_dirname) > 1:
+            os.makedirs(args.outfolder, exist_ok=True)
+
+    return process_file, process_folder
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="VoiceFixer - restores degraded speech"
+    )
+    parser.add_argument(
+        "-i",
+        "--infile",
+        type=str,
+        default="",
+        help="An input file to be processed by VoiceFixer.",
+    )
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=str,
+        default="outfile.wav",
+        help="An output file to store the result.",
+    )
+
+    parser.add_argument(
+        "-ifdr",
+        "--infolder",
+        type=str,
+        default="",
+        help="Input folder. Place all your wav files that need processing in this folder.",
+    )
+    parser.add_argument(
+        "-ofdr",
+        "--outfolder",
+        type=str,
+        default="outfolder",
+        help="Output folder. The processed files will be stored in this folder.",
+    )
+
+    parser.add_argument(
+        "--mode", help="mode", choices=["0", "1", "2", "all"], default="0"
+    )
+    parser.add_argument(
+        "--disable-cuda",
+        help="Set this flag if you do not want to use your GPU.",
+        default=False,
+        action="store_true",
+    )
+    parser.add_argument(
+        "--silent",
+        help="Set this flag if you do not want to see any messages.",
+        default=False,
+        action="store_true",
+    )
+
+    args = parser.parse_args()
+
+    if torch.cuda.is_available() and not args.disable_cuda:
+        cuda = True
+    else:
+        cuda = False
+
+    process_file, process_folder = check_arguments(args)
+
+    if not args.silent:
+        print("Initializing VoiceFixer")
+    voicefixer = VoiceFixer()
+
+    if not args.silent:
+        print("Start processing the input file %s." % args.infile)
+
+    if process_file:
+        audioext = os.path.splitext(os.path.basename(args.infile))[-1]
+        if audioext != ".wav":
+            raise ValueError(
+                "Error processing the input file. We only support the .wav format currently. Please convert your %s file to .wav."
+                % audioext
+            )
+        if args.mode == "all":
+            for file_mode in range(3):
+                writefile(
+                    args.infile,
+                    args.outfile,
+                    file_mode,
+                    True,
+                    cuda,
+                    verbose=not args.silent,
+                )
+        else:
+            writefile(
+                args.infile,
+                args.outfile,
+                args.mode,
+                False,
+                cuda,
+                verbose=not args.silent,
+            )
+
+    if process_folder:
+        if not args.silent:
+            files = [
+                file
+                for file in os.listdir(args.infolder)
+                if (os.path.splitext(os.path.basename(file))[-1] == ".wav")
+            ]
+            print(
+                "Found %s .wav files in the input folder %s. Start processing."
+                % (len(files), args.infolder)
+            )
+        for file in os.listdir(args.infolder):
+            outbasename, outext = os.path.splitext(os.path.basename(file))
+            in_file = os.path.join(args.infolder, file)
+            out_file = os.path.join(args.outfolder, file)
+
+            if args.mode == "all":
+                for file_mode in range(3):
+                    writefile(
+                        in_file,
+                        out_file,
+                        file_mode,
+                        True,
+                        cuda,
+                        verbose=not args.silent,
+                    )
+            else:
+                writefile(
+                    in_file, out_file, args.mode, False, cuda, verbose=not args.silent
+                )
+
+    if not args.silent:
+        print("Done")
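
Hypothetical invocations of this entry point, assuming the package is installed so that python -m voicefixer resolves to this file:

    python -m voicefixer -i noisy.wav -o clean.wav --mode 0
    python -m voicefixer -ifdr in_wavs -ofdr out_wavs --mode all --disable-cuda

With --mode all, writefile() appends a "-mode{N}" suffix to the output basename, so a single input yields one output file per mode.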
voicefixer/base.py
ADDED
@@ -0,0 +1,145 @@
+import librosa.display
+from voicefixer.tools.pytorch_util import *
+from voicefixer.tools.wav import *
+from voicefixer.restorer.model import VoiceFixer as voicefixer_fe
+import os
+from huggingface_hub import hf_hub_download
+
+path_to_ckpt = hf_hub_download(repo_id="jlmarrugom/voice_fixer", filename="vf.ckpt")
+
+
+EPS = 1e-8
+
+
+class VoiceFixer(nn.Module):
+    def __init__(self):
+        super(VoiceFixer, self).__init__()
+        self._model = voicefixer_fe(channels=2, sample_rate=44100)
+        # print(os.path.join(os.path.expanduser('~'), ".cache/voicefixer/analysis_module/checkpoints/epoch=15_trimed_bn.ckpt"))
+        self.analysis_module_ckpt = path_to_ckpt  # "models/vf.ckpt"
+        if not os.path.exists(self.analysis_module_ckpt):
+            raise RuntimeError("Error 0: The checkpoint for the analysis module (vf.ckpt) was not found in ~/.cache/voicefixer/analysis_module/checkpoints. \
+                                By default the checkpoint should be downloaded automatically by this program, so something may have gone wrong. \
+                                Alternatively, you can download it directly from Zenodo: https://zenodo.org/record/5600188/files/vf.ckpt?download=1.")
+        self._model.load_state_dict(
+            torch.load(
+                self.analysis_module_ckpt
+            )
+        )
+        self._model.eval()
+
+    def _load_wav_energy(self, path, sample_rate, threshold=0.95):
+        wav_10k, _ = librosa.load(path, sr=sample_rate)
+        stft = np.log10(np.abs(librosa.stft(wav_10k)) + 1.0)
+        fbins = stft.shape[0]
+        e_stft = np.sum(stft, axis=1)
+        for i in range(e_stft.shape[0]):
+            e_stft[-i - 1] = np.sum(e_stft[: -i - 1])
+        total = e_stft[-1]
+        for i in range(e_stft.shape[0]):
+            if e_stft[i] < total * threshold:
+                continue
+            else:
+                break
+        return wav_10k, int((sample_rate // 2) * (i / fbins))
+
+    def _load_wav(self, path, sample_rate, threshold=0.95):
+        wav_10k, _ = librosa.load(path, sr=sample_rate)
+        return wav_10k
+
+    def _amp_to_original_f(self, mel_sp_est, mel_sp_target, cutoff=0.2):
+        freq_dim = mel_sp_target.size()[-1]
+        mel_sp_est_low, mel_sp_target_low = (
+            mel_sp_est[..., 5 : int(freq_dim * cutoff)],
+            mel_sp_target[..., 5 : int(freq_dim * cutoff)],
+        )
+        energy_est, energy_target = torch.mean(mel_sp_est_low, dim=(2, 3)), torch.mean(
+            mel_sp_target_low, dim=(2, 3)
+        )
+        amp_ratio = energy_target / energy_est
+        return mel_sp_est * amp_ratio[..., None, None], mel_sp_target
+
+    def _trim_center(self, est, ref):
+        diff = np.abs(est.shape[-1] - ref.shape[-1])
+        if est.shape[-1] == ref.shape[-1]:
+            return est, ref
+        elif est.shape[-1] > ref.shape[-1]:
+            min_len = min(est.shape[-1], ref.shape[-1])
+            est, ref = est[..., int(diff // 2) : -int(diff // 2)], ref
+            est, ref = est[..., :min_len], ref[..., :min_len]
+            return est, ref
+        else:
+            min_len = min(est.shape[-1], ref.shape[-1])
+            est, ref = est, ref[..., int(diff // 2) : -int(diff // 2)]
+            est, ref = est[..., :min_len], ref[..., :min_len]
+            return est, ref
+
+    def _pre(self, model, input, cuda):
+        input = input[None, None, ...]
+        input = torch.tensor(input)
+        input = try_tensor_cuda(input, cuda=cuda)
+        sp, _, _ = model.f_helper.wav_to_spectrogram_phase(input)
+        mel_orig = model.mel(sp.permute(0, 1, 3, 2)).permute(0, 1, 3, 2)
+        # return models.to_log(sp), models.to_log(mel_orig)
+        return sp, mel_orig
+
+    def remove_higher_frequency(self, wav, ratio=0.95):
+        stft = librosa.stft(wav)
+        real, img = np.real(stft), np.imag(stft)
+        mag = (real**2 + img**2) ** 0.5
+        cos, sin = real / (mag + EPS), img / (mag + EPS)
+        spec = np.abs(stft)  # [1025, T]
+        feature = spec.copy()
+        feature = np.log10(feature + EPS)
+        feature[feature < 0] = 0
+        energy_level = np.sum(feature, axis=1)
+        threshold = np.sum(energy_level) * ratio
+        current_level, i = energy_level[0], 0
+        while i < energy_level.shape[0] and current_level < threshold:
+            current_level += energy_level[i + 1, ...]
+            i += 1
+        spec[i:, ...] = np.zeros_like(spec[i:, ...])
+        stft = spec * cos + 1j * spec * sin
+        return librosa.istft(stft)
+
+    @torch.no_grad()
+    def restore_inmem(self, wav_10k, cuda=False, mode=0, your_vocoder_func=None):
+        check_cuda_availability(cuda=cuda)
+        self._model = try_tensor_cuda(self._model, cuda=cuda)
+        if mode == 0:
+            self._model.eval()
+        elif mode == 1:
+            self._model.eval()
+        elif mode == 2:
+            self._model.train()  # More effective on seriously damaged speech
+        res = []
+        seg_length = 44100 * 30
+        break_point = seg_length
+        while break_point < wav_10k.shape[0] + seg_length:
+            segment = wav_10k[break_point - seg_length : break_point]
+            if mode == 1:
+                segment = self.remove_higher_frequency(segment)
+            sp, mel_noisy = self._pre(self._model, segment, cuda)
+            out_model = self._model(sp, mel_noisy)
+            denoised_mel = from_log(out_model["mel"])
+            if your_vocoder_func is None:
+                out = self._model.vocoder(denoised_mel, cuda=cuda)
+            else:
+                out = your_vocoder_func(denoised_mel)
+            # unify energy
+            if torch.max(torch.abs(out)) > 1.0:
+                out = out / torch.max(torch.abs(out))
+                print("Warning: energy limit exceeded, rescaling the segment.")
+            # frame alignment
+            out, _ = self._trim_center(out, segment)
+            res.append(out)
+            break_point += seg_length
+        out = torch.cat(res, -1)
+        return tensor2numpy(out.squeeze(0))
+
+    def restore(self, input, output, cuda=False, mode=0, your_vocoder_func=None):
+        wav_10k = self._load_wav(input, sample_rate=44100)
+        out_np_wav = self.restore_inmem(
+            wav_10k, cuda=cuda, mode=mode, your_vocoder_func=your_vocoder_func
+        )
+        save_wave(out_np_wav, fname=output, sample_rate=44100)
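
The restore_inmem loop above processes audio in fixed 30-second windows; a standalone sketch of just that segmentation logic (input length chosen for illustration):

    import numpy as np

    seg_length = 44100 * 30                  # 30 s at 44.1 kHz
    wav = np.zeros(44100 * 70)               # e.g. a 70 s recording
    break_point, segments = seg_length, []
    while break_point < wav.shape[0] + seg_length:
        segments.append(wav[break_point - seg_length : break_point])
        break_point += seg_length
    print([len(s) / 44100 for s in segments])  # [30.0, 30.0, 10.0]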
voicefixer/restorer/__init__.py
ADDED
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@File    : __init__.py
+@Contact : [email protected]
+@License : (C)Copyright 2020-2100
+
+@Modify Time      @Author    @Version    @Description
+------------      -------    --------    -----------
+9/14/21 12:31 AM  Haohe Liu  1.0         None
+"""
+
+import os
+import torch
+import urllib.request
+
+meta = {
+    "voicefixer_fe": {
+        "path": os.path.join(
+            os.path.expanduser("~"),
+            ".cache/voicefixer/analysis_module/checkpoints/vf.ckpt",
+        ),
+        "url": "https://zenodo.org/record/5600188/files/vf.ckpt?download=1",
+    },
+}
+
+if not os.path.exists(meta["voicefixer_fe"]["path"]):
+    os.makedirs(os.path.dirname(meta["voicefixer_fe"]["path"]), exist_ok=True)
+    print("Downloading the main structure of voicefixer")
+
+    urllib.request.urlretrieve(
+        meta["voicefixer_fe"]["url"], meta["voicefixer_fe"]["path"]
+    )
+    print(
+        "Weights downloaded in: {} Size: {}".format(
+            meta["voicefixer_fe"]["path"],
+            os.path.getsize(meta["voicefixer_fe"]["path"]),
+        )
+    )
+
+# cmd = "wget " + meta["voicefixer_fe"]['url'] + " -O " + meta["voicefixer_fe"]['path']
+# os.system(cmd)
+# temp = torch.load(meta["voicefixer_fe"]['path'])
+# torch.save(temp['state_dict'], os.path.join(os.path.expanduser('~'), ".cache/voicefixer/analysis_module/checkpoints/vf.ckpt"))
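
Note that the weights now have two download paths: this module fetches vf.ckpt from Zenodo into ~/.cache at import time, while voicefixer/base.py fetches the same file from the Hugging Face Hub. For comparison, the Hub variant exactly as used in base.py:

    from huggingface_hub import hf_hub_download

    path_to_ckpt = hf_hub_download(repo_id="jlmarrugom/voice_fixer", filename="vf.ckpt")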
voicefixer/restorer/model.py
ADDED
@@ -0,0 +1,680 @@
+# import pytorch_lightning as pl
+
+import torch.utils
+from voicefixer.tools.mel_scale import MelScale
+import torch.utils.data
+import matplotlib.pyplot as plt
+import librosa.display
+from voicefixer.vocoder.base import Vocoder
+from voicefixer.tools.pytorch_util import *
+from voicefixer.restorer.model_kqq_bn import UNetResComplex_100Mb
+from voicefixer.tools.random_ import *
+from voicefixer.tools.wav import *
+from voicefixer.tools.modules.fDomainHelper import FDomainHelper
+
+from voicefixer.tools.io import load_json, write_json
+from matplotlib import cm
+
+os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
+EPS = 1e-8
+
+
+class BN_GRU(torch.nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        hidden_dim,
+        layer=1,
+        bidirectional=False,
+        batchnorm=True,
+        dropout=0.0,
+    ):
+        super(BN_GRU, self).__init__()
+        self.batchnorm = batchnorm
+        if batchnorm:
+            self.bn = nn.BatchNorm2d(1)
+        self.gru = torch.nn.GRU(
+            input_size=input_dim,
+            hidden_size=hidden_dim,
+            num_layers=layer,
+            bidirectional=bidirectional,
+            dropout=dropout,
+            batch_first=True,
+        )
+        self.init_weights()
+
+    def init_weights(self):
+        for m in self.modules():
+            if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
+                for name, param in m.named_parameters():
+                    if "weight_ih" in name:
+                        torch.nn.init.xavier_uniform_(param.data)
+                    elif "weight_hh" in name:
+                        torch.nn.init.orthogonal_(param.data)
+                    elif "bias" in name:
+                        param.data.fill_(0)
+
+    def forward(self, inputs):
+        # (batch, 1, seq, feature)
+        if self.batchnorm:
+            inputs = self.bn(inputs)
+        out, _ = self.gru(inputs.squeeze(1))
+        return out.unsqueeze(1)
+
+
+class Generator(nn.Module):
+    def __init__(self, n_mel, hidden, channels):
+        super(Generator, self).__init__()
+        # todo: the currently running trial doesn't have dropout
+        self.denoiser = nn.Sequential(
+            nn.BatchNorm2d(1),
+            nn.Linear(n_mel, n_mel * 2),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm2d(1),
+            nn.Linear(n_mel * 2, n_mel * 4),
+            nn.Dropout(0.5),
+            nn.ReLU(inplace=True),
+            BN_GRU(
+                input_dim=n_mel * 4,
+                hidden_dim=n_mel * 2,
+                bidirectional=True,
+                layer=2,
+                batchnorm=True,
+            ),
+            BN_GRU(
+                input_dim=n_mel * 4,
+                hidden_dim=n_mel * 2,
+                bidirectional=True,
+                layer=2,
+                batchnorm=True,
+            ),
+            nn.BatchNorm2d(1),
+            nn.ReLU(inplace=True),
+            nn.Linear(n_mel * 4, n_mel * 4),
+            nn.Dropout(0.5),
+            nn.BatchNorm2d(1),
+            nn.ReLU(inplace=True),
+            nn.Linear(n_mel * 4, n_mel),
+            nn.Sigmoid(),
+        )
+
+        self.unet = UNetResComplex_100Mb(channels=channels)
+
+    def forward(self, sp, mel_orig):
+        # Denoising
+        noisy = mel_orig.clone()
+        clean = self.denoiser(noisy) * noisy
+        x = to_log(clean.detach())
+        unet_in = torch.cat([to_log(mel_orig), x], dim=1)
+        # unet_in = lstm_out
+        unet_out = self.unet(unet_in)["mel"]
+        # masks
+        mel = unet_out + x
+        # todo: mel and the addition here are in log scale
+        return {
+            "mel": mel,
+            "lstm_out": unet_out,
+            "unet_out": unet_out,
+            "noisy": noisy,
+            "clean": clean,
+        }
+
+
+class VoiceFixer(nn.Module):
+    def __init__(
+        self,
+        channels,
+        type_target="vocals",
+        nsrc=1,
+        loss="l1",
+        lr=0.002,
+        gamma=0.9,
+        batchsize=None,
+        frame_length=None,
+        sample_rate=None,
+        warm_up_steps=1000,
+        reduce_lr_steps=15000,
+        # datas
+        check_val_every_n_epoch=5,
+    ):
+        super(VoiceFixer, self).__init__()
+
+        if sample_rate == 44100:
+            window_size = 2048
+            hop_size = 441
+            n_mel = 128
+        elif sample_rate == 24000:
+            window_size = 768
+            hop_size = 240
+            n_mel = 80
+        elif sample_rate == 16000:
+            window_size = 512
+            hop_size = 160
+            n_mel = 80
+        else:
+            raise ValueError(
+                "Error: Sample rate " + str(sample_rate) + " not supported"
+            )
+
+        center = (True,)
+        pad_mode = "reflect"
+        window = "hann"
+        freeze_parameters = True
+
+        # self.save_hyperparameters()
+        self.nsrc = nsrc
+        self.type_target = type_target
+        self.channels = channels
+        self.lr = lr
+        self.generated = None
+        self.gamma = gamma
+        self.sample_rate = sample_rate
+        self.batchsize = batchsize
+        self.frame_length = frame_length
+        # self.hparams['channels'] = 2
+
+        # self.am = AudioMetrics()
+        # self.im = ImgMetrics()
+
+        self.vocoder = Vocoder(sample_rate=44100)
+
+        self.valid = None
+        self.fake = None
+
+        self.train_step = 0
+        self.val_step = 0
+        self.val_result_save_dir = None
+        self.val_result_save_dir_step = None
+        self.downsample_ratio = 2**6  # This number equals 2^{#encoder_blocks}
+        self.check_val_every_n_epoch = check_val_every_n_epoch
+
+        self.f_helper = FDomainHelper(
+            window_size=window_size,
+            hop_size=hop_size,
+            center=center,
+            pad_mode=pad_mode,
+            window=window,
+            freeze_parameters=freeze_parameters,
+        )
+
+        hidden = window_size // 2 + 1
+
+        self.mel = MelScale(n_mels=n_mel, sample_rate=sample_rate, n_stft=hidden)
+
+        # masking
+        self.generator = Generator(n_mel, hidden, channels)
+
+        self.lr_lambda = lambda step: self.get_lr_lambda(
+            step,
+            gamma=self.gamma,
+            warm_up_steps=warm_up_steps,
+            reduce_lr_steps=reduce_lr_steps,
+        )
+
+        self.lr_lambda_2 = lambda step: self.get_lr_lambda(
+            step, gamma=self.gamma, warm_up_steps=10, reduce_lr_steps=reduce_lr_steps
+        )
+
+        self.mel_weight_44k_128 = (
+            torch.tensor(
+                [
+                    19.40951426,
+                    19.94047336,
+                    20.4859038,
+                    21.04629067,
+                    21.62194148,
+                    22.21335214,
+                    22.8210215,
+                    23.44529231,
+                    24.08660962,
+                    24.74541882,
+                    25.42234287,
+                    26.11770576,
+                    26.83212784,
+                    27.56615283,
+                    28.32007747,
+                    29.0947679,
+                    29.89060111,
+                    30.70832636,
+                    31.54828121,
+                    32.41121487,
+                    33.29780773,
+                    34.20865341,
+                    35.14437675,
+                    36.1056621,
+                    37.09332763,
+                    38.10795802,
+                    39.15039691,
+                    40.22119881,
+                    41.32154931,
+                    42.45172373,
+                    43.61293329,
+                    44.80609379,
+                    46.031602,
+                    47.29070223,
+                    48.58427549,
+                    49.91327905,
+                    51.27863232,
+                    52.68119708,
+                    54.1222372,
+                    55.60274206,
+                    57.12364703,
+                    58.68617876,
+                    60.29148652,
+                    61.94081306,
+                    63.63501986,
+                    65.37562658,
+                    67.16408954,
+                    69.00109084,
+                    70.88850318,
+                    72.82736101,
+                    74.81985537,
+                    76.86654792,
+                    78.96885475,
+                    81.12900906,
+                    83.34840929,
+                    85.62810662,
+                    87.97005418,
+                    90.37689804,
+                    92.84887686,
+                    95.38872881,
+                    97.99777002,
+                    100.67862715,
+                    103.43232942,
+                    106.26140638,
+                    109.16827015,
+                    112.15470471,
+                    115.22184756,
+                    118.37439245,
+                    121.6122689,
+                    124.93877158,
+                    128.35661454,
+                    131.86761321,
+                    135.47417938,
+                    139.18059494,
+                    142.98713744,
+                    146.89771854,
+                    150.91684347,
+                    155.0446638,
+                    159.28614648,
+                    163.64270198,
+                    168.12035831,
+                    172.71749158,
+                    177.44220154,
+                    182.29556933,
+                    187.28286676,
+                    192.40502126,
+                    197.6682721,
+                    203.07516896,
+                    208.63088733,
+                    214.33770931,
+                    220.19910108,
+                    226.22363072,
+                    232.41087124,
+                    238.76803591,
+                    245.30079083,
+                    252.01064464,
+                    258.90261676,
+                    265.98474,
+                    273.26010248,
+                    280.73496362,
+                    288.41440094,
+                    296.30489752,
+                    304.41180337,
+                    312.7377183,
+                    321.28877878,
+                    330.07870237,
+                    339.10812951,
+                    348.38276173,
+                    357.91393924,
+                    367.70513992,
+                    377.76413924,
+                    388.09467408,
+                    398.70920178,
+                    409.61813793,
+                    420.81980127,
+                    432.33215467,
+                    444.16083117,
+                    456.30919947,
+                    468.78589276,
+                    481.61325588,
+                    494.78824596,
+                    508.31969844,
+                    522.2238331,
+                    536.51163441,
+                    551.18859414,
+                    566.26142988,
+                    581.75006061,
+                    597.66210737,
+                ]
+            )
+            / 19.40951426
+        )
+        self.mel_weight_44k_128 = self.mel_weight_44k_128[None, None, None, ...]
+
+        self.g_loss_weight = 0.01
+        self.d_loss_weight = 1
+
+    def get_vocoder(self):
+        return self.vocoder
+
+    def get_f_helper(self):
+        return self.f_helper
+
+    def get_lr_lambda(self, step, gamma, warm_up_steps, reduce_lr_steps):
+        r"""Get lr_lambda for LambdaLR. E.g.,
+
+        .. code-block: python
+            lr_lambda = lambda step: get_lr_lambda(step, warm_up_steps=1000, reduce_lr_steps=10000)
+
+            from torch.optim.lr_scheduler import LambdaLR
+            LambdaLR(optimizer, lr_lambda)
+        """
+        if step <= warm_up_steps:
+            return step / warm_up_steps
+        else:
+            return gamma ** (step // reduce_lr_steps)
+
+    def init_weights(self, module: nn.Module):
+        for m in module.modules():
+            if type(m) in [nn.GRU, nn.LSTM, nn.RNN]:
+                for name, param in m.named_parameters():
+                    if "weight_ih" in name:
+                        torch.nn.init.xavier_uniform_(param.data)
+                    elif "weight_hh" in name:
+                        torch.nn.init.orthogonal_(param.data)
+                    elif "bias" in name:
+                        param.data.fill_(0)
+
+    def pre(self, input):
+        sp, _, _ = self.f_helper.wav_to_spectrogram_phase(input)
+        mel_orig = self.mel(sp.permute(0, 1, 3, 2)).permute(0, 1, 3, 2)
+        return sp, mel_orig
+
+    def forward(self, sp, mel_orig):
+        """
+        Args:
+            input: (batch_size, channels_num, segment_samples)
+
+        Outputs:
+            output_dict: {
+                'wav': (batch_size, channels_num, segment_samples),
+                'sp': (batch_size, channels_num, time_steps, freq_bins)}
+        """
+        return self.generator(sp, mel_orig)
+
+    # Note: the methods below come from the original pytorch_lightning training
+    # setup (see the commented-out import at the top of this file); self.log,
+    # self.l1loss, self.bce_loss and self.discriminator exist only in that
+    # context, and none of this is used for inference.
+    def configure_optimizers(self):
+        optimizer_g = torch.optim.Adam(
+            [{"params": self.generator.parameters()}],
+            lr=self.lr,
+            amsgrad=True,
+            betas=(0.5, 0.999),
+        )
+        optimizer_d = torch.optim.Adam(
+            [{"params": self.discriminator.parameters()}],
+            lr=self.lr,
+            amsgrad=True,
+            betas=(0.5, 0.999),
+        )
+
+        scheduler_g = {
+            "scheduler": torch.optim.lr_scheduler.LambdaLR(optimizer_g, self.lr_lambda),
+            "interval": "step",
+            "frequency": 1,
+        }
+        scheduler_d = {
+            "scheduler": torch.optim.lr_scheduler.LambdaLR(optimizer_d, self.lr_lambda),
+            "interval": "step",
+            "frequency": 1,
+        }
+        return [optimizer_g, optimizer_d], [scheduler_g, scheduler_d]
+
+    def preprocess(self, batch, train=False, cutoff=None):
+        if train:
+            vocal = batch[self.type_target]  # final target
+            noise = batch["noise_LR"]  # augmented low resolution audio with noise
+            augLR = batch[
+                self.type_target + "_aug_LR"
+            ]  # augmented low resolution audio
+            LR = batch[self.type_target + "_LR"]
+            # embed()
+            vocal, LR, augLR, noise = (
+                vocal.float().permute(0, 2, 1),
+                LR.float().permute(0, 2, 1),
+                augLR.float().permute(0, 2, 1),
+                noise.float().permute(0, 2, 1),
+            )
+            # LR, noise = self.add_random_noise(LR, noise)
+            snr, scale = [], []
+            for i in range(vocal.size()[0]):
+                (
+                    vocal[i, ...],
+                    LR[i, ...],
+                    augLR[i, ...],
+                    noise[i, ...],
+                    _snr,
+                    _scale,
+                ) = add_noise_and_scale_with_HQ_with_Aug(
+                    vocal[i, ...],
+                    LR[i, ...],
+                    augLR[i, ...],
+                    noise[i, ...],
+                    snr_l=-5,
+                    snr_h=45,
+                    scale_lower=0.6,
+                    scale_upper=1.0,
+                )
+                snr.append(_snr), scale.append(_scale)
+            # vocal, LR = self.amp_to_original_f(vocal, LR)
+            # noise = (noise * 0.0) + 1e-8  # todo
+            return vocal, augLR, LR, noise + augLR
+        else:
+            if cutoff is None:
+                LR_noisy = batch["noisy"]
+                LR = batch["vocals"]
+                vocals = batch["vocals"]
+                vocals, LR, LR_noisy = (
+                    vocals.float().permute(0, 2, 1),
+                    LR.float().permute(0, 2, 1),
+                    LR_noisy.float().permute(0, 2, 1),
+                )
+                return vocals, LR, LR_noisy, batch["fname"][0]
+            else:
+                LR_noisy = batch["noisy" + "LR" + "_" + str(cutoff)]
+                LR = batch["vocals" + "LR" + "_" + str(cutoff)]
+                vocals = batch["vocals"]
+                vocals, LR, LR_noisy = (
+                    vocals.float().permute(0, 2, 1),
+                    LR.float().permute(0, 2, 1),
+                    LR_noisy.float().permute(0, 2, 1),
+                )
+                return vocals, LR, LR_noisy, batch["fname"][0]
+
+    def training_step(self, batch, batch_nb, optimizer_idx):
+        # dict_keys(['vocals', 'vocals_aug', 'vocals_augLR', 'noise'])
+        config = load_json("temp_path.json")
+        if "g_loss_weight" not in config.keys():
+            config["g_loss_weight"] = self.g_loss_weight
+            config["d_loss_weight"] = self.d_loss_weight
+            write_json(config, "temp_path.json")
+        elif (
+            config["g_loss_weight"] != self.g_loss_weight
+            or config["d_loss_weight"] != self.d_loss_weight
+        ):
+            print(
+                "Update d_loss weight, from",
+                self.d_loss_weight,
+                "to",
+                config["d_loss_weight"],
+            )
+            print(
+                "Update g_loss weight, from",
+                self.g_loss_weight,
+                "to",
+                config["g_loss_weight"],
+            )
+            self.g_loss_weight = config["g_loss_weight"]
+            self.d_loss_weight = config["d_loss_weight"]
+
+        if optimizer_idx == 0:
+            self.vocal, self.augLR, _, self.LR_noisy = self.preprocess(
+                batch, train=True
+            )
+
+            for i in range(self.vocal.size()[0]):
+                save_wave(
+                    tensor2numpy(self.vocal[i, ...]),
+                    str(i) + "vocal" + ".wav",
+                    sample_rate=44100,
+                )
+                save_wave(
+                    tensor2numpy(self.LR_noisy[i, ...]),
+                    str(i) + "LR_noisy" + ".wav",
+                    sample_rate=44100,
+                )
+
+            # all_mel_e2e in non-log scale
+            _, self.mel_target = self.pre(self.vocal)
+            self.sp_LR_target, self.mel_LR_target = self.pre(self.augLR)
+            self.sp_LR_target_noisy, self.mel_LR_target_noisy = self.pre(self.LR_noisy)
+
+            if self.valid is None or self.valid.size()[0] != self.mel_target.size()[0]:
+                self.valid = torch.ones(
+                    self.mel_target.size()[0], 1, self.mel_target.size()[2], 1
+                )
+                self.valid = self.valid.type_as(self.mel_target)
+            if self.fake is None or self.fake.size()[0] != self.mel_target.size()[0]:
+                self.fake = torch.zeros(
+                    self.mel_target.size()[0], 1, self.mel_target.size()[2], 1
+                )
+                self.fake = self.fake.type_as(self.mel_target)
+
+            self.generated = self(self.sp_LR_target_noisy, self.mel_LR_target_noisy)
+
+            denoise_loss = self.l1loss(self.generated["clean"], self.mel_LR_target)
+            targ_loss = self.l1loss(self.generated["mel"], to_log(self.mel_target))
+
+            self.log(
+                "targ-l",
+                targ_loss,
+                on_step=True,
+                on_epoch=False,
+                logger=True,
+                sync_dist=True,
+                prog_bar=True,
+            )
+            self.log(
+                "noise-l",
+                denoise_loss,
+                on_step=True,
+                on_epoch=False,
+                logger=True,
+                sync_dist=True,
+                prog_bar=True,
+            )
+
+            loss = targ_loss + denoise_loss
+
+            if self.train_step >= 18000:
+                g_loss = self.bce_loss(
+                    self.discriminator(self.generated["mel"]), self.valid
+                )
+                self.log(
+                    "g_l",
+                    g_loss,
+                    on_step=True,
+                    on_epoch=False,
+                    logger=True,
+                    sync_dist=True,
+                    prog_bar=True,
+                )
+                # print("g_loss", g_loss)
+                all_loss = loss + self.g_loss_weight * g_loss
+                self.log(
+                    "all_loss",
+                    all_loss,
+                    on_step=True,
+                    on_epoch=True,
+                    logger=True,
+                    sync_dist=True,
+                )
+            else:
+                all_loss = loss
+            self.train_step += 0.5
+            return {"loss": all_loss}
+
+        elif optimizer_idx == 1:
+            if self.train_step >= 16000:
+                self.generated = self(self.sp_LR_target_noisy, self.mel_LR_target_noisy)
+                self.train_step += 0.5
+                real_loss = self.bce_loss(
+                    self.discriminator(to_log(self.mel_target)), self.valid
+                )
+                self.log(
+                    "r_l",
+                    real_loss,
+                    on_step=True,
+                    on_epoch=False,
+                    logger=True,
+                    sync_dist=True,
+                    prog_bar=True,
+                )
+                fake_loss = self.bce_loss(
+                    self.discriminator(self.generated["mel"].detach()), self.fake
+                )
+                self.log(
+                    "d_l",
+                    fake_loss,
+                    on_step=True,
+                    on_epoch=False,
+                    logger=True,
+                    sync_dist=True,
+                    prog_bar=True,
+                )
+                d_loss = self.d_loss_weight * (real_loss + fake_loss) / 2
+                self.log(
+                    "discriminator_loss",
+                    d_loss,
+                    on_step=True,
+                    on_epoch=True,
+                    logger=True,
+                    sync_dist=True,
+                )
+                return {"loss": d_loss}
+
+    def draw_and_save(
+        self, mel: torch.Tensor, path, clip_max=None, clip_min=None, needlog=True
+    ):
+        plt.figure(figsize=(15, 5))
+        if clip_min is None:
+            clip_max, clip_min = self.clip(mel)
+        mel = np.transpose(tensor2numpy(mel)[0, 0, ...], (1, 0))
+        # assert np.sum(mel < 0) == 0, str(np.sum(mel < 0)) + str(np.sum(mel < 0))
+
+        if needlog:
+            assert np.sum(mel < 0) == 0, str(np.sum(mel < 0)) + "-" + path
+            mel_log = np.log10(mel + EPS)
+        else:
+            mel_log = mel
+
+        # plt.imshow(mel)
+        librosa.display.specshow(
+            mel_log,
+            sr=44100,
+            x_axis="frames",
+            y_axis="mel",
+            cmap=cm.jet,
+            vmax=clip_max,
+            vmin=clip_min,
+        )
+        plt.colorbar()
+        plt.savefig(path)
+        plt.close()
+
+    def clip(self, *args):
+        val_max, val_min = [], []
+        for each in args:
+            val_max.append(torch.max(each))
+            val_min.append(torch.min(each))
+        return max(val_max), min(val_min)
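
The warm-up/decay schedule defined by get_lr_lambda above can be checked numerically; a standalone sketch using the constructor defaults (warm_up_steps=1000, reduce_lr_steps=15000, gamma=0.9):

    def lr_lambda(step, gamma=0.9, warm_up_steps=1000, reduce_lr_steps=15000):
        if step <= warm_up_steps:
            return step / warm_up_steps           # linear warm-up to 1.0
        return gamma ** (step // reduce_lr_steps)  # stepwise decay afterwards

    assert lr_lambda(500) == 0.5
    assert lr_lambda(30000) == 0.9 ** 2  # two decay steps have elapsed at step 30000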
voicefixer/restorer/model_kqq_bn.py
ADDED
@@ -0,0 +1,186 @@
+from voicefixer.restorer.modules import *
+
+from voicefixer.tools.pytorch_util import *
+
+
+class UNetResComplex_100Mb(nn.Module):
+    def __init__(self, channels, nsrc=1):
+        super(UNetResComplex_100Mb, self).__init__()
+        activation = "relu"
+        momentum = 0.01
+
+        self.nsrc = nsrc
+        self.channels = channels
+        self.downsample_ratio = 2**6  # This number equals 2^{#encoder_blocks}
+
+        self.encoder_block1 = EncoderBlockRes(
+            in_channels=channels * nsrc,
+            out_channels=32,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.encoder_block2 = EncoderBlockRes(
+            in_channels=32,
+            out_channels=64,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.encoder_block3 = EncoderBlockRes(
+            in_channels=64,
+            out_channels=128,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.encoder_block4 = EncoderBlockRes(
+            in_channels=128,
+            out_channels=256,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.encoder_block5 = EncoderBlockRes(
+            in_channels=256,
+            out_channels=384,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.encoder_block6 = EncoderBlockRes(
+            in_channels=384,
+            out_channels=384,
+            downsample=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.conv_block7 = ConvBlockRes(
+            in_channels=384,
+            out_channels=384,
+            size=3,
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block1 = DecoderBlockRes(
+            in_channels=384,
+            out_channels=384,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block2 = DecoderBlockRes(
+            in_channels=384,
+            out_channels=384,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block3 = DecoderBlockRes(
+            in_channels=384,
+            out_channels=256,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block4 = DecoderBlockRes(
+            in_channels=256,
+            out_channels=128,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block5 = DecoderBlockRes(
+            in_channels=128,
+            out_channels=64,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+        self.decoder_block6 = DecoderBlockRes(
+            in_channels=64,
+            out_channels=32,
+            stride=(2, 2),
+            activation=activation,
+            momentum=momentum,
+        )
+
+        self.after_conv_block1 = ConvBlockRes(
+            in_channels=32,
+            out_channels=32,
+            size=3,
+            activation=activation,
+            momentum=momentum,
+        )
+
+        self.after_conv2 = nn.Conv2d(
+            in_channels=32,
+            out_channels=1,
+            kernel_size=(1, 1),
+            stride=(1, 1),
+            padding=(0, 0),
+            bias=True,
+        )
+
+        self.init_weights()
+
+    def init_weights(self):
+        init_layer(self.after_conv2)
+
+    def forward(self, sp):
+        """
+        Args:
+            input: (batch_size, channels_num, segment_samples)
+
+        Outputs:
+            output_dict: {
+                'wav': (batch_size, channels_num, segment_samples),
+                'sp': (batch_size, channels_num, time_steps, freq_bins)}
+        """
+
+        # Batch normalization
+        x = sp
+
+        # Pad spectrogram to be evenly divided by downsample ratio.
+        origin_len = x.shape[2]  # time_steps
+        pad_len = (
+            int(np.ceil(x.shape[2] / self.downsample_ratio)) * self.downsample_ratio
+            - origin_len
+        )
+        x = F.pad(x, pad=(0, 0, 0, pad_len))
+        x = x[..., 0 : x.shape[-1] - 1]  # (bs, channels, T, F)
+
+        # UNet
+        (x1_pool, x1) = self.encoder_block1(x)  # x1_pool: (bs, 32, T / 2, F / 2)
+        (x2_pool, x2) = self.encoder_block2(x1_pool)  # x2_pool: (bs, 64, T / 4, F / 4)
+        (x3_pool, x3) = self.encoder_block3(x2_pool)  # x3_pool: (bs, 128, T / 8, F / 8)
+        (x4_pool, x4) = self.encoder_block4(
+            x3_pool
+        )  # x4_pool: (bs, 256, T / 16, F / 16)
+        (x5_pool, x5) = self.encoder_block5(
+            x4_pool
+        )  # x5_pool: (bs, 384, T / 32, F / 32)
+        (x6_pool, x6) = self.encoder_block6(
+            x5_pool
+        )  # x6_pool: (bs, 384, T / 64, F / 64)
+        x_center = self.conv_block7(x6_pool)  # (bs, 384, T / 64, F / 64)
+        x7 = self.decoder_block1(x_center, x6)  # (bs, 384, T / 32, F / 32)
+        x8 = self.decoder_block2(x7, x5)  # (bs, 384, T / 16, F / 16)
+        x9 = self.decoder_block3(x8, x4)  # (bs, 256, T / 8, F / 8)
+        x10 = self.decoder_block4(x9, x3)  # (bs, 128, T / 4, F / 4)
+        x11 = self.decoder_block5(x10, x2)  # (bs, 64, T / 2, F / 2)
+        x12 = self.decoder_block6(x11, x1)  # (bs, 32, T, F)
+        x = self.after_conv_block1(x12)  # (bs, 32, T, F)
+        x = self.after_conv2(x)  # (bs, 1, T, F)
+
+        # Recover shape
+        x = F.pad(x, pad=(0, 1))
+        x = x[:, :, 0:origin_len, :]
+
+        output_dict = {"mel": x}
+        return output_dict
+
+
+if __name__ == "__main__":
+    model = UNetResComplex_100Mb(channels=1)
+    print(model(torch.randn((1, 1, 101, 128)))["mel"].size())
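
The forward pass above pads the time axis so it divides evenly by downsample_ratio = 2**6 (one halving per encoder block); a quick numeric check of that padding arithmetic for the smoke test's 101-frame input:

    import numpy as np

    origin_len, downsample_ratio = 101, 2 ** 6
    pad_len = int(np.ceil(origin_len / downsample_ratio)) * downsample_ratio - origin_len
    print(pad_len)  # 27, since 101 + 27 = 128 = 2 * 64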
voicefixer/restorer/modules.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
import torch.nn.functional as F
|
4 |
+
import math
|
5 |
+
|
6 |
+
|
7 |
+
class ConvBlockRes(nn.Module):
|
8 |
+
def __init__(self, in_channels, out_channels, size, activation, momentum):
|
9 |
+
super(ConvBlockRes, self).__init__()
|
10 |
+
|
11 |
+
self.activation = activation
|
12 |
+
if type(size) == type((3, 4)):
|
13 |
+
pad = size[0] // 2
|
14 |
+
size = size[0]
|
15 |
+
else:
|
16 |
+
pad = size // 2
|
17 |
+
size = size
|
18 |
+
|
19 |
+
self.conv1 = nn.Conv2d(
|
20 |
+
in_channels=in_channels,
|
21 |
+
out_channels=out_channels,
|
22 |
+
kernel_size=(size, size),
|
23 |
+
stride=(1, 1),
|
24 |
+
dilation=(1, 1),
|
25 |
+
padding=(pad, pad),
|
26 |
+
bias=False,
|
27 |
+
)
|
28 |
+
|
29 |
+
self.bn1 = nn.BatchNorm2d(in_channels, momentum=momentum)
|
30 |
+
# self.abn1 = InPlaceABN(num_features=in_channels, momentum=momentum, activation='leaky_relu')
|
31 |
+
|
32 |
+
self.conv2 = nn.Conv2d(
|
33 |
+
in_channels=out_channels,
|
34 |
+
out_channels=out_channels,
|
35 |
+
kernel_size=(size, size),
|
36 |
+
stride=(1, 1),
|
37 |
+
dilation=(1, 1),
|
38 |
+
padding=(pad, pad),
|
39 |
+
bias=False,
|
40 |
+
)
|
41 |
+
|
42 |
+
self.bn2 = nn.BatchNorm2d(out_channels, momentum=momentum)
|
43 |
+
|
44 |
+
# self.abn2 = InPlaceABN(num_features=out_channels, momentum=momentum, activation='leaky_relu')
|
45 |
+
|
46 |
+
if in_channels != out_channels:
|
47 |
+
self.shortcut = nn.Conv2d(
|
48 |
+
in_channels=in_channels,
|
49 |
+
out_channels=out_channels,
|
50 |
+
kernel_size=(1, 1),
|
51 |
+
stride=(1, 1),
|
52 |
+
padding=(0, 0),
|
53 |
+
)
|
54 |
+
self.is_shortcut = True
|
55 |
+
else:
|
56 |
+
self.is_shortcut = False
|
57 |
+
|
58 |
+
self.init_weights()
|
59 |
+
|
60 |
+
def init_weights(self):
|
61 |
+
init_bn(self.bn1)
|
62 |
+
init_layer(self.conv1)
|
63 |
+
init_layer(self.conv2)
|
64 |
+
|
65 |
+
if self.is_shortcut:
|
66 |
+
init_layer(self.shortcut)
|
67 |
+
|
68 |
+
def forward(self, x):
|
69 |
+
origin = x
|
70 |
+
        x = self.conv1(F.leaky_relu_(self.bn1(x), negative_slope=0.01))
        x = self.conv2(F.leaky_relu_(self.bn2(x), negative_slope=0.01))

        if self.is_shortcut:
            return self.shortcut(origin) + x
        else:
            return origin + x


class EncoderBlockRes(nn.Module):
    def __init__(self, in_channels, out_channels, downsample, activation, momentum):
        super(EncoderBlockRes, self).__init__()
        size = 3

        self.conv_block1 = ConvBlockRes(
            in_channels, out_channels, size, activation, momentum
        )
        self.conv_block2 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )
        self.downsample = downsample

    def forward(self, x):
        encoder = self.conv_block1(x)
        encoder = self.conv_block2(encoder)
        encoder = self.conv_block3(encoder)
        encoder = self.conv_block4(encoder)
        encoder_pool = F.avg_pool2d(encoder, kernel_size=self.downsample)
        return encoder_pool, encoder


class DecoderBlockRes(nn.Module):
    def __init__(self, in_channels, out_channels, stride, activation, momentum):
        super(DecoderBlockRes, self).__init__()
        size = 3
        self.activation = activation

        self.conv1 = torch.nn.ConvTranspose2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(size, size),
            stride=stride,
            padding=(0, 0),
            output_padding=(0, 0),
            bias=False,
            dilation=(1, 1),
        )

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv_block2 = ConvBlockRes(
            out_channels * 2, out_channels, size, activation, momentum
        )
        self.conv_block3 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block4 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )
        self.conv_block5 = ConvBlockRes(
            out_channels, out_channels, size, activation, momentum
        )

    def init_weights(self):
        init_layer(self.conv1)

    def prune(self, x, both=False):
        """Prune the shape of x after transpose convolution."""
        if both:
            x = x[:, :, 0:-1, 0:-1]
        else:
            x = x[:, :, 0:-1, :]
        return x

    def forward(self, input_tensor, concat_tensor, both=False):
        x = self.conv1(F.relu_(self.bn1(input_tensor)))
        x = self.prune(x, both=both)
        x = torch.cat((x, concat_tensor), dim=1)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = self.conv_block4(x)
        x = self.conv_block5(x)
        return x


def init_layer(layer):
    """Initialize a Linear or Convolutional layer."""
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, "bias"):
        if layer.bias is not None:
            layer.bias.data.fill_(0.0)


def init_bn(bn):
    """Initialize a Batchnorm layer."""
    bn.bias.data.fill_(0.0)
    bn.weight.data.fill_(1.0)


def init_gru(rnn):
    """Initialize a GRU layer."""

    def _concat_init(tensor, init_funcs):
        (length, fan_out) = tensor.shape
        fan_in = length // len(init_funcs)

        for (i, init_func) in enumerate(init_funcs):
            init_func(tensor[i * fan_in : (i + 1) * fan_in, :])

    def _inner_uniform(tensor):
        fan_in = nn.init._calculate_correct_fan(tensor, "fan_in")
        nn.init.uniform_(tensor, -math.sqrt(3 / fan_in), math.sqrt(3 / fan_in))

    for i in range(rnn.num_layers):
        _concat_init(
            getattr(rnn, "weight_ih_l{}".format(i)),
            [_inner_uniform, _inner_uniform, _inner_uniform],
        )
        torch.nn.init.constant_(getattr(rnn, "bias_ih_l{}".format(i)), 0)

        _concat_init(
            getattr(rnn, "weight_hh_l{}".format(i)),
            [_inner_uniform, _inner_uniform, nn.init.orthogonal_],
        )
        torch.nn.init.constant_(getattr(rnn, "bias_hh_l{}".format(i)), 0)


def act(x, activation):
    if activation == "relu":
        return F.relu_(x)

    elif activation == "leaky_relu":
        return F.leaky_relu_(x, negative_slope=0.2)

    elif activation == "swish":
        return x * torch.sigmoid(x)

    else:
        raise Exception("Incorrect activation!")
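Not part of the upload: a minimal shape-check sketch for the blocks above, assuming this file imports as voicefixer.restorer.modules and that ConvBlockRes (defined earlier in the file) takes (in_channels, out_channels, size, activation, momentum); the sizes are illustrative only.

import torch
from voicefixer.restorer.modules import EncoderBlockRes, DecoderBlockRes

enc = EncoderBlockRes(in_channels=1, out_channels=32, downsample=(2, 2),
                      activation="relu", momentum=0.01)
dec = DecoderBlockRes(in_channels=32, out_channels=32, stride=(2, 2),
                      activation="relu", momentum=0.01)

x = torch.randn(2, 1, 64, 64)       # (batch, channels, time, freq)
pooled, skip = enc(x)               # pooled: (2, 32, 32, 32); skip is kept for the decoder
y = dec(pooled, skip, both=True)    # prune both axes so the skip connection lines up
print(pooled.shape, skip.shape, y.shape)  # y: (2, 32, 64, 64)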
voicefixer/tools/__init__.py
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : __init__.py
@Contact : [email protected]
@License : (C)Copyright 2020-2100

@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
9/14/21 12:28 AM   Haohe Liu      1.0         None
"""
voicefixer/tools/base.py
ADDED
@@ -0,0 +1,244 @@
import math

import numpy as np
import torch
import os
import torch.fft

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"


def get_window(window_size, window_type, square_root_window=True):
    """Return the window"""
    window = {
        "hamming": torch.hamming_window(window_size),
        "hanning": torch.hann_window(window_size),
    }[window_type]
    if square_root_window:
        window = torch.sqrt(window)
    return window


def fft_point(dim):
    assert dim > 0
    num = math.log(dim, 2)
    num_point = 2 ** (math.ceil(num))
    return num_point


def pre_emphasis(signal, coefficient=0.97):
    """Pre-emphasize the original signal:
    y(n) = x(n) - a*x(n-1)
    """
    return np.append(signal[0], signal[1:] - coefficient * signal[:-1])


def de_emphasis(signal, coefficient=0.97):
    """De-emphasize the original signal (inverse of pre-emphasis):
    y(n) = x(n) + a*y(n-1)
    """
    length = signal.shape[0]
    for i in range(1, length):
        signal[i] = signal[i] + coefficient * signal[i - 1]
    return signal


def seperate_magnitude(magnitude, phase):
    real = torch.cos(phase) * magnitude
    imagine = torch.sin(phase) * magnitude
    expand_dim = len(list(real.size()))
    return torch.stack((real, imagine), expand_dim)


def stft_single(
    signal,
    sample_rate=44100,
    frame_length=46,
    frame_shift=10,
    window_type="hanning",
    device=torch.device("cuda"),
    square_root_window=True,
):
    """Compute the Short Time Fourier Transform.

    Args:
        signal: input speech signal
        sample_rate: sample rate of the waveform (Hz)
        frame_length: frame length in milliseconds
        frame_shift: frame shift in milliseconds
        window_type: type of window
        square_root_window: take the square root of the window
    Return:
        fft: (n/2)+1 dim complex STFT results
    """
    hop_length = int(
        sample_rate * frame_shift / 1000
    )  # the greater the sample_rate, the greater the hop_length
    win_length = int(sample_rate * frame_length / 1000)
    # num_point = fft_point(win_length)
    num_point = win_length
    window = get_window(num_point, window_type, square_root_window)
    if "cuda" in str(device):
        window = window.cuda(device)
    feat = torch.stft(
        signal,
        n_fft=num_point,
        hop_length=hop_length,
        win_length=window.shape[0],
        window=window,
    )
    real, imag = feat[..., 0], feat[..., 1]
    return real.permute(0, 2, 1).unsqueeze(1), imag.permute(0, 2, 1).unsqueeze(1)


def istft(
    real,
    imag,
    length,
    sample_rate=44100,
    frame_length=46,
    frame_shift=10,
    window_type="hanning",
    preemphasis=0.0,
    device=torch.device("cuda"),
    square_root_window=True,
):
    """Convert frames to a signal using overlap-and-add synthesis.
    Args:
        real, imag: real and imaginary STFT parts, [batchsize, 1, t, f]
        length: target waveform length in samples
    Return:
        wav: synthesized output waveform
    """
    real = real.permute(0, 3, 2, 1)
    imag = imag.permute(0, 3, 2, 1)
    spectrum = torch.cat([real, imag], dim=-1)

    hop_length = int(sample_rate * frame_shift / 1000)
    win_length = int(sample_rate * frame_length / 1000)

    # num_point = fft_point(win_length)
    num_point = win_length
    if "cuda" in str(device):
        window = get_window(num_point, window_type, square_root_window).cuda(device)
    else:
        window = get_window(num_point, window_type, square_root_window)

    wav = torch_istft(
        spectrum,
        num_point,
        hop_length=hop_length,
        win_length=window.shape[0],
        window=window,
    )
    return wav[..., :length]


def torch_istft(
    stft_matrix,  # type: Tensor
    n_fft,  # type: int
    hop_length=None,  # type: Optional[int]
    win_length=None,  # type: Optional[int]
    window=None,  # type: Optional[Tensor]
    center=True,  # type: bool
    pad_mode="reflect",  # type: str
    normalized=False,  # type: bool
    onesided=True,  # type: bool
    length=None,  # type: Optional[int]
):
    # type: (...) -> Tensor
    # NOTE: this relies on torch.irfft, which was removed in PyTorch 1.8,
    # so this function targets older PyTorch versions.

    stft_matrix_dim = stft_matrix.dim()
    assert 3 <= stft_matrix_dim <= 4, "Incorrect stft dimension: %d" % (stft_matrix_dim)

    if stft_matrix_dim == 3:
        # add a channel dimension
        stft_matrix = stft_matrix.unsqueeze(0)

    dtype = stft_matrix.dtype
    device = stft_matrix.device
    fft_size = stft_matrix.size(1)
    assert (onesided and n_fft // 2 + 1 == fft_size) or (
        not onesided and n_fft == fft_size
    ), (
        "one_sided implies that n_fft // 2 + 1 == fft_size and not one_sided implies n_fft == fft_size. "
        + "Given values were onesided: %s, n_fft: %d, fft_size: %d"
        % ("True" if onesided else False, n_fft, fft_size)
    )

    # use stft defaults for Optionals
    if win_length is None:
        win_length = n_fft

    if hop_length is None:
        hop_length = int(win_length // 4)

    # There must be overlap
    assert 0 < hop_length <= win_length
    assert 0 < win_length <= n_fft

    if window is None:
        window = torch.ones(win_length, requires_grad=False, device=device, dtype=dtype)

    assert window.dim() == 1 and window.size(0) == win_length

    if win_length != n_fft:
        # center window with pad left and right zeros
        left = (n_fft - win_length) // 2
        window = torch.nn.functional.pad(window, (left, n_fft - win_length - left))
        assert window.size(0) == n_fft
    # win_length and n_fft are synonymous from here on

    stft_matrix = stft_matrix.transpose(1, 2)  # size (channel, n_frames, fft_size, 2)
    stft_matrix = torch.irfft(
        stft_matrix, 1, normalized, onesided, signal_sizes=(n_fft,)
    )  # size (channel, n_frames, n_fft)

    assert stft_matrix.size(2) == n_fft
    n_frames = stft_matrix.size(1)

    ytmp = stft_matrix * window.view(1, 1, n_fft)  # size (channel, n_frames, n_fft)
    # each column of a channel is a frame which needs to be overlap added at the right place
    ytmp = ytmp.transpose(1, 2)  # size (channel, n_fft, n_frames)

    eye = torch.eye(n_fft, requires_grad=False, device=device, dtype=dtype).unsqueeze(
        1
    )  # size (n_fft, 1, n_fft)

    # this does overlap add where the frames of ytmp are added such that the i'th frame of
    # ytmp is added starting at i*hop_length in the output
    y = torch.nn.functional.conv_transpose1d(
        ytmp, eye, stride=hop_length, padding=0
    )  # size (channel, 1, expected_signal_len)

    # do the same for the window function
    window_sq = (
        window.pow(2).view(n_fft, 1).repeat((1, n_frames)).unsqueeze(0)
    )  # size (1, n_fft, n_frames)
    window_envelop = torch.nn.functional.conv_transpose1d(
        window_sq, eye, stride=hop_length, padding=0
    )  # size (1, 1, expected_signal_len)

    expected_signal_len = n_fft + hop_length * (n_frames - 1)
    assert y.size(2) == expected_signal_len
    assert window_envelop.size(2) == expected_signal_len

    half_n_fft = n_fft // 2
    # we need to trim the front padding away if center
    start = half_n_fft if center else 0
    end = -half_n_fft if length is None else start + length

    y = y[:, :, start:end]
    window_envelop = window_envelop[:, :, start:end]

    # check NOLA non-zero overlap condition
    window_envelop_lowest = window_envelop.abs().min()
    assert window_envelop_lowest > 1e-11, "window overlap add min: %f" % (
        window_envelop_lowest
    )

    y = (y / window_envelop).squeeze(1)  # size (channel, expected_signal_len)

    if stft_matrix_dim == 3:  # remove the channel dimension
        y = y.squeeze(0)
    return y
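Not part of the upload: a minimal analysis/synthesis round-trip sketch with the functions above in scope. Since torch_istft relies on torch.irfft, this assumes PyTorch < 1.8.

import torch

sig = torch.randn(1, 44100)  # one second of audio at 44.1 kHz
real, imag = stft_single(sig, sample_rate=44100, device=torch.device("cpu"))
rec = istft(real, imag, length=44100, sample_rate=44100, device=torch.device("cpu"))
print(torch.max(torch.abs(rec - sig)))  # expected to be near zero (sqrt-Hann OLA)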
voicefixer/tools/io.py
ADDED
@@ -0,0 +1,44 @@
import json
import pickle


def read_list(fname):
    result = []
    with open(fname, "r") as f:
        for each in f.readlines():
            each = each.strip("\n")
            result.append(each)
    return result


def write_list(list, fname):
    with open(fname, "w") as f:
        for word in list:
            f.write(word)
            f.write("\n")


def write_json(my_dict, fname):
    # print("Save json file at "+fname)
    json_str = json.dumps(my_dict)
    with open(fname, "w") as json_file:
        json_file.write(json_str)


def load_json(fname):
    with open(fname, "r") as f:
        data = json.load(f)
    return data


def save_pickle(obj, fname):
    # print("Save pickle at "+fname)
    with open(fname, "wb") as f:
        pickle.dump(obj, f)


def load_pickle(fname):
    # print("Load pickle at "+fname)
    with open(fname, "rb") as f:
        res = pickle.load(f)
    return res
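Not part of the upload: a quick round-trip sketch for the helpers above (file names are hypothetical).

write_json({"sr": 44100}, "meta.json")
assert load_json("meta.json")["sr"] == 44100

save_pickle([1, 2, 3], "cache.pkl")
assert load_pickle("cache.pkl") == [1, 2, 3]

write_list(["a.wav", "b.wav"], "files.txt")
assert read_list("files.txt") == ["a.wav", "b.wav"]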
voicefixer/tools/mel_scale.py
ADDED
@@ -0,0 +1,238 @@
import torch
from torch import Tensor
from typing import Optional
import math

import warnings


class MelScale(torch.nn.Module):
    r"""Turn a normal STFT into a mel frequency STFT, using a conversion
    matrix. This uses triangular filter banks.

    User can control which device the filter bank (`fb`) is (e.g. fb.to(spec_f.device)).

    Args:
        n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
        sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
        f_min (float, optional): Minimum frequency. (Default: ``0.``)
        f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
        n_stft (int, optional): Number of bins in STFT. See ``n_fft`` in :class:`Spectrogram`. (Default: ``201``)
        norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band
            (area normalization). (Default: ``None``)
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    See also:
        :py:func:`torchaudio.functional.melscale_fbanks` - The function used to
        generate the filter banks.
    """
    __constants__ = ["n_mels", "sample_rate", "f_min", "f_max"]

    def __init__(
        self,
        n_mels: int = 128,
        sample_rate: int = 16000,
        f_min: float = 0.0,
        f_max: Optional[float] = None,
        n_stft: int = 201,
        norm: Optional[str] = None,
        mel_scale: str = "htk",
    ) -> None:
        super(MelScale, self).__init__()
        self.n_mels = n_mels
        self.sample_rate = sample_rate
        self.f_max = f_max if f_max is not None else float(sample_rate // 2)
        self.f_min = f_min
        self.norm = norm
        self.mel_scale = mel_scale

        assert f_min <= self.f_max, "Require f_min: {} <= f_max: {}".format(
            f_min, self.f_max
        )
        fb = melscale_fbanks(
            n_stft,
            self.f_min,
            self.f_max,
            self.n_mels,
            self.sample_rate,
            self.norm,
            self.mel_scale,
        )
        self.register_buffer("fb", fb)

    def forward(self, specgram: Tensor) -> Tensor:
        r"""
        Args:
            specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).

        Returns:
            Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
        """

        # (..., time, freq) dot (freq, n_mels) -> (..., n_mels, time)
        mel_specgram = torch.matmul(specgram.transpose(-1, -2), self.fb).transpose(
            -1, -2
        )

        return mel_specgram


def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float:
    r"""Convert Hz to Mels.

    Args:
        freq (float): Frequency in Hz
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    Returns:
        mels (float): Frequency in Mels
    """

    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    if mel_scale == "htk":
        return 2595.0 * math.log10(1.0 + (freq / 700.0))

    # Fill in the linear part
    f_min = 0.0
    f_sp = 200.0 / 3

    mels = (freq - f_min) / f_sp

    # Fill in the log-scale part
    min_log_hz = 1000.0
    min_log_mel = (min_log_hz - f_min) / f_sp
    logstep = math.log(6.4) / 27.0

    if freq >= min_log_hz:
        mels = min_log_mel + math.log(freq / min_log_hz) / logstep

    return mels


def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor:
    """Convert mel bin numbers to frequencies.

    Args:
        mels (Tensor): Mel frequencies
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    Returns:
        freqs (Tensor): Mels converted in Hz
    """

    if mel_scale not in ["slaney", "htk"]:
        raise ValueError('mel_scale should be one of "htk" or "slaney".')

    if mel_scale == "htk":
        return 700.0 * (10.0 ** (mels / 2595.0) - 1.0)

    # Fill in the linear scale
    f_min = 0.0
    f_sp = 200.0 / 3
    freqs = f_min + f_sp * mels

    # And now the nonlinear scale
    min_log_hz = 1000.0
    min_log_mel = (min_log_hz - f_min) / f_sp
    logstep = math.log(6.4) / 27.0

    log_t = mels >= min_log_mel
    freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))

    return freqs


def _create_triangular_filterbank(
    all_freqs: Tensor,
    f_pts: Tensor,
) -> Tensor:
    """Create a triangular filter bank.

    Args:
        all_freqs (Tensor): STFT freq points of size (`n_freqs`).
        f_pts (Tensor): Filter mid points of size (`n_filter`).

    Returns:
        fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`).
    """
    # Adapted from Librosa
    # calculate the difference between each filter mid point and each stft freq point in hertz
    f_diff = f_pts[1:] - f_pts[:-1]  # (n_filter + 1)
    slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1)  # (n_freqs, n_filter + 2)
    # create overlapping triangles
    zero = torch.zeros(1)
    down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1]  # (n_freqs, n_filter)
    up_slopes = slopes[:, 2:] / f_diff[1:]  # (n_freqs, n_filter)
    fb = torch.max(zero, torch.min(down_slopes, up_slopes))

    return fb


def melscale_fbanks(
    n_freqs: int,
    f_min: float,
    f_max: float,
    n_mels: int,
    sample_rate: int,
    norm: Optional[str] = None,
    mel_scale: str = "htk",
) -> Tensor:
    r"""Create a frequency bin conversion matrix.

    Note:
        For the sake of numerical compatibility with librosa, not all the coefficients
        in the resulting filter bank have a magnitude of 1.

    .. image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png
        :alt: Visualization of generated filter bank

    Args:
        n_freqs (int): Number of frequencies to highlight/apply
        f_min (float): Minimum frequency (Hz)
        f_max (float): Maximum frequency (Hz)
        n_mels (int): Number of mel filterbanks
        sample_rate (int): Sample rate of the audio waveform
        norm (str or None, optional): If 'slaney', divide the triangular mel weights by the width of the mel band
            (area normalization). (Default: ``None``)
        mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``)

    Returns:
        Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``),
        meaning number of frequencies to highlight/apply times the number of filterbanks.
        Each column is a filterbank so that, assuming there is a matrix A of
        size (..., ``n_freqs``), the applied result would be
        ``A * melscale_fbanks(A.size(-1), ...)``.

    """

    if norm is not None and norm != "slaney":
        raise ValueError("norm must be one of None or 'slaney'")

    # freq bins
    all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)

    # calculate mel freq bins
    m_min = _hz_to_mel(f_min, mel_scale=mel_scale)
    m_max = _hz_to_mel(f_max, mel_scale=mel_scale)

    m_pts = torch.linspace(m_min, m_max, n_mels + 2)
    f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale)

    # create filterbank
    fb = _create_triangular_filterbank(all_freqs, f_pts)

    if norm is not None and norm == "slaney":
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels])
        fb *= enorm.unsqueeze(0)

    if (fb.max(dim=0).values == 0.0).any():
        warnings.warn(
            "At least one mel filterbank has all zero values. "
            f"The value for `n_mels` ({n_mels}) may be set too high. "
            f"Or, the value for `n_freqs` ({n_freqs}) may be set too low."
        )

    return fb
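Not part of the upload: a small usage sketch of MelScale above; shapes follow the docstring ((..., freq, time) in, (..., n_mels, time) out).

import torch

mel_scale = MelScale(n_mels=128, sample_rate=44100, n_stft=1025)  # n_stft = n_fft // 2 + 1
spec = torch.rand(1, 1025, 100)   # magnitude spectrogram (batch, freq, time)
mel = mel_scale(spec)
print(mel.shape)                  # torch.Size([1, 128, 100])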
voicefixer/tools/modules/__init__.py
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : __init__.py
@Contact : [email protected]
@License : (C)Copyright 2020-2100

@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
9/14/21 12:29 AM   Haohe Liu      1.0         None
"""
voicefixer/tools/modules/fDomainHelper.py
ADDED
@@ -0,0 +1,234 @@
from torchlibrosa.stft import STFT, ISTFT, magphase
import torch
import torch.nn as nn
import numpy as np
from voicefixer.tools.modules.pqmf import PQMF


class FDomainHelper(nn.Module):
    def __init__(
        self,
        window_size=2048,
        hop_size=441,
        center=True,
        pad_mode="reflect",
        window="hann",
        freeze_parameters=True,
        subband=None,
        root="/Users/admin/Documents/projects/",
    ):
        super(FDomainHelper, self).__init__()
        self.subband = subband
        # assert torchlibrosa.__version__ == "0.0.7", "Error: Found torchlibrosa version %s. Please install the 0.0.7 version of torchlibrosa by: pip install torchlibrosa==0.0.7." % torchlibrosa.__version__
        if self.subband is None:
            self.stft = STFT(
                n_fft=window_size,
                hop_length=hop_size,
                win_length=window_size,
                window=window,
                center=center,
                pad_mode=pad_mode,
                freeze_parameters=freeze_parameters,
            )

            self.istft = ISTFT(
                n_fft=window_size,
                hop_length=hop_size,
                win_length=window_size,
                window=window,
                center=center,
                pad_mode=pad_mode,
                freeze_parameters=freeze_parameters,
            )
        else:
            self.stft = STFT(
                n_fft=window_size // self.subband,
                hop_length=hop_size // self.subband,
                win_length=window_size // self.subband,
                window=window,
                center=center,
                pad_mode=pad_mode,
                freeze_parameters=freeze_parameters,
            )

            self.istft = ISTFT(
                n_fft=window_size // self.subband,
                hop_length=hop_size // self.subband,
                win_length=window_size // self.subband,
                window=window,
                center=center,
                pad_mode=pad_mode,
                freeze_parameters=freeze_parameters,
            )

        if subband is not None and root is not None:
            self.qmf = PQMF(subband, 64, root)

    def complex_spectrogram(self, input, eps=0.0):
        # [batchsize, samples]
        # return [batchsize, 2, t-steps, f-bins]
        real, imag = self.stft(input)
        return torch.cat([real, imag], dim=1)

    def reverse_complex_spectrogram(self, input, eps=0.0, length=None):
        # [batchsize, 2[real,imag], t-steps, f-bins]
        wav = self.istft(input[:, 0:1, ...], input[:, 1:2, ...], length=length)
        return wav

    def spectrogram(self, input, eps=0.0):
        (real, imag) = self.stft(input.float())
        return torch.clamp(real**2 + imag**2, eps, np.inf) ** 0.5

    def spectrogram_phase(self, input, eps=0.0):
        (real, imag) = self.stft(input.float())
        mag = torch.clamp(real**2 + imag**2, eps, np.inf) ** 0.5
        cos = real / mag
        sin = imag / mag
        return mag, cos, sin

    def wav_to_spectrogram_phase(self, input, eps=1e-8):
        """Waveform to spectrogram.

        Args:
            input: (batch_size, channels_num, segment_samples)

        Outputs:
            output: (batch_size, channels_num, time_steps, freq_bins)
        """
        sp_list = []
        cos_list = []
        sin_list = []
        channels_num = input.shape[1]
        for channel in range(channels_num):
            mag, cos, sin = self.spectrogram_phase(input[:, channel, :], eps=eps)
            sp_list.append(mag)
            cos_list.append(cos)
            sin_list.append(sin)

        sps = torch.cat(sp_list, dim=1)
        coss = torch.cat(cos_list, dim=1)
        sins = torch.cat(sin_list, dim=1)
        return sps, coss, sins

    def spectrogram_phase_to_wav(self, sps, coss, sins, length):
        channels_num = sps.size()[1]
        res = []
        for i in range(channels_num):
            res.append(
                self.istft(
                    sps[:, i : i + 1, ...] * coss[:, i : i + 1, ...],
                    sps[:, i : i + 1, ...] * sins[:, i : i + 1, ...],
                    length,
                )
            )
            res[-1] = res[-1].unsqueeze(1)
        return torch.cat(res, dim=1)

    def wav_to_spectrogram(self, input, eps=1e-8):
        """Waveform to spectrogram.

        Args:
            input: (batch_size, channels_num, segment_samples)

        Outputs:
            output: (batch_size, channels_num, time_steps, freq_bins)
        """
        sp_list = []
        channels_num = input.shape[1]
        for channel in range(channels_num):
            sp_list.append(self.spectrogram(input[:, channel, :], eps=eps))
        output = torch.cat(sp_list, dim=1)
        return output

    def spectrogram_to_wav(self, input, spectrogram, length=None):
        """Spectrogram to waveform.
        Args:
            input: (batch_size, segment_samples, channels_num)
            spectrogram: (batch_size, channels_num, time_steps, freq_bins)

        Outputs:
            output: (batch_size, segment_samples, channels_num)
        """
        channels_num = input.shape[1]
        wav_list = []
        for channel in range(channels_num):
            (real, imag) = self.stft(input[:, channel, :])
            (_, cos, sin) = magphase(real, imag)
            wav_list.append(
                self.istft(
                    spectrogram[:, channel : channel + 1, :, :] * cos,
                    spectrogram[:, channel : channel + 1, :, :] * sin,
                    length,
                )
            )

        output = torch.stack(wav_list, dim=1)
        return output

    # todo the following code is not bug free!
    def wav_to_complex_spectrogram(self, input, eps=0.0):
        # [batchsize, channels, samples]
        # [batchsize, 2[real,imag]*channels, t-steps, f-bins]
        res = []
        channels_num = input.shape[1]
        for channel in range(channels_num):
            res.append(self.complex_spectrogram(input[:, channel, :], eps=eps))
        return torch.cat(res, dim=1)

    def complex_spectrogram_to_wav(self, input, eps=0.0, length=None):
        # [batchsize, 2[real,imag]*channels, t-steps, f-bins]
        # return [batchsize, channels, samples]
        channels = input.size()[1] // 2
        wavs = []
        for i in range(channels):
            wavs.append(
                self.reverse_complex_spectrogram(
                    input[:, 2 * i : 2 * i + 2, ...], eps=eps, length=length
                )
            )
            wavs[-1] = wavs[-1].unsqueeze(1)
        return torch.cat(wavs, dim=1)

    def wav_to_complex_subband_spectrogram(self, input, eps=0.0):
        # [batchsize, channels, samples]
        # [batchsize, 2[real,imag]*subband*channels, t-steps, f-bins]
        subwav = self.qmf.analysis(input)  # [batchsize, subband*channels, samples]
        subspec = self.wav_to_complex_spectrogram(subwav)
        return subspec

    def complex_subband_spectrogram_to_wav(self, input, eps=0.0):
        # [batchsize, 2[real,imag]*subband*channels, t-steps, f-bins]
        # [batchsize, channels, samples]
        subwav = self.complex_spectrogram_to_wav(input)
        data = self.qmf.synthesis(subwav)
        return data

    def wav_to_mag_phase_subband_spectrogram(self, input, eps=1e-8):
        """
        :param input:
        :param eps:
        :return:
            loss = torch.nn.L1Loss()
            models = FDomainHelper(subband=4)
            data = torch.randn((3, 1, 44100*3))

            sps, coss, sins = models.wav_to_mag_phase_subband_spectrogram(data)
            wav = models.mag_phase_subband_spectrogram_to_wav(sps, coss, sins, 44100*3//4)

            print(loss(data, wav))
            print(torch.max(torch.abs(data - wav)))
        """
        # [batchsize, channels, samples]
        # [batchsize, 2[real,imag]*subband*channels, t-steps, f-bins]
        subwav = self.qmf.analysis(input)  # [batchsize, subband*channels, samples]
        sps, coss, sins = self.wav_to_spectrogram_phase(subwav, eps=eps)
        return sps, coss, sins

    def mag_phase_subband_spectrogram_to_wav(self, sps, coss, sins, length, eps=0.0):
        # [batchsize, 2[real,imag]*subband*channels, t-steps, f-bins]
        # [batchsize, channels, samples]
        subwav = self.spectrogram_phase_to_wav(
            sps, coss, sins, length + self.qmf.pad_samples // self.qmf.N
        )
        data = self.qmf.synthesis(subwav)
        return data
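Not part of the upload: the subband path needs the PQMF filter files below, but the fullband magnitude/phase round trip can be sketched on its own.

import torch

fh = FDomainHelper()                    # fullband: subband=None, so PQMF is never loaded
wav = torch.randn(2, 1, 44100)          # (batch, channels, samples)
sps, coss, sins = fh.wav_to_spectrogram_phase(wav)
rec = fh.spectrogram_phase_to_wav(sps, coss, sins, length=44100)
print(torch.max(torch.abs(rec - wav)))  # near-perfect reconstruction is expected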
voicefixer/tools/modules/filters/f_2_64.mat
ADDED
File without changes
voicefixer/tools/modules/filters/f_4_64.mat
ADDED
File without changes
voicefixer/tools/modules/filters/f_8_64.mat
ADDED
File without changes
voicefixer/tools/modules/filters/h_2_64.mat
ADDED
File without changes
voicefixer/tools/modules/filters/h_4_64.mat
ADDED
File without changes
voicefixer/tools/modules/filters/h_8_64.mat
ADDED
File without changes
voicefixer/tools/modules/pqmf.py
ADDED
@@ -0,0 +1,116 @@
"""
@File    : subband_util.py
@Contact : [email protected]
@License : (C)Copyright 2020-2021
@Modify Time      @Author    @Version    @Description
------------      -------    --------    -----------
2020/4/3 4:54 PM   Haohe Liu      1.0         None
"""

import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import os.path as op
from scipy.io import loadmat


def load_mat2numpy(fname=""):
    if len(fname) == 0:
        return None
    else:
        return loadmat(fname)


class PQMF(nn.Module):
    def __init__(self, N, M, project_root):
        super().__init__()
        self.N = N  # nsubband
        self.M = M  # nfilter
        try:
            assert (N, M) in [(8, 64), (4, 64), (2, 64)]
        except AssertionError:
            print("Warning:", N, "subband and", M, "filter is not supported")
        self.pad_samples = 64
        self.name = str(N) + "_" + str(M) + ".mat"
        self.ana_conv_filter = nn.Conv1d(
            1, out_channels=N, kernel_size=M, stride=N, bias=False
        )
        data = load_mat2numpy(
            op.join(
                project_root,
                "arnold_workspace/restorer/tools/pytorch/modules/filters/f_"
                + self.name,
            )
        )
        data = data["f"].astype(np.float32) / N
        data = np.flipud(data.T).T
        data = np.reshape(data, (N, 1, M)).copy()
        dict_new = self.ana_conv_filter.state_dict().copy()
        dict_new["weight"] = torch.from_numpy(data)
        self.ana_pad = nn.ConstantPad1d((M - N, 0), 0)
        self.ana_conv_filter.load_state_dict(dict_new)

        self.syn_pad = nn.ConstantPad1d((0, M // N - 1), 0)
        self.syn_conv_filter = nn.Conv1d(
            N, out_channels=N, kernel_size=M // N, stride=1, bias=False
        )
        gk = load_mat2numpy(
            op.join(
                project_root,
                "arnold_workspace/restorer/tools/pytorch/modules/filters/h_"
                + self.name,
            )
        )
        gk = gk["h"].astype(np.float32)
        gk = np.transpose(np.reshape(gk, (N, M // N, N)), (1, 0, 2)) * N
        gk = np.transpose(gk[::-1, :, :], (2, 1, 0)).copy()
        dict_new = self.syn_conv_filter.state_dict().copy()
        dict_new["weight"] = torch.from_numpy(gk)
        self.syn_conv_filter.load_state_dict(dict_new)

        for param in self.parameters():
            param.requires_grad = False

    def __analysis_channel(self, inputs):
        return self.ana_conv_filter(self.ana_pad(inputs))

    def __systhesis_channel(self, inputs):
        ret = self.syn_conv_filter(self.syn_pad(inputs)).permute(0, 2, 1)
        return torch.reshape(ret, (ret.shape[0], 1, -1))

    def analysis(self, inputs):
        """
        :param inputs: [batchsize, channel, raw_wav], value: [0, 1]
        :return:
        """
        inputs = F.pad(inputs, (0, self.pad_samples))
        ret = None
        for i in range(inputs.size()[1]):  # channels
            if ret is None:
                ret = self.__analysis_channel(inputs[:, i : i + 1, :])
            else:
                ret = torch.cat(
                    (ret, self.__analysis_channel(inputs[:, i : i + 1, :])), dim=1
                )
        return ret

    def synthesis(self, data):
        """
        :param data: [batchsize, self.N*K, raw_wav_sub], value: [0, 1]
        :return:
        """
        ret = None
        # data = F.pad(data, (0, self.pad_samples // self.N))
        for i in range(data.size()[1]):  # channels
            if i % self.N == 0:
                if ret is None:
                    ret = self.__systhesis_channel(data[:, i : i + self.N, :])
                else:
                    new = self.__systhesis_channel(data[:, i : i + self.N, :])
                    ret = torch.cat((ret, new), dim=1)
        ret = ret[..., : -self.pad_samples]
        return ret

    def forward(self, inputs):
        return self.ana_conv_filter(self.ana_pad(inputs))
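Not part of the upload: a hedged usage sketch for PQMF. The constructor resolves the .mat filters relative to project_root, so the (hypothetical) root below must contain the arnold_workspace/restorer/tools/pytorch/modules/filters directory with the files shipped above.

import torch

pqmf = PQMF(N=4, M=64, project_root="/path/to/project")  # hypothetical root
wav = torch.randn(1, 1, 44100)
sub = pqmf.analysis(wav)    # (1, 4, (44100 + 64) // 4) subband signals
rec = pqmf.synthesis(sub)   # (1, 1, 44100); near-perfect reconstruction
print(torch.max(torch.abs(rec - wav)))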
voicefixer/tools/path.py
ADDED
@@ -0,0 +1,13 @@
import os


def find_and_build(root, path):
    path = os.path.join(root, path)
    if not os.path.exists(path):
        os.makedirs(path, exist_ok=True)
    return path


def root_path(repo_name="voicefixer"):
    path = os.path.abspath(__file__)
    return path.split(repo_name)[0]
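Not part of the upload: usage is straightforward (paths hypothetical).

import os

cache_dir = find_and_build(os.path.expanduser("~"), ".cache/voicefixer")  # created if missing
print(root_path())  # everything on disk before the first "voicefixer" path component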
voicefixer/tools/pytorch_util.py
ADDED
@@ -0,0 +1,180 @@
import torch
import torch.nn as nn
import numpy as np


def check_cuda_availability(cuda):
    if cuda and not torch.cuda.is_available():
        raise RuntimeError("Error: You set cuda=True but no cuda device found.")


def try_tensor_cuda(tensor, cuda):
    if cuda and torch.cuda.is_available():
        return tensor.cuda()
    else:
        return tensor.cpu()


def to_log(input):
    assert torch.sum(input < 0) == 0, (
        str(input) + " has negative values counts " + str(torch.sum(input < 0))
    )
    return torch.log10(torch.clip(input, min=1e-8))


def from_log(input):
    input = torch.clip(input, min=-np.inf, max=5)
    return 10**input


def move_data_to_device(x, device):
    if "float" in str(x.dtype):
        x = torch.Tensor(x)
    elif "int" in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        return x
    return x.to(device)


def tensor2numpy(tensor):
    if "cuda" in str(tensor.device):
        return tensor.detach().cpu().numpy()
    else:
        return tensor.detach().numpy()


def count_parameters(model):
    for p in model.parameters():
        if p.requires_grad:
            print(p.shape)
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def count_flops(model, audio_length):
    multiply_adds = False
    list_conv2d = []

    def conv2d_hook(self, input, output):
        batch_size, input_channels, input_height, input_width = input[0].size()
        output_channels, output_height, output_width = output[0].size()

        kernel_ops = (
            self.kernel_size[0]
            * self.kernel_size[1]
            * (self.in_channels / self.groups)
            * (2 if multiply_adds else 1)
        )
        bias_ops = 1 if self.bias is not None else 0

        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_height * output_width

        list_conv2d.append(flops)

    list_conv1d = []

    def conv1d_hook(self, input, output):
        batch_size, input_channels, input_length = input[0].size()
        output_channels, output_length = output[0].size()

        kernel_ops = (
            self.kernel_size[0]
            * (self.in_channels / self.groups)
            * (2 if multiply_adds else 1)
        )
        bias_ops = 1 if self.bias is not None else 0

        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_length

        list_conv1d.append(flops)

    list_linear = []

    def linear_hook(self, input, output):
        batch_size = input[0].size(0) if input[0].dim() == 2 else 1

        weight_ops = self.weight.nelement() * (2 if multiply_adds else 1)
        bias_ops = self.bias.nelement()

        flops = batch_size * (weight_ops + bias_ops)
        list_linear.append(flops)

    list_bn = []

    def bn_hook(self, input, output):
        list_bn.append(input[0].nelement())

    list_relu = []

    def relu_hook(self, input, output):
        list_relu.append(input[0].nelement())

    list_pooling2d = []

    def pooling2d_hook(self, input, output):
        batch_size, input_channels, input_height, input_width = input[0].size()
        output_channels, output_height, output_width = output[0].size()

        kernel_ops = self.kernel_size * self.kernel_size
        bias_ops = 0
        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_height * output_width

        list_pooling2d.append(flops)

    list_pooling1d = []

    def pooling1d_hook(self, input, output):
        batch_size, input_channels, input_length = input[0].size()
        output_channels, output_length = output[0].size()

        kernel_ops = self.kernel_size
        bias_ops = 0
        params = output_channels * (kernel_ops + bias_ops)
        flops = batch_size * params * output_length

        list_pooling2d.append(flops)

    def foo(net):
        childrens = list(net.children())
        if not childrens:
            if isinstance(net, nn.Conv2d):
                net.register_forward_hook(conv2d_hook)
            elif isinstance(net, nn.ConvTranspose2d):
                net.register_forward_hook(conv2d_hook)
            elif isinstance(net, nn.Conv1d):
                net.register_forward_hook(conv1d_hook)
            elif isinstance(net, nn.Linear):
                net.register_forward_hook(linear_hook)
            elif isinstance(net, nn.BatchNorm2d) or isinstance(net, nn.BatchNorm1d):
                net.register_forward_hook(bn_hook)
            elif isinstance(net, nn.ReLU):
                net.register_forward_hook(relu_hook)
            elif isinstance(net, nn.AvgPool2d) or isinstance(net, nn.MaxPool2d):
                net.register_forward_hook(pooling2d_hook)
            elif isinstance(net, nn.AvgPool1d) or isinstance(net, nn.MaxPool1d):
                net.register_forward_hook(pooling1d_hook)
            else:
                print("Warning: flop of module {} is not counted!".format(net))
            return
        for c in childrens:
            foo(c)

    foo(model)

    input = torch.rand(1, audio_length, 2)
    out = model(input)

    total_flops = (
        sum(list_conv2d)
        + sum(list_conv1d)
        + sum(list_linear)
        + sum(list_bn)
        + sum(list_relu)
        + sum(list_pooling2d)
        + sum(list_pooling1d)
    )

    return total_flops
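Not part of the upload: a small sanity sketch for the log helpers above.

import torch

x = torch.rand(4) + 0.1                        # strictly positive input
assert torch.allclose(from_log(to_log(x)), x)  # inverse pair within the clipping bounds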
voicefixer/tools/random_.py
ADDED
@@ -0,0 +1,52 @@
import random
import torch

RANDOM_RESOLUTION = 2**31


def random_torch(high, to_int=True):
    if to_int:
        return int((torch.rand(1)) * high)  # do not use numpy.random.random
    else:
        return (torch.rand(1)) * high  # do not use numpy.random.random


def shuffle_torch(list):
    length = len(list)
    res = []
    order = torch.randperm(length)
    for each in order:
        res.append(list[each])
    assert len(list) == len(res)
    return res


def random_choose_list(list):
    num = int(uniform_torch(0, len(list)))
    return list[num]


def normal_torch(mean=0, segma=1):
    return float(torch.normal(mean=mean, std=torch.Tensor([segma]))[0])


def uniform_torch(lower, upper):
    if abs(lower - upper) < 1e-5:
        return upper
    return (upper - lower) * torch.rand(1) + lower


def random_key(keys: list, weights: list):
    return random.choices(keys, weights=weights)[0]


def random_select(probs):
    res = []
    chance = random_torch(RANDOM_RESOLUTION)
    threshold = None
    for prob in probs:
        # if(threshold is None): threshold = prob
        # else: threshold *= prob
        threshold = prob
        res.append(chance < threshold * RANDOM_RESOLUTION)
    return res, chance
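Not part of the upload: a short sketch of how these helpers are typically combined.

keys = ["clean", "noise", "reverb"]
print(random_key(keys, weights=[0.6, 0.3, 0.1]))  # weighted draw
print(float(uniform_torch(0.5, 1.5)))             # scalar in [0.5, 1.5]
print(random_choose_list(keys))                   # uniform draw from a list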
voicefixer/tools/wav.py
ADDED
@@ -0,0 +1,242 @@
import wave
import os
import numpy as np
import scipy.signal as signal
import soundfile as sf
import librosa


def save_wave(frames: np.ndarray, fname, sample_rate=44100):
    shape = list(frames.shape)
    if len(shape) == 1:
        frames = frames[..., None]
        shape = list(frames.shape)  # refresh after adding the channel axis
    in_samples, in_channels = shape[-2], shape[-1]
    if in_channels >= 3:
        if len(shape) == 2:
            frames = np.transpose(frames, (1, 0))
        elif len(shape) == 3:
            frames = np.transpose(frames, (0, 2, 1))
        msg = (
            "Warning: Save audio with "
            + str(in_channels)
            + " channels, save permuted audio with shape "
            + str(list(frames.shape))
            + ", please check if it's correct."
        )
        # print(msg)
    # scale normalized float audio into the int16 range
    if np.max(frames) <= 1 and frames.dtype in (np.float32, np.float16, np.float64):
        frames *= 2**15
    frames = frames.astype(np.short)
    if len(frames.shape) >= 3:
        frames = frames[0, ...]
    sf.write(fname, frames, samplerate=sample_rate)


def constrain_length(chunk, length):
    frames_length = chunk.shape[0]
    if frames_length == length:
        return chunk
    elif frames_length < length:
        return np.pad(chunk, ((0, int(length - frames_length)), (0, 0)), "constant")
    else:
        return chunk[:length, ...]


def random_chunk_wav_file(fname, chunk_length):
    """
    fname: path to wav file
    chunk_length: chunk length in seconds
    """
    with wave.open(fname) as f:
        params = f.getparams()
        duration = params[3] / params[2]
        sample_rate = params[2]
        sample_length = params[3]
    if duration < chunk_length or abs(duration - chunk_length) < 1e-4:
        frames = read_wave(fname, sample_rate)
        return frames, duration, sample_rate  # [-1,1]
    else:
        # Random chunk
        random_starts = np.random.randint(
            0, sample_length - sample_rate * chunk_length
        )
        random_end = random_starts + sample_rate * chunk_length
        random_starts, random_end = (
            random_starts / sample_rate,
            random_end / sample_rate,
        )
        random_starts, random_end = random_starts / duration, random_end / duration
        frames = read_wave(
            fname, sample_rate, portion_start=random_starts, portion_end=random_end
        )
        frames = constrain_length(frames, length=int(chunk_length * sample_rate))
        return frames, chunk_length, sample_rate


def random_chunk_wav_file_v2(fname, chunk_length, random_starts=None, random_end=None):
    """
    fname: path to wav file
    chunk_length: chunk length in seconds
    """
    with wave.open(fname) as f:
        params = f.getparams()
        duration = params[3] / params[2]
        sample_rate = params[2]
        sample_length = params[3]
    if duration < chunk_length or abs(duration - chunk_length) < 1e-4:
        frames = read_wave(fname, sample_rate)
        return frames, duration, sample_rate  # [-1,1]
    else:
        # Random chunk
        if random_starts is None and random_end is None:
            random_starts = np.random.randint(
                0, sample_length - sample_rate * chunk_length
            )
            random_end = random_starts + sample_rate * chunk_length
            random_starts, random_end = (
                random_starts / sample_rate,
                random_end / sample_rate,
            )
            random_starts, random_end = (
                random_starts / duration,
                random_end / duration,
            )
        frames = read_wave(
            fname, sample_rate, portion_start=random_starts, portion_end=random_end
        )
        frames = constrain_length(frames, length=int(chunk_length * sample_rate))
        return frames, chunk_length, sample_rate, random_starts, random_end


def read_wave(
    fname,
    sample_rate,
    portion_start=0,
    portion_end=1,
):
    """
    :param fname: wav file path
    :param sample_rate:
    :param portion_start:
    :param portion_end:
    :return: [samples, channels]
    """
    # sr = get_sample_rate(fname)
    # if(sr != sample_rate):
    #     print("Warning: Sample rate not match, may lead to unexpected behavior.")
    if portion_end > 1 and portion_end < 1.1:
        portion_end = 1
    if portion_end != 1:
        duration = get_duration(fname)
        wav, _ = librosa.load(
            fname,
            sr=sample_rate,
            offset=portion_start * duration,
            duration=(portion_end - portion_start) * duration,
            mono=False,
        )
    else:
        wav, _ = librosa.load(fname, sr=sample_rate, mono=False)
    if len(list(wav.shape)) == 1:
        wav = wav[..., None]
    else:
        wav = np.transpose(wav, (1, 0))
    return wav


def get_channels_sampwidth_and_sample_rate(fname):
    with wave.open(fname) as f:
        params = f.getparams()
    return (
        params[0],
        params[1],
        params[2],
    )  # e.g. == (2, 2, 44100)


def get_channels(fname):
    with wave.open(fname) as f:
        params = f.getparams()
    return params[0]


def get_sample_rate(fname):
    with wave.open(fname) as f:
        params = f.getparams()
    return params[2]


def get_duration(fname):
    with wave.open(fname) as f:
        params = f.getparams()
    return params[3] / params[2]


def get_framesLength(fname):
    with wave.open(fname) as f:
        params = f.getparams()
    return params[3]


def restore_wave(zxx):
    _, w = signal.istft(zxx)
    return w


def calculate_total_times(dir):
    total = 0
    for each in os.listdir(dir):
        fname = os.path.join(dir, each)
        try:
            duration = get_duration(fname)
        except Exception:
            print(fname)
            continue  # skip unreadable files instead of reusing a stale duration
        total += duration
    return total


def filter(pth):
    global dic
    temp = []
    for each in os.listdir(pth):
        temp.append(os.path.join(pth, each))
    for each in temp:
        sr = get_sample_rate(each)
        if sr not in dic.keys():
            dic[sr] = []
        dic[sr].append(each)
    for each in dic[16000]:
        # print(each)
        pass
    print(dic.keys())
    for each in list(dic.keys()):
        print(each, len(dic[each]))


if __name__ == "__main__":
    path = "/Users/admin/Desktop/p376_025.wav"
    stereo = "/Users/admin/Desktop/vocals.wav"
    path_16 = "/Users/admin/Desktop/SI869.WAV.wav"
    import time

    start = time.time()
    for i in range(1000):
        frames, duration, sample_rate = random_chunk_wav_file(stereo, chunk_length=3.0)
        print(frames.shape, np.max(frames))
        save_wave(frames, "stero.wav", sample_rate=44100)
        frames, duration, sample_rate = random_chunk_wav_file(path, chunk_length=3.0)
        print(frames.shape, np.max(frames))
        save_wave(frames, "mono.wav", sample_rate=44100)
        frames, duration, sample_rate = random_chunk_wav_file(path_16, chunk_length=3.0)
        print(frames.shape, np.max(frames))
        save_wave(frames, "16.wav", sample_rate=16000)
    print(time.time() - start)
    # frames = read_wave(stereo, sample_rate=44100)
    print(frames.shape)

    print(frames)
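Not part of the upload: a round-trip sketch for the I/O helpers above (file name hypothetical).

import numpy as np

wav = np.sin(2 * np.pi * 440 * np.arange(44100) / 44100).astype(np.float32)[:, None]
save_wave(wav, "tone.wav", sample_rate=44100)      # float in [-1, 1] is scaled to int16
frames = read_wave("tone.wav", sample_rate=44100)  # back as (samples, channels) float
print(frames.shape, get_duration("tone.wav"))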
voicefixer/vocoder/__init__.py
ADDED
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : __init__.py.py
@Contact : [email protected]
@License : (C)Copyright 2020-2100

@Modify Time        @Author     @Version    @Description
------------        -------     --------    ------------
9/14/21 1:00 AM     Haohe Liu   1.0         None
"""

import os
from voicefixer.vocoder.config import Config
import urllib.request

if not os.path.exists(Config.ckpt):
    os.makedirs(os.path.dirname(Config.ckpt), exist_ok=True)
    print("Downloading the weight of neural vocoder: TFGAN")
    urllib.request.urlretrieve(
        "https://zenodo.org/record/5469951/files/model.ckpt-1490000_trimed.pt?download=1",
        Config.ckpt,
    )
    print(
        "Weights downloaded in: {} Size: {}".format(
            Config.ckpt, os.path.getsize(Config.ckpt)
        )
    )
    # cmd = "wget https://zenodo.org/record/5469951/files/model.ckpt-1490000_trimed.pt?download=1 -O " + Config.ckpt
    # os.system(cmd)
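Note that this download runs as a side effect of importing the package. A small sketch (nothing here beyond what the module itself does) to confirm the checkpoint landed where Config expects it:

import os
from voicefixer.vocoder.config import Config
import voicefixer.vocoder  # triggers the download only if Config.ckpt is missing

assert os.path.exists(Config.ckpt)
print("vocoder checkpoint:", Config.ckpt, os.path.getsize(Config.ckpt), "bytes")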
voicefixer/vocoder/base.py
ADDED
@@ -0,0 +1,86 @@
from voicefixer.vocoder.model.generator import Generator
from voicefixer.tools.wav import read_wave, save_wave
from voicefixer.tools.pytorch_util import *
from voicefixer.vocoder.model.util import *
from voicefixer.vocoder.config import Config
import os
import librosa
import torch
import torch.nn as nn
import numpy as np


class Vocoder(nn.Module):
    def __init__(self, sample_rate):
        super(Vocoder, self).__init__()
        Config.refresh(sample_rate)
        self.rate = sample_rate
        if not os.path.exists(Config.ckpt):
            raise RuntimeError(
                "Error 1: The checkpoint for the synthesis module / vocoder "
                "(model.ckpt-1490000_trimed) was not found in "
                "~/.cache/voicefixer/synthesis_module/44100. "
                "By default the checkpoint should be downloaded automatically by this program. "
                "Something bad may have happened. Apologies for the inconvenience. "
                "But don't worry! Alternatively you can download it directly from Zenodo: "
                "https://zenodo.org/record/5600188/files/model.ckpt-1490000_trimed.pt?download=1"
            )
        self._load_pretrain(Config.ckpt)
        self.weight_torch = Config.get_mel_weight_torch(percent=1.0)[
            None, None, None, ...
        ]

    def _load_pretrain(self, pth):
        self.model = Generator(Config.cin_channels)
        checkpoint = load_checkpoint(pth, torch.device("cpu"))
        load_try(checkpoint["generator"], self.model)
        self.model.eval()
        self.model.remove_weight_norm()
        for p in self.model.parameters():
            p.requires_grad = False

    # def vocoder_mel_npy(self, mel, save_dir, sample_rate, gain):
    #     mel = mel / Config.get_mel_weight(percent=gain)[..., None]
    #     mel = normalize(amp_to_db(np.abs(mel)) - 20)
    #     mel = pre(np.transpose(mel, (1, 0)))
    #     with torch.no_grad():
    #         wav_re = self.model(mel)  # torch.Size([1, 1, 104076])
    #     save_wave(tensor2numpy(wav_re) * 2**15, save_dir, sample_rate=sample_rate)

    def forward(self, mel, cuda=False):
        """
        :param mel: non-normalized mel spectrogram, [batchsize, 1, t-steps, n_mel]
        :return: [batchsize, 1, samples]
        """
        assert mel.size()[-1] == 128
        check_cuda_availability(cuda=cuda)
        self.model = try_tensor_cuda(self.model, cuda=cuda)
        mel = try_tensor_cuda(mel, cuda=cuda)
        self.weight_torch = self.weight_torch.type_as(mel)
        mel = mel / self.weight_torch
        mel = tr_normalize(tr_amp_to_db(torch.abs(mel)) - 20.0)
        mel = tr_pre(mel[:, 0, ...])
        wav_re = self.model(mel)
        return wav_re

    def oracle(self, fpath, out_path, cuda=False):
        check_cuda_availability(cuda=cuda)
        self.model = try_tensor_cuda(self.model, cuda=cuda)
        wav = read_wave(fpath, sample_rate=self.rate)[..., 0]
        wav = wav / np.max(np.abs(wav))
        stft = np.abs(
            librosa.stft(
                wav,
                hop_length=Config.hop_length,
                win_length=Config.win_size,
                n_fft=Config.n_fft,
            )
        )
        mel = linear_to_mel(stft)
        mel = normalize(amp_to_db(np.abs(mel)) - 20)
        mel = pre(np.transpose(mel, (1, 0)))
        mel = try_tensor_cuda(mel, cuda=cuda)
        with torch.no_grad():
            wav_re = self.model(mel)
        save_wave(tensor2numpy(wav_re * 2**15), out_path, sample_rate=self.rate)


if __name__ == "__main__":
    model = Vocoder(sample_rate=44100)
    print(next(model.parameters()).device)  # nn.Module itself has no .device attribute
    # model.load_pretrain(Config.ckpt)
    # model.oracle(path="/Users/liuhaohe/Desktop/test.wav",
    #              sample_rate=44100,
    #              save_dir="/Users/liuhaohe/Desktop/test_vocoder.wav")
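A minimal sketch of driving the vocoder directly, assuming the checkpoint is already cached. The input shape follows the forward() docstring; the output length is inferred from the generator's upsample scales (7 * 7 * 3 * 3 = 441 samples per mel frame), so 100 frames should come out as roughly one second of 44.1 kHz audio:

import torch
from voicefixer.vocoder.base import Vocoder

vocoder = Vocoder(sample_rate=44100)
mel = torch.rand(1, 1, 100, 128)  # [batch, 1, t-steps, n_mel], non-normalized
with torch.no_grad():
    wav = vocoder(mel, cuda=False)  # -> about [1, 1, 100 * 441] samples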
voicefixer/vocoder/config.py
ADDED
@@ -0,0 +1,316 @@
import torch
import numpy as np
import os
from voicefixer.tools.path import root_path


class Config:
    @classmethod
    def refresh(cls, sr):
        if sr == 44100:
            Config.ckpt = os.path.join(
                os.path.expanduser("~"),
                ".cache/voicefixer/synthesis_module/44100/model.ckpt-1490000_trimed.pt",
            )
            Config.cond_channels = 512
            Config.m_channels = 768
            Config.resstack_depth = [8, 8, 8, 8]
            Config.channels = 1024
            Config.cin_channels = 128
            Config.upsample_scales = [7, 7, 3, 3]
            Config.num_mels = 128
            Config.n_fft = 2048
            Config.hop_length = 441
            Config.sample_rate = 44100
            Config.fmax = 22000
            Config.mel_win = 128
            Config.local_condition_dim = 128
        else:
            raise RuntimeError(
                "Error: The vocoder currently only supports a 44100 Hz sample rate."
            )

    ckpt = os.path.join(
        os.path.expanduser("~"),
        ".cache/voicefixer/synthesis_module/44100/model.ckpt-1490000_trimed.pt",
    )
    m_channels = 384
    bits = 10
    opt = "Ralamb"
    cond_channels = 256
    clip = 0.5
    num_bands = 1
    cin_channels = 128
    upsample_scales = [7, 7, 3, 3]
    filterbands = "test/filterbanks_4bands.dat"
    # For inference
    tag = ""
    min_db = -115
    num_mels = 128
    n_fft = 2048
    hop_length = 441
    win_size = None
    sample_rate = 44100
    frame_shift_ms = None

    trim_fft_size = 512
    trim_hop_size = 128
    trim_top_db = 23

    signal_normalization = True
    allow_clipping_in_normalization = True
    symmetric_mels = True
    max_abs_value = 4.0

    preemphasis = 0.85
    min_level_db = -100
    ref_level_db = 20
    fmin = 50
    fmax = 22000
    power = 1.5
    griffin_lim_iters = 60
    rescale = False
    rescaling_max = 0.95
    trim_silence = False
    clip_mels_length = True
    max_mel_frames = 2000

    mel_win = 128
    batch_size = 24
    g_learning_rate = 0.001
    d_learning_rate = 0.001
    warmup_steps = 100000
    decay_learning_rate = 0.5
    exponential_moving_average = True
    ema_decay = 0.99

    reset_opt = False
    reset_g_opt = False
    reset_d_opt = False

    local_condition_dim = 128
    lambda_update_G = 1
    multiscale_D = 3

    lambda_adv = 4.0
    lambda_fm_loss = 0.0
    lambda_sc_loss = 5.0
    lambda_mag_loss = 5.0
    lambda_mel_loss = 50.0
    use_mle_loss = False
    lambda_mle_loss = 5.0

    lambda_freq_loss = 2.0
    lambda_energy_loss = 100.0
    lambda_t_loss = 200.0
    lambda_phase_loss = 100.0
    lambda_f0_loss = 1.0
    use_elu = False
    de_preem = False  # train
    up_org = False
    use_one = True
    use_small_D = False
    use_condnet = True
    use_depreem = False  # inference
    use_msd = False
    model_type = "tfgan"  # or bytewave, a frame-level vocoder using iSTFT
    use_hjcud = False
    no_skip = False
    out_channels = 1
    use_postnet = False  # wn in postnet
    use_wn = False  # wn in resstack
    up_type = "transpose"
    use_smooth = False
    use_drop = False
    use_shift_scale = False
    use_gcnn = False
    resstack_depth = [6, 6, 6, 6]
    kernel_size = [3, 3, 3, 3]
    channels = 512
    use_f0_loss = False
    use_sine = False
    use_cond_rnn = False
    use_rnn = False

    f0_step = 120
    use_lowfreq_loss = False
    lambda_lowfreq_loss = 1.0
    use_film = False
    use_mb_mr_gan = False

    use_mssl = False
    use_ml_gan = False
    use_mb_gan = True
    use_mpd = False
    use_spec_gan = True
    use_rwd = False
    use_mr_gan = True
    use_pqmf_rwd = False
    no_sine = False
    use_frame_mask = False

    lambda_var_loss = 0.0
    discriminator_train_start_steps = 40000  # 80k
    aux_d_train_start_steps = 40000  # 100k
    rescale_out = 0.40
    use_dist = True
    dist_backend = "nccl"
    dist_url = "tcp://localhost:12345"
    world_size = 1

    # 128 entries, one weight per mel bin
    mel_weight_torch = torch.tensor(
        [
            19.40951426, 19.94047336, 20.4859038, 21.04629067, 21.62194148, 22.21335214,
            22.8210215, 23.44529231, 24.08660962, 24.74541882, 25.42234287, 26.11770576,
            26.83212784, 27.56615283, 28.32007747, 29.0947679, 29.89060111, 30.70832636,
            31.54828121, 32.41121487, 33.29780773, 34.20865341, 35.14437675, 36.1056621,
            37.09332763, 38.10795802, 39.15039691, 40.22119881, 41.32154931, 42.45172373,
            43.61293329, 44.80609379, 46.031602, 47.29070223, 48.58427549, 49.91327905,
            51.27863232, 52.68119708, 54.1222372, 55.60274206, 57.12364703, 58.68617876,
            60.29148652, 61.94081306, 63.63501986, 65.37562658, 67.16408954, 69.00109084,
            70.88850318, 72.82736101, 74.81985537, 76.86654792, 78.96885475, 81.12900906,
            83.34840929, 85.62810662, 87.97005418, 90.37689804, 92.84887686, 95.38872881,
            97.99777002, 100.67862715, 103.43232942, 106.26140638, 109.16827015, 112.15470471,
            115.22184756, 118.37439245, 121.6122689, 124.93877158, 128.35661454, 131.86761321,
            135.47417938, 139.18059494, 142.98713744, 146.89771854, 150.91684347, 155.0446638,
            159.28614648, 163.64270198, 168.12035831, 172.71749158, 177.44220154, 182.29556933,
            187.28286676, 192.40502126, 197.6682721, 203.07516896, 208.63088733, 214.33770931,
            220.19910108, 226.22363072, 232.41087124, 238.76803591, 245.30079083, 252.01064464,
            258.90261676, 265.98474, 273.26010248, 280.73496362, 288.41440094, 296.30489752,
            304.41180337, 312.7377183, 321.28877878, 330.07870237, 339.10812951, 348.38276173,
            357.91393924, 367.70513992, 377.76413924, 388.09467408, 398.70920178, 409.61813793,
            420.81980127, 432.33215467, 444.16083117, 456.30919947, 468.78589276, 481.61325588,
            494.78824596, 508.31969844, 522.2238331, 536.51163441, 551.18859414, 566.26142988,
            581.75006061, 597.66210737,
        ]
    )

    x_orig = np.linspace(1, mel_weight_torch.shape[0], num=mel_weight_torch.shape[0])

    x_orig_torch = torch.linspace(
        1, mel_weight_torch.shape[0], steps=mel_weight_torch.shape[0]
    )

    @classmethod
    def get_mel_weight(cls, percent=1, a=18.8927416350036, b=0.0269863588184314):
        b = percent * b

        def func(a, b, x):
            return a * np.exp(b * x)

        return func(a, b, Config.x_orig)

    @classmethod
    def get_mel_weight_torch(cls, percent=1, a=18.8927416350036, b=0.0269863588184314):
        b = percent * b

        def func(a, b, x):
            return a * torch.exp(b * x)

        return func(a, b, Config.x_orig_torch)
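The mel_weight table above is just a sampled exponential; get_mel_weight regenerates it from the fitted constants a and b, which a quick check confirms:

import numpy as np
from voicefixer.vocoder.config import Config

w = Config.get_mel_weight(percent=1.0)
print(w.shape)  # (128,) -- one weight per mel bin
print(w[0])     # ~19.4095, i.e. 18.8927416350036 * exp(0.0269863588184314 * 1)
print(float(Config.mel_weight_torch[0]))  # 19.40951426, the tabulated value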
voicefixer/vocoder/model/__init__.py
ADDED
@@ -0,0 +1,11 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : __init__.py.py
@Contact : [email protected]
@License : (C)Copyright 2020-2100

@Modify Time        @Author     @Version    @Description
------------        -------     --------    ------------
9/14/21 1:00 AM     Haohe Liu   1.0         None
"""
voicefixer/vocoder/model/generator.py
ADDED
@@ -0,0 +1,168 @@
import torch
import torch.nn as nn
import numpy as np
from voicefixer.vocoder.model.modules import UpsampleNet, ResStack
from voicefixer.vocoder.config import Config
from voicefixer.vocoder.model.pqmf import PQMF
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"


class Generator(nn.Module):
    def __init__(
        self,
        in_channels=128,
        use_elu=False,
        use_gcnn=False,
        up_org=False,
        group=1,
        hp=None,
    ):
        super(Generator, self).__init__()
        self.hp = hp
        channels = Config.channels
        self.upsample_scales = Config.upsample_scales
        self.use_condnet = Config.use_condnet
        self.out_channels = Config.out_channels
        self.resstack_depth = Config.resstack_depth
        self.use_postnet = Config.use_postnet
        self.use_cond_rnn = Config.use_cond_rnn
        if self.use_condnet:
            cond_channels = Config.cond_channels
            self.condnet = nn.Sequential(
                nn.utils.weight_norm(
                    nn.Conv1d(in_channels, cond_channels, kernel_size=3, padding=1)
                ),
                nn.ELU(),
                nn.utils.weight_norm(
                    nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
                ),
                nn.ELU(),
                nn.utils.weight_norm(
                    nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
                ),
                nn.ELU(),
                nn.utils.weight_norm(
                    nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
                ),
                nn.ELU(),
                nn.utils.weight_norm(
                    nn.Conv1d(cond_channels, cond_channels, kernel_size=3, padding=1)
                ),
                nn.ELU(),
            )
            in_channels = cond_channels
            if self.use_cond_rnn:
                self.rnn = nn.GRU(
                    cond_channels,
                    cond_channels // 2,
                    num_layers=1,
                    batch_first=True,
                    bidirectional=True,
                )

        if use_elu:
            act = nn.ELU()
        else:
            act = nn.LeakyReLU(0.2, True)

        kernel_size = Config.kernel_size

        if self.out_channels == 1:
            self.generator = nn.Sequential(
                nn.ReflectionPad1d(3),
                nn.utils.weight_norm(nn.Conv1d(in_channels, channels, kernel_size=7)),
                act,
                UpsampleNet(channels, channels // 2, self.upsample_scales[0], hp, 0),
                ResStack(channels // 2, kernel_size[0], self.resstack_depth[0], hp),
                act,
                UpsampleNet(
                    channels // 2, channels // 4, self.upsample_scales[1], hp, 1
                ),
                ResStack(channels // 4, kernel_size[1], self.resstack_depth[1], hp),
                act,
                UpsampleNet(
                    channels // 4, channels // 8, self.upsample_scales[2], hp, 2
                ),
                ResStack(channels // 8, kernel_size[2], self.resstack_depth[2], hp),
                act,
                UpsampleNet(
                    channels // 8, channels // 16, self.upsample_scales[3], hp, 3
                ),
                ResStack(channels // 16, kernel_size[3], self.resstack_depth[3], hp),
                act,
                nn.ReflectionPad1d(3),
                nn.utils.weight_norm(
                    nn.Conv1d(channels // 16, self.out_channels, kernel_size=7)
                ),
                nn.Tanh(),
            )
        else:
            channels = Config.m_channels
            self.generator = nn.Sequential(
                nn.ReflectionPad1d(3),
                nn.utils.weight_norm(nn.Conv1d(in_channels, channels, kernel_size=7)),
                act,
                UpsampleNet(channels, channels // 2, self.upsample_scales[0], hp),
                ResStack(channels // 2, kernel_size[0], self.resstack_depth[0], hp),
                act,
                UpsampleNet(channels // 2, channels // 4, self.upsample_scales[1], hp),
                ResStack(channels // 4, kernel_size[1], self.resstack_depth[1], hp),
                act,
                UpsampleNet(channels // 4, channels // 8, self.upsample_scales[3], hp),
                ResStack(channels // 8, kernel_size[3], self.resstack_depth[2], hp),
                act,
                nn.ReflectionPad1d(3),
                nn.utils.weight_norm(
                    nn.Conv1d(channels // 8, self.out_channels, kernel_size=7)
                ),
                nn.Tanh(),
            )
        if self.out_channels > 1:
            self.pqmf = PQMF(4, 64)

        self.num_params()

    def forward(self, conditions, use_res=False, f0=None):
        res = conditions
        if self.use_condnet:
            conditions = self.condnet(conditions)
        if self.use_cond_rnn:
            conditions, _ = self.rnn(conditions.transpose(1, 2))
            conditions = conditions.transpose(1, 2)

        wav = self.generator(conditions)
        if self.out_channels > 1:
            B = wav.size(0)
            f_wav = (
                self.pqmf.synthesis(wav)
                .transpose(1, 2)
                .reshape(B, 1, -1)
                .clamp(-0.99, 0.99)
            )
            return f_wav, wav
        return wav

    def num_params(self):
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        parameters = sum([np.prod(p.size()) for p in parameters]) / 1_000_000
        return parameters
        # print('Trainable Parameters: %.3f million' % parameters)

    def remove_weight_norm(self):
        def _remove_weight_norm(m):
            try:
                torch.nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)


if __name__ == "__main__":
    model = Generator(128)
    x = torch.randn(3, 128, 13)
    print(x.shape)
    y = model(x)
    print(y.shape)
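A quick sanity check on the upsampling chain (an illustrative sketch, not part of the file): the four UpsampleNet stages stretch the time axis by prod(upsample_scales) = 7 * 7 * 3 * 3 = 441, which matches Config.hop_length, so each mel frame maps to exactly one STFT hop of audio:

import torch
from voicefixer.vocoder.model.generator import Generator

model = Generator(128)
mel = torch.randn(3, 128, 13)          # (batch, n_mel, frames)
wav = model(mel)
assert wav.shape == (3, 1, 13 * 441)   # 441 audio samples per mel frame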
voicefixer/vocoder/model/modules.py
ADDED
@@ -0,0 +1,947 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import numpy as np
|
3 |
+
import torch
|
4 |
+
import torch.nn as nn
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from voicefixer.vocoder.config import Config
|
7 |
+
|
8 |
+
# From xin wang of nii
|
9 |
+
class SineGen(torch.nn.Module):
|
10 |
+
"""Definition of sine generator
|
11 |
+
SineGen(samp_rate, harmonic_num = 0,
|
12 |
+
sine_amp = 0.1, noise_std = 0.003,
|
13 |
+
voiced_threshold = 0,
|
14 |
+
flag_for_pulse=False)
|
15 |
+
|
16 |
+
samp_rate: sampling rate in Hz
|
17 |
+
harmonic_num: number of harmonic overtones (default 0)
|
18 |
+
sine_amp: amplitude of sine-wavefrom (default 0.1)
|
19 |
+
noise_std: std of Gaussian noise (default 0.003)
|
20 |
+
voiced_thoreshold: F0 threshold for U/V classification (default 0)
|
21 |
+
flag_for_pulse: this SinGen is used inside PulseGen (default False)
|
22 |
+
|
23 |
+
Note: when flag_for_pulse is True, the first time step of a voiced
|
24 |
+
segment is always sin(np.pi) or cos(0)
|
25 |
+
"""
|
26 |
+
|
27 |
+
def __init__(
|
28 |
+
self,
|
29 |
+
samp_rate=24000,
|
30 |
+
harmonic_num=0,
|
31 |
+
sine_amp=0.1,
|
32 |
+
noise_std=0.003,
|
33 |
+
voiced_threshold=0,
|
34 |
+
flag_for_pulse=False,
|
35 |
+
):
|
36 |
+
super(SineGen, self).__init__()
|
37 |
+
self.sine_amp = sine_amp
|
38 |
+
self.noise_std = noise_std
|
39 |
+
self.harmonic_num = harmonic_num
|
40 |
+
self.dim = self.harmonic_num + 1
|
41 |
+
self.sampling_rate = samp_rate
|
42 |
+
self.voiced_threshold = voiced_threshold
|
43 |
+
self.flag_for_pulse = flag_for_pulse
|
44 |
+
|
45 |
+
def _f02uv(self, f0):
|
46 |
+
# generate uv signal
|
47 |
+
uv = torch.ones_like(f0)
|
48 |
+
uv = uv * (f0 > self.voiced_threshold)
|
49 |
+
return uv
|
50 |
+
|
51 |
+
def _f02sine(self, f0_values):
|
52 |
+
"""f0_values: (batchsize, length, dim)
|
53 |
+
where dim indicates fundamental tone and overtones
|
54 |
+
"""
|
55 |
+
# convert to F0 in rad. The interger part n can be ignored
|
56 |
+
# because 2 * np.pi * n doesn't affect phase
|
57 |
+
rad_values = (f0_values / self.sampling_rate) % 1
|
58 |
+
|
59 |
+
# initial phase noise (no noise for fundamental component)
|
60 |
+
rand_ini = torch.rand(
|
61 |
+
f0_values.shape[0], f0_values.shape[2], device=f0_values.device
|
62 |
+
)
|
63 |
+
rand_ini[:, 0] = 0
|
64 |
+
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
|
65 |
+
|
66 |
+
# instantanouse phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
|
67 |
+
if not self.flag_for_pulse:
|
68 |
+
# for normal case
|
69 |
+
|
70 |
+
# To prevent torch.cumsum numerical overflow,
|
71 |
+
# it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
|
72 |
+
# Buffer tmp_over_one_idx indicates the time step to add -1.
|
73 |
+
# This will not change F0 of sine because (x-1) * 2*pi = x *2*pi
|
74 |
+
tmp_over_one = torch.cumsum(rad_values, 1) % 1
|
75 |
+
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
|
76 |
+
cumsum_shift = torch.zeros_like(rad_values)
|
77 |
+
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
|
78 |
+
|
79 |
+
sines = torch.sin(
|
80 |
+
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
|
81 |
+
)
|
82 |
+
else:
|
83 |
+
# If necessary, make sure that the first time step of every
|
84 |
+
# voiced segments is sin(pi) or cos(0)
|
85 |
+
# This is used for pulse-train generation
|
86 |
+
|
87 |
+
# identify the last time step in unvoiced segments
|
88 |
+
uv = self._f02uv(f0_values)
|
89 |
+
uv_1 = torch.roll(uv, shifts=-1, dims=1)
|
90 |
+
uv_1[:, -1, :] = 1
|
91 |
+
u_loc = (uv < 1) * (uv_1 > 0)
|
92 |
+
|
93 |
+
# get the instantanouse phase
|
94 |
+
tmp_cumsum = torch.cumsum(rad_values, dim=1)
|
95 |
+
# different batch needs to be processed differently
|
96 |
+
for idx in range(f0_values.shape[0]):
|
97 |
+
temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
|
98 |
+
temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
|
99 |
+
# stores the accumulation of i.phase within
|
100 |
+
# each voiced segments
|
101 |
+
tmp_cumsum[idx, :, :] = 0
|
102 |
+
tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum
|
103 |
+
|
104 |
+
# rad_values - tmp_cumsum: remove the accumulation of i.phase
|
105 |
+
# within the previous voiced segment.
|
106 |
+
i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)
|
107 |
+
|
108 |
+
# get the sines
|
109 |
+
sines = torch.cos(i_phase * 2 * np.pi)
|
110 |
+
return sines
|
111 |
+
|
112 |
+
def forward(self, f0):
|
113 |
+
"""sine_tensor, uv = forward(f0)
|
114 |
+
input F0: tensor(batchsize=1, length, dim=1)
|
115 |
+
f0 for unvoiced steps should be 0
|
116 |
+
output sine_tensor: tensor(batchsize=1, length, dim)
|
117 |
+
output uv: tensor(batchsize=1, length, 1)
|
118 |
+
"""
|
119 |
+
|
120 |
+
with torch.no_grad():
|
121 |
+
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
|
122 |
+
# fundamental component
|
123 |
+
f0_buf[:, :, 0] = f0[:, :, 0]
|
124 |
+
for idx in np.arange(self.harmonic_num):
|
125 |
+
# idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
|
126 |
+
f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (idx + 2)
|
127 |
+
|
128 |
+
# generate sine waveforms
|
129 |
+
sine_waves = self._f02sine(f0_buf) * self.sine_amp
|
130 |
+
|
131 |
+
# generate uv signal
|
132 |
+
# uv = torch.ones(f0.shape)
|
133 |
+
# uv = uv * (f0 > self.voiced_threshold)
|
134 |
+
uv = self._f02uv(f0)
|
135 |
+
|
136 |
+
# noise: for unvoiced should be similar to sine_amp
|
137 |
+
# std = self.sine_amp/3 -> max value ~ self.sine_amp
|
138 |
+
# . for voiced regions is self.noise_std
|
139 |
+
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
|
140 |
+
noise = noise_amp * torch.randn_like(sine_waves)
|
141 |
+
|
142 |
+
# first: set the unvoiced part to 0 by uv
|
143 |
+
# then: additive noise
|
144 |
+
sine_waves = sine_waves * uv + noise
|
145 |
+
return sine_waves, uv, noise
|
146 |
+
|
147 |
+
|
148 |
+
class LowpassBlur(nn.Module):
|
149 |
+
"""perform low pass filter after upsampling for anti-aliasing"""
|
150 |
+
|
151 |
+
def __init__(self, channels=128, filt_size=3, pad_type="reflect", pad_off=0):
|
152 |
+
super(LowpassBlur, self).__init__()
|
153 |
+
self.filt_size = filt_size
|
154 |
+
self.pad_off = pad_off
|
155 |
+
self.pad_sizes = [
|
156 |
+
int(1.0 * (filt_size - 1) / 2),
|
157 |
+
int(np.ceil(1.0 * (filt_size - 1) / 2)),
|
158 |
+
]
|
159 |
+
self.pad_sizes = [pad_size + pad_off for pad_size in self.pad_sizes]
|
160 |
+
self.off = 0
|
161 |
+
self.channels = channels
|
162 |
+
|
163 |
+
if self.filt_size == 1:
|
164 |
+
a = np.array(
|
165 |
+
[
|
166 |
+
1.0,
|
167 |
+
]
|
168 |
+
)
|
169 |
+
elif self.filt_size == 2:
|
170 |
+
a = np.array([1.0, 1.0])
|
171 |
+
elif self.filt_size == 3:
|
172 |
+
a = np.array([1.0, 2.0, 1.0])
|
173 |
+
elif self.filt_size == 4:
|
174 |
+
a = np.array([1.0, 3.0, 3.0, 1.0])
|
175 |
+
elif self.filt_size == 5:
|
176 |
+
a = np.array([1.0, 4.0, 6.0, 4.0, 1.0])
|
177 |
+
elif self.filt_size == 6:
|
178 |
+
a = np.array([1.0, 5.0, 10.0, 10.0, 5.0, 1.0])
|
179 |
+
elif self.filt_size == 7:
|
180 |
+
a = np.array([1.0, 6.0, 15.0, 20.0, 15.0, 6.0, 1.0])
|
181 |
+
|
182 |
+
filt = torch.Tensor(a)
|
183 |
+
filt = filt / torch.sum(filt)
|
184 |
+
self.register_buffer("filt", filt[None, None, :].repeat((self.channels, 1, 1)))
|
185 |
+
|
186 |
+
self.pad = get_pad_layer_1d(pad_type)(self.pad_sizes)
|
187 |
+
|
188 |
+
def forward(self, inp):
|
189 |
+
if self.filt_size == 1:
|
190 |
+
return inp
|
191 |
+
return F.conv1d(self.pad(inp), self.filt, groups=inp.shape[1])
|
192 |
+
|
193 |
+
|
194 |
+
def get_pad_layer_1d(pad_type):
|
195 |
+
if pad_type in ["refl", "reflect"]:
|
196 |
+
PadLayer = nn.ReflectionPad1d
|
197 |
+
elif pad_type in ["repl", "replicate"]:
|
198 |
+
PadLayer = nn.ReplicationPad1d
|
199 |
+
elif pad_type == "zero":
|
200 |
+
PadLayer = nn.ZeroPad1d
|
201 |
+
else:
|
202 |
+
print("Pad type [%s] not recognized" % pad_type)
|
203 |
+
return PadLayer
|
204 |
+
|
205 |
+
|
206 |
+
class MovingAverageSmooth(torch.nn.Conv1d):
|
207 |
+
def __init__(self, channels, window_len=3):
|
208 |
+
"""Initialize Conv1d module."""
|
209 |
+
super(MovingAverageSmooth, self).__init__(
|
210 |
+
in_channels=channels,
|
211 |
+
out_channels=channels,
|
212 |
+
kernel_size=1,
|
213 |
+
groups=channels,
|
214 |
+
bias=False,
|
215 |
+
)
|
216 |
+
|
217 |
+
torch.nn.init.constant_(self.weight, 1.0 / window_len)
|
218 |
+
for p in self.parameters():
|
219 |
+
p.requires_grad = False
|
220 |
+
|
221 |
+
def forward(self, data):
|
222 |
+
return super(MovingAverageSmooth, self).forward(data)
|
223 |
+
|
224 |
+
|
225 |
+
class Conv1d(torch.nn.Conv1d):
|
226 |
+
"""Conv1d module with customized initialization."""
|
227 |
+
|
228 |
+
def __init__(self, *args, **kwargs):
|
229 |
+
"""Initialize Conv1d module."""
|
230 |
+
super(Conv1d, self).__init__(*args, **kwargs)
|
231 |
+
|
232 |
+
def reset_parameters(self):
|
233 |
+
"""Reset parameters."""
|
234 |
+
torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
|
235 |
+
if self.bias is not None:
|
236 |
+
torch.nn.init.constant_(self.bias, 0.0)
|
237 |
+
|
238 |
+
|
239 |
+
class Stretch2d(torch.nn.Module):
|
240 |
+
"""Stretch2d module."""
|
241 |
+
|
242 |
+
def __init__(self, x_scale, y_scale, mode="nearest"):
|
243 |
+
"""Initialize Stretch2d module.
|
244 |
+
Args:
|
245 |
+
x_scale (int): X scaling factor (Time axis in spectrogram).
|
246 |
+
y_scale (int): Y scaling factor (Frequency axis in spectrogram).
|
247 |
+
mode (str): Interpolation mode.
|
248 |
+
"""
|
249 |
+
super(Stretch2d, self).__init__()
|
250 |
+
self.x_scale = x_scale
|
251 |
+
self.y_scale = y_scale
|
252 |
+
self.mode = mode
|
253 |
+
|
254 |
+
def forward(self, x):
|
255 |
+
"""Calculate forward propagation.
|
256 |
+
Args:
|
257 |
+
x (Tensor): Input tensor (B, C, F, T).
|
258 |
+
Returns:
|
259 |
+
Tensor: Interpolated tensor (B, C, F * y_scale, T * x_scale),
|
260 |
+
"""
|
261 |
+
return F.interpolate(
|
262 |
+
x, scale_factor=(self.y_scale, self.x_scale), mode=self.mode
|
263 |
+
)
|
264 |
+
|
265 |
+
|
266 |
+
class Conv2d(torch.nn.Conv2d):
|
267 |
+
"""Conv2d module with customized initialization."""
|
268 |
+
|
269 |
+
def __init__(self, *args, **kwargs):
|
270 |
+
"""Initialize Conv2d module."""
|
271 |
+
super(Conv2d, self).__init__(*args, **kwargs)
|
272 |
+
|
273 |
+
def reset_parameters(self):
|
274 |
+
"""Reset parameters."""
|
275 |
+
self.weight.data.fill_(1.0 / np.prod(self.kernel_size))
|
276 |
+
if self.bias is not None:
|
277 |
+
torch.nn.init.constant_(self.bias, 0.0)
|
278 |
+
|
279 |
+
|
280 |
+
class UpsampleNetwork(torch.nn.Module):
|
281 |
+
"""Upsampling network module."""
|
282 |
+
|
283 |
+
def __init__(
|
284 |
+
self,
|
285 |
+
upsample_scales,
|
286 |
+
nonlinear_activation=None,
|
287 |
+
nonlinear_activation_params={},
|
288 |
+
interpolate_mode="nearest",
|
289 |
+
freq_axis_kernel_size=1,
|
290 |
+
use_causal_conv=False,
|
291 |
+
):
|
292 |
+
"""Initialize upsampling network module.
|
293 |
+
Args:
|
294 |
+
upsample_scales (list): List of upsampling scales.
|
295 |
+
nonlinear_activation (str): Activation function name.
|
296 |
+
nonlinear_activation_params (dict): Arguments for specified activation function.
|
297 |
+
interpolate_mode (str): Interpolation mode.
|
298 |
+
freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
|
299 |
+
"""
|
300 |
+
super(UpsampleNetwork, self).__init__()
|
301 |
+
self.use_causal_conv = use_causal_conv
|
302 |
+
self.up_layers = torch.nn.ModuleList()
|
303 |
+
for scale in upsample_scales:
|
304 |
+
# interpolation layer
|
305 |
+
stretch = Stretch2d(scale, 1, interpolate_mode)
|
306 |
+
self.up_layers += [stretch]
|
307 |
+
|
308 |
+
# conv layer
|
309 |
+
assert (
|
310 |
+
freq_axis_kernel_size - 1
|
311 |
+
) % 2 == 0, "Not support even number freq axis kernel size."
|
312 |
+
freq_axis_padding = (freq_axis_kernel_size - 1) // 2
|
313 |
+
kernel_size = (freq_axis_kernel_size, scale * 2 + 1)
|
314 |
+
if use_causal_conv:
|
315 |
+
padding = (freq_axis_padding, scale * 2)
|
316 |
+
else:
|
317 |
+
padding = (freq_axis_padding, scale)
|
318 |
+
conv = Conv2d(1, 1, kernel_size=kernel_size, padding=padding, bias=False)
|
319 |
+
self.up_layers += [conv]
|
320 |
+
|
321 |
+
# nonlinear
|
322 |
+
if nonlinear_activation is not None:
|
323 |
+
nonlinear = getattr(torch.nn, nonlinear_activation)(
|
324 |
+
**nonlinear_activation_params
|
325 |
+
)
|
326 |
+
self.up_layers += [nonlinear]
|
327 |
+
|
328 |
+
def forward(self, c):
|
329 |
+
"""Calculate forward propagation.
|
330 |
+
Args:
|
331 |
+
c : Input tensor (B, C, T).
|
332 |
+
Returns:
|
333 |
+
Tensor: Upsampled tensor (B, C, T'), where T' = T * prod(upsample_scales).
|
334 |
+
"""
|
335 |
+
c = c.unsqueeze(1) # (B, 1, C, T)
|
336 |
+
for f in self.up_layers:
|
337 |
+
if self.use_causal_conv and isinstance(f, Conv2d):
|
338 |
+
c = f(c)[..., : c.size(-1)]
|
339 |
+
else:
|
340 |
+
c = f(c)
|
341 |
+
return c.squeeze(1) # (B, C, T')
|
342 |
+
|
343 |
+
|
344 |
+
class ConvInUpsampleNetwork(torch.nn.Module):
|
345 |
+
"""Convolution + upsampling network module."""
|
346 |
+
|
347 |
+
def __init__(
|
348 |
+
self,
|
349 |
+
upsample_scales=[3, 4, 5, 5],
|
350 |
+
nonlinear_activation="ReLU",
|
351 |
+
nonlinear_activation_params={},
|
352 |
+
interpolate_mode="nearest",
|
353 |
+
freq_axis_kernel_size=1,
|
354 |
+
aux_channels=80,
|
355 |
+
aux_context_window=0,
|
356 |
+
use_causal_conv=False,
|
357 |
+
):
|
358 |
+
"""Initialize convolution + upsampling network module.
|
359 |
+
Args:
|
360 |
+
upsample_scales (list): List of upsampling scales.
|
361 |
+
nonlinear_activation (str): Activation function name.
|
362 |
+
nonlinear_activation_params (dict): Arguments for specified activation function.
|
363 |
+
mode (str): Interpolation mode.
|
364 |
+
freq_axis_kernel_size (int): Kernel size in the direction of frequency axis.
|
365 |
+
aux_channels (int): Number of channels of pre-convolutional layer.
|
366 |
+
aux_context_window (int): Context window size of the pre-convolutional layer.
|
367 |
+
use_causal_conv (bool): Whether to use causal structure.
|
368 |
+
"""
|
369 |
+
super(ConvInUpsampleNetwork, self).__init__()
|
370 |
+
self.aux_context_window = aux_context_window
|
371 |
+
self.use_causal_conv = use_causal_conv and aux_context_window > 0
|
372 |
+
# To capture wide-context information in conditional features
|
373 |
+
kernel_size = (
|
374 |
+
aux_context_window + 1 if use_causal_conv else 2 * aux_context_window + 1
|
375 |
+
)
|
376 |
+
# NOTE(kan-bayashi): Here do not use padding because the input is already padded
|
377 |
+
self.conv_in = Conv1d(
|
378 |
+
aux_channels, aux_channels, kernel_size=kernel_size, bias=False
|
379 |
+
)
|
380 |
+
self.upsample = UpsampleNetwork(
|
381 |
+
upsample_scales=upsample_scales,
|
382 |
+
nonlinear_activation=nonlinear_activation,
|
383 |
+
nonlinear_activation_params=nonlinear_activation_params,
|
384 |
+
interpolate_mode=interpolate_mode,
|
385 |
+
freq_axis_kernel_size=freq_axis_kernel_size,
|
386 |
+
use_causal_conv=use_causal_conv,
|
387 |
+
)
|
388 |
+
|
389 |
+
def forward(self, c):
|
390 |
+
"""Calculate forward propagation.
|
391 |
+
Args:
|
392 |
+
c : Input tensor (B, C, T').
|
393 |
+
Returns:
|
394 |
+
Tensor: Upsampled tensor (B, C, T),
|
395 |
+
where T = (T' - aux_context_window * 2) * prod(upsample_scales).
|
396 |
+
Note:
|
397 |
+
The length of inputs considers the context window size.
|
398 |
+
"""
|
399 |
+
c_ = self.conv_in(c)
|
400 |
+
c = c_[:, :, : -self.aux_context_window] if self.use_causal_conv else c_
|
401 |
+
return self.upsample(c)
|
402 |
+
|
403 |
+
|
404 |
+
class DownsampleNet(nn.Module):
|
405 |
+
def __init__(self, input_size, output_size, upsample_factor, hp=None, index=0):
|
406 |
+
super(DownsampleNet, self).__init__()
|
407 |
+
self.input_size = input_size
|
408 |
+
self.output_size = output_size
|
409 |
+
self.upsample_factor = upsample_factor
|
410 |
+
self.skip_conv = nn.Conv1d(input_size, output_size, kernel_size=1)
|
411 |
+
self.index = index
|
412 |
+
layer = nn.Conv1d(
|
413 |
+
input_size,
|
414 |
+
output_size,
|
415 |
+
kernel_size=upsample_factor * 2,
|
416 |
+
stride=upsample_factor,
|
417 |
+
padding=upsample_factor // 2 + upsample_factor % 2,
|
418 |
+
)
|
419 |
+
|
420 |
+
self.layer = nn.utils.weight_norm(layer)
|
421 |
+
|
422 |
+
def forward(self, inputs):
|
423 |
+
B, C, T = inputs.size()
|
424 |
+
res = inputs[:, :, :: self.upsample_factor]
|
425 |
+
skip = self.skip_conv(res)
|
426 |
+
|
427 |
+
outputs = self.layer(inputs)
|
428 |
+
outputs = outputs + skip
|
429 |
+
|
430 |
+
return outputs
|
431 |
+
|
432 |
+
|
433 |
+
class UpsampleNet(nn.Module):
|
434 |
+
def __init__(self, input_size, output_size, upsample_factor, hp=None, index=0):
|
435 |
+
|
436 |
+
super(UpsampleNet, self).__init__()
|
437 |
+
self.up_type = Config.up_type
|
438 |
+
self.use_smooth = Config.use_smooth
|
439 |
+
self.use_drop = Config.use_drop
|
440 |
+
self.input_size = input_size
|
441 |
+
self.output_size = output_size
|
442 |
+
self.upsample_factor = upsample_factor
|
443 |
+
self.skip_conv = nn.Conv1d(input_size, output_size, kernel_size=1)
|
444 |
+
self.index = index
|
445 |
+
if self.use_smooth:
|
446 |
+
window_lens = [5, 5, 4, 3]
|
447 |
+
self.window_len = window_lens[index]
|
448 |
+
|
449 |
+
if self.up_type != "pn" or self.index < 3:
|
450 |
+
# if self.up_type != "pn":
|
451 |
+
layer = nn.ConvTranspose1d(
|
452 |
+
input_size,
|
453 |
+
output_size,
|
454 |
+
upsample_factor * 2,
|
455 |
+
upsample_factor,
|
456 |
+
padding=upsample_factor // 2 + upsample_factor % 2,
|
457 |
+
output_padding=upsample_factor % 2,
|
458 |
+
)
|
459 |
+
self.layer = nn.utils.weight_norm(layer)
|
460 |
+
else:
|
461 |
+
self.layer = nn.Sequential(
|
462 |
+
nn.ReflectionPad1d(1),
|
463 |
+
nn.utils.weight_norm(
|
464 |
+
nn.Conv1d(input_size, output_size * upsample_factor, kernel_size=3)
|
465 |
+
),
|
466 |
+
nn.LeakyReLU(),
|
467 |
+
nn.ReflectionPad1d(1),
|
468 |
+
nn.utils.weight_norm(
|
469 |
+
nn.Conv1d(
|
470 |
+
output_size * upsample_factor,
|
471 |
+
output_size * upsample_factor,
|
472 |
+
kernel_size=3,
|
473 |
+
)
|
474 |
+
),
|
475 |
+
nn.LeakyReLU(),
|
476 |
+
nn.ReflectionPad1d(1),
|
477 |
+
nn.utils.weight_norm(
|
478 |
+
nn.Conv1d(
|
479 |
+
output_size * upsample_factor,
|
480 |
+
output_size * upsample_factor,
|
481 |
+
kernel_size=3,
|
482 |
+
)
|
483 |
+
),
|
484 |
+
nn.LeakyReLU(),
|
485 |
+
)
|
486 |
+
|
487 |
+
if hp is not None:
|
488 |
+
self.org = Config.up_org
|
489 |
+
self.no_skip = Config.no_skip
|
490 |
+
else:
|
491 |
+
self.org = False
|
492 |
+
self.no_skip = True
|
493 |
+
|
494 |
+
if self.use_smooth:
|
495 |
+
self.mas = nn.Sequential(
|
496 |
+
# LowpassBlur(output_size, self.window_len),
|
497 |
+
MovingAverageSmooth(output_size, self.window_len),
|
498 |
+
# MovingAverageSmooth(output_size, self.window_len),
|
499 |
+
)
|
500 |
+
|
501 |
+
def forward(self, inputs):
|
502 |
+
|
503 |
+
if not self.org:
|
504 |
+
inputs = inputs + torch.sin(inputs)
|
505 |
+
B, C, T = inputs.size()
|
506 |
+
res = inputs.repeat(1, self.upsample_factor, 1).view(B, C, -1)
|
507 |
+
skip = self.skip_conv(res)
|
508 |
+
if self.up_type == "repeat":
|
509 |
+
return skip
|
510 |
+
|
511 |
+
outputs = self.layer(inputs)
|
512 |
+
if self.up_type == "pn" and self.index > 2:
|
513 |
+
B, c, l = outputs.size()
|
514 |
+
outputs = outputs.view(B, -1, l * self.upsample_factor)
|
515 |
+
|
516 |
+
if self.no_skip:
|
517 |
+
return outputs
|
518 |
+
|
519 |
+
if not self.org:
|
520 |
+
outputs = outputs + skip
|
521 |
+
|
522 |
+
if self.use_smooth:
|
523 |
+
outputs = self.mas(outputs)
|
524 |
+
|
525 |
+
if self.use_drop:
|
526 |
+
outputs = F.dropout(outputs, p=0.05)
|
527 |
+
|
528 |
+
return outputs
|
529 |
+
|
530 |
+
|
531 |
+
class ResStack(nn.Module):
|
532 |
+
def __init__(self, channel, kernel_size=3, resstack_depth=4, hp=None):
|
533 |
+
super(ResStack, self).__init__()
|
534 |
+
|
535 |
+
self.use_wn = Config.use_wn
|
536 |
+
self.use_shift_scale = Config.use_shift_scale
|
537 |
+
self.channel = channel
|
538 |
+
|
539 |
+
def get_padding(kernel_size, dilation=1):
|
540 |
+
return int((kernel_size * dilation - dilation) / 2)
|
541 |
+
|
542 |
+
if self.use_shift_scale:
|
543 |
+
self.scale_conv = nn.utils.weight_norm(
|
544 |
+
nn.Conv1d(
|
545 |
+
channel, 2 * channel, kernel_size=kernel_size, dilation=1, padding=1
|
546 |
+
)
|
547 |
+
)
|
548 |
+
|
549 |
+
if not self.use_wn:
|
550 |
+
self.layers = nn.ModuleList(
|
551 |
+
[
|
552 |
+
nn.Sequential(
|
553 |
+
nn.LeakyReLU(),
|
554 |
+
nn.utils.weight_norm(
|
555 |
+
nn.Conv1d(
|
556 |
+
channel,
|
557 |
+
channel,
|
558 |
+
kernel_size=kernel_size,
|
559 |
+
dilation=3 ** (i % 10),
|
560 |
+
padding=get_padding(kernel_size, 3 ** (i % 10)),
|
561 |
+
)
|
562 |
+
),
|
563 |
+
nn.LeakyReLU(),
|
564 |
+
nn.utils.weight_norm(
|
565 |
+
nn.Conv1d(
|
566 |
+
channel,
|
567 |
+
channel,
|
568 |
+
kernel_size=kernel_size,
|
569 |
+
dilation=1,
|
570 |
+
padding=get_padding(kernel_size, 1),
|
571 |
+
)
|
572 |
+
),
|
573 |
+
)
|
574 |
+
for i in range(resstack_depth)
|
575 |
+
]
|
576 |
+
)
|
577 |
+
else:
|
578 |
+
self.wn = WaveNet(
|
579 |
+
in_channels=channel,
|
580 |
+
out_channels=channel,
|
581 |
+
cin_channels=-1,
|
582 |
+
num_layers=resstack_depth,
|
583 |
+
residual_channels=channel,
|
584 |
+
gate_channels=channel,
|
585 |
+
skip_channels=channel,
|
586 |
+
# kernel_size=5,
|
587 |
+
# dilation_rate=3,
|
588 |
+
causal=False,
|
589 |
+
use_downup=False,
|
590 |
+
)
|
591 |
+
|
592 |
+
def forward(self, x):
|
593 |
+
if not self.use_wn:
|
594 |
+
for layer in self.layers:
|
595 |
+
x = x + layer(x)
|
596 |
+
else:
|
597 |
+
x = self.wn(x)
|
598 |
+
|
599 |
+
if self.use_shift_scale:
|
600 |
+
m_s = self.scale_conv(x)
|
601 |
+
m_s = m_s[:, :, :-1]
|
602 |
+
|
603 |
+
m, s = torch.split(m_s, self.channel, dim=1)
|
604 |
+
s = F.softplus(s)
|
605 |
+
|
606 |
+
x = m + s * x[:, :, 1:] # key!!!
|
607 |
+
x = F.pad(x, pad=(1, 0), mode="constant", value=0)
|
608 |
+
|
609 |
+
return x
|
610 |
+
|
611 |
+
|
612 |
+
class WaveNet(nn.Module):
|
613 |
+
def __init__(
|
614 |
+
self,
|
615 |
+
in_channels=1,
|
616 |
+
out_channels=1,
|
617 |
+
num_layers=10,
|
618 |
+
residual_channels=64,
|
619 |
+
gate_channels=64,
|
620 |
+
skip_channels=64,
|
621 |
+
kernel_size=3,
|
622 |
+
dilation_rate=2,
|
623 |
+
cin_channels=80,
|
624 |
+
hp=None,
|
625 |
+
causal=False,
|
626 |
+
use_downup=False,
|
627 |
+
):
|
628 |
+
super(WaveNet, self).__init__()
|
629 |
+
|
630 |
+
self.in_channels = in_channels
|
631 |
+
self.causal = causal
|
632 |
+
self.num_layers = num_layers
|
633 |
+
self.out_channels = out_channels
|
634 |
+
self.gate_channels = gate_channels
|
635 |
+
self.residual_channels = residual_channels
|
636 |
+
self.skip_channels = skip_channels
|
637 |
+
self.cin_channels = cin_channels
|
638 |
+
self.kernel_size = kernel_size
|
639 |
+
self.use_downup = use_downup
|
640 |
+
|
641 |
+
self.front_conv = nn.Sequential(
|
642 |
+
nn.Conv1d(
|
643 |
+
in_channels=self.in_channels,
|
644 |
+
out_channels=self.residual_channels,
|
645 |
+
kernel_size=3,
|
646 |
+
padding=1,
|
647 |
+
),
|
648 |
+
nn.ReLU(),
|
649 |
+
)
|
650 |
+
if self.use_downup:
|
651 |
+
self.downup_conv = nn.Sequential(
|
652 |
+
nn.Conv1d(
|
653 |
+
in_channels=self.residual_channels,
|
654 |
+
out_channels=self.residual_channels,
|
655 |
+
kernel_size=3,
|
656 |
+
stride=2,
|
657 |
+
padding=1,
|
658 |
+
),
|
659 |
+
nn.ReLU(),
|
660 |
+
nn.Conv1d(
|
661 |
+
in_channels=self.residual_channels,
|
662 |
+
out_channels=self.residual_channels,
|
663 |
+
kernel_size=3,
|
664 |
+
stride=2,
|
665 |
+
padding=1,
|
666 |
+
),
|
667 |
+
nn.ReLU(),
|
668 |
+
UpsampleNet(self.residual_channels, self.residual_channels, 4, hp),
|
669 |
+
)
|
670 |
+
|
671 |
+
self.res_blocks = nn.ModuleList()
|
672 |
+
for n in range(self.num_layers):
|
673 |
+
self.res_blocks.append(
|
674 |
+
ResBlock(
|
675 |
+
self.residual_channels,
|
676 |
+
self.gate_channels,
|
677 |
+
self.skip_channels,
|
678 |
+
self.kernel_size,
|
679 |
+
dilation=dilation_rate**n,
|
680 |
+
cin_channels=self.cin_channels,
|
681 |
+
local_conditioning=(self.cin_channels > 0),
|
682 |
+
causal=self.causal,
|
683 |
+
mode="SAME",
|
684 |
+
)
|
685 |
+
)
|
686 |
+
self.final_conv = nn.Sequential(
|
687 |
+
nn.ReLU(),
|
688 |
+
Conv(self.skip_channels, self.skip_channels, 1, causal=self.causal),
|
689 |
+
nn.ReLU(),
|
690 |
+
Conv(self.skip_channels, self.out_channels, 1, causal=self.causal),
|
691 |
+
)
|
692 |
+
|
693 |
+
def forward(self, x, c=None):
|
694 |
+
return self.wavenet(x, c)
|
695 |
+
|
696 |
+
def wavenet(self, tensor, c=None):
|
697 |
+
|
698 |
+
h = self.front_conv(tensor)
|
699 |
+
if self.use_downup:
|
700 |
+
h = self.downup_conv(h)
|
701 |
+
skip = 0
|
702 |
+
for i, f in enumerate(self.res_blocks):
|
703 |
+
h, s = f(h, c)
|
704 |
+
skip += s
|
705 |
+
out = self.final_conv(skip)
|
706 |
+
return out
|
707 |
+
|
708 |
+
def receptive_field_size(self):
|
709 |
+
num_dir = 1 if self.causal else 2
|
710 |
+
dilations = [2 ** (i % self.num_layers) for i in range(self.num_layers)]
|
711 |
+
return (
|
712 |
+
num_dir * (self.kernel_size - 1) * sum(dilations)
|
713 |
+
+ 1
|
714 |
+
+ (self.front_channels - 1)
|
715 |
+
)
|
716 |
+
|
717 |
+
def remove_weight_norm(self):
|
718 |
+
for f in self.res_blocks:
|
719 |
+
f.remove_weight_norm()
|
720 |
+
|
721 |
+
|
722 |
+
class Conv(nn.Module):
|
723 |
+
def __init__(
|
724 |
+
self,
|
725 |
+
in_channels,
|
726 |
+
out_channels,
|
727 |
+
kernel_size,
|
728 |
+
dilation=1,
|
729 |
+
causal=False,
|
730 |
+
mode="SAME",
|
731 |
+
):
|
732 |
+
super(Conv, self).__init__()
|
733 |
+
|
734 |
+
self.causal = causal
|
735 |
+
self.mode = mode
|
736 |
+
if self.causal and self.mode == "SAME":
|
737 |
+
self.padding = dilation * (kernel_size - 1)
|
738 |
+
elif self.mode == "SAME":
|
739 |
+
self.padding = dilation * (kernel_size - 1) // 2
|
740 |
+
else:
|
741 |
+
self.padding = 0
|
742 |
+
self.conv = nn.Conv1d(
|
743 |
+
in_channels,
|
744 |
+
out_channels,
|
745 |
+
kernel_size,
|
746 |
+
dilation=dilation,
|
747 |
+
padding=self.padding,
|
748 |
+
)
|
749 |
+
self.conv = nn.utils.weight_norm(self.conv)
|
750 |
+
nn.init.kaiming_normal_(self.conv.weight)
|
751 |
+
|
752 |
+
def forward(self, tensor):
|
753 |
+
out = self.conv(tensor)
|
754 |
+
if self.causal and self.padding is not 0:
|
755 |
+
out = out[:, :, : -self.padding]
|
756 |
+
return out
|
757 |
+
|
758 |
+
def remove_weight_norm(self):
|
759 |
+
nn.utils.remove_weight_norm(self.conv)
|
760 |
+
|
761 |
+
|
762 |
+
class ResBlock(nn.Module):
|
763 |
+
def __init__(
|
764 |
+
self,
|
765 |
+
in_channels,
|
766 |
+
out_channels,
|
767 |
+
skip_channels,
|
768 |
+
kernel_size,
|
769 |
+
dilation,
|
770 |
+
cin_channels=None,
|
771 |
+
local_conditioning=True,
|
772 |
+
causal=False,
|
773 |
+
mode="SAME",
|
774 |
+
):
|
775 |
+
super(ResBlock, self).__init__()
|
776 |
+
self.causal = causal
|
777 |
+
self.local_conditioning = local_conditioning
|
778 |
+
self.cin_channels = cin_channels
|
779 |
+
self.mode = mode
|
780 |
+
|
781 |
+
self.filter_conv = Conv(
|
782 |
+
in_channels, out_channels, kernel_size, dilation, causal, mode
|
783 |
+
)
|
784 |
+
self.gate_conv = Conv(
|
785 |
+
in_channels, out_channels, kernel_size, dilation, causal, mode
|
786 |
+
)
|
787 |
+
self.res_conv = nn.Conv1d(out_channels, in_channels, kernel_size=1)
|
788 |
+
self.skip_conv = nn.Conv1d(out_channels, skip_channels, kernel_size=1)
|
789 |
+
self.res_conv = nn.utils.weight_norm(self.res_conv)
|
790 |
+
self.skip_conv = nn.utils.weight_norm(self.skip_conv)
|
791 |
+
|
792 |
+
if self.local_conditioning:
|
793 |
+
self.filter_conv_c = nn.Conv1d(cin_channels, out_channels, kernel_size=1)
|
794 |
+
self.gate_conv_c = nn.Conv1d(cin_channels, out_channels, kernel_size=1)
|
795 |
+
self.filter_conv_c = nn.utils.weight_norm(self.filter_conv_c)
|
796 |
+
self.gate_conv_c = nn.utils.weight_norm(self.gate_conv_c)
|
797 |
+
|
798 |
+
def forward(self, tensor, c=None):
|
799 |
+
h_filter = self.filter_conv(tensor)
|
800 |
+
h_gate = self.gate_conv(tensor)
|
801 |
+
|
802 |
+
if self.local_conditioning:
|
803 |
+
h_filter += self.filter_conv_c(c)
|
804 |
+
h_gate += self.gate_conv_c(c)
|
805 |
+
|
806 |
+
out = torch.tanh(h_filter) * torch.sigmoid(h_gate)
|
807 |
+
|
808 |
+
res = self.res_conv(out)
|
809 |
+
skip = self.skip_conv(out)
|
810 |
+
if self.mode == "SAME":
|
811 |
+
return (tensor + res) * math.sqrt(0.5), skip
|
812 |
+
else:
|
813 |
+
return (tensor[:, :, 1:] + res) * math.sqrt(0.5), skip
|
814 |
+
|
815 |
+
def remove_weight_norm(self):
|
816 |
+
self.filter_conv.remove_weight_norm()
|
817 |
+
self.gate_conv.remove_weight_norm()
|
818 |
+
nn.utils.remove_weight_norm(self.res_conv)
|
819 |
+
nn.utils.remove_weight_norm(self.skip_conv)
|
820 |
+
nn.utils.remove_weight_norm(self.filter_conv_c)
|
821 |
+
nn.utils.remove_weight_norm(self.gate_conv_c)
|
822 |
+
|
823 |
+
|
824 |
+
@torch.jit.script
|
825 |
+
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
826 |
+
n_channels_int = n_channels[0]
|
827 |
+
in_act = input_a + input_b
|
828 |
+
t_act = torch.tanh(in_act[:, :n_channels_int])
|
829 |
+
s_act = torch.sigmoid(in_act[:, n_channels_int:])
|
830 |
+
acts = t_act * s_act
|
831 |
+
return acts
|
832 |
+
|
833 |
+
|
834 |
+
@torch.jit.script
|
835 |
+
def fused_res_skip(tensor, res_skip, n_channels):
|
836 |
+
n_channels_int = n_channels[0]
|
837 |
+
res = res_skip[:, :n_channels_int]
|
838 |
+
skip = res_skip[:, n_channels_int:]
|
839 |
+
return (tensor + res), skip
|
840 |
+
|
841 |
+
|
842 |
+
class ResStack2D(nn.Module):
    def __init__(self, channels=16, kernel_size=3, resstack_depth=4, hp=None):
        super(ResStack2D, self).__init__()
        # NOTE: the constructor arguments are overridden with fixed values
        # below, so the stack always runs with 16 channels, 3x3 kernels, and
        # a depth of 2 regardless of what the caller passes.
        channels = 16
        kernel_size = 3
        resstack_depth = 2
        self.channels = channels

        def get_padding(kernel_size, dilation=1):
            return int((kernel_size * dilation - dilation) / 2)

        self.layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.LeakyReLU(),
                    nn.utils.weight_norm(
                        nn.Conv2d(
                            1,
                            self.channels,
                            kernel_size,
                            dilation=(1, 3 ** i),
                            padding=(1, get_padding(kernel_size, 3 ** i)),
                        )
                    ),
                    nn.LeakyReLU(),
                    nn.utils.weight_norm(
                        nn.Conv2d(
                            self.channels,
                            self.channels,
                            kernel_size,
                            dilation=(1, 3 ** i),
                            padding=(1, get_padding(kernel_size, 3 ** i)),
                        )
                    ),
                    nn.LeakyReLU(),
                    nn.utils.weight_norm(nn.Conv2d(self.channels, 1, kernel_size=1)),
                )
                for i in range(resstack_depth)
            ]
        )

    def forward(self, tensor):
        x = tensor.unsqueeze(1)
        for layer in self.layers:
            x = x + layer(x)
        x = x.squeeze(1)

        return x

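# Shape note (illustrative): ResStack2D treats its input as a 2-D map --
# forward() unsqueezes (B, F, T) to (B, 1, F, T), applies each dilated 2-D
# residual layer, and squeezes back to (B, F, T). The padding is chosen so
# input and output shapes match, which the residual addition requires.
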
class FiLM(nn.Module):
    """
    Feature-wise linear modulation.
    """

    def __init__(self, input_dim, attribute_dim):
        super().__init__()
        self.input_dim = input_dim
        self.generator = nn.Conv1d(
            attribute_dim, input_dim * 2, kernel_size=3, padding=1
        )

    def forward(self, x, c):
        """
        x: (B, input_dim, seq)
        c: (B, attribute_dim, seq)
        """
        c = self.generator(c)
        m, s = torch.split(c, self.input_dim, dim=1)

        return x * s + m

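# Usage sketch (illustrative, made-up sizes): the generator conv predicts a
# per-channel shift m and scale s from the conditioning signal, and the input
# is modulated as x * s + m.
#
#   film = FiLM(input_dim=32, attribute_dim=80)
#   x = torch.randn(2, 32, 100)   # (B, input_dim, seq)
#   c = torch.randn(2, 80, 100)   # (B, attribute_dim, seq)
#   y = film(x, c)                # -> (2, 32, 100)
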
class FiLMConv1d(nn.Module):
    """
    Conv1d layers with FiLMs in between.
    """

    def __init__(self, in_size, out_size, attribute_dim, ins_norm=True, loop=1):
        super().__init__()
        self.loop = loop
        self.mlps = nn.ModuleList(
            [nn.Conv1d(in_size, out_size, kernel_size=3, padding=1)]
            + [
                nn.Conv1d(out_size, out_size, kernel_size=3, padding=1)
                for i in range(loop - 1)
            ]
        )
        self.films = nn.ModuleList(
            [FiLM(out_size, attribute_dim) for i in range(loop)]
        )
        self.ins_norm = ins_norm
        if self.ins_norm:
            self.norm = nn.InstanceNorm1d(attribute_dim)

    def forward(self, x, c):
        """
        x: (B, in_size, seq)
        c: (B, attribute_dim, seq)
        """
        if self.ins_norm:
            c = self.norm(c)
        for i in range(self.loop):
            x = self.mlps[i](x)
            x = F.relu(x)
            x = self.films[i](x, c)

        return x
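
# Minimal smoke test (illustrative sketch): chains FiLMConv1d over random
# tensors to make the expected shapes explicit.
if __name__ == "__main__":
    net = FiLMConv1d(in_size=32, out_size=64, attribute_dim=80, loop=2)
    x = torch.randn(2, 32, 100)   # (B, in_size, seq)
    c = torch.randn(2, 80, 100)   # (B, attribute_dim, seq)
    print(net(x, c).shape)        # torch.Size([2, 64, 100])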
voicefixer/vocoder/model/pqmf.py
ADDED
@@ -0,0 +1,61 @@
import torch
import torch.nn as nn
import numpy as np


class PQMF(nn.Module):
    def __init__(self, N, M, file_path="utils/pqmf_hk_4_64.dat"):
        super().__init__()
        self.N = N  # number of subbands
        self.M = M  # prototype filter length
        # Analysis filterbank: a strided Conv1d whose weights are loaded from
        # the precomputed prototype filter file.
        self.ana_conv_filter = nn.Conv1d(
            1, out_channels=N, kernel_size=M, stride=N, bias=False
        )
        data = np.reshape(np.fromfile(file_path, dtype=np.float32), (N, M))
        data = np.flipud(data.T).T
        gk = data.copy()
        data = np.reshape(data, (N, 1, M)).copy()
        dict_new = self.ana_conv_filter.state_dict().copy()
        dict_new["weight"] = torch.from_numpy(data)
        self.ana_pad = nn.ConstantPad1d((M - N, 0), 0)
        self.ana_conv_filter.load_state_dict(dict_new)

        # Synthesis filterbank, built from the same prototype. NOTE: the
        # reshape below hard-codes (4, 16, 4), so it only works for the
        # shipped N=4, M=64 filterbank.
        self.syn_pad = nn.ConstantPad1d((0, M // N - 1), 0)
        self.syn_conv_filter = nn.Conv1d(
            N, out_channels=N, kernel_size=M // N, stride=1, bias=False
        )
        gk = np.transpose(np.reshape(gk, (4, 16, 4)), (1, 0, 2)) * N
        gk = np.transpose(gk[::-1, :, :], (2, 1, 0)).copy()
        dict_new = self.syn_conv_filter.state_dict().copy()
        dict_new["weight"] = torch.from_numpy(gk)
        self.syn_conv_filter.load_state_dict(dict_new)

        # The filterbank is fixed; freeze all weights.
        for param in self.parameters():
            param.requires_grad = False

    def analysis(self, inputs):
        return self.ana_conv_filter(self.ana_pad(inputs))

    def synthesis(self, inputs):
        return self.syn_conv_filter(self.syn_pad(inputs))

    def forward(self, inputs):
        return self.ana_conv_filter(self.ana_pad(inputs))


if __name__ == "__main__":
    a = PQMF(4, 64)
    # x = np.load('data/train/audio/010000.npy')
    x = np.zeros([8, 24000], np.float32)
    x = np.reshape(x, (8, 1, -1))
    x = torch.from_numpy(x)
    b = a.analysis(x)
    c = a.synthesis(b)
    print(x.shape, b.shape, c.shape)
    b = (b * 32768).numpy()
    b = np.reshape(np.transpose(b, (0, 2, 1)), (-1, 1)).astype(np.int16)
    # b.tofile('1.pcm')
    # np.reshape(np.transpose(c.numpy()*32768, (0, 2, 1)), (-1,1)).astype(np.int16).tofile('2.pcm')
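
# Shape note (illustrative): for PQMF(4, 64), analysis() maps a waveform of
# shape (B, 1, T) to subband signals of shape (B, 4, T // 4); synthesis()
# applies the synthesis filters in the same subband domain, as the shape
# printout in the __main__ block above shows.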
voicefixer/vocoder/model/res_msd.py
ADDED
@@ -0,0 +1,71 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm

LRELU_SLOPE = 0.1


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


class ResStack(nn.Module):
    def __init__(self, channels=384, kernel_size=3, resstack_depth=3, hp=None):
        super(ResStack, self).__init__()
        dilation = [2 * i + 1 for i in range(resstack_depth)]  # [1, 3, 5] for depth 3
        self.convs1 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[i],
                        padding=get_padding(kernel_size, dilation[i]),
                    )
                )
                for i in range(resstack_depth)
            ]
        )
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=1,
                        padding=get_padding(kernel_size, 1),
                    )
                )
                for i in range(resstack_depth)
            ]
        )
        self.convs2.apply(init_weights)

    def forward(self, x):
        # Each block: LReLU -> dilated conv -> LReLU -> plain conv, with a
        # residual connection around the pair (HiFi-GAN-style ResBlock).
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)
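
# Quick shape check (illustrative sketch): the residual stack preserves the
# (B, C, T) shape of its input.
if __name__ == "__main__":
    stack = ResStack(channels=8, kernel_size=3, resstack_depth=3)
    x = torch.randn(2, 8, 50)
    print(stack(x).shape)  # torch.Size([2, 8, 50])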
voicefixer/vocoder/model/util.py
ADDED
@@ -0,0 +1,135 @@
from voicefixer.vocoder.config import Config
from voicefixer.tools.pytorch_util import try_tensor_cuda, check_cuda_availability
import torch
import librosa
import numpy as np


def tr_normalize(S):
    if Config.allow_clipping_in_normalization:
        if Config.symmetric_mels:
            return torch.clip(
                (2 * Config.max_abs_value) * ((S - Config.min_db) / (-Config.min_db))
                - Config.max_abs_value,
                -Config.max_abs_value,
                Config.max_abs_value,
            )
        else:
            return torch.clip(
                Config.max_abs_value * ((S - Config.min_db) / (-Config.min_db)),
                0,
                Config.max_abs_value,
            )

    assert S.max() <= 0 and S.min() - Config.min_db >= 0
    if Config.symmetric_mels:
        return (2 * Config.max_abs_value) * (
            (S - Config.min_db) / (-Config.min_db)
        ) - Config.max_abs_value
    else:
        return Config.max_abs_value * ((S - Config.min_db) / (-Config.min_db))


def tr_amp_to_db(x):
    min_level = torch.exp(Config.min_level_db / 20 * torch.log(torch.tensor(10.0)))
    min_level = min_level.type_as(x)
    return 20 * torch.log10(torch.maximum(min_level, x))


def normalize(S):
    # NumPy twin of tr_normalize(); keep the two implementations in sync.
    if Config.allow_clipping_in_normalization:
        if Config.symmetric_mels:
            return np.clip(
                (2 * Config.max_abs_value) * ((S - Config.min_db) / (-Config.min_db))
                - Config.max_abs_value,
                -Config.max_abs_value,
                Config.max_abs_value,
            )
        else:
            return np.clip(
                Config.max_abs_value * ((S - Config.min_db) / (-Config.min_db)),
                0,
                Config.max_abs_value,
            )

    assert S.max() <= 0 and S.min() - Config.min_db >= 0
    if Config.symmetric_mels:
        return (2 * Config.max_abs_value) * (
            (S - Config.min_db) / (-Config.min_db)
        ) - Config.max_abs_value
    else:
        return Config.max_abs_value * ((S - Config.min_db) / (-Config.min_db))


def amp_to_db(x):
    min_level = np.exp(Config.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

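
# Numeric sketch (hypothetical values, for intuition only): if
# Config.min_level_db were -100, amp_to_db() would clamp amplitudes below
# 10 ** (-100 / 20) = 1e-5, so x = 0.1 maps to -20 dB; normalize() then
# linearly rescales the [min_db, 0] dB range into the model's target range
# ([-max_abs_value, max_abs_value] when symmetric_mels is set).
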
def tr_pre(npy):
    # conditions = torch.FloatTensor(npy).type_as(npy)  # to(device)
    conditions = npy.transpose(1, 2)
    l = conditions.size(-1)
    # Pad the mel tail to an even length plus lookahead; -4.0 acts as a
    # silence floor in the normalized mel domain.
    pad_tail = l % 2 + 4
    zeros = (
        torch.zeros([conditions.size()[0], Config.num_mels, pad_tail]).type_as(
            conditions
        )
        + -4.0
    )
    return torch.cat([conditions, zeros], dim=-1)


def pre(npy):
    conditions = npy
    # Pad the tail, as in tr_pre(), after converting to a batched tensor.
    if type(conditions) == np.ndarray:
        conditions = torch.FloatTensor(conditions).unsqueeze(0)
    else:
        conditions = torch.FloatTensor(conditions.float()).unsqueeze(0)
    conditions = conditions.transpose(1, 2)
    l = conditions.size(-1)
    pad_tail = l % 2 + 4
    zeros = torch.zeros([1, Config.num_mels, pad_tail]) + -4.0
    return torch.cat([conditions, zeros], dim=-1)


def load_try(state, model):
    model_dict = model.state_dict()
    try:
        model_dict.update(state)
        model.load_state_dict(model_dict)
    except RuntimeError as e:
        # Fall back to assigning the checkpoint entries one by one so a
        # partially mismatched state dict can still be loaded.
        print(str(e))
        model_dict = model.state_dict()
        for k, v in state.items():
            model_dict[k] = v
        model.load_state_dict(model_dict)


def load_checkpoint(checkpoint_path, device):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    return checkpoint


def build_mel_basis():
    # Keyword arguments keep this call compatible with librosa >= 0.10,
    # where positional sr/n_fft were removed.
    return librosa.filters.mel(
        sr=Config.sample_rate,
        n_fft=Config.n_fft,
        htk=True,
        n_mels=Config.num_mels,
        fmin=0,
        fmax=int(Config.sample_rate // 2),
    )


def linear_to_mel(spectrogram):
    # Note: the mel basis is rebuilt on every call; cache it if this ends up
    # on a hot path.
    _mel_basis = build_mel_basis()
    return np.dot(_mel_basis, spectrogram)


if __name__ == "__main__":
    # The torch and numpy normalization paths should agree to numerical
    # precision.
    data = torch.randn((3, 5, 100))
    b = normalize(amp_to_db(data.numpy()))
    a = tr_normalize(tr_amp_to_db(data)).numpy()
    print(a - b)
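
    # Additional sanity check (illustrative): the mel filterbank from
    # build_mel_basis() should have shape (Config.num_mels, n_fft // 2 + 1).
    basis = build_mel_basis()
    print(basis.shape)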