Initial commit of Radio Upscaling UI (minus models)
- app.py +54 -0
- conf/experiment/aero_441-441_512_256.yaml +77 -0
- conf/experiment/seanet_441-441.yaml +46 -0
- html/directions.html +6 -0
- html/information.html +16 -0
- processAudio.py +87 -0
- requirements.txt +4 -0
- src/models/aero.py +524 -0
- src/models/modelFactory.py +15 -0
- src/models/modules.py +327 -0
- src/models/seanet.py +177 -0
- src/models/snake.py +66 -0
- src/models/spec.py +39 -0
- src/models/utils.py +50 -0
- src/utils.py +52 -0
app.py
ADDED
@@ -0,0 +1,54 @@
import gradio as gr
import torch
import numpy as np
from torchaudio.functional import resample

from processAudio import upscaleAudio

class Object(object):
    pass

def processAudio(model: gr.Dropdown, audioData: gr.Audio):
    if len(audioData[1].shape) == 1:  # Convert mono to stereo
        lrAudio = torch.tensor(np.array([
            audioData[1].copy().astype(np.float32) / 32768,
            audioData[1].copy().astype(np.float32) / 32768
        ]))
    elif audioData[1].shape[1] > 2:
        raise gr.Error("Audio with more than 2 channels is not supported.")
    else:  # Re-order channel data from [samples, 2] to [2, samples]
        lrAudio = torch.tensor(audioData[1].copy().astype(np.float32) / 32768).transpose(0, 1)
    if audioData[0] != 44100:
        lrAudio = resample(lrAudio, audioData[0], 44100)
    hrAudio = upscaleAudio(lrAudio, "models/" + model)
    hrAudio = hrAudio / max(hrAudio.abs().max().item(), 1)
    outAudio = (hrAudio * 32767).numpy().astype(np.int16).transpose(1, 0)
    return (44100, outAudio)

with gr.Blocks(theme=gr.themes.Default().set(body_background_fill="#CCEEFF")) as layout:
    with gr.Row():
        gr.Markdown("<h2>Broadcast Audio Upscaler</h2>")
    with gr.Row():
        with open("html/directions.html", "r") as directionsHtml:
            gr.Markdown(directionsHtml.read())
    with gr.Row():
        modelSelect = gr.Dropdown(
            [
                ["FM Radio Super Resolution", "FM_Radio_SR.th"],
                ["AM Radio Super Resolution (Beta)", "AM_Radio_SR.th"]
            ],
            label="Select Model:",
            value="FM_Radio_SR.th",
        )
    with gr.Row():
        with gr.Column():
            audioFileSelect = gr.Audio(label="Audio File (Mono or Stereo):", sources="upload")
        with gr.Column():
            audioOutput = gr.Audio(show_download_button=True, label="Restored Audio:", sources=[])
    with gr.Row():
        submit = gr.Button("Process Audio", variant="primary").click(fn=processAudio, inputs=[modelSelect, audioFileSelect], outputs=audioOutput)
    with gr.Row():
        with gr.Accordion("More Information:", open=False):
            with open("html/information.html", "r") as informationHtml:
                gr.Markdown(informationHtml.read())
layout.launch()
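Aside (not part of the commit): a minimal sketch of the (sample_rate, ndarray) tuple that gr.Audio hands to processAudio, using a hypothetical one-second mono test tone. processAudio duplicates the mono channel, rescales int16 to float32 in [-1, 1), and resamples to 44.1 kHz before calling upscaleAudio.

import numpy as np

sr = 22050
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
audioData = (sr, tone)  # the format gr.Audio(sources="upload") passes to the callback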
conf/experiment/aero_441-441_512_256.yaml
ADDED
@@ -0,0 +1,77 @@
# @package experiment
name: aero-nfft=${experiment.nfft}-hl=${experiment.hop_length}

# Dataset related
lr_sr: 44100 # low resolution sample rate, added to support BWE. Should be included in training cfg
hr_sr: 44100 # high resolution sample rate. Should be included in training cfg
segment: 2
stride: 4 # in seconds, how much to stride between training examples
pad: true # if training sample is too short, pad it
upsample: false
batch_size: 6
nfft: 512
hop_length: 256

# models related
model: aero
aero: # see aero.py for a detailed description
  in_channels: 2
  out_channels: 2
  # Channels
  channels: 64 # Small: 48, Large: 64
  growth: 2
  # STFT
  nfft: 512
  hop_length: 256
  end_iters: 0
  cac: true
  # Main structure
  rewrite: true
  hybrid: false
  hybrid_old: false
  # Frequency branch
  freq_emb: 0.2
  emb_scale: 10
  emb_smooth: true
  # Convolutions
  kernel_size: 8
  strides: [4, 4, 2, 2]
  context: 1
  context_enc: 0
  freq_ends: 4
  enc_freq_attn: 0
  # Normalization
  norm_starts: 2
  norm_groups: 4
  # DConv residual branch
  dconv_mode: 1
  dconv_depth: 2
  dconv_comp: 4
  dconv_time_attn: 2
  dconv_lstm: 2
  dconv_init: 0.001
  # Weight init
  rescale: 0.1
  lr_sr: 44100
  hr_sr: 44100
  spec_upsample: true
  act_func: snake
  debug: false

adversarial: True
features_loss_lambda: 100
only_features_loss: False
only_adversarial_loss: False
discriminator_models: [msd, mpd] # msd_melgan/msd_hifi/mpd/hifi
melgan_discriminator:
  n_layers: 5
  num_D: 3
  downsampling_factor: 1
  ndf: 16
  channels: 2
msd:
  hidden: 32 # Small: 16, Large: 32
  channels: 2
mpd:
  hidden: 32 # Small: 16, Large: 32
  channels: 2
conf/experiment/seanet_441-441.yaml
ADDED
@@ -0,0 +1,46 @@
# @package experiment
name: seanet-nfft=${experiment.nfft}-hl=${experiment.hop_length}

# Dataset related
lr_sr: 44100 # low resolution sample rate, added to support BWE. Should be included in training cfg
hr_sr: 44100 # high resolution sample rate. Should be included in training cfg
segment: 2
stride: 2 # in seconds, how much to stride between training examples
pad: true # if training sample is too short, pad it
upsample: false
batch_size: 5
nfft: 1024
hop_length: 512

# models related
model: seanet
seanet:
  latent_space_size: 256
  ngf: 32
  n_residual_layers: 4
  resample: 1
  normalize: False
  floor: 0.0005
  ratios: [8, 8, 2, 2]
  lr_sr: 44100
  hr_sr: 44100
  in_channels: 2
  out_channels: 2

adversarial: True
features_loss_lambda: 100
discriminator_models: [msd, mpd] # msd_melgan/msd_hifi/mpd/hifi
only_features_loss: False
only_adversarial_loss: False
msd:
  hidden: 16
  channels: 2
mpd:
  hidden: 16
  channels: 2
melgan_discriminator:
  n_layers: 5
  num_D: 3
  downsampling_factor: 1
  ndf: 16
  channels: 2
html/directions.html
ADDED
@@ -0,0 +1,6 @@
<h3>Directions:</h3>
<ol>
<li>Select a model from the model drop-down.</li>
<li>Select an audio file.</li>
<li>After the file loads, press "Process Audio" to process the file.</li>
</ol>
html/information.html
ADDED
@@ -0,0 +1,16 @@
<p>This tool is based on the "aero" audio upscaling project (<a href="https://github.com/slp-rl/aero">original version</a>, <a href="https://github.com/pokepress/aero">modified version used here</a>). It takes digitized radio recordings and attempts to reduce noise/interference while recreating lost frequencies.</p>

<B>The FM model should be able to:</B>
<ul>
<li>Reduce stereo hiss</li>
<li>Reduce certain kinds of static/interference</li>
<li>Reduce the 19 kHz stereo pilot tone</li>
<li>Reduce overly warm bass frequencies (boost them 3 or so dB if you miss them)</li>
<li>Restore lost high &amp; low frequencies (generally only up to 16 kHz, since most training data is MP3-based)</li>
</ul>

<B>The AM model should be able to:</B>
<ul>
<li>Reduce certain kinds of static/interference</li>
<li>Restore lost high &amp; low frequencies (capacity tends to diminish above 10 kHz)</li>
</ul>
processAudio.py
ADDED
@@ -0,0 +1,87 @@
import math
import os

import time

import torch
import logging
from pathlib import Path

import torchaudio
from torchaudio.functional import resample
from gradio import Progress

from src.models import modelFactory
from src.utils import bold

logger = logging.getLogger(__name__)

SEGMENT_DURATION_SEC = 5
SEGMENT_OVERLAP_SAMPLES = 2048
SERIALIZE_KEY_STATE = 'state'

def _load_model(checkpoint_file="models/FM_Radio_SR.th", model_name="aero"):
    checkpoint_file = Path(checkpoint_file)
    model = modelFactory.get_model(model_name)['generator']
    package = torch.load(checkpoint_file, 'cpu')
    if SERIALIZE_KEY_STATE in package:  # raw model file
        logger.info(bold(f'Loading model {model_name} from file.'))
        model.load_state_dict(package[SERIALIZE_KEY_STATE])

    return model

def crossfade_and_blend(out_clip, in_clip):
    fade_out = torchaudio.transforms.Fade(0, SEGMENT_OVERLAP_SAMPLES)
    fade_in = torchaudio.transforms.Fade(SEGMENT_OVERLAP_SAMPLES, 0)
    return fade_out(out_clip) + fade_in(in_clip)

def upscaleAudio(lr_sig, checkpoint_file: str, sr=44100, hr_sr=44100, model_name="aero", progress=Progress()):

    model = _load_model(checkpoint_file, model_name)
    device = torch.device('cpu')
    # model.cuda()

    logger.info(f'lr wav shape: {lr_sig.shape}')

    segment_duration_samples = sr * SEGMENT_DURATION_SEC
    n_chunks = math.ceil(lr_sig.shape[-1] / segment_duration_samples)
    logger.info(f'number of chunks: {n_chunks}')

    lr_chunks = []
    for i in range(n_chunks):
        start = i * segment_duration_samples
        end = min((i + 1) * segment_duration_samples, lr_sig.shape[-1])
        lr_chunks.append(lr_sig[:, start:end])

    pr_chunks = []

    lr_segment_overlap_samples = int((sr / hr_sr) * SEGMENT_OVERLAP_SAMPLES)

    model.eval()
    pred_start = time.time()
    with torch.no_grad():
        previous_chunk = None
        progress(0)
        for i, lr_chunk in enumerate(lr_chunks):
            progress(i / n_chunks)
            pr_chunk = None
            if previous_chunk is not None:
                combined_chunk = torch.cat((previous_chunk[..., -lr_segment_overlap_samples:], lr_chunk), 1)
                pr_combined_chunk = model(combined_chunk.unsqueeze(0).to(device)).squeeze(0)
                pr_chunk = pr_combined_chunk[..., SEGMENT_OVERLAP_SAMPLES:]
                pr_chunks[-1][..., -SEGMENT_OVERLAP_SAMPLES:] = crossfade_and_blend(pr_chunks[-1][..., -SEGMENT_OVERLAP_SAMPLES:], pr_combined_chunk.cpu()[..., :SEGMENT_OVERLAP_SAMPLES])
            else:
                pr_chunk = model(lr_chunk.unsqueeze(0).to(device)).squeeze(0)
            logger.info(f'lr chunk {i} shape: {lr_chunk.shape}')
            logger.info(f'pr chunk {i} shape: {pr_chunk.shape}')
            pr_chunks.append(pr_chunk.cpu())
            previous_chunk = lr_chunk

    pred_duration = time.time() - pred_start
    logger.info(f'prediction duration: {pred_duration}')

    pr = torch.concat(pr_chunks, dim=-1)

    logger.info(f'pr wav shape: {pr.shape}')

    return pr
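Aside (not part of the commit): a minimal sketch of calling upscaleAudio directly on a file, assuming a checkpoint exists at models/FM_Radio_SR.th and that "input.wav" (an illustrative name) is already 44.1 kHz stereo.

import torchaudio
from processAudio import upscaleAudio

wav, sr = torchaudio.load("input.wav")   # wav: [channels, samples], float32 in [-1, 1]
assert sr == 44100                       # upscaleAudio's sr default; resample first otherwise
restored = upscaleAudio(wav, "models/FM_Radio_SR.th")
restored = restored / max(restored.abs().max().item(), 1)  # avoid clipping, as app.py does
torchaudio.save("restored.wav", restored, 44100)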
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch==2.2.1
torchvision==0.17.2
torchaudio==2.2.1
opencv-python
src/models/aero.py
ADDED
@@ -0,0 +1,524 @@
"""
This code is based on Facebook's HDemucs code: https://github.com/facebookresearch/demucs
"""
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from src.models.utils import capture_init
from src.models.spec import spectro, ispectro
from src.models.modules import DConv, ScaledEmbedding, FTB

import logging
logger = logging.getLogger(__name__)


def rescale_conv(conv, reference):
    std = conv.weight.std().detach()
    scale = (std / reference) ** 0.5
    conv.weight.data /= scale
    if conv.bias is not None:
        conv.bias.data /= scale


def rescale_module(module, reference):
    for sub in module.modules():
        if isinstance(sub, (nn.Conv1d, nn.ConvTranspose1d)):
            rescale_conv(sub, reference)


class HEncLayer(nn.Module):
    def __init__(self, chin, chout, kernel_size=8, stride=4, norm_groups=1, empty=False,
                 freq=True, dconv=True, is_first=False, freq_attn=False, freq_dim=None, norm=True, context=0,
                 dconv_kw={}, pad=True,
                 rewrite=True):
        """Encoder layer. This is used by both the time and the frequency branch.

        Args:
            chin: number of input channels.
            chout: number of output channels.
            norm_groups: number of groups for group norm.
            empty: used to make a layer with just the first conv. This is used
                before merging the time and freq. branches.
            freq: this is acting on frequencies.
            dconv: insert DConv residual branches.
            norm: use GroupNorm.
            context: context size for the 1x1 conv.
            dconv_kw: dict of kwargs for the DConv class.
            pad: pad the input. Padding is done so that the output size is
                always the input size / stride.
            rewrite: add a 1x1 conv at the end of the layer.
        """
        super().__init__()
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        if stride == 1 and kernel_size % 2 == 0 and kernel_size > 1:
            kernel_size -= 1
        if pad:
            pad = (kernel_size - stride) // 2
        else:
            pad = 0
        klass = nn.Conv2d
        self.chin = chin
        self.chout = chout
        self.freq = freq
        self.kernel_size = kernel_size
        self.stride = stride
        self.empty = empty
        self.freq_attn = freq_attn
        self.freq_dim = freq_dim
        self.norm = norm
        self.pad = pad
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
            if pad != 0:
                pad = [pad, 0]
            # klass = nn.Conv2d
        else:
            kernel_size = [1, kernel_size]
            stride = [1, stride]
            if pad != 0:
                pad = [0, pad]

        self.is_first = is_first

        if is_first:
            self.pre_conv = nn.Conv2d(chin, chout, [1, 1])
            chin = chout

        if self.freq_attn:
            self.freq_attn_block = FTB(input_dim=freq_dim, in_channel=chin)

        self.conv = klass(chin, chout, kernel_size, stride, pad)
        if self.empty:
            return
        self.norm1 = norm_fn(chout)
        self.rewrite = None
        if rewrite:
            self.rewrite = klass(chout, 2 * chout, 1 + 2 * context, 1, context)
            self.norm2 = norm_fn(2 * chout)

        self.dconv = None
        if dconv:
            self.dconv = DConv(chout, **dconv_kw)

    def forward(self, x, inject=None):
        """
        `inject` is used to inject the result from the time branch into the frequency branch,
        when both have the same stride.
        """

        if not self.freq:
            le = x.shape[-1]
            if not le % self.stride == 0:
                x = F.pad(x, (0, self.stride - (le % self.stride)))

        if self.is_first:
            x = self.pre_conv(x)

        if self.freq_attn:
            x = self.freq_attn_block(x)

        x = self.conv(x)

        x = F.gelu(self.norm1(x))
        if self.dconv:
            x = self.dconv(x)

        if self.rewrite:
            x = self.norm2(self.rewrite(x))
            x = F.glu(x, dim=1)

        return x


class HDecLayer(nn.Module):
    def __init__(self, chin, chout, last=False, kernel_size=8, stride=4, norm_groups=1, empty=False,
                 freq=True, dconv=True, norm=True, context=1, dconv_kw={}, pad=True,
                 context_freq=True, rewrite=True):
        """
        Same as HEncLayer but for the decoder. See `HEncLayer` for documentation.
        """
        super().__init__()
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(norm_groups, d)  # noqa
        if stride == 1 and kernel_size % 2 == 0 and kernel_size > 1:
            kernel_size -= 1
        if pad:
            pad = (kernel_size - stride) // 2
        else:
            pad = 0
        self.pad = pad
        self.last = last
        self.freq = freq
        self.chin = chin
        self.empty = empty
        self.stride = stride
        self.kernel_size = kernel_size
        self.norm = norm
        self.context_freq = context_freq
        klass = nn.Conv2d
        klass_tr = nn.ConvTranspose2d
        if freq:
            kernel_size = [kernel_size, 1]
            stride = [stride, 1]
        else:
            kernel_size = [1, kernel_size]
            stride = [1, stride]
        self.conv_tr = klass_tr(chin, chout, kernel_size, stride)
        self.norm2 = norm_fn(chout)
        if self.empty:
            return
        self.rewrite = None
        if rewrite:
            if context_freq:
                self.rewrite = klass(chin, 2 * chin, 1 + 2 * context, 1, context)
            else:
                self.rewrite = klass(chin, 2 * chin, [1, 1 + 2 * context], 1,
                                     [0, context])
            self.norm1 = norm_fn(2 * chin)

        self.dconv = None
        if dconv:
            self.dconv = DConv(chin, **dconv_kw)

    def forward(self, x, skip, length):
        if self.freq and x.dim() == 3:
            B, C, T = x.shape
            x = x.view(B, self.chin, -1, T)

        if not self.empty:
            x = torch.cat([x, skip], dim=1)

            if self.rewrite:
                y = F.glu(self.norm1(self.rewrite(x)), dim=1)
            else:
                y = x
            if self.dconv:
                y = self.dconv(y)
        else:
            y = x
            assert skip is None
        z = self.norm2(self.conv_tr(y))
        if self.freq:
            if self.pad:
                z = z[..., self.pad:-self.pad, :]
        else:
            z = z[..., self.pad:self.pad + length]
            assert z.shape[-1] == length, (z.shape[-1], length)
        if not self.last:
            z = F.gelu(z)
        return z


class Aero(nn.Module):
    """
    Deep model for Audio Super Resolution.
    """

    @capture_init
    def __init__(self,
                 # Channels
                 in_channels=1,
                 out_channels=1,
                 audio_channels=2,
                 channels=48,
                 growth=2,
                 # STFT
                 nfft=512,
                 hop_length=64,
                 end_iters=0,
                 cac=True,
                 # Main structure
                 rewrite=True,
                 hybrid=False,
                 hybrid_old=False,
                 # Frequency branch
                 freq_emb=0.2,
                 emb_scale=10,
                 emb_smooth=True,
                 # Convolutions
                 kernel_size=8,
                 strides=[4, 4, 2, 2],
                 context=1,
                 context_enc=0,
                 freq_ends=4,
                 enc_freq_attn=4,
                 # Normalization
                 norm_starts=2,
                 norm_groups=4,
                 # DConv residual branch
                 dconv_mode=1,
                 dconv_depth=2,
                 dconv_comp=4,
                 dconv_time_attn=2,
                 dconv_lstm=2,
                 dconv_init=1e-3,
                 # Weight init
                 rescale=0.1,
                 # Metadata
                 lr_sr=4000,
                 hr_sr=16000,
                 spec_upsample=True,
                 act_func='snake',
                 debug=False):
        """
        Args:
            sources (list[str]): list of source names.
            audio_channels (int): input/output audio channels.
            channels (int): initial number of hidden channels.
            growth: increase the number of hidden channels by this factor at each layer.
            nfft: number of fft bins. Note that changing this requires careful computation of
                various shape parameters and will not work out of the box for hybrid models.
            end_iters: same but at train time. For a hybrid model, must be equal to `wiener_iters`.
            cac: uses complex as channels, i.e. complex numbers are 2 channels each
                in input and output. No further processing is done before the ISTFT.
            depth (int): number of layers in the encoder and in the decoder.
            rewrite (bool): add a 1x1 convolution to each layer.
            hybrid (bool): make a hybrid time/frequency domain model, otherwise frequency only.
            hybrid_old: some models trained for MDX had a padding bug. This replicates
                that bug to avoid retraining them.
            freq_emb: add a frequency embedding after the first frequency layer if > 0;
                the actual value controls the weight of the embedding.
            emb_scale: equivalent to scaling the embedding learning rate.
            emb_smooth: initialize the embedding with a smooth one (with respect to frequencies).
            kernel_size: kernel_size for encoder and decoder layers.
            stride: stride for encoder and decoder layers.
            context: context for the 1x1 conv in the decoder.
            context_enc: context for the 1x1 conv in the encoder.
            norm_starts: layer at which group norm starts being used;
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of the residual DConv branch.
            dconv_comp: compression of the DConv branch.
            dconv_freq_attn: adds freq attention layers in the DConv branch starting at this layer.
            dconv_time_attn: adds time attention layers in the DConv branch starting at this layer.
            dconv_lstm: adds an LSTM layer in the DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            rescale: weight rescaling trick.
            lr_sr: source low-resolution sample rate.
            hr_sr: target high-resolution sample rate.
            spec_upsample: if true, upsamples in the spectral domain, otherwise performs sinc interpolation beforehand.
            act_func: 'snake'/'relu'
            debug: if true, prints out input dimensions throughout model layers.
        """
        super().__init__()
        self.cac = cac
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.audio_channels = audio_channels
        self.kernel_size = kernel_size
        self.context = context
        self.strides = strides
        self.depth = len(strides)
        self.channels = channels
        self.lr_sr = lr_sr
        self.hr_sr = hr_sr
        self.spec_upsample = spec_upsample

        self.scale = hr_sr / lr_sr if self.spec_upsample else 1

        self.nfft = nfft
        self.hop_length = int(hop_length // self.scale)  # this is for the input signal
        self.win_length = int(self.nfft // self.scale)  # this is for the input signal
        self.end_iters = end_iters
        self.freq_emb = None
        self.hybrid = hybrid
        self.hybrid_old = hybrid_old
        self.debug = debug

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        chin_z = self.in_channels
        if self.cac:
            chin_z *= 2
        chout_z = channels
        freqs = nfft // 2

        for index in range(self.depth):
            freq_attn = index >= enc_freq_attn
            lstm = index >= dconv_lstm
            time_attn = index >= dconv_time_attn
            norm = index >= norm_starts
            freq = index <= freq_ends
            stri = strides[index]
            ker = kernel_size

            pad = True
            if freq and freqs < kernel_size:
                ker = freqs

            kw = {
                'kernel_size': ker,
                'stride': stri,
                'freq': freq,
                'pad': pad,
                'norm': norm,
                'rewrite': rewrite,
                'norm_groups': norm_groups,
                'dconv_kw': {
                    'lstm': lstm,
                    'time_attn': time_attn,
                    'depth': dconv_depth,
                    'compress': dconv_comp,
                    'init': dconv_init,
                    'act_func': act_func,
                    'reshape': True,
                    'freq_dim': freqs // strides[index] if freq else freqs
                }
            }

            kw_dec = dict(kw)

            enc = HEncLayer(chin_z, chout_z,
                            dconv=dconv_mode & 1, context=context_enc,
                            is_first=index == 0, freq_attn=freq_attn, freq_dim=freqs,
                            **kw)

            self.encoder.append(enc)
            if index == 0:
                chin = self.out_channels
                chin_z = chin
                if self.cac:
                    chin_z *= 2
            dec = HDecLayer(2 * chout_z, chin_z, dconv=dconv_mode & 2,
                            last=index == 0, context=context, **kw_dec)

            self.decoder.insert(0, dec)

            chin_z = chout_z
            chout_z = int(growth * chout_z)

            if freq:
                freqs //= strides[index]

            if index == 0 and freq_emb:
                self.freq_emb = ScaledEmbedding(
                    freqs, chin_z, smooth=emb_smooth, scale=emb_scale)
                self.freq_emb_scale = freq_emb

        if rescale:
            rescale_module(self, reference=rescale)

    def _spec(self, x, scale=False):
        if np.mod(x.shape[-1], self.hop_length):
            x = F.pad(x, (0, self.hop_length - np.mod(x.shape[-1], self.hop_length)))
        hl = self.hop_length
        nfft = self.nfft
        win_length = self.win_length

        if scale:
            hl = int(hl * self.scale)
            win_length = int(win_length * self.scale)

        z = spectro(x, nfft, hl, win_length=win_length)[..., :-1, :]
        return z

    def _ispec(self, z):
        hl = int(self.hop_length * self.scale)
        win_length = int(self.win_length * self.scale)
        z = F.pad(z, (0, 0, 0, 1))
        x = ispectro(z, hl, win_length=win_length)
        return x

    def _move_complex_to_channels_dim(self, z):
        B, C, Fr, T = z.shape
        m = torch.view_as_real(z).permute(0, 1, 4, 2, 3)
        m = m.reshape(B, C * 2, Fr, T)
        return m

    def _convert_to_complex(self, x):
        """
        :param x: signal of shape [Batch, Channels, 2, Freq, TimeFrames]
        :return: complex signal of shape [Batch, Channels, Freq, TimeFrames]
        """
        out = x.permute(0, 1, 3, 4, 2)
        out = torch.view_as_complex(out.contiguous())
        return out

    def forward(self, mix, return_spec=False, return_lr_spec=False):
        x = mix
        length = x.shape[-1]

        if self.debug:
            logger.info(f'hdemucs in shape: {x.shape}')

        z = self._spec(x)
        x = self._move_complex_to_channels_dim(z)

        if self.debug:
            logger.info(f'x spec shape: {x.shape}')

        B, C, Fq, T = x.shape

        # unlike previous Demucs, we always normalize because it is easier.
        mean = x.mean(dim=(1, 2, 3), keepdim=True)
        std = x.std(dim=(1, 2, 3), keepdim=True)
        x = (x - mean) / (1e-5 + std)

        # okay, this is a giant mess I know...
        saved = []  # skip connections, freq.
        lengths = []  # saved lengths to properly remove padding, freq branch.
        for idx, encode in enumerate(self.encoder):
            lengths.append(x.shape[-1])
            inject = None
            x = encode(x, inject)
            if self.debug:
                logger.info(f'encoder {idx} out shape: {x.shape}')
            if idx == 0 and self.freq_emb is not None:
                # add frequency embedding to allow for non equivariant convolutions
                # over the frequency axis.
                frs = torch.arange(x.shape[-2], device=x.device)
                emb = self.freq_emb(frs).t()[None, :, :, None].expand_as(x)
                x = x + self.freq_emb_scale * emb

            saved.append(x)

        x = torch.zeros_like(x)
        # initialize everything to zero (signal will go through u-net skips).

        for idx, decode in enumerate(self.decoder):
            skip = saved.pop(-1)
            x = decode(x, skip, lengths.pop(-1))

            if self.debug:
                logger.info(f'decoder {idx} out shape: {x.shape}')

        # Let's make sure we used all stored skip connections.
        assert len(saved) == 0

        x = x.view(B, self.out_channels, -1, Fq, T)
        x = x * std[:, None] + mean[:, None]

        if self.debug:
            logger.info(f'post view shape: {x.shape}')

        x_spec_complex = self._convert_to_complex(x)

        if self.debug:
            logger.info(f'x_spec_complex shape: {x_spec_complex.shape}')

        x = self._ispec(x_spec_complex)

        if self.debug:
            logger.info(f'hdemucs out shape: {x.shape}')

        x = x[..., :int(length * self.scale)]

        if self.debug:
            logger.info(f'hdemucs out - trimmed shape: {x.shape}')

        if return_spec:
            if return_lr_spec:
                return x, x_spec_complex, z
            else:
                return x, x_spec_complex

        return x
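Aside (not part of the commit): a sketch of the "complex as channels" (cac) representation that _move_complex_to_channels_dim and _convert_to_complex implement. Real and imaginary parts become two extra channels, and the round trip is exact.

import torch

z = torch.randn(1, 2, 256, 100, dtype=torch.complex64)  # [B, C, Freq, T]
m = torch.view_as_real(z).permute(0, 1, 4, 2, 3).reshape(1, 4, 256, 100)  # real/imag as channels
back = torch.view_as_complex(
    m.reshape(1, 2, 2, 256, 100).permute(0, 1, 3, 4, 2).contiguous())
assert torch.equal(z, back)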
src/models/modelFactory.py
ADDED
@@ -0,0 +1,15 @@
from src.models.aero import Aero
from src.models.seanet import Seanet
from yaml import safe_load

def get_model(model_name="aero"):
    if model_name == 'aero':
        with open("conf/experiment/aero_441-441_512_256.yaml") as f:
            generator = Aero(**safe_load(f)["aero"])
    elif model_name == 'seanet':
        with open("conf/experiment/seanet_441-441.yaml") as f:
            generator = Seanet(**safe_load(f)["seanet"])
    else:
        # Guard against silently falling through with an undefined generator.
        raise ValueError(f"Unknown model name: {model_name}")

    models = {'generator': generator}

    return models
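Aside (not part of the commit): a sketch of pulling the generator and running a dummy clip, assuming the conf/ directory is reachable from the working directory. The output length matches the input because lr_sr == hr_sr in the config (scale == 1).

import torch
from src.models.modelFactory import get_model

model = get_model("aero")["generator"]   # built from conf/experiment/aero_441-441_512_256.yaml
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 2, 44100))  # one second of stereo at 44.1 kHz
print(out.shape)  # expected torch.Size([1, 2, 44100])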
src/models/modules.py
ADDED
@@ -0,0 +1,327 @@
import torch
from torch import nn
from torch.nn.utils.parametrizations import weight_norm

import math

from src.models.snake import Snake
from src.models.utils import unfold

import typing as tp

def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))

class BLSTM(nn.Module):
    """
    BiLSTM with the same number of hidden units as the input dim.
    If `max_steps` is not None, the input will be split into overlapping
    chunks and the LSTM applied separately on each chunk.
    """

    def __init__(self, dim, layers=1, max_steps=None, skip=False):
        super().__init__()
        assert max_steps is None or max_steps % 4 == 0
        self.max_steps = max_steps
        self.lstm = nn.LSTM(bidirectional=True, num_layers=layers, hidden_size=dim, input_size=dim)
        self.linear = nn.Linear(2 * dim, dim)
        self.skip = skip

    def forward(self, x):
        B, C, T = x.shape
        y = x
        framed = False
        if self.max_steps is not None and T > self.max_steps:
            width = self.max_steps
            stride = width // 2
            frames = unfold(x, width, stride)
            nframes = frames.shape[2]
            framed = True
            x = frames.permute(0, 2, 1, 3).reshape(-1, C, width)

        x = x.permute(2, 0, 1)

        x = self.lstm(x)[0]
        x = self.linear(x)
        x = x.permute(1, 2, 0)
        if framed:
            out = []
            frames = x.reshape(B, -1, C, width)
            limit = stride // 2
            for k in range(nframes):
                if k == 0:
                    out.append(frames[:, k, :, :-limit])
                elif k == nframes - 1:
                    out.append(frames[:, k, :, limit:])
                else:
                    out.append(frames[:, k, :, limit:-limit])
            out = torch.cat(out, -1)
            out = out[..., :T]
            x = out
        if self.skip:
            x = x + y
        return x


class LocalState(nn.Module):
    """Local state allows attention based only on data (no positional embedding),
    while setting a constraint on the time window (e.g. decaying penalty term).

    Also a failed experiment with trying to provide some frequency-based attention.
    """

    def __init__(self, channels: int, heads: int = 4, nfreqs: int = 0, ndecay: int = 4):
        super().__init__()
        assert channels % heads == 0, (channels, heads)
        self.heads = heads
        self.nfreqs = nfreqs
        self.ndecay = ndecay
        self.content = nn.Conv1d(channels, channels, 1)
        self.query = nn.Conv1d(channels, channels, 1)
        self.key = nn.Conv1d(channels, channels, 1)
        if nfreqs:
            self.query_freqs = nn.Conv1d(channels, heads * nfreqs, 1)
        if ndecay:
            self.query_decay = nn.Conv1d(channels, heads * ndecay, 1)
            # Initialize decay close to zero (there is a sigmoid), for maximum initial window.
            self.query_decay.weight.data *= 0.01
            assert self.query_decay.bias is not None  # stupid type checker
            self.query_decay.bias.data[:] = -2
        # self.proj = nn.Conv1d(channels + heads * nfreqs, channels, 1)
        self.proj = nn.Conv1d(channels, channels, 1)

    def forward(self, x):
        B, C, T = x.shape
        heads = self.heads
        indexes = torch.arange(T, device=x.device, dtype=x.dtype)
        # left index are keys, right index are queries
        delta = indexes[:, None] - indexes[None, :]

        queries = self.query(x).view(B, heads, -1, T)
        keys = self.key(x).view(B, heads, -1, T)
        # t are keys, s are queries
        dots = torch.einsum("bhct,bhcs->bhts", keys, queries)
        dots /= keys.shape[2] ** 0.5
        if self.nfreqs:
            periods = torch.arange(1, self.nfreqs + 1, device=x.device, dtype=x.dtype)
            freq_kernel = torch.cos(2 * math.pi * delta / periods.view(-1, 1, 1))
            freq_q = self.query_freqs(x).view(B, heads, -1, T) / self.nfreqs ** 0.5
            tmp = torch.einsum("fts,bhfs->bhts", freq_kernel, freq_q)
            dots += tmp
        if self.ndecay:
            decays = torch.arange(1, self.ndecay + 1, device=x.device, dtype=x.dtype)
            decay_q = self.query_decay(x).view(B, heads, -1, T)
            decay_q = torch.sigmoid(decay_q) / 2
            decay_kernel = - decays.view(-1, 1, 1) * delta.abs() / self.ndecay ** 0.5
            dots += torch.einsum("fts,bhfs->bhts", decay_kernel, decay_q)

        # Kill self reference.
        dots.masked_fill_(torch.eye(T, device=dots.device, dtype=torch.bool), -100)
        weights = torch.softmax(dots, dim=2)

        content = self.content(x).view(B, heads, -1, T)
        result = torch.einsum("bhts,bhct->bhcs", weights, content)

        result = result.reshape(B, -1, T)
        return x + self.proj(result)


class LayerScale(nn.Module):
    """Layer scale from [Touvron et al 2021] (https://arxiv.org/pdf/2103.17239.pdf).
    This rescales residual outputs diagonally, close to 0 initially, then learnt.
    """

    def __init__(self, channels: int, init: float = 0):
        super().__init__()
        self.scale = nn.Parameter(torch.zeros(channels, requires_grad=True))
        self.scale.data[:] = init

    def forward(self, x):
        return self.scale[:, None] * x


class DConv(nn.Module):
    """
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Before entering each residual branch, the dimension is also projected onto a smaller subspace,
    e.g. of dim `channels // compress`.
    """

    def __init__(self, channels: int, compress: float = 4, depth: int = 2, init: float = 1e-4,
                 norm=True, time_attn=False, heads=4, ndecay=4, lstm=False,
                 act_func='gelu', freq_dim=None, reshape=False,
                 kernel=3, dilate=True):
        """
        Args:
            channels: input/output channels for the residual branch.
            compress: amount of channel compression inside the branch.
            depth: number of layers in the residual branch. Each layer has its own
                projection, and potentially LSTM and attention.
            init: initial scale for LayerScale.
            norm: use GroupNorm.
            time_attn: use LocalState attention.
            heads: number of heads for the LocalState attention.
            ndecay: number of decay controls in the LocalState attention.
            lstm: use LSTM.
            act_func: activation function ('gelu'/'snake'/'relu').
            kernel: kernel size for the (dilated) convolutions.
            dilate: if true, use dilation, increasing with the depth.
        """

        super().__init__()
        assert kernel % 2 == 1
        self.channels = channels
        self.compress = compress
        self.depth = abs(depth)
        dilate = depth > 0

        self.time_attn = time_attn
        self.lstm = lstm
        self.reshape = reshape
        self.act_func = act_func
        self.freq_dim = freq_dim

        norm_fn: tp.Callable[[int], nn.Module]
        norm_fn = lambda d: nn.Identity()  # noqa
        if norm:
            norm_fn = lambda d: nn.GroupNorm(1, d)  # noqa

        self.hidden = int(channels / compress)

        act: tp.Type[nn.Module]
        if act_func == 'gelu':
            act = nn.GELU
        elif act_func == 'snake':
            act = Snake
        else:
            act = nn.ReLU

        self.layers = nn.ModuleList([])
        for d in range(self.depth):
            layer = nn.ModuleDict()
            dilation = 2 ** d if dilate else 1
            padding = dilation * (kernel // 2)
            conv1 = nn.ModuleList([nn.Conv1d(channels, self.hidden, kernel, dilation=dilation, padding=padding),
                                   norm_fn(self.hidden)])
            act_layer = act(freq_dim) if act_func == 'snake' else act()
            conv2 = nn.ModuleList([nn.Conv1d(self.hidden, 2 * channels, 1),
                                   norm_fn(2 * channels), nn.GLU(1),
                                   LayerScale(channels, init)])

            layer.update({'conv1': nn.Sequential(*conv1), 'act': act_layer, 'conv2': nn.Sequential(*conv2)})
            if lstm:
                layer.update({'lstm': BLSTM(self.hidden, layers=2, max_steps=200, skip=True)})
            if time_attn:
                layer.update({'time_attn': LocalState(self.hidden, heads=heads, ndecay=ndecay)})

            self.layers.append(layer)

    def forward(self, x):

        if self.reshape:
            B, C, Fr, T = x.shape
            x = x.permute(0, 2, 1, 3).reshape(-1, C, T)

        for layer in self.layers:
            skip = x

            x = layer['conv1'](x)

            if self.act_func == 'snake' and self.reshape:
                x = x.view(B, Fr, self.hidden, T).permute(0, 2, 3, 1)
            x = layer['act'](x)
            if self.act_func == 'snake' and self.reshape:
                x = x.permute(0, 3, 1, 2).reshape(-1, self.hidden, T)

            if self.lstm:
                x = layer['lstm'](x)
            if self.time_attn:
                x = layer['time_attn'](x)

            x = layer['conv2'](x)
            x = skip + x

        if self.reshape:
            x = x.view(B, Fr, C, T).permute(0, 2, 1, 3)

        return x


class ScaledEmbedding(nn.Module):
    """
    Boost learning rate for embeddings (with `scale`).
    Also, can make embeddings continuous with `smooth`.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int,
                 scale: float = 10., smooth=False):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        if smooth:
            weight = torch.cumsum(self.embedding.weight.data, dim=0)
            # when summing gaussians, the overall scale grows as sqrt(n), so we normalize by that.
            weight = weight / torch.arange(1, num_embeddings + 1).to(weight).sqrt()[:, None]
            self.embedding.weight.data[:] = weight
        self.embedding.weight.data /= scale
        self.scale = scale

    @property
    def weight(self):
        return self.embedding.weight * self.scale

    def forward(self, x):
        out = self.embedding(x) * self.scale
        return out


class FTB(nn.Module):

    def __init__(self, input_dim=257, in_channel=9, r_channel=5):
        super(FTB, self).__init__()
        self.input_dim = input_dim
        self.in_channel = in_channel
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channel, r_channel, kernel_size=[1, 1]),
            nn.BatchNorm2d(r_channel),
            nn.ReLU()
        )

        self.conv1d = nn.Sequential(
            nn.Conv1d(r_channel * input_dim, in_channel, kernel_size=9, padding=4),
            nn.BatchNorm1d(in_channel),
            nn.ReLU()
        )
        self.freq_fc = nn.Linear(input_dim, input_dim, bias=False)

        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channel * 2, in_channel, kernel_size=[1, 1]),
            nn.BatchNorm2d(in_channel),
            nn.ReLU()
        )

    def forward(self, inputs):
        '''
        inputs should be [Batch, Ca, Dim, Time]
        '''
        # T-F attention
        conv1_out = self.conv1(inputs)
        B, C, D, T = conv1_out.size()
        reshape1_out = torch.reshape(conv1_out, [B, C * D, T])
        conv1d_out = self.conv1d(reshape1_out)
        conv1d_out = torch.reshape(conv1d_out, [B, self.in_channel, 1, T])

        # now is also [B,C,D,T]
        att_out = conv1d_out * inputs

        # transpose to [B,C,T,D]
        att_out = torch.transpose(att_out, 2, 3)
        freqfc_out = self.freq_fc(att_out)
        att_out = torch.transpose(freqfc_out, 2, 3)

        cat_out = torch.cat([att_out, inputs], 1)
        outputs = self.conv2(cat_out)
        return outputs
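Aside (not part of the commit): a sketch of DConv as a shape-preserving residual branch. 'gelu' is chosen here because the 'snake' path expects a freq_dim for its activation.

import torch
from src.models.modules import DConv

branch = DConv(channels=64, act_func='gelu')
x = torch.randn(2, 64, 100)   # [batch, channels, time]
print(branch(x).shape)        # torch.Size([2, 64, 100])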
src/models/seanet.py
ADDED
@@ -0,0 +1,177 @@
import torch.nn as nn
import math
from src.models.utils import capture_init, weights_init
from src.models.modules import WNConv1d, WNConvTranspose1d
from torchaudio.functional import resample
from torch.nn import functional as F

class ResnetBlock(nn.Module):
    def __init__(self, dim, dilation=1):
        super().__init__()
        self.block = nn.Sequential(
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(dilation),
            WNConv1d(dim, dim, kernel_size=3, dilation=dilation),
            nn.LeakyReLU(0.2),
            WNConv1d(dim, dim, kernel_size=1),
        )
        self.shortcut = WNConv1d(dim, dim, kernel_size=1)

    def forward(self, x):
        return self.shortcut(x) + self.block(x)


class Seanet(nn.Module):

    @capture_init
    def __init__(self,
                 latent_space_size=128,
                 ngf=32, n_residual_layers=3,
                 resample=1,
                 normalize=True,
                 floor=1e-3,
                 ratios=[8, 8, 2, 2],
                 in_channels=1,
                 out_channels=1,
                 lr_sr=16000,
                 hr_sr=16000,
                 upsample=True):
        super().__init__()

        self.resample = resample
        self.normalize = normalize
        self.floor = floor
        self.lr_sr = lr_sr
        self.hr_sr = hr_sr
        self.scale_factor = int(self.hr_sr / self.lr_sr)
        self.upsample = upsample

        self.encoder = nn.ModuleList()
        self.decoder = nn.ModuleList()

        self.ratios = ratios
        mult = int(2 ** len(ratios))

        decoder_wrapper_conv_layer = [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(latent_space_size, mult * ngf, kernel_size=7, padding=0),
        ]

        encoder_wrapper_conv_layer = [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(mult * ngf, latent_space_size, kernel_size=7, padding=0)
        ]

        self.encoder.insert(0, nn.Sequential(*encoder_wrapper_conv_layer))
        self.decoder.append(nn.Sequential(*decoder_wrapper_conv_layer))

        for i, r in enumerate(ratios):
            encoder_block = [
                nn.LeakyReLU(0.2),
                WNConv1d(mult * ngf // 2,
                         mult * ngf,
                         kernel_size=r * 2,
                         stride=r,
                         padding=r // 2 + r % 2,
                         ),
            ]

            decoder_block = [
                nn.LeakyReLU(0.2),
                WNConvTranspose1d(
                    mult * ngf,
                    mult * ngf // 2,
                    kernel_size=r * 2,
                    stride=r,
                    padding=r // 2 + r % 2,
                    output_padding=r % 2,
                ),
            ]

            for j in range(n_residual_layers - 1, -1, -1):
                encoder_block = [ResnetBlock(mult * ngf // 2, dilation=3 ** j)] + encoder_block

            for j in range(n_residual_layers):
                decoder_block += [ResnetBlock(mult * ngf // 2, dilation=3 ** j)]

            mult //= 2

            self.encoder.insert(0, nn.Sequential(*encoder_block))
            self.decoder.append(nn.Sequential(*decoder_block))

        encoder_wrapper_conv_layer = [
            nn.ReflectionPad1d(3),
            WNConv1d(in_channels, ngf, kernel_size=7, padding=0),
            nn.Tanh(),
        ]
        self.encoder.insert(0, nn.Sequential(*encoder_wrapper_conv_layer))

        decoder_wrapper_conv_layer = [
            nn.LeakyReLU(0.2),
            nn.ReflectionPad1d(3),
            WNConv1d(ngf, out_channels, kernel_size=7, padding=0),
            nn.Tanh(),
        ]
        self.decoder.append(nn.Sequential(*decoder_wrapper_conv_layer))

        self.apply(weights_init)

    def estimate_output_length(self, length):
        """
        Return the nearest valid length to use with the model so that
        no time steps are left over in the convolutions, i.e. for all
        layers, (input size - kernel_size) % stride == 0.

        If the input has a valid length, the estimated output
        will have exactly the same length.
        """
        depth = len(self.ratios)
        for idx in range(depth - 1, -1, -1):
            stride = self.ratios[idx]
            kernel_size = 2 * stride
            padding = stride // 2 + stride % 2
            length = math.ceil((length - kernel_size + 2 * padding) / stride) + 1
            length = max(length, 1)
        for idx in range(depth):
            stride = self.ratios[idx]
            kernel_size = 2 * stride
            padding = stride // 2 + stride % 2
            output_padding = stride % 2
            length = (length - 1) * stride + kernel_size - 2 * padding + output_padding
        return int(length)

    def pad_to_valid_length(self, signal):
        valid_length = self.estimate_output_length(signal.shape[-1])
        padding_len = valid_length - signal.shape[-1]
        signal = F.pad(signal, (0, padding_len))
        return signal, padding_len

    def forward(self, signal):

        target_len = signal.shape[-1]
        if self.upsample:
            target_len *= self.scale_factor
        if self.normalize:
            mono = signal.mean(dim=1, keepdim=True)
            std = mono.std(dim=-1, keepdim=True)
            signal = signal / (self.floor + std)
        else:
            std = 1
        x = signal
        if self.upsample:
            x = resample(x, self.lr_sr, self.hr_sr)

        x, padding_len = self.pad_to_valid_length(x)
        skips = []
        for i, encode in enumerate(self.encoder):
            skips.append(x)
            x = encode(x)
        for j, decode in enumerate(self.decoder):
            x = decode(x)
            skip = skips.pop(-1)
            x = x + skip
        if target_len < x.shape[-1]:
            x = x[..., :target_len]
        return std * x
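Aside (not part of the commit): a sketch of Seanet with the values from conf/experiment/seanet_441-441.yaml; pad_to_valid_length grows the input so every stride divides evenly, and forward trims back to the original length.

import torch
from src.models.seanet import Seanet

net = Seanet(latent_space_size=256, ngf=32, n_residual_layers=4,
             normalize=False, floor=0.0005, ratios=[8, 8, 2, 2],
             in_channels=2, out_channels=2, lr_sr=44100, hr_sr=44100)
y = net(torch.randn(1, 2, 44100))
print(y.shape)  # expected torch.Size([1, 2, 44100])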
src/models/snake.py
ADDED
@@ -0,0 +1,66 @@
import torch
from torch import nn, sin, pow
from torch.nn import Parameter
from torch.distributions.exponential import Exponential

class Snake(nn.Module):
    r'''
    Implementation of the serpentine-like sine-based periodic activation function:
    .. math::
        Snake_a := x + \frac{1}{a} \sin^2(ax) = x - \frac{1}{2a}\cos(2ax) + \frac{1}{2a}
    This activation function is able to better extrapolate to previously unseen data,
    especially in the case of learning periodic functions.

    Shape:
        - Input: (N, *) where * means any number of additional
          dimensions
        - Output: (N, *), same shape as the input

    Parameters:
        - a - trainable parameter

    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195

    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(256)
        >>> x = a1(x)
    '''

    def __init__(self, in_features, a=None, trainable=True):
        '''
        Initialization.
        Args:
            in_features: shape of the input
            a: trainable parameter
            trainable: sets `a` as a trainable parameter

        `a` is initialized to 1 by default; higher values = higher frequency.
        5-50 is a good starting point if you already think your data is periodic;
        consider starting lower, e.g. 0.5, if you think not. But don't worry,
        `a` will be trained along with the rest of your model.
        '''
        super(Snake, self).__init__()
        self.in_features = in_features if isinstance(in_features, list) else [in_features]

        # Initialize `a`
        if a is not None:
            self.a = Parameter(torch.ones(self.in_features) * a)  # create a tensor out of alpha
        else:
            m = Exponential(torch.tensor([0.1]))
            self.a = Parameter((m.rsample(self.in_features)).squeeze())  # random init = mix of frequencies

        self.a.requires_grad = trainable  # enable/disable training of `a`

    def extra_repr(self) -> str:
        return 'in_features={}'.format(self.in_features)

    def forward(self, x):
        '''
        Forward pass of the function.
        Applies the function to the input elementwise.
        Snake := x + 1/a * sin^2(xa)
        '''
        return x + (1.0 / self.a) * pow(sin(x * self.a), 2)
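Aside (not part of the commit): a quick elementwise check of the Snake activation with a fixed frequency a.

import torch
from src.models.snake import Snake

act = Snake(1, a=5.0)            # fixed a for a deterministic demo
x = torch.linspace(-3, 3, 7)
print(act(x))                    # x + (1/5) * sin^2(5x), elementwise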
src/models/spec.py
ADDED
@@ -0,0 +1,39 @@
"""
This code is based on Facebook's HDemucs code: https://github.com/facebookresearch/demucs

Convenience wrappers to perform STFT and iSTFT.
"""

import torch as th


def spectro(x, n_fft=512, hop_length=None, pad=0, win_length=None):
    *other, length = x.shape
    x = x.reshape(-1, length)
    win_length = win_length or n_fft  # resolve before building the window (hann_window(None) would fail)
    z = th.stft(x,
                n_fft * (1 + pad),
                hop_length or n_fft // 4,
                window=th.hann_window(win_length).to(x),
                win_length=win_length,
                normalized=True,
                center=True,
                return_complex=True,
                pad_mode='reflect')
    _, freqs, frame = z.shape
    return z.view(*other, freqs, frame)


def ispectro(z, hop_length=None, length=None, pad=0, win_length=None):
    *other, freqs, frames = z.shape
    n_fft = 2 * freqs - 2
    z = z.view(-1, freqs, frames)
    win_length = win_length or n_fft // (1 + pad)
    x = th.istft(z,
                 n_fft,
                 hop_length or n_fft // 2,
                 window=th.hann_window(win_length).to(z.real),
                 win_length=win_length,
                 normalized=True,
                 length=length,
                 center=True)
    _, length = x.shape
    return x.view(*other, length)
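Aside (not part of the commit): a round-trip sketch for spectro/ispectro; with a hann window at 50% overlap the reconstruction should be near-exact. win_length is passed explicitly here, mirroring how Aero calls these wrappers.

import torch as th
from src.models.spec import spectro, ispectro

x = th.randn(2, 44100)  # [channels, samples]
z = spectro(x, n_fft=512, hop_length=256, win_length=512)
print(z.shape)          # torch.Size([2, 257, 173])
x2 = ispectro(z, hop_length=256, length=x.shape[-1], win_length=512)
print(th.allclose(x, x2, atol=1e-5))  # expected True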
src/models/utils.py
ADDED
@@ -0,0 +1,50 @@
import functools
import math

from torch.nn import functional as F
import torchaudio


def capture_init(init):
    """capture_init.

    Decorate `__init__` with this, and you can then
    recover the *args and **kwargs passed to it in `self._init_args_kwargs`
    """

    @functools.wraps(init)
    def __init__(self, *args, **kwargs):
        self._init_args_kwargs = (args, kwargs)
        init(self, *args, **kwargs)

    return __init__


def unfold(a, kernel_size, stride):
    """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / stride)`.
    see https://github.com/pytorch/pytorch/issues/60466
    """
    *shape, length = a.shape
    n_frames = math.ceil(length / stride)
    tgt_length = (n_frames - 1) * stride + kernel_size
    a = F.pad(a, (0, tgt_length - length))
    strides = list(a.stride())
    assert strides[-1] == 1, 'data should be contiguous'
    strides = strides[:-1] + [stride, 1]
    return a.as_strided([*shape, n_frames, kernel_size], strides)


def weights_init(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find("BatchNorm2d") != -1:
        m.weight.data.normal_(1.0, 0.02)
        m.bias.data.fill_(0)

def write(wav, filename, sr):
    # Normalize audio to prevent clipping
    wav = wav / max(wav.abs().max().item(), 1)
    torchaudio.save(filename, wav.cpu(), sr)
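Aside (not part of the commit): a small check of unfold's framing and zero padding.

import torch
from src.models.utils import unfold

a = torch.arange(10.).view(1, 10)
frames = unfold(a, kernel_size=4, stride=2)
print(frames.shape)   # torch.Size([1, 5, 4]) — ceil(10 / 2) frames
print(frames[0, -1])  # tensor([8., 9., 0., 0.]) — zero-padded tail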
src/utils.py
ADDED
@@ -0,0 +1,52 @@
import functools
import math

from torch.nn import functional as F

from contextlib import contextmanager


def capture_init(init):
    """capture_init.

    Decorate `__init__` with this, and you can then
    recover the *args and **kwargs passed to it in `self._init_args_kwargs`
    """

    @functools.wraps(init)
    def __init__(self, *args, **kwargs):
        self._init_args_kwargs = (args, kwargs)
        init(self, *args, **kwargs)

    return __init__


def unfold(a, kernel_size, stride):
    """Given input of size [*OT, T], output Tensor of size [*OT, F, K]
    with K the kernel size, by extracting frames with the given stride.
    This will pad the input so that `F = ceil(T / stride)`.
    see https://github.com/pytorch/pytorch/issues/60466
    """
    *shape, length = a.shape
    n_frames = math.ceil(length / stride)
    tgt_length = (n_frames - 1) * stride + kernel_size
    a = F.pad(a, (0, tgt_length - length))
    strides = list(a.stride())
    assert strides[-1] == 1, 'data should be contiguous'
    strides = strides[:-1] + [stride, 1]
    return a.as_strided([*shape, n_frames, kernel_size], strides)

def colorize(text, color):
    """
    Display text with some ANSI color in the terminal.
    """
    code = f"\033[{color}m"
    restore = "\033[0m"
    return "".join([code, text, restore])


def bold(text):
    """
    Display text in bold in the terminal.
    """
    return colorize(text, "1")
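Aside (not part of the commit): bold simply wraps text in ANSI escape codes, as used by processAudio's logging.

from src.utils import bold

print(bold("Loading model aero from file."))  # prints '\x1b[1m...\x1b[0m'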