Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

.gitignore +189 -0
neucodec/__init__.py +3 -0
neucodec/activations.py +120 -0
neucodec/alias_free_torch/__init__.py +6 -0
neucodec/alias_free_torch/act.py +28 -0
neucodec/alias_free_torch/filter.py +95 -0
neucodec/alias_free_torch/resample.py +49 -0
neucodec/bs_roformer5.py +120 -0
neucodec/codec_decoder_vocos.py +431 -0
neucodec/codec_encoder.py +84 -0
neucodec/model.py +269 -0
neucodec/module.py +114 -0
setup.py +32 -0
tests/__init__.py +0 -0
tests/test_neucodec.py +128 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,189 @@

+# Emacs
+*~
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+/runs
+/checkpoints
+/base
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+/runs
+/.cache
+/__pycache__
+*.wav
+*.pth
+*.pt
+*.pt.gz
+wandb/
+sven_latest_checkpoint/
+sven_qwen/
+pretrained_models/
+xcodec/
+small_speaker_shards_all/
+sven_all_shards/
+qwen_380k/
+evals/
+*.safetensors
+*.pt
+.ruff_cache

neucodec/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .codec_encoder import CodecEncoder
+from .codec_decoder_vocos import CodecDecoderVocos
+from .model import NeuCodec

neucodec/activations.py ADDED Viewed

	@@ -0,0 +1,120 @@

+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+#   LICENSE is in incl_licenses directory.
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+class Snake(nn.Module):
+    '''
+    Implementation of a sine-based periodic activation function
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snake(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha: trainable parameter
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(Snake, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale: # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        else: # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        Snake ∶= x + 1/a * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x
+class SnakeBeta(nn.Module):
+    '''
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        '''
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale: # log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else: # linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        '''
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta ∶= x + 1/b * sin^2 (xa)
+        '''
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1) # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+        return x

neucodec/alias_free_torch/__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+from .filter import *
+from .resample import *
+from .act import *

neucodec/alias_free_torch/act.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch.nn as nn
+from .resample import UpSample1d, DownSample1d
+class Activation1d(nn.Module):
+    def __init__(self,
+                 activation,
+                 up_ratio: int = 2,
+                 down_ratio: int = 2,
+                 up_kernel_size: int = 12,
+                 down_kernel_size: int = 12):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+        return x

neucodec/alias_free_torch/filter.py ADDED Viewed

	@@ -0,0 +1,95 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+if 'sinc' in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    #   LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(x == 0,
+                           torch.tensor(1., device=x.device, dtype=x.dtype),
+                           torch.sin(math.pi * x) / math.pi / x)
+# This code is adapted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(cutoff, half_width, kernel_size): # return filter [1,1,kernel_size]
+    even = (kernel_size % 2 == 0)
+    half_size = kernel_size // 2
+    #For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.:
+        beta = 0.5842 * (A - 21)**0.4 + 0.07886 * (A - 21.)
+    else:
+        beta = 0.
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = (torch.arange(-half_size, half_size) + 0.5)
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        # Normalize filter to have sum = 1, otherwise we will have a small leakage
+        # of the constant component in the input signal.
+        filter_ /= filter_.sum()
+        filter = filter_.view(1, 1, kernel_size)
+    return filter
+class LowPassFilter1d(nn.Module):
+    def __init__(self,
+                 cutoff=0.5,
+                 half_width=0.6,
+                 stride: int = 1,
+                 padding: bool = True,
+                 padding_mode: str = 'replicate',
+                 kernel_size: int = 12):
+        # kernel_size should be even number for stylegan3 setup,
+        # in this implementation, odd number is also possible.
+        super().__init__()
+        if cutoff < -0.:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = (kernel_size % 2 == 0)
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+    #input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right),
+                      mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1),
+                       stride=self.stride, groups=C)
+        return out

neucodec/alias_free_torch/resample.py ADDED Viewed

	@@ -0,0 +1,49 @@

+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        filter = kaiser_sinc_filter1d(cutoff=0.5 / ratio,
+                                      half_width=0.6 / ratio,
+                                      kernel_size=self.kernel_size)
+        self.register_buffer("filter", filter)
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+        x = F.pad(x, (self.pad, self.pad), mode='replicate')
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+        x = x[..., self.pad_left:-self.pad_right]
+        return x
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        self.lowpass = LowPassFilter1d(cutoff=0.5 / ratio,
+                                       half_width=0.6 / ratio,
+                                       stride=ratio,
+                                       kernel_size=self.kernel_size)
+    def forward(self, x):
+        xx = self.lowpass(x)
+        return xx

neucodec/bs_roformer5.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+import numpy as np
+from torch.nn import Module, ModuleList
+from einops import rearrange
+from torchtune.modules import RotaryPositionalEmbeddings
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        r"""https://github.com/meta-llama/llama/blob/main/llama/model.py"""
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        norm_x = torch.mean(x ** 2, dim=-1, keepdim=True)
+        output = x * torch.rsqrt(norm_x + self.eps) * self.weight
+        return output
+class MLP(nn.Module):
+    def __init__(self, dim: int) -> None:
+        super().__init__()
+        self.fc1 = nn.Linear(dim, 4 * dim, bias=False)
+        self.silu = nn.SiLU()
+        self.fc2 = nn.Linear(4 * dim, dim, bias=False)
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.silu(x)
+        x = self.fc2(x)
+        return x
+class Attention(nn.Module):
+    def __init__(self, dim: int, n_heads: int, rotary_embed: RotaryPositionalEmbeddings):
+        super().__init__()
+        assert dim % n_heads == 0
+        self.n_heads = n_heads
+        self.dim = dim
+        self.rotary_embed = rotary_embed
+        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
+        assert self.flash, "Must have flash attention."
+        self.c_attn = nn.Linear(dim, 3 * dim, bias=False)
+        self.c_proj = nn.Linear(dim, dim, bias=False)
+    def forward(self, x):
+        r"""
+        Args:
+            x: (b, t, h*d)
+        Constants:
+            b: batch_size
+            t: time steps
+            r: 3
+            h: heads_num
+            d: heads_dim
+        """
+        B, T, C = x.size()
+        q, k, v = rearrange(self.c_attn(x), 'b t (r h d) -> r b h t d', r=3, h=self.n_heads)
+        # q, k, v: (b, h, t, d)
+        q = self.rotary_embed(q)
+        k = self.rotary_embed(k)
+        if self.flash:
+            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0, is_causal=False)
+        y = rearrange(y, 'b h t d -> b t (h d)')
+        y = self.c_proj(y)
+        # shape: (b, t, h*d)
+        return y
+class TransformerBlock(nn.Module):
+    def __init__(self, dim: int, n_heads: int, rotary_embed: RotaryPositionalEmbeddings):
+        super().__init__()
+        self.dim = dim
+        self.n_heads = n_heads
+        self.att_norm = RMSNorm(dim)
+        self.ffn_norm = RMSNorm(dim)
+        self.att = Attention(dim=dim, n_heads=n_heads, rotary_embed=rotary_embed)
+        self.mlp = MLP(dim=dim)
+    def forward(
+        self,
+        x: torch.Tensor,
+    ):
+        x = x + self.att(self.att_norm(x))
+        x = x + self.mlp(self.ffn_norm(x))
+        return x
+if __name__ == '__main__':
+    rotary_embed_128 = RotaryPositionalEmbeddings(dim=128)
+    transformer_block = TransformerBlock(
+        dim=1024,
+        n_heads=8,
+        rotary_embed=rotary_embed_128
+    )
+    x = torch.randn(2, 128, 1024)
+    y = transformer_block(x)
+    print(y.shape)
+    c=1

neucodec/codec_decoder_vocos.py ADDED Viewed

	@@ -0,0 +1,431 @@

+import torch
+import torch.nn as nn
+from typing import List
+from torchtune.modules import RotaryPositionalEmbeddings
+from vector_quantize_pytorch import ResidualFSQ
+from .bs_roformer5 import TransformerBlock
+class ISTFT(nn.Module):
+    """
+    Custom implementation of ISTFT since torch.istft doesn't allow custom padding (other than `center=True`) with
+    windowing. This is because the NOLA (Nonzero Overlap Add) check fails at the edges.
+    See issue: https://github.com/pytorch/pytorch/issues/62323
+    Specifically, in the context of neural vocoding we are interested in "same" padding analogous to CNNs.
+    The NOLA constraint is met as we trim padded samples anyway.
+    Args:
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames.
+        win_length (int): The size of window frame and STFT filter.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+    def __init__(
+        self, n_fft: int, hop_length: int, win_length: int, padding: str = "same"
+    ):
+        super().__init__()
+        if padding not in ["center", "same"]:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        self.padding = padding
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_length = win_length
+        window = torch.hann_window(win_length)
+        self.register_buffer("window", window)
+    def forward(self, spec: torch.Tensor) -> torch.Tensor:
+        """
+        Compute the Inverse Short Time Fourier Transform (ISTFT) of a complex spectrogram.
+        Args:
+            spec (Tensor): Input complex spectrogram of shape (B, N, T), where B is the batch size,
+                            N is the number of frequency bins, and T is the number of time frames.
+        Returns:
+            Tensor: Reconstructed time-domain signal of shape (B, L), where L is the length of the output signal.
+        """
+        if self.padding == "center":
+            # Fallback to pytorch native implementation
+            return torch.istft(
+                spec,
+                self.n_fft,
+                self.hop_length,
+                self.win_length,
+                self.window,
+                center=True,
+            )
+        elif self.padding == "same":
+            pad = (self.win_length - self.hop_length) // 2
+        else:
+            raise ValueError("Padding must be 'center' or 'same'.")
+        assert spec.dim() == 3, "Expected a 3D tensor as input"
+        B, N, T = spec.shape
+        # Inverse FFT
+        ifft = torch.fft.irfft(spec, self.n_fft, dim=1, norm="backward")
+        ifft = ifft * self.window[None, :, None]
+        # Overlap and Add
+        output_size = (T - 1) * self.hop_length + self.win_length
+        y = torch.nn.functional.fold(
+            ifft,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        )[:, 0, 0, pad:-pad]
+        # Window envelope
+        window_sq = self.window.square().expand(1, T, -1).transpose(1, 2)
+        window_envelope = torch.nn.functional.fold(
+            window_sq,
+            output_size=(1, output_size),
+            kernel_size=(1, self.win_length),
+            stride=(1, self.hop_length),
+        ).squeeze()[pad:-pad]
+        # Normalize
+        assert (window_envelope > 1e-11).all()
+        y = y / window_envelope
+        return y
+class FourierHead(nn.Module):
+    """Base class for inverse fourier modules."""
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+class ISTFTHead(FourierHead):
+    """
+    ISTFT Head module for predicting STFT complex coefficients.
+    Args:
+        dim (int): Hidden dimension of the model.
+        n_fft (int): Size of Fourier transform.
+        hop_length (int): The distance between neighboring sliding window frames, which should align with
+                          the resolution of the input features.
+        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
+    """
+    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same"):
+        super().__init__()
+        out_dim = n_fft + 2
+        self.out = torch.nn.Linear(dim, out_dim)
+        self.istft = ISTFT(
+            n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Forward pass of the ISTFTHead module.
+        Args:
+            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
+                        L is the sequence length, and H denotes the model dimension.
+        Returns:
+            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
+        """
+        x_pred = self.out(x)
+        # x_pred = x
+        x_pred = x_pred.transpose(1, 2)
+        mag, p = x_pred.chunk(2, dim=1)
+        mag = torch.exp(mag)
+        mag = torch.clip(
+            mag, max=1e2
+        )  # safeguard to prevent excessively large magnitudes
+        # wrapping happens here. These two lines produce real and imaginary value
+        x = torch.cos(p)
+        y = torch.sin(p)
+        # recalculating phase here does not produce anything new
+        # only costs time
+        # phase = torch.atan2(y, x)
+        # S = mag * torch.exp(phase * 1j)
+        # better directly produce the complex value
+        S = mag * (x + 1j * y)
+        audio = self.istft(S)
+        return audio.unsqueeze(1), x_pred
+def nonlinearity(x):
+    # swish
+    return x * torch.sigmoid(x)
+def Normalize(in_channels, num_groups=32):
+    return torch.nn.GroupNorm(
+        num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True
+    )
+class ResnetBlock(nn.Module):
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout,
+        temb_channels=512,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv1d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv1d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv1d(
+                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
+                )
+            else:
+                self.nin_shortcut = torch.nn.Conv1d(
+                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
+                )
+    def forward(self, x, temb=None):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        if temb is not None:
+            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+        return x + h
+class Backbone(nn.Module):
+    """Base class for the generator's backbone. It preserves the same temporal resolution across all layers."""
+    def forward(self, x: torch.Tensor, **kwargs) -> torch.Tensor:
+        """
+        Args:
+            x (Tensor): Input tensor of shape (B, C, L), where B is the batch size,
+                        C denotes output features, and L is the sequence length.
+        Returns:
+            Tensor: Output of shape (B, L, H), where B is the batch size, L is the sequence length,
+                    and H denotes the model dimension.
+        """
+        raise NotImplementedError("Subclasses must implement the forward method.")
+class VocosBackbone(Backbone):
+    """
+    Vocos backbone module built with ConvNeXt blocks. Supports additional conditioning with Adaptive Layer Normalization
+    Args:
+        input_channels (int): Number of input features channels.
+        dim (int): Hidden dimension of the model.
+        intermediate_dim (int): Intermediate dimension used in ConvNeXtBlock.
+        num_layers (int): Number of ConvNeXtBlock layers.
+        layer_scale_init_value (float, optional): Initial value for layer scaling. Defaults to `1 / num_layers`.
+        adanorm_num_embeddings (int, optional): Number of embeddings for AdaLayerNorm.
+                                                None means non-conditional model. Defaults to None.
+    """
+    def __init__(self, hidden_dim=1024, depth=12, heads=16, pos_meb_dim=64):
+        super().__init__()
+        self.embed = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=7, padding=3)
+        self.temb_ch = 0
+        block_in = hidden_dim
+        dropout = 0.1
+        prior_net: List[nn.Module] = [
+            ResnetBlock(
+                in_channels=block_in,
+                out_channels=block_in,
+                temb_channels=self.temb_ch,
+                dropout=dropout,
+            ),
+            ResnetBlock(
+                in_channels=block_in,
+                out_channels=block_in,
+                temb_channels=self.temb_ch,
+                dropout=dropout,
+            ),
+        ]
+        self.prior_net = nn.Sequential(*prior_net)
+        depth = depth
+        time_rotary_embed = RotaryPositionalEmbeddings(dim=pos_meb_dim)
+        transformer_blocks = [
+            TransformerBlock(
+                dim=hidden_dim, n_heads=heads, rotary_embed=time_rotary_embed
+            )
+            for _ in range(depth)
+        ]
+        self.transformers = nn.Sequential(*transformer_blocks)
+        self.final_layer_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
+        post_net: List[nn.Module] = [
+            ResnetBlock(
+                in_channels=block_in,
+                out_channels=block_in,
+                temb_channels=self.temb_ch,
+                dropout=dropout,
+            ),
+            ResnetBlock(
+                in_channels=block_in,
+                out_channels=block_in,
+                temb_channels=self.temb_ch,
+                dropout=dropout,
+            ),
+        ]
+        self.post_net = nn.Sequential(*post_net)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.transpose(1, 2)
+        x = self.embed(x)
+        x = self.prior_net(x)
+        x = x.transpose(1, 2)
+        x = self.transformers(x)
+        x = x.transpose(1, 2)
+        x = self.post_net(x)
+        x = x.transpose(1, 2)
+        x = self.final_layer_norm(x)
+        return x
+def init_weights(m):
+    if isinstance(m, nn.Conv1d):
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        nn.init.constant_(m.bias, 0)
+class CodecDecoderVocos(nn.Module):
+    """Decoder combining a ResidualFSQ quantizer, a VocosBackbone and an ISTFTHead.
+    NOTE(review): `inference_vq`, `inference_0` and `inference` call `self.model`, which this
+    class never assigns; unless it is set externally they raise AttributeError. They look
+    like leftovers from another decoder - confirm before relying on them.
+    """
+    def __init__(
+        self,
+        hidden_dim=1024,
+        depth=12,
+        heads=16,
+        pos_meb_dim=64,
+        hop_length=320,
+        vq_num_quantizers=1,
+        vq_dim=2048,  # 1024 2048
+        vq_commit_weight=0.25,
+        vq_weight_init=False,
+        vq_full_commit_loss=False,
+        codebook_size=16384,
+        codebook_dim=16,
+    ):
+        # NOTE(review): vq_num_quantizers, vq_commit_weight, vq_weight_init,
+        # vq_full_commit_loss, codebook_size and codebook_dim are accepted but not used
+        # below; the quantizer is built with fixed levels and num_quantizers=1.
+        super().__init__()
+        self.hop_length = hop_length
+        self.quantizer = ResidualFSQ(
+            dim=vq_dim, levels=[4, 4, 4, 4, 4, 4, 4, 4], num_quantizers=1
+        )
+        self.backbone = VocosBackbone(
+            hidden_dim=hidden_dim, depth=depth, heads=heads, pos_meb_dim=pos_meb_dim
+        )
+        # The head's FFT size is four times the hop length
+        self.head = ISTFTHead(
+            dim=hidden_dim,
+            n_fft=self.hop_length * 4,
+            hop_length=self.hop_length,
+            padding="same",
+        )
+        self.reset_parameters()
+    def forward(self, x, vq=True):
+        """Quantize `x` (vq=True) or decode it through backbone and head (otherwise).
+        With `vq is True` the input is permuted to channels-last for the quantizer and the
+        method returns `(quantized, indices, None)`, both permuted back. Otherwise it
+        returns the 2-tuple produced by the head.
+        """
+        if vq is True:
+            # x, q, commit_loss = self.quantizer(x)
+            x = x.permute(0, 2, 1)
+            x, q = self.quantizer(x)
+            x = x.permute(0, 2, 1)
+            q = q.permute(0, 2, 1)
+            return x, q, None
+        x = self.backbone(x)
+        x, _ = self.head(x)
+        return x, _
+    def vq2emb(self, vq):
+        """Put the quantizer in eval mode and delegate to `self.quantizer.vq2emb`."""
+        # NOTE(review): confirm that ResidualFSQ actually exposes `vq2emb`.
+        self.quantizer = self.quantizer.eval()
+        x = self.quantizer.vq2emb(vq)
+        return x
+    def get_emb(self):
+        """Put the quantizer in eval mode and delegate to `self.quantizer.get_emb`."""
+        # NOTE(review): confirm that ResidualFSQ actually exposes `get_emb`.
+        self.quantizer = self.quantizer.eval()
+        embs = self.quantizer.get_emb()
+        return embs
+    def inference_vq(self, vq):
+        # Adds a leading batch axis, then calls `self.model` (see class NOTE).
+        x = vq[None, :, :]
+        x = self.model(x)
+        return x
+    def inference_0(self, x):
+        # NOTE(review): unpacks four values from the quantizer, whereas `forward`
+        # unpacks two from the same call; also depends on `self.model` (see class NOTE).
+        x, q, loss, perp = self.quantizer(x)
+        x = self.model(x)
+        return x, None
+    def inference(self, x):
+        # Depends on `self.model` (see class NOTE).
+        x = self.model(x)
+        return x, None
+    def remove_weight_norm(self):
+        """Remove weight normalization module from all of the layers."""
+        def _remove_weight_norm(m):
+            try:
+                torch.nn.utils.remove_weight_norm(m)
+            except ValueError:  # this module didn't have weight norm
+                return
+        self.apply(_remove_weight_norm)
+    def apply_weight_norm(self):
+        """Apply weight normalization to every Conv1d and ConvTranspose1d layer."""
+        def _apply_weight_norm(m):
+            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
+                torch.nn.utils.weight_norm(m)
+        self.apply(_apply_weight_norm)
+    def reset_parameters(self):
+        """Run `init_weights` over every submodule."""
+        self.apply(init_weights)

neucodec/codec_encoder.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import torch
+import numpy as np
+from torch import nn
+from .module import WNConv1d, EncoderBlock
+from .alias_free_torch import Activation1d
+from . import activations
+def init_weights(m):
+    if isinstance(m, nn.Conv1d):
+        nn.init.trunc_normal_(m.weight, std=0.02)
+        nn.init.constant_(m.bias, 0)
+class CodecEncoder(nn.Module):
+    def __init__(
+        self,
+        ngf=48,
+        up_ratios=[2, 2, 4, 4, 5],
+        dilations=(1, 3, 9),
+        hidden_dim=1024,
+        depth=12,
+        heads=12,
+        pos_meb_dim=64,
+    ):
+        super().__init__()
+        self.hop_length = np.prod(up_ratios)
+        self.ngf = ngf
+        self.up_ratios = up_ratios
+        d_model = ngf
+        self.conv_blocks = [WNConv1d(1, d_model, kernel_size=7, padding=3)]
+        for i, stride in enumerate(up_ratios):
+            d_model *= 2
+            self.conv_blocks += [
+                EncoderBlock(d_model, stride=stride, dilations=dilations)
+            ]
+        self.conv_blocks = nn.Sequential(*self.conv_blocks)
+        self.conv_final_block = [
+            Activation1d(
+                activation=activations.SnakeBeta(d_model, alpha_logscale=True)
+            ),
+            WNConv1d(d_model, hidden_dim, kernel_size=3, padding=1),
+        ]
+        self.conv_final_block = nn.Sequential(*self.conv_final_block)
+        self.reset_parameters()
+    def forward(self, x):
+        x = self.conv_blocks(x)
+        x = self.conv_final_block(x)
+        x = x.permute(0, 2, 1)
+        return x
+    def inference(self, x):
+        return self.block(x)
+    def remove_weight_norm(self):
+        """Remove weight normalization module from all of the layers."""
+        def _remove_weight_norm(m):
+            try:
+                torch.nn.utils.remove_weight_norm(m)
+            except ValueError:  # this module didn't have weight norm
+                return
+        self.apply(_remove_weight_norm)
+    def apply_weight_norm(self):
+        """Apply weight normalization module from all of the layers."""
+        def _apply_weight_norm(m):
+            if isinstance(m, nn.Conv1d):
+                torch.nn.utils.weight_norm(m)
+        self.apply(_apply_weight_norm)
+    def reset_parameters(self):
+        self.apply(init_weights)

neucodec/model.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import soundfile as sf
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio
+from typing import Optional
+from torchaudio import transforms as T
+from transformers import AutoFeatureExtractor, Wav2Vec2BertModel
+from .codec_encoder import CodecEncoder
+from .codec_decoder_vocos import CodecDecoderVocos
+from .module import SemanticEncoder
+class NeuCodec(nn.Module):
+    def __init__(self, ckpt_path: str, sample_rate: int, hop_length: int):
+        super().__init__()
+        # load ckpt
+        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
+        self.sample_rate = sample_rate
+        self.hop_length = hop_length
+        # load modules
+        self.semantic_model = Wav2Vec2BertModel.from_pretrained(
+            "facebook/w2v-bert-2.0", output_hidden_states=True
+        )
+        self.semantic_model.eval()
+        self.feature_extractor = AutoFeatureExtractor.from_pretrained(
+            "facebook/w2v-bert-2.0"
+        )
+        self.SemanticEncoder_module = SemanticEncoder(1024, 1024, 1024)
+        self.CodecEnc = CodecEncoder()
+        self.generator = CodecDecoderVocos(hop_length=hop_length)
+        self.fc_prior = nn.Linear(2048, 2048)
+        self.fc_post_a = nn.Linear(2048, 1024)
+        # load checkpoint
+        self._load_ckpt(ckpt)
+    def _load_ckpt(self, ckpt):
+        # differentiate between `.ckpt` and `.bin`
+        if ckpt.get("state_dict"):
+            state_dicts = ckpt.get("state_dict")
+        else:
+            state_dicts = ckpt
+        # assign keys to correct model components
+        filtered_enc = {}
+        filtered_gen = {}
+        filtered_post = {}
+        filtered_prior = {}
+        filtered_semantic = {}
+        for key, value in state_dicts.items():
+            if key.startswith("CodecEnc."):
+                new_key = key[len("CodecEnc."):]
+                filtered_enc[new_key] = value
+            elif key.startswith("generator."):
+                new_key = key[len("generator."):]
+                filtered_gen[new_key] = value
+            elif key.startswith("fc_post_a."):
+                new_key = key[len("fc_post_a."):]
+                filtered_post[new_key] = value
+            elif key.startswith("SemanticEncoder_module."):
+                new_key = key[len("SemanticEncoder_module."):]
+                filtered_semantic[new_key] = value
+            elif key.startswith("fc_prior."):
+                new_key = key[len("fc_prior."):]
+                filtered_prior[new_key] = value
+        # load
+        self.CodecEnc.load_state_dict(filtered_enc)
+        self.CodecEnc.eval()
+        self.generator.load_state_dict(filtered_gen, strict=False)
+        self.generator.eval()
+        self.fc_post_a.load_state_dict(filtered_post)
+        self.fc_post_a.eval()
+        self.fc_prior.load_state_dict(filtered_prior)
+        self.SemanticEncoder_module.load_state_dict(filtered_semantic)
+        self.SemanticEncoder_module.eval()
+    @torch.inference_mode()
+    def encode_code(
+        self,
+        input_waveform: torch.Tensor,
+        semantic_features: torch.Tensor = None,
+        sample_rate: int = 16_000,
+    ) -> torch.Tensor:
+        pad_for_wav = 320 - (input_waveform.shape[1] % 320)
+        input_waveform = torch.nn.functional.pad(input_waveform, (0, pad_for_wav))
+        if semantic_features is None:
+            semantic_features = self.feature_extractor(
+                input_waveform, sampling_rate=sample_rate, return_tensors="pt"
+            ).input_features.to(self.device)  # [batch, frames, feat_dim]
+        else:
+            semantic_features = semantic_features[:, 0, :, :]
+        semantic_output = self.semantic_model(semantic_features)
+        semantic_hidden_16 = semantic_output.hidden_states[16]
+        semantic_hidden_16 = semantic_hidden_16.transpose(
+            1, 2
+        )  # [batch, hidden_dim, frames]
+        semantic_encoded = self.SemanticEncoder_module(semantic_hidden_16)
+        if len(input_waveform.shape) == 2:
+            wav = input_waveform.unsqueeze(1).to(self.device)  # shape: [batch, 1, time]
+        else:
+            wav = input_waveform.to(self.device)
+        vq_emb = self.CodecEnc(wav)  # [batch, time//down, 1024]
+        vq_emb = vq_emb.transpose(1, 2)  # -> [batch, 1024, frames]
+        if vq_emb.shape[-1] != semantic_encoded.shape[-1]:
+            min_len = min(vq_emb.shape[-1], semantic_encoded.shape[-1])
+            vq_emb = vq_emb[:, :, :min_len]
+            semantic_encoded = semantic_encoded[:, :, :min_len]
+        concat_emb = torch.cat(
+            [semantic_encoded, vq_emb], dim=1
+        )  # [batch, 2048, frames]
+        concat_emb = self.fc_prior(concat_emb.transpose(1, 2)).transpose(1, 2)
+        _, vq_code, _ = self.generator(concat_emb, vq=True)
+        return vq_code
+    @torch.inference_mode()
+    def decode_code(self, vq_code: torch.Tensor) -> torch.Tensor:
+        vq_post_emb = self.generator.quantizer.get_output_from_indices(
+            vq_code.transpose(1, 2)
+        )
+        vq_post_emb = vq_post_emb.transpose(1, 2)  # [batch, 1024, frames]
+        vq_post_emb = self.fc_post_a(vq_post_emb.transpose(1, 2)).transpose(
+            1, 2
+        )  # [batch, 1024, frames]
+        recon_audio = self.generator(vq_post_emb.transpose(1, 2), vq=False)[
+            0
+        ]  # [batch, time]
+        return recon_audio
+    @torch.inference_mode()
+    def autoencode(self, fpath: str, output_fpath: Optional[str] = None):
+        y, sr = torchaudio.load(fpath)
+        if sr != 16_000:
+            y = T.Resample(sr, 16_000)(y)
+        vq_codes = self.encode_code(y)
+        recon = self.decode_code(vq_codes)
+        if output_fpath is None:
+            name, fext = os.path.splitext(fpath)
+            output_fpath = f"{name}_recon{fext}"
+        sf.write(output_fpath, recon[0, 0, :].cpu(), self.sample_rate)
+    @torch.inference_mode()
+    def batch_encode(
+        self, fpaths: list[str], return_tensor: bool = False
+    ) -> tuple[list[torch.Tensor], list[int]] | tuple[torch.Tensor, list[int]]:
+        # prepare batch
+        wavs_batch, semantic_batch, token_durations = self._pad_batch(
+            [self._preprocess_file(fpath) for fpath in fpaths]
+        )
+        vq_codes = self.encode_code(wavs_batch, semantic_batch)
+        # return, unpad if we want to
+        if return_tensor:
+            return vq_codes, list(token_durations)
+        unpadded_vq_codes = []
+        for idx, token_dur in enumerate(token_durations):
+            curr_codes = vq_codes[idx, :, :token_dur]
+            unpadded_vq_codes.append(curr_codes)
+        return unpadded_vq_codes, None
+    @torch.inference_mode()
+    def batch_decode(
+        self,
+        vq_codes: list[torch.Tensor] | torch.Tensor,
+        token_durations: Optional[list[int]] = None,
+    ):
+        # pad tensor if need be
+        if isinstance(vq_codes, list):
+            vq_codes, token_durations = self._pad_codes(vq_codes)
+        else:
+            assert token_durations is not None
+        # decode
+        recons = self.decode_code(vq_codes)
+        # unpad
+        cut_recons = []
+        for idx, token_dur in enumerate(token_durations):
+            curr_recon = recons[idx, :, : int(token_dur * self.hop_length)]
+            cut_recons.append(curr_recon)
+        return cut_recons
+    @torch.inference_mode()
+    def batch_autoencode(
+        self, fpaths: list[str], output_fpaths: Optional[list[str]] = None
+    ) -> list[torch.Tensor]:
+        vq_codes, token_durations = self.batch_encode(fpaths, return_tensor=True)
+        cut_recons = self.batch_decode(vq_codes, token_durations)
+        if output_fpaths:
+            for recon, output_fpath in zip(cut_recons, output_fpaths):
+                sf.write(output_fpath, recon.cpu().numpy()[0, :], self.sample_rate)
+        return cut_recons
+    def _preprocess_file(self, fpath: str):
+        # load and resample
+        y, sr = torchaudio.load(fpath)
+        if sr != 16_000:
+            y = T.Resample(sr, 16_000)(y)
+        # compute duration for any cutting we might need to do, in terms of n_tokens
+        token_duration = int((y.shape[-1] / 16_000) * 50)
+        # get semantic model features: [harry] note i don't think this can be batched
+        semantic_model_input = self.feature_extractor(
+            y, sampling_rate=16_000, return_tensors="pt"
+        ).input_features
+        return y.to(self.device), semantic_model_input.to(self.device), token_duration
+    def _pad_batch(self, batch: list[tuple[torch.Tensor, torch.Tensor, int]]):
+        # unpack batch
+        wavs, semantic_features, token_durations = zip(*batch)
+        max_length_semantic = max([f.shape[1] for f in semantic_features])
+        max_length = max_length_semantic * 320
+        # pad wavs
+        wavs_padded = []
+        for audio in wavs:
+            padding = max_length - audio.shape[1]
+            if padding > 0:
+                padded_audio = F.pad(audio, (0, padding), mode="constant", value=0)
+            else:
+                padded_audio = audio[:, :max_length]
+            wavs_padded.append(padded_audio)
+        wavs_tensor = torch.stack(wavs_padded)
+        # pad semantic features
+        semantic_features_padded = []
+        for feat in semantic_features:
+            padding = max_length_semantic - feat.shape[1]
+            padded_feat = F.pad(feat, (0, 0, 0, padding), mode="constant", value=0)
+            semantic_features_padded.append(padded_feat)
+        semantic_feature_tensor = torch.stack(semantic_features_padded)
+        return wavs_tensor, semantic_feature_tensor, token_durations
+    def _pad_codes(self, vq_codes: list[torch.Tensor]):
+        max_len = max([i.shape[-1] for i in vq_codes])
+        token_durations = []
+        padded_codes = []
+        for curr_codes in vq_codes:
+            curr_len = curr_codes.shape[-1]
+            token_durations.append(curr_len)
+            padding = max_len - curr_len
+            curr_codes = F.pad(curr_codes, (0, padding), mode="constant", value=0)
+            padded_codes.append(curr_codes)
+        return torch.stack(padded_codes), token_durations
+    @property
+    def device(self):
+        return next(self.parameters()).device

neucodec/module.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import torch.nn as nn
+from torch.nn.utils import weight_norm
+from .activations import SnakeBeta
+from .alias_free_torch import Activation1d
+def WNConv1d(*args, **kwargs):
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+class ResidualUnit(nn.Module):
+    def __init__(self, dim: int = 16, dilation: int = 1):
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+        self.block = nn.Sequential(
+            Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
+            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
+            Activation1d(activation=SnakeBeta(dim, alpha_logscale=True)),
+            WNConv1d(dim, dim, kernel_size=1),
+        )
+    def forward(self, x):
+        return x + self.block(x)
+class EncoderBlock(nn.Module):
+    def __init__(self, dim: int = 16, stride: int = 1, dilations=(1, 3, 9)):
+        super().__init__()
+        runits = [ResidualUnit(dim // 2, dilation=d) for d in dilations]
+        self.block = nn.Sequential(
+            *runits,
+            Activation1d(activation=SnakeBeta(dim // 2, alpha_logscale=True)),
+            WNConv1d(
+                dim // 2,
+                dim,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=stride // 2 + stride % 2,
+            ),
+        )
+    def forward(self, x):
+        return self.block(x)
+class SemanticEncoder(nn.Module):
+    def __init__(
+        self,
+        input_channels: int,
+        code_dim: int,
+        encode_channels: int,
+        kernel_size: int = 3,
+        bias: bool = True,
+    ):
+        super(SemanticEncoder, self).__init__()
+        # 初始卷积，将 input_channels 映射到 encode_channels
+        self.initial_conv = nn.Conv1d(
+            in_channels=input_channels,
+            out_channels=encode_channels,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+            bias=False,
+        )
+        # 残差块
+        self.residual_blocks = nn.Sequential(
+            nn.ReLU(inplace=True),
+            nn.Conv1d(
+                encode_channels,
+                encode_channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size - 1) // 2,
+                bias=bias,
+            ),
+            nn.ReLU(inplace=True),
+            nn.Conv1d(
+                encode_channels,
+                encode_channels,
+                kernel_size=kernel_size,
+                stride=1,
+                padding=(kernel_size - 1) // 2,
+                bias=bias,
+            ),
+        )
+        # 最终卷积，将 encode_channels 映射到 code_dim
+        self.final_conv = nn.Conv1d(
+            in_channels=encode_channels,
+            out_channels=code_dim,
+            kernel_size=kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+            bias=False,
+        )
+    def forward(self, x):
+        """
+        前向传播方法。
+        Args:
+            x (Tensor): 输入张量，形状为 (Batch, Input_channels, Length)
+        Returns:
+            Tensor: 编码后的张量，形状为 (Batch, Code_dim, Length)
+        """
+        x = self.initial_conv(x)  # (Batch, Encode_channels, Length)
+        x = self.residual_blocks(x) + x  # 残差连接
+        x = self.final_conv(x)  # (Batch, Code_dim, Length)
+        return x

setup.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from setuptools import setup, find_packages
+setup(
+    name='neucodec',
+    version='0.0.1',
+    description='A package for neucodec, based on xcodec2.',
+    long_description_content_type='text/markdown',
+    author='Harry Julian',
+    author_email='[email protected]',
+    packages=find_packages(),
+    install_requires=[
+        'librosa',
+        'soundfile',
+        'numpy>=2.0.2',
+        'omegaconf>=2.3.0',
+        'torch>=2.5.1',
+        'torchaudio>=2.5.1',
+        'torchao>=0.5.0',
+        'torchtune>=0.3.1',
+        'vector-quantize-pytorch>=1.17.8',
+        'rotary-embedding-torch>=0.8.4',
+        'transformers>=4.44.2',
+        'boto3>1.0',
+        'tqdm',
+    ],
+    classifiers=[
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.10',
+    ],
+)

tests/__init__.py ADDED Viewed

File without changes

tests/test_neucodec.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import pytest
+import torch
+import torchaudio
+import librosa
+from xcodec2 import XCodec2, MiniXCodec2Encoder
+@pytest.fixture
+def model_16khz():
+    return XCodec2.from_cache("16khz")
+@pytest.fixture
+def model_24khz():
+    return XCodec2.from_cache("24khz")
+@pytest.fixture
+def model_asr_encoder():
+    return MiniXCodec2Encoder.from_cache()
+@pytest.fixture
+def example_audio():
+    y, sr = torchaudio.load(librosa.ex("libri1"))
+    return y, sr
+@pytest.fixture
+def example_fpath():
+    return librosa.ex("libri1")
+@pytest.fixture
+def batch_fpaths():
+    return [librosa.ex("libri1"), librosa.ex("libri2")]
+def load_and_validate_audio(save_path, sample_rate):
+    _, sr = torchaudio.load(save_path)
+    assert sr == sample_rate
+def test_16khz_autoencode(example_fpath, tmp_path, model_16khz):
+    save_path = str(tmp_path / "0.wav")
+    model_16khz.autoencode(example_fpath, save_path)
+    load_and_validate_audio(save_path, 16_000)
+def test_24khz_autoencode(example_fpath, tmp_path, model_24khz):
+    save_path = str(tmp_path / "0.wav")
+    model_24khz.autoencode(example_fpath, save_path)
+    load_and_validate_audio(save_path, 24_000)
+def test_24khz_encode_decode_single(example_audio, model_24khz):
+    y, sr = example_audio
+    if sr != 16_000:
+        y = torchaudio.transforms.Resample(sr, 16_000)(y)
+        sr = 16_000
+    # encode
+    vq_codes = model_24khz.encode_code(y, sample_rate=sr)
+    assert isinstance(vq_codes, torch.Tensor)
+    assert vq_codes.dim() == 3  # [batch, channels, time]
+    # decode
+    reconstructed = model_24khz.decode_code(vq_codes)
+    assert isinstance(reconstructed, torch.Tensor)
+    assert reconstructed.dim() == 3  # [batch, channels, time]
+def test_24khz_batch_encode(batch_fpaths, model_24khz):
+    vq_codes_list, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=False)
+    assert isinstance(vq_codes_list, list)
+    assert token_durations is None
+    assert len(vq_codes_list) == 2
+    for codes in vq_codes_list:
+        assert isinstance(codes, torch.Tensor)
+        assert codes.dim() == 2  # [channels, time]
+def test_24khz_batch_encode_tensor(batch_fpaths, model_24khz):
+    vq_codes_tensor, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=True)
+    assert isinstance(vq_codes_tensor, torch.Tensor)
+    assert isinstance(token_durations, list)
+    assert vq_codes_tensor.dim() == 3  # [batch, channels, time]
+    assert len(token_durations) == 2
+    assert len(set(token_durations)) == 2 # ensure we get two different durations back
+def test_24khz_batch_decode(batch_fpaths, model_24khz):
+    vq_codes_tensor, token_durations = model_24khz.batch_encode(batch_fpaths, return_tensor=True)
+    reconstructed_list = model_24khz.batch_decode(vq_codes_tensor, token_durations)
+    assert isinstance(reconstructed_list, list)
+    assert len(reconstructed_list) == 2
+    for recon in reconstructed_list:
+        assert isinstance(recon, torch.Tensor)
+        assert recon.dim() == 2  # [channels, time]
+def test_24khz_batch_decode_list_input(batch_fpaths, model_24khz):
+    vq_codes_list, _ = model_24khz.batch_encode(batch_fpaths, return_tensor=False)
+    reconstructed_list = model_24khz.batch_decode(vq_codes_list)
+    assert isinstance(reconstructed_list, list)
+    assert len(reconstructed_list) == 2
+    for recon in reconstructed_list:
+        assert isinstance(recon, torch.Tensor)
+        assert recon.dim() == 2  # [channels, time]
+def test_24khz_batch_autoencode(batch_fpaths, tmp_path, model_24khz):
+    output_paths = [str(tmp_path / f"{i}.wav") for i in range(len(batch_fpaths))]
+    reconstructed_list = model_24khz.batch_autoencode(batch_fpaths, output_paths)
+    assert isinstance(reconstructed_list, list)
+    assert len(reconstructed_list) == 2
+    for i, output_path in enumerate(output_paths):
+        load_and_validate_audio(output_path, 24_000)
+def test_asr_encoder_encode(example_audio, model_asr_encoder):
+    y, sr = example_audio
+    if sr != model_asr_encoder.sample_rate:
+        y = torchaudio.transforms.Resample(sr, model_asr_encoder.sample_rate)(y)
+    vq_codes = model_asr_encoder.encode_code(y)
+    assert isinstance(vq_codes, torch.Tensor)
+    assert vq_codes.dim() == 3