import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.init as init
import torch.nn.functional as F


class UniDeepFsmn(nn.Module):

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None, dropout_p=0.1):
        super(UniDeepFsmn, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            return
        self.lorder = lorder
        self.rorder = lorder
        self.hidden_size = hidden_size
        self.linear = nn.Linear(input_dim, hidden_size)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.conv1 = nn.Conv2d(input_dim, output_dim, [self.lorder + self.rorder - 1, 1], [1, 1], groups=input_dim, bias=False)
        self.norm = nn.LayerNorm(input_dim)
        self.dropout = nn.Dropout(p=dropout_p)
        self.swish = Swish()

    def forward(self, input):
        # input: batch (b) x sequence (T) x feature (h)
        f1 = self.swish(self.linear(self.norm(input)))
        p1 = self.project(f1)
        x = torch.unsqueeze(p1, 1)
        # x: batch (b) x channel (c) x sequence (T) x feature (h)
        x_per = x.permute(0, 3, 2, 1)
        # x_per: batch (b) x feature (h) x sequence (T) x channel (c)
        y = F.pad(x_per, [0, 0, self.lorder - 1, self.rorder - 1])
        out = x_per + self.conv1(y)
        out1 = out.permute(0, 3, 2, 1)
        # out1: batch (b) x channel (c) x sequence (T) x feature (h)
        # squeeze only the singleton channel dimension so a batch of size 1 is preserved
        return input + out1.squeeze(1)
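

# Illustrative usage sketch (an assumption added for documentation, not part of the original
# module): with input_dim == output_dim the FSMN block is a shape-preserving residual map
# over (batch, sequence, feature) tensors.
def _example_unideepfsmn():
    fsmn = UniDeepFsmn(input_dim=64, output_dim=64, lorder=20, hidden_size=128)
    x = torch.randn(4, 100, 64)        # batch x sequence x feature
    y = fsmn(x)
    assert y.shape == x.shape          # the memory block keeps the input shape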


class GlobalLayerNorm(nn.Module):
    """Calculate Global Layer Normalization.

    Arguments
    ---------
    dim : int
        Number of channels to normalize over (size of the learnable affine parameters).
    shape : int
        Number of dimensions of the expected input (3 or 4).
    eps : float
        A value added to the denominator for numerical stability.
    elementwise_affine : bool
        If True, this module has learnable per-element affine parameters
        initialized to ones (for weights) and zeros (for biases).

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> GLN = GlobalLayerNorm(10, 3)
    >>> x_norm = GLN(x)
    """
    def __init__(self, dim, shape, eps=1e-8, elementwise_affine=True):
        super(GlobalLayerNorm, self).__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if self.elementwise_affine:
            if shape == 3:
                self.weight = nn.Parameter(torch.ones(self.dim, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1))
            if shape == 4:
                self.weight = nn.Parameter(torch.ones(self.dim, 1, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1, 1))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].
        """
        # x: N x C x K x S or N x C x L
        # gln: mean, var are N x 1 x 1 (cln would use N x 1 x K x S)
        if x.dim() == 3:
            mean = torch.mean(x, (1, 2), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)

        if x.dim() == 4:
            mean = torch.mean(x, (1, 2, 3), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2, 3), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)
        return x
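

# Illustrative sketch (an assumption, not from the original file): global layer norm pools
# statistics over every dimension except the batch, so a 4-D [N, C, K, S] input is
# normalized with a single mean/variance per example.
def _example_global_layer_norm():
    gln = GlobalLayerNorm(10, shape=4)
    x = torch.randn(5, 10, 20, 30)     # N x C x K x S
    x_norm = gln(x)
    assert x_norm.shape == x.shape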


class CumulativeLayerNorm(nn.LayerNorm):
    """Calculate Cumulative Layer Normalization.

    Arguments
    ---------
    dim : int
        Dimension that you want to normalize.
    elementwise_affine : bool
        If True, uses learnable per-element affine parameters.

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> CLN = CumulativeLayerNorm(10)
    >>> x_norm = CLN(x)
    """
    def __init__(self, dim, elementwise_affine=True):
        super(CumulativeLayerNorm, self).__init__(
            dim, elementwise_affine=elementwise_affine, eps=1e-8
        )

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].
        """
        # x: N x C x K x S or N x C x L
        if x.dim() == 4:
            x = x.permute(0, 2, 3, 1).contiguous()
            # N x K x S x C == channel-only norm
            x = super().forward(x)
            # back to N x C x K x S
            x = x.permute(0, 3, 1, 2).contiguous()
        if x.dim() == 3:
            x = torch.transpose(x, 1, 2)
            # N x L x C == channel-only norm
            x = super().forward(x)
            # back to N x C x L
            x = torch.transpose(x, 1, 2)
        return x


def select_norm(norm, dim, shape):
    """Just a wrapper to select the normalization type."""
    if norm == "gln":
        return GlobalLayerNorm(dim, shape, elementwise_affine=True)
    if norm == "cln":
        return CumulativeLayerNorm(dim, elementwise_affine=True)
    if norm == "ln":
        return nn.GroupNorm(1, dim, eps=1e-8)
    else:
        return nn.BatchNorm1d(dim)
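

# Illustrative sketch (an assumption, not from the original file): select_norm returns one
# of the norm modules above; any unrecognized name falls through to BatchNorm1d.
def _example_select_norm():
    x = torch.randn(5, 10, 20)                       # N x C x L
    for name in ("gln", "cln", "ln", "bn"):
        norm = select_norm(name, dim=10, shape=3)
        assert norm(x).shape == x.shape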


class Swish(nn.Module):
    """
    Swish is a smooth, non-monotonic activation, swish(x) = x * sigmoid(x). It consistently
    matches or outperforms ReLU on deep networks across a variety of challenging domains
    such as image classification and machine translation.
    """
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs * inputs.sigmoid()
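

# Illustrative note (an assumption, not from the original file): this is the beta = 1
# Swish variant, numerically identical to torch.nn.functional.silu.
def _example_swish():
    x = torch.randn(8, 16)
    assert torch.allclose(Swish()(x), F.silu(x))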


class GLU(nn.Module):
    """
    The gating mechanism called Gated Linear Units (GLU) was first introduced for natural
    language processing in the paper "Language Modeling with Gated Convolutional Networks".
    The input is split in half along `dim`; one half gates the other through a sigmoid.
    """
    def __init__(self, dim: int) -> None:
        super(GLU, self).__init__()
        self.dim = dim

    def forward(self, inputs: Tensor) -> Tensor:
        outputs, gate = inputs.chunk(2, dim=self.dim)
        return outputs * gate.sigmoid()
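

# Illustrative sketch (an assumption, not from the original file): GLU halves the gated
# dimension, which is why the pointwise convolution before it doubles the channel count.
def _example_glu():
    x = torch.randn(4, 32, 100)        # batch x (2 * channels) x time
    y = GLU(dim=1)(x)
    assert y.shape == (4, 16, 100)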


class Transpose(nn.Module):
    """Wrapper class of torch.transpose() for Sequential modules."""
    def __init__(self, shape: tuple):
        super(Transpose, self).__init__()
        self.shape = shape

    def forward(self, x: Tensor) -> Tensor:
        return x.transpose(*self.shape)


class Linear(nn.Module):
    """
    Wrapper class of torch.nn.Linear.
    Weights are initialized with Xavier uniform initialization and biases are initialized to zeros.
    """
    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super(Linear, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        init.xavier_uniform_(self.linear.weight)
        if bias:
            init.zeros_(self.linear.bias)

    def forward(self, x: Tensor) -> Tensor:
        return self.linear(x)


class DepthwiseConv1d(nn.Module):
    """
    When groups == in_channels and out_channels == K * in_channels, where K is a positive integer,
    this operation is termed depthwise convolution in the literature.

    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: False

    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing the input vector

    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by the depthwise 1-D convolution.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = False,
    ) -> None:
        super(DepthwiseConv1d, self).__init__()
        assert out_channels % in_channels == 0, "out_channels should be a constant multiple of in_channels"
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=in_channels,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)
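

# Illustrative sketch (an assumption, not from the original file): with groups == in_channels
# each input channel is filtered independently, so a depthwise conv uses far fewer weights
# than a dense Conv1d with the same kernel size.
def _example_depthwise_conv1d():
    depthwise = DepthwiseConv1d(64, 64, kernel_size=31, padding=15)
    dense = nn.Conv1d(64, 64, kernel_size=31, padding=15, bias=False)
    x = torch.randn(4, 64, 100)        # batch x channels x time
    assert depthwise(x).shape == x.shape
    assert depthwise.conv.weight.numel() == dense.weight.numel() // 64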


class DepthwiseConv2d(nn.Module):
    """
    When groups == in_channels and out_channels == K * in_channels, where K is a positive integer,
    this operation is termed depthwise convolution in the literature. This variant uses a 2-D
    kernel of size (2 * kernel_size - 1, 1) over a (batch, channels, time, 1) view of the input
    and adds a residual connection in the forward pass.

    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: False

    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing the input vector

    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by the depthwise convolution.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = False,
    ) -> None:
        super(DepthwiseConv2d, self).__init__()
        assert out_channels % in_channels == 0, "out_channels should be a constant multiple of in_channels"
        self.lorder = kernel_size
        self.rorder = self.lorder
        self.conv = nn.Conv2d(in_channels, out_channels, [self.lorder + self.rorder - 1, 1], [1, 1], groups=in_channels, bias=False)
        '''
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=in_channels,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        '''

    def forward(self, inputs: Tensor) -> Tensor:
        # inputs: batch x feature x sequence
        x = torch.unsqueeze(inputs, -1)
        # x: batch x feature x sequence x 1
        y = F.pad(x, [0, 0, self.lorder - 1, self.rorder - 1])
        out = x + self.conv(y)
        return out.squeeze(-1)
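

# Illustrative sketch (an assumption, not from the original file): the two-sided padding of
# (kernel_size - 1) frames on each end exactly offsets the (2 * kernel_size - 1)-tap kernel,
# so the residual depthwise convolution preserves the sequence length.
def _example_depthwise_conv2d():
    conv = DepthwiseConv2d(in_channels=64, out_channels=64, kernel_size=20)
    x = torch.randn(4, 64, 100)        # batch x feature x sequence
    assert conv(x).shape == x.shape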


class PointwiseConv1d(nn.Module):
    """
    A conv1d with kernel size 1 is termed pointwise convolution in the literature.
    This operation is often used to match dimensions.

    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: True

    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing the input vector

    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by the pointwise 1-D convolution.
    """
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        stride: int = 1,
        padding: int = 0,
        bias: bool = True,
    ) -> None:
        super(PointwiseConv1d, self).__init__()
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)
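

# Illustrative sketch (an assumption, not from the original file): a pointwise convolution
# mixes channels per time step, e.g. to double the channel count ahead of a GLU.
def _example_pointwise_conv1d():
    conv = PointwiseConv1d(in_channels=64, out_channels=128)
    x = torch.randn(4, 64, 100)        # batch x channels x time
    assert conv(x).shape == (4, 128, 100)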


class ConvModule(nn.Module):
    """
    Modified from the Conformer convolution module.

    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 31
        dropout_p (float, optional): Probability of dropout

    Inputs: inputs
        inputs (batch, time, dim): Tensor containing the input sequences

    Outputs: outputs
        outputs (batch, time, dim): Tensor produced by the convolution module.
    """
    def __init__(
        self,
        in_channels: int,
        kernel_size: int = 31,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
    ) -> None:
        super(ConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, only supports expansion_factor 2"
        self.sequential = nn.Sequential(
            Transpose(shape=(1, 2)),
            DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs + self.sequential(inputs).transpose(1, 2)
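

# Illustrative sketch (an assumption, not from the original file): the module transposes to
# (batch, dim, time), applies a 'SAME'-padded depthwise convolution, transposes back, and
# adds the result to the input as a residual.
def _example_conv_module():
    module = ConvModule(in_channels=64, kernel_size=31)
    x = torch.randn(4, 100, 64)        # batch x time x dim
    assert module(x).shape == x.shape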


class ConvModule_Gating(nn.Module):
    """
    Modified from the Conformer convolution module; the depthwise branch gates the input
    multiplicatively instead of being added as a residual.

    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 21
        dropout_p (float, optional): Probability of dropout

    Inputs: inputs
        inputs (batch, time, dim): Tensor containing the input sequences

    Outputs: outputs
        outputs (batch, time, dim): Tensor produced by the gating convolution module.
    """
    def __init__(
        self,
        in_channels: int,
        kernel_size: int = 21,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
    ) -> None:
        super(ConvModule_Gating, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, only supports expansion_factor 2"
        self.sequential = nn.Sequential(
            Transpose(shape=(1, 2)),
            DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs * self.sequential(inputs).transpose(1, 2)


class Conformer_ConvModule(nn.Module):
    """
    The Conformer convolution module starts with a pointwise convolution and a gated linear
    unit (GLU), followed by a single 1-D depthwise convolution layer. Batch normalization is
    applied just after the depthwise convolution to aid training of deep models.

    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 21
        dropout_p (float, optional): Probability of dropout

    Inputs: inputs
        inputs (batch, dim, time): Tensor containing the input sequences

    Outputs: outputs
        outputs (batch, dim, time): Tensor produced by the conformer convolution module.
    """
    def __init__(
        self,
        in_channels: int,
        kernel_size: int = 21,
        expansion_factor: int = 2,
        dropout_p: float = 0.1,
    ) -> None:
        super(Conformer_ConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, only supports expansion_factor 2"
        self.sequential = nn.Sequential(
            select_norm('ln', in_channels, 3),
            PointwiseConv1d(in_channels, in_channels * expansion_factor, stride=1, padding=0, bias=True),
            GLU(dim=1),
            DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
            select_norm('bn', in_channels, 3),
            Swish(),
            PointwiseConv1d(in_channels, in_channels, stride=1, padding=0, bias=True),
            nn.Dropout(p=dropout_p),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs + self.sequential(inputs)
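

# Illustrative sketch (an assumption, not from the original file): unlike ConvModule above,
# this block expects channel-first input (batch, dim, time) and applies the full Conformer
# chain (norm -> pointwise -> GLU -> depthwise -> batch norm -> Swish -> pointwise -> dropout)
# with a residual connection.
def _example_conformer_conv_module():
    module = Conformer_ConvModule(in_channels=64, kernel_size=21)
    x = torch.randn(4, 64, 100)        # batch x dim x time
    assert module(x).shape == x.shape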


class FeedForwardModule(nn.Module):
    """
    The Conformer feed-forward module follows pre-norm residual units and applies layer
    normalization within the residual unit and on the input before the first linear layer.
    It also applies Swish activation and dropout, which helps regularize the network.

    Args:
        encoder_dim (int): Dimension of the conformer encoder
        expansion_factor (int): Expansion factor of the feed forward module
        dropout_p (float): Ratio of dropout

    Inputs: inputs
        - **inputs** (batch, time, dim): Tensor containing the input sequences

    Outputs: outputs
        - **outputs** (batch, time, dim): Tensor produced by the feed forward module.
    """
    def __init__(
        self,
        encoder_dim: int = 512,
        expansion_factor: int = 4,
        dropout_p: float = 0.1,
    ) -> None:
        super(FeedForwardModule, self).__init__()
        self.sequential = nn.Sequential(
            nn.LayerNorm(encoder_dim),
            Linear(encoder_dim, encoder_dim * expansion_factor, bias=True),
            Swish(),
            nn.Dropout(p=dropout_p),
            Linear(encoder_dim * expansion_factor, encoder_dim, bias=True),
            nn.Dropout(p=dropout_p),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.sequential(inputs)
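

# Illustrative sketch (an assumption, not from the original file): the feed-forward module
# expands the model dimension by expansion_factor and projects back, so it preserves the
# (batch, time, dim) shape; note that any residual connection is applied by the caller.
def _example_feed_forward_module():
    module = FeedForwardModule(encoder_dim=64, expansion_factor=4)
    x = torch.randn(4, 100, 64)        # batch x time x dim
    assert module(x).shape == x.shape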