GokseninYuksel committed
Commit f0e612b · verified · 1 Parent(s): 83c8468

Upload model

Files changed (12)
  1. Patcher.py +103 -0
  2. config.json +35 -0
  3. configuration_gramt_binaural_time.py +49 -0
  4. droppath.py +41 -0
  5. model.py +309 -0
  6. model.safetensors +3 -0
  7. modeling_gramt_binaural_time.py +41 -0
  8. mwmae.py +434 -0
  9. patching_utils.py +126 -0
  10. pos_embed.py +210 -0
  11. swin.py +522 -0
  12. utils.py +249 -0
Patcher.py ADDED
@@ -0,0 +1,103 @@
from abc import ABC

from .patching_utils import combine_patches, generate_patches, get_shape


class PatchStrategy(ABC):
    def __init__(self, tstride, tshape, fstride, fshape, input_fdim, input_tdim):
        self.tstride = tstride
        self.tshape = tshape
        self.fstride = fstride
        self.fshape = fshape
        self.input_fdim = input_fdim
        self.input_tdim = input_tdim

    def _patch(self, x):
        patches = generate_patches(
            input=x,
            fstride=self.fstride,
            tstride=self.tstride,
            fshape=self.fshape,
            tshape=self.tshape,
        )
        return patches

    def patch(self, x):
        return self._patch(x)

    def embed(self, x, patch_embedder):
        return patch_embedder(x)

    def patch_and_embed(self, x, patch_embedder):
        """
        Generate patches from the input spectrogram and embed them.

        This method creates patches based on the frequency and temporal stride/shape
        parameters, and then applies the given patch embedding function.

        Parameters
        ----------
        x : torch.Tensor
            The input spectrogram tensor to be patched and embedded.
        patch_embedder : Callable
            A function that applies embedding to the patches.

        Returns
        -------
        Tuple[torch.Tensor, torch.Tensor]
            The generated patches and their embeddings.
        """
        # Generate patches of the raw input so they can serve as reconstruction targets.
        patches = generate_patches(
            input=x,
            fstride=self.fstride,
            tstride=self.tstride,
            fshape=self.fshape,
            tshape=self.tshape,
        )
        x = patch_embedder(x)
        return patches, x

    def get_patch_size(self):
        p_f_dim, p_t_dim = get_shape(
            fstride=self.fstride,
            tstride=self.tstride,
            input_fdim=self.input_fdim,
            input_tdim=self.input_tdim,
            fshape=self.fshape,
            tshape=self.tshape,
        )
        return p_f_dim, p_t_dim

    def combine_patches(self, patches, original_size):
        return combine_patches(
            patches, original_size, self.fstride, self.tstride, self.fshape, self.tshape
        )


class TimePatching(PatchStrategy):
    def __init__(
        self, input_tdim, tstride=2, tshape=2, fstride=128, fshape=128, input_fdim=128
    ):
        super().__init__(
            tstride=tstride,
            tshape=tshape,
            fstride=fstride,
            fshape=fshape,
            input_fdim=input_fdim,
            input_tdim=input_tdim,
        )


class FramePatching(PatchStrategy):
    def __init__(
        self, input_tdim, tstride=16, tshape=16, fstride=16, fshape=16, input_fdim=128
    ):
        super().__init__(
            tstride=tstride,
            tshape=tshape,
            fstride=fstride,
            fshape=fshape,
            input_fdim=input_fdim,
            input_tdim=input_tdim,
        )
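For orientation, a minimal sketch (not part of the upload, assuming the repository directory is on sys.path) of the patch-grid arithmetic that PatchStrategy.get_patch_size performs for the default binaural time patching:

import torch
from patching_utils import get_shape  # same helper Patcher.py uses internally

# 128-bin x 2-frame patches with stride (128, 2) over a 128-mel, 200-frame spectrogram.
f_dim, t_dim = get_shape(fstride=128, tstride=2, input_fdim=128, input_tdim=200,
                         fshape=128, tshape=2)
print(f_dim, t_dim)        # 1 100
print(f_dim * t_dim)       # 100 patches -> GRAMT.num_patches for this configuration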
config.json ADDED
@@ -0,0 +1,35 @@
{
  "architectures": [
    "GRAMTBinauralTimeModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_gramt_binaural_time.GRAMTBinauralTimeConfig",
    "AutoModel": "modeling_gramt_binaural_time.GRAMTBinauralTimeModel"
  },
  "decoder_depth": 8,
  "decoder_embedding_dim": 512,
  "decoder_mlp_ratio": 4.0,
  "decoder_num_heads": 8,
  "decoder_window_sizes": [
    2,
    5,
    10,
    25,
    50,
    0,
    0,
    0
  ],
  "encoder_attention_dropout": 0.0,
  "encoder_dropout": 0.0,
  "encoder_hidden_dim": 768,
  "encoder_mlp_ratio": 4.0,
  "encoder_norm_layer_eps": 1e-06,
  "encoder_num_heads": 12,
  "encoder_num_layers": 12,
  "input_length": 200,
  "model_type": "gramt-binaural-time",
  "num_mel_bins": 128,
  "torch_dtype": "float32",
  "transformers_version": "4.46.3"
}
configuration_gramt_binaural_time.py ADDED
@@ -0,0 +1,49 @@
from transformers import PretrainedConfig
from typing import List


class GRAMTBinauralTimeConfig(PretrainedConfig):
    model_type = "gramt-binaural-time"
    model_size = "base"
    in_channels: int = 2
    patch_size = (128, 2)
    frequency_stride = 128
    time_stride = 2

    def __init__(
        self,
        decoder_mlp_ratio: float = 4.0,
        decoder_depth: int = 8,
        decoder_num_heads: int = 8,
        decoder_embedding_dim: int = 512,
        decoder_window_sizes: List[int] = [2, 5, 10, 25, 50, 0, 0, 0],
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_hidden_dim=768,
        encoder_mlp_ratio=4.0,
        encoder_dropout=0.0,
        encoder_attention_dropout=0.0,
        encoder_norm_layer_eps=1e-6,
        input_length=200,
        num_mel_bins=128,
        **kwargs,
    ):
        self.decoder_mlp_ratio = decoder_mlp_ratio
        self.decoder_depth = decoder_depth
        self.decoder_num_heads = decoder_num_heads
        self.decoder_embedding_dim = decoder_embedding_dim
        self.decoder_window_sizes = decoder_window_sizes

        self.encoder_num_layers = encoder_num_layers
        self.encoder_num_heads = encoder_num_heads
        self.encoder_hidden_dim = encoder_hidden_dim
        self.encoder_mlp_ratio = encoder_mlp_ratio
        self.encoder_dropout = encoder_dropout
        self.encoder_attention_dropout = encoder_attention_dropout
        self.encoder_norm_layer_eps = encoder_norm_layer_eps

        self.input_length = input_length
        self.num_mel_bins = num_mel_bins
        super().__init__(**kwargs)
droppath.py ADDED
@@ -0,0 +1,41 @@
"""
Implementation of DropPath (Stochastic Depth) regularization.

Inspired by the PyTorch implementation in timm (https://github.com/rwightman/pytorch-image-models)
by Ross Wightman, 2022
"""

import torch
import torch.nn as nn


def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
    changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
    'survival rate' as the argument.
    """
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (
        x.ndim - 1
    )  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    output = x.div(keep_prob) * random_tensor
    return output


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=0.0):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x, training=True):
        return drop_path(x, self.drop_prob, training)
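A quick behavioural sketch of the scaling above (illustrative only, assuming the repository directory is on sys.path): each sample's residual branch is zeroed with probability drop_prob or scaled by 1/keep_prob otherwise, so the expectation is preserved, and the module is a no-op outside training.

import torch
from droppath import DropPath

dp = DropPath(drop_prob=0.5)
x = torch.ones(8, 4, 16)

out_eval = dp(x, training=False)
print(torch.equal(out_eval, x))                       # True: identity when not training

out_train = dp(x, training=True)
# Each sample is either all zeros or scaled by 1 / keep_prob = 2.0
print(sorted(out_train[:, 0, 0].unique().tolist()))   # a subset of [0.0, 2.0]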
model.py ADDED
@@ -0,0 +1,309 @@
import torch
from torch import nn


from .Patcher import PatchStrategy
from .mwmae import MWMHABlock
from .pos_embed import get_2d_sincos_pos_embed
from .utils import PatchEmbed, create_pretrained_model, repeat_token

from einops import rearrange


def conv3x3(in_channels, out_channels, stride=1):
    "3x3 convolution with padding"
    return nn.Conv2d(
        in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False
    )


class GRAMT(nn.Module):
    def __init__(
        self,
        model_size="base",
        in_channels=2,
        decoder_mlp_ratio: float = 4.0,
        decoder_depth: int = 8,
        decoder_num_heads: int = 8,
        decoder_embedding_dim: int = 512,
        decoder_window_sizes: list[int] = [2, 5, 10, 25, 50, 100, 0, 0],
        encoder_num_layers=12,
        encoder_num_heads=12,
        encoder_hidden_dim=768,
        encoder_mlp_ratio=4.0,
        encoder_dropout=0.0,
        encoder_attention_dropout=0.0,
        encoder_norm_layer_eps=1e-6,
        patch_size=(16, 8),
        frequency_stride=16,
        time_stride=8,
        input_length=200,
        num_mel_bins=128,
        **kwargs,
    ):
        super().__init__()
        self.in_channels = in_channels
        self.input_length = input_length
        # Calculate intermediate shape after masking
        self.patch_strategy = PatchStrategy(
            tstride=time_stride,
            tshape=patch_size[1],
            fstride=frequency_stride,
            fshape=patch_size[0],
            input_fdim=num_mel_bins,
            input_tdim=self.input_length,
        )
        self.p_f_dim, self.p_t_dim = self.patch_strategy.get_patch_size()
        self.num_patches = self.p_f_dim * self.p_t_dim
        self.grid_size = (self.p_f_dim, self.p_t_dim)

        # This is our encoder.
        # --------------------------------------------------------------------------

        # Transformer
        (
            self.encoder,
            self.encoder_embedding_dim,
        ) = create_pretrained_model(
            model_size,
            encoder_num_layers=encoder_num_layers,
            encoder_num_heads=encoder_num_heads,
            encoder_hidden_dim=encoder_hidden_dim,
            encoder_mlp_dim=int(encoder_hidden_dim * encoder_mlp_ratio),
            encoder_dropout=encoder_dropout,
            encoder_attention_dropout=encoder_attention_dropout,
            encoder_norm_layer_eps=encoder_norm_layer_eps,
        )
        self.encoder_cls_token_num = 1

        # Patch Embedder
        self.patch_embed = PatchEmbed()
        self._update_patch_embed_layers(self.patch_embed)

        # Norm/Pos
        self.register_buffer(
            "cls_token",
            nn.Parameter(
                torch.zeros([1, 1, self.encoder_embedding_dim]), requires_grad=True
            ),
        )
        torch.nn.init.normal_(self.cls_token, std=0.02)

        # This is our decoder.
        # --------------------------------------------------------------------------
        # MAE decoder specifics
        self.decoder_depth = decoder_depth
        self.decoder_num_heads = decoder_num_heads
        self.decoder_embedding_dim = decoder_embedding_dim
        self.decoder_window_sizes = decoder_window_sizes
        self.decoder_embed = nn.Linear(
            self.encoder_embedding_dim, self.decoder_embedding_dim, bias=True
        )

        self.register_buffer(
            "mask_token",
            nn.Parameter(
                torch.zeros(1, 1, self.decoder_embedding_dim, requires_grad=True)
            ),
        )
        torch.nn.init.normal_(self.mask_token, std=0.02)
        self.decoder_blocks = nn.ModuleList(
            [
                MWMHABlock(
                    dim=decoder_embedding_dim,
                    num_heads=decoder_num_heads,
                    window_sizes=decoder_window_sizes,
                    shift_windows=False,
                    mlp_ratio=decoder_mlp_ratio,
                    qkv_bias=True,
                    norm_layer=nn.LayerNorm,
                )
                for i in range(self.decoder_depth)
            ]
        )
        # The decoder uses multi-window attention, so the CLS token is dropped in the decoder
        # (see pass_through_decoder, which reads this flag).
        self.use_mwmae_decoder = True
        cls_token_num = 0
        self.encoder.pos_embedding = self._get_pos_embed_params()
        # Pos Embed init w/o the cls token num
        self.register_buffer(
            "decoder_pos_embed",
            nn.Parameter(
                torch.zeros(1, self.num_patches, decoder_embedding_dim),
                requires_grad=False,
            ),
        )
        pos_embed = get_2d_sincos_pos_embed(
            decoder_embedding_dim, self.grid_size, cls_token_num=cls_token_num
        )
        self.decoder_pos_embed.data.copy_(
            torch.from_numpy(pos_embed).float().unsqueeze(0)
        )
        # Define prediction layers for Masked Auto Encoder pretraining
        self.spec_pred = nn.Sequential(
            nn.Linear(
                decoder_embedding_dim,
                self.patch_strategy.fshape
                * self.patch_strategy.tshape
                * self.in_channels,
                bias=True,
            ),
        )
        self.decoder_norm = nn.LayerNorm(decoder_embedding_dim)
        # Normalize binaural/ambisonic spectrograms with Layer norm later.
        self.spectrogram_normalize = nn.LayerNorm(
            [self.in_channels, num_mel_bins, self.input_length],
            elementwise_affine=False,
        )
        self.input_shape = [num_mel_bins, self.input_length]
        compile_modules = kwargs.get("compile_modules", None)
        if (compile_modules is not None) and (compile_modules):
            self._compile_operations()

    def _compile_operations(self):
        """
        Use torch.compile on the extractor, encoder and decoder blocks for a faster forward pass.
        """
        try:
            self.forward = torch.compile(
                self.get_audio_representation, mode="reduce-overhead"
            )
        except Exception as e:
            print(f"Warning: Could not compile operations: {e}")
            self.use_compiled_forward = False

    def _get_pos_embed_params(self):
        """Calculates the positional embedding parameters and returns them."""
        # Update positional embedding
        pos_embed = nn.Parameter(
            torch.zeros(
                1,
                self.num_patches + self.encoder_cls_token_num,
                self.encoder_embedding_dim,
            ),
            requires_grad=False,
        )
        pos_embed_data = get_2d_sincos_pos_embed(
            self.encoder_embedding_dim,
            self.grid_size,
            cls_token_num=self.encoder_cls_token_num,
        )
        pos_embed.data.copy_(torch.from_numpy(pos_embed_data).float().unsqueeze(0))
        return pos_embed

    def _update_patch_embed_layers(self, patch_embed):
        """Updates the patch embedding layers."""
        # Update patch projection layer
        # Use 2, as the spectrogram has 2 channels
        patch_embed.proj = torch.nn.Conv2d(
            self.in_channels,
            self.encoder_embedding_dim,
            kernel_size=(self.patch_strategy.fshape, self.patch_strategy.tshape),
            stride=(self.patch_strategy.fstride, self.patch_strategy.tstride),
        )
        patch_embed.num_patch = self.num_patches

    def pass_through_encoder(self, x, non_mask_index, B):
        """Passes the input through the Encoder Transformer network."""
        # Add positional embeddings to the x.
        x = x + self.encoder.pos_embedding[:, self.encoder_cls_token_num :, :]
        x = x[non_mask_index, :].reshape((B, -1, x.shape[-1]))
        cls_token = (
            self.cls_token.expand(B, -1, -1)
            + self.encoder.pos_embedding[:, :1, :]
        )

        try:
            dist_token = (
                self.encoder.dist_token.expand(B, -1, -1)
                + self.encoder.pos_embedding[:, 1:2, :]
            )
            x = torch.cat((cls_token, dist_token, x), dim=1)

        except Exception as e:
            x = torch.cat((cls_token, x), dim=1)

        x = self.encoder.dropout(x)
        for block in self.encoder.layers:
            x = block(x)
        return self.encoder.ln(x)

    def pass_through_decoder(self, encoder_output, non_mask_index, B):
        encoder_output = self.decoder_embed(encoder_output)
        x_ = repeat_token(
            self.mask_token, (B, self.num_patches)
        ).type_as(encoder_output)
        x_[non_mask_index, :] = encoder_output[
            :, self.encoder_cls_token_num :, :
        ].reshape((-1, encoder_output.shape[-1]))
        x_ = x_.reshape((B, -1, encoder_output.shape[-1]))

        # Concatenate the CLS and possibly the distillation tokens from the encoder.
        # We cannot do that with multi-windowed attention, though,
        # so remove the CLS token from the decoder!
        if self.use_mwmae_decoder:
            x = x_
            return_cut = 0
        else:
            x = torch.cat(
                [encoder_output[:, : self.encoder_cls_token_num, :], x_], dim=1
            )
            return_cut = self.encoder_cls_token_num
        x = x + self.decoder_pos_embed  # add the pos embeds
        # Pass through transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)
        x = self.decoder_norm(x)
        pred = self.spec_pred(x)
        pred = pred[:, return_cut:, :]
        return pred

    def _get_segment_representation(self, x, strategy="mean"):
        """Extract audio representation using different strategies."""
        # Put the model in eval mode when getting representations.
        assert x.shape[1] == self.in_channels, (
            f"GRAMT expects {self.in_channels} input channels, but the feature has "
            f"shape {x.shape}, whose channel dimension is incompatible"
        )
        B = x.shape[0]
        x = x.transpose(2, 3)
        x = self.spectrogram_normalize(x)
        patches = self.patch_strategy.patch(x)
        patches = patches.flatten(2)
        encoded_patches = self.patch_strategy.embed(x, self.patch_embed)
        mask = torch.zeros((B, self.num_patches), dtype=torch.bool, device=x.device)
        x = self.pass_through_encoder(encoded_patches, ~mask, B)
        if strategy == "mean":
            return x[:, self.encoder_cls_token_num :, :].mean(axis=1)
        elif strategy == "sum":
            return x[:, self.encoder_cls_token_num :, :].sum(axis=1)
        elif strategy == "cls":
            return x[:, 0, :]
        elif strategy == "raw":
            x = x[:, self.encoder_cls_token_num :, :]
            grid_size = self.grid_size
            f, t = grid_size
            # We have 25 time patches in 2 second audio. We need to have 20 for STARSS22.
            outcome = rearrange(
                x, "b (f t) d -> b t (f d)", f=f, d=self.encoder_embedding_dim
            )
            return outcome
        else:
            raise ValueError(f"Strategy '{strategy}' is unrecognized.")

    def get_audio_representation(self, x, strategy="mean"):
        unit_frames = self.input_length
        cur_frames = x.shape[2]
        pad_frames = unit_frames - (cur_frames % unit_frames)
        if pad_frames > 0:
            # Pad with constant 0s along the time dimension:
            # (last dim left, last dim right, time before, time after)
            pad_arg = (
                0,
                0,
                0,
                pad_frames,
            )
            x = torch.nn.functional.pad(x, pad_arg, mode="constant")

        embeddings = []
        # Now get the embeddings of the model.
        for i in range(x.shape[2] // unit_frames):
            x_inp = x[:, :, i * unit_frames : (i + 1) * unit_frames, :]
            with torch.no_grad():
                embedding = self._get_segment_representation(
                    x_inp, strategy=strategy
                )
            embeddings.append(embedding)
        # Stack the embeddings here if it is raw
        if strategy == "raw":
            x = torch.hstack(embeddings)
            pad_emb_frames = int(embeddings[0].shape[1] * pad_frames / unit_frames)
            if pad_emb_frames > 0:
                x = x[:, :-pad_emb_frames]  # remove padded tail
            return x
        else:
            x = torch.stack(embeddings, dim=1)
            return x
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d5b20b8decda54204192d9d7eae5fdd70e93bb1071e3d4cdebf261fd7e7d160
size 446080184
modeling_gramt_binaural_time.py ADDED
@@ -0,0 +1,41 @@
from transformers import PreTrainedModel
from transformers import AutoConfig, AutoModel

from .model import GRAMT
from .configuration_gramt_binaural_time import GRAMTBinauralTimeConfig


class GRAMTBinauralTimeModel(PreTrainedModel):
    config_class = GRAMTBinauralTimeConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = GRAMT(
            in_channels=config.in_channels,
            decoder_mlp_ratio=config.decoder_mlp_ratio,
            decoder_depth=config.decoder_depth,
            decoder_num_heads=config.decoder_num_heads,
            decoder_embedding_dim=config.decoder_embedding_dim,
            decoder_window_sizes=config.decoder_window_sizes,
            encoder_num_layers=config.encoder_num_layers,
            encoder_num_heads=config.encoder_num_heads,
            encoder_hidden_dim=config.encoder_hidden_dim,
            encoder_mlp_ratio=config.encoder_mlp_ratio,
            encoder_dropout=config.encoder_dropout,
            encoder_attention_dropout=config.encoder_attention_dropout,
            encoder_norm_layer_eps=config.encoder_norm_layer_eps,
            patch_size=config.patch_size,
            frequency_stride=config.frequency_stride,
            time_stride=config.time_stride,
            # GRAMT expects input_length; the config defines input_length rather than max_length.
            input_length=config.input_length,
            num_mel_bins=config.num_mel_bins,
        )

    def forward(self, tensor, strategy="raw"):
        return self.model.get_audio_representation(tensor, strategy=strategy)


gram = GRAMTBinauralTimeModel(GRAMTBinauralTimeConfig())
AutoConfig.register("gramt-binaural-time", GRAMTBinauralTimeConfig)
AutoModel.register(GRAMTBinauralTimeConfig, GRAMTBinauralTimeModel)
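For reference, a minimal usage sketch. The repo id below is a placeholder, and the input layout (batch, 2 channels, time frames, 128 mel bins) is inferred from GRAMT.get_audio_representation, which pads and slices dimension 2 into 200-frame segments; the log-mel feature extraction itself is not part of this upload.

import torch
from transformers import AutoModel

# Placeholder repo id; trust_remote_code is needed because the model code ships with the checkpoint.
model = AutoModel.from_pretrained("<user>/<gramt-binaural-time-repo>", trust_remote_code=True)
model.eval()

# Dummy binaural log-mel spectrogram: (batch, channels=2, time_frames, mel_bins=128)
spec = torch.randn(1, 2, 200, 128)

with torch.no_grad():
    emb = model(spec)   # forward defaults to strategy="raw"
print(emb.shape)        # per-time-patch embeddings, e.g. (1, 100, 768) for a 200-frame input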
mwmae.py ADDED
@@ -0,0 +1,434 @@
import collections.abc
from itertools import repeat

import torch
import torch.nn as nn
import torch.nn.functional as F

from .droppath import DropPath


def constant_init(tensor, constant=0.0):
    nn.init.constant_(tensor, constant)
    return tensor


def _ntuple(n):
    def parse(x):
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return x
        return tuple(repeat(x, n))

    return parse


class Mlp(nn.Module):
    def __init__(
        self,
        in_features=None,
        hidden_features=None,
        out_features=None,
        activation=F.gelu,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = activation
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x, train: bool = True):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x) if train else x
        x = self.fc2(x)
        x = self.drop(x) if train else x
        return x


class Attention(nn.Module):
    """
    Default multihead attention
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.xavier_uniform_(self.proj.weight)

    def forward(self, x, train: bool = True):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn) if train else attn

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x) if train else x
        return x


def window_partition1d(x, window_size):
    B, W, C = x.shape
    x = x.view(B, W // window_size, window_size, C)
    windows = x.view(-1, window_size, C)
    return windows


def window_reverse1d(windows, window_size, W: int):
    B = int(windows.shape[0] / (W / window_size))
    x = windows.view(B, W // window_size, window_size, -1)
    x = x.view(B, W, -1)
    return x


def get_relative_position_index1d(win_w):
    # get pair-wise relative position index for each token inside the window
    coords = torch.stack(torch.meshgrid(torch.arange(win_w)))

    relative_coords = coords[:, :, None] - coords[:, None, :]  # 1, Ww, Ww
    relative_coords = relative_coords.permute(1, 2, 0)  # Ww, Ww, 1

    relative_coords[:, :, 0] += win_w - 1  # shift to start from 0

    return relative_coords.sum(-1)  # Ww*Ww


class WindowedAttentionHead(nn.Module):
    def __init__(self, head_dim, window_size, shift_windows=False, attn_drop=0.0):
        super().__init__()
        self.head_dim = head_dim
        self.window_size = window_size
        self.shift_windows = shift_windows
        self.attn_drop = attn_drop

        self.scale = self.head_dim**-0.5
        self.window_area = self.window_size * 1

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size - 1, 1))
        )
        nn.init.trunc_normal_(self.relative_position_bias_table, std=0.02)

        # Get relative position index
        self.register_buffer(
            "relative_position_index", get_relative_position_index1d(window_size)
        )

        self.drop_layer = nn.Dropout(attn_drop) if attn_drop > 0 else None

        if shift_windows:
            self.shift_size = window_size // 2
        else:
            self.shift_size = 0
        assert 0 <= self.shift_size < self.window_size, (
            "shift_size must be in [0, window_size)"
        )

    def forward(self, q, k, v, train: bool = True):
        B, W, C = q.shape

        mask = None
        if self.shift_size > 0:
            img_mask = torch.zeros((1, W, 1), device=q.device)
            cnt = 0
            for w in (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            ):
                img_mask[:, w, :] = cnt
                cnt += 1
            mask_windows = window_partition1d(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size)
            mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            mask = mask.masked_fill(mask != 0, -100.0).masked_fill(mask == 0, 0.0)

            q = torch.roll(q, shifts=-self.shift_size, dims=1)
            k = torch.roll(k, shifts=-self.shift_size, dims=1)
            v = torch.roll(v, shifts=-self.shift_size, dims=1)

        q = window_partition1d(q, self.window_size)
        k = window_partition1d(k, self.window_size)
        v = window_partition1d(v, self.window_size)

        attn = (q @ k.transpose(-2, -1)) * self.scale

        attn = attn + self._get_rel_pos_bias()

        if mask is not None:
            B_, N, _ = attn.shape
            num_win = mask.shape[0]
            attn = attn.view(B_ // num_win, num_win, N, N) + mask.unsqueeze(0)
            attn = attn.view(-1, N, N)
            attn = attn.softmax(dim=-1)
        else:
            attn = attn.softmax(dim=-1)

        if self.drop_layer is not None and train:
            attn = self.drop_layer(attn)

        x = attn @ v

        # merge windows
        shifted_x = window_reverse1d(x, self.window_size, W=W)

        if self.shift_size > 0:
            x = torch.roll(shifted_x, shifts=self.shift_size, dims=1)
        else:
            x = shifted_x

        return x, attn

    def _get_rel_pos_bias(self):
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(self.window_area, self.window_area, -1)  # Ww,Ww,1
        relative_position_bias = relative_position_bias.permute(2, 0, 1)  # 1, Ww, Ww
        return relative_position_bias


class AttentionHead(nn.Module):
    def __init__(self, head_dim, attn_drop=0.0):
        super().__init__()
        self.head_dim = head_dim
        self.scale = head_dim**-0.5
        self.drop_layer = nn.Dropout(attn_drop) if attn_drop > 0 else None

    def forward(self, q, k, v, train: bool = True):
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)

        if self.drop_layer is not None and train:
            attn = self.drop_layer(attn)

        x = attn @ v
        return x, attn


class WindowedMultiHeadAttention(nn.Module):
    def __init__(
        self,
        dim,
        window_sizes,
        shift_windows=False,
        num_heads=8,
        qkv_bias=False,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        nn.init.xavier_uniform_(self.qkv.weight)

        if isinstance(window_sizes, int):
            window_sizes = _ntuple(num_heads)(window_sizes)
        else:
            assert len(window_sizes) == num_heads

        self.attn_heads = nn.ModuleList()
        for i in range(num_heads):
            ws_i = window_sizes[i]
            if ws_i == 0:
                self.attn_heads.append(AttentionHead(self.head_dim, attn_drop))
            else:
                self.attn_heads.append(
                    WindowedAttentionHead(
                        self.head_dim,
                        window_size=ws_i,
                        shift_windows=shift_windows,
                        attn_drop=attn_drop,
                    )
                )

        self.proj = nn.Linear(dim, dim)
        nn.init.xavier_uniform_(self.proj.weight)
        self.drop_layer = nn.Dropout(proj_drop) if proj_drop > 0 else None

    def forward(self, x, train: bool = True):
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 3, 0, 1, 4)
        )
        q, k, v = qkv[0], qkv[1], qkv[2]

        o = []
        for i in range(self.num_heads):
            head_i, attn_i = self.attn_heads[i](q[i], k[i], v[i], train=train)
            o.append(head_i.unsqueeze(0))

        o = torch.cat(o, dim=0)
        o = o.permute(1, 2, 0, 3).reshape(B, N, -1)
        o = self.proj(o)

        if self.drop_layer is not None and train:
            o = self.drop_layer(o)

        return o


class LayerScale(nn.Module):
    def __init__(self, dim, init_values=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        return x * self.gamma


class BNWrapper(nn.Module):
    def __init__(
        self, num_features, use_running_average=True, use_bias=True, use_scale=True
    ):
        super().__init__()
        self.bn = nn.BatchNorm1d(num_features, affine=use_scale or use_bias)

    def forward(self, x, train=True):
        # BatchNorm1d takes no training flag; train/eval behaviour follows the module state.
        return self.bn(x)


class Block(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        init_values=None,
        drop_path=0.0,
        act_layer=F.gelu,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            out_features=dim,
            activation=act_layer,
            drop=drop,
        )

        self.init_values = init_values
        if init_values is not None:
            self.layer_scale1 = LayerScale(dim, init_values)
            self.layer_scale2 = LayerScale(dim, init_values)

    def forward(self, x, train: bool = True):
        outputs1 = self.attn(self.norm1(x), train=train)

        if self.init_values is not None:
            outputs1 = self.layer_scale1(outputs1)

        x = x + self.drop_path(outputs1) if train else x + outputs1

        outputs2 = self.mlp(self.norm2(x), train=train)

        if self.init_values is not None:
            outputs2 = self.layer_scale2(outputs2)

        x = x + self.drop_path(outputs2) if train else x + outputs2
        return x


class MWMHABlock(nn.Module):
    def __init__(
        self,
        dim,
        num_heads,
        window_sizes,
        shift_windows=False,
        mlp_ratio=4.0,
        qkv_bias=False,
        drop=0.0,
        attn_drop=0.0,
        init_values=None,
        drop_path=0.0,
        act_layer=F.gelu,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.wmha = WindowedMultiHeadAttention(
            dim,
            window_sizes=window_sizes,
            shift_windows=shift_windows,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            out_features=dim,
            activation=act_layer,
            drop=drop,
        )

        self.init_values = init_values
        if init_values is not None:
            self.layer_scale1 = LayerScale(dim, init_values)
            self.layer_scale2 = LayerScale(dim, init_values)

    def forward(self, x, train: bool = True):
        outputs1 = self.wmha(self.norm1(x), train=train)

        if self.init_values is not None:
            outputs1 = self.layer_scale1(outputs1)

        x = x + self.drop_path(outputs1) if train else x + outputs1

        outputs2 = self.mlp(self.norm2(x), train=train)

        if self.init_values is not None:
            outputs2 = self.layer_scale2(outputs2)

        x = x + self.drop_path(outputs2) if train else x + outputs2
        return x
patching_utils.py ADDED
@@ -0,0 +1,126 @@
import torch
from torch import nn


def generate_patches(input, fstride, tstride, fshape, tshape):
    r"""Function that extracts patches from tensors and stacks them.

    See :class:`~kornia.contrib.ExtractTensorPatches` for details.

    Args:
        input: tensor image where to extract the patches with shape :math:`(B, C, H, W)`.

    Returns:
        the tensor with the extracted patches with shape :math:`(B, N, C, H_{out}, W_{out})`.

    Examples:
        >>> input = torch.arange(9.).view(1, 1, 3, 3)
        >>> patches = extract_tensor_patches(input, (2, 3))
        >>> input
        tensor([[[[0., 1., 2.],
                  [3., 4., 5.],
                  [6., 7., 8.]]]])
        >>> patches[:, -1]
        tensor([[[[3., 4., 5.],
                  [6., 7., 8.]]]])

    """
    batch_size, num_channels = input.size()[:2]
    dims = range(2, input.dim())
    for dim, patch_size, stride in zip(dims, (fshape, tshape), (fstride, tstride)):
        input = input.unfold(dim, patch_size, stride)
    input = input.permute(0, *dims, 1, *(dim + len(dims) for dim in dims)).contiguous()
    return input.view(batch_size, -1, num_channels, fshape, tshape)


def combine_patches(
    patches,
    original_size,
    fstride,
    tstride,
    fshape,
    tshape,
    eps: float = 1e-8,
):
    r"""Restore input from patches.

    See :class:`~kornia.contrib.CombineTensorPatches` for details.

    Args:
        patches: patched tensor with shape :math:`(B, N, C, H_{out}, W_{out})`.

    Return:
        The combined patches in an image tensor with shape :math:`(B, C, H, W)`.

    Example:
        >>> out = extract_tensor_patches(torch.arange(16).view(1, 1, 4, 4), window_size=(2, 2), stride=(2, 2))
        >>> combine_tensor_patches(out, original_size=(4, 4), window_size=(2, 2), stride=(2, 2))
        tensor([[[[ 0,  1,  2,  3],
                  [ 4,  5,  6,  7],
                  [ 8,  9, 10, 11],
                  [12, 13, 14, 15]]]])

    .. note::
        This function is supposed to be used in conjunction with :func:`extract_tensor_patches`.

    """
    if patches.ndim != 5:
        raise ValueError(
            f"Invalid input shape, we expect BxNxCxHxW. Got: {patches.shape}"
        )
    ones = torch.ones(
        patches.shape[0],
        patches.shape[2],
        original_size[0],
        original_size[1],
        device=patches.device,
        dtype=patches.dtype,
    )
    restored_size = ones.shape[2:]

    patches = patches.permute(0, 2, 3, 4, 1)
    patches = patches.reshape(patches.shape[0], -1, patches.shape[-1])
    int_flag = 0
    if not torch.is_floating_point(patches):
        int_flag = 1
        dtype = patches.dtype
        patches = patches.float()
        ones = ones.float()

    # Calculate normalization map
    unfold_ones = torch.nn.functional.unfold(
        ones, kernel_size=(fshape, tshape), stride=(fstride, tstride)
    )
    norm_map = torch.nn.functional.fold(
        input=unfold_ones,
        output_size=restored_size,
        kernel_size=(fshape, tshape),
        stride=(fstride, tstride),
    )
    # Restored tensor
    saturated_restored_tensor = torch.nn.functional.fold(
        input=patches,
        output_size=restored_size,
        kernel_size=(fshape, tshape),
        stride=(fstride, tstride),
    )
    # Remove the saturation effect due to multiple summations
    restored_tensor = saturated_restored_tensor / (norm_map + eps)
    if int_flag:
        restored_tensor = restored_tensor.to(dtype)
    return restored_tensor


# get the shape of the intermediate representation.
def get_shape(fstride, tstride, input_fdim, input_tdim, fshape, tshape):
    test_input = torch.randn(1, 2, input_fdim, input_tdim)
    test_proj = nn.Conv2d(
        2,
        2,
        kernel_size=(fshape, tshape),
        stride=(fstride, tstride),
    )
    test_out = test_proj(test_input)
    f_dim = test_out.shape[2]
    t_dim = test_out.shape[3]
    return f_dim, t_dim
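A small round-trip sketch for the two helpers above (illustrative, assuming the repository directory is on sys.path): with non-overlapping windows the fold-based normalisation map is all ones, so combine_patches inverts generate_patches up to the eps term.

import torch
from patching_utils import combine_patches, generate_patches

x = torch.randn(1, 2, 128, 200)             # (B, C, freq, time) spectrogram-like tensor
patches = generate_patches(x, fstride=128, tstride=2, fshape=128, tshape=2)
print(patches.shape)                         # torch.Size([1, 100, 2, 128, 2])

x_rec = combine_patches(patches, original_size=(128, 200),
                        fstride=128, tstride=2, fshape=128, tshape=2)
print(torch.allclose(x, x_rec, atol=1e-4))   # True: reconstruction up to the eps normalisation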
pos_embed.py ADDED
@@ -0,0 +1,210 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Position embedding utils
# --------------------------------------------------------


# https://github.com/facebookresearch/AudioMAE/blob/main/util/pos_embed.py
import numpy as np
import torch


# --------------------------------------------------------
# 2D sine-cosine position embedding
# References:
# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py
# MoCo v3: https://github.com/facebookresearch/moco-v3
# --------------------------------------------------------
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token_num):
    """
    grid_size: int or (height, width) tuple of the grid
    return:
    pos_embed: [grid_h*grid_w, embed_dim] or [cls_token_num+grid_h*grid_w, embed_dim] (w/ or w/o cls tokens)
    """
    if isinstance(grid_size, int):
        gH = grid_size
        gW = grid_size
    else:
        gH = grid_size[0]
        gW = grid_size[1]
    grid_h = np.arange(gH, dtype=np.float64)
    grid_w = np.arange(gW, dtype=np.float64)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, gH, gW])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    for _ in range(cls_token_num):
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_flexible(embed_dim, grid_size, cls_token=False):
    """
    grid_size: (height, width) tuple of the grid
    return:
    pos_embed: [grid_h*grid_w, embed_dim] or [1+grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size[0], dtype=np.float64)
    grid_w = np.arange(grid_size[1], dtype=np.float64)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size[0], grid_size[1]])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb


# --------------------------------------------------------
# Interpolate position embeddings for high-resolution
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
def interpolate_pos_embed(model, checkpoint_model):
    if "pos_embed" in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model["pos_embed"]
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches**0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print(
                "Position interpolate from %dx%d to %dx%d"
                % (orig_size, orig_size, new_size, new_size)
            )
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(
                -1, orig_size, orig_size, embedding_size
            ).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens,
                size=(new_size, new_size),
                mode="bicubic",
                align_corners=False,
            )
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model["pos_embed"] = new_pos_embed


def interpolate_pos_embed_img2audio(model, checkpoint_model, orig_size, new_size):
    if "pos_embed" in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model["pos_embed"]
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        # orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
        # height (== width) for the new position embedding
        # new_size = int(num_patches ** 0.5)
        # class_token and dist_token are kept unchanged
        if orig_size != new_size:
            print(
                "Position interpolate from %dx%d to %dx%d"
                % (orig_size[0], orig_size[1], new_size[0], new_size[1])
            )
            extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
            pos_tokens = pos_tokens.reshape(
                -1, orig_size[0], orig_size[1], embedding_size
            ).permute(0, 3, 1, 2)
            pos_tokens = torch.nn.functional.interpolate(
                pos_tokens,
                size=(new_size[0], new_size[1]),
                mode="bicubic",
                align_corners=False,
            )
            pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
            checkpoint_model["pos_embed"] = new_pos_embed


def interpolate_pos_embed_audio(model, checkpoint_model, orig_size, new_size):
    if "pos_embed" in checkpoint_model:
        pos_embed_checkpoint = checkpoint_model["pos_embed"]
        embedding_size = pos_embed_checkpoint.shape[-1]
        if orig_size != new_size:
            print(
                "Position interpolate from %dx%d to %dx%d"
                % (orig_size[0], orig_size[1], new_size[0], new_size[1])
            )
            # extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
            # only the position tokens are interpolated
            cls_token = pos_embed_checkpoint[:, 0, :].unsqueeze(1)
            pos_tokens = pos_embed_checkpoint[:, 1:, :]  # remove
            pos_tokens = pos_tokens.reshape(
                -1, orig_size[0], orig_size[1], embedding_size
            )  # .permute(0, 3, 1, 2)
            # pos_tokens = torch.nn.functional.interpolate(
            #     pos_tokens, size=(new_size[0], new_size[1]), mode='bicubic', align_corners=False)

            # pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
            pos_tokens = pos_tokens[:, :, : new_size[1], :]  # assume only time diff
            pos_tokens = pos_tokens.flatten(1, 2)
            new_pos_embed = torch.cat((cls_token, pos_tokens), dim=1)
            checkpoint_model["pos_embed"] = new_pos_embed


def interpolate_patch_embed_audio(
    model,
    checkpoint_model,
    orig_channel,
    new_channel=1,
    kernel_size=(16, 16),
    stride=(16, 16),
    padding=(0, 0),
):
    if orig_channel != new_channel:
        if "patch_embed.proj.weight" in checkpoint_model:
            # aggregate 3 channels in rgb ckpt to 1 channel for audio
            new_proj_weight = torch.nn.Parameter(
                torch.sum(checkpoint_model["patch_embed.proj.weight"], dim=1).unsqueeze(
                    1
                )
            )
            checkpoint_model["patch_embed.proj.weight"] = new_proj_weight
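A quick shape check for the sincos helper as it is used in model.py (grid_size=(p_f_dim, p_t_dim), one CLS slot for the encoder); purely illustrative, assuming the repository directory is on sys.path.

import numpy as np
from pos_embed import get_2d_sincos_pos_embed

# Default GRAMT grid for (128, 2) patches over a 128 x 200 spectrogram: 1 x 100 patches.
pe = get_2d_sincos_pos_embed(embed_dim=768, grid_size=(1, 100), cls_token_num=1)
print(pe.shape)                  # (101, 768): 100 patch positions plus one all-zero CLS row
print(np.allclose(pe[0], 0.0))   # True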
swin.py ADDED
@@ -0,0 +1,522 @@
# --------------------------------------------------------
# SimMIM
# Copyright (c) 2021 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ze Liu
# Modified by Zhenda Xie
# --------------------------------------------------------

import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_


class Mlp(nn.Module):
    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    """
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    r"""Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def extra_repr(self) -> str:
        return f"dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}"

    def flops(self, N):
        # calculate flops for 1 window with token length of N
        flops = 0
        # qkv = self.qkv(x)
        flops += N * self.dim * 3 * self.dim
        # attn = (q @ k.transpose(-2, -1))
        flops += self.num_heads * N * (self.dim // self.num_heads) * N
        # x = (attn @ v)
        flops += self.num_heads * N * N * (self.dim // self.num_heads)
        # x = self.proj(x)
        flops += N * self.dim * self.dim
        return flops


class SwinTransformerBlock(nn.Module):
    r"""Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        input_resolution (tuple[int]): Input resolution.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(
        self,
        dim,
        input_resolution,
        num_heads,
        window_size=7,
        shift_size=0,
        mlp_ratio=4.0,
        qkv_bias=True,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
    ):
        super().__init__()
        self.dim = dim
        self.input_resolution = input_resolution
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        if min(self.input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = 0
            self.window_size = min(self.input_resolution)
        assert 0 <= self.shift_size < self.window_size, (
            "shift_size must be in [0, window_size)"
        )

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
        )

        if self.shift_size > 0:
            # calculate attention mask for SW-MSA
            H, W = self.input_resolution
            img_mask = torch.zeros((1, H, W, 1))  # 1 H W 1
            h_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            w_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            cnt = 0
            for h in h_slices:
                for w in w_slices:
                    img_mask[:, h, w, :] = cnt
                    cnt += 1

            mask_windows = window_partition(
                img_mask, self.window_size
            )  # nW, window_size, window_size, 1
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(
                attn_mask != 0, float(-100.0)
            ).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None

        self.register_buffer("attn_mask", attn_mask)

    def forward(self, x):
        H, W = self.input_resolution
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.view(B, H, W, C)

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = torch.roll(
                x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)
            )
        else:
            shifted_x = x

        # partition windows
        x_windows = window_partition(
            shifted_x, self.window_size
        )  # nW*B, window_size, window_size, C
        x_windows = x_windows.view(
            -1, self.window_size * self.window_size, C
        )  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(
            x_windows, mask=self.attn_mask
        )  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
        shifted_x = window_reverse(attn_windows, self.window_size, H, W)  # B H' W' C

        # reverse cyclic shift
344
+ if self.shift_size > 0:
345
+ x = torch.roll(
346
+ shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)
347
+ )
348
+ else:
349
+ x = shifted_x
350
+ x = x.view(B, H * W, C)
351
+
352
+ # FFN
353
+ x = shortcut + self.drop_path(x)
354
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
355
+
356
+ return x
357
+
358
+ def extra_repr(self) -> str:
359
+ return (
360
+ f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, "
361
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
362
+ )
363
+
364
+ def flops(self):
365
+ flops = 0
366
+ H, W = self.input_resolution
367
+ # norm1
368
+ flops += self.dim * H * W
369
+ # W-MSA/SW-MSA
370
+ nW = H * W / self.window_size / self.window_size
371
+ flops += nW * self.attn.flops(self.window_size * self.window_size)
372
+ # mlp
373
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
374
+ # norm2
375
+ flops += self.dim * H * W
376
+ return flops
377
+
378
+
379
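A minimal usage sketch for SwinTransformerBlock (editorial illustration, not part of the uploaded file; the dimensions are arbitrary). The block is shape-preserving: tokens go in and come out as (B, H*W, C).

# One shifted-window block on an 8x8 token grid.
import torch
blk = SwinTransformerBlock(dim=96, input_resolution=(8, 8), num_heads=3,
                           window_size=4, shift_size=2)
x = torch.randn(2, 8 * 8, 96)  # (B, H*W, C)
y = blk(x)                     # (2, 64, 96)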
+ class PatchMerging(nn.Module):
380
+ r"""Patch Merging Layer.
381
+
382
+ Args:
383
+ input_resolution (tuple[int]): Resolution of input feature.
384
+ dim (int): Number of input channels.
385
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
386
+ """
387
+
388
+ def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm):
389
+ super().__init__()
390
+ self.input_resolution = input_resolution
391
+ self.dim = dim
392
+ self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
393
+ self.norm = norm_layer(4 * dim)
394
+
395
+ def forward(self, x):
396
+ """
397
+ x: B, H*W, C
398
+ """
399
+ H, W = self.input_resolution
400
+ B, L, C = x.shape
401
+ assert L == H * W, "input feature has wrong size"
402
+ assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) is not even."
403
+
404
+ x = x.view(B, H, W, C)
405
+
406
+ x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
407
+ x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
408
+ x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
409
+ x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
410
+ x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
411
+ x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C
412
+
413
+ x = self.norm(x)
414
+ x = self.reduction(x)
415
+
416
+ return x
417
+
418
+ def extra_repr(self) -> str:
419
+ return f"input_resolution={self.input_resolution}, dim={self.dim}"
420
+
421
+ def flops(self):
422
+ H, W = self.input_resolution
423
+ flops = H * W * self.dim
424
+ flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim
425
+ return flops
426
+
427
+
428
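A minimal usage sketch for PatchMerging (editorial illustration, not part of the uploaded file): each spatial side is halved and the channel width is doubled, so the token count drops by a factor of four.

import torch
merge = PatchMerging(input_resolution=(8, 8), dim=96)
x = torch.randn(2, 8 * 8, 96)  # (B, H*W, C)
y = merge(x)                   # (2, 16, 192)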
+ class BasicLayer(nn.Module):
429
+ """A basic Swin Transformer layer for one stage.
430
+
431
+ Args:
432
+ dim (int): Number of input channels.
433
+ input_resolution (tuple[int]): Input resolution.
434
+ depth (int): Number of blocks.
435
+ num_heads (int): Number of attention heads.
436
+ window_size (int): Local window size.
437
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
438
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
439
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
440
+ drop (float, optional): Dropout rate. Default: 0.0
441
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
442
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
443
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
444
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
445
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
446
+ """
447
+
448
+ def __init__(
449
+ self,
450
+ dim,
451
+ input_resolution,
452
+ depth,
453
+ num_heads,
454
+ window_size,
455
+ mlp_ratio=4.0,
456
+ qkv_bias=True,
457
+ qk_scale=None,
458
+ drop=0.0,
459
+ attn_drop=0.0,
460
+ drop_path=0.0,
461
+ norm_layer=nn.LayerNorm,
462
+ downsample=None,
463
+ use_checkpoint=False,
464
+ ):
465
+ super().__init__()
466
+ self.dim = dim
467
+ self.input_resolution = input_resolution
468
+ self.depth = depth
469
+ self.use_checkpoint = use_checkpoint
470
+
471
+ # build blocks
472
+ self.blocks = nn.ModuleList(
473
+ [
474
+ SwinTransformerBlock(
475
+ dim=dim,
476
+ input_resolution=input_resolution,
477
+ num_heads=num_heads,
478
+ window_size=window_size,
479
+ shift_size=0 if (i % 2 == 0) else window_size // 2,
480
+ mlp_ratio=mlp_ratio,
481
+ qkv_bias=qkv_bias,
482
+ qk_scale=qk_scale,
483
+ drop=drop,
484
+ attn_drop=attn_drop,
485
+ drop_path=drop_path[i]
486
+ if isinstance(drop_path, list)
487
+ else drop_path,
488
+ norm_layer=norm_layer,
489
+ )
490
+ for i in range(depth)
491
+ ]
492
+ )
493
+
494
+ # patch merging layer
495
+ if downsample is not None:
496
+ self.downsample = downsample(
497
+ input_resolution, dim=dim, norm_layer=norm_layer
498
+ )
499
+ else:
500
+ self.downsample = None
501
+
502
+ def forward(self, x):
503
+ print("IN", x.shape)
504
+ for blk in self.blocks:
505
+ if self.use_checkpoint:
506
+ x = checkpoint.checkpoint(blk, x)
507
+ else:
508
+ x = blk(x)
509
+ if self.downsample is not None:
510
+ x = self.downsample(x)
511
+ return x
512
+
513
+ def extra_repr(self) -> str:
514
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
515
+
516
+ def flops(self):
517
+ flops = 0
518
+ for blk in self.blocks:
519
+ flops += blk.flops()
520
+ if self.downsample is not None:
521
+ flops += self.downsample.flops()
522
+ return flops
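A minimal usage sketch for BasicLayer (editorial illustration, not part of the uploaded file; sizes are arbitrary): two alternating W-MSA/SW-MSA blocks followed by a PatchMerging downsample.

import torch
stage = BasicLayer(dim=96, input_resolution=(8, 8), depth=2, num_heads=3,
                   window_size=4, downsample=PatchMerging)
x = torch.randn(2, 8 * 8, 96)  # (B, H*W, C)
y = stage(x)                   # (2, 16, 192) after the merging step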
utils.py ADDED
@@ -0,0 +1,249 @@
1
+ import collections.abc
2
+ import math
3
+ import sys
4
+ from itertools import repeat
5
+
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import timm
9
+ import torch
10
+ from torch import nn
11
+ from torchvision.models.vision_transformer import Encoder
12
+
13
+
14
+ from typing import Tuple
15
+ from functools import partial
16
+ from collections.abc import Iterable # import directly from collections for Python < 3.3
17
+
18
+
19
+ def plot_fbank(fbank, title=None, save_path=None, **kwargs):
20
+ fig, axs = plt.subplots(min(4, fbank.shape[0]), 1, sharex=True, sharey=True)
21
+ if not isinstance(axs, Iterable):
22
+ axs = np.array([axs])
23
+ vmin, vmax = kwargs.get("vmin", None), kwargs.get("vmax", None)
24
+ # max 4 channels...
25
+ for channel in range(0, min(4, fbank.shape[0])):
26
+ axs[channel].set_title(f"Filter bank channel {channel}, {title}")
27
+ im = axs[channel].imshow(fbank[channel].T, aspect="auto", vmin=vmin, vmax=vmax)
28
+ axs[channel].set_ylabel("mel")
29
+ axs[channel].set_xlabel("time")
30
+ plt.gca().invert_yaxis()
31
+ plt.tight_layout()
32
+ fig.colorbar(im, ax=axs.ravel().tolist())
33
+ plt.show()
34
+ if save_path:
35
+ fig.savefig(save_path)
36
+ plt.close()
37
+ return fig
38
+
39
+
40
+ # From PyTorch Internals to create the tuples of the given iterable.
41
+ def _ntuple(n):
42
+ def parse(x):
43
+ # if x is already an instance of iterable object, create a tuple out of it
44
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
45
+ return tuple(x)
46
+ # Otherwise repeat the x, n times, and create a tuple.
47
+ return tuple(repeat(x, n))
48
+
49
+ return parse
50
+
51
+
52
+ class PatchEmbed(nn.Module):
53
+ """Image to Patch Embedding"""
54
+
55
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
56
+ super().__init__()
57
+ img_size = _ntuple(2)(img_size)
58
+ patch_size = _ntuple(2)(patch_size)
59
+ num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
60
+ self.img_size = img_size
61
+ self.patch_size = patch_size
62
+ self.num_patches = num_patches
63
+
64
+ self.proj = nn.Conv2d(
65
+ in_channels=in_chans,
66
+ out_channels=embed_dim,
67
+ kernel_size=patch_size,
68
+ stride=patch_size,
69
+ )
70
+
71
+ # We need to override these.
72
+ def forward(self, x):
73
+ x = self.proj(x).flatten(2).transpose(1, 2)
74
+ return x
75
+
76
+
77
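A minimal usage sketch for PatchEmbed (editorial illustration, not part of the uploaded file): a 224x224 RGB image becomes 14*14 = 196 patch tokens of width 768.

import torch
embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
tokens = embed(torch.randn(1, 3, 224, 224))
print(tokens.shape)  # torch.Size([1, 196, 768])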
+ def get_sinusoid_encoding(n_position, d_hid):
78
+ """Sinusoid position encoding table"""
79
+
80
+ def get_position_angle_vec(position):
81
+ return [
82
+ position / np.power(10000, 2 * (hid_j // 2) / d_hid)
83
+ for hid_j in range(d_hid)
84
+ ]
85
+
86
+ sinusoid_table = np.array(
87
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
88
+ )
89
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
90
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
91
+
92
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
93
+
94
+
95
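A minimal usage sketch for get_sinusoid_encoding (editorial illustration, not part of the uploaded file); the table is deterministic, so it can be assigned to a non-trainable positional-embedding buffer.

pe = get_sinusoid_encoding(n_position=100, d_hid=768)
print(pe.shape)  # torch.Size([1, 100, 768])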
+ def create_pretrained_model(model_size,
96
+ encoder_num_layers = 12,
97
+ encoder_num_heads = 12,
98
+ encoder_hidden_dim = 768,
99
+ encoder_mlp_dim= 3072,
100
+ encoder_dropout = 0.0,
101
+ encoder_attention_dropout = 0.0,
102
+ encoder_norm_layer_eps = 1e-6):
103
+ if model_size == "tiny":
104
+ v = timm.create_model("deit_tiny_distilled_patch16_224", pretrained=False)
105
+ hidden_dim = 192  # DeiT-tiny (deit_tiny_distilled_patch16_224) uses a 192-dim embedding
106
+
107
+ elif model_size == "small":
108
+ v = timm.create_model("deit_small_distilled_patch16_224", pretrained=False)
109
+ hidden_dim = 384
110
+
111
+ elif model_size == "base":
112
+ v = Encoder(
113
+ seq_length = 0, #Only used for pos_embeddings and we set them later!
114
+ num_layers = encoder_num_layers,
115
+ num_heads = encoder_num_heads,
116
+ hidden_dim = encoder_hidden_dim,
117
+ mlp_dim= encoder_mlp_dim,
118
+ dropout = encoder_dropout,
119
+ attention_dropout = encoder_attention_dropout,
120
+ norm_layer = partial(nn.LayerNorm, eps=encoder_norm_layer_eps))
121
+ hidden_dim = encoder_hidden_dim
122
+
123
+ elif model_size == "base_nokd":
124
+ v = timm.create_model("deit_base_patch16_384", pretrained=False)
125
+ hidden_dim = 768
126
+
127
+ else:
128
+ print("Wrong model size!")
129
+ sys.exit(1)
130
+
131
+ return v, hidden_dim
132
+
133
+
134
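A minimal usage sketch for create_pretrained_model (editorial illustration, not part of the uploaded file): the "base" configuration builds a torchvision Encoder with the defaults above and returns its hidden width.

backbone, hidden_dim = create_pretrained_model("base")
print(type(backbone).__name__, hidden_dim)  # Encoder 768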
+ def _trunc_normal_(tensor, mean, std, a, b):
135
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
136
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
137
+ def norm_cdf(x):
138
+ # Computes standard normal cumulative distribution function
139
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
140
+
141
+ # Values are generated by using a truncated uniform distribution and
142
+ # then using the inverse CDF for the normal distribution.
143
+ # Get upper and lower cdf values
144
+ left = norm_cdf((a - mean) / std)
145
+ up = norm_cdf((b - mean) / std)
146
+
147
+ # Uniformly fill tensor with values from [l, u], then translate to
148
+ # [2l-1, 2u-1].
149
+ tensor.uniform_(2 * left - 1, 2 * up - 1)
150
+
151
+ # Use inverse cdf transform for normal distribution to get truncated
152
+ # standard normal
153
+ tensor.erfinv_()
154
+
155
+ # Transform to proper mean, std
156
+ tensor.mul_(std * math.sqrt(2.0))
157
+ tensor.add_(mean)
158
+
159
+ # Clamp to ensure it's in the proper range
160
+ tensor.clamp_(min=a, max=b)
161
+ return tensor
162
+
163
+
164
+ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
165
+ # type: (Tensor, float, float, float, float) -> Tensor
166
+ r"""Fills the input Tensor with values drawn from a truncated
167
+ normal distribution. The values are effectively drawn from the
168
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
169
+ with values outside :math:`[a, b]` redrawn until they are within
170
+ the bounds. The method used for generating the random values works
171
+ best when :math:`a \leq \text{mean} \leq b`.
172
+
173
+ NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
174
+ applied while sampling the normal with mean/std applied, therefore a, b args
175
+ should be adjusted to match the range of mean, std args.
176
+
177
+ Args:
178
+ tensor: an n-dimensional `torch.Tensor`
179
+ mean: the mean of the normal distribution
180
+ std: the standard deviation of the normal distribution
181
+ a: the minimum cutoff value
182
+ b: the maximum cutoff value
183
+ Examples:
184
+ >>> w = torch.empty(3, 5)
185
+ >>> nn.init.trunc_normal_(w)
186
+ """
187
+ with torch.no_grad():
188
+ return _trunc_normal_(tensor, mean, std, a, b)
189
+
190
+
191
+ def expand_index_like(index: torch.Tensor, tokens: torch.Tensor) -> torch.Tensor:
192
+ """Expands the index along the last dimension of the input tokens.
193
+
194
+ Args:
195
+ index:
196
+ Index tensor with shape (batch_size, idx_length) where each entry is
197
+ an index in [0, sequence_length).
198
+ tokens:
199
+ Tokens tensor with shape (batch_size, sequence_length, dim).
200
+
201
+ Returns:
202
+ Index tensor with shape (batch_size, idx_length, dim) where the original
203
+ indices are repeated dim times along the last dimension.
204
+
205
+ """
206
+ dim = tokens.shape[-1]
207
+ index = index.unsqueeze(-1).expand(-1, -1, dim)
208
+ return index
209
+
210
+ def set_at_index(
211
+ tokens: torch.Tensor, index: torch.Tensor, value: torch.Tensor
212
+ ) -> torch.Tensor:
213
+ """Copies all values into the input tensor at the given indices.
214
+
215
+ Args:
216
+ tokens:
217
+ Tokens tensor with shape (batch_size, sequence_length, dim).
218
+ index:
219
+ Index tensor with shape (batch_size, index_length).
220
+ value:
221
+ Value tensor with shape (batch_size, index_length, dim).
222
+
223
+ Returns:
224
+ Tokens tensor with shape (batch_size, sequence_length, dim) containing
225
+ the new values.
226
+
227
+ """
228
+ index = expand_index_like(index, tokens)
229
+ return torch.scatter(tokens, 1, index, value)
230
+
231
+
232
+
233
+
234
+ def repeat_token(token: torch.Tensor, size: Tuple[int, int]) -> torch.Tensor:
235
+ """Repeats a token size times.
236
+
237
+ Args:
238
+ token:
239
+ Token tensor with shape (1, 1, dim).
240
+ size:
241
+ (batch_size, sequence_length) tuple.
242
+
243
+ Returns:
244
+ Tensor with shape (batch_size, sequence_length, dim) containing copies
245
+ of the input token.
246
+
247
+ """
248
+ batch_size, sequence_length = size
249
+ return token.repeat(batch_size, sequence_length, 1)
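A minimal sketch combining repeat_token, expand_index_like and set_at_index for MAE-style masking (editorial illustration, not part of the uploaded file; the indices and sizes are arbitrary): selected positions are overwritten with copies of a single mask token.

import torch
B, S, D = 2, 8, 16
tokens = torch.randn(B, S, D)
mask_token = torch.zeros(1, 1, D)
idx_mask = torch.tensor([[1, 4, 6], [0, 2, 7]])            # (B, idx_length)
filler = repeat_token(mask_token, (B, idx_mask.shape[1]))  # (B, 3, D)
tokens = set_at_index(tokens, idx_mask, filler)            # masked rows now zero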